def run_signalp(fasta_files, output_file, tmp_dir):
    def _consumer(worker_index, worker_fasta_file, output_queue):
        # process with signalp
        logging.debug('[Worker %d] Running SignalP' % (worker_index))
        args = ['signalp', '-f', 'short', '-t', 'euk', worker_fasta_file]
        tmp_output_file = os.path.join(tmp_dir,
                                       'worker%d.signalp.txt' % (worker_index))
        with open(tmp_output_file, 'w') as outfileh:
            retcode = subprocess.call(args, stdout=outfileh)
        if retcode != 0:
            logging.error('[Worker %d] Error running SignalP' % (worker_index))
        # put results into output queue
        with open(tmp_output_file) as f:
            for line in f:
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('#'):
                    continue
                output_queue.put(line)
        output_queue.put(None)
        logging.debug('[Worker %d] Finished' % (worker_index))
    # create multiprocessing queue for passing data
    output_queue = Queue(maxsize=len(fasta_files) * 4)
    # start consumer processes
    procs = []
    for i in xrange(len(fasta_files)):
        p = Process(target=_consumer, args=(i, fasta_files[i], output_queue))
        p.start()
        procs.append(p)
    # get results from consumers
    num_alive = len(procs)
    tmp_output_file = os.path.join(tmp_dir, 'signalp.merge.txt')
    with open(tmp_output_file, 'w') as f:
        while num_alive > 0:
            result = output_queue.get()
            if result is None:
                num_alive -= 1
                logging.debug("Main process detected worker finished, "
                              "%d still alive" % (num_alive))
            else:
                print >>f, result
    logging.debug("Joining all processes")
    # wait for consumers to finish
    for p in procs:
        p.join()
    # sort
    logging.debug('Sorting signal peptide results')
    def sort_signalp(line):
        return line.split(None, 1)[0]
    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=tmp_output_file,
               output=output_file,
               key=sort_signalp,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    # cleanup temporary files
    shutil.rmtree(sort_tmp_dir)
    return 0
def sort_classification_results(input_file, output_file, tmp_dir):
    # sort classification results by chromosome and start position,
    # forcing the header row (first column == "chrom") to sort first
    def sort_by_chrom_start(line):
        fields = line.strip().split('\t', 2)
        if fields[0] == "chrom":
            return chr(0), 0
        return fields[0], int(fields[1])
    batch_sort(input=input_file,
               output=output_file,
               key=sort_by_chrom_start,
               buffer_size=(1 << 21),
               tempdirs=[tmp_dir])
def orf_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                 min_orf_length, first_orf_only, num_processes):
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for ORFs
    #
    logging.debug('Finding ORFs in transcript sequences')
    # output files
    tmp_dir = os.path.join(output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.info("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    orf_bed_file = os.path.join(output_dir, 'transcript_orfs.bed')
    unique_orf_file = os.path.join(output_dir, 'unique_orfs.txt')
    unique_orf_bed_file = os.path.join(output_dir, 'unique_orfs.bed')
    orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.txt')
    sorted_orf_file = os.path.join(tmp_dir,
                                   'transcript_orfs.no_ids.sortbyorf.txt')
    sorted_orf_id_file = os.path.join(tmp_dir, 'transcript_orfs.sortbyorf.txt')
    signalp_file = os.path.join(output_dir, 'signalp.txt')
    pfam_file = os.path.join(output_dir, 'pfam.txt')
    merged_orf_id_file = os.path.join(output_dir,
                                      'transcript_orfs.sortbyorf.merged.txt')
    sorted_merged_orf_id_file = os.path.join(
        output_dir, 'transcript_orfs.sortbytranscript.merged.txt')
    # open output files
    orf_fileh = open(orf_file, 'w')
    orf_bed_fileh = open(orf_bed_file, 'w')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if first_orf_only:
                # get first ORF
                orf = get_first_transcript_orf(t, ref_fa)
                if len(orf.seq) >= min_orf_length:
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            else:
                # get all ORFs
                for orf in get_all_transcript_orfs(t, ref_fa, min_orf_length):
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    orf_fileh.close()
    orf_bed_fileh.close()
    #
    # sort ORF table by ORF amino acid sequence to group identical ORFs
    # together
    #
    logging.debug('Sorting ORFs by amino acid sequence')
    def sort_by_seq(line):
        '''comparison function for batch_sort'''
        fields = line.strip().split('\t')
        return fields[ORFInfo.SEQ_COL_NUM]
    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=orf_file,
               output=sorted_orf_file,
               key=sort_by_seq,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    #
    # assign each ORF a unique id and write to FASTA file
    #
    logging.debug('Determining unique ORFs')
    orf_fasta_prefix = os.path.join(tmp_dir, 'orf')
    orf_fasta_files = []
    orf_fasta_sizes = []
    for i in xrange(num_processes):
        orf_fasta_files.append(open('%s%d.fasta' % (orf_fasta_prefix, i), 'w'))
        orf_fasta_sizes.append(0)
    orf_file_index = 0
    outfileh = open(sorted_orf_id_file, 'w')
    unique_orf_fileh = open(unique_orf_file, 'w')
    print >>unique_orf_fileh, '\t'.join(['orf_id', 'orf_length',
                                         'total_occurrences',
                                         'unique_genomic_occurrences'])
    unique_orf_bed_fileh = open(unique_orf_bed_file, 'w')
    with open(sorted_orf_file) as infileh:
        for orfs in group_unique_orfs(infileh):
            # write to master transcript/ORF table
            for orf in orfs:
                print >>outfileh, '\t'.join(orf.to_table())
            # write ORF to fasta file
            lines = to_fasta(orfs[0].orf_id, orfs[0].seq.strip('*'))
            print >>orf_fasta_files[orf_file_index], lines
            orf_fasta_sizes[orf_file_index] += 1
            # advance to next fasta file
            orf_file_index = (orf_file_index + 1) % (num_processes)
            # group by genomic position and write ORFs to BED file
            unique_genome_orfs = {}
            for orf in orfs:
                k = (orf.chrom, orf.strand, tuple(orf.exons))
                if k in unique_genome_orfs:
                    continue
                unique_genome_orfs[k] = orf
            for orf in unique_genome_orfs.itervalues():
                print >>unique_orf_bed_fileh, '\t'.join(orf.to_bed(orf.orf_id))
            # write unique ORF to tab-delimited text file
            fields = [orfs[0].orf_id, len(orfs[0].seq),
                      len(orfs), len(unique_genome_orfs)]
            print >>unique_orf_fileh, '\t'.join(map(str, fields))
    # cleanup
    unique_orf_fileh.close()
    unique_orf_bed_fileh.close()
    outfileh.close()
    # get fasta files with lines written
    orf_fasta_file_names = []
    for i in xrange(len(orf_fasta_files)):
        orf_fasta_files[i].close()
        if orf_fasta_sizes[i] > 0:
            orf_fasta_file_names.append(orf_fasta_files[i].name)
    #
    # search FASTA file against signalp
    #
    logging.debug('Searching for signal peptides')
    retcode = run_signalp(orf_fasta_file_names, signalp_file, tmp_dir)
    if retcode != 0:
        logging.error('Error searching for signal peptides')
        return 1
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(orf_fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
    #
    # merge results from Pfam and signalp
    #
    logging.debug('Merging SignalP and Pfam results')
    merge_results(sorted_orf_id_file, signalp_file, pfam_file,
                  merged_orf_id_file)
    #
    # sort by transcript id
    #
    logging.debug('Sorting ORFs by transcript ID')
    def sort_by_transcript_id(line):
        return line.split('\t', 1)[0]
    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=merged_orf_id_file,
               output=sorted_merged_orf_id_file,
               key=sort_by_transcript_id,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # cleanup
    ref_fa.close()
    return 0
def run_pfam(fasta_files, pfam_dir, output_file, tmp_dir):
    def _consumer(worker_index, worker_fasta_file, output_queue):
        # process with pfam
        logging.debug('[Worker %d] Running pfam_scan.pl' % (worker_index))
        tmp_output_file = os.path.join(tmp_dir,
                                       'worker%d.pfam.txt' % (worker_index))
        args = ['pfam_scan.pl', '-cpu', '1', '-pfamB',
                '-fasta', worker_fasta_file,
                '-dir', pfam_dir,
                '-outfile', tmp_output_file]
        retcode = subprocess.call(args)
        if retcode != 0:
            logging.error('[Worker %d] Error running pfam_scan.pl' %
                          (worker_index))
        # put results into output queue
        with open(tmp_output_file) as f:
            for line in f:
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('#'):
                    continue
                output_queue.put(line)
        output_queue.put(None)
        logging.debug('[Worker %d] Finished' % (worker_index))
    # create multiprocessing queue for passing data
    output_queue = Queue(maxsize=len(fasta_files) * 4)
    # start consumer processes
    procs = []
    for i in xrange(len(fasta_files)):
        p = Process(target=_consumer, args=(i, fasta_files[i], output_queue))
        p.start()
        procs.append(p)
    # get results from consumers
    num_alive = len(procs)
    tmp_output_file = os.path.join(tmp_dir, 'pfam.merge.txt')
    with open(tmp_output_file, 'w') as f:
        while num_alive > 0:
            result = output_queue.get()
            if result is None:
                num_alive -= 1
                logging.debug("Main process detected worker finished, "
                              "%d still alive" % (num_alive))
            else:
                print >>f, result
    logging.debug("Joining all processes")
    # wait for consumers to finish
    for p in procs:
        p.join()
    # sort
    logging.debug('Sorting pfam results')
    def sort_pfam(line):
        return line.split(None, 1)[0]
    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=tmp_output_file,
               output=output_file,
               key=sort_pfam,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    # cleanup temporary files
    shutil.rmtree(sort_tmp_dir)
    return 0
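# The functions above delegate on-disk sorting to batch_sort, which is defined
# elsewhere in this project and is not shown in this section.  The helper below
# is a hypothetical, minimal sketch (the name _batch_sort_sketch is ours, not
# part of the pipeline) illustrating the behavior assumed by the callers above:
# read the input in chunks of buffer_size lines, sort each chunk with the
# supplied key function, spill the sorted chunks to temporary files under
# tempdirs[0], then k-way merge them into the output.  The real batch_sort may
# differ in detail (for example, buffer_size may be measured in bytes rather
# than lines).
def _batch_sort_sketch(input, output, key, buffer_size, tempdirs):
    import heapq
    import itertools
    import tempfile
    # split the input into key-sorted chunks that fit in memory
    chunk_paths = []
    with open(input) as infileh:
        while True:
            lines = list(itertools.islice(infileh, buffer_size))
            if not lines:
                break
            lines.sort(key=key)
            fd, path = tempfile.mkstemp(dir=tempdirs[0])
            with os.fdopen(fd, 'w') as chunkfh:
                chunkfh.writelines(lines)
            chunk_paths.append(path)
    # k-way merge the sorted chunks; decorate each line with its key so that
    # heapq.merge compares keys rather than raw lines
    chunk_handles = [open(path) for path in chunk_paths]
    decorated = [((key(line), line) for line in fh) for fh in chunk_handles]
    with open(output, 'w') as outfileh:
        for _, line in heapq.merge(*decorated):
            outfileh.write(line)
    # cleanup chunk files
    for fh in chunk_handles:
        fh.close()
    for path in chunk_paths:
        os.remove(path)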