def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100,
                      use_rev_strand=False, use_top=500, cpus=8):
    """
    Predict longest ORFs and select a non-redundant training subset.

    1. For every record in <fasta_filename>, run predict_longest_ORFs on the
       upper-cased sequence (and on its reverse complement when
       use_rev_strand is True) and pass all results to write_CDS_n_PEP.
       The later steps read <output_prefix>.cds and <output_prefix>.utr,
       which write_CDS_n_PEP is expected to produce.
    2. Run cd-hit (-c 0.90 -n 5, <cpus> threads) on <output_prefix>.cds,
       writing <output_prefix>.cds.nr90.
    3. Write the <use_top> longest non-redundant CDS records to
       <output_prefix>.training_<use_top>.cds, and the records of
       <output_prefix>.utr sharing an id with them to
       <output_prefix>.training_<use_top>.utr.

    Raises subprocess.CalledProcessError if cd-hit exits non-zero.
    """
    sanity_check_cdhit()

    sys.stderr.write("predict longest ORFs....\n")
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            # str(seq) replaces Seq.tostring(), which newer Biopython removed
            seq = str(r.seq).upper()
            ORFs.append((r, predict_longest_ORFs(seq, min_aa_length), '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                # the original (forward) record is stored, tagged with '-'
                ORFs.append((r, predict_longest_ORFs(seq, min_aa_length), '-'))
    write_CDS_n_PEP(ORFs, output_prefix)

    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    # step 2. use CD-hit to remove redundancy, then pick out top <use_top>
    nr_filename = output_prefix + '.cds.nr90'
    # argument list (no shell) so prefixes with spaces/metacharacters are safe
    subprocess.check_call(['cd-hit', '-T', str(cpus),
                           '-i', output_prefix + '.cds',
                           '-o', nr_filename,
                           '-c', '0.90', '-n', '5'])

    with open(nr_filename) as handle:
        records = list(SeqIO.parse(handle, 'fasta'))
    # stable sort: equally long records keep their cd-hit output order
    records.sort(key=lambda rec: len(rec.seq), reverse=True)
    longest = records[:use_top]

    training_prefix = output_prefix + '.training_' + str(use_top)
    picked = set()  # ids of the selected CDS records
    with open(training_prefix + '.cds', 'w') as f:
        for r in longest:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))
            picked.add(r.id)

    with open(training_prefix + '.utr', 'w') as f:
        with open(output_prefix + '.utr') as handle:
            for r in SeqIO.parse(handle, 'fasta'):
                if r.id in picked:
                    f.write(">{0}\n{1}\n".format(r.description, r.seq))
def ANGEL_predict_worker(input_fasta, output_prefix, bdt, o_all, min_ANGEL_aa_length=50,
                         min_dumb_aa_length=100, use_rev_strand=False,
                         output_rev_only_if_longer=False, starting_index=1):
    """
    Predict ORFs for every record of <input_fasta> and append them to the
    <output_prefix> output via write_CDS_n_PEP.

    Per record:
      (1a) ANGEL prediction (ORFscores.predict_ORF) on the + strand.
      (1b) DumbORF.predict_longest_ORFs on the + strand, using as minimum
           length the longest ANGEL prediction (never below
           min_dumb_aa_length); added only if it returns something.
      (2)  If use_rev_strand, the same two steps on the reverse complement.
           With output_rev_only_if_longer, the - strand thresholds are
           raised to the longest + strand prediction found in (1a)/(1b).

    write_CDS_n_PEP is called once per record with append_file=True; its
    return value is fed back as starting_index for the next record.
    Nothing is returned.
    """

    def _angel_hits(record, n, m, min_aa_len, floor_len):
        """
        Run ANGEL on <record>.

        Returns (result, longest): result maps frame --> list of
        (type, start, end); longest is the largest (end-start)//3 + 1 seen,
        never smaller than floor_len.
        """
        result = defaultdict(list)
        longest = floor_len
        flag, name, good = ORFscores.predict_ORF(record, bdt, o_all, min_aa_len=min_aa_len)
        for _frame, _stop, _start in good:
            # a missing start means the ORF begins at the frame offset
            s = _start * 3 + _frame if _start is not None else _frame
            # a missing stop falls back to an end derived from the sequence length
            e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (_frame if m >= _frame else 0)
            result[_frame].append((flag, s, e))
            # explicit floor division keeps the Python 2 integer semantics
            longest = max(longest, (e - s) // 3 + 1)
        return result, longest

    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            ORFs = []
            n, m = divmod(len(rec.seq), 3)

            sys.stderr.write("predicting for {0}\n".format(rec.id))

            # (1a) predict on + strand
            result, longest_fwd = _angel_hits(rec, n, m, min_ANGEL_aa_length, min_dumb_aa_length)
            ORFs.append((rec, result, '+'))

            # (1b) run dumb ORFs, if better than longest of ANGEL's output it as well
            dumb = DumbORF.predict_longest_ORFs(str(rec.seq).upper(), longest_fwd)
            if any(len(v) > 0 for v in dumb.values()):
                ORFs.append((rec, dumb, '+'))
            for v in dumb.values():
                for _flag, _s, _e in v:
                    longest_fwd = max(longest_fwd, (_e - _s) // 3 + 1)

            # (2a) see if need to predict on - strand
            if use_rev_strand:
                if output_rev_only_if_longer:
                    # min aa length must be longer than the forward strand longest prediction
                    min_dumb_aa_length_for_rev = longest_fwd
                    min_ANGEL_aa_length_for_rev = max(longest_fwd, min_ANGEL_aa_length)
                else:
                    min_dumb_aa_length_for_rev = min_dumb_aa_length
                    min_ANGEL_aa_length_for_rev = min_ANGEL_aa_length

                # rec2 carries the reverse complement under the same id/description
                rec2 = SeqRecord(rec.seq.reverse_complement(), id=rec.id,
                                 description=rec.description)
                result, longest_rev = _angel_hits(rec2, n, m, min_ANGEL_aa_length_for_rev,
                                                  min_dumb_aa_length_for_rev)
                ORFs.append((rec, result, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

                dumb = DumbORF.predict_longest_ORFs(str(rec2.seq).upper(), longest_rev)
                if any(len(v) > 0 for v in dumb.values()):
                    ORFs.append((rec, dumb, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

            starting_index = write_CDS_n_PEP(ORFs, output_prefix, min_utr_length=50,
                                             append_file=True, starting_index=starting_index)
def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100,
                      use_rev_strand=False, use_top=500, cpus=8):
    """
    Predict longest ORFs and select a non-redundant training subset.

    1. For every record in <fasta_filename>, run predict_longest_ORFs on the
       upper-cased sequence (and on its reverse complement when
       use_rev_strand is True) and pass all results to write_CDS_n_PEP.
       The later steps read <output_prefix>.cds and <output_prefix>.utr,
       which write_CDS_n_PEP is expected to produce.
    2. Run cd-hit (-c 0.90 -n 5, <cpus> threads) on <output_prefix>.cds,
       writing <output_prefix>.cds.nr90.
    3. Write the <use_top> longest non-redundant CDS records to
       <output_prefix>.training_<use_top>.cds, and the records of
       <output_prefix>.utr sharing an id with them to
       <output_prefix>.training_<use_top>.utr.

    Raises subprocess.CalledProcessError if cd-hit exits non-zero.
    """
    sanity_check_cdhit()

    sys.stderr.write("predict longest ORFs....\n")
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            # str(seq) replaces Seq.tostring(), which newer Biopython removed
            fwd_seq = str(r.seq).upper()
            ORFs.append((r, predict_longest_ORFs(fwd_seq, min_aa_length), '+'))
            if use_rev_strand:  # predict on - strand as well
                rev_seq = str(r.seq.reverse_complement()).upper()
                # the original (forward) record is stored, tagged with '-'
                ORFs.append((r, predict_longest_ORFs(rev_seq, min_aa_length), '-'))
    write_CDS_n_PEP(ORFs, output_prefix)

    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    # step 2. use CD-hit to remove redundancy, then pick out top <use_top>
    nr_filename = output_prefix + '.cds.nr90'
    # argument list (no shell) so prefixes with spaces/metacharacters are safe
    subprocess.check_call(['cd-hit', '-T', str(cpus),
                           '-i', output_prefix + '.cds',
                           '-o', nr_filename,
                           '-c', '0.90', '-n', '5'])

    with open(nr_filename) as handle:
        records = list(SeqIO.parse(handle, 'fasta'))
    # stable sort: equally long records keep their cd-hit output order
    records.sort(key=lambda rec: len(rec.seq), reverse=True)
    longest = records[:use_top]

    training_prefix = output_prefix + '.training_' + str(use_top)
    picked = set()  # ids of the selected CDS records
    with open(training_prefix + '.cds', 'w') as f:
        for r in longest:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))
            picked.add(r.id)

    with open(training_prefix + '.utr', 'w') as f:
        with open(output_prefix + '.utr') as handle:
            for r in SeqIO.parse(handle, 'fasta'):
                if r.id in picked:
                    f.write(">{0}\n{1}\n".format(r.description, r.seq))
def ANGEL_predict_worker(input_fasta, output_prefix, bdt, o_all, min_ANGEL_aa_length=50,
                         min_dumb_aa_length=100, use_rev_strand=False, output_mode='best',
                         max_angel_secondORF_distance=10, starting_index=1):
    """
    Predict ORFs for every record of <input_fasta>, append them to the
    <output_prefix> output via write_CDS_n_PEP, then create
    <output_prefix>.DONE.

    Output Mode is either "best" or "all"

    If "all" and + strand only: ANGEL+, dumb+  (both subject to its length threshold)
    If "all" and - strand also: ANGEL+, dumb+, ANGEL-, dumb-
    If "best" and + strand only: argmax_(ANGEL+, dumb+)
    If "best" and - strand also: argmax_(ANGEL+, dumb+, ANGEL-, dumb-)

    For dumb, pick only the longest ORF out of the 3 possible frames for that strand.
    For ANGEL, if there are multiple ORFs (suspicious), the longest one is chosen as the "length" to beat dumb,
      and if ANGEL is chosen as output, all ORFs are output.

    ANGEL ORFs that start more than max_angel_secondORF_distance away from
    the end of the preceding kept ORF are dropped (across all frames on the
    + strand, per frame on the - strand).

    write_CDS_n_PEP is called once per record with append_file=True; its
    return value is fed back as starting_index for the next record.
    Nothing is returned.
    """
    dist = max_angel_secondORF_distance

    def _angel_coords(record, n, m):
        """Run ANGEL on <record>; return a list of (frame, type, start, end)."""
        flag, name, good = ORFscores.predict_ORF(record, bdt, o_all,
                                                 min_aa_len=min_ANGEL_aa_length)
        hits = []
        for _frame, _stop, _start in good:
            # a missing start means the ORF begins at the frame offset
            s = _start * 3 + _frame if _start is not None else _frame
            # a missing stop falls back to an end derived from the sequence length
            e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (_frame if m >= _frame else 0)
            hits.append((_frame, flag, s, e))
        return hits

    def _longest_span(result):
        """Largest (end - start) over every ORF of a frame --> ORF list dict."""
        return max(max(e - s for (_flag, s, e) in v) for v in result.values())

    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            ORFs = []
            # convert any non-ATCG to 'A'
            rec.seq = Seq(convert_non_ATCG(str(rec.seq), replace_with='A'))
            n, m = divmod(len(rec.seq), 3)

            print("predicting for", rec.id, file=sys.stderr)

            # (1a) predict on + strand for ANGEL
            stuff = _angel_coords(rec, n, m)
            # REGARDLESS OF FRAME, only keep the first ORF unless the later ones
            # overlap or is sufficiently close
            stuff.sort(key=lambda hit: (hit[2], hit[3] - hit[2]))  # sort by start, then length
            i = 1
            while i < len(stuff):
                prev_end = stuff[i - 1][3]
                if prev_end - dist <= stuff[i][2] <= prev_end + dist:
                    i += 1
                else:  # is too far, kick it!
                    stuff.pop(i)
            result = defaultdict(list)  # frame --> list of (type, start, end)
            for _frame, _flag, _start, _end in stuff:
                result[_frame].append((_flag, _start, _end))
            if len(result) > 0:
                ORFs.append((rec, result, '+'))

            # (1b) run dumb ORFs which returns the frame with longest ORF
            # as a dict frame -> (flag,s,e) or None
            dumb = DumbORF.predict_longest_ORFs(str(rec.seq).upper(), min_dumb_aa_length)
            if dumb is not None:
                ORFs.append((rec, dumb, '+'))

            # (2a) see if need to predict on - strand
            # if need to, create a rec2 that has the rev complement
            if use_rev_strand:
                rec2 = SeqRecord(rec.seq.reverse_complement(), id=rec.id,
                                 description=rec.description)
                result = defaultdict(list)  # frame --> list of (type, start, end)
                for _frame, _flag, s, e in _angel_coords(rec2, n, m):
                    assert s < e
                    result[_frame].append((_flag, s, e))

                # for each frame, only keep the first ORF unless the later ones
                # overlap or is sufficiently close
                for _frame in result:
                    stuff = result[_frame]
                    stuff.sort(key=lambda hit: (hit[1], hit[2] - hit[1]))  # sort by start, then length
                    i = 1
                    while i < len(stuff) and \
                            stuff[i - 1][2] - dist <= stuff[i][1] <= stuff[i - 1][2] + dist:
                        i += 1
                    # FIX: the original stored the untruncated list here, so the
                    # "too far, kick it" cut-off never removed anything.
                    # NOTE(review): unlike the + strand, this drops everything
                    # from the first too-far ORF onwards and works per frame --
                    # confirm the asymmetry is intended.
                    result[_frame] = stuff[:i]

                if len(result) > 0:
                    ORFs.append((rec, result, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

                dumb = DumbORF.predict_longest_ORFs(str(rec2.seq).upper(), min_dumb_aa_length)
                if dumb is not None:
                    ORFs.append((rec, dumb, '-'))

            # now decide what to output from ORFs
            # if output_mode:all, just output everything
            # if output_mode:best, pick the longest one
            if output_mode == 'best' and len(ORFs) > 0:
                # max() returns the first maximal entry, i.e. ties keep the earlier candidate
                ORFs = [max(ORFs, key=lambda orf: _longest_span(orf[1]))]

            print("writing result for", rec.id, "to", output_prefix, file=sys.stderr)
            starting_index = write_CDS_n_PEP(ORFs, output_prefix, min_utr_length=50,
                                             append_file=True, starting_index=starting_index)

    print("ALL DONE for", output_prefix, file=sys.stderr)
    # equivalent of `touch`, without building a shell command from the prefix
    done_filename = "{0}.DONE".format(output_prefix)
    with open(done_filename, 'a'):
        os.utime(done_filename, None)
def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100,
                      use_rev_strand=False, cpus=8):
    """
    1. Predict longest ORFs, write to <output_prefix>.cds|.utr|.pep
    2. Run CD-hit to get non-redundant set, then pick the top 500 for getting hexamer information,
       <output_prefix>.nr90.longest_500.cds
    3. Get base_freq out of <fasta_filename>, get hexamer scores out of (2)
    4. Score everything from (1) based on (3), write to <output_prefix>.cds.scores
    5. Output the FINAL <output_prefix>.final.cds|.utr|.pep based on the scores from (4)

    Raises subprocess.CalledProcessError if cd-hit exits non-zero.
    """
    sanity_check_cdhit()

    sys.stderr.write("predict longest ORFs....\n")
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            # str(seq) replaces Seq.tostring(), which newer Biopython removed
            seq = str(r.seq).upper()
            ORFs.append((r, predict_longest_ORFs(seq, min_aa_length), '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                # the original (forward) record is stored, tagged with '-'
                ORFs.append((r, predict_longest_ORFs(seq, min_aa_length), '-'))
    write_CDS_n_PEP(ORFs, output_prefix)

    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    # step 2. use CD-hit to remove redundancy, then pick out the 500 longest
    nr_filename = output_prefix + '.nr90.cds'
    # argument list (no shell) so prefixes with spaces/metacharacters are safe
    subprocess.check_call(['cd-hit', '-T', str(cpus),
                           '-i', output_prefix + '.cds',
                           '-o', nr_filename,
                           '-c', '0.90', '-n', '5'])

    with open(nr_filename) as handle:
        records = list(SeqIO.parse(handle, 'fasta'))
    # stable sort: equally long records keep their cd-hit output order
    records.sort(key=lambda rec: len(rec.seq), reverse=True)

    cds_nr_selected_filename = output_prefix + '.nr90.longest_500.cds'
    with open(cds_nr_selected_filename, 'w') as f:
        for r in records[:500]:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))

    sys.stderr.write("Longest 500 non-redundant predicted ORFs written to: {0}\n".format(
        cds_nr_selected_filename))

    # step 3. get base_freq & hexamer scores
    sys.stderr.write("Calculating base frequency from {0}\n".format(fasta_filename))
    base_freq = calculate_base_frequency(fasta_filename, fasta_filename + '.base_freq',
                                         use_rev_strand)
    sys.stderr.write("Calculating hexamer scores from {0}\n".format(cds_nr_selected_filename))
    log_scores = calculate_hexa_penta_score(cds_nr_selected_filename, base_freq,
                                            cds_nr_selected_filename + '.hexamer.scores')

    # step 4. score all predicted longest ORFs using log score
    sys.stderr.write("Scoring predicted ORFs....\n")
    scored_result = score_cds_by_likelihood(output_prefix + '.cds', log_scores)

    # step 5. output FINAL: an ORF is kept ONLY if its first score (frame0) is
    # positive and not beaten by any of its other scores (ties are accepted)
    picked_ids = [rec_seq_id for rec_seq_id, scores in scored_result.items()
                  if scores[0] > 0 and scores[0] == max(scores)]

    final_files = []
    for ext in ('.cds', '.utr', '.pep'):
        selective_write(output_prefix + ext, output_prefix + '.final' + ext, picked_ids)
        final_files.append(output_prefix + '.final' + ext)

    sys.stderr.write("Dumb ORF prediction done. Final output written to: {0}\n".format(
        " ".join(final_files)))
def ANGLE_predict_worker(input_fasta, output_prefix, bdt, o_all, min_ANGLE_aa_length=50,
                         min_dumb_aa_length=100, use_rev_strand=False, starting_index=1):
    """
    Predict ORFs for every record of <input_fasta> and write them all to the
    <output_prefix> output with a single write_CDS_n_PEP call.

    Per record:
      (1a) ANGLE prediction (ORFscores.predict_ORF) on the + strand.
      (1b) DumbORF.predict_longest_ORFs on the + strand, using as minimum
           length the longest ANGLE prediction (never below
           min_dumb_aa_length); added only if it returns something.
      (2)  If use_rev_strand, the same two steps on the reverse complement.

    Results of all records are accumulated in memory and written once after
    the loop (append_file=True, numbering from starting_index).
    Nothing is returned.
    """

    def _angle_hits(record, n, m):
        """
        Run ANGLE on <record>.

        Returns (result, longest): result maps frame --> list of
        (type, start, end); longest is the largest (end-start)//3 + 1 seen,
        never smaller than min_dumb_aa_length.
        """
        result = defaultdict(list)
        longest = min_dumb_aa_length
        flag, name, good = ORFscores.predict_ORF(record, bdt, o_all,
                                                 min_aa_len=min_ANGLE_aa_length)
        for _frame, _stop, _start in good:
            # a missing start means the ORF begins at the frame offset
            s = _start * 3 + _frame if _start is not None else _frame
            # a missing stop falls back to an end derived from the sequence length
            e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (_frame if m >= _frame else 0)
            result[_frame].append((flag, s, e))
            # explicit floor division keeps the Python 2 integer semantics
            longest = max(longest, (e - s) // 3 + 1)
        return result, longest

    ORFs = []
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            n, m = divmod(len(rec.seq), 3)

            sys.stderr.write("predicting for {0}\n".format(rec.id))

            # (1a) predict on + strand
            result, longest = _angle_hits(rec, n, m)
            ORFs.append((rec, result, '+'))

            # (1b) run dumb ORFs, if better than longest of ANGLE's output it as well
            dumb = DumbORF.predict_longest_ORFs(str(rec.seq).upper(), longest)
            if any(len(v) > 0 for v in dumb.values()):
                ORFs.append((rec, dumb, '+'))

            # (2a) see if need to predict on - strand
            # if need to, create a rec2 that has the rev complement
            if use_rev_strand:
                rec2 = SeqRecord(rec.seq.reverse_complement(), id=rec.id,
                                 description=rec.description)
                result, longest = _angle_hits(rec2, n, m)
                ORFs.append((rec, result, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

                dumb = DumbORF.predict_longest_ORFs(str(rec2.seq).upper(), longest)
                if any(len(v) > 0 for v in dumb.values()):
                    ORFs.append((rec, dumb, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

    write_CDS_n_PEP(ORFs, output_prefix, min_utr_length=50, append_file=True,
                    starting_index=starting_index)
def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100,
                      use_rev_strand=False, use_firstORF=False, cpus=8):
    """
    1. Predict longest ORFs, write to <output_prefix>.cds|.utr|.pep
    2. Run CD-hit to get non-redundant set, then pick the top 500 for getting hexamer information,
       <output_prefix>.nr90.longest_500.cds
    3. Get base_freq out of <fasta_filename>, get hexamer scores out of (2)
    4. Score everything from (1) based on (3), write to <output_prefix>.cds.scores
    5. Output the FINAL <output_prefix>.final.cds|.utr|.pep based on the scores from (4)

    If use_firstORF is True, steps 2-5 are skipped: the predictions from
    step 1 are written directly under <output_prefix>.final and the
    function returns.

    Raises subprocess.CalledProcessError if cd-hit exits non-zero.
    """
    sanity_check_cdhit()

    sys.stderr.write("predict longest ORFs....\n")
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            seq = str(r.seq).upper()
            # result is {best_frame: [(best_flag, best_s, best_e)]}
            result = predict_longest_ORFs(seq, min_aa_length, use_firstORF)
            if result is not None:
                ORFs.append((r, result, '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                result = predict_longest_ORFs(seq, min_aa_length, use_firstORF)
                if result is not None:
                    # the original (forward) record is stored, tagged with '-'
                    ORFs.append((r, result, '-'))

    final_files = [output_prefix + '.final' + ext for ext in ('.cds', '.utr', '.pep')]
    done_message = "Dumb ORF prediction done. Final output written to: {0}\n".format(
        " ".join(final_files))

    if use_firstORF:
        # no need to do scoring, just use firstORF
        write_CDS_n_PEP(ORFs, output_prefix + '.final')
        sys.stderr.write(done_message)
        return  # all done!

    # need to score, write this current one down first
    write_CDS_n_PEP(ORFs, output_prefix)

    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    # step 2. use CD-hit to remove redundancy, then pick out the 500 longest
    nr_filename = output_prefix + '.nr90.cds'
    # argument list (no shell) so prefixes with spaces/metacharacters are safe
    subprocess.check_call(['cd-hit', '-T', str(cpus), '-M', '0',
                           '-i', output_prefix + '.cds',
                           '-o', nr_filename,
                           '-c', '0.90', '-n', '5'])

    with open(nr_filename) as handle:
        records = list(SeqIO.parse(handle, 'fasta'))
    # stable sort: equally long records keep their cd-hit output order
    records.sort(key=lambda rec: len(rec.seq), reverse=True)

    cds_nr_selected_filename = output_prefix + '.nr90.longest_500.cds'
    with open(cds_nr_selected_filename, 'w') as f:
        for r in records[:500]:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))

    sys.stderr.write("Longest 500 non-redundant predicted ORFs written to: {0}\n".format(
        cds_nr_selected_filename))

    # step 3. get base_freq & hexamer scores
    sys.stderr.write("Calculating base frequency from {0}\n".format(fasta_filename))
    base_freq = calculate_base_frequency(fasta_filename, fasta_filename + '.base_freq',
                                         use_rev_strand)
    sys.stderr.write("Calculating hexamer scores from {0}\n".format(cds_nr_selected_filename))
    log_scores = calculate_hexa_penta_score(cds_nr_selected_filename, base_freq,
                                            cds_nr_selected_filename + '.hexamer.scores')

    # step 4. score all predicted longest ORFs using log score
    sys.stderr.write("Scoring predicted ORFs....\n")
    scored_result = score_cds_by_likelihood(output_prefix + '.cds', log_scores)

    # step 5. output FINAL: an ORF is kept ONLY if its first score (frame0) is
    # positive and not beaten by any of its other scores (ties are accepted)
    picked_ids = [rec_seq_id for rec_seq_id, scores in scored_result.items()
                  if scores[0] > 0 and scores[0] == max(scores)]

    for ext in ('.cds', '.utr', '.pep'):
        selective_write(output_prefix + ext, output_prefix + '.final' + ext, picked_ids)

    sys.stderr.write(done_message)