def select_for_training(input_prefix, output_prefix, use_top=500,
                        choose_random=False, cpus=8):
    """
    Select a subset of non-redundant CDS records and copy them (plus the
    matching UTR/PEP records) to a new set of files.

    <input_prefix>.cds|.pep|.utr, must exist!
    Probably the output from transdecoder_main()

    cd-hit is run on <input_prefix>.cds (identity cutoff 0.90) and writes
    <input_prefix>.nr90.cds; the picked record ids are then passed to
    selective_write() to produce <output_prefix>.cds|.utr|.pep.

    input_prefix --- prefix of the existing .cds/.utr/.pep files
    output_prefix --- prefix of the .cds/.utr/.pep files to write
    use_top --- number of records to select
    choose_random --- if True, choose randomly instead of the longest top ones
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with non-zero status.
    """
    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    nr_filename = input_prefix + '.nr90.cds'
    # Argument list instead of a shell string: prefixes containing spaces or
    # shell metacharacters are passed to cd-hit verbatim.
    cmd = [
        "cd-hit",
        "-T", str(cpus),
        "-M", "0",
        "-i", input_prefix + '.cds',
        "-o", nr_filename,
        "-c", "0.90",
        "-n", "5",
    ]
    subprocess.check_call(cmd)

    # Close the handle deterministically instead of leaking it.
    with open(nr_filename) as handle:
        lengths = [(len(r.seq), r.id) for r in SeqIO.parse(handle, 'fasta')]

    if not choose_random:
        sys.stderr.write(
            "Selecting longest {0} entries from non-redundant set....\n".format(use_top))
        lengths.sort(key=lambda x: x[0], reverse=True)
        lengths = lengths[:use_top]
    else:
        sys.stderr.write(
            "Selecting random {0} entries from non-redundant set....\n".format(use_top))
        # Cap the sample size so random.sample() never asks for more than exist.
        lengths = random.sample(lengths, min(len(lengths), use_top))

    picked_ids = [rec_seq_id for (_seq_len, rec_seq_id) in lengths]
    for ext in ('.cds', '.utr', '.pep'):
        selective_write(input_prefix + ext, output_prefix + ext, picked_ids)
def select_for_training(input_prefix, output_prefix, use_top=500,
                        choose_random=False, cpus=8):
    """
    Select a subset of non-redundant CDS records and copy them (plus the
    matching UTR/PEP records) to a new set of files.

    <input_prefix>.cds|.pep|.utr, must exist!
    Probably the output from transdecoder_main()

    cd-hit is run on <input_prefix>.cds (identity cutoff 0.90) and writes
    <input_prefix>.nr90.cds; the picked record ids are then passed to
    selective_write() to produce <output_prefix>.cds|.utr|.pep.

    input_prefix --- prefix of the existing .cds/.utr/.pep files
    output_prefix --- prefix of the .cds/.utr/.pep files to write
    use_top --- number of records to select
    choose_random --- if True, choose randomly instead of the longest top ones
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with non-zero status.
    """
    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    nr_filename = input_prefix + '.nr90.cds'
    # Argument list instead of a shell string: prefixes containing spaces or
    # shell metacharacters are passed to cd-hit verbatim.
    cmd = [
        "cd-hit",
        "-T", str(cpus),
        "-i", input_prefix + '.cds',
        "-o", nr_filename,
        "-c", "0.90",
        "-n", "5",
    ]
    subprocess.check_call(cmd)

    # Close the handle deterministically instead of leaking it.
    with open(nr_filename) as handle:
        lengths = [(len(r.seq), r.id) for r in SeqIO.parse(handle, 'fasta')]

    if not choose_random:
        sys.stderr.write(
            "Selecting longest {0} entries from non-redundant set....\n".format(use_top))
        lengths.sort(key=lambda x: x[0], reverse=True)
        lengths = lengths[:use_top]
    else:
        sys.stderr.write(
            "Selecting random {0} entries from non-redundant set....\n".format(use_top))
        # Cap the sample size so random.sample() never asks for more than exist.
        lengths = random.sample(lengths, min(len(lengths), use_top))

    picked_ids = [rec_seq_id for (_seq_len, rec_seq_id) in lengths]
    for ext in ('.cds', '.utr', '.pep'):
        selective_write(input_prefix + ext, output_prefix + ext, picked_ids)
def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100,
                      use_rev_strand=False, cpus=8):
    """
    1. Predict longest ORFs, write to <output_prefix>.cds|.utr|.pep
    2. Run CD-hit to get non-redundant set, then pick the top 500 for getting
       hexamer information, <output_prefix>.nr90.longest_500.cds
    3. Get base_freq out of <fasta_filename>, get hexamer scores out of (2)
    4. Score everything from (1) based on (3), write to <output_prefix>.cds.scores
    5. Output the FINAL <output_prefix>.final.cds|.utr|.pep based on the scores from (4)

    fasta_filename --- input FASTA of sequences to predict ORFs on
    output_prefix --- prefix for every file written
    min_aa_length --- passed through to predict_longest_ORFs()
    use_rev_strand --- if True, also predict on the reverse complement
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with non-zero status.
    """
    sanity_check_cdhit()

    sys.stderr.write("predict longest ORFs....\n")
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            # str(seq) replaces Seq.tostring(), which newer Biopython no longer provides
            seq = str(r.seq).upper()
            result = predict_longest_ORFs(seq, min_aa_length)
            ORFs.append((r, result, '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                result = predict_longest_ORFs(seq, min_aa_length)
                ORFs.append((r, result, '-'))

    write_CDS_n_PEP(ORFs, output_prefix)

    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    # step 2. use CD-hit to remove redundancy, then pick out the longest 500
    nr_filename = output_prefix + '.nr90.cds'
    # Argument list instead of a shell string: prefixes containing spaces or
    # shell metacharacters are passed to cd-hit verbatim.
    cmd = [
        "cd-hit",
        "-T", str(cpus),
        "-i", output_prefix + '.cds',
        "-o", nr_filename,
        "-c", "0.90",
        "-n", "5",
    ]
    subprocess.check_call(cmd)

    with open(nr_filename) as handle:
        lengths = [(len(r.seq), r) for r in SeqIO.parse(handle, 'fasta')]
    # Sort on length only; the records themselves are never compared.
    lengths.sort(key=lambda x: x[0], reverse=True)
    lengths = lengths[:500]

    cds_nr_selected_filename = output_prefix + '.nr90.longest_500.cds'
    with open(cds_nr_selected_filename, 'w') as f:
        for _len, r in lengths:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))
    sys.stderr.write(
        "Longest 500 non-redundant predicted ORFs written to: {0}\n".format(
            cds_nr_selected_filename))

    # step 3. get base_freq & hexamer scores
    sys.stderr.write("Calculating base frequency from {0}\n".format(fasta_filename))
    base_freq = calculate_base_frequency(
        fasta_filename, fasta_filename + '.base_freq', use_rev_strand)
    sys.stderr.write(
        "Calculating hexamer scores from {0}\n".format(cds_nr_selected_filename))
    log_scores = calculate_hexa_penta_score(
        cds_nr_selected_filename, base_freq,
        cds_nr_selected_filename + '.hexamer.scores')

    # step 4. score all predicted longest ORFs using log score
    sys.stderr.write("Scoring predicted ORFs....\n")
    scored_result = score_cds_by_likelihood(output_prefix + '.cds', log_scores)

    # step 5. output FINAL: keep a record only if its first score (frame 0) is
    # positive and is the maximum among all of its scores
    picked_ids = [
        rec_seq_id
        for rec_seq_id, scores in scored_result.items()
        if scores[0] > 0 and scores[0] == max(scores)
    ]

    for ext in ('.cds', '.utr', '.pep'):
        selective_write(output_prefix + ext, output_prefix + '.final' + ext, picked_ids)

    sys.stderr.write(
        "Dumb ORF prediction done. Final output written to: "
        "{0}.final.cds {0}.final.utr {0}.final.pep\n".format(output_prefix))
def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100,
                      use_rev_strand=False, use_firstORF=False, cpus=8):
    """
    1. Predict longest ORFs, write to <output_prefix>.cds|.utr|.pep
    2. Run CD-hit to get non-redundant set, then pick the top 500 for getting
       hexamer information, <output_prefix>.nr90.longest_500.cds
    3. Get base_freq out of <fasta_filename>, get hexamer scores out of (2)
    4. Score everything from (1) based on (3), write to <output_prefix>.cds.scores
    5. Output the FINAL <output_prefix>.final.cds|.utr|.pep based on the scores from (4)

    If use_firstORF is True, steps 2-5 are skipped: the predictions from step 1
    are written directly to <output_prefix>.final.cds|.utr|.pep.

    fasta_filename --- input FASTA of sequences to predict ORFs on
    output_prefix --- prefix for every file written
    min_aa_length --- passed through to predict_longest_ORFs()
    use_rev_strand --- if True, also predict on the reverse complement
    use_firstORF --- passed through to predict_longest_ORFs(); also disables scoring
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with non-zero status.
    """
    sanity_check_cdhit()

    final_message = (
        "Dumb ORF prediction done. Final output written to: "
        "{0}.final.cds {0}.final.utr {0}.final.pep\n".format(output_prefix))

    sys.stderr.write("predict longest ORFs....\n")
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            seq = str(r.seq).upper()
            # result is {best_frame: [(best_flag, best_s, best_e)]}
            result = predict_longest_ORFs(seq, min_aa_length, use_firstORF)
            if result is not None:
                ORFs.append((r, result, '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                result = predict_longest_ORFs(seq, min_aa_length, use_firstORF)
                if result is not None:
                    ORFs.append((r, result, '-'))

    if use_firstORF:
        # no need to do scoring, just use firstORF
        # simply find the first ORF in ORFs
        write_CDS_n_PEP(ORFs, output_prefix + '.final')
        sys.stderr.write(final_message)
        return  # all done!

    # need to score, write this current one down first
    write_CDS_n_PEP(ORFs, output_prefix)

    sys.stderr.write("running CD-HIT to generate non-redundant set....\n")
    # step 2. use CD-hit to remove redundancy, then pick out the longest 500
    nr_filename = output_prefix + '.nr90.cds'
    # Argument list instead of a shell string: prefixes containing spaces or
    # shell metacharacters are passed to cd-hit verbatim.
    cmd = [
        "cd-hit",
        "-T", str(cpus),
        "-M", "0",
        "-i", output_prefix + '.cds',
        "-o", nr_filename,
        "-c", "0.90",
        "-n", "5",
    ]
    subprocess.check_call(cmd)

    with open(nr_filename) as handle:
        lengths = [(len(r.seq), r) for r in SeqIO.parse(handle, 'fasta')]
    # Sort on length only; the records themselves are never compared.
    lengths.sort(key=lambda x: x[0], reverse=True)
    lengths = lengths[:500]

    cds_nr_selected_filename = output_prefix + '.nr90.longest_500.cds'
    with open(cds_nr_selected_filename, 'w') as f:
        for _len, r in lengths:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))
    sys.stderr.write(
        "Longest 500 non-redundant predicted ORFs written to: {0}\n".format(
            cds_nr_selected_filename))

    # step 3. get base_freq & hexamer scores
    sys.stderr.write("Calculating base frequency from {0}\n".format(fasta_filename))
    base_freq = calculate_base_frequency(
        fasta_filename, fasta_filename + '.base_freq', use_rev_strand)
    sys.stderr.write(
        "Calculating hexamer scores from {0}\n".format(cds_nr_selected_filename))
    log_scores = calculate_hexa_penta_score(
        cds_nr_selected_filename, base_freq,
        cds_nr_selected_filename + '.hexamer.scores')

    # step 4. score all predicted longest ORFs using log score
    sys.stderr.write("Scoring predicted ORFs....\n")
    scored_result = score_cds_by_likelihood(output_prefix + '.cds', log_scores)

    # step 5. output FINAL: keep a record only if its first score (frame 0) is
    # positive and is the maximum among all of its scores
    picked_ids = [
        rec_seq_id
        for rec_seq_id, scores in scored_result.items()
        if scores[0] > 0 and scores[0] == max(scores)
    ]

    for ext in ('.cds', '.utr', '.pep'):
        selective_write(output_prefix + ext, output_prefix + '.final' + ext, picked_ids)

    sys.stderr.write(final_message)