Example #1
0
def select_for_training(input_prefix,
                        output_prefix,
                        use_top=500,
                        choose_random=False,
                        cpus=8):
    """
    Select a subset of the non-redundant CDS records and write it out as a training set.

    <input_prefix>.cds|.pep|.utr, must exist! Probably the output from transdecoder_main()

    CD-HIT is run on <input_prefix>.cds to produce <input_prefix>.nr90.cds; the IDs
    picked from that file are then handed to selective_write() once for each of the
    .cds / .utr / .pep files, targeting <output_prefix>.cds|.utr|.pep.

    input_prefix --- prefix of the existing .cds/.utr/.pep files
    output_prefix --- prefix of the .cds/.utr/.pep files to write
    use_top --- number of top records to use
    choose_random --- if True, choose randomly instead of the longest top ones
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with a non-zero status.
    """
    nr_cds_filename = input_prefix + '.nr90.cds'

    print >> sys.stderr, "running CD-HIT to generate non-redundant set...."
    # Pass an argument list (no shell) so prefixes containing spaces or shell
    # metacharacters reach cd-hit untouched, exactly as open() below sees them.
    # -M 0 lifts cd-hit's memory limit; -c 0.90 / -n 5 are kept as before.
    cmd = ['cd-hit', '-T', str(cpus), '-M', '0',
           '-i', input_prefix + '.cds', '-o', nr_cds_filename,
           '-c', '0.90', '-n', '5']
    subprocess.check_call(cmd)

    # Close the handle deterministically instead of leaking it to the GC.
    with open(nr_cds_filename) as handle:
        lengths = [(len(r.seq), r.id) for r in SeqIO.parse(handle, 'fasta')]

    if not choose_random:
        print >> sys.stderr, "Selecting longest {0} entries from non-redundant set....".format(
            use_top)
        # Stable sort: records of equal length keep their file order.
        lengths.sort(key=lambda x: x[0], reverse=True)
        lengths = lengths[:use_top]
    else:
        print >> sys.stderr, "Selecting random {0} entries from non-redundant set....".format(
            use_top)
        # min() guards against asking random.sample for more items than exist.
        lengths = random.sample(lengths, min(len(lengths), use_top))

    picked_ids = [rec_seq_id for (_seq_len, rec_seq_id) in lengths]
    for ext in ('.cds', '.utr', '.pep'):
        selective_write(input_prefix + ext, output_prefix + ext, picked_ids)
Example #2
0
def select_for_training(input_prefix, output_prefix, use_top=500, choose_random=False, cpus=8):
    """
    Select a subset of the non-redundant CDS records and write it out as a training set.

    <input_prefix>.cds|.pep|.utr, must exist! Probably the output from transdecoder_main()

    CD-HIT is run on <input_prefix>.cds to produce <input_prefix>.nr90.cds; the IDs
    picked from that file are passed to selective_write() for the .cds, .utr and .pep
    files in turn, targeting <output_prefix>.cds|.utr|.pep.

    input_prefix --- prefix of the existing .cds/.utr/.pep files
    output_prefix --- prefix of the .cds/.utr/.pep files to write
    use_top --- number of top records to use
    choose_random --- if True, choose randomly instead of the longest top ones
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with a non-zero status.
    """
    cds_filename = input_prefix + '.cds'
    nr_cds_filename = input_prefix + '.nr90.cds'

    print >> sys.stderr, "running CD-HIT to generate non-redundant set...."
    # An argument list without a shell keeps unusual characters in the prefix
    # (spaces, quotes, ';', ...) from being split or interpreted.
    subprocess.check_call(['cd-hit', '-T', str(cpus),
                           '-i', cds_filename, '-o', nr_cds_filename,
                           '-c', '0.90', '-n', '5'])

    # Use a context manager so the FASTA handle is closed once parsing is done.
    with open(nr_cds_filename) as handle:
        lengths = [(len(r.seq), r.id) for r in SeqIO.parse(handle, 'fasta')]

    if not choose_random:
        print >> sys.stderr, "Selecting longest {0} entries from non-redundant set....".format(use_top)
        # Longest first; ties keep their original file order (stable sort).
        lengths.sort(key=lambda x: x[0], reverse=True)
        lengths = lengths[:use_top]
    else:
        print >> sys.stderr, "Selecting random {0} entries from non-redundant set....".format(use_top)
        # Never request more samples than there are records.
        lengths = random.sample(lengths, min(len(lengths), use_top))

    picked_ids = [rec_seq_id for (_seq_len, rec_seq_id) in lengths]
    selective_write(cds_filename, output_prefix + '.cds', picked_ids)
    selective_write(input_prefix + '.utr', output_prefix + '.utr', picked_ids)
    selective_write(input_prefix + '.pep', output_prefix + '.pep', picked_ids)
Example #3
0
def transdecoder_main(fasta_filename, output_prefix='dumb_orf', min_aa_length=100, use_rev_strand=False, cpus=8):
    """
    1. Predict longest ORFs, write to <output_prefix>.cds|.utr|.pep
    2. Run CD-hit to get non-redundant set, then pick the top 500 for getting hexamer information, <output_prefix>.nr90.longest_500.cds
    3. Get base_freq out of <fasta_filename>, get hexamer scores out of (2)
    4. Score everything from (1) based on (3), write to <output_prefix>.cds.scores
    5. Output the FINAL <output_prefix>.final.cds|.utr|.pep based on the scores from (4)

    fasta_filename --- input FASTA file to predict ORFs from
    output_prefix --- prefix for every file written by this function
    min_aa_length --- forwarded to predict_longest_ORFs()
    use_rev_strand --- if True, also predict on the reverse complement of each sequence
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with a non-zero status.
    """
    sanity_check_cdhit()

    print >> sys.stderr, "predict longest ORFs...."
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    # Context manager: the input handle is closed as soon as parsing finishes.
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            # str(seq) instead of seq.tostring(): the latter is gone from newer Biopython.
            seq = str(r.seq).upper()
            result = predict_longest_ORFs(seq, min_aa_length)
            ORFs.append((r, result, '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                result = predict_longest_ORFs(seq, min_aa_length)
                ORFs.append((r, result, '-'))
    write_CDS_n_PEP(ORFs, output_prefix)

    print >> sys.stderr, "running CD-HIT to generate non-redundant set...."
    # step 2. use CD-hit to remove redundancy, then pick out top <use_top>
    nr_cds_filename = output_prefix + '.nr90.cds'
    # Argument list, no shell: prefixes with spaces or shell metacharacters are
    # passed to cd-hit verbatim instead of being split or interpreted.
    cmd = ['cd-hit', '-T', str(cpus),
           '-i', output_prefix + '.cds', '-o', nr_cds_filename,
           '-c', '0.90', '-n', '5']
    subprocess.check_call(cmd)

    with open(nr_cds_filename) as handle:
        lengths = [(len(r.seq), r) for r in SeqIO.parse(handle, 'fasta')]
    # Longest first; the key avoids ever comparing the record objects themselves.
    lengths.sort(key=lambda x: x[0], reverse=True)
    lengths = lengths[:500]

    cds_nr_selected_filename = output_prefix + '.nr90.longest_500.cds'
    with open(cds_nr_selected_filename, 'w') as f:
        for _len, r in lengths:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))
    print >> sys.stderr, "Longest 500 non-redundant predicted ORFs written to:", cds_nr_selected_filename

    # step 3. get base_freq & hexamer scores
    print >> sys.stderr, "Calculating base frequency from", fasta_filename
    base_freq = calculate_base_frequency(fasta_filename, fasta_filename + '.base_freq', use_rev_strand)
    print >> sys.stderr, "Calculating hexamer scores from", cds_nr_selected_filename
    log_scores = calculate_hexa_penta_score(cds_nr_selected_filename, base_freq,
                                            cds_nr_selected_filename + '.hexamer.scores')

    # step 4. score all predicted longest ORFs using log score
    print >> sys.stderr, "Scoring predicted ORFs...."
    scored_result = score_cds_by_likelihood(output_prefix + '.cds', log_scores)

    # step 5. output FINAL, where longest ORFs are output ONLY if its score in frame0 is higher than all other 5
    # (i.e. scores[0] must be positive and equal to the maximum of the record's scores)
    picked_ids = [rec_seq_id
                  for rec_seq_id, scores in scored_result.iteritems()
                  if scores[0] > 0 and scores[0] == max(scores)]

    selective_write(output_prefix + '.cds', output_prefix + '.final.cds', picked_ids)
    selective_write(output_prefix + '.utr', output_prefix + '.final.utr', picked_ids)
    selective_write(output_prefix + '.pep', output_prefix + '.final.pep', picked_ids)

    print >> sys.stderr, "Dumb ORF prediction done. Final output written to:", output_prefix + '.final.cds', \
        output_prefix + '.final.utr', output_prefix + '.final.pep'
Example #4
0
def transdecoder_main(fasta_filename,
                      output_prefix='dumb_orf',
                      min_aa_length=100,
                      use_rev_strand=False,
                      use_firstORF=False,
                      cpus=8):
    """
    1. Predict longest ORFs, write to <output_prefix>.cds|.utr|.pep
    2. Run CD-hit to get non-redundant set, then pick the top 500 for getting hexamer information, <output_prefix>.nr90.longest_500.cds
    3. Get base_freq out of <fasta_filename>, get hexamer scores out of (2)
    4. Score everything from (1) based on (3), write to <output_prefix>.cds.scores
    5. Output the FINAL <output_prefix>.final.cds|.utr|.pep based on the scores from (4)

    If use_firstORF is True, steps 2-5 are skipped: the predictions from step 1 are
    written straight to <output_prefix>.final.cds|.utr|.pep and the function returns.

    fasta_filename --- input FASTA file to predict ORFs from
    output_prefix --- prefix for every file written by this function
    min_aa_length --- forwarded to predict_longest_ORFs()
    use_rev_strand --- if True, also predict on the reverse complement of each sequence
    use_firstORF --- forwarded to predict_longest_ORFs(); if True, no scoring is done
    cpus --- number of threads given to cd-hit (-T)

    Raises subprocess.CalledProcessError if cd-hit exits with a non-zero status.
    """
    sanity_check_cdhit()

    print >> sys.stderr, "predict longest ORFs...."
    # step 1. predict longest ORFs
    ORFs = []  # list of (record, result, strand)
    # Context manager: the input handle is closed as soon as parsing finishes.
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            seq = str(r.seq).upper()
            result = predict_longest_ORFs(
                seq, min_aa_length, use_firstORF
            )  # result is {best_frame: [(best_flag, best_s, best_e)]}
            if result is not None:
                ORFs.append((r, result, '+'))
            if use_rev_strand:  # predict on - strand as well
                seq = str(r.seq.reverse_complement()).upper()
                result = predict_longest_ORFs(seq, min_aa_length, use_firstORF)
                if result is not None:
                    ORFs.append((r, result, '-'))

    if use_firstORF:  # no need to do scoring, just use firstORF
        # simply find the first ORF in ORFs
        write_CDS_n_PEP(ORFs, output_prefix + '.final')
        print >> sys.stderr, "Dumb ORF prediction done. Final output written to:", output_prefix + '.final.cds', \
            output_prefix + '.final.utr', output_prefix + '.final.pep'
        return  # all done!

    # need to score, write this current one down first
    write_CDS_n_PEP(ORFs, output_prefix)

    print >> sys.stderr, "running CD-HIT to generate non-redundant set...."
    # step 2. use CD-hit to remove redundancy, then pick out top <use_top>
    nr_cds_filename = output_prefix + '.nr90.cds'
    # Argument list, no shell: prefixes with spaces or shell metacharacters are
    # passed to cd-hit verbatim instead of being split or interpreted.
    # -M 0 lifts cd-hit's memory limit; -c 0.90 / -n 5 are kept as before.
    cmd = ['cd-hit', '-T', str(cpus), '-M', '0',
           '-i', output_prefix + '.cds', '-o', nr_cds_filename,
           '-c', '0.90', '-n', '5']
    subprocess.check_call(cmd)

    with open(nr_cds_filename) as handle:
        lengths = [(len(r.seq), r) for r in SeqIO.parse(handle, 'fasta')]
    # Longest first; the key avoids ever comparing the record objects themselves.
    lengths.sort(key=lambda x: x[0], reverse=True)
    lengths = lengths[:500]

    cds_nr_selected_filename = output_prefix + '.nr90.longest_500.cds'
    with open(cds_nr_selected_filename, 'w') as f:
        for _len, r in lengths:
            # ex: r.description >PB.1.1|chr1:26227060-26232896(-)|c242/f4p4/976|m.1 type:complete len:150 strand:+ pos:80-529
            f.write(">{0}\n{1}\n".format(r.description, r.seq))
    print >> sys.stderr, "Longest 500 non-redundant predicted ORFs written to:", cds_nr_selected_filename

    # step 3. get base_freq & hexamer scores
    print >> sys.stderr, "Calculating base frequency from", fasta_filename
    base_freq = calculate_base_frequency(fasta_filename,
                                         fasta_filename + '.base_freq',
                                         use_rev_strand)
    print >> sys.stderr, "Calculating hexamer scores from", cds_nr_selected_filename
    log_scores = calculate_hexa_penta_score(
        cds_nr_selected_filename, base_freq,
        cds_nr_selected_filename + '.hexamer.scores')

    # step 4. score all predicted longest ORFs using log score
    print >> sys.stderr, "Scoring predicted ORFs...."
    scored_result = score_cds_by_likelihood(output_prefix + '.cds', log_scores)

    # step 5. output FINAL, where longest ORFs are output ONLY if its score in frame0 is higher than all other 5
    # (i.e. scores[0] must be positive and equal to the maximum of the record's scores)
    picked_ids = [rec_seq_id
                  for rec_seq_id, scores in scored_result.iteritems()
                  if scores[0] > 0 and scores[0] == max(scores)]

    selective_write(output_prefix + '.cds', output_prefix + '.final.cds',
                    picked_ids)
    selective_write(output_prefix + '.utr', output_prefix + '.final.utr',
                    picked_ids)
    selective_write(output_prefix + '.pep', output_prefix + '.final.pep',
                    picked_ids)

    print >> sys.stderr, "Dumb ORF prediction done. Final output written to:", output_prefix + '.final.cds', \
        output_prefix + '.final.utr', output_prefix + '.final.pep'