Esempio n. 1
0
def _predict_ORFs_on_strand(strand_rec, bdt, o_all, min_ANGEL_aa_length, min_dumb_aa_length):
    """Run the ANGEL classifier and the dumb ORF finder on one strand.

    :param strand_rec: record whose ``.seq`` is the strand to scan
    :param bdt: classifier handed through to ``ORFscores.predict_ORF``
    :param o_all: second model argument handed through to ``ORFscores.predict_ORF``
    :param min_ANGEL_aa_length: ``min_aa_len`` passed to ``ORFscores.predict_ORF``
    :param min_dumb_aa_length: starting length threshold for the dumb finder;
        it is raised to the longest ANGEL prediction before the dumb finder runs
    :return: ``(angel_result, dumb_result, longest_aa)`` where ``angel_result``
        maps frame --> list of (type, start, end), ``dumb_result`` is whatever
        ``DumbORF.predict_longest_ORFs`` returned, and ``longest_aa`` is the
        largest ``(end - start) // 3 + 1`` seen in either (never below
        ``min_dumb_aa_length``)
    """
    # n = number of complete codons in frame 0, m = leftover bases
    n, m = divmod(len(strand_rec.seq), 3)
    longest_aa = min_dumb_aa_length

    angel_result = defaultdict(list)  # frame --> list of (type, start, end)
    flag, _name, good = ORFscores.predict_ORF(strand_rec, bdt, o_all,
                                              min_aa_len=min_ANGEL_aa_length)
    for _frame, _stop, _start in good:
        # a missing start/stop means the ORF is open on that side
        s = _frame if _start is None else _start * 3 + _frame
        if _stop is None:
            # NOTE(review): when m < _frame this end (n*3) is not codon-aligned
            # with s -- confirm the downstream writer tolerates that.
            e = n * 3 + (_frame if m >= _frame else 0)
        else:
            e = _stop * 3 + _frame + 3
        angel_result[_frame].append((flag, s, e))
        # floor division keeps the threshold an int on Python 2 and 3 alike
        longest_aa = max(longest_aa, (e - s) // 3 + 1)

    # the dumb finder only has to beat the longest ANGEL prediction
    dumb_result = DumbORF.predict_longest_ORFs(str(strand_rec.seq).upper(), longest_aa)
    for spans in dumb_result.values():
        for _flag, _s, _e in spans:
            longest_aa = max(longest_aa, (_e - _s) // 3 + 1)

    return angel_result, dumb_result, longest_aa


def ANGEL_predict_worker(input_fasta, output_prefix, bdt, o_all, min_ANGEL_aa_length=50, min_dumb_aa_length=100, use_rev_strand=False, output_rev_only_if_longer=False, starting_index=1):
    """Predict ORFs for every record of a FASTA file and append them to the output.

    For each record the + strand is scanned with ANGEL and with the dumb ORF
    finder; with ``use_rev_strand`` the reverse complement is scanned as well.
    The collected predictions of each record are handed to ``write_CDS_n_PEP``
    (append mode), whose return value becomes the next ``starting_index``.

    :param input_fasta: path of the FASTA file to read
    :param output_prefix: output prefix passed to ``write_CDS_n_PEP``
    :param bdt: classifier passed to ``ORFscores.predict_ORF``
    :param o_all: second model argument passed to ``ORFscores.predict_ORF``
    :param min_ANGEL_aa_length: minimum ORF length (aa) for ANGEL predictions
    :param min_dumb_aa_length: minimum ORF length (aa) for dumb predictions
    :param use_rev_strand: also predict on the reverse complement
    :param output_rev_only_if_longer: if set, reverse-strand predictions must be
        at least as long as the longest forward-strand prediction
    :param starting_index: index passed to the first ``write_CDS_n_PEP`` call
    :return: None
    """
    # the context manager closes the FASTA handle even if a prediction raises
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            ORFs = []
            sys.stderr.write("predicting for {0}\n".format(rec.id))

            # (1) + strand: ANGEL is always reported, dumb only if it found something
            angel, dumb, longest_aa = _predict_ORFs_on_strand(
                rec, bdt, o_all, min_ANGEL_aa_length, min_dumb_aa_length)
            ORFs.append((rec, angel, '+'))
            if any(len(v) > 0 for v in dumb.values()):
                ORFs.append((rec, dumb, '+'))

            # (2) - strand, predicted on a reverse-complemented copy of the record
            if use_rev_strand:
                if output_rev_only_if_longer:
                    # reverse predictions must beat the forward strand's longest
                    min_dumb_aa_length_for_rev = longest_aa
                    min_ANGEL_aa_length_for_rev = max(longest_aa, min_ANGEL_aa_length)
                else:
                    min_dumb_aa_length_for_rev = min_dumb_aa_length
                    min_ANGEL_aa_length_for_rev = min_ANGEL_aa_length
                rec2 = SeqRecord(rec.seq.reverse_complement(), id=rec.id, description=rec.description)
                angel, dumb, _unused = _predict_ORFs_on_strand(
                    rec2, bdt, o_all, min_ANGEL_aa_length_for_rev, min_dumb_aa_length_for_rev)
                # NOTE: sending rec instead of rec2 here is CORRECT
                ORFs.append((rec, angel, '-'))
                if any(len(v) > 0 for v in dumb.values()):
                    ORFs.append((rec, dumb, '-'))

            starting_index = write_CDS_n_PEP(ORFs, output_prefix, min_utr_length=50,
                                             append_file=True, starting_index=starting_index)
Esempio n. 2
0
def _ANGEL_hits_to_spans(good, flag, seq_len):
    """Turn ANGEL (frame, stop, start) codon hits into (frame, flag, s, e) nucleotide spans."""
    # n = number of complete codons in frame 0, m = leftover bases
    n, m = divmod(seq_len, 3)
    spans = []
    for _frame, _stop, _start in good:
        # a missing start/stop means the ORF is open on that side
        s = _frame if _start is None else _start * 3 + _frame
        if _stop is None:
            # NOTE(review): when m < _frame this end (n*3) is not codon-aligned
            # with s -- confirm the downstream writer tolerates that.
            e = n * 3 + (_frame if m >= _frame else 0)
        else:
            e = _stop * 3 + _frame + 3
        spans.append((_frame, flag, s, e))
    return spans


def _keep_chained_ORFs(spans, max_distance):
    """Keep the first span plus every later span starting within max_distance of the last kept end."""
    ordered = sorted(spans, key=lambda span: (span[2], span[3] - span[2]))  # by start, then length
    kept = ordered[:1]
    for span in ordered[1:]:
        last_end = kept[-1][3]
        if last_end - max_distance <= span[2] <= last_end + max_distance:
            kept.append(span)
        # otherwise it is too far from the previously kept ORF: drop it
    return kept


def _longest_ORF_nt(orf_entry):
    """Return the longest (end - start) among all spans of one (rec, result, strand) entry."""
    _rec, result, _strand = orf_entry
    return max(max(e - s for (_flag, s, e) in v) for v in result.values())


def ANGEL_predict_worker(input_fasta,
                         output_prefix,
                         bdt,
                         o_all,
                         min_ANGEL_aa_length=50,
                         min_dumb_aa_length=100,
                         use_rev_strand=False,
                         output_mode='best',
                         max_angel_secondORF_distance=10,
                         starting_index=1):
    """
    Predict ORFs for every record in a FASTA file and append them to the output.

    Output Mode is either "best" or "all"

    If "all" and + strand only: ANGEL+, dumb+  (both subject to its length threshold)
    If "all" and - strand also: ANGEL+, dumb+, ANGEL-, dumb-

    If "best" and + strand only: argmax_(ANGEL+, dumb+)
    If "best" and - strand also: argmax_(ANGEL+, dumb+, ANGEL-, dumb-)

    For dumb, pick only the longest ORF out of the 3 possible frames for that strand.
    For ANGEL, if there are multiple ORFs (suspicious), the longest one is chosen as the
    "length" to beat dumb, and if ANGEL is chosen as output, all ORFs are output.

    Secondary ANGEL ORFs are only kept when they start within
    ``max_angel_secondORF_distance`` nucleotides of the end of the previously
    kept ORF. On the + strand this is evaluated across all frames together; on
    the - strand it is evaluated per frame.
    NOTE(review): confirm whether the - strand should also filter regardless of
    frame like the + strand does.

    Non-ATCG bases of each record are replaced with 'A' before prediction.
    When all records are processed, ``<output_prefix>.DONE`` is created (or its
    timestamp refreshed) as a completion marker.

    :param input_fasta: path of the FASTA file to read
    :param output_prefix: output prefix passed to ``write_CDS_n_PEP``
    :param bdt: classifier passed to ``ORFscores.predict_ORF``
    :param o_all: second model argument passed to ``ORFscores.predict_ORF``
    :param min_ANGEL_aa_length: ``min_aa_len`` for ANGEL predictions
    :param min_dumb_aa_length: length threshold passed to the dumb ORF finder
    :param use_rev_strand: also predict on the reverse complement
    :param output_mode: "best" keeps only the longest prediction set, anything
        else outputs everything collected
    :param max_angel_secondORF_distance: see above
    :param starting_index: index passed to the first ``write_CDS_n_PEP`` call
    :raises OSError: if the input cannot be opened or the marker cannot be written
    """
    # the context manager closes the FASTA handle even if a prediction raises
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            ORFs = []
            # convert any non-ATCG to 'A'
            rec.seq = Seq(convert_non_ATCG(str(rec.seq), replace_with='A'))
            seq_len = len(rec.seq)
            print("predicting for", rec.id, file=sys.stderr)

            # (1a) predict on + strand for ANGEL
            flag, _name, good = ORFscores.predict_ORF(
                rec, bdt, o_all, min_aa_len=min_ANGEL_aa_length)
            # REGARDLESS OF FRAME, only keep the first ORF and those chained to it
            result = defaultdict(list)  # frame --> list of (type, start, end)
            for _frame, _flag, _start, _end in _keep_chained_ORFs(
                    _ANGEL_hits_to_spans(good, flag, seq_len),
                    max_angel_secondORF_distance):
                result[_frame].append((_flag, _start, _end))
            if len(result) > 0:
                ORFs.append((rec, result, '+'))

            # (1b) run dumb ORFs, which returns None when nothing passes the threshold
            dumb = DumbORF.predict_longest_ORFs(
                str(rec.seq).upper(), min_dumb_aa_length)
            if dumb is not None:
                ORFs.append((rec, dumb, '+'))

            # (2) predict on - strand using a reverse-complemented copy (same length)
            if use_rev_strand:
                rec2 = SeqRecord(rec.seq.reverse_complement(),
                                 id=rec.id,
                                 description=rec.description)
                flag, _name, good = ORFscores.predict_ORF(
                    rec2, bdt, o_all, min_aa_len=min_ANGEL_aa_length)

                spans_by_frame = defaultdict(list)
                for span in _ANGEL_hits_to_spans(good, flag, seq_len):
                    assert span[2] < span[3]
                    spans_by_frame[span[0]].append(span)

                # for each frame, only keep the first ORF and those chained to it.
                # The previous loop broke out at the first distant ORF but then
                # stored the untrimmed list, so nothing was ever dropped.
                result = defaultdict(list)  # frame --> list of (type, start, end)
                for _frame, frame_spans in spans_by_frame.items():
                    result[_frame] = [
                        (_flag, _start, _end)
                        for (_f, _flag, _start, _end) in _keep_chained_ORFs(
                            frame_spans, max_angel_secondORF_distance)
                    ]

                if len(result) > 0:
                    # NOTE: sending rec instead of rec2 here is CORRECT
                    ORFs.append((rec, result, '-'))
                dumb = DumbORF.predict_longest_ORFs(
                    str(rec2.seq).upper(), min_dumb_aa_length)
                if dumb is not None:
                    ORFs.append((rec, dumb, '-'))

            # now decide what to output from ORFs
            # if output_mode:all, just output everything
            # if output_mode:best, pick the longest one (max() keeps the first on ties)
            if output_mode == 'best' and len(ORFs) > 0:
                ORFs = [max(ORFs, key=_longest_ORF_nt)]

            print("writing result for",
                  rec.id,
                  "to",
                  output_prefix,
                  file=sys.stderr)
            starting_index = write_CDS_n_PEP(ORFs,
                                             output_prefix,
                                             min_utr_length=50,
                                             append_file=True,
                                             starting_index=starting_index)

    print("ALL DONE for", output_prefix, file=sys.stderr)
    # Create/refresh the completion marker without going through a shell, so
    # prefixes containing spaces or shell metacharacters are handled literally.
    done_path = "{0}.DONE".format(output_prefix)
    with open(done_path, "a"):
        os.utime(done_path, None)
Esempio n. 3
0
def ANGLE_predict_worker(input_fasta,
                         output_prefix,
                         bdt,
                         o_all,
                         min_ANGLE_aa_length=50,
                         min_dumb_aa_length=100,
                         use_rev_strand=False,
                         starting_index=1):
    """Predict ORFs for every record of a FASTA file and write them in one go.

    Each record is scanned on the + strand (and on the reverse complement when
    ``use_rev_strand`` is set) with ANGLE and with the dumb ORF finder. The
    dumb finder only has to report ORFs at least as long as the longest ANGLE
    prediction on that strand (never below ``min_dumb_aa_length``). The
    predictions of all records are accumulated and passed to a single
    ``write_CDS_n_PEP`` call after the input has been read completely.

    :param input_fasta: path of the FASTA file to read
    :param output_prefix: output prefix passed to ``write_CDS_n_PEP``
    :param bdt: classifier passed to ``ORFscores.predict_ORF``
    :param o_all: second model argument passed to ``ORFscores.predict_ORF``
    :param min_ANGLE_aa_length: ``min_aa_len`` for ANGLE predictions
    :param min_dumb_aa_length: minimum ORF length (aa) for dumb predictions
    :param use_rev_strand: also predict on the reverse complement
    :param starting_index: index passed to ``write_CDS_n_PEP``
    :return: None
    """
    ORFs = []
    strands = ('+', '-') if use_rev_strand else ('+',)

    # the context manager closes the FASTA handle even if a prediction raises
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            # n = number of complete codons in frame 0, m = leftover bases;
            # identical for both strands because the length does not change
            n, m = divmod(len(rec.seq), 3)
            sys.stderr.write("predicting for {0}\n".format(rec.id))

            for strand in strands:
                if strand == '+':
                    strand_rec = rec
                else:
                    # - strand is predicted on a reverse-complemented copy
                    strand_rec = SeqRecord(rec.seq.reverse_complement(),
                                           id=rec.id,
                                           description=rec.description)

                # (a) ANGLE predictions for this strand
                result = defaultdict(list)  # frame --> list of (type, start, end)
                max_angle_predicted_orf_len = min_dumb_aa_length
                flag, _name, good = ORFscores.predict_ORF(
                    strand_rec, bdt, o_all, min_aa_len=min_ANGLE_aa_length)
                for _frame, _stop, _start in good:
                    # a missing start/stop means the ORF is open on that side
                    s = _frame if _start is None else _start * 3 + _frame
                    if _stop is None:
                        # NOTE(review): when m < _frame this end (n*3) is not
                        # codon-aligned with s -- confirm downstream tolerates it.
                        e = n * 3 + (_frame if m >= _frame else 0)
                    else:
                        e = _stop * 3 + _frame + 3
                    result[_frame].append((flag, s, e))
                    # floor division keeps the threshold an int on Python 2 and 3
                    max_angle_predicted_orf_len = max(
                        max_angle_predicted_orf_len, (e - s) // 3 + 1)
                # NOTE: the original record (not the reverse complement) is
                # stored for both strands; this is CORRECT
                ORFs.append((rec, result, strand))

                # (b) dumb ORFs, reported only if the finder returned something
                dumb = DumbORF.predict_longest_ORFs(
                    str(strand_rec.seq).upper(), max_angle_predicted_orf_len)
                if any(len(v) > 0 for v in dumb.values()):
                    ORFs.append((rec, dumb, strand))

    write_CDS_n_PEP(ORFs,
                    output_prefix,
                    min_utr_length=50,
                    append_file=True,
                    starting_index=starting_index)