Example #1
0
def ANGEL_predict_worker(input_fasta, output_prefix, bdt, o_all, min_ANGEL_aa_length=50, min_dumb_aa_length=100, use_rev_strand=False, output_rev_only_if_longer=False, starting_index=1):
    """
    Predict ORFs for every record of <input_fasta> and hand them, record by record, to write_CDS_n_PEP.

    For each record the ORFs are collected in this order:
      (1a) ORFscores.predict_ORF on the + strand
      (1b) DumbORF.predict_longest_ORFs on the + strand, with a minimum length equal to the longest
           ORF found in (1a) (never below <min_dumb_aa_length>)
      (2)  if <use_rev_strand>, the same two predictors on the reverse complement

    input_fasta --- path of the fasta file to read
    output_prefix --- passed through to write_CDS_n_PEP
    bdt, o_all --- passed through to ORFscores.predict_ORF (presumably the trained classifier and its
                   feature object --- confirm against the caller)
    min_ANGEL_aa_length --- min_aa_len given to ORFscores.predict_ORF
    min_dumb_aa_length --- lower bound of the minimum length given to DumbORF.predict_longest_ORFs
    use_rev_strand --- if True, also predict on the reverse complement
    output_rev_only_if_longer --- if True, the - strand thresholds are raised to the longest + strand prediction
    starting_index --- given to write_CDS_n_PEP for the first record; its return value is used for the next record

    Returns: None. Progress messages are written to stderr.
    """
    # 'with' guarantees the fasta handle is closed, also when a predictor raises
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            ORFs = []
            # '//' keeps the integer division on both Python 2 and Python 3
            n, m = len(rec.seq) // 3, len(rec.seq) % 3
            sys.stderr.write("predicting for {0}\n".format(rec.id))
            # (1a) predict on + strand
            result = defaultdict(list)  # frame --> list of (type, start, end)
            max_angle_predicted_orf_len = min_dumb_aa_length
            flag, _, good = ORFscores.predict_ORF(rec, bdt, o_all, min_aa_len=min_ANGEL_aa_length)
            for _frame, _stop, _start in good:
                # no start --> begin at the frame offset; no stop --> run to the end of the sequence
                s = _start * 3 + _frame if _start is not None else _frame
                # NOTE(review): when _frame > m this end is not in frame with s;
                # confirm whether n*3 + _frame - 3 was intended
                e = _stop * 3 + _frame + 3 if _stop is not None else n*3 + (_frame if m >= _frame else 0)
                result[_frame].append((flag, s, e))
                max_angle_predicted_orf_len = max(max_angle_predicted_orf_len, (e - s)//3 + 1)
            ORFs.append((rec, result, '+'))
            # (1b) run dumb ORFs, if better than longest of ANGEL's output it as well
            dumb = DumbORF.predict_longest_ORFs(str(rec.seq).upper(), max_angle_predicted_orf_len)

            if any(len(v) > 0 for v in dumb.values()):
                ORFs.append((rec, dumb, '+'))
                for v in dumb.values():
                    for _flag, _s, _e in v:
                        max_angle_predicted_orf_len = max(max_angle_predicted_orf_len, (_e - _s)//3 + 1)

            # (2a) see if need to predict on - strand
            #      if need to, create a rec2 that has the rev complement
            if use_rev_strand:
                if output_rev_only_if_longer:  # min aa length must be longer than the forward strand longest prediction
                    min_dumb_aa_length_for_rev = max_angle_predicted_orf_len
                    min_ANGEL_aa_length_for_rev = max(max_angle_predicted_orf_len, min_ANGEL_aa_length)
                else:
                    min_dumb_aa_length_for_rev = min_dumb_aa_length
                    min_ANGEL_aa_length_for_rev = min_ANGEL_aa_length
                rec2 = SeqRecord(rec.seq.reverse_complement(), id=rec.id, description=rec.description)
                result = defaultdict(list)  # frame --> list of (type, start, end)
                max_angle_predicted_orf_len = min_dumb_aa_length_for_rev
                flag, _, good = ORFscores.predict_ORF(rec2, bdt, o_all, min_aa_len=min_ANGEL_aa_length_for_rev)
                for _frame, _stop, _start in good:
                    # n and m are reused: the reverse complement has the same length
                    s = _start * 3 + _frame if _start is not None else _frame
                    e = _stop * 3 + _frame + 3 if _stop is not None else n*3 + (_frame if m >= _frame else 0)
                    result[_frame].append((flag, s, e))
                    max_angle_predicted_orf_len = max(max_angle_predicted_orf_len, (e - s)//3 + 1)
                ORFs.append((rec, result, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT
                dumb = DumbORF.predict_longest_ORFs(str(rec2.seq).upper(), max_angle_predicted_orf_len)
                if any(len(v) > 0 for v in dumb.values()):
                    ORFs.append((rec, dumb, '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

            starting_index = write_CDS_n_PEP(ORFs, output_prefix, min_utr_length=50, append_file=True, starting_index=starting_index)
Example #2
0
def add_data_worker(o_all, records, frames, queue):
    """
    For every record and every frame, compute ORFscores.make_data_smart and put the result on <queue>.

    o_all --- passed through to ORFscores.make_data_smart
    records --- iterable of objects with .seq and .id
    frames --- iterable of frame shifts, each passed as frame_shift
    queue --- object with a put() method; receives one item per (record, frame)

    Returns: None. Progress messages are written to stderr.
    """
    for rec in records:
        for i in frames:
            stuff = ORFscores.make_data_smart(rec.seq, o_all, frame_shift=i)
            # sys.stderr.write gives the same output as the old "print >>" form
            # and runs on both Python 2 and Python 3
            sys.stderr.write("putting into queue {0}\n".format(rec.id))
            queue.put(stuff)
            sys.stderr.write("done for  {0}\n".format(rec.id))
Example #3
0
def add_data_worker(o_all, records, frames, queue):
    """
    For every record and every frame, compute ORFscores.make_data_smart and put the result on <queue>.

    o_all --- passed through to ORFscores.make_data_smart
    records --- iterable of objects with .seq and .id
    frames --- iterable of frame shifts, each passed as frame_shift
    queue --- object with a put() method; receives one item per (record, frame)

    Returns: None. Progress messages are written to stderr.
    """
    for rec in records:
        for i in frames:
            stuff = ORFscores.make_data_smart(rec.seq, o_all, frame_shift=i)
            # sys.stderr.write gives the same output as the old "print >>" form
            # and runs on both Python 2 and Python 3
            sys.stderr.write("putting into queue {0}\n".format(rec.id))
            queue.put(stuff)
            sys.stderr.write("done for  {0}\n".format(rec.id))
    sys.stderr.write("Done with records\n")
Example #4
0
def add_data_worker(o_all, records, frames, queue):
    """
    For every record and every frame, compute ORFscores.make_data_smart and put the result on <queue>.

    o_all --- passed through to ORFscores.make_data_smart
    records --- iterable of objects with .seq and .id
    frames --- iterable of frame shifts, each passed as frame_shift
    queue --- object with a put() method; receives one item per (record, frame)

    Returns: None. Progress messages are written to stderr.
    """
    for rec in records:
        for i in frames:
            # sys.stderr.write gives the same output as the old "print >>" form
            # and runs on both Python 2 and Python 3
            sys.stderr.write("processing record {0}, frame {1}\n".format(rec.id, i))
            stuff = ORFscores.make_data_smart(rec.seq, o_all, frame_shift=i)
            sys.stderr.write("putting into queue {0}\n".format(rec.id))
            queue.put(stuff)
            sys.stderr.write("done for  {0}\n".format(rec.id))
    sys.stderr.write("Done with records\n")
Example #5
0
def ANGEL_training(cds_filename, utr_filename, output_pickle, num_workers=3, max_training_size=10000):
    """
    Train an AdaBoost classifier on CDS (positive) and UTR (negative) sequences and pickle it.

    cds_filename --- fasta file of coding sequences; features are taken from frame 0 only
    utr_filename --- fasta file of UTR sequences; features are taken from frames 0, 1 and 2
    output_pickle --- output path; receives a pickled dict {'bdt': classifier, 'o_all': feature object}
    num_workers --- passed through to get_data_parallel
    max_training_size --- keep at most this many positive and this many negative samples
                          (the first ones); None keeps everything

    Returns: (data, target, bdt) --- the training matrix as a numpy array, the list of 0/1 labels
             (0 = UTR, 1 = CDS) and the fitted classifier
    """
    # read each fasta inside 'with' so the file handles are closed right away
    with open(cds_filename) as handle:
        coding = list(SeqIO.parse(handle, 'fasta'))
    with open(utr_filename) as handle:
        utr = list(SeqIO.parse(handle, 'fasta'))

    o_all = ORFscores.CDSWindowFeat()
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    print("running get_data_parallel for coding, chunk 0", file=sys.stderr)
    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)

    print("size of neg training data: {0}, pos training data: {1}".format(
        len(data_neg), len(data_pos)), file=sys.stderr)

    if max_training_size is not None:
        # with the default of 10000 this prints the same "10,000" message as before
        print("using first {0:,} training pos/neg only".format(max_training_size),
              file=sys.stderr)
        data_neg = data_neg[:max_training_size]
        data_pos = data_pos[:max_training_size]

    # negatives first, then positives; target must follow the same order
    data = data_neg + data_pos
    target = [0] * len(data_neg) + [1] * len(data_pos)
    data = np.array(data)

    print("data prep done, running classifier....", file=sys.stderr)
    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print("classifier trained. putting pickle to",
          output_pickle,
          file=sys.stderr)

    with open(output_pickle, 'wb') as f:
        dump({'bdt': bdt, 'o_all': o_all}, f)

    return data, target, bdt
Example #6
0
def predict_longest_ORFs(seq, min_aa_length):
    """
    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length (in codons) for an ORF to be reported

    For each of the 3 frames, walk through the start/stop codon positions given by
    ORFscores.find_start_stop_codons (presumably codon indices within the frame --- they are
    multiplied by 3 and offset by the frame here) and report every ORF of at least <min_aa_length>:
      dumb-complete --- from the first start after the previous stop through the next stop codon
      dumb-5partial --- the frame has stops but no start: from the frame offset through the first stop
      dumb-3partial --- a start with no stop after it: runs to the last in-frame position of seq

    Returns: dict of <frame> --> list of (flag, <0-based start>, <1-based end>)
    NOTE that if the seq is reverse complemented, the handler function needs to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}

    # '//' keeps the integer division on both Python 2 and Python 3
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())
        # end of the last complete codon of this frame; when frame > m the
        # codon starting at n*3 + frame - 3 is the last one that fits, so the
        # end must be pulled back by one codon to stay in frame
        last_end = n * 3 + (frame if frame <= m else frame - 3)

        if len(stops) == 0:  # no stop, so just output first (start, last)
            if len(starts) > 0 and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[0] * 3 + frame, last_end))
        elif len(starts) == 0:  # 5' partial
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-5partial', frame, stops[0] * 3 + 3 + frame))
        else:  # has at least one start and one stop
            i, j = 0, 0
            while j < len(stops) and i < len(starts):
                # a stop upstream of the current start can never close it
                if starts[i] < stops[j] and \
                        stops[j] - starts[i] + 1 >= min_aa_length:
                    result[frame].append(
                        ('dumb-complete', starts[i] * 3 + frame,
                         stops[j] * 3 + 3 + frame))
                j += 1  # move stop one step down
                # move to the first start after the stop just passed
                while i < len(starts) and starts[i] < stops[j - 1]:
                    i += 1
            # check the very last possible ORF: a start after the last stop
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                result[frame].append(
                    ('dumb-3partial', starts[i] * 3 + frame, last_end))
    return result
Example #7
0
def predict_longest_ORFs(seq, min_aa_length):
    """
    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length (in codons) for an ORF to be reported

    For each of the 3 frames, walk through the start/stop codon positions given by
    ORFscores.find_start_stop_codons (presumably codon indices within the frame --- they are
    multiplied by 3 and offset by the frame here) and report every ORF of at least <min_aa_length>:
      dumb-complete --- from the first start after the previous stop through the next stop codon
      dumb-5partial --- the frame has stops but no start: from the frame offset through the first stop
      dumb-3partial --- a start with no stop after it: runs to the last in-frame position of seq

    Returns: dict of <frame> --> list of (flag, <0-based start>, <1-based end>)
    NOTE that if the seq is reverse complemented, the handler function needs to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    result = {0: [], 1: [], 2: []}

    # '//' keeps the integer division on both Python 2 and Python 3
    n, m = len(seq)//3, len(seq)%3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())
        # end of the last complete codon of this frame; when frame > m the codon starting at
        # n*3+frame-3 is the last one that fits, so the end is pulled back by one codon
        last_end = n*3 + (frame if frame <= m else frame-3)

        if len(stops) == 0: # no stop, so just output first (start, last)
            if len(starts) > 0 and n - starts[0] + 1 >= min_aa_length:
                result[frame].append(('dumb-3partial', starts[0]*3+frame, last_end))
        elif len(starts) == 0: # 5' partial
            if stops[0] + 1 >= min_aa_length:
                result[frame].append(('dumb-5partial', frame, stops[0]*3+3+frame))
        else: # has at least one start and one stop
            i, j = 0, 0
            while j < len(stops) and i < len(starts):
                # a stop upstream of the current start can never close it
                if starts[i] < stops[j] and stops[j] - starts[i] + 1 >= min_aa_length:
                    result[frame].append(('dumb-complete', starts[i]*3+frame, stops[j]*3+3+frame))
                j += 1 # move stop one step down
                # move to the first start after the stop just passed
                while i < len(starts) and starts[i] < stops[j-1]:
                    i += 1
            # check the very last possible ORF: a start after the last stop
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                result[frame].append(('dumb-3partial', starts[i]*3+frame, last_end))
    return result
Example #8
0
def ANGEL_predict_worker(input_fasta,
                         output_prefix,
                         bdt,
                         o_all,
                         min_ANGEL_aa_length=50,
                         min_dumb_aa_length=100,
                         use_rev_strand=False,
                         output_mode='best',
                         max_angel_secondORF_distance=10,
                         starting_index=1):
    """
    Predict ORFs for every record of <input_fasta>, write them through
    write_CDS_n_PEP and finally create the marker file <output_prefix>.DONE.

    Output Mode is either "best" or "all"

    If "all" and + strand only: ANGEL+, dumb+  (both subject to its length threshold)
    If "all" and - strand also: ANGEL+, dumb+, ANGEL-, dumb-

    If "best" and + strand only: argmax_(ANGEL+, dumb+)
    If "best" and - strand also: argmax_(ANGEL+, dumb+, ANGEL-, dumb-)

    For dumb, pick only the longest ORF out of the 3 possible frames for that strand.
    For ANGEL, if there are multiple ORFs (suspicious), the longest one is chosen as the "length" to beat dumb,
              and if ANGEL is chosen as output, all ORFs are output.

    input_fasta --- path of the fasta file to read
    output_prefix --- passed to write_CDS_n_PEP; also the prefix of the .DONE marker
    bdt, o_all --- passed through to ORFscores.predict_ORF (presumably the
                   trained classifier and its feature object --- confirm
                   against the caller)
    min_ANGEL_aa_length --- min_aa_len given to ORFscores.predict_ORF
    min_dumb_aa_length --- min length given to DumbORF.predict_longest_ORFs
    use_rev_strand --- if True, also predict on the reverse complement
    max_angel_secondORF_distance --- on the + strand, a later ANGEL ORF is kept
                   only if its start lies within this many bases of the end of
                   the ORF kept before it
    starting_index --- given to write_CDS_n_PEP for the first record; its
                   return value is used for the next record

    Returns: None. Raises IOError/OSError if the .DONE marker cannot be created.
    """

    def _longest_span(frame_to_orfs):
        # length (end - start) of the longest ORF in one frame --> ORF list dict
        return max(
            max(e - s for (_flag, s, e) in v) for v in frame_to_orfs.values())

    # 'with' guarantees the fasta handle is closed, also when a predictor raises
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            ORFs = []
            # convert any non-ATCG to 'A'
            rec.seq = Seq(convert_non_ATCG(str(rec.seq), replace_with='A'))
            seq_len = len(rec.seq)
            n, m = seq_len // 3, seq_len % 3
            print("predicting for", rec.id, file=sys.stderr)
            # (1a) predict on + strand for ANGEL
            stuff = []  # (frame, type, start, end)
            flag, _name, good = ORFscores.predict_ORF(
                rec, bdt, o_all, min_aa_len=min_ANGEL_aa_length)
            for _frame, _stop, _start in good:
                # no start --> begin at the frame offset;
                # no stop --> run to the end of the sequence
                s = _start * 3 + _frame if _start is not None else _frame
                # NOTE(review): when _frame > m this end is not in frame with
                # s; confirm whether n * 3 + _frame - 3 was intended
                e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (
                    _frame if m >= _frame else 0)
                stuff.append((_frame, flag, s, e))

            # REGARDLESS OF FRAME, only keep the first ORF unless the later ones overlap or is sufficiently close
            stuff.sort(key=lambda orf: (orf[2], orf[3] - orf[2])
                       )  # sort by start, then length
            i = 1
            while i < len(stuff):
                prev_end = stuff[i - 1][3]
                if prev_end - max_angel_secondORF_distance <= stuff[i][
                        2] <= prev_end + max_angel_secondORF_distance:
                    i += 1
                else:  # is too far, kick it!
                    stuff.pop(i)
            # group the surviving ORFs by frame
            result = defaultdict(list)  # frame --> list of (type, start, end)
            for _frame, _flag, _start, _end in stuff:
                result[_frame].append((_flag, _start, _end))

            if result:
                ORFs.append((rec, result, '+'))

            # (1b) run dumb ORFs which returns the frame with longest ORF as a dict frame -> (flag,s,e) or None
            dumb = DumbORF.predict_longest_ORFs(
                str(rec.seq).upper(), min_dumb_aa_length)
            if dumb is not None:
                ORFs.append((rec, dumb, '+'))

            # (2a) see if need to predict on - strand
            #      if need to, create a rec2 that has the rev complement
            if use_rev_strand:
                rec2 = SeqRecord(rec.seq.reverse_complement(),
                                 id=rec.id,
                                 description=rec.description)
                result = defaultdict(
                    list)  # frame --> list of (type, start, end)
                flag, _name, good = ORFscores.predict_ORF(
                    rec2, bdt, o_all, min_aa_len=min_ANGEL_aa_length)

                for _frame, _stop, _start in good:
                    # n and m are reused: rec2 has the same length as rec
                    s = _start * 3 + _frame if _start is not None else _frame
                    e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (
                        _frame if m >= _frame else 0)
                    assert s < e
                    result[_frame].append((flag, s, e))
                # for each frame, only keep the first ORF unless the later ones overlap or is sufficiently close
                # NOTE(review): unlike the + strand, 'break' leaves the
                # far-away ORFs in the list, so this loop only sorts each
                # frame's ORFs; confirm whether truncating to stuff[:i] was
                # intended. Behaviour is kept as it was.
                for _frame in result:
                    stuff = result[_frame]
                    stuff.sort(key=lambda orf: (orf[1], orf[2] - orf[1])
                               )  # sort by start, then length
                    i = 1
                    while i < len(stuff):
                        prev_end = stuff[i - 1][2]
                        if prev_end - max_angel_secondORF_distance <= stuff[
                                i][1] <= prev_end + max_angel_secondORF_distance:
                            i += 1
                        else:  # is too far, kick it!
                            break
                    result[_frame] = stuff

                if result:
                    ORFs.append(
                        (rec, result,
                         '-'))  # NOTE: sending rec instead of rec2 here is CORRECT
                dumb = DumbORF.predict_longest_ORFs(
                    str(rec2.seq).upper(), min_dumb_aa_length)
                if dumb is not None:
                    ORFs.append((rec, dumb, '-'))

            # now decide what to output from ORFs
            # if output_mode:all, just output everything
            # if output_mode:best, pick the longest one
            if output_mode == 'best' and ORFs:
                # max() returns the first of equally long entries, i.e. the
                # earlier prediction wins a tie
                ORFs = [max(ORFs, key=lambda entry: _longest_span(entry[1]))]
            print("writing result for",
                  rec.id,
                  "to",
                  output_prefix,
                  file=sys.stderr)
            starting_index = write_CDS_n_PEP(ORFs,
                                             output_prefix,
                                             min_utr_length=50,
                                             append_file=True,
                                             starting_index=starting_index)
    print("ALL DONE for", output_prefix, file=sys.stderr)
    # create/refresh the marker without a shell, so prefixes containing
    # spaces or shell metacharacters are handled correctly
    done_path = "{0}.DONE".format(output_prefix)
    with open(done_path, 'a'):
        os.utime(done_path, None)
Example #9
0
def ANGLE_predict_worker(input_fasta,
                         output_prefix,
                         bdt,
                         o_all,
                         min_ANGLE_aa_length=50,
                         min_dumb_aa_length=100,
                         use_rev_strand=False,
                         starting_index=1):
    """
    Predict ORFs for every record of <input_fasta>, then write all of them
    with a single call to write_CDS_n_PEP.

    For each record the ORFs are collected in this order:
      (1a) ORFscores.predict_ORF on the + strand
      (1b) DumbORF.predict_longest_ORFs on the + strand, with a minimum length
           equal to the longest ORF found in (1a) (never below
           <min_dumb_aa_length>)
      (2)  if <use_rev_strand>, the same two predictors on the reverse
           complement

    input_fasta --- path of the fasta file to read
    output_prefix --- passed through to write_CDS_n_PEP
    bdt, o_all --- passed through to ORFscores.predict_ORF (presumably the
                   trained classifier and its feature object --- confirm
                   against the caller)
    min_ANGLE_aa_length --- min_aa_len given to ORFscores.predict_ORF
    min_dumb_aa_length --- lower bound of the minimum length given to
                   DumbORF.predict_longest_ORFs
    use_rev_strand --- if True, also predict on the reverse complement
    starting_index --- passed through to write_CDS_n_PEP

    Returns: None. Progress messages are written to stderr.
    """

    ORFs = []
    # 'with' guarantees the fasta handle is closed, also when a predictor raises
    with open(input_fasta) as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            # '//' keeps the integer division on both Python 2 and Python 3
            n, m = len(rec.seq) // 3, len(rec.seq) % 3
            sys.stderr.write("predicting for {0}\n".format(rec.id))
            # (1a) predict on + strand
            result = defaultdict(
                list)  # frame --> list of (type, start, end)
            max_angle_predicted_orf_len = min_dumb_aa_length
            flag, _, good = ORFscores.predict_ORF(
                rec, bdt, o_all, min_aa_len=min_ANGLE_aa_length)
            for _frame, _stop, _start in good:
                # no start --> begin at the frame offset;
                # no stop --> run to the end of the sequence
                s = _start * 3 + _frame if _start is not None else _frame
                # NOTE(review): when _frame > m this end is not in frame with
                # s; confirm whether n * 3 + _frame - 3 was intended
                e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (
                    _frame if m >= _frame else 0)
                result[_frame].append((flag, s, e))
                max_angle_predicted_orf_len = max(max_angle_predicted_orf_len,
                                                  (e - s) // 3 + 1)
            ORFs.append((rec, result, '+'))
            # (1b) run dumb ORFs, if better than longest of ANGLE's output it as well
            dumb = DumbORF.predict_longest_ORFs(
                str(rec.seq).upper(), max_angle_predicted_orf_len)

            if any(len(v) > 0 for v in dumb.values()):
                ORFs.append((rec, dumb, '+'))
            # (2a) see if need to predict on - strand
            #      if need to, create a rec2 that has the rev complement
            if use_rev_strand:
                rec2 = SeqRecord(rec.seq.reverse_complement(),
                                 id=rec.id,
                                 description=rec.description)
                result = defaultdict(
                    list)  # frame --> list of (type, start, end)
                max_angle_predicted_orf_len = min_dumb_aa_length
                flag, _, good = ORFscores.predict_ORF(
                    rec2, bdt, o_all, min_aa_len=min_ANGLE_aa_length)
                for _frame, _stop, _start in good:
                    # n and m are reused: rec2 has the same length as rec
                    s = _start * 3 + _frame if _start is not None else _frame
                    e = _stop * 3 + _frame + 3 if _stop is not None else n * 3 + (
                        _frame if m >= _frame else 0)
                    result[_frame].append((flag, s, e))
                    max_angle_predicted_orf_len = max(
                        max_angle_predicted_orf_len, (e - s) // 3 + 1)
                ORFs.append(
                    (rec, result,
                     '-'))  # NOTE: sending rec instead of rec2 here is CORRECT
                dumb = DumbORF.predict_longest_ORFs(
                    str(rec2.seq).upper(), max_angle_predicted_orf_len)
                if any(len(v) > 0 for v in dumb.values()):
                    ORFs.append(
                        (rec, dumb,
                         '-'))  # NOTE: sending rec instead of rec2 here is CORRECT

    write_CDS_n_PEP(ORFs,
                    output_prefix,
                    min_utr_length=50,
                    append_file=True,
                    starting_index=starting_index)
Example #10
0
def predict_longest_ORFs(seq, min_aa_length, use_firstORF=False):
    """
    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length (in codons) for an ORF to be considered
    use_firstORF --- if True, return the ORF with the smallest start instead of the longest one

    Return the longest ORF that is at least <min_aa_length> long (unless use_firstORF is True)

    Candidates are found per frame from the start/stop codon positions given by
    ORFscores.find_start_stop_codons (presumably codon indices within the frame --- they are
    multiplied by 3 and offset by the frame here):
      dumb-complete --- from the first start after the previous stop through the next stop codon
      dumb-5partial --- the frame has stops but no start: from the frame offset through the first stop
      dumb-3partial --- a start with no stop after it: runs to the last in-frame position of seq

    Returns: dict of <frame> --> list with the single chosen (flag, <0-based start>, <1-based end>),
             or None if no ORF was found
    NOTE that if the seq is reverse complemented, the handler function needs to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    candidates = []  # (frame, flag, start, end), frame by frame

    # '//' keeps the integer division on both Python 2 and Python 3
    n, m = len(seq) // 3, len(seq) % 3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())
        # end of the last complete codon of this frame
        last_end = n * 3 + (frame if frame <= m else frame - 3)

        if len(stops) == 0:  # no stop, so just output first (start, last)
            if len(starts) > 0 and n - starts[0] + 1 >= min_aa_length:
                candidates.append(
                    (frame, 'dumb-3partial', starts[0] * 3 + frame, last_end))
        elif len(starts) == 0:  # 5' partial
            if stops[0] + 1 >= min_aa_length:
                candidates.append(
                    (frame, 'dumb-5partial', frame, stops[0] * 3 + 3 + frame))
        else:  # has at least one start and one stop
            i, j = 0, 0
            # skip the stops that lie before the first start, so that
            # stops[j-1] < starts[0] < stops[j]
            # (indexing stops[j - 1] with j == 0 would wrap to the last stop)
            while j < len(stops) and stops[j] < starts[0]:
                j += 1
            while j < len(stops) and i < len(starts):
                # a stop upstream of the current start can never close it
                if starts[i] < stops[j] and \
                        stops[j] - starts[i] + 1 >= min_aa_length:
                    candidates.append(
                        (frame, 'dumb-complete', starts[i] * 3 + frame,
                         stops[j] * 3 + 3 + frame))
                j += 1  # move stop one step down
                while i < len(starts) and starts[i] < stops[j - 1]:
                    i += 1
                # now starts[i] is between the last stop and this one
            # check the very last possible ORF: a start after the last stop
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                candidates.append(
                    (frame, 'dumb-3partial', starts[i] * 3 + frame, last_end))

    if not candidates:  # no ORF found
        return None

    # min()/max() return the first of equal candidates, i.e. the lowest frame
    # wins a tie, as it did with the explicit loops over frames 0, 1, 2
    if use_firstORF:  # smallest start, longer ORF first on equal start
        best = min(candidates, key=lambda c: (c[2], c[2] - c[3]))
    else:  # the longest ORF among all frames
        best = max(candidates, key=lambda c: c[3] - c[2])

    best_frame, best_flag, best_s, best_e = best
    return {best_frame: [(best_flag, best_s, best_e)]}
Example #11
0
def predict_longest_ORFs(seq, min_aa_length, use_firstORF=False):
    """
    seq --- should be plain string in all upper case, A/T/C/G
    min_aa_length --- minimum ORF length (in codons) for an ORF to be considered
    use_firstORF --- if True, return the ORF with the smallest start instead of the longest one

    Return the longest ORF that is at least <min_aa_length> long (unless use_firstORF is True)

    Candidates are found per frame from the start/stop codon positions given by
    ORFscores.find_start_stop_codons (presumably codon indices within the frame --- they are
    multiplied by 3 and offset by the frame here):
      dumb-complete --- from the first start after the previous stop through the next stop codon
      dumb-5partial --- the frame has stops but no start: from the frame offset through the first stop
      dumb-3partial --- a start with no stop after it: runs to the last in-frame position of seq

    Returns: dict of <frame> --> list with the single chosen (flag, <0-based start>, <1-based end>),
             or None if no ORF was found
    NOTE that if the seq is reverse complemented, the handler function needs to rev the coords on its own
    """
    start_d, stop_d = ORFscores.find_start_stop_codons(seq)
    candidates = [] # (frame, flag, start, end), frame by frame

    # '//' keeps the integer division on both Python 2 and Python 3
    n, m = len(seq)//3, len(seq)%3

    for frame in range(3):
        starts = sorted(start_d[frame].keys())
        stops = sorted(stop_d[frame].keys())
        # end of the last complete codon of this frame
        last_end = n*3 + (frame if frame <= m else frame-3)

        if len(stops) == 0: # no stop, so just output first (start, last)
            if len(starts) > 0 and n - starts[0] + 1 >= min_aa_length:
                candidates.append((frame, 'dumb-3partial', starts[0]*3+frame, last_end))
        elif len(starts) == 0: # 5' partial
            if stops[0] + 1 >= min_aa_length:
                candidates.append((frame, 'dumb-5partial', frame, stops[0]*3+3+frame))
        else: # has at least one start and one stop
            i, j = 0, 0
            # skip the stops that lie before the first start, so that stops[j-1] < starts[0] < stops[j]
            # (indexing stops[j-1] with j == 0 would wrap to the last stop)
            while j < len(stops) and stops[j] < starts[0]:
                j += 1
            while j < len(stops) and i < len(starts):
                # a stop upstream of the current start can never close it
                if starts[i] < stops[j] and stops[j] - starts[i] + 1 >= min_aa_length:
                    candidates.append((frame, 'dumb-complete', starts[i]*3+frame, stops[j]*3+3+frame))
                j += 1 # move stop one step down
                while i < len(starts) and starts[i] < stops[j-1]:
                    i += 1
                # now starts[i] is between the last stop and this one
            # check the very last possible ORF: a start after the last stop
            if i < len(starts) and n - starts[i] + 1 >= min_aa_length:
                candidates.append((frame, 'dumb-3partial', starts[i]*3+frame, last_end))

    if not candidates: # no ORF found
        return None

    # min()/max() return the first of equal candidates, i.e. the lowest frame wins a tie,
    # as it did with the explicit loops over frames 0, 1, 2
    if use_firstORF: # smallest start, longer ORF first on equal start
        best = min(candidates, key=lambda c: (c[2], c[2] - c[3]))
    else: # the longest ORF among all frames
        best = max(candidates, key=lambda c: c[3] - c[2])

    best_frame, best_flag, best_s, best_e = best
    return {best_frame: [(best_flag, best_s, best_e)]}