def get_all_orfs(seqrecord, frame):
    """
    Generic ORF finder: return every ORF candidate found in one reading
    frame of `seqrecord`.

    Codons come from `get_codons(seqrecord.seq, frame)`, which is
    expected to yield tuples of the form (codon, orf position, query
    position). Each candidate is appended to a SeqRanges collection as
    a "+" strand SeqRange covering Range(start query position, query
    position of the closing codon + 2), with a data dict holding the
    boolean flags "no_start" and "no_stop".

    Earlier versions did not allow for overlapping ORFs, since the
    approach was to always take the 5'-most start codon. For example,
    given:

        |-----M---M---M----------------*----|

    earlier methods would give a single ORF candidate:

              |----------ORF-----------|
        |-----M---M---M----------------*----|

    This version keeps every open candidate in a queue, so overlapping
    ORFs are all reported:

                      |-----ORF3-------|
                  |---------ORF2-------|
              |-------------ORF1-------|
        |-- open ended case -----------|
        |-----M---M---M----------------*----|

    Save for annotating them differently, this function does not ignore
    5'-incomplete or 3'-incomplete cases. In other words, we assume we
    are in an open reading frame from the start, which lets us
    enumerate every biologically possible ORF.

    Notes on the output:
      * Candidates closed by a stop codon are emitted oldest start
        first; candidates still open at the end of the sequence are
        emitted newest start first and flagged with "no_stop".
      * If the very first codon is a start codon, both the open-ended
        candidate ("no_start" True) and the start-codon candidate
        ("no_start" False) are reported over the same range.
      * If the frame contains no codons at all, an empty SeqRanges is
        returned.
    """
    seq = seqrecord.seq
    seqname = seqrecord.id
    seqlength = len(seq)

    # collector for the final ORFs
    all_orfs = SeqRanges()

    # To keep many candidate reading frames open at once we use a
    # queue. Tuples maintain key data:
    # (start orf position, start query position, whether had start codon)
    orf_queue = deque()

    # get all codons in frame
    codons = get_codons(seq, frame)
    if not codons:
        # Sequence too short to hold a single codon in this frame:
        # there is nothing to enumerate (and codons[0] would fail).
        return all_orfs

    # Push the case that we are in a reading frame from the start onto
    # the queue, but only if the first codon is not a stop codon. The
    # codon is upper-cased here so this check agrees with the
    # case-insensitive comparison used in the main loop below.
    first_codon, first_orf_pos, first_query_pos = codons[0]
    if first_codon.upper() not in STOP_CODONS:
        # Note what we are adding here: the query position is the query
        # position *in frame*. So even if the ORF is open-ended, it
        # does not start from the beginning of the sequence, but from
        # the beginning of the sequence in frame.
        orf_queue.append((first_orf_pos, first_query_pos, False))

    for codon, orf_pos, query_pos in codons:
        codon = codon.upper()
        if codon in START_CODONS:
            orf_queue.append((orf_pos, query_pos, True))
            continue
        if codon in STOP_CODONS:
            # A stop codon closes every candidate currently open: pop
            # everything off the queue (oldest first) and record it.
            while orf_queue:
                _, query_start_pos, had_start = orf_queue.popleft()
                orf_data = {"no_start": not had_start, "no_stop": False}
                orf = SeqRange(Range(query_start_pos, query_pos+2),
                               seqname, "+",
                               seqlength=seqlength, data=orf_data)
                all_orfs.append(orf)

    # Iteration complete. Anything still in the queue never met a stop
    # codon, so it is recorded as incomplete. `query_pos` still holds
    # the position of the last codon seen by the loop above, and items
    # are taken from the right end of the queue (newest start first).
    while orf_queue:
        _, query_start_pos, had_start = orf_queue.pop()
        orf_data = {"no_start": not had_start, "no_stop": True}
        orf = SeqRange(Range(query_start_pos, query_pos+2),
                       seqname, "+",
                       seqlength=seqlength, data=orf_data)
        all_orfs.append(orf)
    return all_orfs