def common_ancestor(self, targets, *more_targets):
    """Most recent common ancestor (clade) of all the given targets.

    Edge cases:
    - If no target is given, returns self.root
    - If 1 target is given, returns the target
    - If any target is not found in this tree, raises a ValueError
    """
    # combine the arguments first so validation sees every target,
    # not just the first positional argument
    targets = _combine_args(targets, *more_targets)
    paths = [self.get_path(t) for t in targets]
    # Validation -- otherwise zip() below fails in a confusing way
    for p, t in zip(paths, targets):
        if p is None:
            raise ValueError("target %s is not in this tree" % repr(t))
    mrca = self.root
    for level in zip(*paths):
        ref = level[0]
        for other in level[1:]:
            if ref is not other:
                break
        else:
            mrca = ref
        if ref is not mrca:
            break
    return mrca
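# A minimal usage sketch for common_ancestor (hedged: assumes Bio.Phylo
# and a hypothetical Newick file "example.nwk" with tips named "A"-"D").
from Bio import Phylo

tree = Phylo.read("example.nwk", "newick")
mrca = tree.common_ancestor("A", "B")  # clade spanning both tips
same = tree.common_ancestor("A")       # single target returns the target
root = tree.common_ancestor([])        # no targets returns tree.root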
from itertools import chain


def _get_inter_coords(coords, strand=1):
    """From the given pairs of coordinates, return a list of pairs
    covering the intervening ranges."""
    # adapted from the Python itertools recipes
    # if strand is -1, adjust coords so that the ends and starts are chained
    if strand == -1:
        sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
        inter_coords = list(chain(*sorted_coords))[1:-1]
        return list(zip(inter_coords[1::2], inter_coords[::2]))
    else:
        inter_coords = list(chain(*coords))[1:-1]
        return list(zip(inter_coords[::2], inter_coords[1::2]))
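# Quick sanity check of _get_inter_coords (values illustrative): two
# plus-strand blocks (0, 10) and (15, 35) leave the gap (10, 15); the
# same blocks expressed in minus-strand order give the same gap.
print(_get_inter_coords([(0, 10), (15, 35)]))             # [(10, 15)]
print(_get_inter_coords([(35, 15), (10, 0)], strand=-1))  # [(10, 15)]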
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    Arguments:
    starts -- List of integers, start coordinates
              (the 'Q start' or 'T start' column).
    blksizes -- List of integers, block sizes.
    seqlen -- Integer of total sequence length.
    strand -- Integer denoting sequence strand.
    """
    assert len(starts) == len(blksizes), \
        "Unequal start coordinates and block sizes list (%r vs %r)" \
        % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
    else:
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [seqlen - start - blksize
                for start, blksize in zip(starts, blksizes)]
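# Worked example of the strand flip (values illustrative): on a
# 100-base minus-strand sequence, a block starting at 10 with size 20
# maps to plus-strand start 100 - 10 - 20 = 70.
print(_reorient_starts([10, 50], [20, 5], 100, -1))  # [70, 45]
print(_reorient_starts([10, 50], [20, 5], 100, 1))   # unchanged: [10, 50]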
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Returns the length of introns between fragments."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT}  >>>> Target Intron 1 >>>>  {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = (int(parsed_len[0]) if opp_type == 'query'
                              else int(parsed_len[1]))
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
            else:
                raise ValueError("Unexpected intron parsing "
                                 "result: %r" % parsed_len)
        else:
            intron_len = 0
        inter_lens.append(intron_len)
    return inter_lens
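# A hedged sketch of _comp_intron_lens with made-up parser output: one
# inter-block region marked as an intron on the query line (with a
# single parsed length of 250) followed by a plain, intron-less gap.
inter_blocks = [
    {'query': '>>>> Target Intron 1 >>>>', 'hit': '....'},
    {'query': '....', 'hit': '....'},
]
raw_inter_lens = [('', '', '250'), ('', '', '')]
print(_comp_intron_lens('query', inter_blocks, raw_inter_lens))  # [250, 0]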
def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
    """Create a randomized bifurcating tree given a list of taxa.

    :param taxa: Either an integer specifying the number of taxa to
        create (automatically named taxon#), or an iterable of taxon
        names, as strings.

    :returns: a tree of the same type as this class.
    """
    if isinstance(taxa, int):
        taxa = ['taxon%s' % (i + 1) for i in range(taxa)]
    elif hasattr(taxa, '__iter__'):
        taxa = list(taxa)
    else:
        raise TypeError("taxa argument must be integer (# taxa) or "
                        "iterable of taxon names.")
    rtree = cls()
    terminals = [rtree.root]
    while len(terminals) < len(taxa):
        newsplit = random.choice(terminals)
        newsplit.split(branch_length=branch_length)
        newterms = newsplit.clades
        if branch_stdev:
            # Add some noise to the branch lengths
            for nt in newterms:
                nt.branch_length = max(
                    0, random.gauss(branch_length, branch_stdev))
        terminals.remove(newsplit)
        terminals.extend(newterms)
    # Distribute taxon labels randomly
    random.shuffle(taxa)
    for node, name in zip(terminals, taxa):
        node.name = name
    return rtree
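# Usage sketch, assuming randomized is bound as a classmethod on
# Bio.Phylo.BaseTree.Tree (as its cls argument suggests):
from Bio.Phylo import BaseTree

tree = BaseTree.Tree.randomized(5)                 # tips taxon1..taxon5
named = BaseTree.Tree.randomized(["A", "B", "C"])  # explicit tip names
noisy = BaseTree.Tree.randomized(8, branch_length=1.0, branch_stdev=0.2)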
def __init__(self, line=None):
    """Parse a tag/value line (e.g. an ACE 'DS' line) into attributes."""
    self.chromat_file = ''
    self.phd_file = ''
    self.time = ''
    self.chem = ''
    self.dye = ''
    self.template = ''
    self.direction = ''
    if line:
        tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE',
                'TEMPLATE', 'DIRECTION']
        poss = [line.find(x) for x in tags]
        tagpos = dict(zip(poss, tags))
        # drop tags that were not found on the line
        if -1 in tagpos:
            del tagpos[-1]
        ps = sorted(tagpos)  # the keys (tag positions)
        for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]):
            setattr(self, tagpos[p1].lower(),
                    line[p1 + len(tagpos[p1]) + 1:p2].strip())
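# Illustrative tag/value line (values made up; TagLine is a hypothetical
# name for the class this __init__ belongs to). The parser locates each
# tag and captures the text up to the start of the next tag.
line = ("CHROMAT_FILE: K26-217c.scf PHD_FILE: K26-217c.phd.1 "
        "TIME: Thu Sep 12 15:42:38 1996")
ds = TagLine(line)
print(ds.chromat_file)  # 'K26-217c.scf'
print(ds.time)          # 'Thu Sep 12 15:42:38 1996'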
def _flip_codons(codon_seq, target_seq):
    """Flips the codon characters from one seq to another."""
    a, b = '', ''
    for char1, char2 in zip(codon_seq, target_seq):
        # no need to do anything if the codon seq line has nothing
        if char1 == ' ':
            a += char1
            b += char2
        else:
            a += char2
            b += char1
    return a, b
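# _flip_codons swaps characters between the two lines wherever the codon
# line is non-blank (illustrative strings):
a, b = _flip_codons("AAA   ", "xxxyyy")
print(a)  # 'xxx   '  -- codon line now carries the target characters
print(b)  # 'AAAyyy'  -- target line picked up the codon characters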
def _gaf10iterator(handle):
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|')    # Qualifier
        inrec[5] = inrec[5].split('|')    # DB:reference(s)
        inrec[7] = inrec[7].split('|')    # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        yield dict(zip(GAF10FIELDS, inrec))
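# Usage sketch for the GAF reader; the GPI/GPA iterators below follow
# the same pattern. The file path is hypothetical, and GAF10FIELDS is
# assumed to be the module-level list of GAF 1.0 column names.
with open("gene_association.goa_human") as handle:
    for rec in _gaf10iterator(handle):
        print(rec['DB_Object_ID'], rec['GO_ID'])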
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[5] = inrec[5].split('|')  # DB_Object_Synonym(s)
        inrec[8] = inrec[8].split('|')  # Annotation_Target_Set
        yield dict(zip(GPI10FIELDS, inrec))
def _get_coords(filename):
    with open(filename) as alb:
        start_line = None
        end_line = None
        for line in alb:
            if line.startswith("["):
                if not start_line:
                    start_line = line  # rstrip not needed
                else:
                    end_line = line
    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]
    # returns [(start0, end0), (start1, end1)]
    return list(zip(*map(_alb_line2coords, [start_line, end_line])))
def _gpi11iterator(handle):
    """Read GPI 1.1 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.1 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')  # DB_Object_Name
        inrec[3] = inrec[3].split('|')  # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|')  # DB_Xref(s)
        inrec[8] = inrec[8].split('|')  # Properties
        yield dict(zip(GPI11FIELDS, inrec))
def _gpa11iterator(handle):
    """Read GPA 1.1 format files (PRIVATE).

    This iterator is used to read a gp_association.goa_uniprot
    file which is in the GPA 1.1 format. Do not call directly.
    Rather, use the gpa_iterator function.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')    # Qualifier
        inrec[4] = inrec[4].split('|')    # DB:Reference(s)
        inrec[6] = inrec[6].split('|')    # With
        inrec[10] = inrec[10].split('|')  # Annotation extension
        yield dict(zip(GPA11FIELDS, inrec))
import copy


def _gaf20byproteiniterator(handle):
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|')    # Qualifier
        inrec[5] = inrec[5].split('|')    # DB:reference(s)
        inrec[7] = inrec[7].split('|')    # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        cur_rec = dict(zip(GAF20FIELDS, inrec))
        if cur_rec['DB_Object_ID'] != cur_id and cur_id:
            ret_list = copy.copy(id_rec_list)
            id_rec_list = [cur_rec]
            cur_id = cur_rec['DB_Object_ID']
            yield ret_list
        else:
            cur_id = cur_rec['DB_Object_ID']
            id_rec_list.append(cur_rec)
    # flush the final group of records once the file is exhausted
    if id_rec_list:
        yield id_rec_list
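# Usage sketch for the by-protein grouper: each yielded item is the list
# of GAF 2.0 records sharing one DB_Object_ID (file path hypothetical).
with open("gene_association.goa_yeast") as handle:
    for rec_group in _gaf20byproteiniterator(handle):
        print(rec_group[0]['DB_Object_ID'], len(rec_group))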
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format."""
    # raise an exception if the alphabet is not DNA
    if alphabet is not None:
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise an exception if the handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files have to be opened in 'rb' mode.")

    # check if the input file is a valid ABI file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully; returning ends the generator
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': ''}

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse the header and extract data from the directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # TODO: stop iterating early once all desired tags have been
        # extracted (the tags from _EXTRACT, the time tags, plus seq,
        # qual, and id)
        key = tag_name + str(tag_number)
        # PBAS2 is the base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 holds the quality values of the base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is the sample id entered before the sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except AttributeError:
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
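# Usage sketch: ABI trace files are binary, so the handle must be opened
# in 'rb' mode ("sample.ab1" is a hypothetical file name).
with open("sample.ab1", "rb") as handle:
    for record in AbiIterator(handle, trim=True):
        print(record.id, len(record), record.annotations['run_start'])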
def _create_hsp(hid, qid, psl):
    # protein flag
    is_protein = _is_protein(psl)
    # strand; if the query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'],
                               psl['qsize'], qstrand)
    # hit block starts
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(psl['tstarts'], psl['blocksizes'],
                                   psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']
    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    query_range_all = list(
        zip(qstarts, [x + y for x, y in zip(qstarts, psl['blocksizes'])]))
    hit_range_all = list(
        zip(hstarts, [x + y for x, y in zip(hstarts, psl['blocksizes'])]))
    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        hseqlist = psl.get('tseqs')
        hseq = '' if not hseqlist else hseqlist[idx]
        qseqlist = psl.get('qseqs')
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create the hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check the block spans as well
    assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes']
    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2
    return hsp
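# The block-range construction above pairs each start with start + size;
# e.g. starts [0, 15] with sizes [10, 20] yield [(0, 10), (15, 35)]
# (values illustrative):
starts, sizes = [0, 15], [10, 20]
print(list(zip(starts, [x + y for x, y in zip(starts, sizes)])))
# [(0, 10), (15, 35)]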
import re


def parse_vulgar_comp(hsp, vulgar_comp):
    """Parses the vulgar components present in the hsp dictionary."""
    # containers for block coordinates
    qstarts, qends = [hsp['query_start']], []
    hstarts, hends = [hsp['hit_start']], []
    # containers for split codons
    hsp['query_split_codons'], hsp['hit_split_codons'] = [], []
    # containers for ner blocks
    hsp['query_ner_ranges'], hsp['hit_ner_ranges'] = [], []
    # sentinels for tracking query and hit positions
    qpos, hpos = hsp['query_start'], hsp['hit_start']
    # multipliers for determining sentinel movement
    qmove = 1 if hsp['query_strand'] >= 0 else -1
    hmove = 1 if hsp['hit_strand'] >= 0 else -1

    vcomps = re.findall(_RE_VCOMP, vulgar_comp)
    for idx, match in enumerate(vcomps):
        label, qstep, hstep = match[0], int(match[1]), int(match[2])
        # check for label, must be recognized
        assert label in 'MCGF53INS', "Unexpected vulgar label: %r" % label
        # match, codon, or gaps
        if label in 'MCGS':
            # if the previous comp is not an MCGS block, it's the
            # start of a new block
            if vcomps[idx - 1][0] not in 'MCGS':
                qstarts.append(qpos)
                hstarts.append(hpos)
        # other labels
        # store the values in the hsp dict as a tuple of (start, stop)
        # we're not doing anything if the label is in '53IN', as these
        # basically tell us what the inter-block coordinates are and
        # inter-block coordinates are automatically calculated by
        # an HSP property
        if label == 'S':
            # get start and stop from parsed values
            qstart, hstart = qpos, hpos
            qend = qstart + qstep * qmove
            hend = hstart + hstep * hmove
            # adjust the start-stop ranges
            sqstart, sqend = min(qstart, qend), max(qstart, qend)
            shstart, shend = min(hstart, hend), max(hstart, hend)
            # split codons
            # XXX: is it possible to have a frameshift that introduces
            # a codon split? If so, this may need a different treatment..
            hsp['query_split_codons'].append((sqstart, sqend))
            hsp['hit_split_codons'].append((shstart, shend))

        # move sentinels accordingly
        qpos += qstep * qmove
        hpos += hstep * hmove

        # append to ends if the next comp is not an MCGS block or
        # if it's the last comp
        if idx == len(vcomps) - 1 or \
                (label in 'MCGS' and vcomps[idx + 1][0] not in 'MCGS'):
            qends.append(qpos)
            hends.append(hpos)

    # adjust coordinates
    for seq_type in ('query_', 'hit_'):
        strand = hsp[seq_type + 'strand']
        # switch coordinates if strand is < 0
        if strand < 0:
            # switch the starts and ends
            hsp[seq_type + 'start'], hsp[seq_type + 'end'] = \
                hsp[seq_type + 'end'], hsp[seq_type + 'start']
            if seq_type == 'query_':
                qstarts, qends = qends, qstarts
            else:
                hstarts, hends = hends, hstarts

    # set start and end ranges
    hsp['query_ranges'] = list(zip(qstarts, qends))
    hsp['hit_ranges'] = list(zip(hstarts, hends))
    return hsp
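# A hedged walk-through of parse_vulgar_comp, assuming the module-level
# _RE_VCOMP regex captures (label, query_step, hit_step) triples. For a
# plus/plus HSP starting at 0/0, "M 10 10 G 0 2 M 5 5" is one contiguous
# block: the query advances 15 positions while the hit advances 17.
hsp = {'query_start': 0, 'hit_start': 0,
       'query_strand': 1, 'hit_strand': 1}
hsp = parse_vulgar_comp(hsp, 'M 10 10 G 0 2 M 5 5')
print(hsp['query_ranges'])  # [(0, 15)]
print(hsp['hit_ranges'])    # [(0, 17)]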