def stitch(record1, record2):
    """Score every candidate overlap between the end of read 1 and read 2.

    Read 2 is reverse-complemented (and its qualities reversed) so that the
    tail of read 1 can be compared against the head of read 2. For each
    overlap length a per-position consensus is built and scored.

    Args:
        record1: record exposing ``.seq.tostring()``,
            ``.letter_annotations['phred_quality']`` and ``len()``
            (presumably a Biopython SeqRecord -- confirm against callers).
        record2: same kind of record for the mate read.

    Returns:
        dict mapping overlap length -> ``(consensus, score)`` where
        ``consensus`` is the consensus sequence of the overlapping region
        and ``score`` is a log10 score (higher favours the overlap).
        Overlap lengths run from 1 up to, but not including, the length of
        the shorter read.
    """
    seq1 = array([record1.seq.tostring()])
    seq2 = array([reverse_complement(record2.seq.tostring())])
    # Reinterpret each one-element string array as an array of single
    # characters so the reads can be sliced and compared element-wise.
    # NOTE(review): this relies on tostring() returning a byte string.
    seq1.dtype = '|S1'
    seq2.dtype = '|S1'

    quals1 = array(record1.letter_annotations['phred_quality'])
    quals2 = array(record2.letter_annotations['phred_quality'][::-1])

    # log10(1 - 10**(-Q/10)): log-probability that a base call is correct.
    log10p_consensus_1 = log1p(-power(10, -quals1 / 10.)) / log(10)
    log10p_consensus_2 = log1p(-power(10, -quals2 / 10.)) / log(10)
    # log10(10**(-Q/10) / 3): log-probability of one specific wrong base.
    log10p_error_1 = -log10(3) - (quals1 / 10.)
    log10p_error_2 = -log10(3) - (quals2 / 10.)

    min_overlap = 1
    # Bound by the SHORTER read: past that length the tail slice of one read
    # and the head slice of the other no longer have the same shape, and the
    # element-wise operations below cannot be applied.
    max_overlap = min(len(record1), len(record2))

    overlaps = {}
    for overlap in range(min_overlap, max_overlap):
        s1 = seq1[-overlap:]
        s2 = seq2[:overlap]
        q1 = quals1[-overlap:]
        q2 = quals2[:overlap]
        lpc1 = log10p_consensus_1[-overlap:]
        lpc2 = log10p_consensus_2[:overlap]
        lpe1 = log10p_error_1[-overlap:]
        lpe2 = log10p_error_2[:overlap]

        # Take the base from read 2 only where its quality is strictly
        # higher; ties go to read 1.
        consensus = choose(q1 < q2, [s1, s2])

        # Each read contributes its "correct" log-probability where it
        # agrees with the consensus and its "error" log-probability where
        # it does not. The last term is the null hypothesis (p = 1/4 per
        # base, for both reads).
        score = (sum(choose(consensus == s1, [lpe1, lpc1]))
                 + sum(choose(consensus == s2, [lpe2, lpc2]))
                 + len(consensus) * log10(4) * 2)

        # Collapse the array of single characters back into one string.
        consensus.dtype = '|S%i' % len(consensus)
        overlaps[overlap] = (consensus[0], score)
    return overlaps
def build_aln(alnsummary, vulgar_commands, queryseq, targetseq):
    """Build full alignment from exonerate using 'parsable' preset and vulgar output.

    Args:
        alnsummary: mapping providing 'query_aln_begin', 'target_aln_begin',
            'query_strand' and 'target_strand'.
        vulgar_commands: iterable of indexable commands of the form
            ``(label, query_length, target_length)``. Label 'M' is a match
            block (both lengths equal); label 'G' is a gap (exactly one of
            the two lengths is zero).
        queryseq: query sequence string.
        targetseq: target sequence string.

    Returns:
        Tuple ``(queryaln, targetaln)``: both sequences in full, with '-'
        inserted at gaps and the one with the shorter unaligned prefix
        left-padded with spaces so the aligned regions start in the same
        column.

    Raises:
        AssertionError: if an 'M' command has unequal lengths, or a 'G'
            command does not have exactly one zero length.
        ValueError: if a command label is neither 'M' nor 'G'.
    """
    # Process strands. The position vars below always progress from
    # 0 -> len(seq), so minus-strand seqs are reverse-complemented and their
    # start coordinate is mirrored accordingly.
    queryposition = alnsummary['query_aln_begin']
    targetposition = alnsummary['target_aln_begin']
    if alnsummary['query_strand'] == '-':
        queryseq = seqtools.reverse_complement(queryseq)
        queryposition = len(queryseq) - queryposition
    if alnsummary['target_strand'] == '-':
        targetseq = seqtools.reverse_complement(targetseq)
        targetposition = len(targetseq) - targetposition

    # Pad whichever sequence has the shorter pre-alignment prefix, then add
    # the pre-alignment sequence itself.
    pad = ' ' * abs(queryposition - targetposition)
    if queryposition > targetposition:
        queryparts = [queryseq[0:queryposition]]
        targetparts = [pad, targetseq[0:targetposition]]
    else:
        queryparts = [pad, queryseq[0:queryposition]]
        targetparts = [targetseq[0:targetposition]]

    # Walk through the alignment (from vulgar output).
    for cmd in vulgar_commands:
        label = cmd[0]
        if label not in ('M', 'G'):
            raise ValueError(
                "I do not understand the vulgar command %s" % cmd[0])

        querylen = cmd[1]
        targetlen = cmd[2]
        queryend = queryposition + querylen
        targetend = targetposition + targetlen

        if label == 'M':
            assert querylen == targetlen
            queryparts.append(queryseq[queryposition:queryend])
            targetparts.append(targetseq[targetposition:targetend])
        else:
            # Exactly one side of a gap consumes sequence (xor).
            assert (querylen == 0) != (targetlen == 0)
            if querylen == 0:
                queryparts.append('-' * targetlen)
                targetparts.append(targetseq[targetposition:targetend])
            else:
                queryparts.append(queryseq[queryposition:queryend])
                targetparts.append('-' * querylen)

        queryposition = queryend
        targetposition = targetend

    # Add any post-alignment sequence.
    queryparts.append(queryseq[queryposition:])
    targetparts.append(targetseq[targetposition:])

    return (''.join(queryparts), ''.join(targetparts))
def seqdict2revcompseqdict(seqdict):
    """Return a new dict with every value of *seqdict* reverse-complemented.

    Keys are kept as they are; each value is passed through
    ``seqtools.reverse_complement``. The input mapping is not modified.
    """
    return dict(
        (name, seqtools.reverse_complement(sequence))
        for name, sequence in seqdict.iteritems()
    )
def build_aln(alnsummary, vulgar_commands, queryseq, targetseq):
    """Build full alignment from exonerate using 'parsable' preset and vulgar output.

    Args:
        alnsummary: mapping providing 'query_aln_begin', 'target_aln_begin',
            'query_strand' and 'target_strand'.
        vulgar_commands: iterable of indexable commands of the form
            ``(label, query_length, target_length)``. 'M' commands are match
            blocks with equal lengths; 'G' commands are gaps in which
            exactly one of the two lengths is zero.
        queryseq: query sequence string.
        targetseq: target sequence string.

    Returns:
        Tuple ``(queryaln, targetaln)`` holding both complete sequences,
        with '-' filling gaps and leading spaces on the sequence whose
        unaligned prefix is shorter, so that the aligned regions line up.

    Raises:
        AssertionError: if an 'M' command has differing lengths, or a 'G'
            command does not have exactly one zero length.
        ValueError: if a command label is neither 'M' nor 'G'.
    """
    # The position vars always progress from 0 -> len(seq), so a sequence on
    # the '-' strand is reverse-complemented and its begin coordinate is
    # measured from the other end.
    queryposition = alnsummary['query_aln_begin']
    targetposition = alnsummary['target_aln_begin']
    if alnsummary['query_strand'] == '-':
        queryseq = seqtools.reverse_complement(queryseq)
        queryposition = len(queryseq) - queryposition
    if alnsummary['target_strand'] == '-':
        targetseq = seqtools.reverse_complement(targetseq)
        targetposition = len(targetseq) - targetposition

    # Collect pieces in lists and join once at the end.
    query_chunks = []
    target_chunks = []

    # Necessary padding: shift the sequence with the shorter prefix right.
    padding = ' ' * abs(queryposition - targetposition)
    if queryposition > targetposition:
        target_chunks.append(padding)
    else:
        query_chunks.append(padding)

    # Pre-alignment sequence.
    query_chunks.append(queryseq[0:queryposition])
    target_chunks.append(targetseq[0:targetposition])

    # Walk through the alignment (from vulgar output).
    for cmd in vulgar_commands:
        if cmd[0] == 'M':
            assert cmd[1] == cmd[2]
            query_chunks.append(
                queryseq[queryposition:queryposition + cmd[1]])
            target_chunks.append(
                targetseq[targetposition:targetposition + cmd[2]])
        elif cmd[0] == 'G':
            # xor: a gap consumes residues from exactly one sequence.
            assert (cmd[1] == 0) != (cmd[2] == 0)
            if cmd[1] == 0:
                query_chunks.append('-' * cmd[2])
                target_chunks.append(
                    targetseq[targetposition:targetposition + cmd[2]])
            else:
                query_chunks.append(
                    queryseq[queryposition:queryposition + cmd[1]])
                target_chunks.append('-' * cmd[1])
        else:
            raise ValueError(
                "I do not understand the vulgar command %s" % cmd[0])
        queryposition += cmd[1]
        targetposition += cmd[2]

    # Post-alignment sequence.
    query_chunks.append(queryseq[queryposition:])
    target_chunks.append(targetseq[targetposition:])

    return (''.join(query_chunks), ''.join(target_chunks))