def get_strand(seq, probes, thres):
    """Guess which strand `seq` comes from by aligning sense-strand probes to it.

    Every probe is aligned to `seq` twice: once as given and once
    reverse-complemented. Each alignment's identity is its number of matches
    divided by the probe length. A probe casts a vote only when at least one of
    its two identities exceeds `thres`; the vote is 'sense' when the forward
    identity is strictly higher, and 'anti' otherwise (so ties go to 'anti').

    Returns 'sense' or 'anti' when every vote cast agrees, and None when no
    votes were cast or the votes disagree.
    """
    votes = []
    for probe in probes:
        forward_hit = swalign.smith_waterman(seq, probe)
        sense_id = forward_hit.matches / len(probe)
        reverse_hit = swalign.smith_waterman(seq, seqtools.get_revcomp(probe))
        anti_id = reverse_hit.matches / len(probe)
        # A probe that doesn't align well in either direction tells us nothing.
        if not (sense_id > thres or anti_id > thres):
            continue
        votes.append('sense' if sense_id > anti_id else 'anti')
    # Only a unanimous (and non-empty) set of votes yields an answer.
    distinct_votes = set(votes)
    if len(distinct_votes) == 1:
        return distinct_votes.pop()
    return None
def is_alignment_reversed(barcode1, barcode2):
    """Tell whether the two halves of `barcode2` are swapped relative to `barcode1`.

    "Reversed" here does not mean the string is read backwards: it means the
    alpha and beta halves have traded places. `barcode2` is split at
    len(barcode2)//2 and its halves are exchanged. `barcode1` is then
    Smith-Waterman aligned against both the original and the swapped form.

    Returns True only when the swapped form scores strictly higher than the
    original form, False otherwise (including on a tie).
    """
    midpoint = len(barcode2) // 2
    swapped = barcode2[midpoint:] + barcode2[:midpoint]
    score_as_given = swalign.smith_waterman(barcode1, barcode2).score
    score_swapped = swalign.smith_waterman(barcode1, swapped).score
    return bool(score_swapped > score_as_given)
def make_dcss(sscss):
    """Build the duplex consensus sequences from a set of single-strand consensuses.

    `sscss` maps (order, mate) tuples, e.g. ('ab', 0), to SSCS dicts holding at
    least a 'seq' and an 'nreads' entry. Missing or falsy entries count as absent.

    Returns a list indexed by 0-based duplex mate. It is empty when a needed
    SSCS is missing, otherwise it holds two dicts of the form
    {'seq': <duplex consensus>, 'nreads': [<nreads of each SSCS>]}.

    Raises AssertionError if the two aligned sequences differ in length, or if
    exactly one duplex consensus could be made.
    """
    # Which SSCSs make up each duplex consensus, in duplex mate order (0, then 1).
    # The assignment is arbitrary but fixed, so the two duplex consensuses always
    # get different mate numbers and those numbers are the same from run to run.
    pairings = (
        (('ab', 0), ('ba', 1)),
        (('ab', 1), ('ba', 0)),
    )
    dcss = []
    for pairing in pairings:
        candidates = [sscss.get(key) for key in pairing]
        present = [sscs for sscs in candidates if sscs]
        if len(present) < 2:
            # Without both SSCSs for this duplex mate there can be no complete
            # pair of duplex consensus sequences, so stop looking.
            break
        first, second = present
        align = swalign.smith_waterman(first['seq'], second['seq'])
        target_len = len(align.target)
        query_len = len(align.query)
        if target_len != query_len:
            details = '\n'.join([repr(sscs) for sscs in present])
            raise AssertionError('{} != {}:\n'.format(target_len, query_len) + details)
        duplex_seq = consensus.build_consensus_duplex_simple(align.target, align.query)
        dcss.append({
            'seq': duplex_seq,
            'nreads': [sscs['nreads'] for sscs in present],
        })
    assert len(dcss) in (0, 2), len(dcss)
    return dcss
def make_dcss(sscss):
    """Build the duplex consensus sequences from a set of single-strand consensuses.

    `sscss` maps (order, mate) tuples, e.g. ('ab', 0), to SSCS dicts holding at
    least a 'seq' and an 'nreads' entry. Missing or falsy entries count as absent.

    Returns a list indexed by 0-based duplex mate. It is empty when a needed
    SSCS is missing, otherwise it holds two dicts of the form
    {'seq': <duplex consensus>, 'nreads': [<nreads of each SSCS>]}.

    Raises AssertionError if the two aligned sequences differ in length, or if
    exactly one duplex consensus could be made.
    """
    # Which SSCSs make up each duplex consensus, in duplex mate order (0, then 1).
    # The assignment is arbitrary but fixed, so the two duplex consensuses always
    # get different mate numbers and those numbers are the same from run to run.
    pairings = (
        (('ab', 0), ('ba', 1)),
        (('ab', 1), ('ba', 0)),
    )
    dcss = []
    for pairing in pairings:
        candidates = [sscss.get(key) for key in pairing]
        present = [sscs for sscs in candidates if sscs]
        if len(present) < 2:
            # Without both SSCSs for this duplex mate there can be no complete
            # pair of duplex consensus sequences, so stop looking.
            break
        first, second = present
        align = swalign.smith_waterman(first['seq'], second['seq'])
        target_len = len(align.target)
        query_len = len(align.query)
        if target_len != query_len:
            details = '\n'.join([repr(sscs) for sscs in present])
            raise AssertionError('{} != {}:\n'.format(target_len, query_len) + details)
        duplex_seq = consensus.build_consensus_duplex_simple(align.target, align.query)
        dcss.append({
            'seq': duplex_seq,
            'nreads': [sscs['nreads'] for sscs in present],
        })
    assert len(dcss) in (0, 2), len(dcss)
    return dcss
def get_duplex_consensi(family):
    """Return the two duplex consensus sequences for a family, as a list.

    `family` is indexed first by order ('ab' or 'ba') and then by mate (0 or 1);
    each entry must expose a `.consensus` string. The first result combines
    ab/0 with ba/1 and the second combines ab/1 with ba/0. Gap characters ('-')
    are stripped from each single-strand consensus before the pair is aligned.
    """
    pairings = (
        (('ab', 0), ('ba', 1)),
        (('ab', 1), ('ba', 0)),
    )
    consensi = []
    for first_key, second_key in pairings:
        order1, mate1 = first_key
        order2, mate2 = second_key
        ungapped1 = family[order1][mate1].consensus.replace('-', '')
        ungapped2 = family[order2][mate2].consensus.replace('-', '')
        alignment = swalign.smith_waterman(ungapped1, ungapped2)
        duplex_seq = consensuslib.build_consensus_duplex_simple(alignment.query, alignment.target)
        consensi.append(duplex_seq)
    return consensi
def process_duplex(duplex, barcode, workers=None, stats=None, incl_sscs=False, sscs_fh=None,
                   processes=1, min_reads=1, qual_thres=' '):
    """Build and print the consensus for one duplex family, or delegate it to a worker.

    Parameters:
      duplex: dict mapping (order, mate) tuples to a family, i.e. a sequence of
        reads, each a dict with 'seq' and 'qual' entries.
      barcode: the family's barcode; used to label the output.
      workers: the worker pool. Only used (and then required) when processes > 1.
      stats: optional dict of running totals. 'families' is incremented on every
        call; 'time', 'reads' and 'runs' are updated when at least one consensus
        was made. May be None, in which case nothing is tallied.
      incl_sscs: if true, also print a lone single-strand consensus when only one
        strand has enough reads.
      sscs_fh: optional open file handle. If given, every single-strand consensus
        is written to it in FASTA-like form.
      processes: when > 1, this call acts as the controller: the family is handed
        to a worker and nothing is processed here.
      min_reads: families with fewer reads than this are skipped.
      qual_thres: passed through to consensus.get_consensus().

    Returns None.
    """
    if stats is not None:
        stats['families'] += 1
    # Are we the controller process or a worker?
    if processes > 1:
        # Spread families over the workers by the running family count. Without
        # stats there is no count to go by, so everything goes to the first worker.
        family_num = stats['families'] if stats is not None else 0
        worker = workers[family_num % len(workers)]
        delegate(worker, duplex, barcode)
        return
    # We're a worker. Actually process the family.
    start = time.time()
    consensi = []
    # The (order, mate) key of each family that produced a consensus, kept in step
    # with consensi and reads_per_strand. Families skipped for having too few
    # reads must not shift the labels of the ones that follow them.
    kept_keys = []
    reads_per_strand = []
    duplex_mate = None
    for (order, mate), family in duplex.items():
        reads = len(family)
        if reads < min_reads:
            continue
        # The mate number for the duplex consensus. It's arbitrary, but all that matters is that
        # the two mates have different numbers. This system ensures that:
        # Mate 1 is from the consensus of ab/1 and ba/2 families, while mate 2 is from ba/1 and
        # ab/2.
        if (order == 'ab' and mate == 1) or (order == 'ba' and mate == 2):
            duplex_mate = 1
        else:
            duplex_mate = 2
        seqs = [read['seq'] for read in family]
        quals = [read['qual'] for read in family]
        consensi.append(consensus.get_consensus(seqs, quals, qual_thres=qual_thres))
        kept_keys.append((order, mate))
        reads_per_strand.append(reads)
    assert len(consensi) <= 2
    if sscs_fh:
        for cons, (order, mate), reads in zip(consensi, kept_keys, reads_per_strand):
            sscs_fh.write('>{bar}.{order}.{mate} {reads}\n'.format(bar=barcode, order=order,
                                                                   mate=mate, reads=reads))
            sscs_fh.write(cons+'\n')
    if len(consensi) == 1 and incl_sscs:
        print_duplex(consensi[0], barcode, duplex_mate, reads_per_strand)
    elif len(consensi) == 2:
        align = swalign.smith_waterman(*consensi)
        #TODO: log error & return if len(align.target) != len(align.query)
        cons = consensus.build_consensus_duplex_simple(align.target, align.query)
        print_duplex(cons, barcode, duplex_mate, reads_per_strand)
    elapsed = time.time() - start
    logging.info('{} sec for {} reads.'.format(elapsed, sum(reads_per_strand)))
    if stats is not None and consensi:
        stats['time'] += elapsed
        stats['reads'] += sum(reads_per_strand)
        stats['runs'] += 1
def get_similarity(seq1, seq2):
    """Return how similar two sequences are, based on their Smith-Waterman alignment.

    The value is the alignment's match count divided by the length of its
    aligned query string. The aligned target and query are written to the debug
    log, one per line.
    """
    alignment = swalign.smith_waterman(seq1, seq2)
    aligned_pair = alignment.target + '\n' + alignment.query
    logging.debug(aligned_pair)
    aligned_length = len(alignment.query)
    return alignment.matches / aligned_length
def get_similarity(seq1, seq2):
    """Return how similar two sequences are, based on their Smith-Waterman alignment.

    The value is the alignment's match count divided by the length of its
    aligned query string. The aligned target and query are written to the debug
    log, one per line.
    """
    alignment = swalign.smith_waterman(seq1, seq2)
    aligned_pair = alignment.target + '\n' + alignment.query
    logging.debug(aligned_pair)
    aligned_length = len(alignment.query)
    return alignment.matches / aligned_length