Ejemplo n.º 1
0
def get_strand(seq, probes, thres):
  """Infer which strand `seq` comes from by aligning sense-strand probes against it.
  Returns 'sense', 'anti', or None.
  Each probe is Smith-Waterman aligned to `seq` twice: once as given and once
  reverse-complemented. The identity of an alignment is its match count divided by the length
  of the probe.
  A probe only votes if at least one of its two identities is above `thres`. It votes 'sense'
  when the forward identity is strictly higher, and 'anti' otherwise (so a tie counts as 'anti').
  If every vote cast names the same strand, that strand is returned. If no votes were cast, or
  the votes disagree, None is returned."""
  # Only the distinct strands voted for matter, so collect them in a set.
  strands_seen = set()
  for probe in probes:
    forward = swalign.smith_waterman(seq, probe)
    sense_id = forward.matches/len(probe)
    reverse = swalign.smith_waterman(seq, seqtools.get_revcomp(probe))
    anti_id = reverse.matches/len(probe)
    if sense_id > thres or anti_id > thres:
      strands_seen.add('sense' if sense_id > anti_id else 'anti')
  # Exactly one distinct strand means the vote was unanimous (and at least one vote was cast).
  if len(strands_seen) == 1:
    return strands_seen.pop()
  return None
Ejemplo n.º 2
0
def get_strand(seq, probes, thres):
  """Work out the strand of `seq` from how well probes taken from the sense strand align to it.
  Returns 'sense', 'anti', or None.
  For every probe, two Smith-Waterman alignments are made against `seq`: the probe itself and
  its reverse complement. Each alignment's identity is matches divided by the probe length.
  Probes where neither identity exceeds `thres` are ignored. Any other probe casts a vote:
  'sense' if the forward identity is strictly greater than the reverse one, else 'anti'.
  The result is the voted strand if all votes agree, and None if they conflict or if no probe
  voted at all."""
  votes = []
  for probe in probes:
    fwd_alignment = swalign.smith_waterman(seq, probe)
    sense_id = fwd_alignment.matches/len(probe)
    rev_alignment = swalign.smith_waterman(seq, seqtools.get_revcomp(probe))
    anti_id = rev_alignment.matches/len(probe)
    if not (sense_id > thres or anti_id > thres):
      # Neither orientation aligned well enough for this probe to be informative.
      continue
    if sense_id > anti_id:
      votes.append('sense')
    else:
      votes.append('anti')
  # Unanimous means every vote equals the first one. No votes at all means no answer.
  if votes and all(vote == votes[0] for vote in votes):
    return votes[0]
  return None
Ejemplo n.º 3
0
def is_alignment_reversed(barcode1, barcode2):
  """Tell whether barcode2 matches barcode1 better once its two halves are swapped.
  "Reversed" here means the alpha + beta halves have traded places, not that the string is read
  backwards. barcode2 is split at len(barcode2)//2 and the two pieces are exchanged.
  barcode1 is then Smith-Waterman aligned to both the original and the swapped barcode2.
  Returns True only if the swapped alignment scores strictly higher than the original one.
  """
  midpoint = len(barcode2)//2
  swapped = barcode2[midpoint:] + barcode2[:midpoint]
  original_score = swalign.smith_waterman(barcode1, barcode2).score
  swapped_score = swalign.smith_waterman(barcode1, swapped).score
  return swapped_score > original_score
Ejemplo n.º 4
0
def make_dcss(sscss):
    """Build the duplex consensus sequences from a set of single-strand consensuses (SSCSs).

    `sscss` maps (order, mate) keys, where order is 'ab' or 'ba' and mate is 0 or 1, to SSCS
    dicts with 'seq' and 'nreads' entries. Missing or falsy entries count as absent.

    Returns a list of dicts, indexed by (0-based) duplex mate, each with the duplex consensus
    under 'seq' and the two SSCS read counts under 'nreads'. The list is empty when the SSCS
    pair for the first duplex mate is incomplete.

    Raises AssertionError if the two aligned sequences differ in length, or if exactly one
    duplex consensus could be made.
    """
    # The (order, mate) of the two SSCSs that make up each duplex mate, listed in duplex mate
    # order. The assignment is arbitrary but fixed, so the two duplex consensuses get different
    # mate numbers and those numbers are the same from run to run.
    pairings = (
        (('ab', 0), ('ba', 1)),
        (('ab', 1), ('ba', 0)),
    )
    dcss = []
    for pairing in pairings:
        # Keep only the SSCSs that are actually present for this duplex mate.
        found = [sscss.get(key) for key in pairing]
        sscs_pair = [sscs for sscs in found if sscs]
        if len(sscs_pair) < 2:
            # Without both SSCSs for this duplex mate, a complete pair of duplex consensus
            # sequences can't be made, so stop here.
            break
        first, second = sscs_pair
        align = swalign.smith_waterman(first['seq'], second['seq'])
        target_len = len(align.target)
        query_len = len(align.query)
        if target_len != query_len:
            details = '\n'.join([repr(sscs) for sscs in sscs_pair])
            raise AssertionError('{} != {}:\n'.format(target_len, query_len) + details)
        dcss.append({
            'seq': consensus.build_consensus_duplex_simple(align.target, align.query),
            'nreads': [sscs['nreads'] for sscs in sscs_pair],
        })
    assert len(dcss) == 0 or len(dcss) == 2, len(dcss)
    return dcss
Ejemplo n.º 5
0
def make_dcss(sscss):
  """Combine single-strand consensus sequences (SSCSs) into duplex consensus sequences.
  `sscss` is a dict keyed by (order, mate), with order 'ab' or 'ba' and mate 0 or 1. Each value
  is an SSCS dict holding a 'seq' and an 'nreads' entry. Absent or falsy values are skipped.
  Returns a list indexed by (0-based) duplex mate. Each element is a dict with the duplex
  consensus as 'seq' and the read counts of its two SSCSs as 'nreads'. If the SSCS pair for the
  first duplex mate is incomplete, the list is empty.
  Raises AssertionError when an alignment's target and query differ in length, or when only one
  of the two duplex consensuses could be made."""
  # Duplex mate 0 is built from ab/0 + ba/1, duplex mate 1 from ab/1 + ba/0. The choice is
  # arbitrary but consistent, so the two duplex consensuses always get different mate numbers,
  # and the same ones on every run.
  mate0_sources = (('ab', 0), ('ba', 1))
  mate1_sources = (('ab', 1), ('ba', 0))
  dcss = []
  for sources in (mate0_sources, mate1_sources):
    # Gather whichever of the two SSCSs exist for this duplex consensus.
    sscs_pair = []
    for key in sources:
      candidate = sscss.get(key)
      if candidate:
        sscs_pair.append(candidate)
    if len(sscs_pair) < 2:
      # One or both SSCSs are missing, so a complete pair of duplex consensuses is impossible.
      break
    seq1 = sscs_pair[0]['seq']
    seq2 = sscs_pair[1]['seq']
    align = swalign.smith_waterman(seq1, seq2)
    if len(align.target) != len(align.query):
      lines = [repr(sscs) for sscs in sscs_pair]
      message = '{} != {}:\n'.format(len(align.target), len(align.query)) + '\n'.join(lines)
      raise AssertionError(message)
    duplex_seq = consensus.build_consensus_duplex_simple(align.target, align.query)
    counts = [sscs['nreads'] for sscs in sscs_pair]
    dcss.append({'seq':duplex_seq, 'nreads':counts})
  assert len(dcss) == 0 or len(dcss) == 2, len(dcss)
  return dcss
Ejemplo n.º 6
0
def get_duplex_consensi(family):
  """Build the two duplex consensus sequences for a family.
  `family` is indexed first by order ('ab' or 'ba') and then by mate (0 or 1). Each entry has a
  `consensus` string attribute.
  The first duplex consensus pairs ab/0 with ba/1, and the second pairs ab/1 with ba/0. For each
  pair, '-' characters are stripped from both consensus strings, the results are Smith-Waterman
  aligned, and the aligned query and target are combined with
  consensuslib.build_consensus_duplex_simple.
  Returns a list of the two combined sequences, in that order."""
  pairings = (
    (('ab', 0), ('ba', 1)),
    (('ab', 1), ('ba', 0)),
  )
  consensi = []
  for (order1, mate1), (order2, mate2) in pairings:
    seq1 = family[order1][mate1].consensus.replace('-', '')
    seq2 = family[order2][mate2].consensus.replace('-', '')
    alignment = swalign.smith_waterman(seq1, seq2)
    duplex_seq = consensuslib.build_consensus_duplex_simple(alignment.query, alignment.target)
    consensi.append(duplex_seq)
  return consensi
Ejemplo n.º 7
0
def process_duplex(duplex, barcode, workers=None, stats=None, incl_sscs=False, sscs_fh=None,
                   processes=1, min_reads=1, qual_thres=' '):
  """Make and print the consensus for one duplex family, or delegate the job to a worker.
  duplex:     Dict mapping (order, mate) keys to families. A family is a sequence of read dicts,
              each with a 'seq' and a 'qual' entry.
  barcode:    The family's barcode. Passed to delegate()/print_duplex() and used in SSCS headers.
  workers:    Sequence of workers to delegate to. Only used (and required) when processes > 1.
  stats:      Optional dict of counters. 'families' is incremented on every call. 'time', 'reads'
              and 'runs' are updated when at least one consensus was made. Required when
              processes > 1, since the worker is picked from the 'families' count.
  incl_sscs:  If true, print the single-strand consensus when only one strand yielded one.
  sscs_fh:    Optional writable file. Every single-strand consensus is written to it as a
              '>barcode.order.mate reads' header line followed by the sequence.
  processes:  If greater than 1, act as the controller: hand the family to a worker and return.
  min_reads:  Families with fewer reads than this are skipped.
  qual_thres: Passed through to consensus.get_consensus().
  Returns None."""
  # stats is optional (it defaults to None and is checked again at the end), so only count the
  # family when there is somewhere to count it.
  if stats is not None:
    stats['families'] += 1
  # Are we the controller process or a worker?
  if processes > 1:
    # Spread the families over the workers round-robin, using the running family count.
    i = stats['families'] % len(workers)
    worker = workers[i]
    delegate(worker, duplex, barcode)
    return
  # We're a worker. Actually process the family.
  start = time.time()
  consensi = []
  # The (order, mate) of each family that produced a consensus, parallel to consensi.
  strands = []
  reads_per_strand = []
  duplex_mate = None
  for (order, mate), family in duplex.items():
    reads = len(family)
    if reads < min_reads:
      continue
    # The mate number for the duplex consensus. It's arbitrary, but all that matters is that the
    # two mates have different numbers. This system ensures that:
    # Mate 1 is from the consensus of ab/1 and ba/2 families, while mate 2 is from ba/1 and ab/2.
    if (order == 'ab' and mate == 1) or (order == 'ba' and mate == 2):
      duplex_mate = 1
    else:
      duplex_mate = 2
    seqs = [read['seq'] for read in family]
    quals = [read['qual'] for read in family]
    consensi.append(consensus.get_consensus(seqs, quals, qual_thres=qual_thres))
    strands.append((order, mate))
    reads_per_strand.append(reads)
  assert len(consensi) <= 2, len(consensi)
  if sscs_fh:
    # Label each consensus with the family it was built from. Pairing consensi with duplex.keys()
    # would mislabel them whenever an earlier family was skipped for having too few reads.
    for cons, (order, mate), reads in zip(consensi, strands, reads_per_strand):
      sscs_fh.write('>{bar}.{order}.{mate} {reads}\n'.format(bar=barcode, order=order, mate=mate,
                                                             reads=reads))
      sscs_fh.write(cons+'\n')
  if len(consensi) == 1 and incl_sscs:
    print_duplex(consensi[0], barcode, duplex_mate, reads_per_strand)
  elif len(consensi) == 2:
    align = swalign.smith_waterman(*consensi)
    #TODO: log error & return if len(align.target) != len(align.query)
    cons = consensus.build_consensus_duplex_simple(align.target, align.query)
    print_duplex(cons, barcode, duplex_mate, reads_per_strand)
  elapsed = time.time() - start
  logging.info('{} sec for {} reads.'.format(elapsed, sum(reads_per_strand)))
  if stats and consensi:
    stats['time'] += elapsed
    stats['reads'] += sum(reads_per_strand)
    stats['runs'] += 1
Ejemplo n.º 8
0
def get_similarity(seq1, seq2):
    """Return the match fraction of the Smith-Waterman alignment of seq1 and seq2.

    The aligned target and query strings are logged at debug level (target on the first line,
    query on the second). The result is the alignment's match count divided by the length of
    the aligned query string.
    """
    alignment = swalign.smith_waterman(seq1, seq2)
    target, query = alignment.target, alignment.query
    logging.debug(target + '\n' + query)
    return alignment.matches / len(query)
Ejemplo n.º 9
0
def get_similarity(seq1, seq2):
  """Measure how similar seq1 and seq2 are from their Smith-Waterman alignment.
  Logs the aligned target and query (one per line) at debug level, then returns the number of
  matches in the alignment divided by the length of the aligned query string."""
  result = swalign.smith_waterman(seq1, seq2)
  aligned_query = result.query
  logging.debug(result.target+'\n'+aligned_query)
  similarity = result.matches / len(aligned_query)
  return similarity