Exemple #1
0
def add_consensi(family, qual_thres):
    """Attach a gapped consensus to each of the four alignments in `family`.

    `family` is indexed first by strand order ('ab', 'ba') and then by mate
    (0, 1). For every alignment found there, its `seqs` and `quals` are passed
    to consensuslib.get_consensus() and the result is stored on the
    alignment's `consensus` attribute.

    `qual_thres` is an integer; it is shifted by QUAL_OFFSET and converted to
    a single character before being handed to consensuslib.
    """
    strand_mates = (('ab', 0), ('ab', 1), ('ba', 0), ('ba', 1))
    for order, mate in strand_mates:
        target = family[order][mate]
        target.consensus = consensuslib.get_consensus(
            target.seqs,
            target.quals,
            gapped=True,
            qual_thres=chr(qual_thres + QUAL_OFFSET))
def make_sscs(family, order, mate, qual_thres, cons_thres, min_cons_reads):
  """Build a single-strand consensus record for one family of reads.

  Each read in `family` is indexed by 'seq' and 'qual'. The sequences and
  qualities are collected and passed to consensus.get_consensus() along with
  the three thresholds. Returns a dict with the consensus under 'seq', the
  given `order` and `mate`, and the family size under 'nreads'.
  """
  # Collect the 'seq' column first, then the 'qual' column.
  columns = {key: [read[key] for read in family] for key in ('seq', 'qual')}
  thresholds = {
    'cons_thres': cons_thres,
    'min_reads': min_cons_reads,
    'qual_thres': qual_thres,
  }
  sscs = consensus.get_consensus(columns['seq'], columns['qual'], **thresholds)
  return dict(seq=sscs, order=order, mate=mate, nreads=len(family))
Exemple #3
0
def make_sscs(family, order, mate, qual_thres, cons_thres, min_cons_reads):
  """Compute the single-strand consensus of `family` and wrap it in a dict.

  `family` is a collection of reads, each indexable by 'seq' and 'qual'.
  The thresholds are forwarded to consensus.get_consensus() as `cons_thres`,
  `min_reads` (from `min_cons_reads`), and `qual_thres`.

  Returns {'seq': <consensus>, 'order': order, 'mate': mate,
  'nreads': <number of reads in family>}.
  """
  read_seqs = []
  for member in family:
    read_seqs.append(member['seq'])
  read_quals = []
  for member in family:
    read_quals.append(member['qual'])
  result = {
    'seq': consensus.get_consensus(
      read_seqs,
      read_quals,
      cons_thres=cons_thres,
      min_reads=min_cons_reads,
      qual_thres=qual_thres,
    ),
  }
  result['order'] = order
  result['mate'] = mate
  result['nreads'] = len(family)
  return result
Exemple #4
0
def process_duplex(duplex, barcode, workers=None, stats=None, incl_sscs=False, sscs_fh=None,
                   processes=1, min_reads=1, qual_thres=' '):
  """Build and print the consensus for one duplex family, or delegate it to a worker.

  Arguments:
    duplex: Mapping of (order, mate) tuples to families. Each family is a sized collection of
      reads, and each read is indexable by 'seq' and 'qual'.
    barcode: Identifier of this duplex family; used in the output headers.
    workers: Sequence of workers to delegate to. Only used when `processes` > 1.
    stats: Mapping of counters. 'families' is always incremented, so despite the None default a
      mapping with that key must be supplied. 'time', 'reads', and 'runs' are updated when at
      least one consensus was made.
    incl_sscs: Whether to print a family that yielded only one single-strand consensus.
    sscs_fh: Optional open file to write each single-strand consensus to, FASTA-style.
    processes: If greater than 1, this call only hands the family to a worker (chosen
      round-robin by the family count) and returns.
    min_reads: Families with fewer reads than this are skipped.
    qual_thres: Quality threshold passed through to consensus.get_consensus().

  Returns None. Output is emitted through print_duplex(), `sscs_fh`, and logging.
  """
  stats['families'] += 1
  # Are we the controller process or a worker?
  if processes > 1:
    i = stats['families'] % len(workers)
    worker = workers[i]
    delegate(worker, duplex, barcode)
    return
  # We're a worker. Actually process the family.
  start = time.time()
  consensi = []
  # The (order, mate) key of each family that actually produced a consensus. Kept in lockstep
  # with `consensi` and `reads_per_strand` so that skipped families can't shift the labels.
  strands = []
  reads_per_strand = []
  duplex_mate = None
  for (order, mate), family in duplex.items():
    reads = len(family)
    if reads < min_reads:
      continue
    # The mate number for the duplex consensus. It's arbitrary, but all that matters is that the
    # two mates have different numbers. This system ensures that:
    # Mate 1 is from the consensus of ab/1 and ba/2 families, while mate 2 is from ba/1 and ab/2.
    if (order == 'ab' and mate == 1) or (order == 'ba' and mate == 2):
      duplex_mate = 1
    else:
      duplex_mate = 2
    seqs = [read['seq'] for read in family]
    quals = [read['qual'] for read in family]
    consensi.append(consensus.get_consensus(seqs, quals, qual_thres=qual_thres))
    strands.append((order, mate))
    reads_per_strand.append(reads)
  assert len(consensi) <= 2
  if sscs_fh:
    # Zip against `strands`, not duplex.keys(): the latter still includes families that were
    # skipped for having too few reads, which would mislabel the consensi that follow them.
    for cons, (order, mate), reads in zip(consensi, strands, reads_per_strand):
      sscs_fh.write('>{bar}.{order}.{mate} {reads}\n'.format(bar=barcode, order=order, mate=mate,
                                                             reads=reads))
      sscs_fh.write(cons+'\n')
  if len(consensi) == 1 and incl_sscs:
    print_duplex(consensi[0], barcode, duplex_mate, reads_per_strand)
  elif len(consensi) == 2:
    # Align the two single-strand consensi to each other, then merge them.
    align = swalign.smith_waterman(*consensi)
    #TODO: log error & return if len(align.target) != len(align.query)
    cons = consensus.build_consensus_duplex_simple(align.target, align.query)
    print_duplex(cons, barcode, duplex_mate, reads_per_strand)
  elapsed = time.time() - start
  logging.info('{} sec for {} reads.'.format(elapsed, sum(reads_per_strand)))
  if stats and len(consensi) > 0:
    stats['time'] += elapsed
    stats['reads'] += sum(reads_per_strand)
    stats['runs'] += 1
Exemple #5
0
def get_consensus(seq_align, qual_align, qual_thres):
    """Call consensuslib.get_consensus() with arguments suited to the running Python.

    Returns None if either `seq_align` or `qual_align` is empty/falsy.
    Under Python 3 the sequences and qualities are encoded to bytes, the
    threshold is passed as the integer `qual_thres + 32`, and the result is
    decoded back to str. Under other versions the inputs are passed through
    unchanged, the threshold is passed as `chr(qual_thres + 32)`, and the
    result is returned as-is. The consensus is always requested with
    `gapped=True`.
    """
    if not seq_align or not qual_align:
        return None
    on_py3 = sys.version_info.major == 3
    if on_py3:
        encoded_seqs = [bytes(seq, 'utf8') for seq in seq_align]
        encoded_quals = [bytes(qual, 'utf8') for qual in qual_align]
        threshold = qual_thres + 32
    else:
        encoded_seqs, encoded_quals = seq_align, qual_align
        threshold = chr(qual_thres + 32)
    raw_consensus = consensuslib.get_consensus(
        encoded_seqs, encoded_quals, qual_thres=threshold, gapped=True)
    return str(raw_consensus, 'utf8') if on_py3 else raw_consensus
Exemple #6
0
def get_consensus(seq_align, qual_align, qual_thres):
  """Bridge between str-based callers and consensuslib.get_consensus().

  If either alignment is empty/falsy, returns None without calling consensuslib.
  On Python 3, sequences and qualities are converted to bytes, the threshold is sent as the
  integer `qual_thres+32`, and the returned consensus is converted back to str. Otherwise the
  inputs go through untouched, the threshold is sent as `chr(qual_thres+32)`, and the returned
  value is handed back unchanged. `gapped=True` is always requested."""
  if not seq_align or not qual_align:
    return None
  if sys.version_info.major != 3:
    # No str/bytes distinction to worry about: pass everything straight through.
    return consensuslib.get_consensus(seq_align,
                                      qual_align,
                                      qual_thres=chr(qual_thres+32),
                                      gapped=True)
  seq_bytes_list = [bytes(seq, 'utf8') for seq in seq_align]
  qual_bytes_list = [bytes(qual, 'utf8') for qual in qual_align]
  consensus_bytes = consensuslib.get_consensus(seq_bytes_list,
                                               qual_bytes_list,
                                               qual_thres=qual_thres+32,
                                               gapped=True)
  return str(consensus_bytes, 'utf8')
Exemple #7
0
def process_barcodes(dict_num, kmer, barcodes):
  """Perform a multiple sequence alignment on a set of barcodes and parse the result.
  Uses MAFFT.

  The barcodes are written to a temporary FASTA file, aligned by running `mafft`, and the
  consensus of the alignment is compared against each input barcode.

  Returns a tuple of (dict_num, kmer, consensus_seq, barcodes, similarities), where
  `similarities` holds one get_similarity() result per barcode. With a single barcode, no
  alignment is run: the barcode itself is the consensus and its similarity is 1.0.
  Returns None if `mafft` cannot be run or exits with an error.

  The temporary file is always removed, including when `mafft` fails."""
  # If there's only one barcode, we don't have to do an alignment.
  if len(barcodes) == 1:
    return dict_num, kmer, barcodes[0], barcodes, [1.0]
  # delete=False: the file must outlive the `with` so that mafft can read it by name.
  with tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.') as family_file:
    for i, barcode in enumerate(barcodes):
      family_file.write('>{}\n'.format(i))
      family_file.write(barcode+'\n')
  try:
    with open(os.devnull, 'w') as devnull:
      try:
        command = ['mafft', '--nuc', '--quiet', family_file.name]
        output = subprocess.check_output(command, stderr=devnull)
      except (OSError, subprocess.CalledProcessError):
        return None
  finally:
    # Clean up on every exit path, not just on success.
    os.remove(family_file.name)
  alignment = read_fasta(output, upper=True)
  consensus_seq = consensus.get_consensus(alignment)
  similarities = [get_similarity(consensus_seq, barcode) for barcode in barcodes]
  return dict_num, kmer, consensus_seq, barcodes, similarities
Exemple #8
0
def process_barcodes(dict_num, kmer, barcodes):
    """Perform a multiple sequence alignment on a set of barcodes and parse the result.

    Uses MAFFT: the barcodes are written to a temporary FASTA file, aligned by
    running `mafft`, and the consensus of the alignment is compared against
    each input barcode.

    Returns a tuple of (dict_num, kmer, consensus_seq, barcodes, similarities),
    where `similarities` holds one get_similarity() result per barcode. With a
    single barcode, no alignment is run: the barcode itself is the consensus
    and its similarity is 1.0. Returns None if `mafft` cannot be run or exits
    with an error.

    The temporary file is always removed, including when `mafft` fails.
    """
    # If there's only one barcode, we don't have to do an alignment.
    if len(barcodes) == 1:
        return dict_num, kmer, barcodes[0], barcodes, [1.0]
    # delete=False: the file must outlive the `with` so mafft can read it.
    with tempfile.NamedTemporaryFile('w', delete=False,
                                     prefix='align.msa.') as family_file:
        for i, barcode in enumerate(barcodes):
            family_file.write('>{}\n'.format(i))
            family_file.write(barcode + '\n')
    try:
        with open(os.devnull, 'w') as devnull:
            try:
                command = ['mafft', '--nuc', '--quiet', family_file.name]
                output = subprocess.check_output(command, stderr=devnull)
            except (OSError, subprocess.CalledProcessError):
                return None
    finally:
        # Clean up on every exit path, not just on success.
        os.remove(family_file.name)
    alignment = read_fasta(output, upper=True)
    consensus_seq = consensus.get_consensus(alignment)
    similarities = [
        get_similarity(consensus_seq, barcode) for barcode in barcodes
    ]
    return dict_num, kmer, consensus_seq, barcodes, similarities
    intens1 = np.array(list(map_1.values()))
    intens2 = np.array(list(map_2.values()))

    intens1 = intens1.reshape(1, len(intens1))
    intens2 = intens2.reshape(1, len(intens2))
    cos_lib = cosine_similarity(intens1, intens2)

    return cos_lib[0][0]


# Report how many distinct cluster labels appear in image_UPGMA_pixel1.
print(len(np.unique(image_UPGMA_pixel1)))
# Per-cluster results, keyed by cluster label:
#   cluster2concensus:  the value returned by consensus.get_consensus()
#   cluster2comparison: list of (1 - similarity) between that consensus and
#                       each member spectrum of the cluster
cluster2concensus = {}
cluster2comparison = {}
for cluster in np.unique(image_UPGMA_pixel1):
    print(cluster)
    cluster2concensus[cluster] = consensus.get_consensus(
        cluster, image_UPGMA_pixel1, dist_dot_product, ids, imzMLfile, xs, ys)
    # Identifiers of the spectra belonging to this cluster; each is passed to
    # parser.getspectrum() below.
    cluster_ids = consensus.get_cluster_elements(cluster, image_UPGMA_pixel1,
                                                 parser, xs, ys)
    tmp = list()
    for i in cluster_ids:
        # NOTE(review): get_similarity presumably returns a similarity in
        # [0, 1], making this a distance -- confirm against its definition.
        tmp.append(
            1 - (get_similarity(cluster2concensus[cluster],
                                consensus.tupel2map(parser.getspectrum(i)))))
    cluster2comparison[cluster] = tmp

consensus_distance = np.zeros(
    (len(cluster2concensus.keys()), len(cluster2concensus.keys())))
for cluster1 in range(len(cluster2concensus.keys())):
    for cluster2 in range(cluster1, len(cluster2concensus.keys())):
        consensus_distance[cluster1, cluster2] = consensus_distance[
            cluster2, cluster1] = 1 - get_similarity(
Exemple #10
0
def main(argv):
  """Align the input sequences, compute their consensus, and print the formatted result.

  `argv` is the full argument vector, including the program name at index 0. Sequences come
  either from positional arguments or from --input (a file path, or "-" for stdin), in
  "plain" or "duplex" format. Only the "sanger" quality format is supported; anything else is
  reported through fail(). The output lines from format_alignment() are printed to stdout."""

  parser = argparse.ArgumentParser(description=DESCRIPTION)
  parser.set_defaults(**OPT_DEFAULTS)

  parser.add_argument('seqs', metavar='sequence', nargs='*',
    help='The alignment.')
  parser.add_argument('-i', '--input',
    help='Provide the sequences in this input file instead of as command-line arguments. '
         'Give "-" to use stdin.')
  parser.add_argument('-f', '--format', choices=('plain', 'duplex'),
    help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is '
         'the 8-column format of the family-sorted read data from the duplex pipeline. It must be '
         'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and '
         'beta-alpha strands). If "duplex" is given, you must also specify which of the four '
         'possible alignments to output with --mate and --order.')
  parser.add_argument('-m', '--mate', type=int, choices=(1, 2))
  parser.add_argument('-o', '--order', choices=('ab', 'ba'))
  parser.add_argument('-F', '--qual-format', choices=('sanger',))
  parser.add_argument('-q', '--qual', type=int,
    help='Quality threshold: Default: %(default)s')

  args = parser.parse_args(argv[1:])

  # Convert the numeric threshold into the character used by the quality encoding.
  qual_thres = ' '
  if args.qual_format == 'sanger':
    qual_thres = chr(args.qual + 33)
  else:
    fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format))
  # Check arguments.
  if not (args.seqs or args.input):
    fail('Error: You must provide sequences either in a file with --input or as arguments.')
  elif args.seqs and args.input:
    fail('Error: You cannot provide sequences in both a file and command-line arguments.')
  if args.format == 'duplex' and not (args.mate and args.order):
    fail('Error: If the --format is duplex, you must specify a --mate and --order.')

  # Read input.
  quals = []
  if args.input:
    if args.format == 'plain':
      if args.input == '-':
        seqs = [line.strip() for line in sys.stdin]
      else:
        with open(args.input) as infile:
          seqs = [line.strip() for line in infile]
    elif args.format == 'duplex':
      if args.input == '-':
        (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order)
      else:
        with open(args.input) as infile:
          (seqs, quals) = parse_duplex(infile, args.mate, args.order)
  else:
    seqs = args.seqs

  align = make_msa(seqs)
  if quals:
    # Give the quality strings the same gaps the alignment introduced into the sequences.
    quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ')
  cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True)

  output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres))

  for seq in output:
    # Parenthesized single-argument form: valid and equivalent on both Python 2 and 3.
    print(seq)