Example 1
def main(argv):

    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    if args.help:
        parser.print_help()
        return 0

    logging.basicConfig(stream=args.log,
                        level=args.volume,
                        format='%(message)s')
    tone_down_logger()

    start_time = time.time()
    # If the user requested, report back some data about the start of the run.
    if args.phone_home:
        call = phone.Call(__file__,
                          version.get_version(),
                          platform=args.platform,
                          test=args.test,
                          fail='warn')
        call.send_data('start')
        data = {
            'stdin': args.infile is sys.stdin,
            'processes': args.processes,
            'queue_size': args.queue_size,
        }
        if data['stdin']:
            data['input_size'] = None
        else:
            data['input_size'] = os.path.getsize(args.infile.name)
        call.send_data('prelim', run_data=data)
    else:
        call = None

    # Execute as much of the script as possible in a try/except to catch any exception that occurs
    # and report it via ET.phone.
    try:
        # Process and validate arguments.
        if args.queue_size is not None and args.queue_size <= 0:
            fail('Error: --queue-size must be greater than zero.')
        qual_start = QUAL_OFFSETS[args.qual_format]
        qual_thres = chr(args.qual + qual_start)
        if args.fastq_out is None:
            # Output FASTA.
            output_qual = None
        else:
            # Output FASTQ.
            if qual_start + args.fastq_out > 126:
                fail(
                    'Error: --fastq-out PHRED score ({}) is too large.'.format(
                        args.fastq_out))
            output_qual = chr(qual_start + args.fastq_out)
        if args.min_cons_reads > args.min_reads:
            fail(
                'Error: --min-reads must be greater than or equal to --min-cons-reads (or you\'ll have a lot of '
                'consensus sequences with only N\'s!). If you want to exclude families with fewer than X '
                'reads, give --min-reads X instead of --min-cons-reads X.')
        if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
            fail('Error: must specify an output file!')
        # A dict of output filehandles.
        # Indexed so we can do filehandles['dcs'][mate].
        filehandles = {
            'dcs': (args.dcs1, args.dcs2),
            'sscs': (args.sscs1, args.sscs2),
        }

        # Open a pool of worker processes.
        stats = {'time': 0, 'reads': 0, 'runs': 0, 'duplexes': 0}
        static_kwargs = {
            'min_reads': args.min_reads,
            'cons_thres': args.cons_thres,
            'min_cons_reads': args.min_cons_reads,
            'qual_thres': qual_thres,
            'output_qual': output_qual,
        }
        pool = parallel_tools.SyncAsyncPool(
            process_duplex,
            processes=args.processes,
            static_kwargs=static_kwargs,
            queue_size=args.queue_size,
            callback=process_result,
            callback_args=[filehandles, stats],
        )
        try:
            process_families(args.infile, pool, stats)
        finally:
            # If the root process encounters an exception and doesn't tell the workers to stop, it will
            # hang forever.
            pool.close()
            pool.join()
            # Close all open filehandles.
            if args.infile is not sys.stdin:
                args.infile.close()
            for fh_group in filehandles.values():
                for fh in fh_group:
                    if fh:
                        fh.close()

        # Final stats on the run.
        run_time = int(time.time() - start_time)
        max_mem = get_max_mem()
        logging.info(
            'Processed {} reads and {} duplexes in {} seconds.'.format(
                stats['total_reads'], stats['runs'], run_time))
        if stats['reads'] > 0 and stats['runs'] > 0:
            per_read = stats['time'] / stats['reads']
            per_run = stats['time'] / stats['runs']
            logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(
                per_read, per_run))
        logging.info('in {}s total time and {:0.2f}MB RAM.'.format(
            run_time, max_mem))

    except (Exception, KeyboardInterrupt) as exception:
        if args.phone_home and call:
            try:
                exception_data = getattr(exception, 'child_context',
                                         parallel_tools.get_exception_data())
                logging.critical(
                    parallel_tools.format_traceback(exception_data))
                exception_data = parallel_tools.scrub_tb_paths(
                    exception_data, script_path=__file__)
            except Exception:
                exception_data = {}
            run_time = int(time.time() - start_time)
            try:
                run_data = get_run_data(stats, pool)
            except (Exception, UnboundLocalError):
                run_data = {}
            try:
                run_data['mem'] = get_max_mem()
            except Exception:
                pass
            run_data['failed'] = True
            if exception_data:
                run_data['exception'] = exception_data
            call.send_data('end', run_time=run_time, run_data=run_data)
            raise exception
        else:
            raise

    if args.phone_home and call:
        run_data = get_run_data(stats, pool, max_mem)
        call.send_data('end', run_time=run_time, run_data=run_data)
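
A note on the quality arithmetic above: qual_start = QUAL_OFFSETS[args.qual_format] and chr(args.qual + qual_start) convert a numeric PHRED score into the single ASCII character used in FASTQ quality strings, which is why main() rejects qual_start + args.fastq_out > 126 (the last printable ASCII code point). A minimal sketch of the conversion, assuming the conventional offsets for the sanger and solexa encodings (the script's actual QUAL_OFFSETS constant is defined elsewhere and may differ):

# Sketch only: assumed offsets, not necessarily the script's actual QUAL_OFFSETS.
QUAL_OFFSETS = {'sanger': 33, 'solexa': 64}

def phred_to_char(score, qual_format='sanger'):
    """Encode a numeric PHRED score as a FASTQ quality character."""
    encoded = score + QUAL_OFFSETS[qual_format]
    if encoded > 126:  # chr() beyond 126 leaves printable ASCII.
        raise ValueError('PHRED score {} is too large.'.format(score))
    return chr(encoded)

# E.g. a Sanger-encoded threshold of 25 becomes chr(25 + 33) == ':'.
assert phred_to_char(25) == ':'
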
Example 2
def main(argv):

  parser = make_argparser()
  args = parser.parse_args(argv[1:])

  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')

    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')

    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(
      process_duplex, processes=args.processes, static_kwargs={'aligner':args.aligner},
      queue_size=args.queue_size, callback=process_result, callback_args=[stats]
    )

    try:
      # The main loop.
      align_families(args.infile, pool, stats, check_ids=args.check_ids)
    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
      # Close input filehandle if it's open.
      if args.infile is not sys.stdin:
        args.infile.close()

    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error(
      'Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment failures.'
      .format(**stats)
    )
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error(f'{per_pair:0.3f}s per pair, {per_run:0.3f}s per run.')
    logging.error(f'in {run_time}s total time and {max_mem:0.2f}MB RAM.')

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
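
get_max_mem() is defined elsewhere in the project. Example 5 below computes the same figure inline as resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024, so the helper is presumably a thin wrapper along these lines (a sketch under that assumption, not the project's actual definition):

import resource

def get_max_mem():
    """Return this process's peak resident set size, in MB.
    Note: ru_maxrss is in kilobytes on Linux but bytes on macOS, so the
    /1024 scaling (matching Example 5's inline version) assumes Linux."""
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
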
Example 3
def main(argv):

  parser = make_argparser()
  args = parser.parse_args(argv[1:])
  if args.help:
    parser.print_help()
    return 0

  logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  else:
    call = None

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    # Process and validate arguments.
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')
    qual_start = QUAL_OFFSETS[args.qual_format]
    qual_thres = chr(args.qual + qual_start)
    if args.fastq_out is None:
      # Output FASTA.
      output_qual = None
    else:
      # Output FASTQ.
      if qual_start+args.fastq_out > 126:
        fail('Error: --fastq-out PHRED score ({}) is too large.'.format(args.fastq_out))
      output_qual = chr(qual_start+args.fastq_out)
    if args.min_cons_reads > args.min_reads:
      fail('Error: --min-reads must be greater than or equal to --min-cons-reads (or you\'ll have a lot of '
           'consensus sequences with only N\'s!). If you want to exclude families with fewer than X '
           'reads, give --min-reads X instead of --min-cons-reads X.')
    if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
      fail('Error: must specify an output file!')
    # A dict of output filehandles.
    # Indexed so we can do filehandles['dcs'][mate].
    filehandles = {
      'dcs': (args.dcs1, args.dcs2),
      'sscs': (args.sscs1, args.sscs2),
    }

    # Open a pool of worker processes.
    stats = {'time':0, 'reads':0, 'runs':0, 'duplexes':0}
    static_kwargs = {
      'min_reads': args.min_reads,
      'cons_thres': args.cons_thres,
      'min_cons_reads': args.min_cons_reads,
      'qual_thres': qual_thres,
      'output_qual': output_qual,
    }
    pool = parallel_tools.SyncAsyncPool(process_duplex,
                                        processes=args.processes,
                                        static_kwargs=static_kwargs,
                                        queue_size=args.queue_size,
                                        callback=process_result,
                                        callback_args=[filehandles, stats],
                                       )
    try:
      total_reads = 0
      duplex = collections.OrderedDict()
      family = []
      barcode = None
      order = None
      # Note: mate is a 0-indexed integer ("mate 1" from the input file is mate 0 here).
      mate = None
      for line in args.infile:
        # Allow comments (e.g. for test input files).
        if line.startswith('#'):
          continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 6:
          continue
        this_barcode, this_order, this_mate, name, seq, qual = fields
        this_mate = int(this_mate)-1
        # If the barcode, order, and mate are the same, we're just continuing to add reads to the
        # current family. Otherwise, store the current family, start a new one, and process the
        # duplex if we're at the end of one.
        new_barcode = this_barcode != barcode
        new_order = this_order != order
        new_mate = this_mate != mate
        if new_barcode or new_order or new_mate:
          if order is not None and mate is not None:
            duplex[(order, mate)] = family
          # If the barcode changed, process the last duplex and start a new one.
          if new_barcode and barcode is not None:
            assert len(duplex) <= 4, duplex.keys()
            pool.compute(duplex, barcode)
            stats['duplexes'] += 1
            duplex = collections.OrderedDict()
          barcode = this_barcode
          order = this_order
          mate = this_mate
          family = []
        read = {'name': name, 'seq':seq, 'qual':qual}
        family.append(read)
        total_reads += 1
      # Process the last family (unless the input was empty).
      if order is not None and mate is not None:
        duplex[(order, mate)] = family
        assert len(duplex) <= 4, duplex.keys()
        pool.compute(duplex, barcode)
        stats['duplexes'] += 1

      # Retrieve the remaining results.
      logging.info('Flushing remaining results from worker processes..')
      pool.flush()

    finally:
      # If the root process encounters an exception and doesn't tell the workers to stop, it will
      # hang forever.
      pool.close()
      pool.join()
      # Close all open filehandles.
      if args.infile is not sys.stdin:
        args.infile.close()
      for fh_group in filehandles.values():
        for fh in fh_group:
          if fh:
            fh.close()

    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.info('Processed {} reads and {} duplexes in {} seconds.'
                 .format(total_reads, stats['runs'], run_time))
    if stats['reads'] > 0 and stats['runs'] > 0:
      per_read = stats['time'] / stats['reads']
      per_run = stats['time'] / stats['runs']
      logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(per_read, per_run))
    logging.info('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home and call:
    run_data = get_run_data(stats, pool, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
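
The inner loop in Example 3 groups consecutive input lines that share a (barcode, order, mate) key, closing out the accumulated family whenever the key changes and dispatching a whole duplex whenever the barcode changes. For intuition, the family-grouping step alone can be recast with itertools.groupby; this is an illustrative sketch of the same 6-column format, not code from the project:

import itertools

def parse_reads(infile):
    """Yield (barcode, order, mate, read) records from the 6-column input."""
    for line in infile:
        if line.startswith('#'):  # allow comments, as in Example 3
            continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 6:
            continue
        barcode, order, mate, name, seq, qual = fields
        # "mate 1" in the file becomes the 0-indexed integer 0 here.
        yield barcode, order, int(mate) - 1, {'name': name, 'seq': seq, 'qual': qual}

def families(infile):
    """Group consecutive reads sharing (barcode, order, mate) into families."""
    for key, group in itertools.groupby(parse_reads(infile), key=lambda rec: rec[:3]):
        barcode, order, mate = key
        yield barcode, order, mate, [rec[3] for rec in group]
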
Example 4
def main(argv):

  parser = make_argparser()
  args = parser.parse_args(argv[1:])

  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')

    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')

    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(process_duplex,
                                        processes=args.processes,
                                        static_kwargs={'aligner':args.aligner},
                                        queue_size=args.queue_size,
                                        callback=process_result,
                                        callback_args=[stats],
                                       )
    """Now the main loop.
    This processes a whole duplex (both strands) at a time, to leave room for a future option of
    aligning the whole duplex at once.
    duplex data structure:
    duplex = {
      'ab': [
        {'name1': 'read_name1a',
         'seq1':  'GATT-ACA',
         'qual1': 'sc!0 /J*',
         'name2': 'read_name1b',
         'seq2':  'ACTGACTA',
         'qual2': '34I&SDF)'
        },
        {'name1': 'read_name2a',
         ...
        },
        ...
      ],
      'ba': [
        ...
      ]
    }
    e.g.:
    seq = duplex[order][pair_num]['seq1']"""

    try:
      duplex = collections.OrderedDict()
      family = []
      barcode = None
      order = None
      for line in args.infile:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 8:
          continue
        (this_barcode, this_order, name1, seq1, qual1, name2, seq2, qual2) = fields
        # If the barcode or order has changed, we're in a new family.
        # Process the reads we've previously gathered as one family and start a new family.
        if this_barcode != barcode or this_order != order:
          duplex[order] = family
          # If the barcode is different, we're at the end of the whole duplex. Process it and start
          # a new one. If the barcode is the same, we're in the same duplex, but we've switched strands.
          if this_barcode != barcode:
            # logging.debug('processing {}: {} orders ({})'.format(barcode, len(duplex),
            #               '/'.join([str(len(duplex[o])) for o in duplex])))
            if barcode is not None:
              pool.compute(duplex, barcode)
              stats['duplexes'] += 1
            duplex = collections.OrderedDict()
          barcode = this_barcode
          order = this_order
          family = []
        pair = {'name1': name1, 'seq1':seq1, 'qual1':qual1, 'name2':name2, 'seq2':seq2, 'qual2':qual2}
        family.append(pair)
        stats['pairs'] += 1
      # Process the last family (unless the input was empty).
      if barcode is not None:
        duplex[order] = family
        # logging.debug('processing {}: {} orders ({}) [last]'.format(barcode, len(duplex),
        #               '/'.join([str(len(duplex[o])) for o in duplex])))
        pool.compute(duplex, barcode)
        stats['duplexes'] += 1

      # Retrieve the remaining results.
      logging.info('Flushing remaining results from worker processes..')
      pool.flush()

    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
      # Close input filehandle if it's open.
      if args.infile is not sys.stdin:
        args.infile.close()

    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error('Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment '
                  'failures.'.format(**stats))
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error('{:0.3f}s per pair, {:0.3f}s per run.'.format(per_pair, per_run))
    logging.error('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
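
SyncAsyncPool comes from the project's own parallel_tools module, so its exact semantics aren't shown in these examples. Judging only by how it's used here (fixed static_kwargs, per-item arguments passed to compute(), a callback that also receives callback_args, and flush()/close()/join()), a rough stdlib analogue of the pattern might look like the following. This is for intuition only; it ignores queue_size backpressure and whatever synchronous/asynchronous switching the real class provides:

import functools
import multiprocessing

class CallbackPool:
    """Sketch of the assumed SyncAsyncPool usage pattern, via multiprocessing.Pool."""

    def __init__(self, worker, processes, static_kwargs, callback, callback_args):
        self.pool = multiprocessing.Pool(processes=processes)
        # Bake the per-run constants into the worker; worker must be picklable.
        self.func = functools.partial(worker, **static_kwargs)
        self.callback = callback
        self.callback_args = callback_args
        self.pending = []

    def compute(self, *args):
        # Submit one work item; results are collected in flush().
        self.pending.append(self.pool.apply_async(self.func, args))

    def flush(self):
        for result in self.pending:
            self.callback(result.get(), *self.callback_args)
        self.pending = []

    def close(self):
        self.pool.close()

    def join(self):
        self.pool.join()
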
Example 5
def main(argv):

  # Allow using -v for --version if it's the only argument, and --verbose if there are more.
  if len(argv) == 2 and argv[1] == '-v':
    print(version.get_version())
    return

  parser = make_argparser()
  args = parser.parse_args(argv[1:])

  logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    call.send_data('prelim', run_data=gather_prelim_data(args.families, args.reads, args.sam))

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    logging.info('Reading the fasta/q to map read names to barcodes..')
    names_to_barcodes = map_names_to_barcodes(args.reads, args.limit)

    logging.info('Reading the SAM to build the graph of barcode relationships..')
    graph, reversed_barcodes, num_good_alignments = read_alignments(args.sam, names_to_barcodes,
                                                                    args.pos, args.mapq,
                                                                    args.dist, args.limit)

    logging.info('Reading the families.tsv to get the counts of each family..')
    family_counts, read_pairs = get_family_counts(args.families, limit=args.limit,
                                                  check_ids=args.check_ids)

    if args.structures or args.visualize != 0:
      logging.info('Counting the unique barcode networks..')
      structures = count_structures(graph, family_counts)
      if args.structures:
        print_structures(structures, args.struct_human)
      if args.visualize != 0:
        logging.info('Generating a visualization of barcode networks..')
        visualize([s['graph'] for s in structures], args.visualize, args.viz_format)

    logging.info('Building the correction table from the graph..')
    corrections = make_correction_table(graph, family_counts, args.choose_by)

    logging.info('Reading the families.tsv again to print corrected output..')
    with open_as_text_or_gzip(args.families.name) as families:
      print_corrected_output(families, corrections, reversed_barcodes, args.prepend, args.limit,
                             args.output)

    run_time = int(time.time() - start_time)
    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024
    logging.info('Max memory usage: {:0.2f}MB'.format(max_mem))
    logging.info('Wall clock time:  {} seconds'.format(run_time))

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = {'barcodes':len(names_to_barcodes), 'good_alignments':num_good_alignments,
                    'read_pairs':read_pairs, 'max_mem':int(max_mem)}
      except Exception:
        run_data = {}
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home:
    run_data = {'barcodes':len(names_to_barcodes), 'good_alignments':num_good_alignments,
                'read_pairs':read_pairs, 'max_mem':int(max_mem)}
    call.send_data('end', run_time=run_time, run_data=run_data)
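
open_as_text_or_gzip() (used above to re-read the families file) isn't defined in these examples. A hypothetical implementation consistent with its name, sniffing the two gzip magic bytes, might be:

import gzip

def open_as_text_or_gzip(path):
    """Hypothetical helper: open a file as text, transparently gunzipping if needed."""
    with open(path, 'rb') as raw:
        magic = raw.read(2)
    if magic == b'\x1f\x8b':  # gzip magic number
        return gzip.open(path, 'rt')
    return open(path, 'r')
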
Example 6
def main(argv):

    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    if args.help:
        parser.print_help()
        return 0

    logging.basicConfig(stream=args.log,
                        level=args.volume,
                        format='%(message)s')
    tone_down_logger()

    start_time = time.time()
    # If the user requested, report back some data about the start of the run.
    if args.phone_home:
        call = phone.Call(__file__,
                          version.get_version(),
                          platform=args.platform,
                          test=args.test,
                          fail='warn')
        call.send_data('start')
        data = {
            'stdin': args.infile is sys.stdin,
            'processes': args.processes,
            'queue_size': args.queue_size,
        }
        if data['stdin']:
            data['input_size'] = None
        else:
            data['input_size'] = os.path.getsize(args.infile.name)
        call.send_data('prelim', run_data=data)
    else:
        call = None

    # Execute as much of the script as possible in a try/except to catch any exception that occurs
    # and report it via ET.phone.
    try:
        # Process and validate arguments.
        if args.queue_size is not None and args.queue_size <= 0:
            fail('Error: --queue-size must be greater than zero.')
        qual_start = QUAL_OFFSETS[args.qual_format]
        qual_thres = chr(args.qual + qual_start)
        if args.fastq_out is None:
            # Output FASTA.
            output_qual = None
        else:
            # Output FASTQ.
            if qual_start + args.fastq_out > 126:
                fail(
                    'Error: --fastq-out PHRED score ({}) is too large.'.format(
                        args.fastq_out))
            output_qual = chr(qual_start + args.fastq_out)
        if args.min_cons_reads > args.min_reads:
            fail(
                'Error: --min-reads must be greater than or equal to --min-cons-reads (or you\'ll have a lot of '
                'consensus sequences with only N\'s!). If you want to exclude families with fewer than X '
                'reads, give --min-reads X instead of --min-cons-reads X.')
        if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
            fail('Error: must specify an output file!')
        # A dict of output filehandles.
        # Indexed so we can do filehandles['dcs'][mate].
        filehandles = {
            'dcs': (args.dcs1, args.dcs2),
            'sscs': (args.sscs1, args.sscs2),
        }

        # Open a pool of worker processes.
        stats = {'time': 0, 'reads': 0, 'runs': 0, 'duplexes': 0}
        static_kwargs = {
            'min_reads': args.min_reads,
            'cons_thres': args.cons_thres,
            'min_cons_reads': args.min_cons_reads,
            'qual_thres': qual_thres,
            'output_qual': output_qual,
        }
        pool = parallel_tools.SyncAsyncPool(
            process_duplex,
            processes=args.processes,
            static_kwargs=static_kwargs,
            queue_size=args.queue_size,
            callback=process_result,
            callback_args=[filehandles, stats],
        )
        try:
            total_reads = 0
            duplex = collections.OrderedDict()
            family = []
            barcode = None
            order = None
            # Note: mate is a 0-indexed integer ("mate 1" from the input file is mate 0 here).
            mate = None
            for line in args.infile:
                # Allow comments (e.g. for test input files).
                if line.startswith('#'):
                    continue
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) != 6:
                    continue
                this_barcode, this_order, this_mate, name, seq, qual = fields
                this_mate = int(this_mate) - 1
                # If the barcode, order, and mate are the same, we're just continuing to add reads to the
                # current family. Otherwise, store the current family, start a new one, and process the
                # duplex if we're at the end of one.
                new_barcode = this_barcode != barcode
                new_order = this_order != order
                new_mate = this_mate != mate
                if new_barcode or new_order or new_mate:
                    if order is not None and mate is not None:
                        duplex[(order, mate)] = family
                    # If the barcode changed, process the last duplex and start a new one.
                    if new_barcode and barcode is not None:
                        assert len(duplex) <= 4, duplex.keys()
                        pool.compute(duplex, barcode)
                        stats['duplexes'] += 1
                        duplex = collections.OrderedDict()
                    barcode = this_barcode
                    order = this_order
                    mate = this_mate
                    family = []
                read = {'name': name, 'seq': seq, 'qual': qual}
                family.append(read)
                total_reads += 1
            # Process the last family (unless the input was empty).
            if order is not None and mate is not None:
                duplex[(order, mate)] = family
                assert len(duplex) <= 4, duplex.keys()
                pool.compute(duplex, barcode)
                stats['duplexes'] += 1

            # Retrieve the remaining results.
            logging.info('Flushing remaining results from worker processes..')
            pool.flush()

        finally:
            # If the root process encounters an exception and doesn't tell the workers to stop, it will
            # hang forever.
            pool.close()
            pool.join()
            # Close all open filehandles.
            if args.infile is not sys.stdin:
                args.infile.close()
            for fh_group in filehandles.values():
                for fh in fh_group:
                    if fh:
                        fh.close()

        # Final stats on the run.
        run_time = int(time.time() - start_time)
        max_mem = get_max_mem()
        logging.info(
            'Processed {} reads and {} duplexes in {} seconds.'.format(
                total_reads, stats['runs'], run_time))
        if stats['reads'] > 0 and stats['runs'] > 0:
            per_read = stats['time'] / stats['reads']
            per_run = stats['time'] / stats['runs']
            logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(
                per_read, per_run))
        logging.info('in {}s total time and {:0.2f}MB RAM.'.format(
            run_time, max_mem))

    except (Exception, KeyboardInterrupt) as exception:
        if args.phone_home and call:
            try:
                exception_data = getattr(exception, 'child_context',
                                         parallel_tools.get_exception_data())
                logging.critical(
                    parallel_tools.format_traceback(exception_data))
                exception_data = parallel_tools.scrub_tb_paths(
                    exception_data, script_path=__file__)
            except Exception:
                exception_data = {}
            run_time = int(time.time() - start_time)
            try:
                run_data = get_run_data(stats, pool)
            except (Exception, UnboundLocalError):
                run_data = {}
            try:
                run_data['mem'] = get_max_mem()
            except Exception:
                pass
            run_data['failed'] = True
            if exception_data:
                run_data['exception'] = exception_data
            call.send_data('end', run_time=run_time, run_data=run_data)
            raise exception
        else:
            raise

    if args.phone_home and call:
        run_data = get_run_data(stats, pool, max_mem)
        call.send_data('end', run_time=run_time, run_data=run_data)
Esempio n. 7
0
def main(argv):

  parser = make_argparser()
  args = parser.parse_args(argv[1:])

  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')

    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')

    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(process_duplex,
                                        processes=args.processes,
                                        static_kwargs={'aligner':args.aligner},
                                        queue_size=args.queue_size,
                                        callback=process_result,
                                        callback_args=[stats],
                                       )
    """Now the main loop.
    This processes a whole duplex (both strands) at a time, to leave room for a future option of
    aligning the whole duplex at once.
    duplex data structure:
    duplex = {
      'ab': [
        {'name1': 'read_name1a',
         'seq1':  'GATT-ACA',
         'qual1': 'sc!0 /J*',
         'name2': 'read_name1b',
         'seq2':  'ACTGACTA',
         'qual2': '34I&SDF)'
        },
        {'name1': 'read_name2a',
         ...
        },
        ...
      ],
      'ba': [
        ...
      ]
    }
    e.g.:
    seq = duplex[order][pair_num]['seq1']"""

    try:
      duplex = collections.OrderedDict()
      family = []
      barcode = None
      order = None
      for line in args.infile:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 8:
          continue
        (this_barcode, this_order, name1, seq1, qual1, name2, seq2, qual2) = fields
        if args.check_ids and not read_ids_match(name1, name2):
          raise ValueError('Read names "{}" and "{}" do not match.'.format(name1, name2))
        # If the barcode or order has changed, we're in a new family.
        # Process the reads we've previously gathered as one family and start a new family.
        if this_barcode != barcode or this_order != order:
          duplex[order] = family
          # If the barcode is different, we're at the end of the whole duplex. Process it and start
          # a new one. If the barcode is the same, we're in the same duplex, but we've switched strands.
          if this_barcode != barcode:
            # logging.debug('processing {}: {} orders ({})'.format(barcode, len(duplex),
            #               '/'.join([str(len(duplex[o])) for o in duplex])))
            if barcode is not None:
              pool.compute(duplex, barcode)
              stats['duplexes'] += 1
            duplex = collections.OrderedDict()
          barcode = this_barcode
          order = this_order
          family = []
        pair = {'name1': name1, 'seq1':seq1, 'qual1':qual1, 'name2':name2, 'seq2':seq2, 'qual2':qual2}
        family.append(pair)
        stats['pairs'] += 1
      # Process the last family (unless the input was empty).
      if barcode is not None:
        duplex[order] = family
        # logging.debug('processing {}: {} orders ({}) [last]'.format(barcode, len(duplex),
        #               '/'.join([str(len(duplex[o])) for o in duplex])))
        pool.compute(duplex, barcode)
        stats['duplexes'] += 1

      # Retrieve the remaining results.
      logging.info('Flushing remaining results from worker processes..')
      pool.flush()

    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
      # Close input filehandle if it's open.
      if args.infile is not sys.stdin:
        args.infile.close()

    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error('Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment '
                  'failures.'.format(**stats))
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error('{:0.3f}s per pair, {:0.3f}s per run.'.format(per_pair, per_run))
    logging.error('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
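
Example 7's check_ids pass relies on read_ids_match(), which isn't shown. A hypothetical implementation consistent with how it's called, treating paired read names as matching when they differ only by an Illumina-style trailing /1 or /2 mate suffix:

def read_ids_match(name1, name2):
    """Hypothetical: True if two paired read names refer to the same fragment."""
    def strip_mate(name):
        # Drop any description after the first whitespace, then the mate suffix.
        root = name.split(None, 1)[0] if name.strip() else name
        if root.endswith('/1') or root.endswith('/2'):
            root = root[:-2]
        return root
    return strip_mate(name1) == strip_mate(name2)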