Esempio n. 1
0
def build_parser(parser):
    """Register this subcommand's command-line arguments on ``parser``.

    Adds the positional input CSV, whatever
    ``taxtastic.utils.add_database_args`` contributes, and the options
    controlling output and the handling of unknown tax_ids.

    Args:
        parser: argparse parser (or subparser) to populate. It is rebound
            to the return value of ``add_database_args``, so the options
            declared after that call are attached to that object.

    Returns:
        None.
    """
    parser.add_argument(
        'infile',
        # Plain 'r' rather than 'rU': the 'U' flag is rejected by the
        # builtin open() from Python 3.11 on, and text mode already
        # translates newlines.
        # NOTE(review): assumes Opener forwards the mode to open() -- confirm.
        type=Opener('r'),
        help=
        ('Input CSV file to process, minimally containing the field `tax_id`. '
         'Use "-" for stdin.'))
    parser = taxtastic.utils.add_database_args(parser)
    parser.add_argument(
        '-o',
        '--outfile',
        default=sys.stdout,
        type=Opener('wt'),
        help='Modified version of input file [default: stdout]')
    parser.add_argument(
        '--taxid-column',
        default='tax_id',
        help='name of column containing tax_ids to be replaced [%(default)s]')
    parser.add_argument(
        '--unknowns',
        type=Opener('wt'),
        help=('optional output file containing rows with unknown tax_ids '
              'having no replacements in merged table'))
    parser.add_argument(
        '-a',
        '--unknown-action',
        choices=['drop', 'ignore', 'error'],
        default='error',
        help=('action to perform for tax_ids with no replacement '
              'in merged table [%(default)s]'))
Esempio n. 2
0
def build_parser(parser):
    """Attach this subcommand's arguments to ``parser``.

    The parser is first passed through ``add_database_args`` and rebound
    to its return value; a required ``source_name`` positional and an
    ``-o/--outfile`` option (defaulting to ``sys.stdout``) are then added.

    Args:
        parser: argparse parser (or subparser) to populate.
    """
    parser = add_database_args(parser)

    source_help = 'name of source identifying names and nodes to extract'
    parser.add_argument('source_name', help=source_help)

    outfile_options = dict(type=Opener('w'), default=sys.stdout)
    parser.add_argument('-o', '--outfile', **outfile_options)
Esempio n. 3
0
def main(arguments):
    """Merge two alignments with an external tool and split the result.

    Runs ``args.binary`` (default ``esl-alimerge``) on the two input files,
    asking for ``afa`` output in a temporary file, then writes every merged
    sequence to ``fasta_1`` when its id appeared in ``sto_1`` and to
    ``fasta_2`` otherwise.

    Args:
        arguments: list of command-line argument strings (without the
            program name).

    Returns:
        None on success. If the external binary exits with a non-zero
        status, a message is written to stderr and that status is
        returned without writing any sequences.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('--binary', default='esl-alimerge')

    inputs = parser.add_argument_group('input files')
    inputs.add_argument('sto_1', type=Opener('r'))
    inputs.add_argument('sto_2', type=Opener('r'))

    outputs = parser.add_argument_group('output files')
    outputs.add_argument('fasta_1', type=Opener('w'))
    outputs.add_argument('fasta_2', type=Opener('w'))

    args = parser.parse_args(arguments)

    # First whitespace-delimited token of every non-blank line that does
    # not start with '#'; used below to decide which output a merged
    # sequence belongs to.
    input1_names = {
        line.split()[0]
        for line in args.sto_1 if line.strip() and not line.startswith('#')
    }

    with tempfile.NamedTemporaryFile('w+t', dir='.') as tf:
        cmd = [
            args.binary, '--dna', '--outformat', 'afa', '-o', tf.name,
            args.sto_1.name, args.sto_2.name
        ]
        sys.stderr.write(' '.join(cmd) + '\n')

        # Check the exit status: a failed merge would otherwise be
        # indistinguishable from a merge that produced no sequences.
        returncode = subprocess.call(cmd)
        if returncode != 0:
            sys.stderr.write('{} exited with status {}\n'.format(
                args.binary, returncode))
            return returncode

        for seq in fastalite(tf):
            destination = (args.fasta_1
                           if seq.id in input1_names else args.fasta_2)
            destination.write(reformat(seq))
Esempio n. 4
0
def build_parser(parser):
    """Attach this subcommand's arguments to ``parser``.

    The parser is first passed through ``add_database_args`` and rebound
    to its return value; the ``new_nodes`` positional (a file opened for
    reading) and the optional ``--source-name`` are then added.

    Args:
        parser: argparse parser (or subparser) to populate.
    """
    parser = add_database_args(parser)

    nodes_file_opener = Opener('r')
    parser.add_argument(
        'new_nodes',
        metavar='FILE',
        type=nodes_file_opener,
        help='yaml file specifying new nodes',
    )

    # The continuation lines keep their original indentation because it is
    # part of the string literal.
    source_name_help = """Provides the default source name for new nodes.  The
              value is overridden by "source_name" in the input
              file. If not provided, "source_name" is required in each
              node or name definition. This source name is created if
              it does not exist."""
    parser.add_argument(
        '--source-name',
        dest='source_name',
        help=source_name_help,
    )
Esempio n. 5
0
def main(arguments):
    """Count the records in a fastq file and write one CSV row of totals.

    The row holds the input file's name followed by the record count
    written twice.

    Args:
        arguments: list of command-line argument strings (without the
            program name).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument(
        'fastq',
        type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='reads to count in fastq format')
    parser.add_argument(
        'read_counts',
        type=argparse.FileType('w'),
        metavar='FILE',
        help='tabulate read counts and store as a CSV')
    args = parser.parse_args(arguments)

    # Consume the record iterator, tallying as we go.
    n_reads = 0
    for _ in fastqlite(args.fastq):
        n_reads += 1

    row = [args.fastq.name, n_reads, n_reads]
    csv.writer(args.read_counts).writerow(row)
Esempio n. 6
0
def main(arguments=None):
    """Command-line entry point: filter reads by the most common barcode.

    Reads one or two index files, tallies the barcode sequences among the
    first ``--snifflimit`` records, and logs the most common one. With
    ``--show-counts`` the tally is printed and the function returns.
    Otherwise the reads given with ``-f/--fastq`` are passed through the
    module's ``filter`` helper and written to ``--outfile``.

    Args:
        arguments: list of command-line argument strings, or None to let
            argparse read ``sys.argv``.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when more than two index files are
            given, when the index input contains no records, when
            ``--strict`` is set and the most common barcode falls below
            ``--min-pct-assignment``, or when ``--fastq`` is missing.
    """
    parser = argparse.ArgumentParser(
        prog='barcodecop',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'index',
        nargs='+',
        type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument('-f',
                        '--fastq',
                        type=Opener(),
                        metavar='file.fastq[.bz2|.gz]',
                        help='reads to filter in fastq format')
    parser.add_argument('-o',
                        '--outfile',
                        default=sys.stdout,
                        type=Opener('w'),
                        help='output fastq')
    parser.add_argument(
        '--snifflimit',
        type=int,
        default=10000,
        metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument('--head',
                        type=int,
                        metavar='N',
                        help='limit the output file to N records')
    parser.add_argument(
        '--min-pct-assignment',
        type=float,
        default=90.0,
        metavar='PERCENT',
        help=("""warn (or fail with an error; see --strict) if the
               most common barcode represents less than PERCENT of the
               total [%(default)s]"""))
    parser.add_argument(
        '--strict',
        action='store_true',
        default=False,
        help=("""fail if conditions of --min-pct-assignment are not met"""))
    parser.add_argument(
        '--invert',
        action='store_true',
        default=False,
        help='include only sequences *not* matching the most common barcode')
    parser.add_argument('-c',
                        '--show-counts',
                        action='store_true',
                        default=False,
                        help='tabulate barcode counts and exit')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        default=False,
                        help='minimize messages to stderr')
    parser.add_argument('-V',
                        '--version',
                        action=VersionAction,
                        version=__version__,
                        help='Print the version number and exit')

    args = parser.parse_args(arguments)

    logging.basicConfig(format='%(message)s',
                        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
    elif len(args.index) == 2:
        bcseqs = combine_dual_indices(*args.index)
    else:
        # nargs='+' admits any number of files; stop here, otherwise
        # ``bcseqs`` is never bound and the tee() below fails with
        # UnboundLocalError.
        log.error('error: please specify either one or two index files')
        sys.exit(1)

    # bc1 is consumed to find the most common barcode; bc2 is handed to
    # the filter so the index records can be traversed a second time.
    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode
    barcode_counts = Counter(
        [str(seq.seq) for seq in islice(bc1, args.snifflimit)])
    ranked = barcode_counts.most_common()
    if not ranked:
        # Unpacking zip(*[]) would raise ValueError on empty input.
        log.error('error: no barcodes found in the index file(s)')
        sys.exit(1)
    barcodes, counts = zip(*ranked)

    most_common_bc = barcodes[0]
    total = sum(counts)
    most_common_pct = 100 * float(counts[0]) / total
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], total, most_common_pct))

    if args.show_counts:
        for bc, count in ranked:
            print('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc), count))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    seqs = fastqlite(args.fastq)
    # NOTE: ``filter`` is called with four arguments, so it has to be a
    # helper defined elsewhere in this module rather than the two-argument
    # builtin.
    filtered = islice(filter(bc2, seqs, most_common_bc, args.invert),
                      args.head)

    for seq in filtered:
        args.outfile.write(as_fastq(seq))
Esempio n. 7
0
def main(arguments=None):
    """Command-line entry point: filter reads by barcode match and quality.

    Reads one or two index files, tallies the barcode sequences among the
    first ``--snifflimit`` records, and logs the most common one. With
    ``--show-counts`` the tally is printed and the function returns.
    Otherwise the reads given with ``-f/--fastq`` are paired with the
    index records, optionally filtered (``--match-filter``,
    ``--qual-filter``; inverted by ``--invert``), and written to
    ``--outfile``.

    Args:
        arguments: list of command-line argument strings, or None to let
            argparse read ``sys.argv``.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when more than two index files are
            given, when the index input contains no records, when
            ``--strict`` is set and the most common barcode falls below
            ``--min-pct-assignment``, or when ``--fastq`` is missing.
    """
    parser = argparse.ArgumentParser(
        prog='barcodecop',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'index',
        nargs='+',
        type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument('-f',
                        '--fastq',
                        type=Opener(),
                        metavar='file.fastq[.bz2|.gz]',
                        help='reads to filter in fastq format')
    parser.add_argument('-o',
                        '--outfile',
                        default=sys.stdout,
                        type=Opener('w'),
                        help='output fastq')
    parser.add_argument(
        '--snifflimit',
        type=int,
        default=10000,
        metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument('--head',
                        type=int,
                        metavar='N',
                        help='limit the output file to N records')
    parser.add_argument(
        '--invert',
        action='store_true',
        default=False,
        help='include only sequences failing filtering criteria')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        default=False,
                        help='minimize messages to stderr')
    parser.add_argument('-V',
                        '--version',
                        action=VersionAction,
                        version=__version__,
                        help='Print the version number and exit')

    match_options = parser.add_argument_group('Barcode matching options')

    match_options.add_argument(
        '--match-filter',
        action='store_true',
        default=False,
        help=('filter reads based on exact match to most common barcode '
              '[default: no match filter]'))
    match_options.add_argument(
        '--min-pct-assignment',
        type=float,
        default=90.0,
        metavar='PERCENT',
        help=("""warn (or fail with an error; see --strict) if the
               most common barcode represents less than PERCENT of the
               total [%(default)s]"""))
    match_options.add_argument(
        '--strict',
        action='store_true',
        default=False,
        help=("""fail if conditions of --min-pct-assignment are not met"""))
    match_options.add_argument('-c',
                               '--show-counts',
                               action='store_true',
                               default=False,
                               help='tabulate barcode counts and exit')

    qual_options = parser.add_argument_group(
        'Barcode quality filtering options')

    qual_options.add_argument(
        '--qual-filter',
        action='store_true',
        default=False,
        help=
        'filter reads based on minimum index quality [default: no quality filter]'
    )
    qual_options.add_argument(
        '-p',
        '--min-qual',
        type=int,
        default=MIN_QUAL,
        help="""reject seqs with mean barcode quality score less than
        this value; for dual index, both barcodes must meet the
        threshold [%(default)s]""")
    qual_options.add_argument('--encoding',
                              default='phred',
                              choices=['phred'],
                              help="""quality score encoding; see
             https://en.wikipedia.org/wiki/FASTQ_format [%(default)s]""")

    args = parser.parse_args(arguments)

    logging.basicConfig(format='%(message)s',
                        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    # when provided with dual barcodes, concatenate into a single
    # namedtuple with attributes qual and qual1; generate a filter
    # function appropriate for either case.
    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
        qual_filter = get_qual_filter(args.min_qual, args.encoding)
    elif len(args.index) == 2:
        qual_filter = get_qual_filter(args.min_qual,
                                      args.encoding,
                                      paired=True)
        bcseqs = combine_dual_indices(*args.index)
    else:
        # nargs='+' admits any number of files; stop here, otherwise
        # ``bcseqs`` and ``qual_filter`` are never bound and the tee()
        # below fails with UnboundLocalError.
        log.error('error: please specify either one or two index files')
        sys.exit(1)

    # use bc1 to determine most common barcode; bc2 is paired with the
    # reads further down
    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode
    barcode_counts = Counter(
        [str(seq.seq) for seq in islice(bc1, args.snifflimit)])
    ranked = barcode_counts.most_common()
    if not ranked:
        # Unpacking zip(*[]) would raise ValueError on empty input.
        log.error('error: no barcodes found in the index file(s)')
        sys.exit(1)
    barcodes, counts = list(zip(*ranked))

    most_common_bc = barcodes[0]
    total = sum(counts)
    most_common_pct = 100 * float(counts[0]) / total
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], total, most_common_pct))

    if args.show_counts:
        for bc, count in ranked:
            print(('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc),
                                       count)))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    # --invert keeps the pairs the selected filters would have rejected.
    ifilterfun = filterfalse if args.invert else filter

    seqs = fastqlite(args.fastq)
    # Pair each read with its index record; zip_longest pads the shorter
    # input with None instead of silently truncating.
    filtered = zip_longest(seqs, bc2)

    if args.match_filter:
        filtered = ifilterfun(get_match_filter(most_common_bc), filtered)

    if args.qual_filter:
        filtered = ifilterfun(qual_filter, filtered)

    for seq, bc in islice(filtered, args.head):
        # Sanity check that the read and its index record are in step.
        # NOTE(review): this is skipped under ``python -O`` and raises
        # AttributeError rather than AssertionError when one side is the
        # None padding from zip_longest.
        assert seq.id == bc.id
        args.outfile.write(as_fastq(seq))