def build_parser(parser):
    """Register this subcommand's command-line arguments on *parser*.

    Adds the positional ``infile``, whatever options
    ``taxtastic.utils.add_database_args`` contributes (presumably the
    shared database connection options -- confirm against that helper),
    and the options ``-o/--outfile``, ``--taxid-column``, ``--unknowns``
    and ``-a/--unknown-action``.

    The function mutates *parser* and returns nothing.
    """
    # Mode is plain 'r' rather than the legacy 'rU': the 'U' flag has no
    # effect in Python 3 text mode (universal newlines is the default) and
    # the built-in open() rejects it from Python 3.11 on.
    # NOTE(review): assumes Opener forwards its mode string to open().
    parser.add_argument(
        'infile',
        type=Opener('r'),
        help=('Input CSV file to process, minimally containing the field '
              '`tax_id`. Use "-" for stdin.'))

    # add_database_args returns the parser to keep using from here on.
    parser = taxtastic.utils.add_database_args(parser)

    parser.add_argument(
        '-o', '--outfile',
        default=sys.stdout,
        type=Opener('wt'),
        help='Modified version of input file [default: stdout]')
    parser.add_argument(
        '--taxid-column',
        default='tax_id',
        help='name of column containing tax_ids to be replaced [%(default)s]')
    parser.add_argument(
        '--unknowns',
        type=Opener('wt'),
        help=('optional output file containing rows with unknown tax_ids '
              'having no replacements in merged table'))
    parser.add_argument(
        '-a', '--unknown-action',
        choices=['drop', 'ignore', 'error'],
        default='error',
        help=('action to perform for tax_ids with no replacement '
              'in merged table [%(default)s]'))
def build_parser(parser):
    """Attach this subcommand's arguments to *parser*.

    The options contributed by ``add_database_args`` are registered first,
    followed by the positional ``source_name`` and the ``-o/--outfile``
    option, which defaults to standard output.
    """
    parser = add_database_args(parser)

    source_help = 'name of source identifying names and nodes to extract'
    parser.add_argument('source_name', help=source_help)

    parser.add_argument(
        '-o', '--outfile',
        default=sys.stdout,
        type=Opener('w'),
    )
def main(arguments):
    """Merge two Stockholm alignments and split the result into two FASTA files.

    The external program named by ``--binary`` (default ``esl-alimerge``)
    is run on the two input alignments and writes aligned FASTA to a
    temporary file in the current directory. Each merged sequence whose id
    appears in the first input is written to ``fasta_1``; every other
    sequence is written to ``fasta_2``.

    Args:
        arguments: list of command-line argument strings to parse.

    Raises:
        subprocess.CalledProcessError: if the external program exits with
            a non-zero status.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--binary', default='esl-alimerge')

    inputs = parser.add_argument_group('input files')
    inputs.add_argument('sto_1', type=Opener('r'))
    inputs.add_argument('sto_2', type=Opener('r'))

    outputs = parser.add_argument_group('output files')
    outputs.add_argument('fasta_1', type=Opener('w'))
    outputs.add_argument('fasta_2', type=Opener('w'))

    args = parser.parse_args(arguments)

    # First whitespace-delimited token of every non-blank line that does
    # not start with '#' in the first alignment.
    input1_names = {
        line.split()[0]
        for line in args.sto_1
        if line.strip() and not line.startswith('#')
    }

    # The temporary file must stay open (and therefore on disk) while the
    # merged sequences are read back, so the loop lives inside the `with`.
    with tempfile.NamedTemporaryFile('w+t', dir='.') as tf:
        cmd = [
            args.binary, '--dna', '--outformat', 'afa', '-o', tf.name,
            args.sto_1.name, args.sto_2.name
        ]
        sys.stderr.write(' '.join(cmd) + '\n')

        # Fail loudly on a non-zero exit status instead of silently
        # producing empty or partial output files.
        subprocess.check_call(cmd)

        for seq in fastalite(tf):
            destination = (
                args.fasta_1 if seq.id in input1_names else args.fasta_2)
            destination.write(reformat(seq))
def build_parser(parser):
    """Attach this subcommand's arguments to *parser*.

    After the options contributed by ``add_database_args``, registers the
    positional ``new_nodes`` input file and the optional ``--source-name``
    default.
    """
    parser = add_database_args(parser)

    parser.add_argument(
        'new_nodes',
        metavar='FILE',
        type=Opener('r'),
        help='yaml file specifying new nodes',
    )

    source_name_help = (
        'Provides the default source name for new nodes. The value is '
        'overridden by "source_name" in the input file. If not provided, '
        '"source_name" is required in each node or name definition. This '
        'source name is created if it does not exist.'
    )
    parser.add_argument(
        '--source-name',
        dest='source_name',
        help=source_name_help,
    )
def main(arguments):
    """Count the records of a fastq file and write the tally as one CSV row.

    The row holds the name of the input file followed by the record count
    twice.

    Args:
        arguments: list of command-line argument strings to parse.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument(
        'fastq',
        type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='reads to count in fastq format',
    )
    parser.add_argument(
        'read_counts',
        type=argparse.FileType('w'),
        metavar='FILE',
        help='tabulate read counts and store as a CSV',
    )
    args = parser.parse_args(arguments)

    # Stream through the records; only the tally is kept.
    n_reads = 0
    for _ in fastqlite(args.fastq):
        n_reads += 1

    row = [args.fastq.name, n_reads, n_reads]
    csv.writer(args.read_counts).writerow(row)
def main(arguments=None):
    """Command-line entry point: filter reads by the most common barcode.

    Up to ``--snifflimit`` index reads are tallied to find the most common
    barcode. With ``-c/--show-counts`` the tally is printed and the
    function returns. Otherwise the reads given with ``-f/--fastq`` are
    filtered against that barcode and written to ``-o/--outfile``.

    Args:
        arguments: list of command-line argument strings; ``None`` lets
            argparse read ``sys.argv``.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when the number of index files is not
            one or two, when no index reads are found, when ``--strict``
            is set and the most common barcode is below
            ``--min-pct-assignment``, or when ``-f/--fastq`` is missing.
    """
    parser = argparse.ArgumentParser(
        prog='barcodecop', description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        'index', nargs='+', type=Opener(), metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument(
        '-f', '--fastq', type=Opener(), metavar='file.fastq[.bz2|.gz]',
        help='reads to filter in fastq format')
    parser.add_argument(
        '-o', '--outfile', default=sys.stdout, type=Opener('w'),
        help='output fastq')
    parser.add_argument(
        '--snifflimit', type=int, default=10000, metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument(
        '--head', type=int, metavar='N',
        help='limit the output file to N records')
    parser.add_argument(
        '--min-pct-assignment', type=float, default=90.0, metavar='PERCENT',
        help=('warn (or fail with an error; see --strict) if the most '
              'common barcode represents less than PERCENT of the total '
              '[%(default)s]'))
    parser.add_argument(
        '--strict', action='store_true', default=False,
        help='fail if conditions of --min-pct-assignment are not met')
    parser.add_argument(
        '--invert', action='store_true', default=False,
        help='include only sequences *not* matching the most common barcode')
    parser.add_argument(
        '-c', '--show-counts', action='store_true', default=False,
        help='tabulate barcode counts and exit')
    parser.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='minimize messages to stderr')
    parser.add_argument(
        '-V', '--version', action=VersionAction, version=__version__,
        help='Print the version number and exit')

    args = parser.parse_args(arguments)

    logging.basicConfig(
        format='%(message)s',
        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
    elif len(args.index) == 2:
        bcseqs = combine_dual_indices(*args.index)
    else:
        log.error('error: please specify either one or two index files')
        # Stop here: bcseqs is unbound on this path, so carrying on would
        # only fail later with a NameError.
        sys.exit(1)

    # bc1 is consumed to find the most common barcode; bc2 is kept for
    # the filtering pass.
    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode
    barcode_counts = Counter(
        str(seq.seq) for seq in islice(bc1, args.snifflimit))
    if not barcode_counts:
        # Without this guard, unpacking an empty tally raises ValueError.
        log.error('error: no index reads found')
        sys.exit(1)

    barcodes, counts = zip(*barcode_counts.most_common())
    most_common_bc = barcodes[0]
    total = sum(counts)
    most_common_pct = 100 * float(counts[0]) / total
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], total, most_common_pct))

    if args.show_counts:
        for bc, count in barcode_counts.most_common():
            print('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc), count))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    seqs = fastqlite(args.fastq)
    # NOTE(review): `filter` is called with four arguments, so it must be
    # a function defined elsewhere in this module, not the builtin.
    # islice with a stop of None (no --head) imposes no limit.
    filtered = islice(
        filter(bc2, seqs, most_common_bc, args.invert), args.head)

    for seq in filtered:
        args.outfile.write(as_fastq(seq))
def main(arguments=None):
    """Command-line entry point: filter reads by barcode match and/or quality.

    Up to ``--snifflimit`` index reads are tallied to find the most common
    barcode. With ``-c/--show-counts`` the tally is printed and the
    function returns. Otherwise the reads given with ``-f/--fastq`` are
    paired with their index reads, passed through the filters enabled by
    ``--match-filter`` and ``--qual-filter`` (inverted by ``--invert``),
    and written to ``-o/--outfile``.

    Args:
        arguments: list of command-line argument strings; ``None`` lets
            argparse read ``sys.argv``.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when the number of index files is not
            one or two, when no index reads are found, when ``--strict``
            is set and the most common barcode is below
            ``--min-pct-assignment``, or when ``-f/--fastq`` is missing.
        AssertionError: when a read and its paired index read carry
            different ids.
    """
    parser = argparse.ArgumentParser(
        prog='barcodecop', description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        'index', nargs='+', type=Opener(), metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument(
        '-f', '--fastq', type=Opener(), metavar='file.fastq[.bz2|.gz]',
        help='reads to filter in fastq format')
    parser.add_argument(
        '-o', '--outfile', default=sys.stdout, type=Opener('w'),
        help='output fastq')
    parser.add_argument(
        '--snifflimit', type=int, default=10000, metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument(
        '--head', type=int, metavar='N',
        help='limit the output file to N records')
    parser.add_argument(
        '--invert', action='store_true', default=False,
        help='include only sequences failing filtering criteria')
    parser.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='minimize messages to stderr')
    parser.add_argument(
        '-V', '--version', action=VersionAction, version=__version__,
        help='Print the version number and exit')

    match_options = parser.add_argument_group('Barcode matching options')
    match_options.add_argument(
        '--match-filter', action='store_true', default=False,
        help=('filter reads based on exact match to most common barcode '
              '[default: no match filter]'))
    match_options.add_argument(
        '--min-pct-assignment', type=float, default=90.0, metavar='PERCENT',
        help=('warn (or fail with an error; see --strict) if the most '
              'common barcode represents less than PERCENT of the total '
              '[%(default)s]'))
    match_options.add_argument(
        '--strict', action='store_true', default=False,
        help='fail if conditions of --min-pct-assignment are not met')
    match_options.add_argument(
        '-c', '--show-counts', action='store_true', default=False,
        help='tabulate barcode counts and exit')

    qual_options = parser.add_argument_group(
        'Barcode quality filtering options')
    qual_options.add_argument(
        '--qual-filter', action='store_true', default=False,
        help=('filter reads based on minimum index quality '
              '[default: no quality filter]'))
    qual_options.add_argument(
        '-p', '--min-qual', type=int, default=MIN_QUAL,
        help=('reject seqs with mean barcode quality score less than this '
              'value; for dual index, both barcodes must meet the '
              'threshold [%(default)s]'))
    qual_options.add_argument(
        '--encoding', default='phred', choices=['phred'],
        help=('quality score encoding; see '
              'https://en.wikipedia.org/wiki/FASTQ_format [%(default)s]'))

    args = parser.parse_args(arguments)

    logging.basicConfig(
        format='%(message)s',
        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    # when provided with dual barcodes, concatenate into a single
    # namedtuple with attributes qual and qual1; generate a filter
    # function appropriate for either case.
    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
        qual_filter = get_qual_filter(args.min_qual, args.encoding)
    elif len(args.index) == 2:
        qual_filter = get_qual_filter(
            args.min_qual, args.encoding, paired=True)
        bcseqs = combine_dual_indices(*args.index)
    else:
        log.error('error: please specify either one or two index files')
        # Stop here: bcseqs and qual_filter are unbound on this path, so
        # carrying on would only fail later with a NameError.
        sys.exit(1)

    # use bc1 to determine most common barcode; bc2 is kept for pairing
    # with the reads during filtering
    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode
    barcode_counts = Counter(
        str(seq.seq) for seq in islice(bc1, args.snifflimit))
    if not barcode_counts:
        # Without this guard, unpacking an empty tally raises ValueError.
        log.error('error: no index reads found')
        sys.exit(1)

    barcodes, counts = zip(*barcode_counts.most_common())
    most_common_bc = barcodes[0]
    total = sum(counts)
    most_common_pct = 100 * float(counts[0]) / total
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], total, most_common_pct))

    if args.show_counts:
        for bc, count in barcode_counts.most_common():
            print('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc), count))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    # --invert swaps each enabled filter for its complement.
    ifilterfun = filterfalse if args.invert else filter

    seqs = fastqlite(args.fastq)
    # Each item is a (read, index read) pair; zip_longest pads the shorter
    # input with None.
    filtered = zip_longest(seqs, bc2)

    if args.match_filter:
        filtered = ifilterfun(get_match_filter(most_common_bc), filtered)

    if args.qual_filter:
        filtered = ifilterfun(qual_filter, filtered)

    # islice with a stop of None (no --head) imposes no limit.
    for seq, bc in islice(filtered, args.head):
        # NOTE(review): this pairing check is skipped when Python runs
        # with -O, and a None pad from zip_longest surfaces here as an
        # AttributeError rather than an AssertionError.
        assert seq.id == bc.id
        args.outfile.write(as_fastq(seq))