def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename', help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('--write-fp-rate', '-w', action='store_true',
                        help="Write false positive rate into .info file")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser

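# --- Editor's sketch (hypothetical, not part of the original script) ---
# A minimal driver showing how the parser above might be consumed. The
# function name `_example_main` and the messages are illustrative only; the
# actual graph-loading logic of the script is elided.
import sys


def _example_main():
    args = get_parser().parse_args()
    if args.no_build_tagset:
        print('skipping tagset construction', file=sys.stderr)
    for filename in args.input_filenames:
        print('would load {} into {}'.format(filename, args.output_filename),
              file=sys.stderr)
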
def get_parser(): epilog = """ The resulting partition maps are saved as '${basename}.subset.#.pmap' files. """ parser = argparse.ArgumentParser( description="Partition a sequence graph based upon waypoint " "connectivity", epilog=epilog, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('basename', help="basename of the input k-mer presence" " table + tagset files") parser.add_argument('--stoptags', '-S', metavar='filename', default='', help="Use stoptags in this file during partitioning") parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, type=float, help='Set subset size (usually 1e5-1e6 is ' 'good)') parser.add_argument('--no-big-traverse', action='store_true', default=False, help='Truncate graph joins at big ' 'traversals') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_threading_args(parser) return parser
def get_parser(): epilog = """ Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savetable', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer counting table to") parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") return parser
def get_parser(): epilog = """ Note: with :option:`-b` the output will be the exact size of the k-mer counting table and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load_into_counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('output_countingtable_filename', help="The name of the" " file to write the k-mer counting table to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") return parser
def get_parser(): epilog = """\ The resulting partition maps are saved as ``${basename}.subset.#.pmap`` files. """ parser = argparse.ArgumentParser( description="Partition a sequence graph based upon waypoint " "connectivity", epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('basename', help="basename of the input k-mer" "nodegraph + tagset files") parser.add_argument('--stoptags', '-S', metavar='filename', default='', help="Use stoptags in this file during partitioning") parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, type=float, help='Set subset size (usually 1e5-1e6 is ' 'good)') parser.add_argument('--no-big-traverse', action='store_true', default=False, help='Truncate graph joins at big ' 'traversals') parser.add_argument('--version', action=_VersionStdErrAction, version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_threading_args(parser) return parser
def get_parser(): epilog = """ Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savetable', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer counting table to") parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser():
    epilog = '''
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load-into-counting.py` and
    :program:`abundance-dist.py`.
    '''
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
        "single sequence file.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('input_sequence_filename', help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename', help='The name of the '
                        'output histogram file. The columns are: (1) k-mer '
                        'abundance, (2) k-mer count, (3) cumulative count, '
                        '(4) fraction of total distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savetable', default='', metavar="filename",
                        help="Save the k-mer counting table to the specified "
                        "filename.")
    return parser

def get_parser(): epilog = """ Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog) ) add_threading_args(parser) parser.add_argument("--cutoff", "-C", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument( "--savetable", metavar="filename", default="", help="If present, the name of the file to save the " "k-mer counting table to", ) parser.add_argument("datafile", metavar="input_sequence_filename", help="FAST[AQ] sequence file to trim") return parser
def get_parser(): epilog = """ Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt for each input sequence file. If the input sequences are from RNAseq or metagenome sequencing then :option:`--variable-coverage` should be used. Example:: load-into-counting.py -k 20 -x 5e7 table.ct data/100k-filtered.fa filter-abund.py -C 2 table.ct data/100k-filtered.fa """ parser = argparse.ArgumentParser( description='Trim sequences at a minimum k-mer abundance.', epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('input_table', metavar='input_counting_table_filename', help='The input k-mer counting table filename') parser.add_argument('input_filename', metavar='input_sequence_filename', help='Input FAST[AQ] sequence filename', nargs='+') add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--variable-coverage', '-V', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='Base the variable-coverage cutoff on this median' ' k-mer abundance.', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('-o', '--out', dest='single_output_filename', default='', metavar="optional_output_filename", help='Output the trimmed sequences into a single file ' 'with the given filename instead of creating a new ' 'file for each input file.') parser.add_argument('--version', action='version', version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser(): epilog = """\ The resulting partition maps are saved as ``${basename}.subset.#.pmap`` files. """ parser = argparse.ArgumentParser( description="Partition a sequence graph based upon waypoint " "connectivity", epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('basename', help="basename of the input k-mer " "nodegraph + tagset files") parser.add_argument('--stoptags', '-S', metavar='filename', default='', help="Use stoptags in this file during partitioning") parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, type=float, help='Set subset size (usually 1e5-1e6 is ' 'good)') parser.add_argument('--no-big-traverse', action='store_true', default=False, help='Truncate graph joins at big ' 'traversals') parser.add_argument('--version', action=_VersionStdErrAction, version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_threading_args(parser) return parser
def get_parser(): epilog = """ The resulting partition maps are saved as '${basename}.subset.#.pmap' files. """ parser = argparse.ArgumentParser( description="Partition a sequence graph based upon waypoint " "connectivity", epilog=epilog, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('basename', help="basename of the input k-mer presence" " table + tagset files") parser.add_argument('--stoptags', '-S', metavar='filename', default='', help="Use stoptags in this file during partitioning") parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, type=float, help='Set subset size (usually 1e5-1e6 is ' 'good)') parser.add_argument('--no-big-traverse', action='store_true', default=False, help='Truncate graph joins at big ' 'traversals') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) add_threading_args(parser) return parser
def get_parser(): epilog = """ Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. This script combines the functionality of :program:`load-graph.py`, :program:`partition-graph.py`, :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into one script. This is convenient but should probably not be used for large data sets, because :program:`do-partition.py` doesn't provide save/resume functionality. """ parser = build_hashbits_args( descr='Load, partition, and annotate FAST[AQ] sequences', epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, dest='subset_size', type=float, help='Set subset size (usually 1e5-1e6 is good)') parser.add_argument('--no-big-traverse', dest='no_big_traverse', action='store_true', default=False, help='Truncate graph joins at big traversals') parser.add_argument('--keep-subsets', dest='remove_subsets', default=True, action='store_false', help='Keep individual subsets (default: False)') parser.add_argument('graphbase', help="base name for output files") parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequence filenames') parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser(): epilog = """\ Note: with :option:`-b`/:option:`--no-bigcount` the output will be the exact size of the k-mer countgraph and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa """ parser = build_counting_args( "Build a k-mer countgraph from the given" " sequences.", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('output_countgraph_filename', help="The name of the" " file to write the k-mer countgraph to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help="The default behaviour is " "to count past 255 using bigcount. This flag turns " "bigcount off, limiting counts to 255.") parser.add_argument('--summary-info', '-s', type=str, default=None, metavar="FORMAT", choices=[str('json'), str('tsv')], help="What format should the machine readable run " "summary be in? (`json` or `tsv`, disabled by" " default)") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') return parser
def get_parser():
    parser = build_nodegraph_args(descr="Load sequences into the compressible "
                                  "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    return parser

def get_parser():
    epilog = '''\
    Note that with :option:`-b`/:option:`--no-bigcount` this script is
    constant memory; in exchange, k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product of
    the :option:`-x` and :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load-into-counting.py` and
    :program:`abundance-dist.py`.

    Example::

        abundance-dist-single.py -x 1e7 -N 2 -k 17 \\
                tests/test-data/test-abund-read-2.fa test-dist
    '''
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
        "single sequence file.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('input_sequence_filename', help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename', help='The name of the '
                        'output histogram file. The columns are: (1) k-mer '
                        'abundance, (2) k-mer count, (3) cumulative count, '
                        '(4) fraction of total distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output zero-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savegraph', default='', metavar="filename",
                        help="Save the k-mer countgraph to the specified "
                        "filename.")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser

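# --- Editor's sketch (hypothetical) ---
# Consuming the histogram written by this script, using the four columns
# documented in the help text above; the path 'test-dist' matches the
# epilog example and is illustrative only.
def _read_histogram(path='test-dist'):
    with open(path) as fh:
        for line in fh:
            abundance, count, cumulative, fraction = line.split()[:4]
            yield (int(abundance), int(count), int(cumulative),
                   float(fraction))
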
def get_parser(): epilog = """ Note: with :option:`-b` the output will be the exact size of the k-mer counting table and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa """ parser = build_counting_args( "Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('output_countingtable_filename', help="The name of the" " file to write the k-mer counting table to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT", choices=['json', 'tsv'], help="What format should the machine readable run " "summary be in? (json or tsv, disabled by default)") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser(): epilog = """ Note that with :option:`-b` this script is constant memory; in exchange, k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. To count k-mers in multiple files use :program:`load_into_counting.py` and :program:`abundance_dist.py`. """ parser = build_counting_args( descr="Calculate the abundance distribution of k-mers from a " "single sequence file.", epilog=textwrap.dedent(epilog), ) add_threading_args(parser) parser.add_argument("input_sequence_filename", help="The name of the input" " FAST[AQ] sequence file.") parser.add_argument( "output_histogram_filename", help="The name of the " "output histogram file. The columns are: (1) k-mer " "abundance, (2) k-mer count, (3) cumulative count, " "(4) fraction of total distinct k-mers.", ) parser.add_argument( "-z", "--no-zero", dest="output_zero", default=True, action="store_false", help="Do not output 0-count bins" ) parser.add_argument( "-b", "--no-bigcount", dest="bigcount", default=True, action="store_false", help="Do not count k-mers past 255" ) parser.add_argument( "-s", "--squash", dest="squash_output", default=False, action="store_true", help="Overwrite output file if it exists", ) parser.add_argument( "--csv", default=False, action="store_true", help="Use the CSV format for the histogram. " "Includes column headers.", ) parser.add_argument( "--savetable", default="", metavar="filename", help="Save the k-mer counting table to the specified " "filename.", ) parser.add_argument( "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr" ) parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists") return parser
def get_parser(): epilog = """ Note: with :option:`-b` the output will be the exact size of the k-mer counting table and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa """ parser = build_counting_args( "Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog) ) add_threading_args(parser) parser.add_argument( "output_countingtable_filename", help="The name of the" " file to write the k-mer counting table to." ) parser.add_argument( "input_sequence_filename", nargs="+", help="The names of one or more FAST[AQ] input " "sequence files." ) parser.add_argument( "-b", "--no-bigcount", dest="bigcount", default=True, action="store_false", help="The default behaviour is " "to count past 255 using bigcount. This flag turns " "bigcount off, limiting counts to 255.", ) parser.add_argument( "--summary-info", "-s", type=str, default=None, metavar="FORMAT", choices=[str("json"), str("tsv")], help="What format should the machine readable run " "summary be in? (json or tsv, disabled by default)", ) parser.add_argument( "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr" ) parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists") return parser
def build_parser(parser):
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_nodegraph_filename', help='output'
                        ' k-mer nodegraph filename.')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser

def get_parser():
    parser = build_counting_args(
        descr="Output abundances of the k-mers in the sequence file.")
    add_threading_args(parser)
    parser.add_argument('input_sequence_filename', help='The input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('-o', '--out', metavar="output_file",
                        dest='output_file', type=argparse.FileType('w'),
                        default=None, help='output counts to this file')
    return parser

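# --- Editor's note (illustrative) ---
# Because -o/--out above uses argparse.FileType('w'), argparse opens the
# file for writing during parse_args(); the script can then write counts
# directly to args.output_file, or fall back to stdout when the option is
# omitted. A tiny self-contained demonstration (file name hypothetical):
import argparse
import sys

_demo = argparse.ArgumentParser()
_demo.add_argument('-o', '--out', type=argparse.FileType('w'), default=None)
_args = _demo.parse_args(['-o', 'counts.tsv'])  # hypothetical invocation
out_fh = _args.out if _args.out is not None else sys.stdout
print('kmer\tabundance', file=out_fh)
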
def get_parser(): epilog = """\ Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. This script combines the functionality of :program:`load-graph.py`, :program:`partition-graph.py`, :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into one script. This is convenient but should probably not be used for large data sets, because :program:`do-partition.py` doesn't provide save/resume functionality. Example:: do-partition.py -k 20 example tests/test-data/random-20-a.fa """ parser = build_nodegraph_args( descr="Load, partition, and annotate FAST[AQ] sequences", epilog=textwrap.dedent(epilog) ) add_threading_args(parser) parser.add_argument( "--subset-size", "-s", default=DEFAULT_SUBSET_SIZE, dest="subset_size", type=float, help="Set subset size (usually 1e5-1e6 is good)", ) parser.add_argument( "--no-big-traverse", dest="no_big_traverse", action="store_true", default=False, help="Truncate graph joins at big traversals", ) parser.add_argument( "--keep-subsets", dest="remove_subsets", default=True, action="store_false", help="Keep individual subsets (default: False)", ) parser.add_argument("graphbase", help="base name for output files") parser.add_argument( "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filenames" ) parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists") return parser
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog), citations=['counting', 'SeqAn']) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=check_argument_range(0, 256, "cutoff"), help="Trim at k-mers below this abundance.") parser.add_argument('--variable-coverage', '-V', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='Base the variable-coverage cutoff on this median' ' k-mer abundance.', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('-o', '--outfile', metavar='optional_output_filename', default=None, help='Override default output filename ' 'and output trimmed sequences into a file with the ' 'given filename.') parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog), citations=['counting', 'SeqAn']) add_threading_args(parser) parser.add_argument('-C', '--cutoff', default=DEFAULT_CUTOFF, type=check_argument_range(0, 256, "cutoff"), help="Trim at k-mers below this abundance.") parser.add_argument('-V', '--variable-coverage', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('-Z', '--normalize-to', type=int, dest='normalize_to', help='Base the variable-coverage cutoff on this median' ' k-mer abundance.', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('-o', '--outfile', metavar='optional_output_filename', default=None, help='Override default output filename ' 'and output trimmed sequences into a file with the ' 'given filename.') parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt`` for each input sequence file. If the input sequences are from RNAseq or metagenome sequencing then :option:`--variable-coverage` should be used. Example:: load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa filter-abund.py -C 2 countgraph data/100k-filtered.fa """ parser = KhmerArgumentParser( description='Trim sequences at a minimum k-mer abundance.', epilog=textwrap.dedent(epilog), citations=['counting']) parser.add_argument('input_graph', metavar='input_count_graph_filename', help='The input k-mer countgraph filename') parser.add_argument('input_filename', metavar='input_sequence_filename', help='Input FAST[AQ] sequence filename', nargs='+') add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=check_argument_range(0, 256, 'cutoff'), help="Trim at k-mers below this abundance.") parser.add_argument('--variable-coverage', '-V', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='Base the variable-coverage cutoff on this median' ' k-mer abundance.', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('-o', '--output', dest='single_output_file', type=khFileType('wb'), metavar="optional_output_filename", help='Output the trimmed sequences into a single file ' 'with the given filename instead of creating a new ' 'file for each input file.') parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def build_parser(parser):
    add_threading_args(parser)
    parser.add_argument(
        "--no-build-tagset", "-n", default=False, action="store_true",
        dest="no_build_tagset",
        help="Do NOT construct tagset while loading sequences",
    )
    parser.add_argument(
        "output_filename", metavar="output_nodegraph_filename",
        help="output k-mer nodegraph filename."
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+",
        help="input FAST[AQ] sequence filename"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help="Overwrite output file if it exists")
    return parser

def get_parser(): epilog = """\ Note: with :option:`-b`/:option:`--no-bigcount` the output will be the exact size of the k-mer countgraph and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer countgraph from the given" " sequences.", epilog=textwrap.dedent(epilog), citations=['counting', 'SeqAn']) add_threading_args(parser) parser.add_argument('output_countgraph_filename', help="The name of the" " file to write the k-mer countgraph to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help="The default behaviour is " "to count past 255 using bigcount. This flag turns " "bigcount off, limiting counts to 255.") parser.add_argument('-s', '--summary-info', type=str, default=None, metavar="FORMAT", choices=[str('json'), str('tsv')], help="What format should the machine readable run " "summary be in? (`json` or `tsv`, disabled by" " default)") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') return parser
def get_parser(): epilog = """ Note: with :option:`-b` the output will be the exact size of the k-mer counting table and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('output_countingtable_filename', help="The name of the" " file to write the k-mer counting table to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT", choices=['json', 'tsv'], help="What format should the machine readable run " "summary be in? (json or tsv, disabled by default)") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_output_compression_type(parser) return parser
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument(
        "--no-build-tagset", "-n", default=False, action="store_true",
        dest="no_build_tagset",
        help="Do NOT construct tagset while loading sequences",
    )
    parser.add_argument(
        "output_filename", metavar="output_presence_table_filename",
        help="output k-mer presence table filename."
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+",
        help="input FAST[AQ] sequence filename"
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true",
        help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("--write-fp-rate", "-w", action="store_true",
                        help="Write false positive rate into .info file")
    return parser

def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('-o', '--outfile', metavar='optional_output_filename', default=None, help='Override default output filename ' 'and output trimmed sequences into a file with the ' 'given filename.') parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def main():
    parser = build_nodegraph_args('find uniq kmer in query compared to refs')
    add_threading_args(parser)
    parser.add_argument('query', help=('fasta readfile to query against '
                                       'hashtable, use "-" if from stdin'))
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')
    parser.add_argument('--bfout', help='output bloom filter of ref')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--shared', dest='output', action='store_const',
                       const='shared', help='output shared kmers')
    group.add_argument('--uniq', dest='output', action='store_const',
                       const='uniq', help='output uniq kmers in query')

    group2 = parser.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--ref', nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    group2.add_argument('--load', help='load existing bloom filter')

    parser.set_defaults(output='uniq')
    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    output = args.output

    start_time = time.time()

    if args.load:
        # load from existing bloom filter
        ht = khmer.Nodetable.load(args.load)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'load bloom filter ({}) took {:.2f} hours..'
        print(mes.format(os.path.basename(args.load), secs / 3600.0),
              file=sys.stderr)
    else:
        # create a hashbits data structure and load the refs into it
        refs = args.ref
        print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
        if query == '-' and refs == ['-']:
            print('*** query and ref can not both be "-" (read from stdin)',
                  file=sys.stderr)
            sys.exit(1)  # both inputs cannot come from stdin; bail out
        ht = khmer.Nodetable(K, HT_SIZE, N_HT)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'initiation of bloom filter took {:.2f} hours..'
        print(mes.format(secs / 3600.0), file=sys.stderr)

        for index, filename in enumerate(refs):
            if index != 0 and index % 100 == 0:
                end_time = time.time()
                secs = end_time - start_time
                mes = '{} refs have been loaded within {:.2f} hours..'
                print(mes.format(index, secs / 3600.0), file=sys.stderr)
            try:
                rparser = khmer.ReadParser(filename)
            except OSError as e:
                mes = ('*** Skipping due to OSError (machine or system '
                       'problem): {}\n'
                       '*** Detailed error message:\n'
                       '*** {}')
                print(mes.format(os.path.basename(filename), str(e)),
                      file=sys.stderr)
                continue

            # consume each ref file with a pool of worker threads
            threads = []
            for _ in range(args.threads):
                cur_thrd = threading.Thread(
                    target=ht.consume_seqfile_with_reads_parser,
                    args=(rparser,))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()

        if args.bfout:
            ht.save(args.bfout)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)  # report the error before exiting
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    # second nodetable tracks k-mers already seen in the query (dedup)
    ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)
    n_unique2 = 0
    n_shared = 0

    if output == 'uniq':
        for n, record in enumerate(khmer.ReadParser(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if not ht2.get(kmer):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    else:
                        mes = '>{}__{} {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                    ht2.count(kmer)  # mark this k-mer as seen in the query

    elif output == 'shared':
        for n, record in enumerate(khmer.ReadParser(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if not ht2.get(kmer):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                        mes = '>{}__{} {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                    ht2.count(kmer)  # mark this k-mer as seen in the query

    mes = ('Unique kmer in {} (query):\t{}\n'
           'Shared kmer:\t{}\n'
           'Unique kmer in {}:\t{}\n')
    print(mes.format(os.path.basename(query), n_unique2, n_shared,
                     'refs', n_unique1), file=sys.stderr)

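# --- Editor's usage note (hypothetical script and file names) ---
# Example invocation matching the parser built in main() above: load two
# reference FASTAs into the Nodetable, stream query reads from stdin (the
# positional `-`), and write k-mers unique to the query to stdout:
#
#   cat query.fa | python find-uniq-kmers.py --uniq \
#       --ref ref1.fa ref2.fa --bfout refs.bf - > query-uniq-kmers.fa
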
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog), citations=["counting", "SeqAn"], ) add_threading_args(parser) parser.add_argument( "--cutoff", "-C", default=DEFAULT_CUTOFF, type=check_argument_range(0, 256, "cutoff"), help="Trim at k-mers below this abundance.", ) parser.add_argument( "--variable-coverage", "-V", action="store_true", dest="variable_coverage", default=False, help="Only trim low-abundance k-mers from sequences " "that have high coverage.", ) parser.add_argument( "--normalize-to", "-Z", type=int, dest="normalize_to", help="Base the variable-coverage cutoff on this median" " k-mer abundance.", default=DEFAULT_NORMALIZE_LIMIT, ) parser.add_argument( "--savegraph", metavar="filename", default="", help="If present, the name of the file to save the " "k-mer countgraph to", ) parser.add_argument( "-o", "--outfile", metavar="optional_output_filename", default=None, help="Override default output filename " "and output trimmed sequences into a file with the " "given filename.", ) parser.add_argument("datafile", metavar="input_sequence_filename", help="FAST[AQ] sequence file to trim") parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists") parser.add_argument("-q", "--quiet", dest="quiet", default=False, action="store_true") add_output_compression_type(parser) return parser