Example #1
def get_parser():
    """
    returns the parser object for the oxli subcommand handler
    """

    parser = argparse.ArgumentParser(
        description='Single entry point script for khmer',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparsers = parser.add_subparsers()

    # build-graph (formerly load-graph.py) parsers here
    parser_build_graph = \
        subparsers.add_parser('build-graph',
                              help="Load sequences into the compressible graph"
                              "format plus optional tagset",
                              description="Load sequences into the "
                              "compressible graph format plus optional tagset")

    khmer_args.build_nodegraph_args("Load sequences into the compressible"
                                    "graph format plus optional tagset.",
                                    None, parser=parser_build_graph)
    build_graph.build_parser(parser_build_graph)
    parser_build_graph.set_defaults(func=build_graph.main)

    return parser
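
For orientation, here is a minimal sketch (not part of the original example) of how a parser built this way is usually dispatched: parse_args() selects the subcommand, and the func default installed by set_defaults picks the handler. The entry-point name main_entry is hypothetical.

def main_entry():
    # Illustration only: dispatch pattern implied by set_defaults(func=...)
    parser = get_parser()
    args = parser.parse_args()
    if hasattr(args, 'func'):
        args.func(args)  # e.g. build_graph.main(args) for 'build-graph'
    else:
        parser.print_help()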
Example #2
def get_parser():
    parser = build_nodegraph_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument('-r',
                        '--traversal_range',
                        type=int,
                        dest='traversal_range',
                        default=DEFAULT_RANGE,
                        help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix',
                        dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir',
                        dest='outdir',
                        default='',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('--query',
                        dest='query',
                        nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db',
                        dest='db',
                        nargs='+',
                        help='Database reads for sweep',
                        required=True)

    return parser
Example #3
def get_parser():
    epilog = """\
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of
    :program:`load-graph.py`, :program:`partition-graph.py`,
    :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into
    one script. This is convenient but should probably not be used for large
    data sets, because :program:`do-partition.py` doesn't provide save/resume
    functionality.

    Example::

        do-partition.py -k 20 example tests/test-data/random-20-a.fa
    """
    parser = build_nodegraph_args(
        descr='Load, partition, and annotate FAST[AQ] sequences',
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase', help="base name for output files")
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filenames')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #4
def get_parser():
    parser = build_nodegraph_args(descr="Load sequences into the compressible "
                                  "graph format plus optional tagset.",
                                  citations=['graph', 'SeqAn'])

    parser = build_graph.build_parser(parser)
    return parser
Example #5
def main():

    parser = khmer_args.build_nodegraph_args()
    parser.add_argument("--samples", nargs="+")
    parser.add_argument("--save-prefix")
    parser.add_argument("--print-tree", action="store_true", default=False)
    args = parser.parse_args()

    if not args.save_prefix:
        print("No save prefix specified! Exiting...", file=sys.stderr)
        sys.exit(1)

    factory = NodegraphFactory(args)
    root = sbt.Node(factory)

    for sample_fn in args.samples:
        print "*** Build node for", sample_fn
        leaf = sbt.Leaf(os.path.basename(sample_fn), os.path.basename(sample_fn), factory.create_nodegraph())
        print "--- Consuming file..."
        leaf.graph.consume_fasta(sample_fn)
        print "--- Adding node to SBT..."
        root.add_node(leaf)
        print "--- Done with", sample_fn

    if args.print_tree:
        sbt.print_sbt(root)

    print "\n*** Saving to disk"
    fn = sbt.save_sbt(root, args.save_prefix)
    print "--- Save to", fn
Example #6
def get_parser():
    parser = build_nodegraph_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    return parser
Example #8
def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o',
                        '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a nodegraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_seqfile(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)
Example #9
def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a nodegraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_fasta(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)
Example #10
def get_parser():
    """Return the parser object for the oxli subcommand handler."""
    parser = argparse.ArgumentParser(
        description='Single entry point script for khmer',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparsers = parser.add_subparsers()

    # build-graph (formerly load-graph.py) parsers here
    parser_build_graph = \
        subparsers.add_parser(
            name='build-graph',
            help="Load sequences into the compressible graph format "
            "plus optional tagset")

    parser_build_graph = build_nodegraph_args(parser=parser_build_graph)
    build_graph.build_parser(parser_build_graph)
    parser_build_graph.set_defaults(func=build_graph.main)

    return parser
Example #11
def get_parser():
    parser = build_nodegraph_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument(
        '-r', '--traversal_range', type=int, dest='traversal_range',
        default=DEFAULT_RANGE, help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                        default=DEFAULT_MAX_READS,
                        help='Max total reads to buffer before flushing')
    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
                        default=DEFAULT_BUFFER_SIZE,
                        help='Max length of an individual label buffer \
                              before flushing')
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                        default=DEFAULT_NUM_BUFFERS,
                        help='Max individual label buffers before flushing')
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument('--label-by-pid', dest='label_by_pid',
                          action='store_true', help='separate reads by\
                        reference partition id')
    labeling.add_argument('--label-by-seq', dest='label_by_seq',
                          action='store_true', help='separate reads by\
                        reference sequence')
    labeling.add_argument('--label-by-group', dest='group_size', type=int,
                          help='separate reads by arbitrary sized groups\
                        of reference sequences')
    parser.add_argument(dest='input_fastp', help='Reference fasta or fastp')
    parser.add_argument('input_files', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #13
def get_parser():
    parser = build_nodegraph_args('Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to')
    parser.epilog = EPILOG
    parser.add_argument(
        '-r', '--traversal_range', type=int, dest='traversal_range',
        default=DEFAULT_RANGE, help='depth of breadth-first search to perform\
                                    from each read')
    parser.add_argument('--max_queue_size', type=int, default=1000)
    parser.add_argument('--prefix', dest='output_prefix',
                        default=DEFAULT_OUT_PREF,
                        help='Prefix for sorted read files')
    parser.add_argument('--outdir', dest='outdir', default='',
                        help='output directory; default is location of \
                              fastp file')
    parser.add_argument('--query', dest='query', nargs='+',
                        help='Reads to be swept and sorted')
    parser.add_argument('--db', dest='db', nargs='+',
                        help='Database reads for sweep', required=True)

    return parser
Example #14
def main():

    parser = khmer_args.build_nodegraph_args()
    parser.add_argument('--samples', nargs='+')
    parser.add_argument('--save-prefix')
    parser.add_argument('--print-tree', action='store_true', default=False)
    args = parser.parse_args()

    if not args.save_prefix:
        print('No save prefix specified! Exiting...', file=sys.stderr)
        sys.exit(1)

    factory = NodegraphFactory(args)
    tree = sbt.SBT(factory)

    for sample_fn in args.samples:
        print('*** Build node for', sample_fn)
        leaf = sbt.Leaf(os.path.basename(sample_fn),
                        factory.create_nodegraph())
        fname = os.path.join('.sbt.' + args.save_prefix,
                             ".".join([args.save_prefix,
                                       os.path.basename(sample_fn),
                                       'sbt']))
        if os.path.exists(fname):
            print('--- Loading existing file...')
            leaf.graph.load(fname)
        else:
            print('--- Consuming file...')
            leaf.graph.consume_fasta(sample_fn)
        print('--- Adding node to SBT...')
        tree.add_node(leaf)
        print('--- Done with', sample_fn)

    if args.print_tree:
        tree.print()

    print('\n*** Saving to disk')
    fn = tree.save(args.save_prefix)
    print('--- Save to', fn)
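
As a small aside (not in the original source), the per-leaf filename constructed above follows a '.sbt.<prefix>/<prefix>.<basename>.sbt' scheme; a self-contained sketch with hypothetical values:

import os

# assuming --save-prefix mydata and a sample file reads/sample1.fa
save_prefix = 'mydata'
sample_fn = 'reads/sample1.fa'
fname = os.path.join('.sbt.' + save_prefix,
                     ".".join([save_prefix, os.path.basename(sample_fn), 'sbt']))
print(fname)  # .sbt.mydata/mydata.sample1.fa.sbt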
Example #15
def main():
    parser = build_nodegraph_args()
    parser.add_argument('readfile1', help='fasta sequence file to be loaded in hashtable, use "-" if from stdin')
    parser.add_argument('readfile2', help='fasta readfile to query against hashtable, use "-" if from stdin')
    parser.add_argument('--shared', help='shared kmer in readfile 1 and 2')
    parser.add_argument('--uniq2', help='uniq kmer in readfile2')
    parser.add_argument('--x2', default='1e8', help='max_table size for readfile2')
    parser.add_argument('--N2', default='4', help='# of table (N) for readfile2')

    args = parser.parse_args()
    print(args)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    readfile1 = args.readfile1
    readfile2 = args.readfile2
    shared = args.shared
    uniq2 = args.uniq2
    if readfile1 == '-' and readfile2 == '-':
        mes = ('*** Only one of readfile1 and readfile2 '
               'can be read from stdin')
        print(mes, file=sys.stderr)

    try:
        if readfile1 == '-':
            fp1 = sys.stdin
        else:
            fp1 = open(readfile1)

        if readfile2 == '-':
            fp2 = sys.stdin
        else:
            fp2 = open(readfile2)

        if uniq2:
            fw2 = open(uniq2, 'w')

        # create a hashbits data structure
        ht = khmer.Nodetable(K, HT_SIZE, N_HT)

        # load contigs, connect into N partitions
        print('loading input reads from {}..'.format(os.path.basename(readfile1)),
              file=sys.stderr)
        #ht.consume_seqfile(readfile1)
        for record in fasta_iter(fp1):
            ht.consume(record.sequence)

        # Change 0.2 only if you really grok it.  HINT: You don't.
        fp_rate = khmer.calc_expected_collisions(ht)
        print('fp rate estimated to be {:1.3f}'.format(fp_rate), file=sys.stderr)

        if fp_rate > 0.01:
            mes = ('**\n'
                   '** ERROR: the counting hash is too small for\n'
                   '** {}.  Increase hashsize/num ht.\n'
                   '**\n'
                   '** Do not use these results!!')
            print(mes.format(os.path.basename(readfile1)), file=sys.stderr)
            sys.exit(-1)


        n_unique1 = ht.n_unique_kmers()
        # create a hashbits data structure
        ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

        n_unique2 = 0
        n_shared = 0

        for n, record in enumerate(fasta_iter(fp2)):
            name = record['name']
            sequence = record['sequence']
            seq_len = len(sequence)
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    elif uniq2:
                        # only write unique kmers when an output file was given
                        mes = '>{}__{}  length_{};k_{}\n{}\n'
                        fw2.write(mes.format(name, i, seq_len, K, kmer))
                ht2.count(kmer)

        mes = ('Unique kmer in {}:\t{}\n'
               'Shared kmer:\t{}\n'
               'Unique kmer in {}:\t{}\n')

        print(mes.format(os.path.basename(readfile1), n_unique1, n_shared,
                         os.path.basename(readfile2), n_unique2))

    finally:
        fp1.close()
        fp2.close()
        if uniq2:
            fw2.close()
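
The k-mer loops above slide a window of width K over each sequence; a sequence of length L yields L - K + 1 k-mers, which is why the loops run over range(0, seq_len + 1 - K). A tiny self-contained sketch with illustrative values (not from the original script):

K = 5
sequence = 'ACGTACGT'  # hypothetical read of length 8
kmers = [sequence[i:i + K] for i in range(0, len(sequence) + 1 - K)]
print(kmers)  # ['ACGTA', 'CGTAC', 'GTACG', 'TACGT']  -> 8 - 5 + 1 = 4 k-mers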
Example #16
def main():
    parser = build_nodegraph_args("find uniq kmer in query compard to refs")
    parser.add_argument(
        'ref',
        nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    parser.add_argument('--bfout',
                        default='nodetable.bf',
                        help='output bloom filter of ref')

    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    # positional
    refs = args.ref

    start_time = time.time()
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)

    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded within {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    print('Saving bloom filter to {}..'.format(args.bfout), file=sys.stderr)
    ht.save(args.bfout)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    mes = ('Unique kmer:\t{}\n')

    print(mes.format(n_unique1), file=sys.stderr)
Example #17
def main():
    parser = build_nodegraph_args("find uniq kmer in query compard to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))

    parser.add_argument('--x2',
                        default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2',
                        default='4',
                        help='# of table (N) for readfile2')

    parser.add_argument('--bfout', help='output bloom filter of ref')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--shared',
                       dest='output',
                       action='store_const',
                       const='shared',
                       help='output shared kmers')
    group.add_argument('--uniq',
                       dest='output',
                       action='store_const',
                       const='uniq',
                       help='output uniq kmers in query')

    group2 = parser.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--ref',
        nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    group2.add_argument('--load', help='load existing bloom filter')

    parser.set_defaults(output='uniq')
    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    output = args.output

    start_time = time.time()
    # load from existing bloom filter
    if args.load:
        print('loading bloom filter from {}..'.format(args.load),
              file=sys.stderr)
        ht = khmer.load_nodetable(args.load)
        k = ht.ksize()
        mes = ('*** incompatible ksize ({}) in {} with parameters K on '
               'command line ({})')
        assert k == K, mes.format(k, args.load, K)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'load bloom filter ({}) took {:.2f} hours..'
        print(mes.format(os.path.basename(args.load), secs / 3600.0),
              file=sys.stderr)

    # create a hashbits data structure
    else:
        refs = args.ref
        print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
        if query == '-' and refs == ['-']:
            print('*** query and ref can not both be "-" (read from stdin)',
                  file=sys.stderr)
        ht = khmer.Nodetable(K, HT_SIZE, N_HT)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'initiation of bloom filter took {:.2f} hours..'
        print(mes.format(secs / 3600.0), file=sys.stderr)
        for index, filename in enumerate(refs):
            if index != 0 and index % 100 == 0:
                end_time = time.time()
                secs = end_time - start_time
                mes = '{} refs have been loaded within {:.2f} hours ..'
                print(mes.format(index, secs / 3600.0), file=sys.stderr)
            try:
                ht.consume_seqfile(filename)
            except OSError as e:
                mes = (
                    '*** Skipping due to OSError (machine or system problem):'
                    ' {}\n'
                    '*** Detailed error message:\n'
                    '*** {}')
                print(mes.format(os.path.basename(filename), str(e)),
                      file=sys.stderr)
                continue

        if args.bfout:
            if args.load:
                mes = '*** Bloom filter exists as {}, NOT saving again as {}..'
                print(mes.format(args.load, args.bfout), file=sys.stderr)
            else:
                print('*** Saving bloom filter to {}..'.format(args.bfout),
                      file=sys.stderr)
                ht.save(args.bfout)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    # create a hashbits data structure
    ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

    n_unique2 = 0
    n_shared = 0

    if output == 'uniq':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    else:
                        mes = '>{}__{}  {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                ht2.count(kmer)

    elif output == 'shared':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                        mes = '>{}__{}  {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                    else:
                        pass

                ht2.count(kmer)

    mes = ('Unique kmer in {} (query):\t{}\n'
           'Shared kmer:\t{}\n'
           'Unique kmer in {}:\t{}\n')

    print(mes.format(os.path.basename(query), n_unique2, n_shared, 'refs',
                     n_unique1),
          file=sys.stderr)
Example #18
def get_parser():
    parser = build_nodegraph_args(
        "Takes a partitioned reference file \
                                  and a list of reads, and sorts reads \
                                  by which partition they connect to"
    )
    parser.epilog = EPILOG
    parser.add_argument(
        "-r",
        "--traversal_range",
        type=int,
        dest="traversal_range",
        default=DEFAULT_RANGE,
        help="depth of breadth-first search to perform\
                                    from each read",
    )
    parser.add_argument(
        "-b",
        "--buffer_size",
        dest="max_reads",
        type=int,
        default=DEFAULT_MAX_READS,
        help="Max total reads to buffer before flushing",
    )
    parser.add_argument(
        "-l",
        "--buffer_length",
        dest="buffer_size",
        type=int,
        default=DEFAULT_BUFFER_SIZE,
        help="Max length of an individual label buffer \
                              before flushing",
    )
    parser.add_argument("--prefix", dest="output_prefix", default=DEFAULT_OUT_PREF, help="Prefix for sorted read files")
    parser.add_argument(
        "--outdir",
        dest="outdir",
        help="output directory; default is location of \
                              fastp file",
    )
    parser.add_argument(
        "-m",
        "--max_buffers",
        dest="max_buffers",
        type=int,
        default=DEFAULT_NUM_BUFFERS,
        help="Max individual label buffers before flushing",
    )
    labeling = parser.add_mutually_exclusive_group(required=True)
    labeling.add_argument(
        "--label-by-pid",
        dest="label_by_pid",
        action="store_true",
        help="separate reads by\
                        reference partition id",
    )
    labeling.add_argument(
        "--label-by-seq",
        dest="label_by_seq",
        action="store_true",
        help="separate reads by\
                        reference sequence",
    )
    labeling.add_argument(
        "--label-by-group",
        dest="group_size",
        type=int,
        help="separate reads by arbitrary sized groups\
                        of reference sequences",
    )
    parser.add_argument(dest="input_fastp", help="Reference fasta or fastp")
    parser.add_argument("input_files", nargs="+", help="Reads to be swept and sorted")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example #19
def main():
    parser = build_nodegraph_args("find uniq kmer in query compard to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('ref',
                        nargs='+',
                        help='fasta sequence file to be loaded in hashtable')
    parser.add_argument('--x2',
                        default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2',
                        default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    refs = args.ref
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
    if query == '-' and refs == ['-']:
        print('*** query and ref can not both be "-" (read from stdin)',
              file=sys.stderr)
    # create a hashbits data structure
    start_time = time.time()
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)
    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded within {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    pair = 0
    forward = 0
    reverse = 0
    other = 0
    total_pair = 0
    for n, is_pair, r1, r2 in broken_paired_reader(
            khmer.ReadParser(query, require_paired=True)):
        #for n, record in enumerate(screed.open(query)):
        total_pair += 1
        share_list = []
        for record in [r1, r2]:
            name, desc = record.name.split(None, 1)
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                print('*** {} is shorter than {}..'.format(record.name, K),
                      file=sys.stderr)
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if ht.get(kmer):
                    share_list.append(1)
                    break
                else:
                    share_list.append(0)

        if share_list == [1, 1]:
            pair += 1
        elif share_list == [1, 0]:
            forward += 1
        elif share_list == [0, 1]:
            reverse += 1
        else:  #[0, 0]
            other += 1
            # do not print
            continue

        mes = ('>{}  {}||uniq_{}\n{}\n' '>{}  {}||uniq_{}\n{}')
        l1 = r1.name.split(None, 1)
        l2 = r2.name.split(None, 1)
        print(
            mes.format(l1[0], l1[1], share_list[0], r1.sequence, l2[0], l2[1],
                       share_list[1], r2.sequence))

    mes = ('Unique kmer in ref:\t{}\n'
           'Total pair:\t{}\n'
           'Both primers uniq:\t{}\n'
           'Pair with forward uniq:\t{}\n'
           'Pair with reverse uniq:\t{}')

    print(mes.format(n_unique1, total_pair, pair, forward, reverse),
          file=sys.stderr)