Example #1
def test_load_graph_1():
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    cmd = """
       cat {in1} |
       {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    run_shell_cmd(cmd)
    assert os.path.exists(out1)
    khmer.load_nodegraph(out1)
Example #2
 def data(self):
     if self._data is None:
         if self._filename is None:
             self._data = self._factory()
         else:
             self._data = khmer.load_nodegraph(self._filename)
     return self._data
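
Example #2 is a lazy-loading `data` property lifted out of its class. As a rough, self-contained sketch of how the same pattern might be wrapped up, the class below is hypothetical (its name, constructor, and the "graph.ng" path are illustrative, not from the original project); only `khmer.load_nodegraph` comes from the examples on this page.

import khmer


class LazyNodegraph:
    """Hypothetical wrapper around the lazy-load pattern in Example #2."""

    def __init__(self, filename=None, factory=None):
        self._filename = filename   # path to a saved nodegraph, or None
        self._factory = factory     # fallback: callable that builds a fresh graph
        self._data = None

    @property
    def data(self):
        # Defer the (potentially large) load until first access.
        if self._data is None:
            if self._filename is None:
                self._data = self._factory()
            else:
                self._data = khmer.load_nodegraph(self._filename)
        return self._data


# usage sketch:
#   lazy = LazyNodegraph(filename="graph.ng")
#   graph = lazy.data   # the file is read here, not at construction time
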
Example #3
    def load(sbt_fn):
        with open(sbt_fn) as fp:
            nodes = json.load(fp)

        if nodes[0] is None:
            # TODO error!
            raise ValueError("Empty tree!")

        sbt_nodes = []

        ksize, tablesize, ntables, _, _, _ = khmer.extract_nodegraph_info(nodes[0]["filename"])
        factory = GraphFactory(ksize, tablesize, ntables)

        for node in nodes:
            if node is None:
                sbt_nodes.append(None)
                continue

            graph = khmer.load_nodegraph(node["filename"])

            if "metadata" in node:
                # only Leaf nodes have metadata
                leaf = Leaf(node["metadata"], graph)
                sbt_nodes.append(leaf)
            else:
                n = Node(factory, name=node["name"])
                n.graph = graph
                sbt_nodes.append(n)

        tree = SBT(factory)
        tree.nodes = sbt_nodes

        return tree
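
Example #3 reads the graph parameters with `khmer.extract_nodegraph_info` before loading any node. A minimal sketch of that peek-then-load step on its own; the "node0.ng" path is illustrative, and the 6-tuple unpacking mirrors the example above.

import khmer

# Read a saved nodegraph's parameters without loading the full table,
# then load the graph and confirm the k-mer size matches.
ksize, tablesize, ntables, _, _, _ = khmer.extract_nodegraph_info("node0.ng")
graph = khmer.load_nodegraph("node0.ng")
assert graph.ksize() == ksize
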
Example #4
def test_save_load(tabletype):
    kh = tabletype(5, PRIMES_1m)
    savefile = utils.get_temp_filename('tablesave.out')

    # test add(dna)
    x = kh.add("ATGGC")
    z = kh.get("ATGGC")
    assert z == 1

    kh.save(savefile)

    # should we provide a single load function here? yes, probably. @CTB
    if tabletype == _Countgraph:
        loaded = khmer.load_countgraph(savefile)
    elif tabletype == _Counttable:
        loaded = khmer.load_counttable(savefile)
    elif tabletype == _SmallCountgraph:
        loaded = khmer.load_countgraph(savefile, small=True)
    elif tabletype == _SmallCounttable:
        loaded = khmer.load_counttable(savefile, small=True)
    elif tabletype == _Nodegraph:
        loaded = khmer.load_nodegraph(savefile)
    elif tabletype == _Nodetable:
        loaded = khmer.load_nodetable(savefile)
    else:
        raise Exception("unknown tabletype")

    z = loaded.get('ATGGC')
    assert z == 1
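
Example #4 dispatches to a type-specific loader; stripped down to just the nodegraph case, a save/load round trip looks roughly like the sketch below (the sizes and the temp path are illustrative, not the test's values).

import khmer

# Build a small presence/absence table, record one k-mer, and round-trip it.
ng = khmer.Nodegraph(5, 100000, 4)       # ksize, table size, number of tables
ng.add("ATGGC")
ng.save("/tmp/tablesave.ng")             # illustrative path

loaded = khmer.load_nodegraph("/tmp/tablesave.ng")
assert loaded.get("ATGGC") == 1          # membership survives the round trip
assert loaded.ksize() == 5
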
Example #5
def test_load_graph_1():
    in1 = utils.get_test_data("test-abund-read-2.fa")
    out1 = utils.get_temp_filename("out.ct")

    cmd = """
       cat {in1} |
       {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    (status, out, err) = run_shell_cmd(cmd)
    assert os.path.exists(out1)
    khmer.load_nodegraph(out1)
Example #6
def test_hashbits_file_version_check():

    inpath = utils.get_test_data('badversion-k12.htable')

    try:
        nodegraph = khmer.load_nodegraph(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example #7
def test_hashbits_file_version_check():

    inpath = utils.get_test_data('badversion-k12.htable')

    try:
        nodegraph = khmer.load_nodegraph(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example #8
def test_nodegraph_file_type_check():
    kh = khmer._Countgraph(12, [1])
    savepath = utils.get_temp_filename('tempcountingsave0.ct')
    kh.save(savepath)

    try:
        nodegraph = khmer.load_nodegraph(savepath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example #9
def test_nodegraph_file_type_check():
    kh = khmer._Countgraph(12, [1])
    savepath = utils.get_temp_filename('tempcountingsave0.ct')
    kh.save(savepath)

    try:
        nodegraph = khmer.load_nodegraph(savepath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example #10
 def data(self):
     if self._data is None:
         data = self.storage.load(self._path)
         # We need to do this tempfile dance because khmer only loads
         # data from files.
         with NamedTemporaryFile(suffix=".gz") as f:
             f.write(data)
             f.file.flush()
             self._data = khmer.load_nodegraph(f.name)
     return self._data
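
Examples #2, #10 and #24 all work around the same constraint: `khmer.load_nodegraph` only accepts a file path. When the bytes live in some other storage backend, a temp file bridges the gap. A standalone sketch of that pattern follows; the function name is hypothetical, and `raw` is assumed to be bytes previously written by `Nodegraph.save`.

import khmer
from tempfile import NamedTemporaryFile


def nodegraph_from_bytes(raw):
    # khmer only loads from paths, so spill the bytes to a temporary file,
    # flush, and load before the context manager deletes it.
    with NamedTemporaryFile(suffix=".gz") as f:
        f.write(raw)
        f.file.flush()
        return khmer.load_nodegraph(f.name)
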
Example #11
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = khmer.load_nodegraph(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
    divvy = list(divvy)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')
    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
Example #12
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = khmer.load_nodegraph(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')
    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
Example #13
def test_n_occupied_save_load():
    filename = utils.get_test_data('random-20-a.fa')

    nodegraph = khmer.Nodegraph(20, 100000, 3)

    for _, record in enumerate(screed.open(filename)):
        nodegraph.consume(record.sequence)

    assert nodegraph.n_occupied() == 3884
    assert nodegraph.n_unique_kmers() == 3960

    savefile = utils.get_temp_filename('out')
    nodegraph.save(savefile)

    ng2 = khmer.load_nodegraph(savefile)
    assert ng2.n_occupied() == 3884, ng2.n_occupied()
    assert ng2.n_unique_kmers() == 0    # this is intended behavior, sigh.
Example #14
def test_n_occupied_save_load():
    filename = utils.get_test_data('random-20-a.fa')

    nodegraph = khmer.Nodegraph(20, 100000, 3)

    for _, record in enumerate(screed.open(filename)):
        nodegraph.consume(record.sequence)

    assert nodegraph.n_occupied() == 3884
    assert nodegraph.n_unique_kmers() == 3960

    savefile = utils.get_temp_filename('out')
    nodegraph.save(savefile)

    ng2 = khmer.load_nodegraph(savefile)
    assert ng2.n_occupied() == 3884, ng2.n_occupied()
    assert ng2.n_unique_kmers() == 0  # this is intended behavior, sigh.
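
Examples #13 and #14 document a quirk worth keeping in mind: `n_unique_kmers()` comes back as 0 after `load_nodegraph`, so the tests fall back on `n_occupied()` as the size estimate that survives a save/load cycle. A short sketch of that fallback, with an illustrative path:

import khmer

ng = khmer.load_nodegraph("saved.ng")    # illustrative path
# n_unique_kmers() is reset to 0 by the load (see the assertions above),
# so n_occupied() is the only size estimate that survives the round trip.
approx_n_kmers = ng.n_occupied()
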
Example #15
def load_node(node_dict, factory):

    graph = khmer.load_nodegraph(node_dict['filename'])

    if 'metadata' in node_dict: # must be a leaf
        return Leaf(node_dict['metadata'], node_dict['name'], graph)
        
    else:
        node = Node(factory)
        node.graph = graph

        left = node_dict['left']
        node.subnodes.append(load_node(left, factory))
        right = node_dict['right']
        node.subnodes.append(load_node(right, factory))

        node.children = node_dict['children']
        node.name = node_dict['name']
        return node
Example #16
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates a CSV file of similarity indicies between the"
        " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f',
                        '--force',
                        action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-ct',
        '--containment_threshold',
        type=restricted_float,
        help="Only return results with containment index above this value",
        default=0.02)
    parser.add_argument(
        '-c',
        '--confidence',
        type=restricted_float,
        help=
        "Desired probability that all results were returned with containment index above threshold [-ct]",
        default=0.95)
    parser.add_argument(
        '-ng',
        '--node_graph',
        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
        "will be created and put in the same directory as the specified "
        "output CSV file.",
        default=None)
    parser.add_argument(
        '-b',
        '--base_name',
        action="store_true",
        help=
        "Flag to indicate that only the base names (not the full path) should be saved in the output CSV file"
    )
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Option to only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space.")
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument(
        'training_data',
        help=
        "Training/reference data (HDF5 file created by MakeTrainingDatabase.py)"
    )
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." %
                        training_data)
    # Let's get the k-mer sizes in the training database
    ksizes = set()
    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    # Check for issues with the sketches (can also check if all the kmers make sense (i.e. no '' or non-ACTG characters))
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
        )
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
            )
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" %
                            sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s."
                % (sketch.input_file_name))
    # Get the appropriate k-mer size
    ksize = ksizes.pop()
    # Get number of threads to use
    num_threads = args.threads
    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(
                node_graph_out
        ):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" %
                  node_graph_out)
    elif os.path.exists(
            args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." %
                        args.node_graph)
    # import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except OSError:  # only trap load failures; the k-mer size mismatch above propagates
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
            #sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser, ))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
            # (WARNING: this will cause the Jaccard index to be calculated in terms of J(query\intersect hash_list, training)
            #  instead of J(query, training)
            # (TODO: fix this after khmer is updated)
            #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied(
            )  # Not technically correct, but I need to wait until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count,
                                   fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). Try --force or change -k."
                % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)

    #num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works when creating a new node graph, use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {
        'intersection': intersection_cardinalities,
        'containment index': containment_indexes,
        'jaccard index': jaccard_indexes
    }
    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate,
                                       confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)
    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
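
Buried in Example #16 is a reusable sizing recipe: estimate the sample's k-mer cardinality with a HyperLogLog counter, then let `optimal_size` pick Bloom filter dimensions for a target false positive rate. A condensed sketch, assuming `optimal_size` lives in `khmer.khmer_args` (the example calls it unqualified, so the import location is an assumption) and using a hypothetical function name:

import khmer
from khmer.khmer_args import optimal_size   # assumed import; Example #16 calls it unqualified


def build_sized_nodegraph(seqfile, ksize=21, fp_rate=1e-4):
    # Estimate how many distinct k-mers the file contains (1% HLL error rate),
    # then size the Bloom filter so collisions stay below fp_rate.
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(seqfile)
    n_kmers = hll.estimate_cardinality()

    res = optimal_size(n_kmers, fp_rate=fp_rate)
    return khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
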
Example #17
    def load(info, dirname):
        new_node = Node(info['factory'], name=info['name'])

        filename = os.path.join(dirname, info['filename'])
        new_node.data = khmer.load_nodegraph(filename)
        return new_node
Example #18
def main():
    info('find-knots.py', ['graph'])
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.org/en/stable/user/"
        "partitioning-big-data.html")
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in',
          graphbase + '.stoptags',
          file=sys.stderr)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)',
              file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file,
              file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)
Example #19
 def load(info):
     data = khmer.load_nodegraph(info['filename'])
     return Leaf(info['metadata'], data, name=info['name'])
Example #20
 def load(info):
     new_node = Node(info['factory'], name=info['name'])
     new_node.data = khmer.load_nodegraph(info['filename'])
     return new_node
Example #21
 def load(info, dirname):
     filepath = os.path.join(dirname, info['filename'])
     data = khmer.load_nodegraph(filepath)
     return Leaf(info['metadata'], data, name=info['name'])
Example #22
def main():
    info('partition-graph.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    filenames = [basename, basename + '.tagset']
    for _ in filenames:
        check_input_files(_, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = load_nodegraph(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.',
              file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.',
              file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker,
                                    args=(worker_q, basename,
                                          stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (basename, ),
          file=sys.stderr)
Example #23
def main():
    info('partition-graph.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    filenames = [basename, basename + '.tagset']
    for _ in filenames:
        check_input_files(_, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = load_nodegraph(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' %
          (basename,), file=sys.stderr)
Example #24
 def data(self):
     if self._data is None:
         # TODO: what if self._filename is None?
         self._data = khmer.load_nodegraph(self._filename)
     return self._data
Example #25
def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html"
    )
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in',
          graphbase + '.stoptags', file=sys.stderr)
    if initial_stoptags:
        print(
            '(these output stoptags will include the already-loaded set)',
            file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' %
              subset_file, file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)
Example #26
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script will create node graph for a given k-mer size and query file (can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        help=
        "Location of Node Graph. Will only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space (unfortunately will also disable threading)."
    )
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    if args.intersect_nodegraph is not None:
        intersect_nodegraph_file = args.intersect_nodegraph
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except OSError:  # only trap load failures; the k-mer size mismatch above propagates
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        #sample_kmers.consume_seqfile(query_file)
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
        # (WARNING: this will cause the Jaccard index to be calculated in terms of J(query\intersect hash_list, training)
        #  instead of J(query, training)
        # (TODO: fix this after khmer is updated)
        #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied(
        )  # Not technically correct, but I need to wait until khmer is updated
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        else:
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)
    # Save the sample_kmers
    sample_kmers.save(node_graph_out)