Ejemplo n.º 1
0
def test_load_graph_1():
    """Feed a FASTA file to load-graph.py over stdin and load the output.

    The shell pipeline cats the test data into ``load-graph.py`` (whose
    input argument is ``-``) and discards stderr.  The test then checks
    that the output file exists and passes it to ``Nodegraph.load``.
    """
    template = """
       cat {in1} |
       {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    reads_path = utils.get_test_data('test-abund-read-2.fa')
    graph_path = utils.get_temp_filename('out.ct')

    cmd = template.format(scripts=scriptpath(), in1=reads_path,
                          out1=graph_path)
    print(cmd)
    run_shell_cmd(cmd)

    assert os.path.exists(graph_path)
    # The load call itself is the check; its return value is not inspected.
    Nodegraph.load(graph_path)
Ejemplo n.º 2
0
def test_load_graph_1():
    """Run load-graph.py on piped-in reads, then load the graph it wrote.

    ``load-graph.py`` is given ``-`` as its input argument and receives
    the test FASTA file through a ``cat`` pipe; stderr is sent to
    ``/dev/null``.  Afterwards the output file must exist and is handed
    to ``Nodegraph.load``.
    """
    pipeline = """
       cat {in1} |
       {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    fasta_in = utils.get_test_data('test-abund-read-2.fa')
    table_out = utils.get_temp_filename('out.ct')

    cmd = pipeline.format(scripts=scriptpath(), in1=fasta_in, out1=table_out)
    print(cmd)
    run_shell_cmd(cmd)

    assert os.path.exists(table_out)
    # Only the call matters here; the loaded object is not used.
    Nodegraph.load(table_out)
Ejemplo n.º 3
0
def test_hashbits_file_version_check():
    """Expect ``Nodegraph.load`` to raise OSError for 'badversion-k12.htable'.

    The error text is printed; a successful load fails the test.
    """
    bad_file = utils.get_test_data('badversion-k12.htable')

    try:
        Nodegraph.load(bad_file)
    except OSError as err:
        print(str(err))
    else:
        # Reaching this branch means the load did not raise.
        assert 0, "this should fail"
Ejemplo n.º 4
0
def test_hashbits_file_version_check():
    """Loading the 'badversion-k12.htable' test file must raise OSError.

    On OSError the message is printed and the test passes; if the load
    succeeds the test fails with an AssertionError.
    """
    htable_path = utils.get_test_data('badversion-k12.htable')

    try:
        Nodegraph.load(htable_path)
    except OSError as exc:
        print(str(exc))
    else:
        # No exception was raised, which is the failure case.
        assert 0, "this should fail"
Ejemplo n.º 5
0
def test_nodegraph_file_type_check():
    """Expect ``Nodegraph.load`` to raise OSError on a saved Countgraph.

    A Countgraph is written to a temporary file, then loaded through
    ``Nodegraph.load``; the resulting OSError message is printed.
    """
    countgraph = khmer.Countgraph(12, 1, 1)
    countgraph_path = utils.get_temp_filename('tempcountingsave0.ct')
    countgraph.save(countgraph_path)

    try:
        Nodegraph.load(countgraph_path)
    except OSError as err:
        print(str(err))
    else:
        # The load went through without raising: fail the test.
        assert 0, "this should fail"
Ejemplo n.º 6
0
def test_nodegraph_file_type_check():
    """A file saved from a Countgraph must not load as a Nodegraph.

    The test saves ``khmer.Countgraph(12, 1, 1)`` to a temporary file and
    requires ``Nodegraph.load`` on that file to raise OSError.
    """
    wrong_type = khmer.Countgraph(12, 1, 1)
    saved_to = utils.get_temp_filename('tempcountingsave0.ct')
    wrong_type.save(saved_to)

    try:
        Nodegraph.load(saved_to)
    except OSError as exc:
        print(str(exc))
    else:
        # Getting here means no OSError was raised.
        assert 0, "this should fail"
Ejemplo n.º 7
0
def main():
    """Compute stop tags from the first tag subset and save them.

    Steps, in order:

    1. check that ``graphbase``, ``graphbase + '.tagset'`` and (if given)
       ``args.stoptags`` are usable input files;
    2. load the nodegraph, any supplied stop tags, and the tagset;
    3. create a countgraph from the command-line arguments;
    4. partition the first tag subset and repartition the largest
       partition with the ``EXCURSION_*`` settings;
    5. write the stop tags to ``graphbase + '.stoptags'``.

    All progress messages go to stderr.
    """
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for infile in infiles:
        check_input_files(infile, args.force)

    # The file loaded is `graphbase` itself, so report that name (the old
    # message claimed a '.pt' suffix that is never opened).
    print('loading nodegraph %s' % graphbase, file=sys.stderr)
    nodegraph = Nodegraph.load(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = list(nodegraph.divide_tags_into_subsets(args.subset_size))

    # pick off the first one
    # NOTE(review): an empty `divvy` takes the else branch and fails to
    # unpack; confirm whether a graph with no tags can reach this point.
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

    stoptags_path = graphbase + '.stoptags'
    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(stoptags_path)
    print('wrote to:', stoptags_path, file=sys.stderr)
Ejemplo n.º 8
0
def test_n_occupied_save_load():
    """Check n_occupied and n_unique_kmers before and after save/load.

    After consuming 'random-20-a.fa' the graph must report 3884 occupied
    bins and 3960 unique k-mers.  The reloaded graph must report the same
    occupancy but a unique k-mer count of 0.
    """
    reads_path = utils.get_test_data('random-20-a.fa')

    original = khmer.Nodegraph(20, 100000, 3)
    for record in screed.open(reads_path):
        original.consume(record.sequence)

    assert original.n_occupied() == 3884
    assert original.n_unique_kmers() == 3960

    saved_path = utils.get_temp_filename('out')
    original.save(saved_path)

    reloaded = Nodegraph.load(saved_path)
    assert reloaded.n_occupied() == 3884, reloaded.n_occupied()
    # this is intended behavior, sigh.
    assert reloaded.n_unique_kmers() == 0
Ejemplo n.º 9
0
def test_n_occupied_save_load():
    """Occupancy must survive a save/load cycle; the unique count resets.

    The graph is filled from 'random-20-a.fa', checked (3884 occupied,
    3960 unique k-mers), saved to a temporary file and loaded again.  The
    loaded copy must still report 3884 occupied and 0 unique k-mers.
    """
    fasta_path = utils.get_test_data('random-20-a.fa')

    ng1 = khmer.Nodegraph(20, 100000, 3)
    for read in screed.open(fasta_path):
        ng1.consume(read.sequence)

    assert ng1.n_occupied() == 3884
    assert ng1.n_unique_kmers() == 3960

    out_path = utils.get_temp_filename('out')
    ng1.save(out_path)

    ng2 = Nodegraph.load(out_path)
    assert ng2.n_occupied() == 3884, ng2.n_occupied()
    # this is intended behavior, sigh.
    assert ng2.n_unique_kmers() == 0
Ejemplo n.º 10
0
def main():
    """Partition a tagged nodegraph into subsets using worker threads.

    Steps, in order:

    1. check that ``basename`` and ``basename + '.tagset'`` are usable
       input files;
    2. load the nodegraph, its tagset and (if given) ``args.stoptags``;
    3. split the tags into subsets of ``args.subset_size`` and queue one
       task per subset;
    4. record the subset count in ``basename + '.info'``;
    5. start up to ``args.threads`` threads running ``worker`` and wait
       for all of them to finish.

    All progress messages go to stderr.
    """
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    for filename in (basename, basename + '.tagset'):
        check_input_files(filename, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = Nodegraph.load(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = list(nodegraph.divide_tags_into_subsets(int(args.subset_size)))
    n_subsets = len(divvy)
    # Pad with 0 so the last subset also has an `end` value.
    # NOTE(review): presumably 0 means "through the last tag" -- confirm.
    divvy.append(0)

    # build a queue of tasks, one (graph, index, start, end) per subset
    worker_q = queue.Queue()
    for index in range(n_subsets):
        worker_q.put((nodegraph, index, divvy[index], divvy[index + 1]))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    # Use a context manager so the .info file is flushed and closed here
    # rather than whenever the file object happens to be collected.
    with open('%s.info' % basename, 'w') as info_fp:
        info_fp.write('%d subsets total\n' % (n_subsets))

    # never start more threads than there are tasks
    n_threads = min(args.threads, n_subsets)

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for thread in threads:
        thread.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' %
          (basename,), file=sys.stderr)
Ejemplo n.º 11
0
def main():
    """Repartition every saved subset and accumulate stop tags.

    Steps, in order:

    1. rewrite the Sphinx ``:doc:`` reference in the parser epilog to a
       plain URL, then parse the arguments;
    2. check the input files and available space;
    3. load the nodegraph, its tagset and, if it exists,
       ``graphbase + '.stoptags'``;
    4. for each ``graphbase + '.subset.*.pmap'`` file: load the subset,
       repartition it, merge it into the graph, repartition again, save
       the stop tags and rename the file with a ``.processed`` suffix.

    Exits with status 1 if no pmap files are found.  All messages go to
    stderr.
    """
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html")
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase
    stoptags_path = graphbase + '.stoptags'

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(stoptags_path):
        infiles.append(stoptags_path)
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = Nodegraph.load(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(stoptags_path):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(stoptags_path)
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    if not pmap_files:
        # glob returns an empty list when nothing matches; indexing it for
        # the "first one" message below would raise IndexError.
        print('ERROR: no pmap files found matching %s.subset.*.pmap' %
              graphbase, file=sys.stderr)
        sys.exit(1)

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in', stoptags_path, file=sys.stderr)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)',
              file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge; count from 1 so the progress line ends at "(N of N)"
    for index, subset_file in enumerate(pmap_files, 1):
        print('<-', subset_file, file=sys.stderr)
        subset = SubsetPartition.load(subset_file, graph)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file,
              file=sys.stderr)
        size = graph.repartition_largest_partition(
            counting,
            EXCURSION_DISTANCE,
            EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        # Stop tags are saved after every subset, and the pmap file is then
        # renamed so it no longer matches the '.subset.*.pmap' glob.
        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(stoptags_path)
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)
Ejemplo n.º 12
0
def main():
    """Process each subset pmap file and write the combined stop tags.

    The parser epilog has its Sphinx ``:doc:`` reference replaced by a
    plain URL before the arguments are parsed.  The function then checks
    the input files and free space, loads the nodegraph, the tagset and
    any existing ``graphbase + '.stoptags'`` file, and creates a
    countgraph with the graph's k-mer size.

    Every file matching ``graphbase + '.subset.*.pmap'`` is loaded,
    repartitioned, merged into the graph and repartitioned again.  After
    each file the stop tags are saved and the file is renamed with a
    ``.processed`` suffix.

    Exits with status 1 if no pmap files are found.  All messages go to
    stderr.
    """
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html"
    )
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase
    tagset_file = graphbase + '.tagset'
    stoptags_file = graphbase + '.stoptags'

    # @RamRS: This might need some more work
    infiles = [graphbase, tagset_file]
    if os.path.exists(stoptags_file):
        infiles.append(stoptags_file)
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = Nodegraph.load(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(tagset_file)

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(stoptags_file):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(stoptags_file)
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    if not pmap_files:
        # An empty match list would make pmap_files[0] below raise
        # IndexError; report the problem and stop instead.
        print('ERROR: no pmap files found matching %s.subset.*.pmap' %
              graphbase, file=sys.stderr)
        sys.exit(1)
    n_pmaps = len(pmap_files)

    print('loading %d pmap files (first one: %s)' %
          (n_pmaps, pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in', stoptags_file, file=sys.stderr)
    if initial_stoptags:
        print(
            '(these output stoptags will include the already-loaded set)',
            file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge; numbering starts at 1 so the last line is "(N of N)"
    for position, subset_file in enumerate(pmap_files, 1):
        print('<-', subset_file, file=sys.stderr)
        subset = SubsetPartition.load(subset_file, graph)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' %
              subset_file, file=sys.stderr)
        size = graph.repartition_largest_partition(
            counting,
            EXCURSION_DISTANCE,
            EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        # Save after every subset, then rename the pmap file so it no
        # longer matches the '.subset.*.pmap' glob.
        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(stoptags_file)
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (position, n_pmaps), file=sys.stderr)

    print('done!', file=sys.stderr)