Code example #1
File: test_scripts.py Project: ashwinkalbhor/khmer
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    utils.runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    utils.runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0), x          # should be 2 partitions
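
Note: every test on this page leans on a _make_graph helper that the excerpts never show. The sketch below is a hypothetical stand-in, assuming the pre-2.0 khmer API used throughout; the real helper in khmer's test_scripts.py builds the graph by running load-graph.py via runscript, and the forks here disagree on the extension ('.ht' vs '.pt') for the same presence-table file.

import khmer

def _make_graph_sketch(infilename, ksize=20, graphbase='/tmp/graph'):
    # hypothetical stand-in for the tests' _make_graph helper: build a
    # k-mer presence table, tag k-mers while consuming the reads, and
    # save the two files the partitioning scripts expect
    ht = khmer.new_hashbits(ksize, 1e7, 2)
    ht.consume_fasta_and_tag(infilename)
    ht.save(graphbase + '.pt')
    ht.save_tagset(graphbase + '.tagset')
    return graphbase
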
Code example #2
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
    in_dir = os.path.dirname(graphbase)

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0)  # should be 2 partitions
Code example #3
File: test_scripts.py Project: paulrigor/khmer
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data("random-20-a.fa"))
    in_dir = os.path.dirname(graphbase)

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + ".ht")
    ht.add_stop_tag("TTGCATACGTTGAGCCAGCG")
    stoptags_file = graphbase + ".stoptags"
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath("partition-graph.py")
    args = ["--stoptags", stoptags_file, graphbase]

    runscript(script, args)

    script = scriptpath("merge-partitions.py")
    args = [graphbase, "-k", str(20)]
    runscript(script, args)

    final_pmap_file = graphbase + ".pmap.merged"
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + ".ht")
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0)  # should be 2 partitions
Code example #4
File: test_scripts.py Project: Huanle/khmer
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
    in_dir = os.path.dirname(graphbase)

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0)          # should be 2 partitions
Code example #5
File: test_scripts.py Project: maarten1983/khmer
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    utils.runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    utils.runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0), x  # should be 2 partitions
Code example #6
def test_partition_graph_nojoin_k21():
    # test with K=21
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), K=21)
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(21)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (99, 0)  # should be 99 partitions at K=21
Code example #7
File: filter-sodd.py Project: xcams/khmer
def main():
    htfile = sys.argv[1]
    outfiles = sys.argv[2:]

    print 'loading hashbits'
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)

        if trim_at >= ht.ksize():
            return name, trim_seq

        return None, None

    for filename in outfiles:
        outpath = os.path.basename(filename) + '.sodd'
        outfp = open(outpath, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(filename), outfp)
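
ThreadedSequenceProcessor feeds each record to process_fn and keeps the (name, sequence) pairs it returns; returning (None, None), as above, drops the read. For reference, a rough single-threaded equivalent of the same filtering logic, assuming the screed library for FASTA parsing (a sketch, not khmer's code; MAX_SODD becomes a parameter here):

import screed

def filter_sodd_serial(ht, filename, outfp, max_sodd=3):
    # same per-record logic as process_fn above, minus the thread pool
    for record in screed.open(filename):
        seq = record['sequence']
        if 'N' in seq:
            continue
        trim_seq, trim_at = ht.trim_on_sodd(seq, max_sodd)
        if trim_at >= ht.ksize():
            outfp.write('>%s\n%s\n' % (record['name'], trim_seq))
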
Code example #8
File: test_scripts.py Project: Huanle/khmer
def test_partition_graph_nojoin_k21():
    # test with K=21
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), K=21)
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(21)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (99, 0)          # should be 99 partitions at K=21
Code example #9
File: find-unpart.py Project: charlesxian/khmer
def main():
    already_part = sys.argv[1]
    new_to_part = sys.argv[2]
    basename = os.path.basename(new_to_part)
    pmap_filename = sys.argv[3]

    # if not os.path.exists(already_part):
    #    print '%s doesn\'t exist! dying.' % already_part
    #    sys.exit(0)

    # create a fake-ish ht; K matters, but not hashtable size.
    ht = khmer.load_hashbits(already_part + '.ht')
    ht.load_tagset(already_part + '.tagset')
    ht.merge_subset_from_disk(pmap_filename)

    # find singletons
    n_singletons = ht.find_unpart(
        new_to_part, TRAVERSE_ON_UNPART, STOP_BIG_TRAVERSALS)
    print 'found:', n_singletons

    print 'saving', basename + '.unpart'
    n_partitions = ht.output_partitions(new_to_part, basename + '.unpart')
    print 'saving', basename + '.pmap'
    ht.save_partitionmap(basename + '.pmap')

    ###

    (n_partitions, n_singletons) = ht.count_partitions()

    print 'output partitions:', n_partitions
    print 'pmap partitions:', n_partitions
    print 'singletons:', n_singletons
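
The script reads its three positional arguments straight from sys.argv, so argument order is significant. A hedged argparse equivalent (help strings are illustrative, not khmer's actual CLI text):

import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description='Partition reads missed by an earlier partitioning run.')
    parser.add_argument('already_part', help='basename of the partitioned graph')
    parser.add_argument('new_to_part', help='FASTA file of unpartitioned reads')
    parser.add_argument('pmap_filename', help='merged partition map to load')
    return parser.parse_args()
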
Code example #10
File: find-unpart.py Project: shafcodes/khmer
def main():
    already_part = sys.argv[1]
    new_to_part = sys.argv[2]
    basename = os.path.basename(new_to_part)
    pmap_filename = sys.argv[3]

    # if not os.path.exists(already_part):
    #    print '%s doesn\'t exist! dying.' % already_part
    #    sys.exit(0)

    # create a fake-ish ht; K matters, but not hashtable size.
    ht = khmer.load_hashbits(already_part + '.ht')
    ht.load_tagset(already_part + '.tagset')
    ht.merge_subset_from_disk(pmap_filename)

    # find singletons
    n_singletons = ht.find_unpart(new_to_part, TRAVERSE_ON_UNPART,
                                  STOP_BIG_TRAVERSALS)
    print 'found:', n_singletons

    print 'saving', basename + '.unpart'
    n_partitions = ht.output_partitions(new_to_part, basename + '.unpart')
    print 'saving', basename + '.pmap'
    ht.save_partitionmap(basename + '.pmap')

    ###

    (n_partitions, n_singletons) = ht.count_partitions()

    print 'output partitions:', n_partitions
    print 'pmap partitions:', n_partitions
    print 'singletons:', n_singletons
Code example #11
File: test_scripts.py Project: maarten1983/khmer
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = utils.runscript(script, args)

    assert 'Total number of k-mers: 3959' in err, err

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
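
count_partitions() and subset_count_partitions() both return a (n_partitions, n_singletons) pair, which is why these tests compare against tuples like (1, 0) or (2, 0). A small sketch that makes the unpacking explicit (the helper name is illustrative):

def assert_partition_counts(ht, subset, expected_partitions):
    # subset_count_partitions() returns (n_partitions, n_singletons)
    n_partitions, n_singletons = ht.subset_count_partitions(subset)
    assert n_partitions == expected_partitions, n_partitions
    assert n_singletons == 0, n_singletons
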
Code example #12
File: filter-sodd.py Project: Canido/khmer
def main():
    htfile = sys.argv[1]
    outfiles = sys.argv[2:]

    print 'loading hashbits'
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)

        if trim_at >= ht.ksize():
            return name, trim_seq

        return None, None

    for filename in outfiles:
        outpath = os.path.basename(filename) + '.sodd'
        outfp = open(outpath, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(filename), outfp)
Code example #13
File: test_scripts.py Project: Huanle/khmer
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = runscript(script, args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Code example #14
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
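
main() above never closes output or f_curve_obj, leaving the flush to interpreter exit. A sketch of the same report-writing step with deterministic cleanup (the function name is illustrative):

def write_overlap_report(report_filename, printout, list_curve):
    # same output as above, but each handle is closed as soon as it is done
    with open(report_filename, 'w') as output:
        output.write(printout)
    with open(report_filename + '.curve', 'w') as curve:
        for i in range(100):
            curve.write('%s %s\n' % (list_curve[100 + i], list_curve[i]))
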
Code example #15
File: test_scripts.py Project: paulrigor/khmer
def test_load_graph():
    script = scriptpath("load-graph.py")
    args = ["-x", "1e7", "-N", "2", "-k", "20"]

    outfile = utils.get_temp_filename("out")
    infile = utils.get_test_data("random-20-a.fa")

    args.extend([outfile, infile])

    runscript(script, args)

    ht_file = outfile + ".ht"
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + ".tagset"
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Code example #16
File: count-overlap.py Project: Canido/khmer
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
Code example #17
def main():
    parser = argparse.ArgumentParser(
        description="Find an initial set of highly connected k-mers.")

    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags', '-S', dest='stoptags', default='',
                        help="Use stoptags in this file during partitioning")

    parser.add_argument('graphbase')

    args = parser.parse_args()

    graphbase = args.graphbase

    print 'loading ht %s.ht' % graphbase
    ht = khmer.load_hashbits(graphbase + '.ht')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    print 'loading tagset %s.tagset...' % graphbase
    ht.load_tagset(graphbase + '.tagset')

    K = ht.ksize()
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # divide up into SUBSET_SIZE fragments
    divvy = ht.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print 'doing pre-partitioning from', start, 'to', end
    subset = ht.do_subset_partition(start, end)

    # now, repartition...
    print 'repartitioning to find HCKs.'
    ht.repartition_largest_partition(subset, counting,
                                     EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    print 'saving stop tags'
    ht.save_stop_tags(graphbase + '.stoptags')
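
divide_tags_into_subsets() returns a list of tag boundaries, and each consecutive pair bounds one subset; the (0, 0) range covers the whole graph, as the test_load_graph examples above use do_subset_partition(0, 0) to partition everything. The divvy handling above, restated as a sketch:

def first_subset_range(ht, subset_size):
    divvy = ht.divide_tags_into_subsets(int(subset_size))
    if len(divvy) == 1:
        # only one boundary: nothing to split, take the whole graph
        return 0, 0
    return divvy[0], divvy[1]
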
Code example #18
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')
    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
Code example #19
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')
    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
Code example #20
File: test_scripts.py Project: zhangycbnu/khmer
def test_partition_graph_big_traverse():
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True, stop_big_traverse=False)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (1, 0)          # should be exactly one partition.
Code example #21
File: make-initial-stoptags.py Project: b-wyss/khmer
def main():

    info("make-initial-stoptags.py", ["graph"])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + ".pt", graphbase + ".tagset"]
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, "loading htable %s.pt" % graphbase
    htable = khmer.load_hashbits(graphbase + ".pt")

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, "loading stoptags from", args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, "loading tagset %s.tagset..." % graphbase
    htable.load_tagset(graphbase + ".tagset")

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, "doing pre-partitioning from", start, "to", end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, "repartitioning to find HCKs."
    htable.repartition_largest_partition(
        subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD
    )

    print >>sys.stderr, "saving stop tags"
    htable.save_stop_tags(graphbase + ".stoptags")
    print >>sys.stderr, "wrote to:", graphbase + ".stoptags"
Code example #22
File: test_scripts.py Project: zhangycbnu/khmer
def test_partition_graph_no_big_traverse():
    # do NOT exhaustively traverse
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True, stop_big_traverse=True)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (4, 0), x       # should be four partitions, broken at knot.
Code example #23
def test_partition_graph_big_traverse():
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True,
                            stop_big_traverse=False)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (1, 0)  # should be exactly one partition.
Code example #24
def test_partition_graph_no_big_traverse():
    # do NOT exhaustively traverse
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True,
                            stop_big_traverse=True)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (4, 0), x  # should be four partitions, broken at knot.
Code example #25
File: test_scripts.py Project: Canido/khmer
def test_load_graph_no_tags():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-n']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    runscript(script, args)

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    assert khmer.load_hashbits(ht_file)
Code example #26
File: test_scripts.py Project: paulrigor/khmer
def test_load_graph_no_tags():
    script = scriptpath("load-graph.py")
    args = ["-x", "1e7", "-N", "2", "-k", "20", "-n"]

    outfile = utils.get_temp_filename("out")
    infile = utils.get_test_data("random-20-a.fa")

    args.extend([outfile, infile])

    runscript(script, args)

    ht_file = outfile + ".ht"
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + ".tagset"
    assert not os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
Code example #27
File: test_scripts.py Project: maarten1983/khmer
def test_load_graph_no_tags():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-n']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    utils.runscript(script, args)

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    assert khmer.load_hashbits(ht_file)
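
With '-n', load-graph.py skips tagging entirely, so no .tagset file is written and the final assertion only has to show that the presence table loads back cleanly. A condensed sketch of the same check (assumes the '.pt' extension this variant uses):

import os
import khmer

def check_no_tags(outfile):
    assert os.path.exists(outfile + '.pt')
    assert not os.path.exists(outfile + '.tagset')
    assert khmer.load_hashbits(outfile + '.pt')  # loads fine without a tagset
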
Code example #28
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
Code example #29
def test_partition_graph_1():
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (1, 0)  # should be exactly one partition.
Code example #30
File: test_scripts.py Project: zhangycbnu/khmer
def test_partition_graph_1():
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (1, 0)          # should be exactly one partition.
Code example #31
File: test_scripts.py Project: paulrigor/khmer
def test_partition_graph_1():
    graphbase = _make_graph(utils.get_test_data("random-20-a.fa"))
    in_dir = os.path.dirname(graphbase)

    script = scriptpath("partition-graph.py")
    args = [graphbase]

    runscript(script, args)

    script = scriptpath("merge-partitions.py")
    args = [graphbase, "-k", str(20)]
    runscript(script, args)

    final_pmap_file = graphbase + ".pmap.merged"
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + ".ht")
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (1, 0)  # should be exactly one partition.
Code example #32
File: test_scripts.py Project: Canido/khmer
def test_partition_graph_nojoin_k21():
    # test with K=21
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), ksize=21)

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(21)]
    runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (99, 0), x          # should be 99 partitions at K=21
Code example #33
File: test_scripts.py Project: maarten1983/khmer
def test_partition_graph_nojoin_k21():
    # test with K=21
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), ksize=21)

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    utils.runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(21)]
    utils.runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (99, 0), x  # should be 99 partitions at K=21
Code example #34
File: partition-graph.py Project: fomightez/khmer
def main():
    parser = argparse.ArgumentParser(description="Partition a graph.")

    parser.add_argument('basename')
    parser.add_argument('--stoptags', '-S', dest='stoptags', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')

    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')

    parser.add_argument('--threads', '-T', dest='n_threads',
                        default=DEFAULT_N_THREADS,
                        help='Number of simultaneous threads to execute')

    args = parser.parse_args()
    basename = args.basename

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    if args.stoptags:
        print 'stoptag file:', args.stoptags
    print '--'

    print 'loading ht %s.ht' % basename
    ht = khmer.load_hashbits(basename + '.ht')
    ht.load_tagset(basename + '.tagset')

    # retrieve K
    K = ht.ksize()

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = ht.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for i in range(0, n_subsets):
        start = divvy[i]
        end = divvy[i + 1]
        worker_q.put((ht, i, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = int(args.n_threads)
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % n_threads
    print '---'

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=worker, args=(worker_q, basename,
                                                  stop_big_traversals))
        threads.append(t)
        t.start()

    print 'done starting threads'

    # wait for threads
    for t in threads:
        t.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (basename,)
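
Every partition-graph variant here enqueues (htable, n, start, end) tuples for a worker() that the excerpts never define. A hedged reconstruction of its contract, inferred from how the tasks are built and from the .subset.*.pmap files the scripts expect; khmer's real worker may differ in detail:

import Queue  # Python 2, matching the scripts above

def worker(worker_q, basename, stop_big_traversals):
    while True:
        try:
            (ht, n, start, end) = worker_q.get(False)
        except Queue.Empty:
            return  # queue drained; this thread is done
        # assumed args: (start, end, break_on_stop_tags, stop_big_traversals)
        subset = ht.do_subset_partition(start, end, True, stop_big_traversals)
        ht.save_subset_partitionmap(subset, '%s.subset.%d.pmap' % (basename, n))
        del subset
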
Code example #35
def main():
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, False)

    check_space(infiles, False)

    print >>sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >>sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >>sys.stderr, '---'
    print >>sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >>sys.stderr, '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >>sys.stderr, '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** repartitioned size:', size

        print >>sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print >>sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >>sys.stderr, 'done!'
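
Note the os.rename() at the end of each iteration: a finished subset file gains a '.processed' suffix, falls out of the '*.subset.*.pmap' glob, and is therefore skipped if find-knots is rerun, so the loop picks up with the remaining subsets. A sketch of that selection:

import glob

def remaining_pmap_files(graphbase):
    # '.pmap.processed' files no longer match, so finished subsets are skipped
    return sorted(glob.glob(graphbase + '.subset.*.pmap'))
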
Code example #36
def main():
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    filenames = [basename + '.pt', basename + '.tagset']
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print >> sys.stderr, '--'
    print >> sys.stderr, 'SUBSET SIZE', args.subset_size
    print >> sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >> sys.stderr, 'stoptag file:', args.stoptags
    print >> sys.stderr, '--'

    print >> sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >> sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
                            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
                            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >> sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print >> sys.stderr, 'starting %d threads' % n_threads
    print >> sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker,
                                    args=(worker_q, basename,
                                          stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >> sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >> sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
Code example #37
File: find-knots.py Project: b-wyss/khmer
def main():
    info("find-knots.py", ["graph"])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + ".pt", graphbase + ".tagset"]
    if os.path.exists(graphbase + ".stoptags"):
        infiles.append(graphbase + ".stoptags")
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, "loading k-mer presence table %s.pt" % graphbase
    htable = khmer.load_hashbits(graphbase + ".pt")

    print >>sys.stderr, "loading tagset %s.tagset..." % graphbase
    htable.load_tagset(graphbase + ".tagset")

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(graphbase + ".stoptags"):
        print >>sys.stderr, "loading stoptags %s.stoptags" % graphbase
        htable.load_stop_tags(graphbase + ".stoptags")
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + ".subset.*.pmap")

    print >>sys.stderr, "loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0])
    print >>sys.stderr, "---"
    print >>sys.stderr, "output stoptags will be in", graphbase + ".stoptags"
    if initial_stoptags:
        print >>sys.stderr, "(these output stoptags will include the already-loaded set)"
    print >>sys.stderr, "---"

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, "<-", subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, "** repartitioning subset... %s" % subset_file
        htable.repartition_largest_partition(
            subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD
        )

        print >>sys.stderr, "** merging subset... %s" % subset_file
        htable.merge_subset(subset)

        print >>sys.stderr, "** repartitioning, round 2... %s" % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD
        )

        print >>sys.stderr, "** repartitioned size:", size

        print >>sys.stderr, "saving stoptags binary"
        htable.save_stop_tags(graphbase + ".stoptags")
        os.rename(subset_file, subset_file + ".processed")
        print >>sys.stderr, "(%d of %d)\n" % (index, len(pmap_files))

    print >>sys.stderr, "done!"
Code example #38
def main():
    parser = argparse.ArgumentParser(
        description="Find an initial set of highly connected k-mers.")

    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('--subset-size',
                        '-s',
                        default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size',
                        type=float,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags',
                        '-S',
                        dest='stoptags',
                        default='',
                        help="Use stoptags in this file during partitioning")

    parser.add_argument('graphbase')

    args = parser.parse_args()

    graphbase = args.graphbase

    print 'loading ht %s.ht' % graphbase
    ht = khmer.load_hashbits(graphbase + '.ht')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    print 'loading tagset %s.tagset...' % graphbase
    ht.load_tagset(graphbase + '.tagset')

    K = ht.ksize()
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # divide up into SUBSET_SIZE fragments
    divvy = ht.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print 'doing pre-partitioning from', start, 'to', end
    subset = ht.do_subset_partition(start, end)

    # now, repartition...
    print 'repartitioning to find HCKs.'
    ht.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    print 'saving stop tags'
    ht.save_stop_tags(graphbase + '.stoptags')
Code example #39
def main():
    parser = argparse.ArgumentParser(
        description='Use bloom filter to count overlap k-mers')
    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_HASHSIZE)
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('--ksize', '-k', type=int, dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use '
                             '(should be the same as in htfile)')
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='hashsize',
                        default=env_hashsize,
                        help='hashsize to use')
    parser.add_argument('htfile')
    parser.add_argument('fafile')
    parser.add_argument('report_filename')
    args = parser.parse_args()
    if not args.quiet:
        if args.hashsize == DEFAULT_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"
        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - hashsize = %-5.2g \t(-x)' % args.hashsize
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes (n_hashes x hashsize / 8)' \
            % (args.n_hashes * args.hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.hashsize
    N_HT = args.n_hashes
    htfile = args.htfile
    fafile = args.fafile
    output_filename = args.report_filename
    curve_filename = output_filename + '.curve'

    print 'loading hashbits from', htfile
    ht1 = khmer.load_hashbits(htfile)
    K = ht1.ksize()

    output = open(output_filename, 'w')
    f_curve_obj = open(curve_filename, 'w')

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # avoid shadowing the builtin 'list' with the returned curve data
    (n_unique, n_overlap, list_curve) = ht2.count_overlap(fafile, ht1)

    printout1 = """\
dataset1(ht file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (htfile, fafile, n_unique, n_overlap)
    output.write(printout1)

    figure_list1 = []
    figure_list2 = []

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
Code example #40
def main():
    parser = argparse.ArgumentParser(description="Partition a graph.")

    parser.add_argument('basename')
    parser.add_argument('--stoptags',
                        '-S',
                        dest='stoptags',
                        default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size',
                        '-s',
                        default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size',
                        type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')

    parser.add_argument('--no-big-traverse',
                        dest='no_big_traverse',
                        action='store_true',
                        default=False,
                        help='Truncate graph joins at big traversals')

    parser.add_argument('--threads',
                        '-T',
                        dest='n_threads',
                        default=DEFAULT_N_THREADS,
                        help='Number of simultaneous threads to execute')

    args = parser.parse_args()
    basename = args.basename

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    if args.stoptags:
        print 'stoptag file:', args.stoptags
    print '--'

    print 'loading ht %s.ht' % basename
    ht = khmer.load_hashbits(basename + '.ht')
    ht.load_tagset(basename + '.tagset')

    # retrieve K
    K = ht.ksize()

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = ht.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for i in range(0, n_subsets):
        start = divvy[i]
        end = divvy[i + 1]
        worker_q.put((ht, i, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = int(args.n_threads)
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % n_threads
    print '---'

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=worker,
                             args=(worker_q, basename, stop_big_traversals))
        threads.append(t)
        t.start()

    print 'done starting threads'

    # wait for threads
    for t in threads:
        t.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (basename, )
Code example #41
File: find-knots.py Project: SchwarzEM/khmer
def main():
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >> sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >> sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >> sys.stderr, '---'
    print >> sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >> sys.stderr, '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >> sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >> sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >> sys.stderr, '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** repartitioned size:', size

        print >> sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print >> sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >> sys.stderr, 'done!'
Code example #42
File: find-unpart.py Project: RamRS/khmer
import glob
import os
import sys

import khmer

TRAVERSE_ON_UNPART = True
STOP_BIG_TRAVERSALS = True

already_part = sys.argv[1]
new_to_part = sys.argv[2]
basename = os.path.basename(new_to_part)
pmap_filename = sys.argv[3]

# if not os.path.exists(already_part):
#    print '%s doesn\'t exist! dying.' % already_part
#    sys.exit(0)

# create a fake-ish ht; K matters, but not hashtable size.
ht = khmer.load_hashbits(already_part + '.ht')
ht.load_tagset(already_part + '.tagset')
ht.merge_subset_from_disk(pmap_filename)

# find singletons
n_singletons = ht.find_unpart(
    new_to_part, TRAVERSE_ON_UNPART, STOP_BIG_TRAVERSALS)
print 'found:', n_singletons

print 'saving', basename + '.unpart'
n_partitions = ht.output_partitions(new_to_part, basename + '.unpart')
print 'saving', basename + '.pmap'
ht.save_partitionmap(basename + '.pmap')

###
Code example #43
import glob
import os
import sys

import khmer

TRAVERSE_ON_UNPART = True
STOP_BIG_TRAVERSALS = True

already_part = sys.argv[1]
new_to_part = sys.argv[2]
basename = os.path.basename(new_to_part)
pmap_filename = sys.argv[3]

# if not os.path.exists(already_part):
#    print '%s doesn\'t exist! dying.' % already_part
#    sys.exit(0)

# create a fake-ish ht; K matters, but not hashtable size.
ht = khmer.load_hashbits(already_part + '.ht')
ht.load_tagset(already_part + '.tagset')
ht.merge_subset_from_disk(pmap_filename)

# find singletons
n_singletons = ht.find_unpart(new_to_part, TRAVERSE_ON_UNPART,
                              STOP_BIG_TRAVERSALS)
print 'found:', n_singletons

print 'saving', basename + '.unpart'
n_partitions = ht.output_partitions(new_to_part, basename + '.unpart')
print 'saving', basename + '.pmap'
ht.save_partitionmap(basename + '.pmap')

###
Code example #44
File: find-knots.py Project: fomightez/khmer
def main():
    parser = argparse.ArgumentParser(
        description="Find all highly connected k-mers.")

    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('graphbase')

    args = parser.parse_args()

    graphbase = args.graphbase

    print 'loading ht %s.ht' % graphbase
    ht = khmer.load_hashbits(graphbase + '.ht')

    print 'loading tagset %s.tagset...' % graphbase
    ht.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print 'loading stoptags %s.stoptags' % graphbase
        ht.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])
    print '---'
    print 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print '(these output stoptags will include the already-loaded set)'
    print '---'

    # create counting hash
    K = ht.ksize()
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # load & merge
    for n, subset_file in enumerate(pmap_files):
        print '<-', subset_file
        subset = ht.load_subset_partitionmap(subset_file)

        print '** repartitioning subset... %s' % subset_file
        ht.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

        print '** merging subset... %s' % subset_file
        ht.merge_subset(subset)

        print '** repartitioning, round 2... %s' % subset_file
        size = ht.repartition_largest_partition(None, counting,
                                                EXCURSION_DISTANCE,
                                                EXCURSION_KMER_THRESHOLD,
                                                EXCURSION_KMER_COUNT_THRESHOLD)

        print '** repartitioned size:', size

        print 'saving stoptags binary'
        ht.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print '(%d of %d)\n' % (n, len(pmap_files))

    print 'done!'
Code example #45
def main():
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    filenames = [basename + '.pt', basename + '.tagset']
    for _ in filenames:
        check_file_status(_, args.force)

    check_space(filenames, args.force)

    print >>sys.stderr, '--'
    print >>sys.stderr, 'SUBSET SIZE', args.subset_size
    print >>sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >>sys.stderr, 'stoptag file:', args.stoptags
    print >>sys.stderr, '--'

    print >>sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
                            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
                            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print >>sys.stderr, 'starting %d threads' % n_threads
    print >>sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >>sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >>sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
Code example #46
def main():
    parser = argparse.ArgumentParser(
        description="Find all highly connected k-mers.")

    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('graphbase')

    args = parser.parse_args()

    graphbase = args.graphbase

    print 'loading ht %s.ht' % graphbase
    ht = khmer.load_hashbits(graphbase + '.ht')

    print 'loading tagset %s.tagset...' % graphbase
    ht.load_tagset(graphbase + '.tagset')

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print 'loading stoptags %s.stoptags' % graphbase
        ht.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])
    print '---'
    print 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print '(these output stoptags will include the already-loaded set)'
    print '---'

    # create counting hash
    K = ht.ksize()
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # load & merge
    for n, subset_file in enumerate(pmap_files):
        print '<-', subset_file
        subset = ht.load_subset_partitionmap(subset_file)

        print '** repartitioning subset... %s' % subset_file
        ht.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

        print '** merging subset... %s' % subset_file
        ht.merge_subset(subset)

        print '** repartitioning, round 2... %s' % subset_file
        size = ht.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print '** repartitioned size:', size

        print 'saving stoptags binary'
        ht.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print '(%d of %d)\n' % (n, len(pmap_files))

    print 'done!'