Example #1
import khmer

def test_concat_2_fail():
    hs = khmer.HashSet(5, [10, 12])
    hs2 = khmer.HashSet(4, [10, 13])

    try:
        hs += hs2
        assert 0, "inplace concat should fail - different ksize"
    except ValueError:
        pass
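
For contrast, a minimal sketch of the matching-ksize case (mirroring Example #12 below), where in-place concat succeeds:

import khmer

hs = khmer.HashSet(5, [10, 12])
hs2 = khmer.HashSet(5, [10, 13])
hs += hs2                         # same ksize, so the sets merge
assert sorted(hs) == [10, 12, 13]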
Example #2
import khmer

def test_bad_construct():
    try:
        hs = khmer.HashSet()
        assert 0, "HashSet constructor should fail w/o argument"
    except TypeError:
        pass

    try:
        hs = khmer.HashSet(5, [{}])
        assert 0, "HashSet constructor should fail w/o list of k-mers"
    except ValueError:
        pass
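
The constructor forms that do work, as used throughout these examples - a k-mer size alone, or a k-mer size plus a list of integer hash values:

import khmer

hs = khmer.HashSet(5)            # empty set of 5-mer hashes
hs = khmer.HashSet(5, [8, 10])   # pre-seeded with hash values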
Example #3
import khmer

def test_update_bad():
    hs = khmer.HashSet(5)
    x = [5, 10, 15, 2**35, {}]
    try:
        hs.update(x)
        assert 0, "cannot add dict to a HashSet"
    except ValueError:
        pass
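
A defensive sketch: non-integer entries make update() raise ValueError, and whether a failed update() leaves the set partially modified is not established by the test above, so filtering up front is the safer pattern:

import khmer

hs = khmer.HashSet(5)
x = [5, 10, 15, 2**35, {}]
hs.update([v for v in x if isinstance(v, int)])
assert sorted(hs) == [5, 10, 15, 2**35]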
Example #4
import khmer

def test_remove_2():
    hs = khmer.HashSet(5, [8, 10])
    assert len(hs) == 2
    try:
        hs.remove(15)
        assert 0, "hs.remove should raise an Exception"
    except ValueError:
        pass
    assert len(hs) == 2
    assert list(sorted(hs)) == [8, 10]
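
remove() raises ValueError for absent values. If set.discard()-style behavior is wanted, a small hypothetical wrapper (not part of khmer's API) does the job:

def discard(hs, value):
    # remove value from hs if present; ignore it otherwise.
    try:
        hs.remove(value)
    except ValueError:
        pass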
Example #5
import khmer
import screed

def test_traverse_linear_path():
    # `utils` here is the khmer test suite's helper module
    # (khmer_tst_utils), assumed importable in the enclosing test file.
    contigfile = utils.get_test_data('simple-genome.fa')
    contig = list(screed.open(contigfile))[0].sequence

    K = 21

    nodegraph = khmer.Nodegraph(K, 1e5, 4)
    stopgraph = khmer.Nodegraph(K, 1e5, 4)

    nodegraph.consume(contig)

    degree_nodes = khmer.HashSet(K)
    size, conns, visited = nodegraph.traverse_linear_path(
        contig[:K], degree_nodes, stopgraph)
    assert size == 980
    assert len(conns) == 0
    assert len(visited) == 980
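
The HashSets returned by traverse_linear_path() hold integer hash values; per Example #19, khmer.reverse_hash() maps them back to k-mer strings. A follow-on sketch, reusing visited and K from the test above:

for h in visited:
    kmer = khmer.reverse_hash(h, K)
    assert len(kmer) == K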
Example #6
import argparse
import sys

import khmer
import screed

# assumes module-level context from the enclosing script: K (the k-mer
# size) and the translate() / extract_orfs() helpers.
def main():
    p = argparse.ArgumentParser()
    p.add_argument('fastq_files', nargs='+')
    args = p.parse_args()

    cg = khmer.Countgraph(K, 1e8, 4)

    kept = 0
    hdn = khmer.HashSet(K)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < K:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
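            # three coverage bands: low-coverage reads feed the countgraph,
            # mid-coverage reads are abundance-trimmed and labeled, and a
            # read whose median coverage reaches exactly 30 triggers assembly.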
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < K:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
            elif cov == 30:
                contigs = lh.assemble_labeled_path(record.sequence[:K])
                for contig in contigs:
                    for t in translate(contig):
                        for o in extract_orfs(t):
                            if hash(o) not in output:
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
Example #7
import argparse

import khmer
import screed

# assumes K (the k-mer size) is defined at module level in the
# enclosing script.
def main():
    p = argparse.ArgumentParser()
    p.add_argument('contig_files', nargs='+')
    args = p.parse_args()

    ng = khmer.Nodegraph(K, 1e8, 4)
    starts = []

    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            ng.consume(record.sequence)
            starts.append(record.sequence[:K])

    hdn = khmer.HashSet(K)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            hdn += ng.find_high_degree_nodes(record.sequence)

    lh = khmer._GraphLabels(ng)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            lh.label_across_high_degree_nodes(record.sequence, hdn, n)

    counter = 0
    for k in starts:
        contigs = lh.assemble_labeled_path(k)
        if not contigs:
            print('nada...')
        for c in contigs:
            print('>%d\n%s' % (counter, c))
            counter += 1
Example #8
import khmer

def test_update():
    hs = khmer.HashSet(5)
    x = [5, 10, 15, 2**35]
    hs.update(x)

    assert list(sorted(hs)) == [5, 10, 15, 2**35]
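
A follow-on check using only calls shown elsewhere in these examples: membership tests accept the same integer hash values that update() stored.

assert 2**35 in hs
assert 11 not in hs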
Example #9
import khmer

def test_add():
    hs = khmer.HashSet(5)
    hs.add(7)
    hs.add(4)

    assert list(sorted(hs)) == [4, 7]
Example #10
import khmer

def test_iter_single():
    hs = khmer.HashSet(5, [6])
    k = iter(hs)
    k2 = iter(k)
    assert k == k2
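
Since iter(hs) returns an object that is its own iterator, the usual protocol applies; a minimal sketch:

import khmer

hs = khmer.HashSet(5, [6])
it = iter(hs)
assert next(it) == 6   # the single stored hash (cf. Example #15)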
Example #11
import os
import sys

import khmer
import screed

# assumes helpers from the enclosing script: Pathfinder,
# traverse_and_mark_linear_paths(), and write().
def run(args):

    # @CTB this is kind of a hack - nothing tricky going on, just want to
    # specify memory on the command line rather than graph size...
    graph_tablesize = int(args.memory * 8.0 / 4.0)

    assert args.ksize % 2, "ksize must be odd"

    if args.label:
        label_list = []

    output_dir = args.output
    if not output_dir:
        if len(args.seqfiles) > 1:
            print('** please specify an output directory with -o',
                  file=sys.stderr)
            sys.exit(-1)

        output_dir = os.path.basename(args.seqfiles[0])
        if output_dir.endswith('.fa'):
            output_dir = output_dir[:-3]
        elif output_dir.endswith('.fa.gz'):
            output_dir = output_dir[:-6]

    # set this so we can read it for logging
    args.output = output_dir
    # gxtfile = os.path.basename(output_dir) + '.gxt'
    gxtfile = os.path.join(output_dir, "cdbg.gxt")
    contigfile = os.path.join(output_dir, "contigs.fa.gz")

    print('')
    print('placing output in directory:', output_dir)
    print('gxt will be:', gxtfile)
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        print('(note: directory already exists)')

    print('')
    if args.loadgraph:
        print('loading nodegraph from:', args.loadgraph)
        graph = khmer.Nodegraph.load(args.loadgraph)
        print('creating accompanying stopgraph')
        ksize = graph.ksize()
        hashsizes = graph.hashsizes()
        stop_bf = khmer.Nodegraph(ksize, 1, 1, primes=hashsizes)
    else:
        print('building graphs and loading files')

        # Create graph and a stop bloom filter - one for loading, one for
        # traversing. Create them all here so that we can error out quickly
        # if memory is a problem.

        # @CTB note that hardcoding '2' here is not necessarily a great idea.
        graph = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        stop_bf = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        n = 0

        # load in all of the input sequences, one file at a time.
        for seqfile in args.seqfiles:
            fp = screed.open(seqfile)
            for record in khmer.utils.clean_input_reads(fp):
                if len(record.cleaned_seq) < graph.ksize():
                    continue
                n += 1
                if n % 100000 == 0:
                    print('...', seqfile, n)
                graph.consume(record.cleaned_seq)
            fp.close()

        # complain if too small a set of graphs was used.
        fp_rate = khmer.calc_expected_collisions(graph,
                                                 args.force,
                                                 max_false_pos=.05)

    ksize = graph.ksize()

    # initialize the object that will track information for us.
    pathy = Pathfinder(ksize, gxtfile, contigfile, not args.no_assemble)

    print('finding high degree nodes')
    if args.label:
        print('(and labeling them, per request)')
    degree_nodes = khmer.HashSet(ksize)
    linear_starts = khmer.HashSet(ksize)
    n = 0
    skipped = 0
    for seqfile in args.seqfiles:
        fp = screed.open(seqfile)
        for record in khmer.utils.clean_input_reads(fp):
            if len(record.cleaned_seq) < ksize:
                skipped += 1
                continue
            n += 1
            if n % 100000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them.
            these_hdn = graph.find_high_degree_nodes(record.cleaned_seq)
            if these_hdn:
                degree_nodes += these_hdn
            else:
                # possible linear node? check first and last k-mer.
                # (the logic: every purely linear path must start or end
                # in *some* record sequence, so the first/last k-mers of
                # records with only one neighbor cover all possible
                # linear starts.)
                first_kmer = record.sequence[:ksize]
                last_kmer = record.sequence[-ksize:]
                assert len(last_kmer) == ksize

                if len(graph.neighbors(first_kmer)) == 1:
                    linear_starts.add(graph.hash(first_kmer))
                if len(graph.neighbors(last_kmer)) == 1:
                    linear_starts.add(graph.hash(last_kmer))

            if args.label:
                label_list.append(record.name)
                for kmer in these_hdn:
                    pathy.add_label(kmer, n)
        fp.close()

    print('read {}, skipped {} for being too short'.format(n, skipped))

    # get all of the degree > 2 kmers and give them IDs.
    for kmer in degree_nodes:
        pathy.new_hdn(kmer)
        stop_bf.add(kmer)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree node into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path &
    # assemble if desired.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the node ID of the primary segment.
        k_id = pathy.kmers_to_nodes[k]

        # output this high-degree k-mer as its own single-node assembly.
        k_str = khmer.reverse_hash(k, ksize)
        pathy.add_assembly(k_id, k_str)

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.kmers_to_nodes[nk.kmer_u]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    # now, clean up at the end -- make sure we've hit all the possible
    # linear nodes.
    print('traversing from {} potential linear starts'.format(
        len(linear_starts)))
    for n, k in enumerate(linear_starts):
        traverse_and_mark_linear_paths(graph, k, stop_bf, pathy, degree_nodes)

    print('{} linear segments and {} high-degree nodes'.format(
        pathy.node_counter, len(pathy.nodes)))

    del graph
    del stop_bf

    # save to GXT.
    print('saving gxtfile', gxtfile)

    all_labels = set()
    label_counts = {}

    pathy.adjfp.close()
    adj_fp = open(gxtfile + '.adj', 'rt')

    # this de-duplicates the edges...
    for line in adj_fp:
        a, b = line.split(',')
        a = int(a)
        b = int(b)
        pathy.adjacencies[a].add(b)

    adj_fp.close()
    try:
        os.unlink(gxtfile + '.adj')
    except OSError:
        print('cannot remove', gxtfile + '.adj')

    # ...and now print them out.
    edges = []
    for k, v in pathy.adjacencies.items():
        for dest in v:
            # don't add loops
            if (k != dest):
                edges.append((k, dest))

    with open(gxtfile, 'wt') as fp:
        write(fp, pathy.node_counter, edges)

    if not args.no_assemble:
        pathy.assemblyfp.close()

    if args.label:
        print('note: used/assigned %d labels total' % (len(set(all_labels)), ))
        print('counts:', label_counts)

        assert label_list
        print('dumping label list now.')
        label_file = os.path.basename(output_dir) + '.labels.txt'
        label_file = os.path.join(output_dir, label_file)

        with open(label_file, "wt") as fp:
            for n, label in enumerate(label_list):
                fp.write("{} {}\n".format(n, label))
Example #12
import khmer

def test_concat_2():
    hs = khmer.HashSet(5, [10, 12])
    hs2 = khmer.HashSet(5, [10, 13])

    hs += hs2
    assert list(sorted(hs)) == [10, 12, 13]
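
+= modifies the left-hand set in place. A sketch of a non-destructive union, built only from calls these examples demonstrate:

import khmer

hs = khmer.HashSet(5, [10, 12])
hs2 = khmer.HashSet(5, [10, 13])
union = khmer.HashSet(5, list(hs))   # copy, then merge
union.update(list(hs2))
assert sorted(hs) == [10, 12]        # the original is untouched
assert sorted(union) == [10, 12, 13]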
Example #13
import khmer

def test_contains_1():
    hs = khmer.HashSet(5, [8, 10])
    assert 8 in hs
    assert 10 in hs
    assert 2**35 not in hs
Example #14
import khmer

def test_remove():
    hs = khmer.HashSet(5, [8, 10])
    assert len(hs) == 2
    hs.remove(8)
    assert len(hs) == 1
    assert list(hs) == [10]
Example #15
import khmer

def test_iter_single():
    hs = khmer.HashSet(5, [6])
    for k in hs:
        assert k == 6
        print(k)
Example #16
import argparse
import sys

import khmer
import screed

# assumes from the enclosing script: DEFAULT_KSIZE, NODEGRAPH_SIZE,
# Pathfinder, GmlWriter, and traverse_and_mark_linear_paths().
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x',
                        '--tablesize',
                        default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.

    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    print(graph.ksize(), graph.hashsizes())
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if too small a set of graphs was used.
    fp_rate = khmer.calc_expected_collisions(graph,
                                             args.force,
                                             max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical sequences.
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
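                # (a minimum k-mer count of 0 means at least one k-mer is
                # unseen, i.e. this exact sequence hasn't been consumed into
                # stop_bf2 before - a cheap exact-duplicate filter.)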
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)
    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high-degree node into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path.  also
    # track minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML
    if args.output:
        print('saving to', args.output)
        with open(args.output, 'w') as fp:
            w = GmlWriter(fp, [], [])

            for k, v in pathy.segments.items():
                w.add_vertex(k, v, [])

            for k, v in pathy.adjacencies.items():
                for edge in v:
                    w.add_edge(k, edge, [])
Example #17
import khmer

def test_iter_double():
    x = [6, 9, 20]
    hs = khmer.HashSet(5, x)
    for i, k in enumerate(hs):
        assert k == x[i], (k, x[i])
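
A note on ordering: x happens to be sorted here, and the other tests compare via list(sorted(hs)), so this positional check shouldn't be read as a general iteration-order guarantee.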
Example #18
import argparse
import csv
import sys

import khmer
import screed
# build_counting_args / create_countgraph are khmer's CLI helpers
# (khmer.khmer_args in the releases these examples target).
from khmer.khmer_args import build_counting_args, create_countgraph

# assumes translate() and extract_orfs() helpers from the enclosing script.
def main():
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o',
                   type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)

    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()
    statswriter = csv.DictWriter(args.o,
                                 delimiter=',',
                                 fieldnames=[
                                     'read_n', 'action', 'cov', 'n_hdn',
                                     'contig_n', 'orf_n', 'new'
                                 ])

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow({
                    'read_n': n,
                    'action': 'c',
                    'cov': cov,
                    'n_hdn': None,
                    'contig_n': None,
                    'orf_n': None,
                    'new': None
                })
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                statswriter.writerow({
                    'read_n': n,
                    'action': 'l',
                    'cov': cov,
                    'n_hdn': len(hdn),
                    'contig_n': None,
                    'orf_n': None,
                    'new': None
                })
            elif cov == 30:
                contigs = lh.assemble_labeled_path(
                    record.sequence[:args.ksize])
                for contig_n, contig in enumerate(contigs):
                    statswriter.writerow({
                        'read_n': n,
                        'action': 'a',
                        'cov': cov,
                        'n_hdn': None,
                        'contig_n': contig_n,
                        'orf_n': None,
                        'new': None
                    })
                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            if hash(o) not in output:
                                new = True
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
                            else:
                                new = False
                            statswriter.writerow({
                                'read_n': n,
                                'action': 'a',
                                'cov': cov,
                                'n_hdn': None,
                                'contig_n': contig_n,
                                'orf_n': orf_n,
                                'new': new
                            })
Example #19
import khmer

def test_contains_2():
    hs = khmer.HashSet(5, [8, 10])
    assert khmer.reverse_hash(8, 5) in hs
    assert khmer.reverse_hash(10, 5) in hs
    assert khmer.reverse_hash(2**35, 5) not in hs
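
These membership tests hash the k-mer strings on the way in. khmer also exposes forward_hash() in the releases these tests target - treat the exact name as an assumption. A hedged round-trip sketch:

import khmer

K = 5
kmer = khmer.reverse_hash(8, K)          # hash -> k-mer string
assert khmer.forward_hash(kmer, K) == 8  # k-mer -> hash (assumed API)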