Beispiel #1
0
def test_complete_no_collision():
    """Count every 4-mer in a 4**4-entry countgraph and check occupancy.

    The table is sized to the full 4-mer hash space, so (per the test name)
    no collisions are expected.
    """
    ksize = 4
    kh = khmer._Countgraph(ksize, [4 ** ksize])
    n_entries = kh.hashsizes()[0]

    # Count the k-mer string behind every hash value exactly once.
    for hashval in range(n_entries):
        kh.count(khmer.reverse_hash(hashval, ksize))

    n_rc_filled = 0
    n_palindromes = 0
    n_fwd_filled = 0
    for hashval in range(n_entries):
        by_string = kh.get(khmer.reverse_hash(hashval, ksize))
        # string hashing is rc aware
        n_rc_filled += 1 if by_string else 0
        # palindromes are singular
        n_palindromes += 1 if by_string == 1 else 0
        # int hashing is not rc aware
        n_fwd_filled += 1 if kh.get(hashval) else 0

    expected_fwd = n_entries // 2 + n_palindromes // 2
    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes
    assert n_fwd_filled == expected_fwd, (n_fwd_filled, expected_fwd)
Beispiel #2
0
def explore(ht, start_kmer, K):
    """Walk the k-mer graph outward from start_kmer, numbering nodes and edges.

    Args:
        ht: k-mer table; ``ht.get(<k-mer string>)`` is truthy when the k-mer
            is present.
        start_kmer: k-mer string the walk starts from.
        K: k-mer size passed to the khmer hashing helpers.

    Returns:
        ``(hash_ids, edges)``. ``hash_ids`` maps each discovered k-mer hash
        to a 1-based ID in discovery order; ``edges`` is a set of sorted ID
        pairs. Both are empty when start_kmer is not present in ht.
    """
    hash_ids = {}
    edges = set()
    explored = set()

    start_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_hash, K)):
        return hash_ids, edges

    hash_ids[start_hash] = 1
    discovered = {start_hash}

    # Stop when the frontier is empty or the exploration cap is reached.
    while discovered and len(explored) < 2000000:
        current = discovered.pop()
        neighbors = get_neighbors(current, K)
        explored.add(current)

        for neighbor in neighbors:
            if not ht.get(khmer.reverse_hash(neighbor, K)):
                continue
            # First sighting: queue it and hand out the next ID.
            if neighbor not in explored and neighbor not in discovered:
                discovered.add(neighbor)
                hash_ids[neighbor] = len(hash_ids) + 1
            # Every present neighbour contributes an (undirected) edge.
            edges.add(tuple(sorted((hash_ids[neighbor], hash_ids[current]))))

    return hash_ids, edges
Beispiel #3
0
def explore(ht, start_kmer, K):
    """Explore the graph around start_kmer and return node IDs and edges.

    Args:
        ht: k-mer table queried with ``ht.get(<k-mer string>)``.
        start_kmer: k-mer string used as the seed of the exploration.
        K: k-mer size passed to the khmer hashing helpers.

    Returns:
        A ``(hash_ids, edges)`` pair: a dict from k-mer hash to a 1-based ID
        assigned on discovery, and a set of sorted ``(id, id)`` tuples. Both
        are empty if the seed k-mer is absent from ht.
    """
    max_explored = 2000000
    edges, hash_ids = set(), {}
    discovered, explored = set(), set()

    seed = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(seed, K)):
        return hash_ids, edges
    discovered.add(seed)
    hash_ids[seed] = len(hash_ids) + 1

    while len(discovered) > 0 and len(explored) < max_explored:
        node = discovered.pop()
        adjacent = get_neighbors(node, K)
        explored.add(node)

        for other in adjacent:
            present = ht.get(khmer.reverse_hash(other, K))
            if not present:
                continue

            already_seen = other in explored or other in discovered
            if not already_seen:
                discovered.add(other)
                hash_ids[other] = len(hash_ids) + 1

            # Sorting the pair makes (a, b) and (b, a) the same edge.
            edge = tuple(sorted([hash_ids[other], hash_ids[node]]))
            edges.add(edge)

    return hash_ids, edges
Beispiel #4
0
def test_complete_no_collision():
    """Fill a 4**4-entry countgraph with all 4-mers and verify the counts."""
    kh = khmer._Countgraph(4, [4**4])
    n_entries = kh.hashsizes()[0]

    kmers = [khmer.reverse_hash(i, 4) for i in range(n_entries)]
    for kmer in kmers:
        kh.count(kmer)

    string_counts = [kh.get(kmer) for kmer in kmers]

    # string hashing is rc aware
    n_rc_filled = sum(1 for count in string_counts if count)
    # palindromes are singular
    n_palindromes = sum(1 for count in string_counts if count == 1)
    # int hashing is not rc aware
    n_fwd_filled = sum(1 for i in range(n_entries) if kh.get(i))

    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes

    expected_fwd = n_entries // 2 + n_palindromes // 2
    assert n_fwd_filled == expected_fwd, (n_fwd_filled, expected_fwd)
Beispiel #5
0
def test_reverse_hash():
    """reverse_hash decodes the four homopolymer 4-mer hashes."""
    expected = [(0, "AAAA"), (85, "TTTT"), (170, "CCCC"), (255, "GGGG")]
    for hashval, kmer in expected:
        assert khmer.reverse_hash(hashval, 4) == kmer
Beispiel #6
0
def test_reverse_hash():
    """Check reverse_hash against the known homopolymer 4-mers."""
    homopolymers = {0: "AAAA", 85: "TTTT", 170: "CCCC", 255: "GGGG"}
    for hashval in sorted(homopolymers):
        decoded = khmer.reverse_hash(hashval, 4)
        assert decoded == homopolymers[hashval]
def test_complete_4_collision():
    """All of the first 64 4-mers register as present in a size-3 table.

    Only as many k-mers as the table has entries are counted, yet every
    string lookup below is expected to return a non-zero count.
    """
    ksize = 4
    kh = khmer._CountingHash(ksize, [3])

    for hashval in range(kh.n_entries()):
        kh.count(khmer.reverse_hash(hashval, ksize))

    # String hashing is rc aware; the integer-hash lookup is not exercised
    # by this test.
    n_rc_filled = sum(
        1
        for hashval in range(64)
        if kh.get(khmer.reverse_hash(hashval, ksize))
    )

    assert n_rc_filled == 64, n_rc_filled
def test_complete_2_collision():
    """The first 128 4-mers all register as present in a size-4 hashtable.

    Only as many k-mers as the table has entries are counted beforehand.
    """
    ksize = 4
    kh = khmer.new_hashtable(ksize, 4)

    for hashval in range(kh.n_entries()):
        kh.count(khmer.reverse_hash(hashval, ksize))

    # String hashing is rc aware; the integer-hash lookup is not exercised
    # by this test.
    n_rc_filled = sum(
        1
        for hashval in range(128)
        if kh.get(khmer.reverse_hash(hashval, ksize))
    )

    assert n_rc_filled == 128, n_rc_filled
Beispiel #9
0
def test_complete_2_collision():
    """Look up the first 128 4-mers in a size-4 hashtable; all must hit."""
    kh = khmer.new_hashtable(4, 4)

    seeded = [khmer.reverse_hash(i, 4) for i in range(kh.n_entries())]
    for kmer in seeded:
        kh.count(kmer)

    # string hashing is rc aware (lookups by integer hash are not checked)
    hits = [bool(kh.get(khmer.reverse_hash(i, 4))) for i in range(128)]
    n_rc_filled = hits.count(True)

    assert n_rc_filled == 128, n_rc_filled
Beispiel #10
0
def test_reverse_hash_longs():
    """reverse_hash accepts explicit long integers (only matters on python2)."""
    # The builtin `long` exists in the global scope only; on Python 3 the
    # name is bound to `int` so the calls below work unchanged.
    global long  # pylint: disable=global-variable-undefined
    if sys.version_info > (3,):
        long = int

    cases = ((0, "AAAA"), (85, "TTTT"), (170, "CCCC"), (255, "GGGG"))
    for value, kmer in cases:
        assert khmer.reverse_hash(long(value), 4) == kmer
Beispiel #11
0
def test_complete_4_collision():
    """Look up the first 64 4-mers in a size-3 counting hash; all must hit."""
    kh = khmer._CountingHash(4, [3])

    seeded = [khmer.reverse_hash(i, 4) for i in range(kh.n_entries())]
    for kmer in seeded:
        kh.count(kmer)

    # string hashing is rc aware (lookups by integer hash are not checked)
    hits = [bool(kh.get(khmer.reverse_hash(i, 4))) for i in range(64)]
    n_rc_filled = hits.count(True)

    assert n_rc_filled == 64, n_rc_filled
Beispiel #12
0
def test_reverse_hash_longs():
    """Decode the homopolymer 4-mer hashes when passed as `long` values.

    Explicit long integers are only a distinct type on python2.
    """
    # `long` is a builtin on python2 only, so it lives in the global scope;
    # on python3 alias it to `int` before use.
    global long  # pylint: disable=global-variable-undefined
    if sys.version_info > (3,):
        long = int

    expected_by_hash = {0: "AAAA", 85: "TTTT", 170: "CCCC", 255: "GGGG"}
    for hashval in sorted(expected_by_hash):
        decoded = khmer.reverse_hash(long(hashval), 4)
        assert decoded == expected_by_hash[hashval]
Beispiel #13
0
def test_complete_2_collision():
    """The first 128 4-mers all register as present in a size-5 countgraph.

    Only as many k-mers as the first table has entries are counted
    beforehand.
    """
    ksize = 4
    kh = khmer._Countgraph(ksize, [5])

    n_entries = kh.hashsizes()[0]
    for hashval in range(n_entries):
        kh.count(khmer.reverse_hash(hashval, ksize))

    # String hashing is rc aware; the integer-hash lookup is not exercised
    # by this test.
    n_rc_filled = sum(
        1
        for hashval in range(128)
        if kh.get(khmer.reverse_hash(hashval, ksize))
    )

    assert n_rc_filled == 128, n_rc_filled
Beispiel #14
0
def test_complete_2_collision():
    """Look up the first 128 4-mers in a size-5 countgraph; all must hit."""
    kh = khmer._Countgraph(4, [5])

    table_size = kh.hashsizes()[0]
    seeded = [khmer.reverse_hash(i, 4) for i in range(table_size)]
    for kmer in seeded:
        kh.count(kmer)

    # string hashing is rc aware (lookups by integer hash are not checked)
    hits = [bool(kh.get(khmer.reverse_hash(i, 4))) for i in range(128)]
    n_rc_filled = hits.count(True)

    assert n_rc_filled == 128, n_rc_filled
Beispiel #15
0
def get_neighbors(kmer_hash, K):
    """Return the set of hashes of all k-mers adjacent to kmer_hash.

    A neighbour is built either by prepending a base to the k-mer without
    its last character, or by appending a base to the k-mer without its
    first character. Whether a neighbour actually occurs in any table is
    not checked here.

    ``bases`` is a module-level iterable defined elsewhere (presumably the
    nucleotide letters -- confirm against its definition).
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    prefix, suffix = kmer[:-1], kmer[1:]

    neighbors = set()
    for base in bases:
        neighbors.add(khmer.forward_hash(base + prefix, K))
        neighbors.add(khmer.forward_hash(suffix + base, K))
    return neighbors
Beispiel #16
0
def get_neighbors(kmer_hash, K):
    """Compute the hashes of every candidate neighbour of a k-mer.

    For each entry of the module-level ``bases`` iterable (defined
    elsewhere; presumably the nucleotide letters -- confirm), two candidates
    are hashed: the base followed by the k-mer minus its last character, and
    the k-mer minus its first character followed by the base.

    Returns:
        A set of the candidate hashes; table membership is not checked.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    head = kmer[:-1]
    tail = kmer[1:]

    return {
        khmer.forward_hash(candidate, K)
        for base in bases
        for candidate in (base + head, tail + base)
    }
Beispiel #17
0
def test_complete_no_collision():
    """Count one 4-mer per table entry and check string vs. int lookups."""
    ksize = 4
    kh = khmer.new_hashtable(ksize, 4 ** 2)
    n_entries = kh.n_entries()

    for hashval in range(n_entries):
        kh.count(khmer.reverse_hash(hashval, ksize))

    n_rc_filled = 0
    n_palindromes = 0
    n_fwd_filled = 0
    for hashval in range(n_entries):
        by_string = kh.get(khmer.reverse_hash(hashval, ksize))
        # string hashing is rc aware
        n_rc_filled += 1 if by_string else 0
        # palindromes are singular
        n_palindromes += 1 if by_string == 1 else 0
        # int hashing is not rc aware
        n_fwd_filled += 1 if kh.get(hashval) else 0

    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes  # @CTB check this
    expected_fwd = n_entries // 2 + n_palindromes // 2
    assert n_fwd_filled == expected_fwd, n_fwd_filled
Beispiel #18
0
def test_complete_no_collision():
    """Count one 4-mer per table entry and check string vs. int lookups.

    The expected number of integer-hash hits is computed with floor
    division so it stays an integer on Python 3 as well as Python 2.
    """
    kh = khmer.new_hashtable(4, 4**2)
    n_entries = kh.n_entries()

    for i in range(n_entries):
        kh.count(khmer.reverse_hash(i, 4))

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0

    for i in range(n_entries):
        count = kh.get(khmer.reverse_hash(i, 4))
        if count:  # string hashing is rc aware
            n_rc_filled += 1
        if count == 1:  # palindromes are singular
            n_palindromes += 1
        if kh.get(i):  # int hashing is not rc aware
            n_fwd_filled += 1

    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes  # @CTB check this
    # `//` rather than `/`: true division would turn the expectation into a
    # float on Python 3.
    assert n_fwd_filled == n_entries // 2 + n_palindromes // 2, \
        n_fwd_filled
def explore(ht, start_kmer, K):
    """Count the k-mers reachable from start_kmer in the graph held by ht.

    Args:
        ht: k-mer table; ``ht.get(<k-mer string>)`` is truthy when the k-mer
            is present.
        start_kmer: k-mer string the exploration starts from.
        K: k-mer size passed to the khmer hashing helpers.

    Returns:
        The number of k-mers explored, or 0 when start_kmer is not in ht.
        Exploration stops early once 2,000,000 k-mers have been explored.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    # Look the start k-mer up the same way neighbours are looked up below
    # (via its hash); the name `kmer` is not defined in this function.
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return 0

    discovered = {start_kmer_hash}
    explored = set()

    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)

        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            # Skip anything already handled or already queued.
            if neigh_hash in explored or neigh_hash in discovered:
                continue
            if ht.get(khmer.reverse_hash(neigh_hash, K)):
                discovered.add(neigh_hash)

    return len(explored)
Beispiel #20
0
def explore(ht, start_kmer, K):
    """Return how many k-mers can be reached from start_kmer within ht.

    Args:
        ht: k-mer table queried with ``ht.get(<k-mer string>)``; a truthy
            result means the k-mer is present.
        start_kmer: k-mer string used as the starting point.
        K: k-mer size passed to the khmer hashing helpers.

    Returns:
        The size of the explored set; 0 if start_kmer is absent from ht.
        The walk is capped: it stops once 2,000,000 k-mers are explored.
    """
    max_explored = 2000000
    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    # The presence check goes through the start hash, exactly like the
    # neighbour check in the loop; there is no local called `kmer` here.
    start_present = ht.get(khmer.reverse_hash(start_kmer_hash, K))
    if not start_present:
        return 0

    explored = set()
    discovered = set([start_kmer_hash])

    while discovered and len(explored) < max_explored:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            unseen = (neigh_hash not in explored
                      and neigh_hash not in discovered)
            if unseen and ht.get(khmer.reverse_hash(neigh_hash, K)):
                discovered.add(neigh_hash)

    return len(explored)
Beispiel #21
0
def get_all_kmers(ht, start_kmer, K, ht2, degs):
    """Traverse the graph from start_kmer, marking k-mers and tallying degrees.

    Args:
        ht: graph table handed to ``find_neighbors`` to list neighbours.
        start_kmer: k-mer string where the traversal begins.
        K: k-mer size passed to the khmer hashing helpers.
        ht2: table used as the visited marker; every k-mer reached is
            counted into it.
        degs: degree tally, passed through ``add_deg`` once per visited
            k-mer together with that k-mer's neighbour count.

    Returns:
        ``(ht2, degs)`` on every path. When start_kmer is already marked in
        ht2 both are returned as received.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    # Already marked by an earlier traversal: nothing new to visit.
    if ht2.get(start_kmer_hash):
        return ht2, degs
    ht2.count(start_kmer)

    pending = []
    kmer = start_kmer
    while True:
        neighs = find_neighbors(kmer, ht)
        degs = add_deg(degs, len(neighs))

        # Mark each unvisited neighbour right away so it is queued only once.
        for neigh in neighs:
            if not ht2.get(neigh):
                pending.append(khmer.forward_hash(neigh, K))
                ht2.count(neigh)

        if not pending:
            break
        kmer = khmer.reverse_hash(pending.pop(), K)

    # Return the updated tally on the normal path too, matching the early
    # exit above.
    return ht2, degs
Beispiel #22
0
def get_all_kmers(ht, start_kmer, K, ht2, degs):
    """Visit every k-mer reachable from start_kmer and record its degree.

    Args:
        ht: graph table passed to ``find_neighbors``.
        start_kmer: k-mer string to start from.
        K: k-mer size passed to the khmer hashing helpers.
        ht2: table that marks visited k-mers; reached k-mers are counted
            into it.
        degs: running degree tally; ``add_deg`` is called on it once per
            visited k-mer with the number of neighbours found.

    Returns:
        ``(ht2, degs)``, both on the early exit (start_kmer already marked
        in ht2) and after a full traversal.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    if ht2.get(start_kmer_hash):
        return ht2, degs
    ht2.count(start_kmer)

    queue = []

    def _visit(kmer, tally):
        """Tally kmer's degree and queue its not-yet-marked neighbours."""
        neighs = find_neighbors(kmer, ht)
        tally = add_deg(tally, len(neighs))
        for neigh in neighs:
            if not ht2.get(neigh):
                queue.append(khmer.forward_hash(neigh, K))
                ht2.count(neigh)
        return tally

    degs = _visit(start_kmer, degs)
    while queue:
        degs = _visit(khmer.reverse_hash(queue.pop(), K), degs)

    # The normal path returns the same pair as the early exit, so the
    # updated tally reaches the caller.
    return ht2, degs
Beispiel #23
0
def test_contains_2():
    """A HashSet built from hashes answers `in` for the matching k-mers."""
    ksize = 5
    hs = khmer.HashSet(ksize, [8, 10])

    for member in (8, 10):
        assert khmer.reverse_hash(member, ksize) in hs

    # A hash that was never inserted must not be reported as a member.
    absent = khmer.reverse_hash(2**35, ksize)
    assert absent not in hs
Beispiel #24
0
def run(args):
    """Build a compact De Bruijn graph from sequence files and save it as GXT.

    Stages:

    1. Load a nodegraph from ``args.loadgraph``, or build one from
       ``args.seqfiles``; a second "stop" nodegraph is created alongside.
    2. Re-read the sequences to collect high-degree nodes and the k-mers
       that may start purely linear paths.
    3. Register every high-degree node with a ``Pathfinder`` and traverse
       the linear paths next to each of them, then traverse from the
       remaining candidate linear starts.
    4. Read the ``.adj`` scratch file back, de-duplicate the edges, drop
       self-loops and write the GXT file; with ``args.label`` also dump the
       label list.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``memory``, ``ksize``, ``label``, ``output``, ``seqfiles``,
            ``loadgraph``, ``force`` and ``no_assemble``.

    Side effects:
        Creates the output directory if needed, writes ``cdbg.gxt`` (and a
        ``.labels.txt`` file when labelling) into it, stores the directory
        used in ``args.output``, and calls ``sys.exit(-1)`` when several
        input files are given without an output directory.
    """
    # @CTB this is kind of a hack - nothing tricky going on, just want to
    # specify memory on the command line rather than graph size...
    graph_tablesize = int(args.memory * 8.0 / 4.0)

    assert args.ksize % 2, "ksize must be odd"

    if args.label:
        label_list = []

    output_dir = args.output
    if not output_dir:
        if len(args.seqfiles) > 1:
            print('** please specify an output directory with -o',
                  file=sys.stderr)
            sys.exit(-1)

        # Derive the directory name from the single input file.
        output_dir = os.path.basename(args.seqfiles[0])
        if output_dir.endswith('.fa'):
            output_dir = output_dir[:-3]
        elif output_dir.endswith('.fa.gz'):
            output_dir = output_dir[:-6]

    # set this so we can read it for logging
    args.output = output_dir
    gxtfile = os.path.join(output_dir, "cdbg.gxt")
    contigfile = os.path.join(output_dir, "contigs.fa.gz")

    print('')
    print('placing output in directory:', output_dir)
    print('gxt will be:', gxtfile)
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        print('(note: directory already exists)')

    print('')
    if args.loadgraph:
        print('loading nodegraph from:', args.loadgraph)
        graph = khmer.Nodegraph.load(args.loadgraph)
        print('creating accompanying stopgraph')
        ksize = graph.ksize()
        hashsizes = graph.hashsizes()
        stop_bf = khmer.Nodegraph(ksize, 1, 1, primes=hashsizes)
    else:
        print('building graphs and loading files')

        # Create graph and a stop bloom filter - one for loading, one for
        # traversing. Create them all here so that we can error out quickly
        # if memory is a problem.

        # @CTB note that hardcoding '2' here is not nec a great idea.
        graph = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        stop_bf = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        n = 0

        # load in all of the input sequences, one file at a time.
        for seqfile in args.seqfiles:
            fp = screed.open(seqfile)
            try:
                for record in khmer.utils.clean_input_reads(fp):
                    if len(record.cleaned_seq) < graph.ksize():
                        continue
                    n += 1
                    if n % 100000 == 0:
                        print('...', seqfile, n)
                    graph.consume(record.cleaned_seq)
            finally:
                # Close the input even if a record fails to load.
                fp.close()

        # complain if too small set of graphs was used.
        khmer.calc_expected_collisions(graph,
                                       args.force,
                                       max_false_pos=.05)

    ksize = graph.ksize()

    # initialize the object that will track information for us.
    pathy = Pathfinder(ksize, gxtfile, contigfile, not args.no_assemble)

    print('finding high degree nodes')
    if args.label:
        print('(and labeling them, per request)')
    degree_nodes = khmer.HashSet(ksize)
    linear_starts = khmer.HashSet(ksize)
    n = 0
    skipped = 0
    for seqfile in args.seqfiles:
        fp = screed.open(seqfile)
        try:
            for record in khmer.utils.clean_input_reads(fp):
                if len(record.cleaned_seq) < ksize:
                    skipped += 1
                    continue
                n += 1
                if n % 100000 == 0:
                    print('...2', seqfile, n)
                # walk across sequences, find all high degree nodes,
                # name them and cherish them.
                these_hdn = graph.find_high_degree_nodes(record.cleaned_seq)
                if these_hdn:
                    degree_nodes += these_hdn
                else:
                    # possible linear node? check first and last k-mer.
                    # (the logic here is that every purely linear node must
                    # start or end in *some* record sequence - so where we
                    # have record sequences that have only 1 neighbor, those
                    # will be all possible linear nodes.)
                    #
                    # Take the end k-mers from the cleaned sequence: it is
                    # the string that is length-checked and scanned above
                    # (and consumed into the graph when it is built here).
                    first_kmer = record.cleaned_seq[:ksize]
                    last_kmer = record.cleaned_seq[-ksize:]
                    assert len(last_kmer) == ksize

                    if len(graph.neighbors(first_kmer)) == 1:
                        linear_starts.add(graph.hash(first_kmer))
                    if len(graph.neighbors(last_kmer)) == 1:
                        linear_starts.add(graph.hash(last_kmer))

                if args.label:
                    label_list.append(record.name)
                    for kmer in these_hdn:
                        pathy.add_label(kmer, n)
        finally:
            fp.close()

    print('read {}, skipped {} for being too short'.format(n, skipped))

    # get all of the degree > 2 kmers and give them IDs.
    for kmer in degree_nodes:
        pathy.new_hdn(kmer)
        stop_bf.add(kmer)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree node into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path &
    # assemble if desired.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the node ID of the primary segment.
        k_id = pathy.kmers_to_nodes[k]

        # record the k-mer itself as this node's assembly.
        k_str = khmer.reverse_hash(k, ksize)
        pathy.add_assembly(k_id, k_str)

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                # NOTE(review): membership is tested with `nk` but the ID is
                # looked up with `nk.kmer_u` - confirm what graph.neighbors
                # yields.
                nk_id = pathy.kmers_to_nodes[nk.kmer_u]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    # now, clean up at the end -- make sure we've hit all the possible
    # linear nodes.
    print('traversing from {} potential linear starts'.format(
        len(linear_starts)))
    for k in linear_starts:
        traverse_and_mark_linear_paths(graph, k, stop_bf, pathy, degree_nodes)

    print('{} linear segments and {} high-degree nodes'.format(
        pathy.node_counter, len(pathy.nodes)))

    # The graphs are no longer needed; drop the references before saving.
    del graph
    del stop_bf

    # save to GXT.
    print('saving gxtfile', gxtfile)

    # NOTE: neither of these is populated anywhere in this function, so the
    # label summary printed below always reports 0 labels and empty counts.
    all_labels = set()
    label_counts = {}

    # The '.adj' file is presumably what pathy.adjfp was writing - confirm
    # against Pathfinder. Close it before reading the file back.
    pathy.adjfp.close()
    adj_path = gxtfile + '.adj'

    # this uniqifies the edges...
    with open(adj_path, 'rt') as adj_fp:
        for line in adj_fp:
            a, b = line.split(',')
            pathy.adjacencies[int(a)].add(int(b))

    # Removing the scratch file is best-effort: report a failure, carry on.
    try:
        os.unlink(adj_path)
    except OSError:
        print('cannot remove', adj_path)

    # ...and now print them out (don't add loops).
    edges = [(src, dest)
             for src, dests in pathy.adjacencies.items()
             for dest in dests
             if src != dest]

    with open(gxtfile, 'wt') as fp:
        write(fp, pathy.node_counter, edges)

    if not args.no_assemble:
        pathy.assemblyfp.close()

    if args.label:
        print('note: used/assigned %d labels total' % (len(set(all_labels)), ))
        print('counts:', label_counts)

        assert label_list
        print('dumping label list now.')
        label_file = os.path.basename(output_dir) + '.labels.txt'
        label_file = os.path.join(output_dir, label_file)

        with open(label_file, "wt") as fp:
            for n, label in enumerate(label_list):
                fp.write("{} {}\n".format(n, label))
Beispiel #25
0
def test_reverse_hash_raises():
    """reverse_hash rejects a string hash with a TypeError mentioning 'int'."""
    bad_hash = '2345'
    with pytest.raises(TypeError) as excinfo:
        khmer.reverse_hash(bad_hash, 4)

    message = str(excinfo.value)
    assert 'int' in message
Beispiel #26
0
def test_reverse_hash_raises():
    """Passing a str instead of an integer hash must raise TypeError."""
    with pytest.raises(TypeError) as excinfo:
        khmer.reverse_hash('2345', 4)

    # The error text is expected to name the required type.
    error_text = str(excinfo.value)
    assert 'int' in error_text