Exemple #1
0
def test_lookup_set():
    """lookup_many() accepts a set of the build keys and yields no None."""
    keys = list(range(10))
    mphf = bbhash.PyMPHF(keys, 10, 1, 1.0)

    results = mphf.lookup_many(set(keys))

    assert all(value is not None for value in results)
Exemple #2
0
def test_all(tmpdir):
    """Save an MPHF to disk, reload it, and check a lookup is unchanged."""
    original = bbhash.PyMPHF(list(range(10)), 10, 1, 1.0)
    assert original.lookup(9) == 8

    # the same path string is used for both saving and loading
    path = str(tmpdir.join('xxx'))
    original.save(path)

    restored = bbhash.load_mphf(path)
    assert restored.lookup(9) == 8
def build_mphf(kh, records_iter_fn):
    """Build an MPHF over every k-mer hash in a set of records, plus tables.

    The records are traversed twice: once to collect all k-mer hashes and
    build the MPHF, and once to fill the lookup tables.

    Args:
        kh: object providing ``get_kmer_hashes(sequence)``.
        records_iter_fn: zero-argument callable returning a fresh iterable
            of records with ``name`` and ``sequence`` attributes. It is
            called once per pass. Each ``name`` must parse as an integer
            node ID, and IDs must run from 0 to total-1.

    Returns:
        A tuple ``(x, mphf_to_kmer, mphf_to_cdbg, sizes)``: the MPHF, an
        array mapping MPHF value to k-mer hash, an array mapping MPHF value
        to node ID, and an array mapping node ID to its number of k-mers.

    Raises:
        ValueError: if ``records_iter_fn()`` yields no records.
    """
    # build a list of all k-mers in the cDBG
    all_kmers = []

    # n stays at -1 when there are no records, so n_contigs becomes 0
    # instead of hitting an unbound local.
    n = -1
    for n, record in enumerate(records_iter_fn()):
        if n % 50000 == 0 and n:
            print('... contig', n, end='\r')

        all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    n_contigs = n + 1
    if not n_contigs:
        raise ValueError('no records to build an MPHF from')
    print('loaded {} contigs.\n'.format(n_contigs))

    # build MPHF (this is the CPU intensive bit)
    print('building MPHF for {} k-mers in {} nodes.'.format(len(all_kmers), n_contigs))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # build tables linking:
    # * mphf hash to k-mer hash (for checking exactness)
    # * mphf hash to cDBG ID
    # * cDBG ID to node size (in k-mers)

    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)
    sizes = numpy.zeros(n_contigs, numpy.uint32)

    print('second pass.')
    for n, record in enumerate(records_iter_fn()):
        if n % 50000 == 0 and n:
            print('... contig {} of {}'.format(n, n_contigs), end='\r')

        # node ID is record name, must go from 0 to total-1
        cdbg_id = int(record.name)

        # get 64-bit numbers for each k-mer (doesn't really matter what hash)
        kmers = kh.get_kmer_hashes(record.sequence)

        # for each k-mer, find its MPHF hashval, & link to info.
        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

        # record each node size, while we're here.
        sizes[cdbg_id] = len(kmers)

    print('loaded {} contigs in pass2.\n'.format(n_contigs))

    # sanity check: the largest node ID seen must equal the last record
    # index. ndarray.max() avoids a Python-level loop over the whole array.
    max_cdbg_id = int(mphf_to_cdbg.max())
    assert n == max_cdbg_id, (n, max_cdbg_id)

    return x, mphf_to_kmer, mphf_to_cdbg, sizes
Exemple #4
0
def main(argv):
    """Index the k-mers of a catlas's cDBG contigs with an MPHF.

    Reads ``contigs.fa.gz`` under ``catlas_prefix`` twice (once to collect
    k-mer hashes, once to fill the lookup tables), then saves the MPHF to
    ``contigs.fa.gz.mphf`` and the tables to ``contigs.fa.gz.indices`` in
    the same directory.

    Args:
        argv: list of command-line arguments (without the program name).
    """
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix')
    p.add_argument('-k', '--ksize', default=31, type=int)
    a = p.parse_args(argv)

    kh = khmer.Nodetable(a.ksize, 1, 1)

    contigs_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz')
    mphf_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.mphf')
    array_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.indices')

    # build a list of all k-mers in the cDBG
    all_kmers = []
    print('reading cDBG nodes from {}'.format(contigs_filename))

    # n stays at -1 when the file has no records, so n_contigs becomes 0
    # instead of hitting an unbound local.
    n = -1
    for n, record in enumerate(screed.open(contigs_filename)):
        if n % 50000 == 0 and n:
            print('... contig', n, end='\r')

        all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    n_contigs = n + 1
    if not n_contigs:
        p.error('no contigs found in {}'.format(contigs_filename))
    print('loaded {} contigs.\n'.format(n_contigs))

    # build MPHF (this is the CPU intensive bit)
    print('building MPHF for {} k-mers in {} nodes.'.format(
        len(all_kmers), n_contigs))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # build tables linking:
    # * mphf hash to k-mer hash (for checking exactness)
    # * mphf hash to cDBG ID
    # * cDBG ID to node size (in k-mers)

    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)
    sizes = numpy.zeros(n_contigs, numpy.uint32)

    print('second pass; reading cDBG nodes from {}'.format(contigs_filename))
    for n, record in enumerate(screed.open(contigs_filename)):
        if n % 50000 == 0 and n:
            print('... contig {} of {}'.format(n, n_contigs), end='\r')

        # node ID is record name, must go from 0 to total-1
        cdbg_id = int(record.name)

        # get 64-bit numbers for each k-mer (doesn't really matter what hash)
        kmers = kh.get_kmer_hashes(record.sequence)

        # for each k-mer, find its MPHF hashval, & link to info.
        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

        # record each node size, while we're here.
        sizes[cdbg_id] = len(kmers)

    print('loaded {} contigs in pass2.\n'.format(n_contigs))

    # sanity check: the largest node ID seen must equal the last record
    # index. ndarray.max() avoids a Python-level loop over the whole array.
    max_cdbg_id = int(mphf_to_cdbg.max())
    assert n == max_cdbg_id, (n, max_cdbg_id)

    print('done! saving to {} and {}'.format(mphf_filename, array_filename))

    x.save(mphf_filename)
    with open(array_filename, 'wb') as fp:
        # NOTE: the 'kmer_to_cdbg' key stores the mphf_to_cdbg array; the
        # key name is kept unchanged so existing readers keep working.
        numpy.savez_compressed(fp,
                               mphf_to_kmer=mphf_to_kmer,
                               kmer_to_cdbg=mphf_to_cdbg,
                               sizes=sizes)
Exemple #5
0
def main():
    """Link cDBG nodes to the transcript families whose k-mers they contain.

    Builds an MPHF over all k-mer hashes in the unitigs file, maps each MPHF
    value to its k-mer hash and cDBG node ID, then walks every transcriptome
    record and records, for each node a k-mer hits, the record's family ID.
    The family name is the second '|'-separated field of the record name.

    Writes the MPHF to ``<output>.mphf`` and pickles the tuple
    ``(mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id)`` to
    ``<output>.arr``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('unitigs')
    parser.add_argument('transcriptomes', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    # required: both output filenames are derived from this prefix.
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    kh = khmer.Nodetable(args.ksize, 1, 1)

    # first pass: collect every k-mer hash in the cDBG.
    all_kmers = []
    n = -1                      # stays -1 if the unitigs file is empty
    for n, record in enumerate(screed.open(args.unitigs)):
        if n % 10000 == 0:
            print('... cdbg', n)

        all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    n_nodes = n + 1             # n is the last 0-based index
    if not n_nodes:
        parser.error('no records found in {}'.format(args.unitigs))

    print('building MPHF for {} k-mers in {} nodes.'.format(len(all_kmers),
                                                            n_nodes))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # second pass: link MPHF value -> k-mer hash and MPHF value -> cDBG ID.
    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)

    for n, record in enumerate(screed.open(args.unitigs)):
        if n % 10000 == 0:
            print('... cdbg', n)

        # node ID is the first space-separated token of the record name
        cdbg_id = int(record.name.split(' ')[0])

        for kmer in kh.get_kmer_hashes(record.sequence):
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

    print('walking the transcriptome')

    family_ids = {}
    family_counter = 0

    cdbg_to_family_id = defaultdict(set)

    n = 0
    for tr_filename in args.transcriptomes:
        for record in screed.open(tr_filename):
            n += 1
            if n % 1000 == 0:
                print('...', tr_filename, n)

            # get the family name
            name_fields = record.name.split('|')
            if len(name_fields) < 2:
                raise ValueError(
                    "record name {!r} in {} has no '|'-separated family "
                    "field".format(record.name, tr_filename))
            family_name = name_fields[1]

            # convert to family ID, generating a new one if we need one
            family_id = family_ids.get(family_name)
            if family_id is None:
                family_id = family_counter
                family_counter += 1
                family_ids[family_name] = family_id

            # for all k-mers,
            for hashval in kh.get_kmer_hashes(record.sequence):

                # find cDBG ID; k-mers the MPHF does not know are skipped
                mphf = x.lookup(hashval)
                if mphf is None:
                    continue

                cdbg_id = mphf_to_cdbg[mphf]

                # link cDBG ID to family ID
                cdbg_to_family_id[cdbg_id].add(family_id)

    mphf_filename = args.output + '.mphf'
    array_filename = args.output + '.arr'
    x.save(mphf_filename)

    with open(array_filename, 'wb') as fp:
        pickle.dump(
            (mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id), fp)
Exemple #6
0
def test_construct_from_set():
    """Building a PyMPHF directly from a set is expected to raise TypeError."""
    keys = set(range(10))

    # CTB: could fix this.
    with pytest.raises(TypeError):
        bbhash.PyMPHF(keys, 10, 1, 1.0)
Exemple #7
0
def test_lookup():
    """Each build key has a non-None lookup; a key never added gives None."""
    mphf = bbhash.PyMPHF(list(range(10)), 10, 1, 1.0)

    for key in range(10):
        assert mphf.lookup(key) is not None

    assert mphf.lookup(200) is None