Beispiel #1
0
def test_bytes_murmur():
    """hash_murmur gives one value for str, bytes and u'' spellings of ACG."""
    expected = 1731421407650554201

    for kmer in ("ACG", b"ACG", u"ACG"):
        assert hash_murmur(kmer) == expected
Beispiel #2
0
def test_bytes_murmur():
    """The str, bytes and u'' forms of "ACG" must all hash to the same value."""
    from_str = hash_murmur("ACG")
    assert from_str == 1731421407650554201

    from_bytes = hash_murmur(b"ACG")
    assert from_bytes == 1731421407650554201

    from_unicode = hash_murmur(u"ACG")
    assert from_unicode == 1731421407650554201
Beispiel #3
0
def test_murmur():
    """Check hash_murmur's known value, required argument and second argument."""
    expected = 1731421407650554201
    assert hash_murmur("ACG") == expected

    # A call without arguments must raise TypeError; returning normally is a
    # test failure.
    try:
        hash_murmur()
    except TypeError:
        pass
    else:
        assert 0, "hash_murmur requires an argument"

    # Passing 42 as the second argument matches the one-argument call ...
    with_42 = hash_murmur("ACG", 42)
    assert with_42 == expected

    # ... whereas 43 has to give a different value.
    with_43 = hash_murmur("ACG", 43)
    assert with_43 != with_42
Beispiel #4
0
def test_murmur():
    """Exercise hash_murmur with one, zero and two positional arguments."""
    reference = hash_murmur("ACG")
    assert reference == 1731421407650554201

    # Zero arguments: only a TypeError is acceptable.
    try:
        hash_murmur()
    except TypeError:
        pass
    else:
        assert 0, "hash_murmur requires an argument"

    # Two arguments: 42 reproduces the one-argument value, 43 does not.
    same = hash_murmur("ACG", 42)
    assert same == 1731421407650554201

    different = hash_murmur("ACG", 43)
    assert different != same
def compute_matrix(group_info, group_ident, ksize, output):
    """Build a per-group k-mer abundance matrix and write it to disk.

    Args:
        group_info: mapping of node id -> object exposing
            ``get_mins(with_abundance=True)``; one matrix row is produced
            per entry, in iteration order.
        group_ident: object pickled unchanged to ``output + '.node_mh'``.
        ksize: k-mer size; the matrix has ``4**ksize`` columns.
        output: path of the matrix file; also the prefix for the
            ``.node_ids`` and ``.node_mh`` side files.

    Writes three files: the matrix (``numpy.save``), the pickled
    node id -> row index mapping, and the pickled ``group_ident``.
    """
    n_groups = len(group_info)
    n_columns = 4**ksize

    # Column order is the sorted, de-duplicated hash of every k-mer returned
    # by make_all(ksize), so it is the same on every run.
    column_hashes = sorted({hash_murmur(kmer) for kmer in make_all(ksize)})

    # One row per group; abundances are stored as uint16.
    print('creating', n_groups, n_columns)
    V = numpy.zeros((n_groups, n_columns), dtype=numpy.uint16)
    node_id_to_group_idx = {}
    for row, node_id in enumerate(group_info):
        if row % 1000 == 0:
            print('...', row, n_groups)

        abundances = dict(group_info[node_id].get_mins(with_abundance=True))
        # Hashes absent from this group count as zero.
        V[row] = numpy.array([abundances.get(hashval, 0)
                              for hashval in column_hashes])
        node_id_to_group_idx[node_id] = row

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), output))
    with open(output, 'wb') as matrix_fp:
        numpy.save(matrix_fp, V)

    with open(output + '.node_ids', 'wb') as ids_fp:
        pickle.dump(node_id_to_group_idx, ids_fp)

    with open(output + '.node_mh', 'wb') as mh_fp:
        pickle.dump(group_ident, mh_fp)
def hash_sequence(seqstr, input_type, ksize, alphabet, skipinfo=None):
    """Return the list of hash values for every k-mer of ``seqstr``.

    Args:
        seqstr: the input sequence.
        input_type: ``"nucleotide"`` selects canonical k-mers (see below);
            any other value leaves each k-mer as is.
        ksize: k-mer length; a sequence shorter than this yields ``[]``.
        alphabet: target alphabet handed to ``reencode_sequence``.
        skipinfo: passed straight through to ``kmers``.

    Returns:
        A list with one non-negative hash per k-mer, in k-mer order.
    """
    hashvals = []

    # Nothing to k-merize.
    if len(seqstr) < ksize:
        return hashvals

    is_nucleotide = input_type == "nucleotide"
    for fwd_kmer in kmers(seqstr, ksize, skipinfo):
        kmer = fwd_kmer
        if is_nucleotide:
            # Pick between the k-mer and its reverse complement by string
            # comparison, so both strands give the same k-mer.
            rev_kmer = enc.reverse(enc.complement(fwd_kmer))
            kmer = fwd_kmer if fwd_kmer < rev_kmer else rev_kmer

        # Re-encoding happens per k-mer, after strand selection, because the
        # reverse complement only makes sense in nucleotide space.
        hashval = hash_murmur(reencode_sequence(kmer, input_type, alphabet))
        if hashval < 0:
            # Shift negative values up by 2**64 so the result is non-negative.
            hashval += 2**64
        hashvals.append(hashval)

    return hashvals
def main():
    """Cut genomes into SIZE-length chunks and save their k-mer vectors.

    Command line: one or more genome files, ``-o/--output`` (required) and
    ``-k/--ksize`` (default 5).

    For every chunk of every record, two MinHash objects are built: an
    abundance-tracking one at ``--ksize`` (used for the matrix) and one at
    ksize=31, scaled=1000 (saved as is). Chunks are numbered consecutively
    from 0 across all genomes.

    Files written:
        <output>            matrix of chunks x 4**ksize abundances (numpy.save)
        <output>.labels     pickle: chunk number -> 1-based genome position
        <output>.node_ids   pickle: chunk number -> matrix row
        <output>.node_mh    pickle: chunk number -> ksize=31 MinHash
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('genomes', nargs='+')
    parser.add_argument('-o', '--output')
    parser.add_argument('-k', '--ksize', default=5, type=int,
                        help='k-mer size for vectors')
    args = parser.parse_args()

    # Use parser.error instead of assert: asserts are stripped under
    # ``python -O``, which would let a missing -o fail much later in open().
    if not args.output:
        parser.error("please specify -o")

    n = 0
    group_info = {}
    group_ident = {}
    labels = {}
    node_id_to_group_idx = {}
    for genome_n, genome in enumerate(args.genomes, start=1):
        print(genome)
        for record in screed.open(genome):
            sequence = record.sequence
            for start in range(0, len(sequence), SIZE):
                chunk = sequence[start:start + SIZE]

                # abundance vector source, at the requested k-mer size
                mh = sourmash.MinHash(n=0, ksize=args.ksize,
                                      scaled=1, track_abundance=1)
                mh.add_sequence(chunk, True)
                group_info[n] = mh

                # second sketch of the same chunk, saved to .node_mh
                mh = sourmash.MinHash(n=0, ksize=31, scaled=1000)
                mh.add_sequence(chunk, True)
                group_ident[n] = mh

                labels[n] = genome_n
                node_id_to_group_idx[n] = n

                n += 1

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format.

    # first, make a consistently ordered list of all k-mers, and convert
    # them into hashes.
    all_kmer_hashes = sorted({hash_murmur(kmer)
                              for kmer in make_all(args.ksize)})

    # now, build a matrix of GROUP_N rows x 4**ksize columns, where each
    # row will be the set of k-mer abundances associated with each group.
    # Abundances are stored as uint16.
    V = numpy.zeros((len(group_info), 4**args.ksize), dtype=numpy.uint16)
    for i, group_id in enumerate(group_info):
        abundances = dict(group_info[group_id].get_mins(with_abundance=True))
        # hashes this chunk does not contain count as zero
        V[i] = numpy.array([abundances.get(hashval, 0)
                            for hashval in all_kmer_hashes])

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), args.output))
    with open(args.output, 'wb') as fp:
        numpy.save(fp, V)

    # pickle.dump, matching the other side files written below
    with open(args.output + '.labels', 'wb') as fp:
        pickle.dump(labels, fp)

    with open(args.output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(args.output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)