Example #1
0
def test_dna_mh(track_abundance):
    """Adding a whole sequence must match adding each 4-character window.

    One sketch receives the sequence through ``add_sequence``; the other
    receives every overlapping window through ``add``.  Both must report the
    same mins, and two known hash values must be among them.
    """
    k = 4
    dna = 'ATGGCAGTGACGATGCCAG'

    whole = MinHash(n=5, ksize=k, track_abundance=track_abundance)
    piecewise = MinHash(n=5, ksize=k, track_abundance=track_abundance)

    whole.add_sequence(dna)
    # Slide a window of width k across the sequence, one position at a time.
    for start in range(len(dna) - k + 1):
        piecewise.add(dna[start:start + k])

    assert whole.get_mins() == piecewise.get_mins()
    print(whole.get_mins())
    for expected_hash in (726311917625663847, 3697418565283905118):
        assert expected_hash in whole.get_mins()
Example #2
0
def test_dna_mh(track_abundance):
    """Compare ``add_sequence`` against window-by-window ``add`` calls.

    Both sketches use n=5 and ksize=4; after loading the same sequence by the
    two routes their mins must be equal and contain two known hash values.
    """
    sketch_args = dict(n=5, ksize=4, track_abundance=track_abundance)
    by_sequence = MinHash(**sketch_args)
    by_window = MinHash(**sketch_args)

    sequence = 'ATGGCAGTGACGATGCCAG'
    by_sequence.add_sequence(sequence)

    # All overlapping substrings of length 4, in left-to-right order.
    windows = [sequence[pos:pos + 4] for pos in range(len(sequence) - 3)]
    for window in windows:
        by_window.add(window)

    assert by_sequence.get_mins() == by_window.get_mins()
    print(by_sequence.get_mins())
    assert 726311917625663847 in by_sequence.get_mins()
    assert 3697418565283905118 in by_sequence.get_mins()
Example #3
0
def test_protein_mh(track_abundance):
    """With ``is_protein=True``, ``add_sequence`` must match per-window ``add``.

    One sketch takes the full sequence; the other takes each overlapping
    6-character window.  Their mins must agree and include a known hash value.
    """
    window_len = 6
    sketch_args = dict(n=5, ksize=window_len, is_protein=True,
                       track_abundance=track_abundance)
    from_sequence = MinHash(**sketch_args)
    from_windows = MinHash(**sketch_args)

    sequence = 'ATGGCAGTGACGATGCCG'
    from_sequence.add_sequence(sequence)

    # Feed every window of width 6 individually.
    for offset in range(len(sequence) - window_len + 1):
        from_windows.add(sequence[offset:offset + window_len])

    assert from_sequence.get_mins() == from_windows.get_mins()
    assert 901193879228338100 in from_sequence.get_mins()
Example #4
0
def test_protein_mh(track_abundance):
    """Check that two ways of loading an ``is_protein=True`` sketch agree.

    The first sketch is filled with ``add_sequence``; the second is filled by
    calling ``add`` on each overlapping 6-character slice of the same input.
    """
    sequence = 'ATGGCAGTGACGATGCCG'

    whole = MinHash(n=5, ksize=6, is_protein=True,
                    track_abundance=track_abundance)
    sliced = MinHash(n=5, ksize=6, is_protein=True,
                     track_abundance=track_abundance)

    whole.add_sequence(sequence)

    slices = [sequence[begin:begin + 6] for begin in range(len(sequence) - 5)]
    for piece in slices:
        sliced.add(piece)

    whole_mins = whole.get_mins()
    assert whole_mins == sliced.get_mins()
    assert 901193879228338100 in whole.get_mins()
Example #5
0
def test_pickle(track_abundance):
    """A sketch must survive a pickle round trip unchanged.

    The same sequence is added twice, the sketch is pickled into an in-memory
    buffer and loaded back, and the mins plus key attributes are compared.
    """
    import pickle
    from io import BytesIO

    original = MinHash(n=5, ksize=6, is_protein=False,
                       track_abundance=track_abundance)

    sequence = 'ATGGCAGTGACGATGCCG'
    for _ in range(2):
        original.add_sequence(sequence)

    # Serialize into one buffer, then read back from a fresh buffer holding
    # the same bytes.
    sink = BytesIO()
    pickle.dump(original, sink)
    restored = pickle.load(BytesIO(sink.getvalue()))

    mins_before = original.get_mins(with_abundance=track_abundance)
    mins_after = restored.get_mins(with_abundance=track_abundance)
    assert mins_before == mins_after

    assert original.num == restored.num
    assert original.ksize == restored.ksize
    assert original.is_protein == restored.is_protein
    assert original.max_hash == restored.max_hash
    assert original.seed == restored.seed
def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):
    """``set_abundances`` on a sketch built with ``sketch_size`` keeps a bounded set.

    The number of retained hashes must equal the smaller of ``sketch_size``
    and the count of entries with positive abundance, and every retained
    abundance must equal the value that was supplied.
    """
    sketch = MinHash(sketch_size, 10, track_abundance=True)
    expected = dict(zip(hashes, abundances))

    sketch.set_abundances(expected)

    retained = sketch.get_mins(with_abundance=True)

    # Only entries with a positive abundance are counted as eligible.
    positive = [count for count in expected.values() if count > 0]
    assert len(retained) == min(len(positive), sketch_size)

    for hashval, count in retained.items():
        assert expected[hashval] == count
def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
    """``set_abundances`` on a scaled sketch keeps only hashes under the cutoff.

    The cutoff comes from ``get_max_hash_for_scaled(scaled)``.  Exactly the
    entries with a hash at or below the cutoff and a positive abundance must
    be retained, each with the abundance that was supplied.
    """
    sketch = MinHash(0, 10, track_abundance=True, scaled=scaled)
    expected = dict(zip(hashes, abundances))

    sketch.set_abundances(expected)

    cutoff = get_max_hash_for_scaled(scaled)
    eligible = [hashval for hashval, count in expected.items()
                if hashval <= cutoff and count > 0]

    retained = sketch.get_mins(with_abundance=True)
    assert len(retained) == len(eligible)

    for hashval, count in retained.items():
        assert expected[hashval] == count
        assert hashval <= cutoff
        assert count > 0
Example #8
0
def test_pickle(track_abundance):
    """Pickle a sketch, unpickle it, and check the copy matches the source.

    Compared: the mins (with abundances when tracked) and the ``num``,
    ``ksize``, ``is_protein``, ``max_hash`` and ``seed`` attributes.
    """
    import pickle
    from io import BytesIO

    source = MinHash(n=5, ksize=6, is_protein=False,
                     track_abundance=track_abundance)

    sequence = 'ATGGCAGTGACGATGCCG'
    source.add_sequence(sequence)
    source.add_sequence(sequence)

    # Write to an in-memory stream, then load from a second stream built
    # from the bytes that were written.
    written = BytesIO()
    pickle.dump(source, written)
    readable = BytesIO(written.getvalue())
    copy = pickle.load(readable)

    assert (source.get_mins(with_abundance=track_abundance)
            == copy.get_mins(with_abundance=track_abundance))

    for attribute in ('num', 'ksize', 'is_protein', 'max_hash', 'seed'):
        assert getattr(source, attribute) == getattr(copy, attribute)
def main(argv):
    """Run one catlas query per hash value and write the results to a directory.

    Parses ``argv``, loads a pickled hashval index and a text file of hash
    values, runs ``execute_query`` for each hash value, and writes the
    per-query output plus ``hashval_results.csv`` and ``command.txt`` into
    the output directory.

    Args:
        argv: command-line arguments passed straight to
            ``ArgumentParser.parse_args`` (so, without the program name).

    Returns:
        0 once all hash values have been processed.

    Raises:
        SystemExit: with status -1 when the output directory cannot be
            created or when the hashval list contains no values; argparse
            also exits on invalid arguments.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('mh_index_picklefile', help='pickled hashval index')
    p.add_argument('hashval_list', help='file with list of hashvals')
    p.add_argument('output')
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled',
                   default=1000,
                   type=float,
                   help="scaled value for contigs minhash output")
    p.add_argument('-v', '--verbose', action='store_true')

    args = p.parse_args(argv)

    # create output directory if it doesn't exist.
    outdir = args.output
    notify('putting output in {}', outdir)
    try:
        os.makedirs(os.path.join(outdir, "contigs"), exist_ok=True)
    except OSError:
        # makedirs raises when outdir is an existing non-directory or cannot
        # be created; report it and exit instead of dumping a traceback.
        error("output '{}' is not a directory and cannot be made", outdir)
        sys.exit(-1)

    # load picklefile
    # NOTE: unpickling can execute arbitrary code; only pass index files
    # that come from a trusted source.
    with open(args.mh_index_picklefile, 'rb') as fp:
        hashval_to_contig_id = pickle.load(fp)
    notify('loaded {} hash value -> cdbg_id mappings from {}',
           len(hashval_to_contig_id), args.mh_index_picklefile)

    # load list of desired hashvals, one integer per line; blank lines
    # (e.g. a trailing newline at end of file) are skipped.
    with open(args.hashval_list, 'rt') as fp:
        hashvals = {int(line.strip()) for line in fp if line.strip()}
    notify('loaded {} search hashvalues from {}', len(hashvals),
           args.hashval_list)

    if not hashvals:
        print('No hash values to search!', file=sys.stderr)
        sys.exit(-1)

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix)
    notify('loaded {} nodes from catlas {}', len(catlas), args.catlas_prefix)
    notify('loaded {} layer 1 catlas nodes', len(catlas.layer1_to_cdbg))

    # find the contigs filename
    contigs_file = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # get a single ksize & scaled; --scaled is parsed as float (so values
    # like 1e3 are accepted) and truncated to an int here.
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    # record command line
    # NOTE(review): this records sys.argv rather than the argv parameter;
    # the two differ when main() is called programmatically -- confirm intent.
    with open(os.path.join(outdir, 'command.txt'), 'wt') as fp:
        fp.write(str(sys.argv))
        fp.write("\n")

    # output results.csv in the output directory; the context manager makes
    # sure the file is flushed and closed even if a query raises, and
    # newline='' lets the csv module control line endings.
    n_found = 0
    results_path = os.path.join(outdir, 'hashval_results.csv')
    with open(results_path, 'wt', newline='') as csvoutfp:
        csv_writer = csv.writer(csvoutfp)
        csv_writer.writerow(['hashval', 'bp', 'contigs'])

        # iterate over each query, do the thing.
        for hashval in hashvals:
            notify('----')
            notify('QUERY HASHVAL: {}', hashval)

            mh = MinHash(0, ksize, scaled=scaled)
            result = execute_query(hashval, catlas, hashval_to_contig_id,
                                   mh=mh)
            notify('done searching!')
            if not result:
                notify("no result for hashval {}", hashval)
                continue

            result.retrieve_contigs(contigs_file)
            result.write(csv_writer, csvoutfp, outdir)

            # sanity check: the queried hash value should be in the sketch
            # that was handed to execute_query.
            assert hashval in mh.get_mins()

            n_found += 1
        # end main loop!

    notify('----')
    notify("Done! Found {} hashvals of {} in {} with k={}", n_found,
           len(hashvals), args.catlas_prefix, ksize)
    notify("Results are in directory '{}'", outdir)

    return 0