Ejemplo n.º 1
0
def test_mh_asymmetric_merge(track_abundance):
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.merge(b)
    d = b.merge(a)

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # can't compare different sizes without downsampling
    with pytest.raises(TypeError):
        d.compare(a)

    a = a.downsample_n(d.num)
    print(a.get_mins())
    print(d.get_mins())
    assert d.compare(a) == 1.0

    c = c.downsample_n(b.num)
    assert c.compare(b) == 1.0
Ejemplo n.º 2
0
def test_mh_inplace_concat_asymmetric(track_abundance):
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.__copy__()
    c += b

    d = b.__copy__()
    d += a

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    try:
        d.compare(a)
    except TypeError as exc:
        assert 'must have same num' in str(exc)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0  # see: d += a, above.

    c = c.downsample_n(b.num)
    assert c.compare(b) == 0.5
Ejemplo n.º 3
0
def test_mh_inplace_concat_asymmetric(track_abundance):
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.__copy__()
    c += b

    d = b.__copy__()
    d += a

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    try:
        d.compare(a)
    except TypeError as exc:
        assert 'must have same num' in str(exc)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0 # see: d += a, above.

    c = c.downsample_n(b.num)
    assert c.compare(b) == 0.5
Ejemplo n.º 4
0
def test_mh_asymmetric_merge(track_abundance):
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.merge(b)
    d = b.merge(a)

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # can't compare different sizes without downsampling
    with pytest.raises(TypeError):
        d.compare(a)

    a = a.downsample_n(d.num)
    print(a.get_mins())
    print(d.get_mins())
    assert d.compare(a) == 1.0

    c = c.downsample_n(b.num)
    assert c.compare(b) == 1.0
Ejemplo n.º 5
0
def test_mh_asymmetric(track_abundance):
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    assert a.count_common(b) == 10
    assert b.count_common(a) == 10

    with pytest.raises(TypeError):
        a.compare(b)

    a = a.downsample_n(10)
    assert a.compare(b) == 0.5
    assert b.compare(a) == 0.5
Ejemplo n.º 6
0
def test_mh_asymmetric(track_abundance):
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    assert a.count_common(b) == 10
    assert b.count_common(a) == 10

    with pytest.raises(TypeError):
        a.compare(b)

    a = a.downsample_n(10)
    assert a.compare(b) == 0.5
    assert b.compare(a) == 0.5
Ejemplo n.º 7
0
def test_mh_jaccard_asymmetric_num(track_abundance):
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    assert a.count_common(b) == 10
    assert b.count_common(a) == 10

    # with 'jaccard', this will raise an error b/c different num
    with pytest.raises(TypeError):
        a.jaccard(b)

    a = a.downsample_n(10)
    # CTB note: this used to be 'compare', is now 'jaccard'
    assert a.jaccard(b) == 0.5
    assert b.jaccard(a) == 0.5
Ejemplo n.º 8
0
def test_mh_asymmetric_merge(track_abundance):
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.merge(b)
    d = b.merge(a)

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # can't use jaccard on different nums without downsampling
    with pytest.raises(TypeError):
        d.jaccard(a)

    a = a.downsample_n(d.num)
    print(a.get_mins())
    print(d.get_mins())

    if track_abundance:
        assert round(d.similarity(a), 3) == 0.91
    else:
        assert round(d.similarity(a), 3) == 1.0

    c = c.downsample_n(b.num)
    if track_abundance:
        assert round(c.similarity(b), 3) == 0.91
    else:
        assert c.similarity(b) == 1.0
Ejemplo n.º 9
0
def test_mh_downsample_n_error(track_abundance):
    a = MinHash(20, 10, track_abundance=track_abundance)
    with pytest.raises(ValueError):
        a.downsample_n(30)