Example #1
0
    def testLSH(self):
        """Check the duplicate sets reported when inserting short strings.

        For reference, the Jaccard similarity of every unordered pair of
        inputs (compared as character sets) is printed first.  The strings
        are then inserted in one batch, with a fixed random seed, into a cache
        whose shingler is built as ``Shingler(1)``, and the per-string result
        sets are compared with the expected ones.
        """
        strings = [
            "abcdefghijklmnopqrstuvwxyz",
            "abcdefghijklmnopqrstuvw",
            "defghijklmnopqrstuvw",
            "zyxwvutsrqponmlkjihgfedcba",
            "1abcdefghijklmnopuvw1",
            "123456789",
            "012345678",
            "234567890",
        ]

        # Diagnostic output only: similarity of each pair (first < second).
        total = len(strings)
        for first in range(total):
            left = strings[first]
            for second in range(first + 1, total):
                right = strings[second]
                similarity = 1 - jaccard_distance(set(left), set(right))
                print("'%s' (%d) <=> (%d)'%s': %f"
                      % (left, first, second, right, similarity))

        # Fixed seed so the expected sets below are reproducible.
        random.seed(12345)
        lsh = LSHCache(shingler=Shingler(1))
        expected = [
            set(),
            {0},
            {0, 1},
            {0, 1, 2},
            {0, 1, 2, 3},
            set(),
            {5},
            {5, 6},
        ]
        self.assertListEqual(expected, lsh.insert_batch(strings))
Example #2
0
 def testBadArgs(self):
     """Every listed keyword combination must make LSHCache raise AssertionError."""
     rejected = (
         dict(b=10, r=7, n=100),
         dict(n=101),  # prime number of rows
         dict(n=100, r=7),
         dict(n=100, b=7),
     )
     for kwargs in rejected:
         with self.assertRaises(AssertionError):
             LSHCache(**kwargs)
Example #3
0
def lsh():
    """Tokenise each line of ``clean_data`` with jieba and print LSH duplicates.

    Every line of the file becomes one document.  Documents are inserted
    into a fresh ``LSHCache`` in file order, keyed by line index, and the
    value returned by each ``insert`` call is treated as the collection of
    indices of earlier documents reported as duplicates.  A report is then
    printed for every document.
    """
    # The context manager closes the file handle as soon as the lines are
    # read; the previous bare open() call never closed it.
    with open('clean_data') as source:
        lns = [ln.decode('utf-8') for ln in source.readlines()]

    cache = LSHCache()

    # Each document is stored as its jieba tokens joined by single spaces.
    docs = [' '.join(jieba.cut(ln)) for ln in lns]

    # Re-splitting on whitespace discards any whitespace-only tokens that
    # jieba produced, so this is not the same as passing the raw token list.
    dups = {}
    for i, doc in enumerate(docs):
        dups[i] = cache.insert(doc.split(), i)

    for i, duplist in dups.items():
        if duplist:
            print('orig [%d]: %s' % (i, docs[i]))
            for dup in duplist:
                print('\tdup : [%d] %s' % (dup, docs[dup]))
        else:
            print('no dups found for doc [%d] : %s' % (i, docs[i]))
Example #4
0
def lsh_cache_from_args(args):
    """Construct an ``LSHCache`` from an argument namespace.

    ``seed_from_args(args)`` is called first.  The shingler is built from
    ``args.shingle_len`` (unpacked as positional arguments), an optional
    minhash implementation is looked up in ``minhash_choices``, and each of
    the remaining attributes is forwarded to ``LSHCache`` only when its
    value is truthy.

    Returns the newly created cache.
    """
    seed_from_args(args)

    options = {"shingler": Shingler(*args.shingle_len)}
    if args.minhash:
        options['minhash'] = minhash_choices[args.minhash]

    # (attribute on ``args``, keyword accepted by LSHCache)
    attr_to_keyword = (
        ('num_total', 'n'),
        ('num_bands', 'b'),
        ('num_rows', 'r'),
        ('min_support', 'm'),
        ('universe_size', 'universe_size'),
    )
    for attr_name, keyword in attr_to_keyword:
        setting = getattr(args, attr_name)
        if not setting:
            # Falsy values (None, 0, ...) leave the LSHCache default in place.
            continue
        options[keyword] = setting

    return LSHCache(**options)
Example #5
0
import pprint
import sys, os

sys.path.insert(0, os.path.abspath('../..'))
from lsh import LSHCache


if __name__ == '__main__':
    # Demo: insert a few near-duplicate sentences into an LSHCache and
    # report the documents for which insert() returned a non-empty result.
    cache = LSHCache()

    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f",
    ]

    # Documents are inserted in list order as whitespace-separated tokens;
    # the list index doubles as the document id.
    dups = {}
    for i, doc in enumerate(docs):
        tokens = doc.split()
        dups[i] = cache.insert(tokens, i)

    for i, duplist in dups.items():
        if not duplist:
            continue
        print('orig [%d]: %s' % (i, docs[i]))
Example #6
0
import pprint
import sys, os

sys.path.insert(0, os.path.abspath('../..'))
from lsh import LSHCache


if __name__ == '__main__':
    # Demo: push sample sentences through an LSHCache one at a time and
    # print each document whose insert() result was non-empty.
    cache = LSHCache()

    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f",
    ]

    # Key: position of the document in ``docs``; value: result of insert().
    dups = {}
    for i, doc in enumerate(docs):
        words = doc.split()
        dups[i] = cache.insert(words, i)

    for i, duplist in dups.items():
        if not duplist:
            continue
        print('orig [%d]: %s' % (i, docs[i]))
Example #7
0
sys.path.insert(0, os.path.abspath('../..'))
from lsh import LSHCache


def random_int_list(start, stop, length):
    """Return a list of random integers between ``start`` and ``stop``.

    The bounds are converted with ``int`` and swapped when given in
    descending order; both ends are inclusive (``random.randint``).  The
    sign of ``length`` is ignored, and a falsy ``length`` (``0``, ``None``)
    yields an empty list.
    """
    if start <= stop:
        low, high = int(start), int(stop)
    else:
        low, high = int(stop), int(start)

    count = int(abs(length)) if length else 0
    return [random.randint(low, high) for _ in range(count)]


if __name__ == '__main__':
    cache = LSHCache()

    docs = [
        "lipstick on a pig", "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig", "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women", "a b c d e f",
        "a b c d f"
    ]
    sig_mat = []
    dups = {}

    if (0):
Example #8
0
    def testExample(self):
        """Check insert_batch results for several band/row configurations.

        Each scenario reseeds ``random`` with the same value, builds a new
        ``LSHCache`` with the given keyword arguments, inserts the tokenised
        documents in one batch and compares the returned list of sets with
        the expected one.
        """
        docs = [
            "lipstick on a pig",
            "you can put lipstick on a pig",
            "you may put lipstick on a pig but it's still a pig",
            "you can put lipstick on a pig it's still a pig",
            "i think they put some lipstick on a pig but it's still a pig",
            "putting lipstick on a pig",
            "you know you can put lipstick on a pig",
            "they were going to send us binders full of women",
            "they were going to send us binders of women",
            "a b c d e f",
            "a b c d f",
        ]

        # (LSHCache keyword arguments, expected result set per document)
        scenarios = [
            # least strict
            (dict(b=50, r=2),
             [set(),
              {0},
              {0, 1},
              {0, 1, 2},
              {0, 1, 2, 3},
              {0, 1, 2, 3, 4},
              {0, 1, 2, 3, 4, 5},
              set(),
              {7},
              set(),
              {9}]),
            # stricter
            (dict(b=25, r=4),
             [set(),
              {0},
              set(),
              {1},
              {2},
              {0, 1},
              {0, 1, 5},
              set(),
              {7},
              set(),
              {9}]),
            # stricter still
            (dict(b=20, r=5),
             [set(),
              {0},
              set(),
              {1},
              set(),
              {0, 1},
              {0, 1, 3, 5},
              set(),
              {7},
              set(),
              set()]),
            # most strict
            (dict(b=10, r=10),
             [set(),
              set(),
              set(),
              set(),
              set(),
              set(),
              {1},
              set(),
              set(),
              set(),
              set()]),
            # same banding as the least strict case, with m=3
            (dict(b=50, r=2, m=3),
             [set(),
              {0},
              set(),
              {0, 1, 2},
              {0, 2, 3},
              {0, 1, 3},
              {0, 1, 3, 5},
              set(),
              {7},
              set(),
              {9}]),
        ]

        for params, expected in scenarios:
            # Reseed before building each cache so every scenario starts from
            # the same random state.
            random.seed(12345)
            cache = LSHCache(**params)
            tokenised = [doc.split() for doc in docs]
            self.assertListEqual(expected, cache.insert_batch(tokenised))
Example #9
0
    def testLSHArgs(self):
        """Check the values reported by the accessors for various constructor arguments.

        Each row pairs a factory with the values expected from
        ``num_bands``, ``num_rows_per_band``, ``num_total_rows``,
        ``shingler().shingle_len`` and ``min_support``.
        """
        # Factories are called lazily, so the caches (and shinglers) are
        # constructed one at a time, in order, just before being checked.
        cases = [
            # (factory, bands, rows per band, total rows, shingle len, min support)
            (LSHCache, 20, 5, 100, 2, 1),
            (lambda: LSHCache(b=10, r=7), 10, 7, 70, 2, 1),
            (lambda: LSHCache(n=70, r=7), 10, 7, 70, 2, 1),
            (lambda: LSHCache(n=70, b=10, m=3), 10, 7, 70, 2, 3),
            (lambda: LSHCache(n=70, b=10, r=7), 10, 7, 70, 2, 1),
            (lambda: LSHCache(shingler=Shingler(5)), 20, 5, 100, 5, 1),
            (lambda: LSHCache(shingler=Shingler(2, 3)), 20, 5, 100, (2, 3,), 1),
        ]

        for build, bands, rows, total, shingle_len, support in cases:
            lsh = build()
            self.assertEqual(bands, lsh.num_bands())
            self.assertEqual(rows, lsh.num_rows_per_band())
            self.assertEqual(total, lsh.num_total_rows())
            self.assertEqual(shingle_len, lsh.shingler().shingle_len())
            self.assertEqual(support, lsh.min_support())
0
 def testPercentFound(self):
     """Check theoretical_percent_found at 0.5 and 0.8 for several configurations."""
     # Small configurations: the value at 0.5 must match exactly, the value
     # at 0.8 to assertAlmostEqual's default precision.
     exact_cases = (
         (dict(b=2, r=1), 0.75, 0.96),
         (dict(b=1, r=2), 0.25, 0.64),
     )
     for params, at_half, at_eighty in exact_cases:
         lsh = LSHCache(**params)
         self.assertEqual(at_half, lsh.theoretical_percent_found(0.5))
         self.assertAlmostEqual(at_eighty, lsh.theoretical_percent_found(0.8))

     # Larger configurations: both values are compared to four decimal places.
     rounded_cases = (
         (dict(b=10, r=10), 0.0097, 0.6789),
         (dict(b=20, r=5), 0.4701, 0.9996),
         (dict(b=25, r=4), 0.8008, 1.0000),
         (dict(b=25, r=4, m=3), 0.2032, 0.9997),
     )
     for params, at_half, at_eighty in rounded_cases:
         lsh = LSHCache(**params)
         self.assertAlmostEqual(at_half, lsh.theoretical_percent_found(0.5), places=4)
         self.assertAlmostEqual(at_eighty, lsh.theoretical_percent_found(0.8), places=4)
0
 def testClear(self):
     """After clear(), replaying the same inserts must give the same results."""
     random.seed(12345)
     lsh = LSHCache()

     # (document, expected result of inserting it), in insertion order
     sequence = (
         ("123456789", set()),
         ("34567890", {0}),
         ("0123456", {0}),
         ("123456789", {0, 1, 2}),
     )

     for attempt in range(2):
         if attempt:
             # Reset the cache once, between the first and second replay.
             lsh.clear()
         for doc, expected in sequence:
             self.assertSetEqual(expected, lsh.insert(doc))