Code Example #1
    def find_more_then_threshold(self, threshold, curr_file_name):
        # Index every known document's MinHash, then query with the current
        # file's signature to find candidates above the Jaccard threshold.
        lsh = MinHashLSH(threshold=threshold)
        current_m = self.min_hash_text(
            set(self.file_to_words('/'.join([self.doc_dir, curr_file_name]))))
        for k, v in self.min_hash_dict.items():  # Python 3: items(), not iteritems()
            lsh.insert(k, v)
        result = lsh.query(current_m)
        print("Candidates with Jaccard similarity > " + str(threshold), result)
Code Example #2
File: lsh_test.py Project: Amano-Ginji/datasketch
    def test_init(self):
        lsh = MinHashLSH(threshold=0.8)
        self.assertTrue(lsh.is_empty())
        b1, r1 = lsh.b, lsh.r
        # Weighting false negatives more heavily (fp=0.2, fn=0.8) shifts the
        # optimizer toward more bands with fewer rows per band.
        lsh = MinHashLSH(threshold=0.8, weights=(0.2, 0.8))
        b2, r2 = lsh.b, lsh.r
        self.assertTrue(b1 < b2)
        self.assertTrue(r1 > r2)
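The weights pair is (false_positive_weight, false_negative_weight): the optimizer chooses the number of bands b and rows per band r that minimize the weighted error at the given threshold, which is what the assertions above check. A quick sketch using the same lsh.b and lsh.r attributes as the test:

from datasketch import MinHashLSH

# Penalizing false negatives more heavily (0.2, 0.8) yields more bands with
# fewer rows per band, making the index more permissive at the same threshold.
for weights in [(0.5, 0.5), (0.2, 0.8)]:
    lsh = MinHashLSH(threshold=0.8, num_perm=128, weights=weights)
    print(weights, '-> b =', lsh.b, 'r =', lsh.r)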
Code Example #3
File: lsh_test.py Project: Amano-Ginji/datasketch
    def test_pickle(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        # Query the copy restored from pickle, not the original index.
        lsh2 = pickle.loads(pickle.dumps(lsh))
        result = lsh2.query(m1)
        self.assertTrue("a" in result)
        result = lsh2.query(m2)
        self.assertTrue("b" in result)
Code Example #4
File: lsh_test.py Project: Amano-Ginji/datasketch
    def test_query(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        result = lsh.query(m1)
        self.assertTrue("a" in result)
        result = lsh.query(m2)
        self.assertTrue("b" in result)

        # A MinHash with a different num_perm (18 vs. 16) cannot be queried.
        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.query, m3)
Code Example #5
from datasketch import MinHash, MinHashLSH

# Sample token lists (an assumption: in the original example file, data1,
# data2 and data3 are word lists defined at module level).
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

def eg1():
    m1 = MinHash()
    m2 = MinHash()
    m3 = MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    for d in data3:
        m3.update(d.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
Code Example #6
File: lsh_test.py Project: Amano-Ginji/datasketch
    def test_remove(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        
        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        self.assertRaises(ValueError, lsh.remove, "c")
Code Example #7
File: lsh_test.py Project: cytora/datasketch
    def test_insert(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)
Code Example #8
File: similarity.py Project: jaegeral/timesketch
def new_lsh_index(events,
                  field,
                  delimiters=None,
                  num_perm=None,
                  threshold=None):
    """Create a new LSH from a set of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        threshold: a float for the Jaccard similarity threshold between 0.0 and
            1.0. The initialized MinHash LSH will be optimized for the
            threshold by minimizing false positives and false negatives.

    Returns:
        A tuple with an LSH index (an instance of datasketch.lsh.MinHashLSH)
        and a dictionary keyed by (event ID, event type, index name) with the
        MinHash as value.
    """
    if delimiters is None:
        delimiters = DEFAULT_DELIMITERS
    if num_perm is None:
        num_perm = DEFAULT_PERMUTATIONS
    if threshold is None:
        threshold = DEFAULT_THRESHOLD

    minhashes = {}
    lsh = MinHashLSH(threshold, num_perm)

    with lsh.insertion_session() as lsh_session:
        for event in events:
            # Insert minhash in LSH index.
            key = (event.event_id, event.event_type, event.index_name)
            minhash = minhash_from_text(event.source[field], num_perm,
                                        delimiters)
            minhashes[key] = minhash
            lsh_session.insert(key, minhash)

    return lsh, minhashes
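new_lsh_index depends on a minhash_from_text helper that is not part of this listing. A plausible sketch, assuming it splits the text on the given delimiters and hashes each token; this is a hypothetical implementation, not Timesketch's actual code:

import re
from datasketch import MinHash

def minhash_from_text(text, num_perm, delimiters):
    # Split the text on any of the delimiter strings and hash each token.
    pattern = '|'.join(re.escape(d) for d in delimiters)
    minhash = MinHash(num_perm=num_perm)
    for word in re.split(pattern, text):
        if word:
            minhash.update(word.encode('utf8'))
    return minhash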
Code Example #9
    def test_insertion_session(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        data = [("a", m1), ("b", m2)]
        # Bulk-insert the keys inside a single insertion session.
        with lsh.insertion_session() as session:
            for key, minhash in data:
                session.insert(key, minhash)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][H])
Code Example #10
    def test_insert(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][H])

        mg = WeightedMinHashGenerator(10, 5)
        m3 = mg.minhash(np.random.uniform(1, 10, 10))
        self.assertRaises(ValueError, lsh.insert, "c", m3)
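WeightedMinHashGenerator(10, 4) samples signatures for 10-dimensional weighted vectors with a sample size of 4, which must match the LSH's num_perm; m3 is built with sample size 5, hence the ValueError. A short sketch of estimating weighted Jaccard similarity directly from two signatures (assuming datasketch's WeightedMinHash.jaccard method):

import numpy as np
from datasketch import WeightedMinHashGenerator

# dim=10 weighted vectors, sample_size=4 hash samples per signature.
mg = WeightedMinHashGenerator(10, 4)
m1 = mg.minhash(np.random.uniform(1, 10, 10))
m2 = mg.minhash(np.random.uniform(1, 10, 10))
print('estimated weighted Jaccard:', m1.jaccard(m2))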
Code Example #11
File: similarity.py Project: Onager/timesketch
def new_lsh_index(events, field, delimiters=None, num_perm=None,
                  threshold=None):
    """Create a new LSH from a set of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words.
        num_perm: number of random permutation functions used by MinHash to
            be indexed.
        threshold: a float for the Jaccard similarity threshold between 0.0 and
            1.0. The initialized MinHash LSH will be optimized for the
            threshold by minimizing false positives and false negatives.

    Returns:
        A tuple with an LSH index (an instance of datasketch.lsh.MinHashLSH)
        and a dictionary keyed by (event ID, event type, index name) with the
        MinHash as value.
    """
    if delimiters is None:
        delimiters = DEFAULT_DELIMITERS
    if num_perm is None:
        num_perm = DEFAULT_PERMUTATIONS
    if threshold is None:
        threshold = DEFAULT_THRESHOLD

    minhashes = {}
    lsh = MinHashLSH(threshold, num_perm)

    with lsh.insertion_session() as lsh_session:
        for event in events:
            # Insert minhash in LSH index.
            key = (event.event_id, event.event_type, event.index_name)
            minhash = minhash_from_text(
                event.source[field], num_perm, delimiters)
            minhashes[key] = minhash
            lsh_session.insert(key, minhash)

    return lsh, minhashes
Code Example #12
    def test_insert(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=16)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert("a", m1)
        lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertTrue(len(t) >= 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertTrue("a" in items)
            self.assertTrue("b" in items)
        self.assertTrue("a" in lsh)
        self.assertTrue("b" in lsh)
        for i, H in enumerate(lsh.keys["a"]):
            self.assertTrue("a" in lsh.hashtables[i][H])

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, "c", m3)
Code Example #13
File: lsh_test.py Project: zhangjunqiang/datasketch
    def test_pickle(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        # Round-trip the index through pickle and query the restored copy.
        lsh2 = pickle.loads(pickle.dumps(lsh))
        result = lsh2.query(m1)
        self.assertTrue("a" in result)
        result = lsh2.query(m2)
        self.assertTrue("b" in result)
Code Example #14
    def __init__(self, threshold=0.9, num_perm=128, num_part=16, m=8,
                 weights=(0.5, 0.5)):
        if threshold > 1.0 or threshold < 0.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        if num_part < 2:
            raise ValueError("num_part must be at least 2")
        if m < 2 or m > num_perm:
            raise ValueError("m must be in the range of [2, num_perm]")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")
        self.threshold = threshold
        self.h = num_perm
        self.m = m
        rs = self._init_optimal_params(weights)
        # Initialize multiple LSH indexes, one per partition.
        self.indexes = [
            dict((r, MinHashLSH(num_perm=self.h, params=(int(self.h / r), r)))
                 for r in rs)
            for _ in range(0, num_part)]
        self.lowers = [None for _ in self.indexes]
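This constructor keeps one MinHashLSH per candidate r value in each of num_part partitions; datasketch exposes the same idea as MinHashLSHEnsemble for containment search. A minimal usage sketch of that public API (set sizes are required because the ensemble targets containment rather than plain Jaccard similarity):

from datasketch import MinHash, MinHashLSHEnsemble

set1 = {'cat', 'dog', 'fish'}
set2 = {'cat', 'dog', 'fish', 'bird', 'cow'}

m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))

# index() takes (key, minhash, set size) triples and can be called once;
# query() takes a minhash plus the query set's size and yields candidate keys.
ensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128, num_part=16)
ensemble.index([('set1', m1, len(set1)), ('set2', m2, len(set2))])
print(list(ensemble.query(m1, len(set1))))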
Code Example #15
File: lshensemble.py Project: galt2x/datasketch-1
    def __init__(self,
                 threshold=0.9,
                 num_perm=128,
                 num_part=16,
                 m=8,
                 weights=(0.5, 0.5),
                 storage_config=None,
                 prepickle=None):
        if threshold > 1.0 or threshold < 0.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        if num_part < 1:
            raise ValueError("num_part must be at least 1")
        if m < 2 or m > num_perm:
            raise ValueError("m must be in the range of [2, num_perm]")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")
        self.threshold = threshold
        self.h = num_perm
        self.m = m
        rs = self._init_optimal_params(weights)
        # Initialize multiple LSH indexes, one per partition.
        storage_config = {
            'type': 'dict'
        } if not storage_config else storage_config
        basename = storage_config.get('basename', _random_name(11))
        self.indexes = [
            dict((r,
                  MinHashLSH(num_perm=self.h,
                             params=(int(self.h / r), r),
                             storage_config=self._get_storage_config(
                                 basename, storage_config, partition, r),
                             prepickle=prepickle)) for r in rs)
            for partition in range(0, num_part)
        ]
        self.lowers = [None for _ in self.indexes]
        self.uppers = [None for _ in self.indexes]
Code Example #16
    def test_remove(self):
        lsh = MinHashLSH(threshold=0.5, num_perm=4)
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        lsh.insert("a", m1)
        lsh.insert("b", m2)

        lsh.remove("a")
        self.assertTrue("a" not in lsh.keys)
        for table in lsh.hashtables:
            for H in table:
                self.assertGreater(len(table[H]), 0)
                self.assertTrue("a" not in table[H])

        self.assertRaises(ValueError, lsh.remove, "c")
Code Example #17
    def test_query_redis(self):
        with patch('redis.Redis', fake_redis) as mock_redis:
            lsh = MinHashLSH(threshold=0.5,
                             num_perm=16,
                             storage_config={
                                 'type': 'redis',
                                 'redis': {
                                     'host': 'localhost',
                                     'port': 6379
                                 }
                             })
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            lsh.insert("a", m1)
            lsh.insert("b", m2)
            result = lsh.query(m1)
            self.assertTrue("a" in result)
            result = lsh.query(m2)
            self.assertTrue("b" in result)

            m3 = MinHash(18)
            self.assertRaises(ValueError, lsh.query, m3)
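The test patches redis.Redis with a fake so no server is needed. Outside of tests, the same storage_config shape points the index at a real Redis instance so the index outlives the process; a sketch, assuming Redis is reachable at localhost:6379:

from datasketch import MinHash, MinHashLSH

# Identical config shape to the test above, but against a live server.
lsh = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
    'type': 'redis',
    'redis': {'host': 'localhost', 'port': 6379},
})
m1 = MinHash(16)
m1.update('a'.encode('utf8'))
lsh.insert('a', m1)
print(lsh.query(m1))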
Code Example #18
import pandas as pd
from datasketch.minhash import MinHash
from datasketch.lsh import MinHashLSH
from preprocess import tokenize_sentence
"""
To find similar questions in O(1) we are using jaccard similarity and minHash. 
Question with similar minHash are candidates to be similar. 
To compare if two candidate senteces are similar we are using jaccard similarity  
"""

df = pd.read_csv("proccessed.csv")
total_questions = df.shape[0]
threshold_jacard = 0.30
lsh = MinHashLSH(threshold=threshold_jacard)

# Calculate a MinHash for each sentence in column question1.
for index, row in df.iterrows():
    min_Hash = MinHash()
    question = tokenize_sentence(str(row['question1']))
    for word in question:
        min_Hash.update(word.encode('utf8'))
    lsh.insert(str(index), min_Hash)

total = 0
return_result = 0
correct = 0
total_correct = 0
# For each sentence in column question2, find similar questions.
for i in range(0, total_questions):
    question_minHash = MinHash()
    question = tokenize_sentence(str(df['question2'][i]))