Python MinHashLSH Examples, datasketch.MinHashLSH Python Examples

Example #1

0

Show file

File: eda_utils_text.py Project: gefena/eda_utils

def get_minhashes_of_unique_str_list(unique_str_list, threshold=0.7,
                                     num_perm=128, ngram_size=3,
                                     report_every=5000):
    """Index the character n-grams of every string in a MinHashLSH.

    Arguments:
    - unique_str_list: iterable of strings; the position of each string is
                       used as its key in the index.
    - threshold: Jaccard threshold passed to ``MinHashLSH`` (default 0.7).
    - num_perm: number of permutations for the index and for each MinHash.
    - ngram_size: size of the n-grams produced by ``ngrams``.
    - report_every: a progress line is printed whenever the position is a
                    multiple of this value (including position 0).

    Returns a ``(lsh, minhashes)`` tuple where ``minhashes`` maps each
    position to its MinHash object.
    """

    def _elapsed(since):
        # Elapsed wall-clock time rendered as HH:MM:SS.
        return time.strftime("%H:%M:%S", time.gmtime(time.time() - since))

    t0 = time.time()
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

    # Create one MinHash per string and insert it under its position.
    minhashes = {}
    for i, s in enumerate(unique_str_list):
        minhash = MinHash(num_perm=num_perm)
        for d in ngrams(s, ngram_size):
            minhash.update("".join(d).encode('utf-8'))
        lsh.insert(i, minhash)
        minhashes[i] = minhash

        if i % report_every == 0:
            print("counter:", i)
            print("[exp msg] elapsed time for subprocess: " + _elapsed(t0))

    print("[exp msg] elapsed time for process: " + _elapsed(t0))
    return (lsh, minhashes)

Example #2

0

Show file

File: lsh_debug.py Project: DavidNemeskey/cc_corpus

def find_duplicates(minhashes, threshold, permutations, name_hashes):
    """
    Print, for every minhash, the other minhashes the LSH index reports as
    similar.

    Arguments:
    - minhashes: a list of minhashes
    - threshold: the Jaccard threshold handed to the LSH index
    - permutations: the number of permutations; must match the one used to
                    build the minhash objects
    - name_hashes: list of document hashes (or any other ID type). When it
                   is non-empty, matches whose ID equals the ID of the
                   queried item are dropped from the report.

    Output lines have the form ``<position>\\t<space separated matches>``
    with 1-based positions; nothing is printed for items without matches.
    """
    index = MinHashLSH(threshold=threshold, num_perm=permutations)

    # Keys are the 1-based positions rendered as strings.
    for position, signature in enumerate(minhashes, start=1):
        index.insert(str(position), signature, check_duplication=False)

    for position, signature in enumerate(minhashes, start=1):
        matches = index.query(signature)
        # An item always matches itself; take it out of its own result.
        matches.remove(str(position))
        if name_hashes:
            # Keep only the matches that belong to a different document.
            matches = [
                key for key in matches
                if name_hashes[position - 1] != name_hashes[int(key) - 1]
            ]
        if not matches:
            continue
        print('{}\t{}'.format(position, ' '.join(matches)))

Example #3

0

Show file

    def _train_LSH(self):
        """Build a MinHashLSH index over the bigrams of ``self.new_domains``.

        For each entry the ``domain`` attribute of the ``get_tld`` result is
        taken, entries whose domain part has three characters or fewer are
        skipped, and the set of character bigrams of the rest is hashed into
        a 128-permutation MinHash. The index is keyed by the original entry.

        Returns the populated ``MinHashLSH``.
        """

        # create LSH model
        # NOTE(review): params=(5, 7) is presumably (bands, rows) -- confirm
        # against the datasketch documentation.
        lsh = MinHashLSH(num_perm=128, params=(5, 7))

        # train LSH model
        for dom in self.new_domains:

            # remove TLD
            tld_info = get_tld('http://' + dom,
                               as_object=True,
                               fail_silently=True)

            try:
                d = tld_info.domain
            except AttributeError:
                # get_tld was asked to fail silently, so an unparsable
                # entry yields a result without a ``domain`` attribute.
                continue

            # ignore super short new domains
            if len(d) <= 3:
                continue

            # create bigram set
            bigrams = {d[i:i + 2] for i in range(len(d) - 1)}
            minhash = MinHash(num_perm=128)
            for b in bigrams:
                minhash.update(b.encode('utf-8'))

            lsh.insert(dom, LeanMinHash(minhash))

        print('LSH Trained!')
        return lsh

Example #4

0

Show file

def get_topn_similarity_documents_lsh(keywords, n=3):
    """Return the English documents whose keywords collide with ``keywords``.

    Every document returned by ``docs_col.find({"lang": 'english'})`` is
    indexed in a MinHashLSH (threshold 0.1, 128 permutations) built from its
    comma-separated keyword string; the index is then queried with the
    MinHash of the comma-separated ``keywords`` argument.

    Arguments:
    - keywords: comma-separated keyword string.
    - n: accepted for interface compatibility; the current implementation
         neither ranks nor truncates the result, so it is unused.

    Returns the list of matching documents with their ``_id`` field removed.
    The list is also printed.
    """
    lsh = MinHashLSH(threshold=0.1, num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    documents_min = [
        lsh_json(str(item["_id"]), item["keyword"]) for item in documents_en
    ]
    for item in documents_min:
        minhash = MinHash(num_perm=128)
        for k in item["keyword"].split(","):
            minhash.update(k.encode("utf-8"))
        lsh.insert(str(item["id"]), minhash)

    # Named query_minhash rather than ``min`` so the builtin is not shadowed.
    query_minhash = MinHash(num_perm=128)
    for k in keywords.split(","):
        query_minhash.update(k.encode("utf-8"))

    list_docs = []
    for doc_id in lsh.query(query_minhash):
        doc = docs_col.find_one({"_id": ObjectId(str(doc_id))})
        if doc is None:
            # The document vanished between indexing and lookup.
            continue
        doc.pop('_id', None)
        list_docs.append(doc)
    print(list_docs)
    return list_docs

Example #5

0

Show file

File: minhash_funcs.py Project: GMwang550146647/code

def make_lsh_partial(batch_id, batch_size, filename, out_filename, byte_start, nperm=N_PERM, thresh=0.5):
    """
    Generate the LSH index over a subset of the data.

    Each line is split on ``':'``; the first field is the line number and the
    second field is the text. Lines whose text has more than three
    space-separated words are inserted under the UTF-8 encoded key
    ``"<line number>:<text>"``. A key that is already present in the index is
    skipped instead of being inserted a second time.

    :param batch_id: Batch id, used to determine output filename
    :param batch_size: Specifies number of lines of the file to read
    :param filename: Input file, generated using the make_lsh_file family of functions
    :param out_filename: Output file prefix, batch_id is appended to distinguish each block.
    :param byte_start: Byte offset for the partial file - this allows make_lsh_partial to read the middle sections of
    a file using the seek() command.
    :param nperm: number of permutations in the Min-Hash index.
    :param thresh: Jaccard index threshold to return
    :return: filename of the dumped LSH file.
    """
    lsh = MinHashLSH(threshold=thresh, num_perm=nperm)
    with open(filename, 'r', encoding='utf-8', errors='ignore') as fhandle:
        fhandle.seek(byte_start)
        for current_batch, line in enumerate(fhandle, start=1):
            lsplit = line.split(':')
            if len(lsplit) > 1:
                lnum = lsplit[0]
                line_sub = lsplit[1]
                wordlist = line_sub.split(' ')
                # Test membership with the key that is actually inserted;
                # testing the bare text never matched the stored keys.
                key = (lnum + ':' + line_sub).encode('utf-8')
                if len(wordlist) > 3 and key not in lsh:
                    lsh.insert(key, make_hash(wordlist, nperm))
            if current_batch >= batch_size:
                break
    outfile = out_filename + '_' + str(batch_id) + '.obj'
    dump_lsh(lsh, outfile)
    return outfile

Example #6

0

Show file

File: Check.py Project: serkindmtr/MinHashEmultaionGenerator

def main() -> None:
    """Query an existing Cassandra-backed MinHashLSH with a random MinHash.

    A MinHash (256 permutations) is built from 200 random five-letter
    lowercase strings, an index named ``perftest`` is opened on the Cassandra
    node at 127.0.0.1 without dropping its keyspace or tables, and the query
    result is printed. Errors raised by the query are printed rather than
    propagated.
    """
    for _ in tqdm(range(1), desc="Create finding example:"):
        minhash = MinHash(num_perm=256)
        list_strings = [
            ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
            for _ in range(200)
        ]
        minhash.update_batch([s.encode('utf-8') for s in list_strings])

    for _ in tqdm(range(1), desc="Connect to existing db:"):
        lsh = MinHashLSH(threshold=0.5,
                         num_perm=256,
                         storage_config={
                             'type': 'cassandra',
                             'basename': b'perftest',
                             'cassandra': {
                                 'seeds': ['127.0.0.1'],
                                 'keyspace': config.KEY_SPACE,
                                 'replication': {
                                     'class': 'SimpleStrategy',
                                     'replication_factor': '1',
                                 },
                                 'drop_keyspace': False,
                                 'drop_tables': False,
                             }
                         })

    try:
        for _ in tqdm(range(1), desc="Find minHash similarity:"):
            result = lsh.query(minhash)
        print("Approximate neighbours with Jaccard similarity > 0.5", result)
    except Exception as e:
        # Exception rather than BaseException, so Ctrl-C and SystemExit
        # still terminate the script.
        print(str(e))
        print("Error")

Example #7

0

Show file

class LSH():
    """MinHashLSH index over the character shingles of a list of documents.

    The index and the per-document MinHash objects are built on
    construction; documents are used as their own keys.
    """

    def __init__(self, rawlist, shingle_length=2, threshold=0.8):
        """Store ``rawlist`` and build the index from it."""
        self.indoc = rawlist
        self.make_lsh(shingle_length=shingle_length, threshold=threshold)

    def make_shingles(self, doc, length=2):
        """Return every contiguous slice of ``doc`` of size ``length``.

        The result is empty when ``doc`` is shorter than ``length``.
        """
        return [doc[i:i + length] for i in range(len(doc) - (length - 1))]

    def make_shingle_sets(self, doclst=None, length=2):
        """Map each document to its list of shingles.

        ``doclst`` defaults to the documents given at construction time.
        """
        if doclst is None:
            doclst = self.indoc
        return {d: self.make_shingles(d, length) for d in doclst}

    def make_lsh(self, shingle_length=2, threshold=0.8):
        """(Re)build ``self.lsh`` and ``self.minhashes`` from ``self.indoc``."""
        print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
        sets = self.make_shingle_sets(self.indoc, shingle_length)
        self.minhashes = {}
        self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
        for k, shingle_list in sets.items():
            m = MinHash(num_perm=128)
            for item in shingle_list:
                m.update(item.encode('utf8'))
            # Recorded outside the shingle loop so that documents without
            # any shingle are still retrievable through get_minhash().
            self.minhashes[k] = m
            self.lsh.insert(k, m)

    def get_minhash(self, doc):
        """Return the MinHash stored for ``doc``."""
        return self.minhashes[doc]

    def get_bucket(self, target_mh):
        """Return the keys the index reports for ``target_mh``."""
        return self.lsh.query(target_mh)

Example #8

0

Show file

File: part2.py Project: yan123yan/frequentPatterns

def LSH():
    """Query every set read by ``part1.readFile(k=4)`` against all the others.

    Each set is hashed into a 1024-permutation MinHash and inserted into a
    MinHashLSH (threshold 0.9) under its 1-based position rendered as a
    string. Every MinHash is then used as a query; each result is printed
    and collected.

    Returns ``clean_data`` applied to the list of query results.
    """
    # MinHashLSH parameters:
    #   threshold (float)          - Jaccard threshold, defaults to 0.5
    #   num_perm (int, optional)   - number of hash permutation functions;
    #                                for weighted MinHash it is the sample size
    #   params (tuple, optional)   - number of bands and their size
    threshold = 0.9
    num_perm = 1024
    result = part1.readFile(k=4)

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = []
    for index, each in enumerate(result, start=1):
        # every ``each`` is a set
        doc = MinHash(num_perm=num_perm)
        for d in each:
            doc.update(d.encode('utf8'))
        lsh.insert(str(index), doc)
        # Kept for the query phase; hashing the same set again would only
        # reproduce this object.
        minhashes.append(doc)

    return_result = []
    for doc_target in minhashes:
        neighbours = lsh.query(doc_target)
        print("Approximate neighbours with Jaccard similarity > {}".format(threshold),
              neighbours)
        return_result.append(neighbours)
    return clean_data(return_result)

Example #9

0

Show file

def build_content_sim_mh_text(network, mh_signatures):
    """Connect nodes whose MinHash signatures collide in an LSH index.

    ``mh_signatures`` yields ``(nid, signature)`` pairs. Each signature is
    loaded into a 512-permutation MinHash and indexed under ``nid``
    (threshold 0.7). Every node is then queried and a
    ``Relation.CONTENT_SIM`` relation with score 1 is added on ``network``
    from the node to each different node returned.

    Returns the populated index.
    """
    num_perm = 512
    lsh_index = MinHashLSH(threshold=0.7, num_perm=num_perm)

    # Build and index one MinHash per node, remembering them for querying.
    indexed = []
    for node_id, signature in mh_signatures:
        sketch = MinHash(num_perm=num_perm)
        sketch.hashvalues = np.asarray(signature, dtype=int)
        lsh_index.insert(node_id, sketch)
        indexed.append((node_id, sketch))

    # Query phase: link every node to its neighbours, skipping itself.
    for node_id, sketch in indexed:
        for neighbour in lsh_index.query(sketch):
            if neighbour != node_id:
                network.add_relation(node_id, neighbour,
                                     Relation.CONTENT_SIM, 1)

    return lsh_index

Example #10

0

Show file

def pd_text_hash_create_lsh(df, col, sep=" ", threshold=0.7, num_perm=10):
    '''
    Build a MinHash for every entry of ``df[col]`` and index it in an LSH.

    Each entry is split on ``sep``; its distinct tokens are hashed into a
    MinHash that is inserted under the entry's position rendered as a string.

    Returns ``(hash_lines, lsh)``: the MinHash objects in row order and the
    populated index.
    '''
    from datasketch import MinHash, MinHashLSH

    index = MinHashLSH(threshold=threshold, num_perm=num_perm)
    signatures = []

    for position, text in enumerate(df[col].values):
        signature = MinHash(num_perm=num_perm)
        # Only distinct tokens are fed to the MinHash.
        for token in set(text.split(sep)):
            signature.update(token.encode('utf8'))

        signatures.append(signature)
        index.insert(str(position), signature)

    return signatures, index

Example #11

0

Show file

def search_lsh_jaccard_topk(index_data, query_data, b, r, k):
    """Answer top-k Jaccard queries through a MinHashLSH with ``params=(b, r)``.

    ``index_data`` and ``query_data`` are ``(sets, keys, minhashes)`` triples
    whose ``minhashes`` are looked up by the permutation count ``b * r``.
    Candidates returned by the index are re-scored with ``compute_jaccard``,
    sorted by decreasing similarity and cut to the first ``k``.

    Returns ``(results, times)``: a list of ``(query_key, [[key, score], ...])``
    and the per-query durations in seconds.
    """
    index_sets, index_keys, index_minhashes = index_data
    query_sets, query_keys, query_minhashes = query_data
    num_perm = b * r

    print("Building LSH Index.")
    build_started = time.perf_counter()
    index = MinHashLSH(num_perm=num_perm, params=(b, r))
    # The LSH keys are the positions of the indexed sets.
    for position, _ in enumerate(index_keys):
        index.insert(position, index_minhashes[num_perm][position])
    build_finished = time.perf_counter()
    print("Indexing time: {:.3f}.".format(build_finished - build_started))

    print("Querying.")
    times = []
    results = []
    queries = zip(query_minhashes[num_perm], query_keys, query_sets)
    for query_minhash, query_key, query_set in queries:
        query_started = time.perf_counter()
        candidates = index.query(query_minhash)
        # Score the candidates with the exact Jaccard similarity.
        scored = [
            [index_keys[position],
             compute_jaccard(query_set, index_sets[position])]
            for position in candidates
        ]
        # Best matches first, then keep the first k.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        top = scored[:k]
        times.append(time.perf_counter() - query_started)
        results.append((query_key, top))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)

Example #12

0

Show file

File: lsh_debug.py Project: DavidNemeskey/cc_corpus

def deduplicate_file(file_prefix, output_dir, threshold, permutations):
    """
    Deduplicates one batch of minhashed documents and writes the survivors
    to output_dir.

    A document is kept only when the LSH index built so far returns no match
    for its minhash; kept documents are added to the index under their ID
    fields joined with ``'_'``.

    Warning: only works for full documents at this point!
    """
    index = MinHashLSH(threshold=threshold, num_perm=permutations)
    batch_name = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(batch_name))
    docs_read = 0
    writer = BatchWriter(sys.maxsize, output_dir, len(batch_name),
                         int(batch_name))
    with closing(writer) as bw:
        for input_file, results in read_batch(file_prefix):
            minhashes = results['minhash']
            doc_ids = results['id']
            kept_minhashes, kept_ids = [], []
            docs_read += len(doc_ids)
            for position, minhash in enumerate(minhashes):
                if index.query(minhash):
                    # Something similar is already indexed: drop it.
                    continue
                index.insert('_'.join(doc_ids[position]), minhash)
                kept_minhashes.append(minhash)
                kept_ids.append(doc_ids[position])
            bw.write_results(input_file, {
                'id': kept_ids,
                'minhash': kept_minhashes
            })
            logging.debug('Kept {} documents out of {}'.format(
                len(kept_ids), len(doc_ids)))
    logging.info('Processed batch {}; kept {} documents out of {}.'.format(
        batch_name, bw.total_written, docs_read))

Example #13

0

Show file

def minHash_LSH(data):
    """Return the pairs of entries of ``data`` that the LSH index groups together.

    Every entry of ``data`` (an iterable of strings) is hashed into a
    256-permutation MinHash and indexed under its position in a MinHashLSH
    with threshold 0.65. Each entry is then queried; a result is kept only
    when it holds exactly two keys and the same two keys were not already
    reported, whatever their order inside the result.

    Returns the list of kept query results.
    """
    num_perm = 256
    lsh = MinHashLSH(threshold=0.65, num_perm=num_perm)

    # Create MinHash objects; c is the position, i is the entry itself.
    minhashes = {}
    for c, i in enumerate(data):
        minhash = MinHash(num_perm=num_perm)
        for el in i:
            minhash.update(el.encode('utf8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash

    res_match = []
    # Order-insensitive record of reported pairs, so that [a, b] and
    # [b, a] count as the same match.
    seen_pairs = set()
    for c in range(len(minhashes)):
        result = lsh.query(minhashes[c])
        if len(result) != 2:
            continue
        pair = frozenset(result)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            res_match.append(result)
    return res_match

Example #14

0

Show file

File: hashsim.py Project: wushicanASL/sentiment_classify

def text2lsh(filename='data/comments_words_std.csv',
             threshold=0.9,
             num_perm=128,
             is_save=True,
             lshfile='data/output'):
    """Index the ``words`` column of a CSV file in a MinHashLSH.

    The CSV must provide ``id``, ``score`` and ``words`` columns. The
    space-separated words of every row are hashed into a MinHash that is
    inserted under the key ``(id, score)``. Rows that raise are reported on
    stdout and skipped.

    When ``is_save`` is true the index and the ``(id, minhash)`` list are
    pickled (protocol 0) to ``<lshfile>_lsh.pkl`` and
    ``<lshfile>_hash_list.pkl``.

    Returns ``(lsh, hash_list)``.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    X = pd.read_csv(filename)
    n = X.shape[0]
    hash_list = []
    for i in tqdm(range(0, n)):
        # Reset per row so the error report never shows a previous row's
        # words (or fails on an unbound name).
        words = None
        try:
            row_id = int(X.id[i])
            score = int(X.score[i])
            m = MinHash(num_perm=num_perm)
            words = X.words[i].split(' ')
            for d in words:
                m.update(d.encode('utf8'))
            lsh.insert((row_id, score), m)
            hash_list.append((row_id, m))
        except Exception as e:
            # Deliberate best effort: report the row and carry on.
            print(e, words)
    if is_save:
        with open(lshfile + '_lsh.pkl', 'wb') as f:
            pickle.dump(lsh, f, 0)
        with open(lshfile + '_hash_list.pkl', 'wb') as f:
            pickle.dump(hash_list, f, 0)
    return lsh, hash_list

Example #15

0

Show file

File: deduplicater.py Project: kapant/abbyy_corpuses

def main():
    """Deduplicate the ``text`` column of ``./all.csv``.

    Texts are normalised with ``normalize``, hashed with ``to_minhash`` and
    looked up in a MinHashLSH. A text with at least one match is reported
    and its row is dropped; any other text is added to the index. The
    remaining rows are written to ``./all_deduplicated.csv``.
    """
    print('Загружаем корпус')
    all_csv = pd.read_csv("./all.csv", encoding="utf-8")
    raw_corpus = all_csv["text"]

    print('Приводим его к стандартному виду')
    normalized_corpus: List[List[str]] = [
        normalize(proverb) for proverb in raw_corpus
    ]

    print('Составляем индекс для поиска дублей')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)
    deduplicated_corpus = []
    duplicate_positions = []
    for i, words in enumerate(normalized_corpus):
        words_hash = to_minhash(words)
        duplicates = lsh.query(words_hash)
        if duplicates:
            print(f'Найдены совпадения для ({i}): {raw_corpus[i]}')
            # The current row is the duplicate; the rows returned by the
            # query are the ones that were kept earlier.
            duplicate_positions.append(i)
            for idx in duplicates:
                print(f'\t{idx:>5d}. {raw_corpus[idx]}')
        else:
            lsh.insert(i, words_hash)
            deduplicated_corpus.append(raw_corpus[i])
    print('Удалено дублей:', len(raw_corpus) - len(deduplicated_corpus))

    print(
        f'Сохраняем дедуплицированный корпус ({len(deduplicated_corpus)} рецензий)'
    )
    # DataFrame.drop returns a new frame, so the result has to be used.
    deduplicated_csv = all_csv.drop(index=all_csv.index[duplicate_positions])
    deduplicated_csv.to_csv("./all_deduplicated.csv",
                            encoding="utf-8",
                            index=False)

Example #16

0

Show file

File: analysis_utils.py Project: PaulWestenthanner/databeers

def remove_similar_tweets(df, text_col="text", lang_col="lang_x", max_jaccard_simularity=0.5):
    """
    Use locality sensitive hashing to efficiently remove tweets that are
    similar to others (might be autogenerated or retweets).

    The tweets are cleaned with ``TweetsAnalysis.preprocess_tweet`` (stored in
    a new ``tweet_clean`` column), split on spaces and hashed into
    64-permutation MinHashes. A tweet is selected only when the index built
    so far returns no match for it, so the first tweet of every group of
    similar tweets wins. A boolean ``select`` column is added to ``df``.

    Returns the selected rows of ``df``.
    """
    t0 = time.time()
    df["tweet_clean"] = np.vectorize(TweetsAnalysis.preprocess_tweet)(df[text_col], df[lang_col])
    tweets = [t.split(" ") for t in df["tweet_clean"]]
    t1 = time.time()
    # Single-argument print calls behave the same on Python 2 and 3.
    print("%s cleaned tweets" % (t1 - t0))
    lsh = MinHashLSH(threshold=max_jaccard_simularity, num_perm=64)  # jaccard similarity
    selected = set()
    df_indices = df.index.values.tolist()
    for idx, tweet in zip(df_indices, tweets):
        s = MinHash(num_perm=64)
        for word in tweet:
            s.update(word.encode('utf8'))
        # only add if the tweet is not similar to existing ones
        if not lsh.query(s):
            lsh.insert(idx, s)
            selected.add(idx)
    t2 = time.time()
    print("%s created lsh" % (t2 - t1))
    # only select the first tweet in a group of similar tweets
    df['select'] = pd.Series([idx in selected for idx in df_indices], index=df_indices)
    print(df["select"].value_counts())
    t3 = time.time()
    print("%s selected df" % (t3 - t2))
    return df[df["select"]]

Example #17

0

Show file

class DuplicationIndex:
    """Groups keys into clusters of near duplicates with a MinHashLSH."""

    def __init__(
        self,
        *,
        duplication_jaccard_threshold: float = 0.85,
    ):
        self._duplication_jaccard_threshold = duplication_jaccard_threshold
        self._num_perm = NUM_PERM
        self._index = MinHashLSH(
            threshold=self._duplication_jaccard_threshold,
            num_perm=self._num_perm,
        )

        # base key -> keys attached to that base
        self._duplicate_clusters = defaultdict(set)

    def add(self, code_key: Tuple, min_hash: MinHash) -> None:
        """Insert ``code_key`` into the index and attach it to a cluster.

        The index is queried with ``min_hash`` first. A key that is already
        indexed is reported and ignored. Otherwise the key is inserted and,
        when the query returned matches, it joins the cluster of the first
        match that is already a cluster base; if none is, the first match
        becomes the base of a new cluster. The outcome therefore depends on
        the order in which keys are added.

        Args:
            code_key (Tuple of (index, repo_name, path)):
                Any hashable key; a tuple is used so the information can be
                recovered later.
            min_hash: MinHash of the code_key.
        """
        matches = self._index.query(min_hash)
        if code_key in self._index.keys:
            print(f"Duplicate key {code_key}")
            return

        self._index.insert(code_key, min_hash)
        if not matches:
            return

        # Prefer a match that already heads a cluster, else the first match.
        base = next(
            (key for key in matches if key in self._duplicate_clusters),
            matches[0],
        )
        self._duplicate_clusters[base].add(code_key)

    def get_duplicate_clusters(self) -> List[List[Dict]]:
        """Export the duplicate clusters.

        Every cluster is a list of dicts with the keys ``base_index``,
        ``repo_name`` and ``path``; its first element is the base of the
        cluster.

        Returns:
            duplicate_clusters (List[List[Dict]]):
                List of duplicate clusters.
        """
        return [
            [
                {"base_index": member[0], "repo_name": member[1], "path": member[2]}
                for member in [base, *members]
            ]
            for base, members in self._duplicate_clusters.items()
        ]

    def save(self, filepath) -> None:
        """Write the exported clusters to ``filepath`` as JSON."""
        clusters = self.get_duplicate_clusters()
        with open(filepath, "w") as out:
            json.dump(clusters, out)

Example #18

0

Show file

File: 05_deduplication.py Project: fateevda/creating_dataset

def main():
    """Application entry point.

    Reads every file found by ``file_searcher`` under ``corpus/clean``,
    normalises the texts, and keeps a text only when the MinHashLSH built so
    far returns no match for it. Matches are reported in ``duplicate.txt``;
    the kept texts are written below ``corpus/super clean/``.
    """
    corpus_root = Path('corpus/clean')
    # Find the names of all files
    list_files = file_searcher(corpus_root)

    print('Загружаем корпус')
    raw_corpus = []
    for file in list_files:
        with open(file, 'r', encoding='utf-8') as src:
            text_news = '\n'.join([line.rstrip('\r\n') for line in src])
        raw_corpus.append(text_news)

    print('Приводим его к стандартному виду')
    normalized_corpus: List[List[str]] = [
        normalize(news) for news in raw_corpus
    ]

    print('Составляем индекс для поиска дублей')
    lsh = MinHashLSH(num_perm=HASH_PERMUTATIONS_COUNT)
    deduplicated_corpus = []
    # The report is written inside a ``with`` block so the handle is closed
    # even when hashing or querying fails.
    with open('duplicate.txt', 'w', encoding='utf-8') as report:
        for i, (file, words) in enumerate(zip(list_files, normalized_corpus)):
            words_hash = to_minhash(words)
            duplicates = lsh.query(words_hash)
            if duplicates:
                print(f'Найдены совпадения для ({file}): {raw_corpus[i]}',
                      file=report)
                for idx in duplicates:
                    print(f'\t{list_files[idx]}. {raw_corpus[idx]}',
                          file=report)
                print('\n\n\n\n', file=report)
            else:
                lsh.insert(i, words_hash)
                deduplicated_corpus.append((raw_corpus[i], list_files[i]))
        print('Удалено дублей:',
              len(raw_corpus) - len(deduplicated_corpus),
              file=report)

    print(
        f'Сохраняем дедуплицированный корпус ({len(deduplicated_corpus)} новостей)'
    )

    # Create the empty folders
    all_genre = [
        'Политика', 'В мире', 'Экономика', 'Общество', 'Происшествия', 'Армия',
        'Наука', 'Культура', 'Религия', 'Спорт', 'Туризм'
    ]
    import os
    for genre in all_genre:
        newpath = 'corpus/super clean/' + genre
        if not os.path.exists(newpath):
            os.makedirs(newpath)

    # Save the corpus
    # NOTE(review): name[13:] presumably strips the 'corpus/clean/' prefix
    # from a string path -- confirm against file_searcher.
    for text, name in deduplicated_corpus:
        with open('corpus/super clean/' + name[13:], 'w',
                  encoding='utf-8') as dst:
            print(text, file=dst)

Example #19

0

Show file

def build_lsh(code_set, jaccard):
    """Index the ``minhashing`` signature of every item of ``code_set``.

    Items are keyed by their position rendered as a string in a
    128-permutation MinHashLSH whose threshold is ``jaccard``.

    Returns ``(minhashes, lsh)`` where ``minhashes`` is a list of
    ``[position, signature]`` pairs.
    """
    index = MinHashLSH(threshold=jaccard, num_perm=128)
    signatures = []
    for position, code in enumerate(code_set):
        signature = minhashing(code)
        index.insert(str(position), signature)
        signatures.append([position, signature])
    return signatures, index

Example #20

0

Show file

File: headline-pred.py Project: divkakwani/webcorpus

 def __init__(self, lang):
     """Set up the per-language helpers and the empty bookkeeping state.

     Creates the ``WikiEntities`` instance and the normalizer for ``lang``,
     three empty lists (articles, entity sets, minhashes) and an empty
     MinHashLSH with threshold 0.5 and 128 permutations.
     """
     self.lang = lang
     self.wikient = WikiEntities(self.lang)
     self.normalizer = IndicNormalizerFactory().get_normalizer(self.lang)
     # Nothing has been collected yet.
     self.articles, self.ent_sets, self.minhashes = [], [], []
     self.lsh = MinHashLSH(threshold=0.5, num_perm=128)

Example #21

0

Show file

 def build_lsh(self, threshold=0.5):
     """Index the MinHash of every entity value and return the index.

     Entities of ``self.entities`` are inserted through an insertion session
     under their position; the build time in seconds is printed.
     """
     start = time.time()
     # Single-argument print calls behave the same on Python 2 and 3.
     print('Buidling LSH...')
     lsh = MinHashLSH(threshold=threshold, num_perm=128)
     with lsh.insertion_session() as session:
         for i, entity in enumerate(self.entities):
             session.insert(i, self.minhash(entity.value))
     print('[{} s]'.format(time.time() - start))
     return lsh

Example #22

0

Show file

def _name_to_minhash(name, stop_words, num_perm=32):
    """Return the MinHash of the non-stopword tokens of a normalised name."""
    name = nlp.camelcase_to_snakecase(name)
    name = name.replace('-', ' ').replace('_', ' ').lower()
    m = MinHash(num_perm=num_perm)
    for token in name.split():
        if token not in stop_words:
            m.update(token.encode('utf8'))
    return m


def find_relation_class_name_matchings(network, kr_handlers):
    """Match relation (source) names with class names through a MinHashLSH.

    Arguments:
    - network: object whose ``iterate_values()`` yields 4-tuples starting
               with ``(db_name, source_name, ...)``; each distinct source
               name is considered once.
    - kr_handlers: mapping from a name to a handler whose ``classes()``
                   returns the class names to match against.

    Names are converted from camel case, have ``-`` and ``_`` replaced by
    spaces and are lower-cased; their tokens, minus English stopwords, are
    hashed into 32-permutation MinHashes and indexed with threshold 0.5.

    Returns a list of ``((db_name, source_name, "_"), (kr_name, class_name))``
    matches. The elapsed time is printed.
    """
    st = time.time()
    # Loaded once; looking the list up for every token is needlessly slow.
    stop_words = set(stopwords.words('english'))

    # Retrieve relation names
    names = []
    seen_sources = set()
    for (db_name, source_name, _, _) in network.iterate_values():
        if source_name in seen_sources:
            continue
        seen_sources.add(source_name)
        m = _name_to_minhash(source_name, stop_words)
        names.append(('relation', (db_name, source_name), m))

    num_relations_inserted = len(names)

    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            m = _name_to_minhash(cl, stop_words)
            names.append(('class', (kr_name, cl), m))

    # Index all the minhashes under their position in ``names``
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx, (_, _, m) in enumerate(names):
        lsh_index.insert(idx, m)

    matchings = []
    # Relations occupy the first positions: query those only and keep the
    # neighbours of the other kind.
    for idx in range(num_relations_inserted):
        kind_q, (db_name, source_name), m = names[idx]
        for n in lsh_index.query(m):
            kind_n = names[n][0]
            if kind_n != kind_q:
                # match format is db_name, source_name, field_name -> class_name
                matchings.append(((db_name, source_name, "_"), names[n][1]))
    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings

Example #23

0

Show file

    def fit(self, trainset):
        '''Fit the base algorithm, then index one MinHash signature per user.

        Every user of ``self.trainset.ur`` is inserted into a new MinHashLSH
        (threshold ``self.tr``, ``self.n_perm`` permutations) stored as
        ``self.lsh``. Returns ``self``.
        '''

        AlgoBase.fit(self, trainset)

        self.lsh = MinHashLSH(threshold=self.tr, num_perm=self.n_perm)
        users = tqdm(self.trainset.ur, desc='Computing LSH')
        for user in users:
            signature = self.compute_minhash_signature(user)
            self.lsh.insert(user, signature)

        return self

Example #24

0

Show file

    def __init__(
        self,
        *,
        duplication_jaccard_threshold: float = 0.85,
    ):
        """Create an empty MinHashLSH index and an empty cluster mapping.

        The index uses ``duplication_jaccard_threshold`` as its threshold and
        ``NUM_PERM`` permutations.
        """
        self._duplication_jaccard_threshold = duplication_jaccard_threshold
        self._num_perm = NUM_PERM
        self._index = MinHashLSH(
            threshold=self._duplication_jaccard_threshold,
            num_perm=self._num_perm,
        )

        # Each missing key starts out with an empty set.
        self._duplicate_clusters = defaultdict(set)

Example #25

0

Show file

File: blocking.py Project: asevans48/DeduplicationUtils

 def setup_lsh(self):
     """
     Create the MinHash LSH on top of the configured storage backend.

     Raises ValueError when no storage configuration is set.
     """
     if not self.__storage_config:
         raise ValueError("Storage Backend Required Due to Use of Session")

     self.__lsh = MinHashLSH(threshold=self.__threshold,
                             num_perm=self.__num_perm,
                             storage_config=self.__storage_config)

Example #26

0

Show file

File: lsh_func.py Project: nestauk/openjobs-SDS-NOS-2019

def perform_lsh(lsh_text,
                standard_labels,
                title_labels,
                char_ngram=5,
                savefile=''):
    """Find descriptions whose shingle sets collide in a MinHashLSH.

    Arguments:
    - lsh_text: the descriptions to compare; each is passed to ``shingles``.
    - standard_labels: label of each description, used as its key in the
                       index.
    - title_labels: title of each description, used in the report.
    - char_ngram: currently not forwarded to ``shingles``.
                  NOTE(review): looks unintended -- confirm.
    - savefile: when non-empty, the candidates are also written to this CSV
                path.

    Returns ``(candidates, shingled_desc, content, lsh)``: the candidates per
    description (``'none'`` when the query returns a single key), the
    shingles, the ``(label, minhash)`` pairs and the populated index. The
    titles of the matches are read from the module-level ``df_nos``.
    """
    t0 = time.time()
    shingled_desc = [shingles(desc) for desc in lsh_text]
    print_elapsed(t0, 'splitting the text into groups of characters')

    # Create hash signatures for shingles
    t0 = time.time()
    hash_objects = [MinHash(num_perm=200) for _ in shingled_desc]
    print_elapsed(t0, 'creating hash signatures')

    t0 = time.time()
    for ix, desc in enumerate(shingled_desc):
        for d in desc:
            hash_objects[ix].update(d.encode('utf8'))
    print_elapsed(t0, 'encoding hash objects')

    # Define LSH and Jaccard similarity threshold
    lsh = MinHashLSH(threshold=0.8, num_perm=200)

    content = [(standard_labels[ix], hash_objects[ix])
               for ix in range(len(shingled_desc))]

    for label, minhash in content:
        lsh.insert(label, minhash)

    # For each standard search all signatures and identify potential clashes
    # (e.g. other standards with Jaccard similarity of shingle sets greater
    # or equal to the threshold). Note: some of the candidates might be false
    # positives.
    candidates = {}
    for ix in range(len(shingled_desc)):
        result = lsh.query(hash_objects[ix])
        if len(result) > 1:
            label = standard_labels[ix] + ': ' + title_labels[ix]
            candidates[label] = [
                (res, df_nos['Title'].loc[res]) for res in result
            ]
            print(label, ': ',
                  [(res, df_nos['Title'].loc[res]) for res in result])
            print('***************')
        else:
            candidates[standard_labels[ix]] = 'none'

    if len(savefile):
        pd.DataFrame.from_dict(candidates, orient='index').to_csv(savefile)
    # The index is ``lsh``; the former ``lhs`` was an undefined name.
    return candidates, shingled_desc, content, lsh

Example #27

0

Show file

File: DBCEJournal.py Project: ppBruce/dbce

    def compute_lsh(self, entry):
        """
        Adds the WARC entry to the LSH index, creating the index on first use.

        The index parameters are read from the configuration section named
        after ``self.domain``; the entry is stored under ``entry['item_id']``
        with ``entry['minhash']``.
        """

        if not self.lsh:
            section = self.domain
            threshold = self.config.getfloat(section, 'lsh_threshold')
            num_perm = self.config.getint(section,
                                          'number_of_minhash_buckets')
            self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        item_id, minhash = entry['item_id'], entry['minhash']
        self.lsh.insert(item_id, minhash)

Example #28

0

Show file

 def make_lsh(self,shingle_length=2,threshold=0.8):
     """(Re)build ``self.lsh`` and ``self.minhashes`` from ``self.indoc``.

     Each document is shingled with ``make_shingle_sets``, hashed into a
     128-permutation MinHash and inserted into the index under the document
     itself.
     """
     print(f'Making LSH with threshold of {threshold}, shingle length of {shingle_length}')
     sets = self.make_shingle_sets(self.indoc,shingle_length)
     self.minhashes = {}
     self.lsh = MinHashLSH(threshold=threshold, num_perm=128)
     for k, shingle_list in sets.items():
         m = MinHash(num_perm=128)
         for item in shingle_list:
             m.update(item.encode('utf8'))
         # Recorded outside the shingle loop so that documents without any
         # shingle are present in both the index and the mapping.
         self.minhashes[k] = m
         self.lsh.insert(k,m)

Example #29

0

Show file

def apply_lsh(group, col):
    """Index the character 3-gram MinHashes of one text column.

    Args:
        group: DataFrame providing the text column ``col`` and a
            ``'productId'`` column whose values become the index keys.
        col: name of the text column to hash.

    Returns:
        ``(lsh, minhashes)``: the populated ``MinHashLSH`` index and a dict
        mapping each product id to its ``MinHash``.
    """
    num_perm = 256  # shared by the index and every signature
    lsh = MinHashLSH(threshold=0.9, num_perm=num_perm)
    minhashes = {}
    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0; items() yields the same (index, value) pairs.
    for idx, text in group[col].items():
        minhash = MinHash(num_perm=num_perm)
        for gram in ngrams(text, 3):
            minhash.update("".join(gram).encode('utf-8'))
        index = group.loc[idx, 'productId']
        lsh.insert(key=index, minhash=minhash)
        minhashes[index] = minhash
    return lsh, minhashes

Example #30

0

Show file

 def reset(self, domain):
     """Start over with a fresh index, frequency table and statistics.

     Any bootstrap data stored for ``domain`` is replayed into the new
     index, keyed by its 1-based position converted to a string.
     """
     self.lsh = MinHashLSH(threshold=self.threshold,
                           num_perm=self.permutations)
     self.freq_ps = {}  # type: Dict[str, PData]
     self.num_dup = 0
     # Fall back to BOOTSTRAP_TUPLE when nothing is stored for the domain
     bootstrap_entry = self.bootstrap.get(domain, self.BOOTSTRAP_TUPLE)
     _unused, doc_count, seed_pdatas = bootstrap_entry
     self.stats = CollectStats(domains=1, docs=doc_count)
     for seq_no, pdata in enumerate(seed_pdatas, start=1):
         pdata_key = str(seq_no)
         self.lsh.insert(pdata_key, pdata.minhash)
         self.freq_ps[pdata_key] = pdata

Example #31

0

Show file

File: lsh_benchmark.py Project: Amano-Ginji/datasketch

def benchmark_lsh(threshold, index_data, query_data):
    """Build an LSH index and time every query against it.

    Args:
        threshold: Jaccard threshold for the ``MinHashLSH`` index.
        index_data: object with parallel ``filenames`` and ``minhashes``
            sequences; the permutation count is taken from the first
            MinHash's ``hashvalues``.
        query_data: object with a ``minhashes`` sequence to query with.

    Returns:
        ``(times, results)``: per-query durations in seconds and the
        corresponding query results, in query order.
    """
    print("Building LSH index")
    num_perm = len(index_data.minhashes[0].hashvalues)
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.filenames, index_data.minhashes):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for minhash in query_data.minhashes:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # high-resolution timer intended for measuring short durations.
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(result)
    return times, results

Example #32

0

Show file

File: analyze_possible_duplicates.py Project: barravi/undercrawler

def analyze_file(name, f, verbose=False):
    """Print one tab-separated summary line of duplicate statistics for a crawl.

    The line holds, after the left-justified ``name``: number of items,
    number of distinct URLs, number of distinct netloc+path combinations,
    and the value returned by ``n_unique`` for the detected duplicates.
    """
    Doc = namedtuple('Doc', ['item', 'min_hash'])
    seen_urls = []
    documents = {}  # key -> Doc
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common = get_too_common_shingles(f, name, limit=300)
    for position, record in enumerate(item_reader(f, name)):
        url = record['url']
        seen_urls.append(url)
        signature = get_min_hash(record['extracted_text'], too_common)
        key = 'item_{}'.format(position)
        documents[key] = Doc({'url': url}, signature)
        # Replace rather than duplicate an existing entry with the same key
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, signature)

    paths = []
    for url in seen_urls:
        parts = urlsplit(url)
        paths.append(''.join([parts.netloc, parts.path]))

    duplicates = get_duplicates(lsh, documents, verbose=verbose)
    label = name.ljust(40)
    columns = [
        len(seen_urls),
        len(set(seen_urls)),
        len(set(paths)),
        n_unique(documents, duplicates),
    ]
    print(label, '\t'.join(map(str, columns)))

Example #33

0

Show file

File: NearDuplicate.py Project: malberich/pgds-etl-filters

 def load(self):
     """Loads the stored model data from previous runs.

     Restores ``self.lsh`` from the pickle named after the language,
     permutation count and threshold if that file exists in the working
     directory; otherwise starts with an empty ``MinHashLSH`` index.
     """
     # The existence check must use exactly the name that is opened below;
     # the previous check omitted the permutation count and therefore
     # looked for a different file than the one being loaded.
     model_path = './minhash-%s--%d-%.2f.pkl' % (
         self.lang,
         self.permutations,
         self.threshold
     )
     if os.path.isfile(model_path):
         # NOTE: unpickling executes arbitrary code; only load model files
         # this process (or a trusted one) wrote.
         with open(model_path, 'rb') as model_file:
             self.lsh = pickle.load(model_file)
     else:
         self.lsh = MinHashLSH(
             threshold=self.threshold,
             num_perm=self.permutations
         )

Example #34

0

Show file

File: analyze_possible_duplicates.py Project: barravi/undercrawler

def learn_duplicates(name, f, verbose=False):
    """Replay a crawl and report how well DupePredictor anticipates duplicates.

    For every item the predictor's duplicate probability is recorded as the
    prediction. The ground truth is whether duplicates were actually found:
    via ``update_model`` when the probability is below the threshold, or via
    a separate MinHashLSH index otherwise. Precision and recall are printed
    every 100 items and once more at the end.
    """
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        record['extracted_text']
        for record in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        # Tally the confusion counts in a single pass; a probability equal
        # to the threshold counts towards none of them.
        tp = fp = fn = 0
        for prob, is_dup in zip(y_pred, y_true):
            if prob > threshold:
                if is_dup:
                    tp += 1
                else:
                    fp += 1
            elif prob < threshold and is_dup:
                fn += 1
        n_dup = tp + fn
        precision = tp / (tp + fp) if tp else 0.
        recall = tp / n_dup if n_dup else 0.
        print('precision: %.3f, recall %.3f at %.2f threshold '
                '(%d duplicates)' % (precision, recall, threshold, n_dup))

    for i, item in enumerate(item_reader(f, name)):
        url = item['url']
        dupe_prob = dupe_predictor.get_dupe_prob(url)
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        predicted_unique = dupe_prob < threshold
        if predicted_unique:
            found = dupe_predictor.update_model(url, item['extracted_text'])
            duplicates = [dup_url for dup_url, _ in found]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))
        key = canonicalize_url(url)
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))
        if verbose:
            if duplicates and predicted_unique:
                path = _full_path(url)
                same_path = [dup for dup in duplicates
                             if _full_path(dup) == path]
                sample = same_path or duplicates
                print('false negative %s (%s, %d more)' % (
                    url, sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', url)
        if i % 100 == 0:
            _report_pr()
    _report_pr()

Example #35

0

Show file

File: NearDuplicate.py Project: malberich/pgds-etl-filters

class NearDuplicate(EtlProcessor):
    """A class that acts over the raw tweets collected from the twitter stream
       in order to detect whether the tweet is duplicate, near-duplicate or
       nothing at all.

       Each tweet text is MinHashed and looked up in a MinHashLSH index.
       Tweets without a similar predecessor are forwarded through the
       connector and added to the index; the index is pickled to the working
       directory every 1000 processed messages."""

    # Matches one of . , ; : immediately followed by the literal characters
    # backslash + "xe2" (case-insensitively).
    punct = re.compile(r"[\.,;:]\\xe2", re.IGNORECASE)

    langs = {
        "es": "spanish",
        "en": "english"
    }

    # Number of messages handled so far; drives the periodic save in listen()
    process_count = 0

    def __init__(
        self,
        connector=None,
        lang='en',
        threshold=0.8,
        permutations=90,
        autostart=True
    ):
        """Configure the filter and, when ``autostart`` is true, load any
        stored index and start consuming messages immediately.

        Args:
            connector: passed through to ``EtlProcessor.__init__``.
            lang: language tag used in the model file name.
            threshold: Jaccard threshold of the LSH index.
            permutations: number of MinHash permutations.
            autostart: call ``load()`` and ``listen()`` from the constructor.
        """
        self.permutations = permutations
        self.threshold = threshold
        self.lang = lang
        self.connector = None
        self.lsh = None

        EtlProcessor.__init__(self, connector=connector, autostart=autostart)
        if autostart:
            self.load()
            self.listen()

    def listen(self):
        """Consume messages from the connector and forward the unique tweets.

        A tweet judged unique by ``is_unique`` is re-sent unchanged and a log
        record with its id, source and destination topics is written. If the
        check raises ``ValueError`` an error record is sent instead. The
        index is saved after every 1000 processed messages, whatever the
        outcome of each one.
        """
        for msg in self.connector.listen():
            tweet = json.loads(msg.value())
            try:
                if self.is_unique(tweet):
                    self.connector.send(
                        msg.value()
                    )
                    self.connector.log(
                        json.dumps({
                            "id_str": tweet['id_str'],
                            "source": self.connector.consumer_topic,
                            "dest": self.connector.producer_topic
                        })
                    )
            except ValueError:
                self.connector.send(
                    json.dumps({
                        "id_str": tweet['id_str'],
                        "source": self.connector.consumer_topic,
                        "dest": "error"
                    })
                )
                continue
            finally:
                # Runs for every message, including the error path above
                self.process_count += 1
                if self.process_count % 1000 == 0:
                    self.save()

    def _model_path(self):
        """Return the pickle path for this language/permutations/threshold."""
        return './minhash-%s--%d-%.2f.pkl' % (
            self.lang,
            self.permutations,
            self.threshold
        )

    def load(self):
        """Loads the stored model data from previous runs, or creates an
        empty index when no stored model exists."""
        # Check for the same file that save() writes; the previous check
        # omitted the permutation count and so never matched a saved model.
        model_path = self._model_path()
        if os.path.isfile(model_path):
            # NOTE: unpickling executes arbitrary code; only load model
            # files written by a trusted process.
            with open(model_path, 'rb') as model_file:
                self.lsh = pickle.load(model_file)
        else:
            self.lsh = MinHashLSH(
                threshold=self.threshold,
                num_perm=self.permutations
            )

    def save(self):
        """Stores the currently processed data for this model"""
        # The context manager flushes and closes the file deterministically
        with open(self._model_path(), 'wb+') as model_file:
            pickle.dump(self.lsh, model_file)

    def replace_urls(self, tweet):
        """Convenience function that replaces the compressed URLs by
        their expanded counterparts, in order to treat the same real URL
        as it is (and not obfuscating the same URL in different tweets by
        a different t.co link).

        The tweet dict is modified in place and also returned."""
        removed_characters = 0
        if 'entities' in tweet and 'urls' in tweet['entities']:
            # NOTE(review): the "- 1" shifts both cut points one character
            # to the left, and a URL whose start index is 0 produces a slice
            # end of -1. Confirm this against the index convention of the
            # incoming tweets before relying on the exact text produced.
            for url in tweet['entities']['urls']:
                tweet['text'] = tweet['text'][:(url['indices'][0] - removed_characters - 1)] + \
                    tweet['text'][(url['indices'][1] - removed_characters - 1):]
                removed_characters += url['indices'][1] - url['indices'][0]
            for url in tweet['entities']['urls']:
                tweet['text'] += ' ' + url['expanded_url']
        return tweet

    # maxsize must be an int: lru_cache rejects a float such as 1e06.
    # NOTE: caching a method keys on (self, tweet_text) and keeps the
    # instance referenced for the lifetime of the cache.
    @lru_cache(maxsize=1000000)
    def minhash_tweet(self, tweet_text):
        """Minhashing operation that allows for a caching of up to
        1M tweets in order to speed up the checking procedure when it's
        the same tweet text"""
        tweet_hash = MinHash(num_perm=self.permutations)
        for word in tweet_text.split():
            # Strip punctuation on the str first, then encode: a str pattern
            # cannot be applied to bytes on Python 3.
            tweet_hash.update(
                self.punct.sub("", word).encode('utf8')
            )
        return tweet_hash

    def is_unique(self, tweet):
        """Core method to check whether this tweet resembles enough to other previous
        tweets to label it as unique or near-duplicate.

        A unique tweet is inserted into the index under its ``id_str``. For a
        near-duplicate, ``tweet['minteressa']['nearduplicates']`` is
        incremented by one (once per call, not once per similar tweet).

        Returns:
            True when the tweet was added to the index as unique, False when
            similar tweets exist or the index rejected the insertion.
        """
        urlfied_tweet = self.replace_urls(tweet)
        mht = self.minhash_tweet(
            urlfied_tweet['text']
        )
        if 'minteressa' not in tweet:
            tweet['minteressa'] = {}

        if self.lsh.is_empty():
            # Nothing to compare against yet: the first tweet is unique
            self.lsh.insert(
                tweet['id_str'],
                mht
            )
            return True

        similars = self.lsh.query(mht)
        if similars:
            annotations = tweet['minteressa']
            annotations['nearduplicates'] = \
                annotations.get('nearduplicates', 0) + 1
            return False

        # It's a unique tweet
        try:
            self.lsh.insert(
                tweet['id_str'],
                mht
            )
        except ValueError as err:
            # Log the actual failure rather than the exception class
            logging.error(
                "could not index tweet %s: %s", tweet['id_str'], err
            )
            return False
        return True

Example #36

0

Show file

File: crawl_stats.py Project: barravi/undercrawler

def print_stats(
        f, show=None, skip_unique=False, max_int_value=5, duration_limit=None,
        print_duplicates=False, print_urls=False, limit=None):
    """Tally and print per-key statistics for the items read from ``f``.

    Counts content types, stored documents, items, extracted metadata keys
    (integer and list-length values are folded into the key, capped at
    ``max_int_value``), form types and form fields. Unless ``skip_unique``
    is set, items with no near-duplicate among the earlier ones are counted
    as ``unique_items``.

    Reading stops early once the timestamp span divided by 1000 exceeds
    ``duration_limit``. Returns the ``Counter`` after printing it sorted.
    """
    stats = Counter()
    check_unique = not skip_unique
    if check_unique:
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = max_timestamp = None
    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])

        content_type = item.get('content_type', 'missing')
        stats.update((
            'content_type: ' + content_type,
            'content_type[0]: ' + content_type.split('/')[0],
        ))

        timestamp = item['timestamp']
        if min_timestamp is None:
            min_timestamp = timestamp
        max_timestamp = timestamp
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break

        if 'extracted_text' not in item:
            # Items without text are expected to be stored documents
            assert item['obj_stored_url']
            stats['documents'] += 1
            continue
        stats['items'] += 1

        for key, value in item['extracted_metadata'].items():
            if key == 'forms':
                for form in value:
                    stats['form_{}'.format(form['form'])] += 1
                    field_keys = ['form_field {}'.format(field)
                                  for field in form['fields'].values()]
                    stats.update(field_keys)
            if isinstance(value, list):
                value = len(value)
            if isinstance(value, int) and not isinstance(value, bool):
                # Fold the (capped) number into the key, e.g. "key_3", "key_5+"
                if value >= max_int_value:
                    value = '{}+'.format(max_int_value)
                key = '{}_{}'.format(key, value)
            if value:
                stats[key] += 1
                if key == show:
                    print(item['url'])

        if check_unique:
            min_hash = get_min_hash(item['extracted_text'], too_common)
            duplicates = lsh.query(min_hash)
            if not duplicates:
                stats['unique_items'] += 1
            elif print_duplicates:
                shown = ' '.join(urls[k] for k in duplicates[:10])
                print('{} {} duplicates: {}'.format(
                    item['url'], len(duplicates), shown))
            key = 'item_{}'.format(i)
            lsh.insert(key, min_hash)
            urls[key] = item['url']

    if max_timestamp and min_timestamp:
        stats['duration'] = (max_timestamp - min_timestamp) / 1000
    for stat_name, count in sorted(stats.items()):
        print(stat_name.ljust(20), count)
    return stats

Example #37

0

Show file

File: findFrequentShingle.py Project: MorPhingG/Duplication_Detection

        allshingle.append(''.join(shingle[i][j]).split())

# Create one MinHash object per shingle list and feed it every shingle
m = []
for shingles in allshingle:
    minhash = MinHash(num_perm=128)
    for d in shingles:
        minhash.update(d.encode('utf8'))
    m.append(minhash)


# Create a MinHashLSH index with Jaccard threshold 1
# that accepts MinHash objects with 128 permutation functions
lsh = MinHashLSH(threshold=1, num_perm=128)

# Insert m into the index
for i, minhash in enumerate(m):
    lsh.insert("m%d" % i, minhash)

# Collect the query results of every shingle list that has more than
# 100 matches in the index; each MinHash is queried only once
result = []
for minhash in m:
    matches = lsh.query(minhash)
    if len(matches) > 100:
        result.append(matches)

#Find the frequency of the shingle
index = []
for i in range(0,result.__len__()):
    tem = len(result[i])

Example #38

0

Show file

File: db_organize.py Project: fake-name/wlnupdates

def minhash_merger_series(interactive=True):
	"""Report alternate series names that look like names of another series.

	Every alternate name of at least 5 characters is MinHashed (in worker
	processes) and indexed; each name is then queried against the index and
	matches belonging to a different series are printed. When not
	interactive, the match log is saved to a JSON file at the end.
	"""

	matchlogger = MatchLogBuilder()
	callback = askuser_callback_series if interactive else matchlogger.add_match_series

	print("fetching series")
	with app.app_context():
		items = models.Series.query.options(
			joinedload(Series.alternatenames)
			).all()
		# (altname id, series id, cleaned name, series title)
		altn = [
			(name.id, name.series, name.cleanname, item.title)
			for item in items
			for name in item.alternatenames
		]

	print("Building mapping dictionaries")
	# Map altname id to series id
	altnid_sid_dict  = {row[0]: row[1] for row in altn}
	altnid_name_dict = {row[0]: row[2] for row in altn}
	sid_sname_dict   = {row[1]: row[3] for row in altn}

	sid_altnid_dict = {}
	for nid, sid in altnid_sid_dict.items():
		sid_altnid_dict.setdefault(sid, []).append(nid)

	print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

	perms = 512
	gram_sz = 3
	minhashes = {}
	lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)

	print("Building lsh minhash data structure")
	with ProcessPoolExecutor(max_workers=8) as ex:
		print("Submitting jobs")
		futures = []
		for key, content in altnid_name_dict.items():
			# Names shorter than 5 characters are not hashed
			if len(content) >= 5:
				futures.append((key, ex.submit(minhash_str, content, perms, gram_sz)))

		print("Consuming futures")
		for key, future in tqdm.tqdm(futures):
			minhash = future.result()
			lsh.insert(key, minhash)
			minhashes[key] = minhash

	print("Doing search")

	for key, minhash in minhashes.items():
		result = lsh.query(minhash)
		# A name always matches itself; drop that hit
		if key in result:
			result.remove(key)
		if not result:
			continue

		sid = altnid_sid_dict[result[0]]
		src_sid = altnid_sid_dict[key]
		if sid == src_sid:
			continue

		sname = sid_sname_dict[sid]
		res_sids = {altnid_sid_dict[tmp] for tmp in result}
		names = [
			(altnid_sid_dict[res_id], res_id, altnid_name_dict[res_id])
			for res_id in result
			if altnid_sid_dict[res_id] != src_sid
		]
		if not names:
			continue

		names.sort()
		print("Search returned %s results in %s series for %s:%s" % (len(result), len(res_sids), src_sid, sname))
		for sid, nid, name in names:
			print("	%s -> %s: %s" % (str(sid).rjust(8), str(nid).rjust(8), name))

	if not interactive:
		matchlogger.save_log("./seriesname-matchset-minhash.json")

Example #39

0

Show file

File: generate.py Project: herp-a-derp/tob2

	def consolidate_dupes(self, agg_files):
		"""Drop unusable and near-duplicate files from ``agg_files`` in place.

		Files without ``'content_text'`` or with fewer than 100 characters
		are removed first. The remaining texts are MinHashed in worker
		processes and indexed; then, going from the shortest text to the
		longest, a file is removed whenever the index still reports a
		similar file that has not itself been removed.

		Returns the same ``agg_files`` mapping.
		"""
		# Remove short items
		for key, value in agg_files.items():
			files = value['files']
			# Iterate over a snapshot of the keys since entries are popped
			for fkey in list(files):
				entry = files[fkey]
				# print("File params: ", entry.keys())
				if 'content_text' not in entry:
					print("Missing file:", key, fkey)
					files.pop(fkey)
				elif len(entry['content_text']) < 100:
					print("Removing short file: ", (key, fkey))
					files.pop(fkey)

		# (aggregate key, file key) -> text of every surviving file
		smap = {
			(key, fkey): fdata['content_text']
			for key, value in agg_files.items()
			for fkey, fdata in value['files'].items()
		}

		perms = 512
		gram_sz = 10
		thresh = 0.5
		lsh = MinHashLSH(threshold=thresh, num_perm=perms)

		print("Loading word hashes")
		minhashes = {}

		with ProcessPoolExecutor(max_workers=10) as ex:
			print("Submitting jobs")
			futures = []
			for key, content in smap.items():
				futures.append((key, ex.submit(minhash_str, content, perms, gram_sz)))
			print("Submitted %s jobs. Consuming futures" % len(futures))
			for key, future in tqdm.tqdm(futures, "Hashing"):
				minhash = future.result()
				lsh.insert(key, minhash)
				minhashes[key] = minhash

		# Group the keys by text length so shorter texts are examined first
		lens = {}
		for key, content in smap.items():
			lens.setdefault(len(content), []).append(key)
		lenl = sorted(lens)

		print("%s items in file map before dupe elimination" % len(smap))

		for clen in lenl:
			for key in lens[clen]:
				if key not in smap or key not in minhashes:
					continue

				result = lsh.query(minhashes[key])
				if key in result:
					result.remove(key)
				# Only matches that have not been removed themselves count
				if any(tmp in smap for tmp in result):
					smap.pop(key)
					akey, fkey = key
					agg_files[akey]['files'].pop(fkey)

		print("%s items in file map after dupe elimination" % len(smap))

		return agg_files