def candidate_duplicates(document_feed, char_ngram=5, seeds=100, bands=10, hashbytes=4):
    hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=hashbytes)
    if seeds % bands != 0:
        raise ValueError(
            'Seeds has to be a multiple of bands. {} % {} != 0'.format(seeds, bands))

    lshcache = cache.Cache(num_bands=bands, hasher=hasher)
    for i_line, line in enumerate(document_feed):
        line = line.decode('utf8')
        docid, headline_text = line.split('\t', 1)
        fingerprint = hasher.fingerprint(headline_text.encode('utf8'))
        # in addition to storing the fingerprint, store the line number
        # and document ID to help analysis later on
        lshcache.add_fingerprint(fingerprint, doc_id=(i_line, docid))

    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)

    return candidate_pairs
def clean_duplicates(text_df, text_col='body_text', method=100, char_ngram=5,
                     seeds=100, bands=5, hashbytes=4, thresh=.9):
    start = len(text_df)
    if method == 'latlon':
        # must have 'latitude', 'longitude', 'price' columns to use this method
        text_df = text_df.drop_duplicates(['latitude', 'longitude', 'price'])
    if isinstance(method, int):
        text_df['body_100'] = text_df[text_col].str.slice(stop=method).copy()
        text_df = text_df.drop_duplicates(subset='body_100').drop('body_100', axis=1)
    # use LSH minhash to clean dupes; requires LSH and minhash3 to run!!!
    if method == 'lsh':
        try:
            text_df = text_df.sort_values('listingDate').reset_index()
        except KeyError:
            try:
                module_logger.info(
                    'text_df has insufficient date information, dropping by month/day')
                text_df = text_df.sort_values(
                    ['scraped_month', 'scraped_day']).reset_index()
            except KeyError:
                module_logger.info(
                    'text_df has insufficient date information, dropping by index')
                text_df = text_df.reset_index()

        # first get candidate_pairs
        candidate_pairs = candidate_duplicates(text_df, text_col, char_ngram,
                                               seeds, bands, hashbytes)
        module_logger.info("Found " + str(len(candidate_pairs)) + " possible duplicate pairs")

        # then make sure the Jaccard similarity of each candidate pair is above thresh
        lines = text_df[text_col].str.lower().values
        similarities = []
        for (line_a, line_b) in candidate_pairs:
            doc_a, doc_b = lines[line_a], lines[line_b]
            similarities.append((line_a, line_b, jaccard_sim(doc_a, doc_b, char_ngram)))
            if len(similarities) % 10000 == 0:
                module_logger.info("Processed " + str(len(similarities)) + " possible duplicates")

        # keep only pairs whose Jaccard similarity is at least thresh and drop the
        # older row of each pair (the data is sorted ascending by date, so min index = older)
        drop_list = [min(pair[0], pair[1]) for pair in similarities if pair[2] >= thresh]
        text_df = text_df.drop(set(drop_list)).set_index("index")

    dupes = start - len(text_df)
    module_logger.info("Dropped " + str(dupes) + " duplicates")
    return text_df
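# clean_duplicates above relies on a jaccard_sim helper that is not defined in this file.
# A minimal sketch of what it might look like, assuming it computes Jaccard similarity over
# character n-gram shingles of two already-lowercased strings (the original implementation
# may differ):
def jaccard_sim(doc_a, doc_b, char_ngram=5):
    # build the set of character shingles for each document
    shingles_a = {doc_a[i:i + char_ngram] for i in range(len(doc_a) - char_ngram + 1)}
    shingles_b = {doc_b[i:i + char_ngram] for i in range(len(doc_b) - char_ngram + 1)}
    if not shingles_a and not shingles_b:
        return 1.0  # treat two empty/too-short documents as identical
    if not shingles_a or not shingles_b:
        return 0.0
    return len(shingles_a & shingles_b) / len(shingles_a | shingles_b)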
def _get_candidate_pairs(self, df):
    hasher = minhash.MinHasher(seeds=self.seeds, char_ngram=self.char_ngram, hashbytes=4)
    lshcache = cache.Cache(num_bands=self.bands, hasher=hasher)
    df['fingerprint'] = df['text'].apply(lambda t: hasher.fingerprint(t))
    df.apply(lambda f: lshcache.add_fingerprint(f['fingerprint'], doc_id=f.name), axis=1)

    communities = []
    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs = set(itertools.combinations(b[bucket_id], r=2))
                # communities.append(self.find_community(df, pairs))
                candidate_pairs.update(pairs)
    return candidate_pairs
def candidate_duplicates(text_df, text_col, char_ngram=5, seeds=100, bands=5, hashbytes=4):
    hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=hashbytes)
    if seeds % bands != 0:
        raise ValueError(
            'Seeds has to be a multiple of bands. {} % {} != 0'.format(seeds, bands))

    lshcache = cache.Cache(num_bands=bands, hasher=hasher)
    for i in range(len(text_df)):
        line = text_df[text_col].iloc[i]
        lshcache.add_fingerprint(hasher.fingerprint(line), doc_id=i)

    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)

    return candidate_pairs
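# A usage sketch for the DataFrame variant above; the frame, its 'body_text' column and the
# sample rows are made up for illustration. Note that depending on the installed lsh version,
# MinHasher.fingerprint may require bytes rather than str, in which case the text column
# should be encoded to utf-8 first.
import pandas as pd

toy_df = pd.DataFrame({'body_text': [
    'spacious two bedroom apartment close to the station',
    'spacious 2 bedroom apartment close to the station',
    'quiet studio flat with garden access',
]})
pairs = candidate_duplicates(toy_df, 'body_text', char_ngram=5, seeds=100, bands=5)
# pairs is a set of (row_position, row_position) tuples; the two near-identical listings
# are likely (though not guaranteed, since LSH is probabilistic) to appear as a candidate pair.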
hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
lines = []
start_time = time.time()
with open('dataset.txt', 'rb') as fh:
    # read the first 10000 lines into memory so we can compare them
    for line in itertools.islice(fh, 10000):
        lines.append(line.decode('utf8'))

    # reset file pointer and do LSH
    fh.seek(0)
    feed = itertools.islice(fh, 10000)
    candidates = candidate_duplicates(feed, char_ngram=5, seeds=100, bands=10, hashbytes=4)
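# A possible continuation of the script above (a sketch, not part of the original): verify
# each LSH candidate with an exact character-shingle Jaccard similarity computed over the
# lines that were read into memory.
def shingles(text, char_ngram=5):
    return {text[i:i + char_ngram] for i in range(len(text) - char_ngram + 1)}

similarities = []
for (line_a, docid_a), (line_b, docid_b) in candidates:
    # each line is 'docid<TAB>headline', so split off the id before shingling
    text_a = lines[line_a].split('\t', 1)[1]
    text_b = lines[line_b].split('\t', 1)[1]
    sh_a, sh_b = shingles(text_a), shingles(text_b)
    similarities.append(((docid_a, docid_b), len(sh_a & sh_b) / len(sh_a | sh_b)))

print('LSH took {:.2f}s and produced {} candidate pairs'.format(
    time.time() - start_time, len(candidates)))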
def find_and_store_duplicate_syllabi(grid_name, year, field_name):
    global stop
    try:
        # connect to existing database
        conn = psycopg2.connect(
            "dbname='litindex' user='******' host='0.0.0.0' password='******'")
        # Open a cursor to perform database operations
        cur = conn.cursor()

        param_list = [grid_name, year, field_name]
        select_query = ("SELECT id, text_md5, text from open_syllabi "
                        "where grid_name='{}' and year='{}' and field_name='{}'"
                        ).format(*param_list)  # unpack the list
        cur.execute(select_query)
        df = pd.DataFrame(cur.fetchall(), columns=['id', 'text_md5', 'text'])
        print("\tNO OF RECORDS = {}".format(len(df)))

        punctuation_translator = str.maketrans('', '', string.punctuation)

        # PRE-PROCESSING REQUIRED:
        # normalize by lowering the case, removing punctuation, numbers and English stop words
        df['text_lower_case_words'] = df['text'].apply(lambda x: ' '.join(
            [word for word in x.lower().translate(punctuation_translator).split()
             if not word.isdigit() and word not in stop]))

        # the following pre-processing is required to improve the quality of LSH results,
        # especially considering the highly templated text in course descriptions
        df['text_unique_words'] = df['text'].apply(lambda x: ' '.join(
            [word for word in list(set(x.lower().translate(punctuation_translator).split()))
             if not word.isdigit() and word not in stop]))
        common_words_series = pd.Series(
            ' '.join(df['text_unique_words']).lower().strip(string.punctuation).split()
        ).value_counts()
        most_common_words_series = common_words_series[common_words_series > (0.5 * len(df))].dropna()
        most_common_words_list = most_common_words_series.index.tolist()
        df['text_without_common_words'] = df['text'].apply(lambda x: ' '.join(
            [word for word in x.lower().translate(punctuation_translator).split()
             if word not in most_common_words_list and word not in stop]))

        # STEP 1: use the LSH algorithm to find candidate duplicates
        # run through adding documents to the LSH cache
        hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
        lshcache = cache.Cache(num_bands=10, hasher=hasher)
        for idx in range(len(df)):
            lshcache.add_fingerprint(
                hasher.fingerprint(df.loc[idx, 'text_without_common_words']),
                df.loc[idx, 'id'])

        # for every bucket in the LSH cache get the candidate duplicates;
        # this is a fast way to get candidate pairs with reasonable accuracy, filtered later
        candidate_pairs = set()
        for b in lshcache.bins:
            for bucket_id in b:
                if len(b[bucket_id]) > 1:  # the bucket contains more than a single document
                    pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                    candidate_pairs.update(pairs_)
        list_candidate_pairs = list(candidate_pairs)
        tsl = []
        # df = df.set_index('id')
        print("\tcandidate pairs found = {}".format(len(list_candidate_pairs)))

        # STEP 2: use TF-IDF to process the records associated with the candidate duplicates
        # and generate signature text
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english')
        tfidf_matrix = tf.fit_transform(df['text_lower_case_words'])
        feature_names = tf.get_feature_names()
        dense = tfidf_matrix.todense()
        for item in list_candidate_pairs:
            idx1 = df.index[df['id'] == int(item[0])]
            idx2 = df.index[df['id'] == int(item[1])]
            episode1 = dense[idx1].tolist()[0]
            episode2 = dense[idx2].tolist()[0]
            phrase_scores1 = [pair for pair in zip(range(0, len(episode1)), episode1) if pair[1] > 0]
            sorted_phrase_scores1 = sorted(phrase_scores1, key=lambda t: t[1] * -1)
            phrase_scores2 = [pair for pair in zip(range(0, len(episode2)), episode2) if pair[1] > 0]
            sorted_phrase_scores2 = sorted(phrase_scores2, key=lambda t: t[1] * -1)

            list_summarized_text1 = []
            list_summarized_text2 = []
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores1][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text1.append(phrase)
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores2][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text2.append(phrase)
            summarized_text1 = ' '.join(list_summarized_text1)
            summarized_text2 = ' '.join(list_summarized_text2)

            # STEP 3: apply a fuzzy match to the two signature texts to generate an accuracy score
            fuzz_ratio = fuzz.token_set_ratio(summarized_text1, summarized_text2)
            tsl.append((grid_name, field_name, int(year), int(item[0]), int(item[1]),
                        summarized_text1, summarized_text2, fuzz_ratio))

        insert_duplicate_pairs(tsl)
        df = df.set_index('id')
        return df
    except Exception as e:
        if conn:
            conn.rollback()
        # print("Unexpected error:", sys.exc_info()[0])
        print(e)
        sys.exit(1)
    finally:
        # Close communication with the database
        if cur:
            cur.close()
        if conn:
            conn.close()
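# find_and_store_duplicate_syllabi calls insert_duplicate_pairs, which is not shown here.
# A minimal sketch under the assumption that it bulk-inserts the (grid_name, field_name,
# year, id1, id2, summary1, summary2, fuzz_ratio) tuples into a Postgres table; the table
# name 'duplicate_syllabi_pairs' and its column names are assumptions, not the original schema:
def insert_duplicate_pairs(tsl):
    conn = psycopg2.connect(
        "dbname='litindex' user='******' host='0.0.0.0' password='******'")
    try:
        cur = conn.cursor()
        cur.executemany(
            """INSERT INTO duplicate_syllabi_pairs
               (grid_name, field_name, year, id1, id2, summary1, summary2, fuzz_ratio)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""",
            tsl)
        conn.commit()
    finally:
        cur.close()
        conn.close()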
from api_client import APIclient
from lsh import minhash  # mattilyra/lsh provides the MinHasher used below
import pandas as pd
import community
import networkx as nx
import numpy as np
from collections import defaultdict
import pickle

char_ngram = 4
bands = 20
seeds = 100
jaccard_min = 0.7
jaccard_max = 0.95

api_client = APIclient()
hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=4)


def generate_shingles(text):
    # one shingle starting at each position up to len(text) - char_ngram
    return set(text[head:head + char_ngram]
               for head in range(0, len(text) - char_ngram + 1))


def jaccard(set_a, set_b):
    intersection = set_a & set_b
    union = set_a | set_b
    return len(intersection) / len(union)


def clean_text(df):
def lsh_blocking(lhs_table, rhs_table, hashing_col_position, id_position, id_names,
                 char_ngram=5, seeds=100, bands=5):
    '''
    LSH blocking for candidate pair generation between two tables.

    References:
        https://www.youtube.com/watch?v=n3dCcwWV4_k
        https://nbviewer.jupyter.org/github/mattilyra/LSH/blob/master/examples/Introduction.ipynb

    hashing_col_position: position of the column whose text is hashed
    bands: number of pieces or bins; the size of each bin is inferred

    Outputs: a DataFrame of candidate tuples.
    '''
    hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=4)
    if seeds % bands != 0:
        raise ValueError('Seeds has to be a multiple of bands. {} % {} != 0'.format(seeds, bands))

    # Print hashing information
    for _ in lhs_table.itertuples():
        print(f"Hashing column name in LHS table is: {lhs_table.columns[hashing_col_position]}, RHS: {rhs_table.columns[hashing_col_position]}")
        print(f"Id column name in LHS table is: {lhs_table.columns[id_position]}, RHS: {rhs_table.columns[id_position]}")
        break

    lshcache = cache.Cache(num_bands=bands, hasher=hasher)
    lshcache.clear()

    # NB! iterating over tuples puts the index in the FIRST position (adds a column at the
    # beginning), therefore the usual column position is shifted forward by 1.
    print("Adding fingerprints")
    for x in rhs_table.itertuples():
        # document_string = x[hashing_col_position[0]+1] + " " + str(x[hashing_col_position[1]+1])
        document_string = str(x[hashing_col_position + 1])
        # If the doc string is SHORTER than char_ngram the hasher throws an error with no
        # message, so pad it with spaces up to char_ngram
        if len(document_string) < char_ngram:
            document_string = document_string + " " * (char_ngram - len(document_string))
        docid = x[id_position + 1]
        # add the fingerprint for this entity to the collection
        # print(f"docid {docid}")
        lshcache.add_fingerprint(hasher.fingerprint(document_string.encode('utf8')), docid)

    for x in lhs_table.itertuples():
        # document_string = x[hashing_col_position[0]+1] + " " + str(x[hashing_col_position[1]+1])
        document_string = str(x[hashing_col_position + 1])
        if len(document_string) < char_ngram:
            document_string = document_string + " " * (char_ngram - len(document_string))
        docid = x[id_position + 1]
        lshcache.add_fingerprint(hasher.fingerprint(document_string.encode('utf8')), docid)

    print("Generating possible pairs")
    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)

    # Assign id_names for generating the DataFrame
    lhs_table = lhs_table.set_index(id_names[0])
    rhs_table = rhs_table.set_index(id_names[1])

    print("Pruning and re-arranging possible pair indices.")
    # Faster than iterating through all pairs
    candidate_indices = pd.Index(candidate_pairs)
    # Split indices into positions 1 and 2, then check where each side matches up
    candidate_indices_p1 = pd.Index([x[0] for x in candidate_indices])
    candidate_indices_p2 = pd.Index([x[1] for x in candidate_indices])

    # Central issue: within-table candidate pairs are returned by default, and sometimes the
    # lhs and rhs indices are swapped.
    # Check if the correct lhs + rhs alignment is met and store those indices
    candidate_pairs_correct_alignment_bool = ((candidate_indices_p1.isin(lhs_table.index)) &
                                              (candidate_indices_p2.isin(rhs_table.index)))
    correct_alignment_indices = pd.MultiIndex.from_arrays(
        [candidate_indices_p1[candidate_pairs_correct_alignment_bool],
         candidate_indices_p2[candidate_pairs_correct_alignment_bool]])

    # Now consider the case where the lhs index is in the SECOND position of the candidate
    # pair: these are VALID across-table pairs, the order has just been switched
    candidate_pairs_switched_alignment_bool = ((candidate_indices_p2.isin(lhs_table.index)) &
                                               (candidate_indices_p1.isin(rhs_table.index)))
    switched_alignment_indices = pd.MultiIndex.from_arrays(
        [candidate_indices_p2[candidate_pairs_switched_alignment_bool],
         candidate_indices_p1[candidate_pairs_switched_alignment_bool]])

    # Merge the two sets of indices together
    appropriate_indices = correct_alignment_indices.union(switched_alignment_indices)
    appropriate_indices.names = id_names

    candidate_pair_df = pd.concat(
        [lhs_table.loc[appropriate_indices.get_level_values(0)].reset_index(),
         rhs_table.loc[appropriate_indices.get_level_values(1)].reset_index()],
        axis=1)
    candidate_pair_df = candidate_pair_df.set_index(keys=id_names)

    # Remove instances where id_names contain null entries
    non_null_entries = ((~candidate_pair_df.index.get_level_values(0).isnull()) &
                        (~candidate_pair_df.index.get_level_values(1).isnull()))
    candidate_pair_df = candidate_pair_df.loc[non_null_entries, :]

    lshcache.clear()
    return candidate_pair_df
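# A usage sketch for lsh_blocking with two hypothetical toy tables; the column names, id
# values and sample rows below are illustrative only.
import pandas as pd

lhs = pd.DataFrame({'lhs_id': ['a1', 'a2'],
                    'title': ['iphone 12 pro max 256gb black',
                              'samsung galaxy s21 ultra 5g']})
rhs = pd.DataFrame({'rhs_id': ['b1', 'b2'],
                    'title': ['apple iphone 12 pro max 256 gb black',
                              'dell xps 13 laptop']})

# column 1 holds the text to hash, column 0 holds the record id in both tables
candidate_pair_df = lsh_blocking(lhs, rhs, hashing_col_position=1, id_position=0,
                                 id_names=['lhs_id', 'rhs_id'],
                                 char_ngram=5, seeds=100, bands=5)
# candidate_pair_df is indexed by (lhs_id, rhs_id) and holds the columns of both records for
# every candidate pair; with inputs this small it may well be empty, since LSH is probabilistic.
print(candidate_pair_df)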