Example #1
def candidate_duplicates(document_feed,
                         char_ngram=5,
                         seeds=100,
                         bands=10,
                         hashbytes=4):
    hasher = minhash.MinHasher(seeds=seeds,
                               char_ngram=char_ngram,
                               hashbytes=hashbytes)
    if seeds % bands != 0:
        raise ValueError(
            'Seeds has to be a multiple of bands. {} % {} != 0'.format(
                seeds, bands))

    lshcache = cache.Cache(num_bands=bands, hasher=hasher)
    for i_line, line in enumerate(document_feed):
        line = line.decode('utf8')
        docid, headline_text = line.split('\t', 1)
        fingerprint = hasher.fingerprint(headline_text.encode('utf8'))

        # in addition to storing the fingerprint store the line
        # number and document ID to help analysis later on
        lshcache.add_fingerprint(fingerprint, doc_id=(i_line, docid))

    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)

    return candidate_pairs
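A minimal way to exercise the function above (a sketch, not part of the original example). It assumes the lsh package's minhash and cache modules plus itertools are imported as the snippet requires, and feeds tab-separated byte lines of the form b'<docid>\t<headline>':

feed = [
    b'doc1\tstock markets rally after fed announcement',
    b'doc2\tstock markets rally following fed announcement',
    b'doc3\tlocal team wins championship game',
]
pairs = candidate_duplicates(feed, char_ngram=5, seeds=100, bands=10, hashbytes=4)
print(pairs)  # e.g. {((0, 'doc1'), (1, 'doc2'))} when the two near-duplicate headlines collide in a band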
Example #2
def clean_duplicates(text_df,
                     text_col='body_text',
                     method=100,
                     char_ngram=5,
                     seeds=100,
                     bands=5,
                     hashbytes=4,
                     thresh=.9):
    start = len(text_df)
    if method == 'latlon':
        # must have 'latitude', 'longitude', 'price' columns to use this method
        text_df = text_df.drop_duplicates(['latitude', 'longitude', 'price'])
    if isinstance(method, int):
        text_df['body_100'] = text_df[text_col].str.slice(stop=method).copy()
        text_df = text_df.drop_duplicates(subset='body_100').drop('body_100',
                                                                  axis=1)
    # use LSH/MinHash to clean dupes; requires the LSH and minhash3 packages to run
    if method == 'lsh':
        try:
            text_df = text_df.sort_values('listingDate').reset_index()
        except KeyError:
            try:
                module_logger.info(
                    'text_df has insufficient date information, dropping by month/day'
                )
                text_df = text_df.sort_values(['scraped_month',
                                               'scraped_day']).reset_index()
            except KeyError:
                module_logger.info(
                    'text_df has insufficient date information, dropping by index'
                )
                text_df = text_df.reset_index()
        # first get candidate_pairs
        candidate_pairs = candidate_duplicates(text_df, text_col, char_ngram,
                                               seeds, bands, hashbytes)
        module_logger.info("Found " + str(len(candidate_pairs)) +
                           " possible duplicate pairs")
        # then keep only pairs whose exact Jaccard similarity is at or above
        # the threshold (default .9)
        lines = text_df[text_col].str.lower().values
        similarities = []
        for (line_a, line_b) in candidate_pairs:
            doc_a, doc_b = lines[line_a], lines[line_b]
            similarities.append(
                (line_a, line_b, jaccard_sim(doc_a, doc_b, char_ngram)))
            if len(similarities) % 10000 == 0:
                module_logger.info("Processed " + str(len(similarities)) +
                                   " possible duplicates")
        # keep pairs with Jaccard at or above the threshold and drop the older
        # listing of each pair (the lower row index after the date sort)
        drop_list = [
            min(pair[0], pair[1]) for pair in similarities if pair[2] >= thresh
        ]
        text_df = text_df.drop(set(drop_list)).set_index("index")
        dupes = start - len(text_df)
        module_logger.info("Dropped " + str(dupes) + " duplicates")
    return text_df
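A small illustration of the integer-slice path above (a sketch; the toy rows and the body_text column name are made up for the example):

import pandas as pd

toy = pd.DataFrame({'body_text': [
    'spacious two bedroom apartment close to downtown, newly renovated',
    'spacious two bedroom apartment close to downtown, newly renovated',
    'cozy studio near the park',
]})
deduped = clean_duplicates(toy, text_col='body_text', method=100)
print(len(deduped))  # 2: the first two rows share the same first 100 characters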
Example #3
    def _get_candidate_pairs(self, df):
        hasher = minhash.MinHasher(seeds=self.seeds,
                                   char_ngram=self.char_ngram,
                                   hashbytes=4)
        lshcache = cache.Cache(num_bands=self.bands, hasher=hasher)
        df['fingerprint'] = df['text'].apply(lambda t: hasher.fingerprint(t))
        df.apply(lambda f: lshcache.add_fingerprint(f['fingerprint'],
                                                    doc_id=f.name),
                 axis=1)

        communities = []
        candidate_pairs = set()
        for b in lshcache.bins:
            for bucket_id in b:
                if len(b[bucket_id]) > 1:
                    pairs = set(itertools.combinations(b[bucket_id], r=2))
                    # communities.append(self.find_community(df, pairs))
                    candidate_pairs.update(pairs)
        return candidate_pairs
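The method above expects to be bound to an object that carries seeds, char_ngram and bands attributes and to receive a DataFrame with a 'text' column. A minimal host class might look like this (a sketch; the class name and defaults are illustrative, not from the original source):

class LSHDeduper:
    def __init__(self, seeds=100, char_ngram=5, bands=10):
        self.seeds = seeds
        self.char_ngram = char_ngram
        self.bands = bands

    # _get_candidate_pairs(self, df), as defined above, goes here

# deduper = LSHDeduper()
# pairs = deduper._get_candidate_pairs(df_with_text_column)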
Example #4
def candidate_duplicates(text_df,
                         text_col,
                         char_ngram=5,
                         seeds=100,
                         bands=5,
                         hashbytes=4):
    hasher = minhash.MinHasher(seeds=seeds,
                               char_ngram=char_ngram,
                               hashbytes=hashbytes)
    if seeds % bands != 0:
        raise ValueError(
            'Seeds has to be a multiple of bands. {} % {} != 0'.format(
                seeds, bands))
    lshcache = cache.Cache(num_bands=bands, hasher=hasher)
    for i in range(len(text_df)):
        line = text_df[text_col].iloc[i]
        lshcache.add_fingerprint(hasher.fingerprint(line), doc_id=i)
    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)
    return candidate_pairs
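A quick way to try the function above on a toy DataFrame (a sketch; the column name and rows are made up, and the texts are byte strings to match what hasher.fingerprint receives in the other examples on this page):

import pandas as pd

ads = pd.DataFrame({'body_text': [
    b'bright one bedroom flat with balcony and parking',
    b'bright one bedroom flat with balcony and parking',
    b'charming cottage in a quiet village',
]})
pairs = candidate_duplicates(ads, 'body_text', char_ngram=5, seeds=100, bands=5)
print(pairs)  # {(0, 1)}: identical texts get identical fingerprints, so they share every band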
Example #5
        # in addition to storing the fingerprint store the line
        # number and document ID to help analysis later on
        lshcache.add_fingerprint(fingerprint, doc_id=(i_line, docid))

    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)

    return candidate_pairs


hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)

lines = []
start_time = time.time()
with open('dataset.txt', 'rb') as fh:
    # read the first 10,000 lines into memory so we can compare them
    for line in itertools.islice(fh, 10000):
        lines.append(line.decode('utf8'))

    # reset file pointer and do LSH
    fh.seek(0)
    feed = itertools.islice(fh, 10000)
    candidates = candidate_duplicates(feed,
                                      char_ngram=5,
                                      seeds=100,
                                      bands=10,
                                      hashbytes=4)
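A possible continuation (a sketch, not part of the original snippet): verify the LSH candidates by computing exact Jaccard similarity over character shingles, assuming each line of dataset.txt has the '<docid>\t<headline>' layout used above.

def shingles(text, char_ngram=5):
    return set(text[head:head + char_ngram]
               for head in range(len(text) - char_ngram + 1))


def jaccard(set_a, set_b):
    return len(set_a & set_b) / len(set_a | set_b)


similarities = []
for (line_a, docid_a), (line_b, docid_b) in candidates:
    text_a = lines[line_a].split('\t', 1)[1]
    text_b = lines[line_b].split('\t', 1)[1]
    similarities.append((docid_a, docid_b, jaccard(shingles(text_a), shingles(text_b))))

print('LSH found {} candidate pairs in {:.2f}s'.format(len(candidates), time.time() - start_time))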
Example #6
def find_and_store_duplicate_syllabi(grid_name, year, field_name):
    global stop
    try:
        # connect to existing database
        conn = psycopg2.connect(
            "dbname='litindex' user='******' host='0.0.0.0' password='******'"
        )
        # Open a cursor to perform database operations
        cur = conn.cursor()

        param_list = [grid_name, year, field_name]
        select_query = "SELECT id, text_md5, text from open_syllabi where grid_name='{}' and year='{}' and field_name='{}'".format(
            *param_list)  #unpack the list
        cur.execute(select_query)
        df = pd.DataFrame(cur.fetchall(), columns=['id', 'text_md5', 'text'])
        print("\tNO OF RECORDS = {}", len(df))

        punctuation_translator = str.maketrans('', '', string.punctuation)

        # PRE-PROCESSING REQUIRED:
        # normalize by lowering the case, removing punctuations, removing numbers and english stop words
        df['text_lower_case_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in x.lower().translate(punctuation_translator).split(
            ) if not word.isdigit() and word not in stop
        ]))
        # the following pre-processing is required to improve quality of LSH results
        # especially considering highly templated text in course descriptions
        df['text_unique_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in list(
                set(x.lower().translate(punctuation_translator).split()))
            if not word.isdigit() and word not in stop
        ]))
        common_words_series = pd.Series(' '.join(
            df['text_unique_words']).lower().strip(
                string.punctuation).split()).value_counts()
        most_common_words_series = common_words_series[common_words_series > (
            0.5 * len(df))].dropna()
        most_common_words_list = most_common_words_series.index.tolist()
        df['text_without_common_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in x.lower().translate(punctuation_translator).split(
            ) if word not in (most_common_words_list) and word not in stop
        ]))

        # STEP 1: use LSH algorithm to find candidate duplicates
        # find duplicates
        # run through adding documents to the LSH cache
        hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
        lshcache = cache.Cache(num_bands=10, hasher=hasher)

        for idx in range(len(df)):
            lshcache.add_fingerprint(
                hasher.fingerprint(df.loc[idx, 'text_without_common_words']),
                df.loc[idx, 'id'])

        # for every bucket in the LSH cache get the candidate duplicates
        # note this fast way to get candidate pairs with reasonable accuracy, that will be filtered later
        candidate_pairs = set()
        for b in lshcache.bins:
            for bucket_id in b:
                if len(b[bucket_id]) > 1:  # the bucket contains more than a single document
                    pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                    candidate_pairs.update(pairs_)
        list_candidate_pairs = list(candidate_pairs)
        tsl = []
        # df = df.set_index('id')
        print("\tcandidate pairs found = {}", len(list_candidate_pairs))

        # STEP 2: use TFIDF to process the records associated with the candidate duplicates and generate signature text
        tf = TfidfVectorizer(analyzer='word',
                             ngram_range=(1, 1),
                             min_df=0,
                             stop_words='english')
        tfidf_matrix = tf.fit_transform(df['text_lower_case_words'])
        feature_names = tf.get_feature_names()
        dense = tfidf_matrix.todense()

        for item in list_candidate_pairs:
            idx1 = df.index[df['id'] == int(item[0])]
            idx2 = df.index[df['id'] == int(item[1])]
            episode1 = dense[idx1].tolist()[0]
            episode2 = dense[idx2].tolist()[0]
            phrase_scores1 = [
                pair for pair in zip(range(0, len(episode1)), episode1)
                if pair[1] > 0
            ]
            sorted_phrase_scores1 = sorted(phrase_scores1,
                                           key=lambda t: t[1] * -1)
            phrase_scores2 = [
                pair for pair in zip(range(0, len(episode2)), episode2)
                if pair[1] > 0
            ]
            sorted_phrase_scores2 = sorted(phrase_scores2,
                                           key=lambda t: t[1] * -1)
            list_summarized_text1 = []
            list_summarized_text2 = []
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores1
                                  ][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text1.append(phrase)
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores2
                                  ][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text2.append(phrase)

            summarized_text1 = ' '.join(list_summarized_text1)
            summarized_text2 = ' '.join(list_summarized_text2)
            # STEP 3: apply fuzzy match for the two signature texts to generate accuracy score
            fuzz_ratio = fuzz.token_set_ratio(summarized_text1,
                                              summarized_text2)
            tsl.append(
                (grid_name, field_name, int(year), int(item[0]), int(item[1]),
                 summarized_text1, summarized_text2, fuzz_ratio))
        # for item in list_candidate_pairs:
        insert_duplicate_pairs(tsl)

        df = df.set_index('id')
        return df
    except Exception as e:
        if conn:
            conn.rollback()
        # print("Unexpected error:", sys.exc_info()[0]])
        print(e)
        sys.exit(1)
    finally:
        # Close communication with the database
        if cur:
            cur.close()
        if conn:
            conn.close()
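One note on the SELECT near the top of this function: it formats the filter values directly into the SQL string. A parameterized variant over the same table and columns (a sketch) lets psycopg2 handle quoting and escaping:

select_query = ("SELECT id, text_md5, text FROM open_syllabi "
                "WHERE grid_name = %s AND year = %s AND field_name = %s")
cur.execute(select_query, (grid_name, year, field_name))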
Example #7
from api_client import APIclient
import itertools
import pandas as pd
import community
import networkx as nx
import numpy as np
from collections import defaultdict
import pickle

from lsh import cache, minhash  # provides the MinHasher and Cache used below

char_ngram = 4
bands = 20
seeds = 100
jaccard_min = 0.7
jaccard_max = 0.95
api_client = APIclient()

hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=4)


def generate_shingles(text):
    # all character n-grams of length char_ngram (the + 1 keeps the final shingle)
    return set(text[head:head + char_ngram]
               for head in range(len(text) - char_ngram + 1))


def jaccard(set_a, set_b):
    intersection = set_a & set_b
    union = set_a | set_b
    return len(intersection) / len(union)
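
A quick illustration of the two helpers above (a sketch; the strings are made up, and char_ngram is the module-level value of 4, so shingles are 4-character substrings):

a = generate_shingles('red brick house with garden')
b = generate_shingles('red brick house with a garden')
print(jaccard(a, b))  # about 0.8: the two strings share most of their 4-grams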


def lsh_blocking(lhs_table, rhs_table, hashing_col_position, id_position, id_names, char_ngram=5, seeds=100, bands=5):
    '''
    LSH blocking between two tables.

    References:
    https://www.youtube.com/watch?v=n3dCcwWV4_k
    https://nbviewer.jupyter.org/github/mattilyra/LSH/blob/master/examples/Introduction.ipynb

    hashing_col_position: positional index of the column whose text is hashed
    id_position: positional index of the id column in each table
    bands: number of pieces or bins; the size of each band is inferred from seeds

    Outputs:
        Returns a DataFrame of candidate tuples
    '''

    hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=4)
    if seeds % bands != 0:
        raise ValueError('Seeds has to be a multiple of bands. {} % {} != 0'.format(seeds, bands))



    # Print hashing information
    print(f"Hashing column name in LHS table is: {lhs_table.columns[hashing_col_position]}, RHS: {rhs_table.columns[hashing_col_position]}")
    print(f"Id column name in LHS table is: {lhs_table.columns[id_position]}, RHS: {rhs_table.columns[id_position]}")


    lshcache = cache.Cache(num_bands=bands, hasher=hasher)
    lshcache.clear()
    # NB: itertuples() puts the DataFrame index in the FIRST position (it adds a column
    # at the beginning), so the usual column positions are shifted forward by 1
    print("Adding Fingerprints")
    for x in rhs_table.itertuples():
        #document_string = x[hashing_col_position[0]+1] + " " +  str(x[hashing_col_position[1]+1]) 
        document_string = str(x[hashing_col_position + 1])
        # if the document string is shorter than char_ngram, fingerprinting fails
        # with an unhelpful error, so pad it with spaces up to char_ngram
        if len(document_string) < char_ngram:
            document_string = document_string + " " * (char_ngram - len(document_string))
        docid  = x[id_position + 1]
        # add finger print for entity to the collection
        #print(f"docid {docid}" )
        lshcache.add_fingerprint(hasher.fingerprint(document_string.encode('utf8')), docid)

    for x in lhs_table.itertuples():
        #document_string = x[hashing_col_position[0]+1] + " " +  str(x[hashing_col_position[1]+1]) 
        document_string = str(x[hashing_col_position + 1])
        if len(document_string) < char_ngram:
            document_string = document_string + " " * (char_ngram - len(document_string))
        docid  = x[id_position + 1]
        lshcache.add_fingerprint(hasher.fingerprint(document_string.encode('utf8')), docid)
    

    print("Generating Possible Pairs")
    candidate_pairs = set()
    for b in lshcache.bins:
        for bucket_id in b:
            if len(b[bucket_id]) > 1:
                pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                candidate_pairs.update(pairs_)
    
    # Assign Id_names for generating DataFrame
    lhs_table = lhs_table.set_index(id_names[0])
    rhs_table = rhs_table.set_index(id_names[1])

    print("Pruning and re-arranging possible pair indices.")
    appropriate_indices = set()
    # faster than iterating through all candidate pairs one by one
    candidate_indices = pd.Index(candidate_pairs)

    # Split the pair indices into first and second positions, then check which table each side belongs to
    candidate_indices_p1 = pd.Index([ x[0] for x in candidate_indices])
    candidate_indices_p2 = pd.Index([ x[1] for x in candidate_indices])
    # Central issue: the cache also yields within-table candidate pairs, and the lhs and rhs indices are sometimes swapped

    # Check if correct lhs + rhs alignment is met and store those indices
    candidate_pairs_correct_alignment_bool = ((candidate_indices_p1.isin(lhs_table.index)) & (candidate_indices_p2.isin(rhs_table.index)))
    correct_alignment_indices = pd.MultiIndex.from_arrays([candidate_indices_p1[candidate_pairs_correct_alignment_bool],candidate_indices_p2[candidate_pairs_correct_alignment_bool]])
    # Now consider that the lhs indices may sit in the SECOND position of the candidate indices and save accordingly
    # Check for VALID across-table pairs where the order has simply been switched
    candidate_pairs_switched_alignment_bool = ((candidate_indices_p2.isin(lhs_table.index)) & (candidate_indices_p1.isin(rhs_table.index)))
    switched_alignment_indices = pd.MultiIndex.from_arrays([candidate_indices_p2[candidate_pairs_switched_alignment_bool],candidate_indices_p1[candidate_pairs_switched_alignment_bool]])
    # Now merge the two sets of indices together
    appropriate_indices = correct_alignment_indices.union(switched_alignment_indices)
    appropriate_indices.names = id_names


    candidate_pair_df = pd.concat([lhs_table.loc[appropriate_indices.get_level_values(0)].reset_index(), rhs_table.loc[appropriate_indices.get_level_values(1)].reset_index()],axis = 1)
    candidate_pair_df = candidate_pair_df.set_index(keys = id_names)
    # Remove instances where id_names contain null entries
    non_null_entries = (~candidate_pair_df.index.get_level_values(0).isnull()) & (~candidate_pair_df.index.get_level_values(1).isnull())
    candidate_pair_df = candidate_pair_df.loc[non_null_entries, :]

    lshcache.clear()

    return candidate_pair_df
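A sketch of how lsh_blocking might be invoked (the DataFrames, column positions and id column names below are illustrative, not from the original source):

import pandas as pd

lhs = pd.DataFrame({'lhs_id': [1, 2],
                    'title': ['canon eos 5d mark iv dslr camera body',
                              'nikon d750 dslr camera body only']})
rhs = pd.DataFrame({'rhs_id': [10, 11],
                    'title': ['canon eos 5d mark iv camera body',
                              'sony a7 iii mirrorless camera body']})

candidates = lsh_blocking(lhs, rhs,
                          hashing_col_position=1,  # the 'title' column
                          id_position=0,           # the id column
                          id_names=['lhs_id', 'rhs_id'],
                          char_ngram=5, seeds=100, bands=5)
print(candidates)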