def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens
    (based on the white space).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan
    """
    if pd.isnull(input_string):
        # FIX: the `pd.np` alias was deprecated in pandas 1.0 and removed in
        # pandas 2.0; return a plain float NaN (same value, same semantics).
        return float('nan')
    # input_string = remove_non_ascii(input_string)
    if not isinstance(input_string, (six.string_types, bytes)):
        # Coerce non-string inputs (ints, floats, ...) to str before tokenizing.
        input_string = str(input_string)
    elif isinstance(input_string, bytes):
        # Normalize byte strings to unicode text.
        input_string = input_string.decode('utf-8')
    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens
    (based on the white space).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan
    """
    if pd.isnull(input_string):
        # FIX: the `pd.np` alias was deprecated in pandas 1.0 and removed in
        # pandas 2.0; return a plain float NaN (same value, same semantics).
        return float('nan')
    # input_string = remove_non_ascii(input_string)
    # Project helper: coerce arbitrary input to a unicode string.
    input_string = gh.convert_to_str_unicode(input_string)
    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
def candidate_matching(dataset):
    """Generate candidate entity pairs via a Jaccard similarity join on labels.

    Extracts the label attribute values from both attribute tables, normalizes
    them (stringify, strip accents, lowercase), runs a whitespace-token Jaccard
    join with threshold 0.3, and returns the resulting (s1, s2) id pairs.

    Args:
        dataset: object exposing `label` (pair of label attribute names) and
            `attributes_1` / `attributes_2` DataFrames with 'a', 's', 'v' columns.

    Returns:
        pandas.DataFrame with columns ['s1', 's2'], duplicates removed.
    """
    import os
    import shutil
    import tempfile
    cache_base_dir = tempfile.mkdtemp('remp')
    try:
        (l1, l2) = dataset.label
        # FIX: .copy() — the original assigned into slices of the attribute
        # tables, triggering SettingWithCopyWarning and risking mutation of
        # the caller's frames.
        labels_1 = dataset.attributes_1[
            dataset.attributes_1['a'] == l1][['s', 'v']].copy()
        labels_2 = dataset.attributes_2[
            dataset.attributes_2['a'] == l2][['s', 'v']].copy()
        # Normalize label text: stringify, transliterate to ASCII, lowercase.
        labels_1['v'] = labels_1['v'].apply(
            str).apply(unidecode.unidecode).str.lower()
        labels_2['v'] = labels_2['v'].apply(
            str).apply(unidecode.unidecode).str.lower()
        tokenizer = sm.WhitespaceTokenizer(return_set=True)
        num_pairs, pair_files = jaccard_join(labels_1['v'], labels_2['v'],
                                             labels_1['s'], labels_2['s'],
                                             tokenizer, 0.3,
                                             cache_base_dir + '/', n_jobs=-1)
        # Each pair file holds a pickled list of (s1, s2) pairs.
        M_c = [pd.read_pickle(f) for f in pair_files]
        return pd.DataFrame(sum(M_c, []),
                            columns=['s1', 's2']).drop_duplicates()
    finally:
        # FIX: the original leaked the temporary cache directory on every call.
        shutil.rmtree(cache_base_dir, ignore_errors=True)
def jaccard_similarity(self, dataset, threshold):
    """Drop near-duplicate rows from `dataset` using a Jaccard self-join.

    Each row is serialized into a single '*'-separated, whitespace-free
    string; rows whose serialized forms have token-set Jaccard similarity
    >= `threshold` are treated as duplicates.

    Args:
        dataset (pandas.DataFrame): table to deduplicate.
        threshold (float): Jaccard similarity threshold in [0, 1].

    Returns:
        pandas.DataFrame: `dataset` with duplicate rows removed and the
        helper columns dropped.
    """
    df = add_key_reindex(dataset)
    # concatenate all columns and convert as one string
    # for each row with '*' as separator
    A = dataset.applymap(str)
    A = A.apply(lambda x: '*'.join(x.values.tolist()), axis=1)
    A = A.astype(str)
    A = A.str.replace(" ", "")
    df['row'] = A
    ssj.profile_table_for_join(df)
    ws = sm.WhitespaceTokenizer(return_set=True)
    # auto join (self-join on the serialized row strings)
    output_pairs = ssj.jaccard_join(df, df, 'New_ID', 'New_ID', 'row', 'row',
                                    ws, threshold, l_out_attrs=['row'],
                                    r_out_attrs=['row'], n_jobs=-1)
    # Exclude each row's trivial match with itself.
    # NOTE(review): a self-join reports each duplicate pair in both orders,
    # so every member of a pair appears as an r_New_ID — this removes BOTH
    # rows of each duplicate pair, not just one. Confirm that is intended.
    dup = output_pairs[output_pairs['l_New_ID'] != output_pairs['r_New_ID']]
    # FIX: .copy() — the original called drop(inplace=True) on a slice of df,
    # which raises SettingWithCopyWarning.
    dataset = df[~df['New_ID'].isin(dup['r_New_ID'])].copy()
    dataset.drop(["New_ID", "row"], axis=1, inplace=True)
    print("Number of duplicate rows removed:", len(set(dup['r_New_ID'])))
    return dataset
# NOTE(review): this chunk is Python 2 (bare `print` statements below) and
# references names defined elsewhere in the file (pickle, SOOutsampleFile,
# FVC, sm, TfidfVectorizer, myJoin) — confirm against the full source.
# SECURITY NOTE: pickle.load executes arbitrary code if these cache files
# are untrusted; only load caches this program wrote itself.
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))
nlmInsampleFile = 'NLMdata/dataCached/insample_abstracts_outfile'
nlmOutsampleFile = 'NLMdata/dataCached/outSample_abstracts_outfile'
nlmInsampleData = pickle.load(open(nlmInsampleFile, 'rb'))
nlmOutsampleData = pickle.load(open(nlmOutsampleFile, 'rb'))
# Instantiate FVComponent instances
csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
LVDist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
FVCList = [csAbstract, csSentence, cosM, LVDist]


def classifyAndPredict(insampleData, outsampleData, folderName, componentList):
    # Build a join over the in-/out-of-sample data and compute feature vectors.
    # NOTE(review): the body appears truncated at this chunk boundary — the
    # classification step promised by the final comment is not visible here.
    print len(insampleData[0])
    print len(outsampleData[1])
    # Declare instance of a join object with input arguments
    easyJoin = myJoin.join(insampleData, outsampleData, folderName)
    easyJoin.setComponentList(componentList)
    # Build feature vector
    easyJoin.buildInsampleFV()
    easyJoin.buildOutsampleFVReduced(0.01)
    # Classify and predict with logistic regression
import os


def ensure_dir(file_path):
    # Create the parent directory of file_path if it does not already exist.
    # NOTE(review): races if the directory is created concurrently between the
    # exists() check and makedirs(); os.makedirs(..., exist_ok=True) is safer.
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


# Cache-file locations for the in-/out-of-sample feature vectors.
INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

# Default feature-vector components.
# NOTE(review): FVC, sm, TfidfVectorizer and LR are defined/imported elsewhere
# in the file — confirm against the full source.
csAbstract = FVC.CosSim('CSAbs',TfidfVectorizer( ngram_range = ( 1, 3 ), sublinear_tf = True ),False)
csSentence = FVC.CosSim('CSSent',TfidfVectorizer( ngram_range = ( 1, 3 ), sublinear_tf = True ),True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc',sm.Jaccard(),sm.QgramTokenizer(qval=3,return_set = True))
cosM = FVC.stringMatchExcerpts('CosMeasure',sm.Cosine(),sm.WhitespaceTokenizer(return_set = True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure',sm.Cosine(),sm.QgramTokenizer(return_set = True))
LVdist = FVC.stringMatchTitles('LVDist',sm.Levenshtein())
DEFAULTFV = [jacq3,cosM,cosMq3,LVdist]
DEFAULTMODEL = LR()
# NOTE(review): 'Logisitic' is misspelled but kept as-is — it is a runtime
# value that other code may compare against.
DEFAULTMODELNAME = 'LogisiticRegression'
DEFAULTITERATIONS = 25


class join:
    # Pairs in-sample and out-of-sample data for feature-vector construction.
    def __init__(self,insampleData,outsampleData,dataFolder):
        # NOTE(review): the class body appears truncated at this chunk boundary.
        self.insampleData = insampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.outsampleData = outsampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.dataFolder = dataFolder
        self.labels = insampleData[1]
def _any_token_jaccard_match(x_values, y_values, tokenizer, jac, threshold=0.8):
    """Return True if any (x, y) value pair matches on whitespace-token Jaccard.

    Entries equal to the string 'nan' (stringified NaNs) are skipped on
    both sides. Each x value is tokenized once, outside the inner loop.
    """
    for x_val in x_values:
        if x_val == 'nan':
            continue
        x_tokens = tokenizer.tokenize(x_val)
        for y_val in y_values:
            if y_val == 'nan':
                continue
            if jac.get_sim_score(x_tokens, tokenizer.tokenize(y_val)) >= threshold:
                return True
    return False


def blocking_rules(x, y):
    """Return True if the pair (x, y) survives the blocking rules.

    Args:
        x, y: pandas Series with ';'-separated 'directors', 'writers' and
            'cast' fields.

    Returns:
        bool: True when NO field (directors, writers, cast) has a member
        pair with Jaccard similarity >= 0.8 on whitespace tokens; False
        (blocked) otherwise.

    FIX: the original wrote `director_match == True` (a comparison, not an
    assignment), so director matches were silently never recorded; the
    writers/actors loops used `=` correctly. The triplicated loop is now a
    single helper, which also removes the unused `overlap_size` variable.
    """
    # create a tokenizer and a Jaccard similarity measure object once
    ws_tok = sm.WhitespaceTokenizer()
    jac = sm.Jaccard()

    director_match = _any_token_jaccard_match(
        str(x['directors']).split(';'), str(y['directors']).split(';'),
        ws_tok, jac)
    writer_match = _any_token_jaccard_match(
        str(x['writers']).split(';'), str(y['writers']).split(';'),
        ws_tok, jac)
    actor_match = _any_token_jaccard_match(
        str(x['cast']).split(';'), str(y['cast']).split(';'),
        ws_tok, jac)

    # Survive blocking only when no field matched.
    return not (director_match or writer_match or actor_match)
# NOTE(review): fragment — this chunk begins and ends mid-loop; the enclosing
# CSV-reading `for` loop and the rest of "iteration #3" are outside this view,
# so the original (collapsed) text is kept verbatim below rather than guessing
# at indentation/nesting.
# What the visible code does: parses '%M:%S' durations from columns 6 and 10
# into time objects, strips whitespace from artist/track labels (columns 3, 4,
# 7, 8), collects rows into sampledList, then builds per-pair features using a
# py_stringmatching WhitespaceTokenizer and Jaro: feature f1 is the full-string
# Jaro score, but only if at least one artist-token pair scores above .75
# (otherwise f1 stays 0).
timeObj1=datetime.strptime(row[6].strip(), '%M:%S').time() row[6] = timeObj1 timeObj1=datetime.strptime(row[10].strip(), '%M:%S').time() row[10] = timeObj1 # iteration #2: trim whitespaces from artist and track labels row[3] = row[3].strip() row[4] = row[4].strip() row[7] = row[7].strip() row[8] = row[8].strip() sampledList.append(row) f.close() # Converting every row in to a feature vector featList = [] label = [] ws = ps.WhitespaceTokenizer() for item in sampledList: fi = [] jaro1 = ps.Jaro() # iteration #3: # pull the feature value to zero if none of the token pairs from either artist strings have a high # enough similarity score f1 = 0 for t1 in ws.tokenize(item[3]): if max([jaro1.get_raw_score(t1, t2) for t2 in ws.tokenize(item[7])]) > .75: f1 = jaro1.get_raw_score(item[3], item[7]) break # iteration #3:
# NOTE(review): this function's logic is an intricate, order-dependent dual
# DFS over a teacher graph and a student graph stored in Neo4j (py2neo
# `graph.data(...)` Cypher calls), classifying student blocks as matched /
# additional / deleted / substituted and accumulating natural-language
# feedback. The chunk boundaries in this paste cut several string literals
# and statements mid-way (e.g. the Cypher query split across the first and
# second lines below), so the text is kept verbatim — reconstructing
# indentation for the deeply nested while/for/if structure from a collapsed
# paste would be guesswork and could silently change behavior.
# Summary of what is visible: pop one node from each stack per iteration;
# for each teacher child, look for a student child whose text matches by
# case-insensitive regex or phrase similarity >= 0.55 (getPhraseSimilarity);
# matched pairs are pushed and counted; unmatched teacher children are routed
# through addTheOnlyUnmatchedNode / checkForCurrentNodeChildMatch depending on
# whether the child counts indicate substitution, deletion (teacher has more
# children) or addition (student has more children); after traversal,
# detectUndetectedBlocks handles nodes reachable only through incorrect
# paths, and allocateMarksAndSaveToDatabase persists the totals and feedback.
# NOTE(review): `matchFound` is a string ('true'/'false') rather than a bool,
# and `re.match(teachText, ...)` treats the teacher block text as a regex
# pattern — special characters in block text would change matching. TODO confirm.
def markStudDFSBlockAnswer(processQuestionId, studentAnswerId): # Connect to Graph graph = connectToGraph() whiteSpaceTokenizer = py_stringmatching.WhitespaceTokenizer( return_set=True) jaccard = py_stringmatching.Jaccard() levenshtein = py_stringmatching.Levenshtein() teacherStartNodeKey = graph.data( "MATCH (node:Teacher) WHERE node.text='start' RETURN node.key") studentStartNodeKey = graph.data( "MATCH (node:Student) WHERE node.text='start' RETURN node.key") teachStack = [teacherStartNodeKey[0]['node.key']] studStack = [studentStartNodeKey[0]['node.key']] teachVisitedNodes = [] studVisitedNodes = [] # keeps track of the nodes matched in each level matchedTeacherNodes = [] matchedStudentNodes = [] notMatchedParentTeacherNodes = [] # keeps track of all the nodes visited throughout graph traversal and a node is added to this each time it is visited allMatchedTeachNodes = [] allMatchedStudNodes = [] additionalNodes = [] deletedNodes = [] substitutedNodes = [] addOrSubNodes = [] delOrSubNodes = [] totNoOfAdditionalNodes = 0 totNoOfDeletedNodes = 0 totNoOfSubstitutedNodes = 0 totNoOfOtherIncorrectNodes = 0 totNoOfOtherSubstitutedNodes = 0 totNoOfMatchedNodes = 0 feedback = "" while teachStack or studStack: if teachStack and studStack: teachCurrent = teachStack.pop() studCurrent = studStack.pop() teacherCurrentText = graph.data( "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text", parameters={"key": teachCurrent}) studentCurrentText = graph.data( "MATCH (node:Student) WHERE node.key= {key} RETURN node.text", parameters={"key": studCurrent}) teacherChildNodes = graph.data( "MATCH (parent:Teacher)-[:TO]->(child:Teacher) WHERE parent.key= {key} RETURN child", parameters={"key": teachCurrent}) #teacherStartNodeKey[0]['node.key'] studentChildNodes = graph.data( "MATCH (parent:Student)-[:TO]->(child:Student) WHERE parent.key= {key} RETURN child", parameters={"key": studCurrent}) #studentStartNodeKey[0]['node.key'] teachChildNodesList = list(teacherChildNodes) 
studChildNodesList = list(studentChildNodes) for teacherChild in teachChildNodesList: teachText = teacherChild['child']['text'] # teachTextTokens = whiteSpaceTokenizer.tokenize(teacherChild['child']['text']) print(teachText) matchFound = 'false' for studentChild in studChildNodesList: if not studentChild['child']['key'] in matchedStudentNodes: print('current stud child') print(studentChild['child']['text']) childText = studentChild['child']['text'] synsetSim_score = getPhraseSimilarity( teachText, childText) if re.match(teachText, childText, re.IGNORECASE) or synsetSim_score >= 0.55: print( 'threshold similarity added to Student stack') feedback = feedback + 'The block:' + studentChild['child']['text'] + \ ' connected to block:' + studentCurrentText[0]['node.text'] + ' is correct. ' matchFound = 'true' if not teacherChild['child'][ 'key'] in teachVisitedNodes: studStack.append(studentChild['child']['key']) teachStack.append(teacherChild['child']['key']) if not studentChild['child'][ 'key'] in allMatchedStudNodes and not studentChild[ 'child']['text'] == 'end': totNoOfMatchedNodes = totNoOfMatchedNodes + 1 allMatchedTeachNodes.append( teacherChild['child']['key']) allMatchedStudNodes.append( studentChild['child']['key']) if len(teachChildNodesList) > len( studChildNodesList): matchedTeacherNodes.append( teacherChild['child']['key']) # add to student matched node set too to check while looping through the current level children (above) matchedStudentNodes.append( studentChild['child']['key']) elif len(teachChildNodesList) < len( studChildNodesList): matchedStudentNodes.append( studentChild['child']['key']) else: matchedStudentNodes.append( studentChild['child']['key']) break if matchFound == 'false' and not teacherChild['child'][ 'key'] in teachVisitedNodes: # len(teachChildNodesList) == len(studChildNodesList) and notMatchedParentTeacherNodes.append( teacherChild['child']['key']) elif matchFound == 'false' and teacherChild['child'][ 'key'] in teachVisitedNodes: 
feedback = feedback + 'The block:' + teacherChild['child']['text'] + \ ' should be connected to block:' + teacherCurrentText[0]['node.text'] + '. ' totNoOfOtherIncorrectNodes = totNoOfOtherIncorrectNodes + 1 if len(teachChildNodesList) == len(studChildNodesList) and len( notMatchedParentTeacherNodes) == 1: print('^^^ONE SUBSTITUTED NODE') totNoOfSubstitutedNodes, totNoOfOtherIncorrectNodes, feedback = \ addTheOnlyUnmatchedNode('NotMatchedNode', graph, notMatchedParentTeacherNodes, teachStack, studChildNodesList, matchedStudentNodes, studStack, totNoOfSubstitutedNodes, feedback, studVisitedNodes, teachCurrent, studentCurrentText[0]['node.text'], totNoOfOtherIncorrectNodes) elif len(teachChildNodesList) == len(studChildNodesList) and len( notMatchedParentTeacherNodes) > 1: totNoOfSubstitutedNodes = totNoOfSubstitutedNodes + len( notMatchedParentTeacherNodes) againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch( 'substitutedCaller', graph, matchedStudentNodes, notMatchedParentTeacherNodes, studChildNodesList, studVisitedNodes, studStack, teachStack, feedback, studentCurrentText[0]['node.text']) if len(againNotMatchedTeacherNodes) == 1: totNoOfOtherIncorrectNodes, feedback = addTheOnlyUnmatchedNode( 'NotMatchedChildrenNode', graph, againNotMatchedTeacherNodes, teachStack, studChildNodesList, handledStudentNodeList, studStack, totNoOfSubstitutedNodes, feedback, studVisitedNodes, teachCurrent, studentCurrentText[0]['node.text'], totNoOfOtherIncorrectNodes) elif len(againNotMatchedTeacherNodes) > 1: for studentChild in studChildNodesList: if not studentChild['child'][ 'key'] in handledStudentNodeList and not studentChild[ 'child']['key'] in studVisitedNodes: feedback = feedback + 'The block:' + studentChild['child']['text'] + \ ' connected to block:' + studentCurrentText[0]['node.text'] + ' is substituted, and it ' for againNotTeacherNode in againNotMatchedTeacherNodes: teacherNodeText = graph.data( "MATCH (node:Teacher) WHERE 
node.key= {key} RETURN node.text", parameters={"key": againNotTeacherNode}) feedback = feedback + ' should be:' + teacherNodeText[ 0]['node.text'] + ' or' feedback = feedback + ' one of the mentioned blocks. The immediate blocks that follow ' +\ 'this block:' + studentChild['child']['text'] + ' are also wrong. Please check them. ' substitutedNodes.append( studentChild['child']['key']) # handles scenario where student graph has deleted child nodes for the current node under consideration if len(teachChildNodesList) > len(studChildNodesList): totNoOfDeletedNodes = totNoOfDeletedNodes + ( len(teachChildNodesList) - len(studChildNodesList)) if len(matchedStudentNodes) == len(studChildNodesList): for child in teachChildNodesList: if not child['child'][ 'key'] in matchedTeacherNodes and not child[ 'child']['key'] in teachVisitedNodes: feedback = feedback + 'Missing Block:' + child['child']['text'] + \ ' should be connected to block:' + studentCurrentText[0]['node.text'] + '. ' deletedNodes.append(child['child']['key']) elif len(matchedStudentNodes) < len(studChildNodesList): feedback = feedback + 'There is/are ' + str(len(teachChildNodesList) - len(studChildNodesList)) + \ ' missing block(s) that should be connected to block:' + studentCurrentText[0]['node.text'] + \ ' and ' + str(len(studChildNodesList) - len(matchedStudentNodes)) + \ ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + \ ' is/are substituted - The incorrect blocks are ' againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch( 'deletedSubstitutedCaller', graph, matchedStudentNodes, notMatchedParentTeacherNodes, studChildNodesList, studVisitedNodes, studStack, teachStack, feedback, studentCurrentText[0]['node.text']) if len(handledStudentNodeList) == len(studChildNodesList): for child in teachChildNodesList: if child['child'][ 'key'] in againNotMatchedTeacherNodes and not child[ 'child']['key'] in teachVisitedNodes: feedback = feedback + 'block:' + 
child['child']['text'] + \ ' that should be connected to block:' + studentCurrentText[0]['node.text'] +\ ' is missing and ' deletedNodes.append(child['child']['key']) elif len(handledStudentNodeList) < len(studChildNodesList): for child in teachChildNodesList: if child['child'][ 'key'] in againNotMatchedTeacherNodes and not child[ 'child']['key'] in teachVisitedNodes: feedback = feedback + ' block:' + child['child']['text'] + \ ' that should be/is connected to block:' + studentCurrentText[0]['node.text'] + \ ' is deleted/substituted and the immediate child blocks of this block are also wrong, please check them, and ' delOrSubNodes.append(child['child']['key']) feedback = feedback + 'please check all these incorrect blocks. ' # handles scenario where student graph has additional child nodes for the current node under consideration elif len(teachChildNodesList) < len(studChildNodesList): totNoOfAdditionalNodes = totNoOfAdditionalNodes + ( len(studChildNodesList) - len(teachChildNodesList)) # handles scenario where all teacher nodes are matched and there are additional nodes if len(matchedStudentNodes) == len(teachChildNodesList): for child in studChildNodesList: if not child['child'][ 'key'] in matchedStudentNodes and not child[ 'child']['key'] in studVisitedNodes: feedback = feedback + 'Additional Block:' + child['child']['text'] +\ ' is connected to block:' + studentCurrentText[0]['node.text'] + '. ' additionalNodes.append(child['child']['key']) elif not child['child'][ 'key'] in matchedStudentNodes and child[ 'child']['key'] in studVisitedNodes: feedback = feedback + 'Additional connection from block:' + studentCurrentText[0]['node.text'] +\ ' to block:' + child['child']['text'] + '. 
' elif len(matchedStudentNodes) < len(teachChildNodesList): feedback = feedback + 'There is/are ' + str(len(studChildNodesList) - len(teachChildNodesList)) + \ ' additional block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' and ' +\ str(len(teachChildNodesList) - len(matchedStudentNodes)) +\ ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + ' is/are substituted - The incorrect blocks are ' againNotMatchedTeacherNodes, handledStudentNodeList, feedback = checkForCurrentNodeChildMatch( 'additionalSubstitutedCaller', graph, matchedStudentNodes, notMatchedParentTeacherNodes, studChildNodesList, studVisitedNodes, studStack, teachStack, feedback, studentCurrentText[0]['node.text']) if len(handledStudentNodeList) == len( teachChildNodesList ): # len(againNotMatchedTeacherNodes) == (len(studChildNodesList)-len(teachChildNodesList)) for child in studChildNodesList: if not child['child'][ 'key'] in handledStudentNodeList and not child[ 'child']['key'] in studVisitedNodes: feedback = feedback + 'block:' + child['child']['text'] + ' connected to block:' +\ studentCurrentText[0]['node.text'] + ' is additional and ' additionalNodes.append(child['child']['key']) elif len(handledStudentNodeList) < len( teachChildNodesList ): # len(againNotMatchedTeacherNodes) > (len(studChildNodesList)-len(teachChildNodesList)) for child in studChildNodesList: if not child['child'][ 'key'] in handledStudentNodeList and not child[ 'child']['key'] in studVisitedNodes: feedback = feedback + ' block: ' + child['child']['text'] + ' connected to block:' +\ studentCurrentText[0]['node.text'] +\ ' is additional/substituted and the immediate child blocks of this block are also wrong, please check them, and ' addOrSubNodes.append(child['child']['key']) feedback = feedback + 'please check all these incorrect blocks. 
' matchedTeacherNodes = [] matchedStudentNodes = [] notMatchedParentTeacherNodes = [] teachVisitedNodes.append(teachCurrent) studVisitedNodes.append(studCurrent) elif studStack and not teachStack: print('^^^^^^^^^^^^^^^STUDENT stack has moreeee.....') break # handles additional nodes down an additional node starting path if additionalNodes: feedback, totNoOfAdditionalNodes = detectUndetectedBlocks( "additionalNodes", graph, additionalNodes, studVisitedNodes, feedback, totNoOfAdditionalNodes) # handles deleted nodes down a deleted node starting path if deletedNodes: feedback, totNoOfDeletedNodes = detectUndetectedBlocks( "deletedNodes", graph, deletedNodes, teachVisitedNodes, feedback, totNoOfDeletedNodes) # handles substituted nodes down a substituted node starting path if substitutedNodes: feedback, totNoOfOtherSubstitutedNodes = detectUndetectedBlocks( "substitutedNodes", graph, substitutedNodes, studVisitedNodes, feedback, totNoOfOtherSubstitutedNodes) # handles additional/substituted nodes down a additional/substituted node starting path if addOrSubNodes: feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks( "addOrSubNodes", graph, addOrSubNodes, studVisitedNodes, feedback, totNoOfOtherIncorrectNodes) # handles deleted/substituted nodes down a deleted/substituted node starting path if delOrSubNodes: feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks( "delOrSubNodes", graph, delOrSubNodes, teachVisitedNodes, feedback, totNoOfOtherIncorrectNodes) if totNoOfAdditionalNodes == 0 and totNoOfDeletedNodes == 0 and totNoOfSubstitutedNodes == 0 and \ totNoOfOtherSubstitutedNodes == 0 and totNoOfOtherIncorrectNodes == 0: print(totNoOfMatchedNodes) feedback = feedback + "Excellent Job! All the blocks and the flow are correct!" # Number of correct blocks: " + ". " print(feedback) else: feedback = feedback + "Number of correct blocks except start and end blocks: " + str( totNoOfMatchedNodes) + ". 
" print(feedback) allocateMarksAndSaveToDatabase(totNoOfMatchedNodes, totNoOfAdditionalNodes, totNoOfDeletedNodes, totNoOfSubstitutedNodes, totNoOfOtherSubstitutedNodes, totNoOfOtherIncorrectNodes, feedback, processQuestionId, studentAnswerId)
# In[20]: # transforming the "budget" column into string and creating a new **mixture** column ssj.utils.converter.dataframe_column_to_str(imdb_data, 'budget', inplace=True) imdb_data['mixture'] = imdb_data['norm_title'] + ' ' + imdb_data['norm_year'] + ' ' + imdb_data['budget'] # repeating the same thing for the Kaggle dataset ssj.utils.converter.dataframe_column_to_str(kaggle_data, 'budget', inplace=True) kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data['norm_title_year'] + ' ' + kaggle_data['budget'] # Now, we can use the **mixture** columns to create a desired candiate set which we call **C**. # In[21]: C = ssj.overlap_coefficient_join(kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), l_out_attrs=['norm_movie_title', 'norm_title_year', 'duration', 'budget', 'content_rating'], r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'], threshold=0.65) C.shape # We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs. # # #### Substep B: Specifying the keys # The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the the two dataframes in the candidate set. # In[22]: import py_entitymatching as em
def tok_wspace(input_string):
    """Split *input_string* into whitespace-delimited tokens.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN, else NaN.
    """
    if pd.isnull(input_string):
        # FIX: the `pd.np` alias was deprecated in pandas 1.0 and removed in
        # pandas 2.0; return a plain float NaN (same value, same semantics).
        return float('nan')
    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
# NOTE(review): fragment — this chunk begins mid-call (the dangling
# `inplace=True)` below closes a `dataframe_column_to_str(...)` call whose
# opening is outside this view), so the original (collapsed) text is kept
# verbatim. It duplicates the candidate-set construction seen elsewhere in
# this paste: build a 'mixture' column for the Kaggle data, then run an
# overlap-coefficient join (whitespace tokens, threshold 0.65) between the
# Kaggle and IMDB tables to produce candidate set C.
inplace=True) kaggle_data['mixture'] = kaggle_data['norm_movie_title'] + ' ' + kaggle_data[ 'norm_title_year'] + ' ' + kaggle_data['budget'] # Now, we can use the **mixture** columns to create a desired candiate set which we call **C**. # In[21]: C = ssj.overlap_coefficient_join( kaggle_data, imdb_data, 'id', 'id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), l_out_attrs=[ 'norm_movie_title', 'norm_title_year', 'duration', 'budget', 'content_rating' ], r_out_attrs=['norm_title', 'norm_year', 'length', 'budget', 'mpaa'], threshold=0.65) C.shape # We can see that by doing a similarity join, we already reduced the candidate set to 18,317 pairs. # # #### Substep B: Specifying the keys # The next step is to specify to the **py_entitymatching** package which columns correspond to the keys in each dataframe. Also, we need to specify which columns correspond to the foreign keys of the the two dataframes in the candidate set. # In[22]:
ivivus['id'] = range(ivivus.shape[0]) # Edit distance #similar_titles = ssj.edit_distance_join(hotels, mytour , 'id', 'id', 'norm_name','norm_name', l_out_attrs=['link','norm_name','norm_address','rooms', 'norm_star', 'benefits','norm_image'], r_out_attrs=['link','norm_name','norm_address','rooms', 'norm_star', 'benefits','norm_image'], threshold=2) # A: Finding a candidate set using simple heuristic # create mixture column ssj.utils.converter.dataframe_column_to_str(booking, 'norm_star', inplace=True) booking['mixture'] = booking['norm_name'] + ' ' + booking['norm_address'] # repeating the same thing for the kaggle db #ssj.utils.converter.dataframe_column_to_str(hotels, 'norm_star', inplace=True) hotels['mixture'] = hotels['norm_name'] + ' ' + hotels['norm_address'] C = ssj.overlap_coefficient_join(hotels, booking, 'id','id', 'mixture', 'mixture', sm.WhitespaceTokenizer(), l_out_attrs=['link','norm_name','name','norm_address','head_address','rooms', 'norm_star', 'benefits','norm_image','rating','destination'], r_out_attrs=['link','norm_name','name','norm_address','head_address','rooms', 'norm_star', 'benefits','norm_image','rating','destination'], threshold=0.7) print(C.shape) # Creating the Rule-Based Matcher import py_entitymatching as em em.set_key(hotels, 'id') em.set_key(booking, 'id') em.set_key(C, '_id') em.set_ltable(C, hotels) em.set_rtable(C, booking) em.set_fk_rtable(C, 'r_id') em.set_fk_ltable(C, 'l_id')
# NOTE(review): fragment — this chunk begins mid-dict-literal (the opening
# brace and earlier entries of the tokenizer lookup table are outside this
# view) and ends mid-way through `def get_similarity_measure`, so the original
# (collapsed) text is kept verbatim below. What is visible: the tail of a
# name -> py_stringmatching tokenizer lookup (q-gram sizes 1-9, word-level
# alphanumeric/alphabetic/whitespace tokenizers, each with a `_set` variant
# using return_set=True), a small cleaner-function lookup, and the start of a
# factory that presumably resolves string names through these tables —
# confirm against the full source.
'1gram_set': sm.QgramTokenizer(qval=1, return_set=True), '1grams_set': sm.QgramTokenizer(qval=1, return_set=True), '2grams_set': sm.QgramTokenizer(qval=2, return_set=True), '3grams_set': sm.QgramTokenizer(qval=3, return_set=True), '4grams_set': sm.QgramTokenizer(qval=4, return_set=True), '5grams_set': sm.QgramTokenizer(qval=5, return_set=True), '6grams_set': sm.QgramTokenizer(qval=6, return_set=True), '7grams_set': sm.QgramTokenizer(qval=7, return_set=True), '8grams_set': sm.QgramTokenizer(qval=8, return_set=True), '9grams_set': sm.QgramTokenizer(qval=9, return_set=True), # Word tokenizers 'alphanumeric': sm.AlphanumericTokenizer(), 'alphanum': sm.AlphanumericTokenizer(), 'alphabetic': sm.AlphabeticTokenizer(), 'whitespace': sm.WhitespaceTokenizer(), 'alphanumeric_set': sm.AlphanumericTokenizer(return_set=True), 'alphanum_set': sm.AlphanumericTokenizer(return_set=True), 'alphabetic_set': sm.AlphabeticTokenizer(return_set=True), 'whitespace_set': sm.WhitespaceTokenizer(return_set=True), } cleaner_lookup = { 'lower_and_strip': lower_and_strip, 'alphanumeric': clean_to_alphanum, 'alphanum': clean_to_alphanum, } def get_similarity_measure(measure, **kwargs): if isinstance(measure, str):