import numpy as np
import pandas as pd
import py_stringmatching as sm


def jaccard(arr1, arr2):
    """
    Computes the Jaccard measure between the two input lists/sets.

    Args:
        arr1, arr2 (list or set): The input lists or sets for which the
            Jaccard measure should be computed.

    Returns:
        The Jaccard measure if both lists/sets are not None and do not
        contain any missing tokens (i.e. NaN); otherwise NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaccard(['data', 'science'], ['data'])
        0.5
        >>> em.jaccard(['data', 'science'], None)
        nan
    """
    if arr1 is None or arr2 is None:
        return np.nan  # pd.np was removed in pandas 1.0+, so use numpy directly
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return np.nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return np.nan
    # Create a Jaccard measure object
    measure = sm.Jaccard()
    # Compute the raw similarity score
    return measure.get_raw_score(arr1, arr2)
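# Quick usage check for the helper above (illustrative; run in a module where
# jaccard() is defined). Jaccard = |intersection| / |union| of the token sets.
if __name__ == '__main__':
    print(jaccard(['data', 'science'], ['data']))  # 0.5
    print(jaccard(['data', 'science'], None))      # nan
    print(jaccard('data', 'data'))                 # scalars are wrapped into lists -> 1.0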
def matchHeaders(headers):
    """Print header pairs from consecutive tables that look alike."""
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])
    header_len = len(headers)
    for i in range(header_len - 1):
        j = i + 1
        for first in headers[i]:
            for second in headers[j]:
                jacScore = jac.get_sim_score(delim_tok.tokenize(first),
                                             delim_tok.tokenize(second))
                levScore = lev.get_sim_score(first, second)
                ocScore = oc.get_sim_score(delim_tok.tokenize(first),
                                           delim_tok.tokenize(second))
                if ocScore == 1 or levScore >= 0.5 or jacScore >= 0.5:
                    print(first + ' of Table' + str(i + 1) + ' and ' +
                          second + ' of Table' + str(j + 1) + ' matched')
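# Hypothetical smoke test for matchHeaders (header lists are made up; note the
# function only compares each table's headers against the next table's):
if __name__ == '__main__':
    sample_headers = [['cust_name', 'cust_id'], ['customer_name', 'id']]
    matchHeaders(sample_headers)
    # prints, e.g.:
    #   cust_name of Table1 and customer_name of Table2 matched
    #   cust_id of Table1 and id of Table2 matched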
def jac_score(self, str_pair, sim_score=True) -> float:
    """
    Calculate the Jaccard similarity between two sets of tokens.

    :return: similarity score (0 to 1)
    """
    e1, e2 = self._check_input(str_pair, type_=list)
    jac = sm.Jaccard()
    return jac.get_sim_score(e1, e2) if sim_score else jac.get_raw_score(e1, e2)
def setUp(self):
    self.df = read_data(path_big_ten)
    self.trigramtok = sm.QgramTokenizer(qval=3)
    # Block on 3-gram Jaccard >= 0.3 between the 'name' columns (self-join)
    self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                          'name', 'name', self.trigramtok, 0.3)
    self.jaccsim = sm.Jaccard()
    self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                     self.trigramtok, self.jaccsim)
def setUp(self):
    self.df = read_data(path_big_ten)
    self.trigramtok = sm.QgramTokenizer(qval=3)
    self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                          'name', 'name', self.trigramtok, 0.3)
    self.jaccsim = sm.Jaccard()
    self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                     self.trigramtok, self.jaccsim)
    self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
    self.aggcl = AgglomerativeClustering(n_clusters=5, affinity='precomputed',
                                         linkage='complete')
    self.labels = self.aggcl.fit_predict(self.sim_matrix)
def __init__(self):
    self.similarity_function = [
        sm.BagDistance(),
        sm.Cosine(),
        sm.Dice(),
        sm.Editex(),
        sm.GeneralizedJaccard(),
        sm.Jaccard(),
        sm.Jaro(),
        sm.JaroWinkler(),
        sm.Levenshtein(),
        sm.OverlapCoefficient(),
        sm.TverskyIndex()
    ]
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
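# Illustrative sketch of how a bank of measures like the one above can be
# applied to a string pair. score_pair and its internals are assumptions for
# the demo, not part of the original class.
import py_stringmatching as sm

def score_pair(s1, s2):
    tok = sm.AlphanumericTokenizer(return_set=True)
    t1, t2 = tok.tokenize(s1), tok.tokenize(s2)
    scores = {}
    # set-based measures score the token sets
    for measure in [sm.Jaccard(), sm.Dice(), sm.OverlapCoefficient()]:
        scores[type(measure).__name__] = measure.get_sim_score(t1, t2)
    # sequence-based measures score the raw strings
    for measure in [sm.Levenshtein(), sm.JaroWinkler()]:
        scores[type(measure).__name__] = measure.get_sim_score(s1, s2)
    return scores

# e.g. score_pair('Apple Inc', 'Apple Incorporated')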
def jac_q3_sim(str1, str2):
    try:
        # Values are already cast to string, lower-cased, and stripped before
        # being handed over, so no normalization is needed here.
        # Assign a sim score of -1 when either value is null/empty.
        if str1 == 'nan' or str2 == 'nan' or str1 == '' or str2 == '':
            return -1
        q3_tok = sm.QgramTokenizer(qval=3, return_set=True)
        jac = sm.Jaccard()
        return jac.get_raw_score(q3_tok.tokenize(str1), q3_tok.tokenize(str2))
    except Exception:  # a bare except would also swallow KeyboardInterrupt/SystemExit
        logger.warning('Issue with Jaccard_q3_Sim, hence -1 assigned')
        return -1
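# Hypothetical smoke test for jac_q3_sim; `logger` is assumed to be configured
# at module level in the original file, so a minimal stand-in is created here.
import logging
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    print(jac_q3_sim('jonathan smith', 'jonathon smith'))  # high: most 3-grams overlap
    print(jac_q3_sim('nan', 'smith'))                      # -1 null sentinel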
def get_oov_jaccard_sim(self, s1, s2):
    en_tokens_f = word_tokenize(s1.lower())
    de_tokens_f = word_tokenize(s2.lower())

    # Replace the OOV tokens for which a match has been found
    en_tokens = []
    for token in en_tokens_f:
        if token in self.en_oov:
            for el in self.en_oov[token]:
                en_tokens.append(el)
        else:
            en_tokens.append(token)
    de_tokens = []
    for token in de_tokens_f:
        if token in self.de_oov:
            for el in self.de_oov[token]:
                de_tokens.append(el)
        else:
            de_tokens.append(token)

    new_en_tokens = [token for token in en_tokens
                     if token not in self.en_dictionary and token not in self.en_oov]
    new_de_tokens = [token for token in de_tokens
                     if token not in self.de_dictionary and token not in self.de_oov]
    new_en_str = " ".join(new_en_tokens)
    new_de_str = " ".join(new_de_tokens)
    if new_en_str == "" or new_de_str == "":
        return 0

    # Get the 3-grams
    measure = sm.QgramTokenizer(qval=3)
    en_grams = measure.tokenize(new_en_str)
    de_grams = measure.tokenize(new_de_str)

    # Get the Jaccard score (a similarity, not a distance)
    measure = sm.Jaccard()
    return measure.get_raw_score(en_grams, de_grams)
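# The scoring core of get_oov_jaccard_sim, isolated as a standalone sketch:
# character 3-grams of the residual strings, compared with Jaccard. The name
# qgram_jaccard is illustrative, not part of the class above.
import py_stringmatching as sm

def qgram_jaccard(s1, s2, qval=3):
    tok = sm.QgramTokenizer(qval=qval)
    return sm.Jaccard().get_raw_score(tok.tokenize(s1), tok.tokenize(s2))

# e.g. qgram_jaccard('untranslatable', 'untranslated')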
def main():
    import pickle
    import py_stringmatching as sm
    from sklearn.feature_extraction.text import TfidfVectorizer

    INSAMPLE_ABS_OUTFILE = '../dataCached/insample_abstracts_outfile'
    OUTSAMPLE_ABS_OUTFILE = '../dataCached/outSample_abstracts_outfile'
    OUTSAMPLE_ABS_REDUCED_OUTFILE = '../dataCached/outSample_abstracts_reduced_outfile'
    a1 = pickle.load(open(INSAMPLE_ABS_OUTFILE, 'rb'))
    a2 = pickle.load(open(OUTSAMPLE_ABS_OUTFILE, 'rb'))
    a3 = pickle.load(open(OUTSAMPLE_ABS_REDUCED_OUTFILE, 'rb'))

    csAbstract = CosSim('Cos Sim Abstract',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
    csSentence = CosSim('Cos Sim Sentence',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
    jacq3 = stringMatchExcerpts('Fuzzy Jaccard', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3))
    components = [csAbstract, csSentence, jacq3]
    a1Features = [c.generateFeatures(a1) for c in components]
    print(len(a1Features))  # was Python 2 `print len(...)`
import numpy as np
import os

# Assumed imports for the names used below (not shown in the original snippet):
# FVC is the local feature-vector-component module; LR aliases LogisticRegression.
import py_stringmatching as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR


def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

DEFAULTFV = [jacq3, cosM, cosMq3, LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisticRegression'  # typo fixed: was 'LogisiticRegression'
DEFAULTITERATIONS = 25


class join:
    def __init__(self, insampleData, outsampleData, dataFolder):
        self.insampleData = insampleData    # pairs, labels, pairedAbstracts, pairedTitles
        self.outsampleData = outsampleData  # pairs, labels, pairedAbstracts, pairedTitles
        self.dataFolder = dataFolder
df['Sequence1'] = df['aTokens'].apply(sentence)
df['Sequence2'] = df['bTokens'].apply(sentence)
df.head()

# In[26]:

get_ipython().system('pip install py_stringmatching')
import py_stringmatching as sm

# # Token Based Similarities

# In[27]:

jac = sm.Jaccard()
df['Jaccard'] = df.apply(
    lambda x: jac.get_sim_score(x['aTokens'], x['bTokens']), axis=1)
df.head()

# In[28]:

jaro = sm.Jaro()

# !pip install pyjarowinkler
# from pyjarowinkler import distance
# def jaro_similarity(word1, word2):
#     return distance.get_jaro_distance(word1, word2, winkler=False, scaling=0.1)


def jaccard_similarity_general(tokens1, tokens2):
    # Body not shown in the original; a plain set-based Jaccard is the likely
    # intent (assumption).
    s1, s2 = set(tokens1), set(tokens2)
    if not s1 and not s2:
        return 1.0
    return len(s1 & s2) / len(s1 | s2)
def blocking_rules(x, y):
    # Return True if x and y survive the blocking rules.
    # x and y are pandas Series.
    x_directors = str(x['directors']).split(';')
    y_directors = str(y['directors']).split(';')
    x_writers = str(x['writers']).split(';')
    y_writers = str(y['writers']).split(';')
    x_actors = str(x['cast']).split(';')
    y_actors = str(y['cast']).split(';')
    director_match = False
    writer_match = False
    actor_match = False

    # create a whitespace tokenizer and a Jaccard similarity measure object
    ws_tok = sm.WhitespaceTokenizer()
    jac = sm.Jaccard()

    for x_director in x_directors:
        if director_match:
            break
        if x_director == 'nan':
            continue
        x_director = ws_tok.tokenize(x_director)
        for y_director in y_directors:
            if y_director == 'nan':
                continue
            y_director = ws_tok.tokenize(y_director)
            if jac.get_sim_score(x_director, y_director) >= 0.8:
                director_match = True  # bug fix: was `director_match == True`
                break

    for x_writer in x_writers:
        if writer_match:
            break
        if x_writer == 'nan':
            continue
        x_writer = ws_tok.tokenize(x_writer)
        for y_writer in y_writers:
            if y_writer == 'nan':
                continue
            y_writer = ws_tok.tokenize(y_writer)
            if jac.get_sim_score(x_writer, y_writer) >= 0.8:
                writer_match = True
                break

    for x_actor in x_actors:
        if actor_match:
            break
        if x_actor == 'nan':
            continue
        x_actor = ws_tok.tokenize(x_actor)
        for y_actor in y_actors:
            if y_actor == 'nan':
                continue
            y_actor = ws_tok.tokenize(y_actor)
            if jac.get_sim_score(x_actor, y_actor) >= 0.8:
                actor_match = True
                break

    # The pair survives blocking only if no field matched.
    return not (actor_match or director_match or writer_match)
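# Minimal illustration with made-up rows: blocking_rules returns True only when
# the pair survives blocking, i.e. no director/writer/actor pair reaches the
# 0.8 Jaccard threshold.
import pandas as pd
import py_stringmatching as sm

if __name__ == '__main__':
    row_x = pd.Series({'directors': 'Frank Darabont', 'writers': 'Stephen King',
                       'cast': 'Tim Robbins;Morgan Freeman'})
    row_y = pd.Series({'directors': 'Frank Darabont', 'writers': 'Stephen King',
                       'cast': 'Tim Robbins'})
    print(blocking_rules(row_x, row_y))  # False: the rows share a director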
import pandas as pd
import py_stringmatching as sm

from .util import suffix
from remp import string_matching

tokenizer = sm.QgramTokenizer(qval=2, return_set=True)
jaccard = sm.Jaccard()


def similarity_func_default(string1, string2):
    return jaccard.get_sim_score(tokenizer.tokenize(string1),
                                 tokenizer.tokenize(string2))


def construct_similarity_list(left_triples, right_triples, entity_candidates,
                              aligned_attributes=None, similarity_func=None):
    if aligned_attributes is None:
        shared_attributes = set(left_triples['a'].unique())
        shared_attributes &= set(right_triples['a'].unique())
        shared_attributes = list(shared_attributes)
        aligned_attributes = pd.DataFrame({
            'a1': shared_attributes,
            'a2': shared_attributes
        })
    if 'attr_id' not in aligned_attributes:
        aligned_attributes['attr_id'] = aligned_attributes.index
    paired = pd.merge(entity_candidates, suffix(left_triples, '1'))
    paired = pd.merge(paired, aligned_attributes)
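# Quick check of the default similarity function defined above (2-gram Jaccard):
if __name__ == '__main__':
    print(similarity_func_default('apple', 'apple'))  # 1.0
    print(similarity_func_default('apple', 'appel'))  # partial 2-gram overlap, between 0 and 1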
def markStudDFSBlockAnswer(processQuestionId, studentAnswerId):
    # Connect to Graph
    graph = connectToGraph()
    whiteSpaceTokenizer = py_stringmatching.WhitespaceTokenizer(return_set=True)
    jaccard = py_stringmatching.Jaccard()
    levenshtein = py_stringmatching.Levenshtein()
    teacherStartNodeKey = graph.data(
        "MATCH (node:Teacher) WHERE node.text='start' RETURN node.key")
    studentStartNodeKey = graph.data(
        "MATCH (node:Student) WHERE node.text='start' RETURN node.key")
    teachStack = [teacherStartNodeKey[0]['node.key']]
    studStack = [studentStartNodeKey[0]['node.key']]
    teachVisitedNodes = []
    studVisitedNodes = []
    # keeps track of the nodes matched in each level
    matchedTeacherNodes = []
    matchedStudentNodes = []
    notMatchedParentTeacherNodes = []
    # keeps track of all the nodes visited throughout graph traversal;
    # a node is added to this each time it is visited
    allMatchedTeachNodes = []
    allMatchedStudNodes = []
    additionalNodes = []
    deletedNodes = []
    substitutedNodes = []
    addOrSubNodes = []
    delOrSubNodes = []
    totNoOfAdditionalNodes = 0
    totNoOfDeletedNodes = 0
    totNoOfSubstitutedNodes = 0
    totNoOfOtherIncorrectNodes = 0
    totNoOfOtherSubstitutedNodes = 0
    totNoOfMatchedNodes = 0
    feedback = ""

    while teachStack or studStack:
        if teachStack and studStack:
            teachCurrent = teachStack.pop()
            studCurrent = studStack.pop()
            teacherCurrentText = graph.data(
                "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                parameters={"key": teachCurrent})
            studentCurrentText = graph.data(
                "MATCH (node:Student) WHERE node.key= {key} RETURN node.text",
                parameters={"key": studCurrent})
            teacherChildNodes = graph.data(
                "MATCH (parent:Teacher)-[:TO]->(child:Teacher) WHERE parent.key= {key} RETURN child",
                parameters={"key": teachCurrent})
            studentChildNodes = graph.data(
                "MATCH (parent:Student)-[:TO]->(child:Student) WHERE parent.key= {key} RETURN child",
                parameters={"key": studCurrent})
            teachChildNodesList = list(teacherChildNodes)
            studChildNodesList = list(studentChildNodes)

            for teacherChild in teachChildNodesList:
                teachText = teacherChild['child']['text']
                print(teachText)
                matchFound = 'false'
                for studentChild in studChildNodesList:
                    if not studentChild['child']['key'] in matchedStudentNodes:
                        print('current stud child')
                        print(studentChild['child']['text'])
                        childText = studentChild['child']['text']
                        synsetSim_score = getPhraseSimilarity(teachText, childText)
                        if re.match(teachText, childText, re.IGNORECASE) or synsetSim_score >= 0.55:
                            print('threshold similarity added to Student stack')
                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                ' connected to block:' + studentCurrentText[0]['node.text'] + \
                                ' is correct. \n'
                            matchFound = 'true'
                            if not teacherChild['child']['key'] in teachVisitedNodes:
                                studStack.append(studentChild['child']['key'])
                                teachStack.append(teacherChild['child']['key'])
                            if not studentChild['child']['key'] in allMatchedStudNodes and \
                                    not studentChild['child']['text'] == 'end':
                                totNoOfMatchedNodes = totNoOfMatchedNodes + 1
                            allMatchedTeachNodes.append(teacherChild['child']['key'])
                            allMatchedStudNodes.append(studentChild['child']['key'])
                            if len(teachChildNodesList) > len(studChildNodesList):
                                matchedTeacherNodes.append(teacherChild['child']['key'])
                                # add to student matched node set too to check while
                                # looping through the current level children (above)
                                matchedStudentNodes.append(studentChild['child']['key'])
                            elif len(teachChildNodesList) < len(studChildNodesList):
                                matchedStudentNodes.append(studentChild['child']['key'])
                            else:
                                matchedStudentNodes.append(studentChild['child']['key'])
                            break
                if matchFound == 'false' and not teacherChild['child']['key'] in teachVisitedNodes:
                    notMatchedParentTeacherNodes.append(teacherChild['child']['key'])
                elif matchFound == 'false' and teacherChild['child']['key'] in teachVisitedNodes:
                    feedback = feedback + 'The block:' + teacherChild['child']['text'] + \
                        ' should be connected to block:' + teacherCurrentText[0]['node.text'] + '. '
                    totNoOfOtherIncorrectNodes = totNoOfOtherIncorrectNodes + 1

            if len(teachChildNodesList) == len(studChildNodesList) and \
                    len(notMatchedParentTeacherNodes) == 1:
                print('^^^ONE SUBSTITUTED NODE')
                totNoOfSubstitutedNodes, totNoOfOtherIncorrectNodes, feedback = \
                    addTheOnlyUnmatchedNode('NotMatchedNode', graph, notMatchedParentTeacherNodes,
                                            teachStack, studChildNodesList, matchedStudentNodes,
                                            studStack, totNoOfSubstitutedNodes, feedback,
                                            studVisitedNodes, teachCurrent,
                                            studentCurrentText[0]['node.text'],
                                            totNoOfOtherIncorrectNodes)
            elif len(teachChildNodesList) == len(studChildNodesList) and \
                    len(notMatchedParentTeacherNodes) > 1:
                totNoOfSubstitutedNodes = totNoOfSubstitutedNodes + len(notMatchedParentTeacherNodes)
                againNotMatchedTeacherNodes, handledStudentNodeList, feedback = \
                    checkForCurrentNodeChildMatch('substitutedCaller', graph, matchedStudentNodes,
                                                  notMatchedParentTeacherNodes, studChildNodesList,
                                                  studVisitedNodes, studStack, teachStack, feedback,
                                                  studentCurrentText[0]['node.text'])
                if len(againNotMatchedTeacherNodes) == 1:
                    totNoOfOtherIncorrectNodes, feedback = addTheOnlyUnmatchedNode(
                        'NotMatchedChildrenNode', graph, againNotMatchedTeacherNodes, teachStack,
                        studChildNodesList, handledStudentNodeList, studStack,
                        totNoOfSubstitutedNodes, feedback, studVisitedNodes, teachCurrent,
                        studentCurrentText[0]['node.text'], totNoOfOtherIncorrectNodes)
                elif len(againNotMatchedTeacherNodes) > 1:
                    for studentChild in studChildNodesList:
                        if not studentChild['child']['key'] in handledStudentNodeList and \
                                not studentChild['child']['key'] in studVisitedNodes:
                            feedback = feedback + 'The block:' + studentChild['child']['text'] + \
                                ' connected to block:' + studentCurrentText[0]['node.text'] + \
                                ' is substituted, and it '
                            for againNotTeacherNode in againNotMatchedTeacherNodes:
                                teacherNodeText = graph.data(
                                    "MATCH (node:Teacher) WHERE node.key= {key} RETURN node.text",
                                    parameters={"key": againNotTeacherNode})
                                feedback = feedback + ' should be:' + \
                                    teacherNodeText[0]['node.text'] + ' or'
                            feedback = feedback + ' one of the mentioned blocks. The immediate blocks that follow ' + \
                                'this block:' + studentChild['child']['text'] + \
                                ' are also wrong. Please check them. \n'
                            substitutedNodes.append(studentChild['child']['key'])

            # handles scenario where student graph has deleted child nodes for the
            # current node under consideration
            if len(teachChildNodesList) > len(studChildNodesList):
                totNoOfDeletedNodes = totNoOfDeletedNodes + \
                    (len(teachChildNodesList) - len(studChildNodesList))
                if len(matchedStudentNodes) == len(studChildNodesList):
                    for child in teachChildNodesList:
                        if not child['child']['key'] in matchedTeacherNodes and \
                                not child['child']['key'] in teachVisitedNodes:
                            feedback = feedback + 'Missing Block:' + child['child']['text'] + \
                                ' should be connected to block:' + \
                                studentCurrentText[0]['node.text'] + '. '
                            deletedNodes.append(child['child']['key'])
                elif len(matchedStudentNodes) < len(studChildNodesList):
                    feedback = feedback + 'There is/are ' + \
                        str(len(teachChildNodesList) - len(studChildNodesList)) + \
                        ' missing block(s) that should be connected to block:' + \
                        studentCurrentText[0]['node.text'] + ' and ' + \
                        str(len(studChildNodesList) - len(matchedStudentNodes)) + \
                        ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + \
                        ' is/are substituted - The incorrect blocks are '
                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = \
                        checkForCurrentNodeChildMatch('deletedSubstitutedCaller', graph,
                                                      matchedStudentNodes,
                                                      notMatchedParentTeacherNodes,
                                                      studChildNodesList, studVisitedNodes,
                                                      studStack, teachStack, feedback,
                                                      studentCurrentText[0]['node.text'])
                    if len(handledStudentNodeList) == len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child']['key'] in againNotMatchedTeacherNodes and \
                                    not child['child']['key'] in teachVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + \
                                    ' that should be connected to block:' + \
                                    studentCurrentText[0]['node.text'] + ' is missing and '
                                deletedNodes.append(child['child']['key'])
                    elif len(handledStudentNodeList) < len(studChildNodesList):
                        for child in teachChildNodesList:
                            if child['child']['key'] in againNotMatchedTeacherNodes and \
                                    not child['child']['key'] in teachVisitedNodes:
                                feedback = feedback + ' block:' + child['child']['text'] + \
                                    ' that should be/is connected to block:' + \
                                    studentCurrentText[0]['node.text'] + \
                                    ' is deleted/substituted and the immediate child blocks of this block are also wrong, please check them, and '
                                delOrSubNodes.append(child['child']['key'])
                    feedback = feedback + 'please check all these incorrect blocks. '

            # handles scenario where student graph has additional child nodes for the
            # current node under consideration
            elif len(teachChildNodesList) < len(studChildNodesList):
                totNoOfAdditionalNodes = totNoOfAdditionalNodes + \
                    (len(studChildNodesList) - len(teachChildNodesList))
                # handles scenario where all teacher nodes are matched and there are
                # additional nodes
                if len(matchedStudentNodes) == len(teachChildNodesList):
                    for child in studChildNodesList:
                        if not child['child']['key'] in matchedStudentNodes and \
                                not child['child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional Block:' + child['child']['text'] + \
                                ' is connected to block:' + \
                                studentCurrentText[0]['node.text'] + '. '
                            additionalNodes.append(child['child']['key'])
                        elif not child['child']['key'] in matchedStudentNodes and \
                                child['child']['key'] in studVisitedNodes:
                            feedback = feedback + 'Additional connection from block:' + \
                                studentCurrentText[0]['node.text'] + ' to block:' + \
                                child['child']['text'] + '. \n'
                elif len(matchedStudentNodes) < len(teachChildNodesList):
                    feedback = feedback + 'There is/are ' + \
                        str(len(studChildNodesList) - len(teachChildNodesList)) + \
                        ' additional block(s) connected to block:' + \
                        studentCurrentText[0]['node.text'] + ' and ' + \
                        str(len(teachChildNodesList) - len(matchedStudentNodes)) + \
                        ' block(s) connected to block:' + studentCurrentText[0]['node.text'] + \
                        ' is/are substituted - The incorrect blocks are '
                    againNotMatchedTeacherNodes, handledStudentNodeList, feedback = \
                        checkForCurrentNodeChildMatch('additionalSubstitutedCaller', graph,
                                                      matchedStudentNodes,
                                                      notMatchedParentTeacherNodes,
                                                      studChildNodesList, studVisitedNodes,
                                                      studStack, teachStack, feedback,
                                                      studentCurrentText[0]['node.text'])
                    if len(handledStudentNodeList) == len(teachChildNodesList):
                        for child in studChildNodesList:
                            if not child['child']['key'] in handledStudentNodeList and \
                                    not child['child']['key'] in studVisitedNodes:
                                feedback = feedback + 'block:' + child['child']['text'] + \
                                    ' connected to block:' + studentCurrentText[0]['node.text'] + \
                                    ' is additional and '
                                additionalNodes.append(child['child']['key'])
                    elif len(handledStudentNodeList) < len(teachChildNodesList):
                        for child in studChildNodesList:
                            if not child['child']['key'] in handledStudentNodeList and \
                                    not child['child']['key'] in studVisitedNodes:
                                feedback = feedback + ' block: ' + child['child']['text'] + \
                                    ' connected to block:' + studentCurrentText[0]['node.text'] + \
                                    ' is additional/substituted and the immediate child blocks of this block are also wrong, please check them, and '
                                addOrSubNodes.append(child['child']['key'])
                    feedback = feedback + 'please check all these incorrect blocks. \n'

            # reset the per-level bookkeeping before moving to the next node
            matchedTeacherNodes = []
            matchedStudentNodes = []
            notMatchedParentTeacherNodes = []
            teachVisitedNodes.append(teachCurrent)
            studVisitedNodes.append(studCurrent)
        elif studStack and not teachStack:
            print('^^^^^^^^^^^^^^^STUDENT stack has moreeee.....')
            break

    # handles additional nodes down an additional-node starting path
    if additionalNodes:
        feedback, totNoOfAdditionalNodes = detectUndetectedBlocks(
            "additionalNodes", graph, additionalNodes, studVisitedNodes, feedback,
            totNoOfAdditionalNodes)
    # handles deleted nodes down a deleted-node starting path
    if deletedNodes:
        feedback, totNoOfDeletedNodes = detectUndetectedBlocks(
            "deletedNodes", graph, deletedNodes, teachVisitedNodes, feedback,
            totNoOfDeletedNodes)
    # handles substituted nodes down a substituted-node starting path
    if substitutedNodes:
        feedback, totNoOfOtherSubstitutedNodes = detectUndetectedBlocks(
            "substitutedNodes", graph, substitutedNodes, studVisitedNodes, feedback,
            totNoOfOtherSubstitutedNodes)
    # handles additional/substituted nodes down an additional/substituted-node starting path
    if addOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "addOrSubNodes", graph, addOrSubNodes, studVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)
    # handles deleted/substituted nodes down a deleted/substituted-node starting path
    if delOrSubNodes:
        feedback, totNoOfOtherIncorrectNodes = detectUndetectedBlocks(
            "delOrSubNodes", graph, delOrSubNodes, teachVisitedNodes, feedback,
            totNoOfOtherIncorrectNodes)

    if totNoOfAdditionalNodes == 0 and totNoOfDeletedNodes == 0 and \
            totNoOfSubstitutedNodes == 0 and totNoOfOtherSubstitutedNodes == 0 and \
            totNoOfOtherIncorrectNodes == 0:
        print(totNoOfMatchedNodes)
        feedback = feedback + "Excellent Job! All the blocks and the flow are correct!"
        print(feedback)
    else:
        feedback = feedback + "Number of correct blocks except start and end blocks: " + \
            str(totNoOfMatchedNodes) + ". "
        print(feedback)
    allocateMarksAndSaveToDatabase(totNoOfMatchedNodes, totNoOfAdditionalNodes,
                                   totNoOfDeletedNodes, totNoOfSubstitutedNodes,
                                   totNoOfOtherSubstitutedNodes, totNoOfOtherIncorrectNodes,
                                   feedback, processQuestionId, studentAnswerId)
nlmTwoNamesInsample = pickle.load(open(nlmInsampleFile + 'secondName', 'rb'))
nlmTwoNamesOutsample = pickle.load(open(nlmOutsampleFile + 'secondName', 'rb'))

SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data
SOInsampleData = pickle.load(open(SOInsampleFile, 'rb'))
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(), sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.WhitespaceTokenizer(return_set=True))
# renamed from the duplicate label 'Dice' so the q-gram variant has a unique name
diceq3 = FVC.stringMatchExcerpts('FuzzDice', sm.Dice(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())