def cosine(arr1, arr2):
    """This function computes the cosine measure between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input list or sets for which the
            cosine measure should be computed.

    Returns:
        The cosine measure if both the lists/set are not None and do not
        have any missing tokens (i.e NaN), else returns NaN.
    """
    # Missing input on either side -> NaN.
    # FIX: use float('nan') instead of pd.np.NaN; the pandas.np accessor
    # was deprecated in pandas 1.0 and removed in pandas 2.0 (np.nan is a
    # plain Python float, so the returned value is identical).
    if arr1 is None or arr2 is None:
        return float('nan')
    # Normalize a scalar to a single-element list so that pd.isnull() and
    # the token-based measure both receive list input.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return float('nan')
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return float('nan')
    # Create cosine measure object
    measure = sm.Cosine()
    # Call the function to compute the cosine measure.
    return measure.get_raw_score(arr1, arr2)
def cos_score(self, str_pair, sim_score=True):
    """Calculate cosine similarity between two single sets of tokens.

    :param str_pair: pair of inputs, validated/coerced by self._check_input
    :param sim_score: if True return the normalized similarity (0 to 1),
        otherwise the raw score
    :return: similarity score (0 to 1)
    """
    tokens_a, tokens_b = self._check_input(str_pair, type_=list)
    measure = sm.Cosine()
    if sim_score:
        return measure.get_sim_score(tokens_a, tokens_b)
    return measure.get_raw_score(tokens_a, tokens_b)
def cosine(arr1, arr2):
    """Compute the cosine measure between two input lists/sets.

    Args:
        arr1, arr2 (list or scalar): token collections; scalars are wrapped
            in a single-element list.

    Returns:
        float: the raw cosine score, or NaN when either input is None or
        contains a missing token.
    """
    # FIX: pd.np.NaN -> float('nan'); the pandas.np accessor was deprecated
    # in pandas 1.0 and removed in pandas 2.0. The value is unchanged
    # (np.nan is itself a Python float NaN).
    if arr1 is None or arr2 is None:
        return float('nan')
    # Wrap scalars so pd.isnull() and the measure see list input.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return float('nan')
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return float('nan')
    # Create cosine measure object
    measure = sm.Cosine()
    # Call the function to compute the cosine measure.
    return measure.get_raw_score(arr1, arr2)
def __init__(self):
    """Set up the candidate string-similarity measures and the tokenizer.

    self.similarity_function holds one instance of each py_stringmatching
    measure to evaluate; self.alphanumeric_tokenizer splits strings into a
    set of alphanumeric tokens.
    """
    measure_classes = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    self.similarity_function = [cls() for cls in measure_classes]
    # return_set=True: tokenizer yields a set (unique tokens), not a list.
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
def cosine(arr1, arr2): """ This function computes the cosine measure between the two input lists/sets. Args: arr1,arr2 (list or set): The input list or sets for which the cosine measure should be computed. Returns: The cosine measure if both the lists/set are not None and do not have any missing tokens (i.e NaN), else returns NaN. Examples: >>> import py_entitymatching as em >>> em.cosine(['data', 'science'], ['data']) 0.7071067811865475 >>> em.cosine(['data', 'science'], None) nan """ if arr1 is None or arr2 is None: return pd.np.NaN if not isinstance(arr1, list): arr1 = [arr1] if any(pd.isnull(arr1)): return pd.np.NaN if not isinstance(arr2, list): arr2 = [arr2] if any(pd.isnull(arr2)): return pd.np.NaN # Create cosine measure object measure = sm.Cosine() # Call the function to compute the cosine measure. return measure.get_raw_score(arr1, arr2)
SOInsampleData = pickle.load(open(SOInsampleFile, 'rb')) SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb')) nlmInsampleFile = 'NLMdata/dataCached/insample_abstracts_outfile' nlmOutsampleFile = 'NLMdata/dataCached/outSample_abstracts_outfile' nlmInsampleData = pickle.load(open(nlmInsampleFile, 'rb')) nlmOutsampleData = pickle.load(open(nlmOutsampleFile, 'rb')) # Instantiate FVComponent instances csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False) csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True) cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True)) LVDist = FVC.stringMatchTitles('LVDist', sm.Levenshtein()) FVCList = [csAbstract, csSentence, cosM, LVDist] def classifyAndPredict(insampleData, outsampleData, folderName, componentList): print len(insampleData[0]) print len(outsampleData[1]) # Declare instance of a join object with input arguments easyJoin = myJoin.join(insampleData, outsampleData, folderName) easyJoin.setComponentList(componentList) # Build feature vector easyJoin.buildInsampleFV() easyJoin.buildOutsampleFVReduced(0.01)
import os


def ensure_dir(file_path):
    """Create the parent directory of file_path if it does not exist."""
    # NOTE(review): if file_path has no directory component,
    # os.path.dirname returns '' and os.makedirs('') raises -- confirm
    # callers always pass paths containing a directory. There is also a
    # check-then-create race if two processes run this concurrently.
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


# Cache-file locations for the computed feature vectors.
INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

# Feature-vector components: TF-IDF cosine similarity over abstracts and
# sentences, plus py_stringmatching measures over excerpts/titles.
csAbstract = FVC.CosSim('CSAbs',TfidfVectorizer( ngram_range = ( 1, 3 ), sublinear_tf = True ),False)
csSentence = FVC.CosSim('CSSent',TfidfVectorizer( ngram_range = ( 1, 3 ), sublinear_tf = True ),True)
# Jaccard over 3-gram tokens and cosine over whitespace/q-gram tokens.
jacq3 = FVC.stringMatchExcerpts('FuzzJacc',sm.Jaccard(),sm.QgramTokenizer(qval=3,return_set = True))
cosM = FVC.stringMatchExcerpts('CosMeasure',sm.Cosine(),sm.WhitespaceTokenizer(return_set = True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure',sm.Cosine(),sm.QgramTokenizer(return_set = True))
LVdist = FVC.stringMatchTitles('LVDist',sm.Levenshtein())

# Defaults used when the caller does not supply a component list / model.
DEFAULTFV = [jacq3,cosM,cosMq3,LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisiticRegression'
DEFAULTITERATIONS = 25


class join:
    """Pairs in-sample and out-of-sample data for feature-vector building.

    The class presumably has more methods beyond this chunk -- only
    __init__ is visible here.
    """

    def __init__(self,insampleData,outsampleData,dataFolder):
        # Each *Data argument is a tuple-like container; index 1 holds the
        # labels (see self.labels below).
        self.insampleData = insampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.outsampleData = outsampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.dataFolder = dataFolder
        self.labels = insampleData[1]
# In[30]: df['Q2'] = df.apply( lambda x: jac.get_sim_score(x['Q-gram_2_Tokens1'], x['Q-gram_2_Tokens2']), axis=1) df['Q3'] = df.apply( lambda x: jac.get_sim_score(x['Q-gram_3_Tokens1'], x['Q-gram_3_Tokens2']), axis=1) df['Q4'] = df.apply( lambda x: jac.get_sim_score(x['Q-gram_4_Tokens1'], x['Q-gram_4_Tokens2']), axis=1) df.head() # In[31]: cos = sm.Cosine() df['Cosine'] = df.apply( lambda x: cos.get_sim_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[32]: dice = sm.Dice() df['Dice'] = df.apply(lambda x: dice.get_sim_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[33]: oc = sm.OverlapCoefficient() df['Overlap'] = df.apply(