Ejemplo n.º 1
0
def dice(arr1, arr2):
    """
    Compute the Dice score between two token lists.

    Args:
        arr1,arr2 (list): The token lists to compare. A value that is not
            a list is wrapped in a single-element list before scoring.

    Returns:
        The value of ``sm.Dice().get_raw_score(arr1, arr2)`` if neither
        input is None and neither contains a missing token (as reported
        by ``pd.isnull``), else NaN.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so
    # the NaN result is built directly instead of going through ``pd.np.NaN``.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan

    # NOTE(review): a set argument is not a list, so it is wrapped as one
    # single token here rather than treated as a collection - confirm that
    # this is intended.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan

    # Create Dice object
    measure = sm.Dice()
    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 2
0
def dice(arr1, arr2):
    """
    Compute the Dice score between two token lists.

    Args:
        arr1,arr2 (list): The token lists to compare. A value that is not
            a list is wrapped in a single-element list before scoring.

    Returns:
        NaN if either input is None or contains a missing token (as
        reported by ``pd.isnull``); otherwise the value returned by
        ``sm.Dice().get_raw_score`` for the two lists.
    """
    # Avoid ``pd.np.NaN``: the ``pandas.np`` alias was deprecated in
    # pandas 1.0 and removed in pandas 2.0.
    missing = float('nan')

    if arr1 is None or arr2 is None:
        return missing
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return missing
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return missing

    # Create Dice object
    measure = sm.Dice()
    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 3
0
    def __init__(self):
        """Set up the similarity measures and the tokenizer.

        ``similarity_function`` becomes a list holding one
        default-constructed instance of each measure class below, in the
        order listed. ``alphanumeric_tokenizer`` is an
        ``sm.AlphanumericTokenizer`` created with ``return_set=True``.
        """
        measure_classes = (
            sm.BagDistance,
            sm.Cosine,
            sm.Dice,
            sm.Editex,
            sm.GeneralizedJaccard,
            sm.Jaccard,
            sm.Jaro,
            sm.JaroWinkler,
            sm.Levenshtein,
            sm.OverlapCoefficient,
            sm.TverskyIndex,
        )
        # Instantiate every measure with its default arguments.
        self.similarity_function = [
            measure_cls() for measure_cls in measure_classes
        ]

        tokenizer = sm.AlphanumericTokenizer(return_set=True)
        self.alphanumeric_tokenizer = tokenizer
Ejemplo n.º 4
0
def dice(arr1, arr2):
    """
    This function computes the Dice score between the two input lists.

    Args:
        arr1,arr2 (list): The input lists for which the Dice score should
            be computed. A value that is not a list is wrapped in a
            single-element list before scoring.

    Returns:
        The Dice score if both inputs are not None and do not have any
        missing tokens (i.e. NaN, as reported by ``pd.isnull``), else
        returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.dice(['data', 'science'], ['data'])
        0.6666666666666666
        >>> em.dice(['data', 'science'], None)
        nan

    """
    # ``pd.np.NaN`` relied on the ``pandas.np`` alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0; a plain float NaN needs no
    # alias.
    nan_score = float('nan')

    if arr1 is None or arr2 is None:
        return nan_score
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan_score
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan_score

    # Create Dice object
    measure = sm.Dice()
    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 5
0
    axis=1)
# Score the two 'Q-gram_4_Tokens*' columns row by row with `jac`
# (created earlier in the script) and store the result in 'Q4'.
df['Q4'] = df.apply(
    lambda row: jac.get_sim_score(
        row['Q-gram_4_Tokens1'],
        row['Q-gram_4_Tokens2'],
    ),
    axis=1,
)
df.head()

# In[31]:

# Cosine measure over the 'aTokens' / 'bTokens' columns.
cos = sm.Cosine()
df['Cosine'] = df.apply(
    lambda row: cos.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[32]:

# Dice measure over the same token columns.
dice = sm.Dice()
df['Dice'] = df.apply(
    lambda row: dice.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[33]:

# Overlap coefficient over the same token columns.
oc = sm.OverlapCoefficient()
df['Overlap'] = df.apply(
    lambda row: oc.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[34]:

# Set alpha and beta; see https://en.wikipedia.org/wiki/Tversky_index
# With alpha = beta = 0.5 the Tversky index is the same as the Dice similarity
Ejemplo n.º 6
0
 def __init__(self):
     """Attach a Dice measure and a q-gram tokenizer (qval=3) to the instance."""
     qgram_size = 3
     self.dice = py_stringmatching.Dice()
     self.tokenizer = py_stringmatching.QgramTokenizer(qval=qgram_size)
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    # The original passed a bare ``open(...)`` to ``pickle.load`` and never
    # closed the handle; the ``with`` block releases it even on error.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Paths of the in-sample and out-of-sample pickles under 'stackoverflowdata/'.
SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data
# NOTE: unpickling can execute arbitrary code; only load files from a
# trusted source.
SOInsampleData = _load_pickle(SOInsampleFile)
SOOutsampleData = _load_pickle(SOOutsampleFile)

# TF-IDF cosine-similarity extractors. Both use 1- to 3-gram vectorizers
# with sublinear TF and differ only in their label and the final boolean
# flag passed to FVC.CosSim.
csAbstract = FVC.CosSim(
    'CSAbs',
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
    False,
)
csSentence = FVC.CosSim(
    'CSSent',
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
    True,
)

# Set-based measures on excerpts: each measure is paired once with a
# whitespace tokenizer and once with a q-gram tokenizer.
jac = FVC.stringMatchExcerpts(
    'Jacc',
    sm.Jaccard(),
    sm.WhitespaceTokenizer(return_set=True),
)
jacq3 = FVC.stringMatchExcerpts(
    'FuzzJacc',
    sm.Jaccard(),
    sm.QgramTokenizer(qval=3, return_set=True),
)
dice = FVC.stringMatchExcerpts(
    'Dice',
    sm.Dice(),
    sm.WhitespaceTokenizer(return_set=True),
)
# NOTE(review): this reuses the label 'Dice' from the extractor above,
# while the other q-gram variants use a 'Fuzz' prefix - confirm whether
# the duplicate label is intended.
diceq3 = FVC.stringMatchExcerpts(
    'Dice',
    sm.Dice(),
    sm.QgramTokenizer(qval=3, return_set=True),
)
cosM = FVC.stringMatchExcerpts(
    'CosMeasure',
    sm.Cosine(),
    sm.WhitespaceTokenizer(return_set=True),
)
# NOTE(review): unlike jacq3 and diceq3, no qval is passed here, so the
# tokenizer's default q-gram size applies - confirm that is intended for
# a variable named "q3".
cosMq3 = FVC.stringMatchExcerpts(
    'FuzzCosMeasure',
    sm.Cosine(),
    sm.QgramTokenizer(return_set=True),
)

# Sequence-based measures on titles; no tokenizer is passed.
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
    wr = csv.writer(open(fileName, 'wb'), quoting=csv.QUOTE_ALL)