def tok_qgram(input_string, q):
    """
    This function splits the input string into a list of q-grams. Note that,
    by default, the input string is padded before it is tokenized.

    Args:
        input_string (string): Input string that should be tokenized.
        q (int): q value that should be used to tokenize the input string.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_qgram('database', q=2)
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> em.tok_qgram('database', q=3)
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> em.tok_qgram(None, q=2)
        nan
    """

    if pd.isnull(input_string):
        return pd.np.NaN

    input_string = gh.convert_to_str_unicode(input_string)
    measure = sm.QgramTokenizer(qval=q)

    return measure.tokenize(input_string)
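For reference, a minimal sketch (assuming only that py_stringmatching is installed) of the padded tokenization the function delegates to:

import py_stringmatching as sm

# QgramTokenizer pads with '#' and '$' by default, which is why the docstring
# output above starts with '#d' and ends with 'e$'.
tok = sm.QgramTokenizer(qval=2)
print(tok.tokenize('database'))
# ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']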
Example 2
    def filter_person(self, df, lim):
        # Split the ';'-delimited ENHANCEDPERSONS field into a de-duplicated
        # list of lower-cased person names (the part before the first comma).
        A = df.ENHANCEDPERSONS.dropna().apply(lambda x: list(
            set([y.split(',')[0].lower() for y in x.split(';')])))
        # Explode the name lists into one row per (record, name) pair and
        # flatten the (row, position) index into a single 'id' column.
        A = A.apply(pd.Series).stack()
        A.index = A.index.map(lambda i: "{}_{}".format(i[0], i[1]))
        A = A.reset_index()
        A.columns = ['id', 'name']

        B = self.fetch_person()
        B.name = B.name.str.replace(r'_+', ' ').str.lower()

        qg3_tok = sm.QgramTokenizer(qval=3)
        C = ssj.jaccard_join(A,
                             B,
                             'id',
                             'id',
                             'name',
                             'name',
                             qg3_tok,
                             lim,
                             l_out_attrs=['name'],
                             r_out_attrs=['name'],
                             show_progress=False)

        return set(C.l_id.apply(lambda x: int(x.split("_")[0])))
Example 3
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
Example 4
    def tok_qgram(s):
        # return missing values (NaN/None) as-is
        if pd.isnull(s):
            return s

        s = gh.convert_to_str_unicode(s)

        measure = sm.QgramTokenizer(qval=q)
        return measure.tokenize(s)
Example 5
    def tok_qgram(s):
        # check if the input is of type base string
        if pd.isnull(s):
            return s
        if not (isinstance(s, six.string_types) or isinstance(s, bytes)):
            s = str(s)
        else:
            if isinstance(s, bytes):
                s = s.decode('utf-8')

        measure = sm.QgramTokenizer(qval=q)
        return measure.tokenize(s)
Example 6
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
     self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
     self.aggcl = AgglomerativeClustering(n_clusters=5,
                                          affinity='precomputed',
                                          linkage='complete')
     self.labels = self.aggcl.fit_predict(self.sim_matrix)
Example 7
def jac_q3_sim(str1, str2):
    try:
        # not needed, as all values were already cast to string, lower-cased,
        # and stripped before being handed over
        #str1 = str(str1).lower().strip()
        #str2 = str(str2).lower().strip()
        # assign a sim score of -1 when one of them is null
        if (str1 == 'nan' or str2 == 'nan' or str1 == '' or str2 == ''):
            return -1
        else:
            q3_tok = sm.QgramTokenizer(qval=3, return_set=True)
            jac = sm.Jaccard()
            return jac.get_raw_score(q3_tok.tokenize(str1),
                                     q3_tok.tokenize(str2))
    except Exception:
        logger.warning('Issue with Jaccard_q3_Sim, hence -1 assigned')
        return -1
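A minimal usage sketch (made-up inputs) of the behaviour described in the comments above:

# Identical strings share every 3-gram, so the Jaccard score is 1.0.
print(jac_q3_sim('database', 'database'))   # 1.0
# Empty or 'nan' values are treated as missing and get the -1 sentinel.
print(jac_q3_sim('', 'database'))           # -1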
Example 8
    def get_oov_jaccard_sim(self, s1, s2):
        en_tokens_f = word_tokenize(s1.lower())
        de_tokens_f = word_tokenize(s2.lower())

        # Replace OOV tokens with their matches, if a match has been found
        en_tokens = []
        for token in en_tokens_f:
            if token in self.en_oov:
                for el in self.en_oov[token]:
                    en_tokens.append(el)
            else:
                en_tokens.append(token)

        de_tokens = []
        for token in de_tokens_f:
            if token in self.de_oov:
                for el in self.de_oov[token]:
                    de_tokens.append(el)
            else:
                de_tokens.append(token)

        new_en_tokens = [
            token for token in en_tokens
            if token not in self.en_dictionary and token not in self.en_oov
        ]
        new_de_tokens = [
            token for token in de_tokens
            if token not in self.de_dictionary and token not in self.de_oov
        ]

        new_en_str = " ".join(new_en_tokens)
        new_de_str = " ".join(new_de_tokens)

        if new_en_str == "" or new_de_str == "":
            return 0

        # Tokenize both strings into 3-grams

        measure = sm.QgramTokenizer(qval=3)
        en_grams = measure.tokenize(new_en_str)
        de_grams = measure.tokenize(new_de_str)

        # Compute the Jaccard similarity score

        measure = sm.Jaccard()
        return measure.get_raw_score(en_grams, de_grams)
Example 9
def main():
    import pickle
    import py_stringmatching as sm
    from sklearn.feature_extraction.text import TfidfVectorizer
    INSAMPLE_ABS_OUTFILE = '../dataCached/insample_abstracts_outfile'
    OUTSAMPLE_ABS_OUTFILE = '../dataCached/outSample_abstracts_outfile'
    OUTSAMPLE_ABS_REDUCED_OUTFILE = '../dataCached/outSample_abstracts_reduced_outfile'
    a1 = pickle.load(open(INSAMPLE_ABS_OUTFILE, 'rb'))
    a2 = pickle.load(open(OUTSAMPLE_ABS_OUTFILE, 'rb'))
    a3 = pickle.load(open(OUTSAMPLE_ABS_REDUCED_OUTFILE, 'rb'))
    csAbstract = CosSim('Cos Sim Abstract', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
    csSentence = CosSim('Cos Sim Sentence', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
    jacq3 = stringMatchExcerpts('Fuzzy Jaccard', sm.Jaccard(), sm.QgramTokenizer(qval=3))

    components = [csAbstract, csSentence, jacq3]
    a1Features = [c.generateFeatures(a1) for c in components]
    print(len(a1Features))
Example 10
def get_similar_strings(table_path):
    """
    Get the list of strings to be normalized by the value normalizer.
    The current algorithm is as follows:
        1. Deduplicate the given list of strings.
        2. Apply a string similarity join.
        3. Retrieve the top N similar strings returned by the similarity join.
    Arguments:
        table_path: The absolute path of the file containing the list of strings.
    Returns:
        similar_strings: Similar strings as returned by the aforementioned algorithm.

    Note: This logic can be changed to improve the value normalizer part of the
          overall application.
    """
    A = pd.read_csv(table_path)
    B = pd.read_csv(table_path)
    qg3_tok = sm.QgramTokenizer(qval=3)
    output_pairs = ssj.jaccard_join(A,
                                    B,
                                    'id',
                                    'id',
                                    'foo',
                                    'foo',
                                    qg3_tok,
                                    0.6,
                                    l_out_attrs=['foo'],
                                    r_out_attrs=['foo'])
    considered_pairs = []
    similar_strings = []
    for index, row in output_pairs.iterrows():
        if row['_sim_score'] > 0.6 and row['_sim_score'] < 1.0:
            if row['l_foo'] not in similar_strings:
                similar_strings.append(row['l_foo'])
            if row['r_foo'] not in similar_strings:
                similar_strings.append(row['r_foo'])
            if len(similar_strings) >= 21:
                break
    similar_strings.sort()
    return similar_strings
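A rough illustration of the algorithm described in the docstring, using a hypothetical file and values (not from the original): only pairs whose 3-gram Jaccard score lies strictly between 0.6 and 1.0 contribute, so exact self-matches are skipped.

# import pandas as pd
# pd.DataFrame({'id': [0, 1, 2],
#               'foo': ['IBM Corp', 'IBM Corp.', 'Acme']}).to_csv('strings.csv', index=False)
# get_similar_strings('strings.csv')
# -> ['IBM Corp', 'IBM Corp.']   (their 3-gram Jaccard score is roughly 0.62, just above 0.6)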
Example 11
import numpy as np
import os

def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

DEFAULTFV = [jacq3,cosM,cosMq3,LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisticRegression'
DEFAULTITERATIONS = 25


class join:
    def __init__(self,insampleData,outsampleData,dataFolder):
        self.insampleData = insampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.outsampleData = outsampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.dataFolder = dataFolder
Example 12
import pandas as pd
from .util import suffix
import py_stringmatching as sm
from remp import string_matching

tokenizer = sm.QgramTokenizer(qval=2, return_set=True)
jaccard = sm.Jaccard()


def similarity_func_default(string1, string2):
    return jaccard.get_sim_score(tokenizer.tokenize(string1),
                                 tokenizer.tokenize(string2))
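A minimal usage sketch of the default similarity function: identical strings share every padded 2-gram and score 1.0, while strings with no 2-grams in common score 0.0.

print(similarity_func_default('alpha', 'alpha'))   # 1.0
print(similarity_func_default('alpha', 'zzz'))     # 0.0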


def construct_similarity_list(left_triples,
                              right_triples,
                              entity_candidates,
                              aligned_attributes=None,
                              similarity_func=None):
    if aligned_attributes is None:
        shared_attributes = set(left_triples['a'].unique())
        shared_attributes &= set(right_triples['a'].unique())
        shared_attributes = list(shared_attributes)
        aligned_attributes = pd.DataFrame({
            'a1': shared_attributes,
            'a2': shared_attributes
        })
    if 'attr_id' not in aligned_attributes:
        aligned_attributes['attr_id'] = aligned_attributes.index
    paired = pd.merge(entity_candidates, suffix(left_triples, '1'))
    paired = pd.merge(paired, aligned_attributes)
Example 13
 def __init__(self):
     self.dice = py_stringmatching.Dice()
     self.tokenizer = py_stringmatching.QgramTokenizer(qval=3)
Example 14
from typing import Callable, List, Dict, Tuple, Sequence, NewType
from dataclasses import dataclass

import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support

import py_stringsimjoin as ssj
import py_stringmatching as sm

WS = sm.WhitespaceTokenizer(return_set=True)
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)


def simjoin_top_k_pd(routine, params, k_max, thresh=None, suppress=True, early_break=True):
    knn_results = defaultdict(list)
    for k in range(1, k_max+1):
        ret_avg, ret_count, all_avg, all_count, MRR, retrieved = routine(*params, k=k, thresh=thresh)
        if not suppress:
            print(f"k: {k} \t ret avg: {ret_avg} \t ret_count: {ret_count} \t ret avg: {all_avg} \t ret_count: {all_count} \t MRR: {MRR} \t retrieved: {retrieved}")
        knn_results['k'].append(k)
        knn_results['ret_avg'].append(ret_avg)
        knn_results['ret_count'].append(ret_count)
        knn_results['all_avg'].append(all_avg)
        knn_results['all_count'].append(all_count)
        knn_results['MRR'].append(MRR)
        knn_results['retrieved'].append(retrieved)
        
        if early_break and ret_avg == 1.0 and all_avg == 1.0:
            break
Example 15
def tok_qgram(input_string, q):
    if pd.isnull(input_string):
        return pd.np.NaN

    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(input_string)
Example 16
import py_stringmatching as sm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from py_stringmatching import utils
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
#incomplete list of necessary imports
#we will need to import the necessary scikit learn packages when we get there

#Initialize the q-gram tokenizer
qg3_tok_set = sm.QgramTokenizer(qval=3, return_set=True)

#Initialize similarity score calculators
jac = sm.Jaccard()
oc = sm.OverlapCoefficient()

#Read the CSV into a DataFrame
gold_raw_data = pd.read_csv('gold.csv', low_memory=False)

#Extract the gold labels from the DataFrame
#This becomes an input into our Learning Algs
gold_labels = gold_raw_data['Match?']

#This is our feature vector table
#Another input into our Learning Algs.
#We add feature vectors from within the for
#loop that iterates over the DataFrame
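The feature-vector loop the comments describe is not shown in this excerpt; a minimal sketch of what it might look like, assuming hypothetical pair columns 'name_A' and 'name_B' in gold.csv:

feature_vectors = []
for _, row in gold_raw_data.iterrows():
    # Tokenize both sides into sets of 3-grams (hypothetical column names).
    toks_a = qg3_tok_set.tokenize(str(row['name_A']))
    toks_b = qg3_tok_set.tokenize(str(row['name_B']))
    # Two set-based similarity features per pair.
    feature_vectors.append([jac.get_sim_score(toks_a, toks_b),
                            oc.get_sim_score(toks_a, toks_b)])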
Example 17
import py_stringmatching as sm

alnum_tok = sm.AlphanumericTokenizer()
qg3_tok = sm.QgramTokenizer(qval=3)

jac = sm.Jaccard()
lev = sm.Levenshtein()


def calcola_similarita(string1, string2):
    a = jac.get_sim_score(alnum_tok.tokenize(string1),
                          alnum_tok.tokenize(string2))
    b = lev.get_sim_score(string1, string2)
    c = jac.get_sim_score(qg3_tok.tokenize(string1), qg3_tok.tokenize(string2))
    return [{"alnum_jac": a}, {"alnum_lev": b}, {"qg3_jac": c}]


def add_features(elem):
    line, count = elem
    title1 = line[4]
    director1 = line[3]
    date1 = line[5]
    title2 = line[7]
    director2 = line[6]
    date2 = line[8]
    return (line+(calcola_similarita(title1,title2),calcola_similarita(director1,director2),\
    calcola_similarita(date1,date2)),count)


def precision(true):
    tp = true.map(lambda row:("true",row.response))\
Example 18
SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data
SOInsampleData = pickle.load(open(SOInsampleFile, 'rb'))
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))

csAbstract = FVC.CosSim('CSAbs',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        False)
csSentence = FVC.CosSim('CSSent',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        True)
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(),
                              sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                               sm.WhitespaceTokenizer(return_set=True))
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
Example 19
    'cosine': sm.Cosine,
    'dice': sm.Dice,
    'generalized_jaccard': sm.GeneralizedJaccard,
    'jaccard': sm.Jaccard,
    'overlap_coefficient': sm.OverlapCoefficient,
    'tversky_index': sm.TverskyIndex,

    # Corpus
    'tfidf': sm.TfIdf,
    'soft_tfidf': sm.SoftTfIdf,
}

tokenizer_lookup = {

    # Character gram tokenizers
    '1gram': sm.QgramTokenizer(qval=1),
    '1grams': sm.QgramTokenizer(qval=1),
    '2grams': sm.QgramTokenizer(qval=2),
    '3grams': sm.QgramTokenizer(qval=3),
    '4grams': sm.QgramTokenizer(qval=4),
    '5grams': sm.QgramTokenizer(qval=5),
    '6grams': sm.QgramTokenizer(qval=6),
    '7grams': sm.QgramTokenizer(qval=7),
    '8grams': sm.QgramTokenizer(qval=8),
    '9grams': sm.QgramTokenizer(qval=9),
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),