def tok_qgram(input_string, q):
    """
    This function splits the input string into a list of q-grams. Note that,
    by default, the input string is padded before it is tokenized.

    Args:
        input_string (string): Input string that should be tokenized.
        q (int): q value that should be used to tokenize the input string.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_qgram('database', q=2)
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> em.tok_qgram('database', q=3)
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> em.tok_qgram(None, q=2)
        nan
    """

    if pd.isnull(input_string):
        return pd.np.NaN

    input_string = gh.convert_to_str_unicode(input_string)
    measure = sm.QgramTokenizer(qval=q)

    return measure.tokenize(input_string)
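For reference, a minimal sketch (assuming only that py_stringmatching is installed) of the padded tokenization the function delegates to:

import py_stringmatching as sm

# QgramTokenizer pads with '#' and '$' by default, which is why the docstring
# output above starts with '#d' and ends with 'e$'.
tok = sm.QgramTokenizer(qval=2)
print(tok.tokenize('database'))
# ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']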
Example 2
    def filter_person(self, df, lim):
        # Split the ';'-delimited ENHANCEDPERSONS field into a de-duplicated
        # list of lower-cased person names (the part before the first comma).
        A = df.ENHANCEDPERSONS.dropna().apply(lambda x: list(
            set([y.split(',')[0].lower() for y in x.split(';')])))
        # Explode the name lists into one row per (record, name) pair and
        # flatten the (row, position) index into a single 'id' column.
        A = A.apply(pd.Series).stack()
        A.index = A.index.map(lambda i: "{}_{}".format(i[0], i[1]))
        A = A.reset_index()
        A.columns = ['id', 'name']

        B = self.fetch_person()
        B.name = B.name.str.replace(r'_+', ' ').str.lower()

        qg3_tok = sm.QgramTokenizer(qval=3)
        C = ssj.jaccard_join(A,
                             B,
                             'id',
                             'id',
                             'name',
                             'name',
                             qg3_tok,
                             lim,
                             l_out_attrs=['name'],
                             r_out_attrs=['name'],
                             show_progress=False)

        return set(C.l_id.apply(lambda x: int(x.split("_")[0])))
Example 3
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
Example 4
    def tok_qgram(s):
        # return missing values (NaN/None) as-is
        if pd.isnull(s):
            return s

        s = gh.convert_to_str_unicode(s)

        measure = sm.QgramTokenizer(qval=q)
        return measure.tokenize(s)
Example 5
    def tok_qgram(s):
        # check if the input is of type base string
        if pd.isnull(s):
            return s
        if not (isinstance(s, six.string_types) or isinstance(s, bytes)):
            s = str(s)
        else:
            if isinstance(s, bytes):
                s = s.decode('utf-8')

        measure = sm.QgramTokenizer(qval=q)
        return measure.tokenize(s)
Example 6
 def setUp(self):
     self.df = read_data(path_big_ten)
     self.trigramtok = sm.QgramTokenizer(qval=3)
     self.blocked_pairs = ssj.jaccard_join(self.df, self.df, 'id', 'id',
                                           'name', 'name', self.trigramtok,
                                           0.3)
     self.jaccsim = sm.Jaccard()
     self.sim_scores = get_sim_scores(self.df, self.blocked_pairs,
                                      self.trigramtok, self.jaccsim)
     self.sim_matrix = get_sim_matrix(self.df, self.sim_scores)
     self.aggcl = AgglomerativeClustering(n_clusters=5,
                                          affinity='precomputed',
                                          linkage='complete')
     self.labels = self.aggcl.fit_predict(self.sim_matrix)
Example 7
def jac_q3_sim(str1, str2):
    try:
        # not needed, as all values were already cast to string, lower-cased,
        # and stripped before being handed over
        #str1 = str(str1).lower().strip()
        #str2 = str(str2).lower().strip()
        # assign a sim score of -1 when one of them is null
        if (str1 == 'nan' or str2 == 'nan' or str1 == '' or str2 == ''):
            return -1
        else:
            q3_tok = sm.QgramTokenizer(qval=3, return_set=True)
            jac = sm.Jaccard()
            return jac.get_raw_score(q3_tok.tokenize(str1),
                                     q3_tok.tokenize(str2))
    except Exception:
        logger.warning('Issue with Jaccard_q3_Sim, hence -1 assigned')
        return -1
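A minimal usage sketch (made-up inputs) of the behaviour described in the comments above:

# Identical strings share every 3-gram, so the Jaccard score is 1.0.
print(jac_q3_sim('database', 'database'))   # 1.0
# Empty or 'nan' values are treated as missing and get the -1 sentinel.
print(jac_q3_sim('', 'database'))           # -1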
Example 8
    def get_oov_jaccard_sim(self, s1, s2):
        en_tokens_f = word_tokenize(s1.lower())
        de_tokens_f = word_tokenize(s2.lower())

        # Replace OOV tokens with their matches, if a match has been found
        en_tokens = []
        for token in en_tokens_f:
            if token in self.en_oov:
                for el in self.en_oov[token]:
                    en_tokens.append(el)
            else:
                en_tokens.append(token)

        de_tokens = []
        for token in de_tokens_f:
            if token in self.de_oov:
                for el in self.de_oov[token]:
                    de_tokens.append(el)
            else:
                de_tokens.append(token)

        new_en_tokens = [
            token for token in en_tokens
            if token not in self.en_dictionary and token not in self.en_oov
        ]
        new_de_tokens = [
            token for token in de_tokens
            if token not in self.de_dictionary and token not in self.de_oov
        ]

        new_en_str = " ".join(new_en_tokens)
        new_de_str = " ".join(new_de_tokens)

        if new_en_str == "" or new_de_str == "":
            return 0

        # Tokenize both strings into 3-grams

        measure = sm.QgramTokenizer(qval=3)
        en_grams = measure.tokenize(new_en_str)
        de_grams = measure.tokenize(new_de_str)

        # Compute the Jaccard similarity score

        measure = sm.Jaccard()
        return measure.get_raw_score(en_grams, de_grams)
Example 9
def main():
    import pickle
    import py_stringmatching as sm
    from sklearn.feature_extraction.text import TfidfVectorizer
    INSAMPLE_ABS_OUTFILE = '../dataCached/insample_abstracts_outfile'
    OUTSAMPLE_ABS_OUTFILE = '../dataCached/outSample_abstracts_outfile'
    OUTSAMPLE_ABS_REDUCED_OUTFILE = '../dataCached/outSample_abstracts_reduced_outfile'
    a1 = pickle.load(open(INSAMPLE_ABS_OUTFILE, 'rb'))
    a2 = pickle.load(open(OUTSAMPLE_ABS_OUTFILE, 'rb'))
    a3 = pickle.load(open(OUTSAMPLE_ABS_REDUCED_OUTFILE, 'rb'))
    csAbstract = CosSim('Cos Sim Abstract', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
    csSentence = CosSim('Cos Sim Sentence', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
    jacq3 = stringMatchExcerpts('Fuzzy Jaccard', sm.Jaccard(), sm.QgramTokenizer(qval=3))

    components = [csAbstract, csSentence, jacq3]
    a1Features = [c.generateFeatures(a1) for c in components]
    print(len(a1Features))
Example 10
def get_similar_strings(table_path):
    """
    Get the list of strings to be normalized by the value normalizer.
    The current algorithm is as follows:
        1. Deduplicate the given list of strings.
        2. Apply a string similarity join.
        3. Retrieve the top N similar strings returned by the similarity join.
    Arguments:
        table_path: The absolute path of the file containing the list of strings.
    Returns:
        similar_strings: Similar strings as returned by the aforementioned algorithm.

    Note: This logic can be changed to improve the value normalizer part of the
          overall application.
    """
    A = pd.read_csv(table_path)
    B = pd.read_csv(table_path)
    qg3_tok = sm.QgramTokenizer(qval=3)
    output_pairs = ssj.jaccard_join(A,
                                    B,
                                    'id',
                                    'id',
                                    'foo',
                                    'foo',
                                    qg3_tok,
                                    0.6,
                                    l_out_attrs=['foo'],
                                    r_out_attrs=['foo'])
    considered_pairs = []
    similar_strings = []
    for index, row in output_pairs.iterrows():
        if row['_sim_score'] > 0.6 and row['_sim_score'] < 1.0:
            if row['l_foo'] not in similar_strings:
                similar_strings.append(row['l_foo'])
            if row['r_foo'] not in similar_strings:
                similar_strings.append(row['r_foo'])
            if len(similar_strings) >= 21:
                break
    similar_strings.sort()
    return similar_strings
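A rough illustration of the algorithm described in the docstring, using a hypothetical file and values (not from the original): only pairs whose 3-gram Jaccard score lies strictly between 0.6 and 1.0 contribute, so exact self-matches are skipped.

# import pandas as pd
# pd.DataFrame({'id': [0, 1, 2],
#               'foo': ['IBM Corp', 'IBM Corp.', 'Acme']}).to_csv('strings.csv', index=False)
# get_similar_strings('strings.csv')
# -> ['IBM Corp', 'IBM Corp.']   (their 3-gram Jaccard score is roughly 0.62, just above 0.6)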
Example 11
import numpy as np
import os

def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

DEFAULTFV = [jacq3,cosM,cosMq3,LVdist]
DEFAULTMODEL = LR()
DEFAULTMODELNAME = 'LogisticRegression'
DEFAULTITERATIONS = 25


class join:
    def __init__(self,insampleData,outsampleData,dataFolder):
        self.insampleData = insampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.outsampleData = outsampleData #pairs,labels,pairedAbstracts,pairedTitles
        self.dataFolder = dataFolder
Example 12
import pandas as pd
from .util import suffix
import py_stringmatching as sm
from remp import string_matching

tokenizer = sm.QgramTokenizer(qval=2, return_set=True)
jaccard = sm.Jaccard()


def similarity_func_default(string1, string2):
    return jaccard.get_sim_score(tokenizer.tokenize(string1),
                                 tokenizer.tokenize(string2))
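A minimal usage sketch of the default similarity function: identical strings share every padded 2-gram and score 1.0, while strings with no 2-grams in common score 0.0.

print(similarity_func_default('alpha', 'alpha'))   # 1.0
print(similarity_func_default('alpha', 'zzz'))     # 0.0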


def construct_similarity_list(left_triples,
                              right_triples,
                              entity_candidates,
                              aligned_attributes=None,
                              similarity_func=None):
    if aligned_attributes is None:
        shared_attributes = set(left_triples['a'].unique())
        shared_attributes &= set(right_triples['a'].unique())
        shared_attributes = list(shared_attributes)
        aligned_attributes = pd.DataFrame({
            'a1': shared_attributes,
            'a2': shared_attributes
        })
    if 'attr_id' not in aligned_attributes:
        aligned_attributes['attr_id'] = aligned_attributes.index
    paired = pd.merge(entity_candidates, suffix(left_triples, '1'))
    paired = pd.merge(paired, aligned_attributes)
Example 13
 def __init__(self):
     self.dice = py_stringmatching.Dice()
     self.tokenizer = py_stringmatching.QgramTokenizer(qval=3)
Example 14
from typing import Callable, List, Dict, Tuple, Sequence, NewType
from dataclasses import dataclass

import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support

import py_stringsimjoin as ssj
import py_stringmatching as sm

WS = sm.WhitespaceTokenizer(return_set=True)
TWO_GRAM = sm.QgramTokenizer(qval=2, return_set=True)


def simjoin_top_k_pd(routine, params, k_max, thresh=None, suppress=True, early_break=True):
    knn_results = defaultdict(list)
    for k in range(1, k_max+1):
        ret_avg, ret_count, all_avg, all_count, MRR, retrieved = routine(*params, k=k, thresh=thresh)
        if not suppress:
            print(f"k: {k} \t ret avg: {ret_avg} \t ret_count: {ret_count} \t ret avg: {all_avg} \t ret_count: {all_count} \t MRR: {MRR} \t retrieved: {retrieved}")
        knn_results['k'].append(k)
        knn_results['ret_avg'].append(ret_avg)
        knn_results['ret_count'].append(ret_count)
        knn_results['all_avg'].append(all_avg)
        knn_results['all_count'].append(all_count)
        knn_results['MRR'].append(MRR)
        knn_results['retrieved'].append(retrieved)
        
        if early_break and ret_avg == 1.0 and all_avg == 1.0:
            break
Example 15
def tok_qgram(input_string, q):
    if pd.isnull(input_string):
        return pd.np.NaN

    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(input_string)
Example 16
import py_stringmatching as sm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from py_stringmatching import utils
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
#incomplete list of necessary imports
#we will need to import the necessary scikit learn packages when we get there

#Initialize the q-gram tokenizer
qg3_tok_set = sm.QgramTokenizer(qval=3, return_set=True)

#Initialize similarity score calculators
jac = sm.Jaccard()
oc = sm.OverlapCoefficient()

#Read the CSV into a DataFrame
gold_raw_data = pd.read_csv('gold.csv', low_memory=False)

#Extract the gold labels from the DataFrame
#This becomes an input into our Learning Algs
gold_labels = gold_raw_data['Match?']

#This is our feature vector table
#Another input into our Learning Algs.
#We add feature vectors from within the for
#loop that iterates over the DataFrame
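The feature-vector loop the comments describe is not shown in this excerpt; a minimal sketch of what it might look like, assuming hypothetical pair columns 'name_A' and 'name_B' in gold.csv:

feature_vectors = []
for _, row in gold_raw_data.iterrows():
    # Tokenize both sides into sets of 3-grams (hypothetical column names).
    toks_a = qg3_tok_set.tokenize(str(row['name_A']))
    toks_b = qg3_tok_set.tokenize(str(row['name_B']))
    # Two set-based similarity features per pair.
    feature_vectors.append([jac.get_sim_score(toks_a, toks_b),
                            oc.get_sim_score(toks_a, toks_b)])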
Example 17
import py_stringmatching as sm

alnum_tok = sm.AlphanumericTokenizer()
qg3_tok = sm.QgramTokenizer(qval=3)

jac = sm.Jaccard()
lev = sm.Levenshtein()


def calcola_similarita(string1, string2):
    a = jac.get_sim_score(alnum_tok.tokenize(string1),
                          alnum_tok.tokenize(string2))
    b = lev.get_sim_score(string1, string2)
    c = jac.get_sim_score(qg3_tok.tokenize(string1), qg3_tok.tokenize(string2))
    return [{"alnum_jac": a}, {"alnum_lev": b}, {"qg3_jac": c}]


def add_features(elem):
    line, count = elem
    title1 = line[4]
    director1 = line[3]
    date1 = line[5]
    title2 = line[7]
    director2 = line[6]
    date2 = line[8]
    return (line+(calcola_similarita(title1,title2),calcola_similarita(director1,director2),\
    calcola_similarita(date1,date2)),count)


def precision(true):
    tp = true.map(lambda row:("true",row.response))\
Example 18
SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data
SOInsampleData = pickle.load(open(SOInsampleFile, 'rb'))
SOOutsampleData = pickle.load(open(SOOutsampleFile, 'rb'))

csAbstract = FVC.CosSim('CSAbs',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        False)
csSentence = FVC.CosSim('CSSent',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        True)
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(),
                              sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                               sm.WhitespaceTokenizer(return_set=True))
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
Example 19
    'cosine': sm.Cosine,
    'dice': sm.Dice,
    'generalized_jaccard': sm.GeneralizedJaccard,
    'jaccard': sm.Jaccard,
    'overlap_coefficient': sm.OverlapCoefficient,
    'tversky_index': sm.TverskyIndex,

    # Corpus
    'tfidf': sm.TfIdf,
    'soft_tfidf': sm.SoftTfIdf,
}

tokenizer_lookup = {

    # Character gram tokenizers
    '1gram': sm.QgramTokenizer(qval=1),
    '1grams': sm.QgramTokenizer(qval=1),
    '2grams': sm.QgramTokenizer(qval=2),
    '3grams': sm.QgramTokenizer(qval=3),
    '4grams': sm.QgramTokenizer(qval=4),
    '5grams': sm.QgramTokenizer(qval=5),
    '6grams': sm.QgramTokenizer(qval=6),
    '7grams': sm.QgramTokenizer(qval=7),
    '8grams': sm.QgramTokenizer(qval=8),
    '9grams': sm.QgramTokenizer(qval=9),
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),