Example #1
def get_matches(df, col, input, option):
    # Filter df[col] rows by prefix, by substring, or by fuzzy similarity, depending on `option`.

    if option == 'Starts with':
        cond = df[col].astype(str).str.lower().str.startswith(input.lower())
        dfn = df[cond].copy()
        dfn.sort_values(by=col, inplace=True, ignore_index=True)

    elif option == 'Contains':
        cond = df[col].astype(str).str.lower().str.contains(input.lower())
        dfn = df[cond].copy()
        dfn.sort_values(by=col, inplace=True, ignore_index=True)

    elif option == 'Most similar':
        dfn = match_strings(df[col].astype(str).drop_duplicates(),
                            pd.Series(input),
                            min_similarity=0.4)
        cols = dfn.columns
        dfn.sort_values(by='similarity',
                        ascending=False,
                        inplace=True,
                        ignore_index=True)
        dfn = pd.merge(dfn, df, left_on='left_side', right_on=col)
        dfn.drop(cols, axis=1, inplace=True)

    return dfn
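A minimal usage sketch (hypothetical DataFrame and column; the 'Most similar' branch additionally needs pandas and string_grouper's match_strings imported in the surrounding module):

import pandas as pd

df = pd.DataFrame({'company': ['Acme Corp', 'ACME Corporation', 'Globex', 'Initech']})
print(get_matches(df, 'company', 'acme', 'Starts with'))
print(get_matches(df, 'company', 'corp', 'Contains'))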
Example #2
# Requires: from typing import Optional, Union; pandas as pd; string_grouper's
# match_strings; and the project's clean_fuzzy_column helper (not shown here).
def calculate_fuzzymatches_for_min_similarity(
    left: Union[pd.DataFrame, pd.Series],
    right: Union[pd.DataFrame, pd.Series],
    on: Optional[str] = None,
    left_on: Optional[str] = None,
    right_on: Optional[str] = None,
    min_similarity: Optional[float] = None,
) -> pd.DataFrame:

    if isinstance(left, pd.Series) and isinstance(right, pd.Series):
        left_clean = left.drop_duplicates().pipe(clean_fuzzy_column)
        right_clean = right.drop_duplicates().pipe(clean_fuzzy_column)

    elif on is not None:
        left_clean = left[on].drop_duplicates().pipe(clean_fuzzy_column)
        right_clean = right[on].drop_duplicates().pipe(clean_fuzzy_column)

    elif (left_on is not None) and (right_on is not None):
        left_clean = left[left_on].drop_duplicates().pipe(clean_fuzzy_column)
        right_clean = right[right_on].drop_duplicates().pipe(clean_fuzzy_column)

    else:
        raise ValueError(
            "Pass two Series, or set 'on', or set both 'left_on' and 'right_on'.")

    match = match_strings(left_clean, right_clean, min_similarity=min_similarity)

    return match
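A rough usage sketch; clean_fuzzy_column is a project-specific normaliser, so a stand-in is defined here purely for illustration, and the Series contents are hypothetical:

import pandas as pd

def clean_fuzzy_column(s: pd.Series) -> pd.Series:
    # stand-in for the project's normaliser: lower-case and strip whitespace
    return s.astype(str).str.lower().str.strip()

left = pd.Series(['Acme Corp', 'ACME Corporation', 'Globex LLC'])
right = pd.Series(['acme corporation', 'globex'])
print(calculate_fuzzymatches_for_min_similarity(left, right, min_similarity=0.5))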
Example #3
def dupes_from_as(cfg):
    column_to_match = 'name_concat'
    dataset = pd.read_csv(cfg['input_csv'])
    matches = match_strings(dataset[column_to_match])
    # excludes matches with the same index, which are by definition the same.
    match_subset = matches[matches['left_index'] != matches['right_index']]
    match_subset.to_csv(cfg['output_csv'], index=False)
    joined_subset = regroup_data(dataset, match_subset)
    joined_subset.to_csv(cfg['output_csv_2'], index=False)
    dropped_dupes = filter_mirrors(joined_subset)
    dropped_dupes.to_csv(cfg['output_csv_3'], index=False)
    dropped_subfields = remove_subfields(dropped_dupes)
    dropped_subfields.to_csv(cfg['output_csv_4'], index=False)
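regroup_data, filter_mirrors and remove_subfields are project helpers that are not shown in this example. Purely as an illustration of the kind of post-processing involved, the mirrored (A, B) / (B, A) pairs in the match_strings output could be dropped using the left_index/right_index columns used above:

def drop_mirrored_matches(matches):
    # keep only one ordering of each index pair
    return matches[matches['left_index'] < matches['right_index']]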
Example #4
    def _get_lookup_matches(self, requested_words):
        # Deduplicate the requested words into a Series to match against the lookup table
        search = pd.Series(pd.Series(requested_words).unique())

        lookups = pd.Series(self.lookup["Word"].unique())

        # Create all matches:
        matches = match_strings(
            lookups,
            search,
            ngram_size=3,
            min_similarity=float(os.environ.get("LOOKUP_THRES_PRETAGGING")))

        return matches
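Note that float(os.environ.get(...)) raises a TypeError when LOOKUP_THRES_PRETAGGING is unset; a defensive variant (with a hypothetical default of 0.8) could read:

        threshold = float(os.environ.get("LOOKUP_THRES_PRETAGGING", "0.8"))
        matches = match_strings(lookups, search, ngram_size=3,
                                min_similarity=threshold)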
Example #5
def dupes_from_wikidata(cfg):
    wikidata_sublist = prep_wikidata_sublist(cfg)
    for key, value in wikidata_sublist.items():
        master_match = pd.DataFrame([key],
                                    columns=['agent_uri', 'name_concat'])
        master_match.set_index('agent_uri', inplace=True)
        dupe_match = pd.DataFrame(
            value,
            columns=[
                'index', 'num_matches', 'agent_uri', 'aay_url', 'name_concat',
                'sort_name', 'dates', 'resources', 'archival_objects',
                'accessions', 'authority_id', 'source', 'create_time',
                'wikidata_uri', 'wikidata_name', 'wikidata_begin',
                'wikidata_end'
            ])
        dupe_match.set_index('index', inplace=True)
        matches = match_strings(master_match['name_concat'],
                                dupe_match['wikidata_name'])
        matches.to_csv(cfg.get('output_csv'),
                       mode='a',
                       header=False,
                       index=False)
Example #6
words_detected = []
entities_detected = []

for m_id, start, end in shape_matches:
    entity = doc[start:end]
    words_detected.append(clean_text(entity.text))
    entities_detected.append((entity.text, entity.start_char, entity.end_char,
                              nlp.vocab.strings[m_id], ""))
    print(entity.text)

req_words = []

for t in doc:
    # keep tokens longer than 2 characters that are not stop words, not the
    # literal "break", and not already detected by the shape matcher
    if (len(t.text) > 2 and t.text != "break" and not t.is_stop
            and clean_text(t.text) not in words_detected):
        print(t.text)
        req_words.append(t.text.lower())

# Deduplicate the requested words into a Series
duplicates = pd.Series(pd.Series(req_words).unique())

string_words = pd.Series(result["Word"].unique())

# Create all matches:
matches = match_strings(string_words,
                        duplicates,
                        ngram_size=3,
                        min_similarity=0.4)

print(matches)
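As a follow-up sketch (not part of the original snippet): since left_side comes from string_words and right_side from the requested words, the best lexicon match per requested word can be pulled out of the matches DataFrame like this:

best_per_word = (matches.sort_values('similarity', ascending=False)
                 .drop_duplicates(subset='right_side'))
print(best_per_word[['right_side', 'left_side', 'similarity']])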
Example #7
def isarticle(x):
    # A citation is treated as an article if its second "|"-delimited field
    # parses as an integer; otherwise it is grouped with the books.
    sp = x.split("|")
    if len(sp) < 2:
        return False

    try:
        int(sp[1])
        return True
    except ValueError:
        return False

strings = [x for x in strings if '[no title captured]' not in x]
articles = [x for x in strings if isarticle(x)]
books = [x for x in strings if not isarticle(x)]

print("%s articles, %s books to group" % (len(articles), len(books)))

# grouping books

# this cell may take quite a while to run.
# on Intel i7-9700F this runs in about a minute on 185k names.

books_grouped = string_grouper.match_strings(
    pd.Series(books), 
    number_of_processes=8, 
    min_similarity=0.7
)

from collections import defaultdict

import editdistance  # needed below for the author-name edit-distance filter

books_grouped[(books_grouped.similarity<1-1e-8)].sort_values("similarity")

# for books, we require that the authors are no more than 1 edit from each other
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

ft = defaultdict(set)

for i, r in books_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side

    if ls == rs:
        continue

    # require that the author fields (before the first "|") are at most 1 edit apart
    la = ls.split("|")[0]
    ra = rs.split("|")[0]

    if editdistance.eval(la, ra) > 1:
        continue

    ft[ls].add(rs)
    ft[rs].add(ls)
Example #8
with open(
        '/Users/haoranliu/match/Trademark/Clean_name/Clean/cleaned/ciq_id.json'
) as f:
    ciq_id = json.load(f)

tma_assignor = pd.DataFrame()
tma_assignor['name'] = tma_assignor_name
tma_assignor['id'] = tma_assignor_id

ciq = pd.DataFrame()
ciq['name'] = ciq_name
ciq['id'] = ciq_id

# num = 0.9
# matches = match_strings(master = tma_assignor['name'], master_id = tma_assignor['id'], duplicates = ciq['name'], duplicates_id = ciq['id'], min_similarity = num)
# matches.to_stata(f'assignor_ciq{num}.dta', version = 117)
# num = 0.8
# matches = match_strings(master = tma_assignor['name'], master_id = tma_assignor['id'], duplicates = ciq['name'], duplicates_id = ciq['id'], min_similarity = num)
# matches.to_stata(f'assignor_ciq{num}.dta', version = 117)
num = 0.7
matches = match_strings(master=tma_assignor['name'],
                        master_id=tma_assignor['id'],
                        duplicates=ciq['name'],
                        duplicates_id=ciq['id'],
                        min_similarity=num)
matches.to_stata(f'assignor_ciq{num}.dta', version=117)
# num = 0.6
# matches = match_strings(master = tma_assignor['name'], master_id = tma_assignor['id'], duplicates = ciq['name'], duplicates_id = ciq['id'], min_similarity = num)
# matches.to_stata(f'assignor_ciq{num}.dta', version = 117)

#string_grouper.match_strings()
#(master, master_id, duplicates, duplicates_id, min_similarity)
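The commented-out blocks above repeat the same call for other thresholds; a compact alternative (same calls, hypothetical choice of threshold list) would be a loop:

for num in (0.9, 0.8, 0.7, 0.6):
    matches = match_strings(master=tma_assignor['name'],
                            master_id=tma_assignor['id'],
                            duplicates=ciq['name'],
                            duplicates_id=ciq['id'],
                            min_similarity=num)
    matches.to_stata(f'assignor_ciq{num}.dta', version=117)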
Example #9
    def run(self):

        from knowknow import pd, Counter, VariableNotFound
        from collections import defaultdict

        import string_grouper
        import editdistance

        # tracks the next group-id to assign
        new_gid = 0

        print(len(self.strings), 'strings total...')

        def isarticle(x):
            sp = x.split("|")
            if len(sp) < 2:
                return False

            try:
                int(sp[1])
                return True
            except ValueError:
                return False

        strings = [x for x in self.strings if '[no title captured]' not in x]
        articles = [x for x in strings if isarticle(x)]
        books = [x for x in strings if not isarticle(x)]

        print('sample articles:', articles[:10])
        print('sample books:', books[:10])
        print("%s articles, %s books to group" % (len(articles), len(books)))

        # grouping books

        # this cell may take quite a while to run.
        # on Intel i7-9700F this runs in about a minute on 185k names.

        self.books_grouped = string_grouper.match_strings(
            pd.Series(books), number_of_processes=8, min_similarity=0.7)

        # for books, we require that the authors are no more than 1 edit from each other
        # even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

        self.ft = defaultdict(set)

        for i, r in self.books_grouped.iterrows():
            ls = r.left_side
            rs = r.right_side

            if ls == rs:
                continue

            la = ls.split("|")[0]
            ra = rs.split("|")[0]

            if editdistance.eval(la, ra) > 1:
                continue

            self.ft[ls].add(rs)
            self.ft[rs].add(ls)

        print("%s books have some connection to others in a group" %
              len(self.ft))

        # assigns group-ids based on the relational structure derived thus far
        # the code propagates ids through the network, assuming transitivity of equality

        for i, k in enumerate(books):
            if k in self.groups:
                continue

            self.traverse(k, new_gid)
            new_gid += 1

        print(len(set(self.groups.values())), 'groups total')
        print(
            Counter(gid for x, gid in self.groups.items()
                    if len(x.split("|")) == 2).most_common(10))

        # grouping articles

        # this cell may take quite a while to run.
        # on Intel i7-9700F this runs in five minutes on 234k entries.

        self.articles_grouped = string_grouper.match_strings(
            pd.Series(articles),
            # decrease this to 1 or 2 for slower computers or laptops (the fan might start screaming)
            number_of_processes=8,
            # the similarity cutoff is tighter for articles than for books
            min_similarity=0.8)

        self.articles_grouped[(self.articles_grouped.similarity <
                               1 - 1e-8)].sort_values("similarity")

        # for articles, we require that the entire citation strings are no more than 2 edits apart.
        # even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

        # this cell produces the `ft` variable, which maps from each term to the set of terms equivalent. I.e., `ft[A] = {B1,B2,B3}`

        self.ft = defaultdict(set)

        for i, r in self.articles_grouped.iterrows():
            ls = r.left_side
            rs = r.right_side

            if ls == rs:
                continue

            la = ls.split("|")[0]
            ra = rs.split("|")[0]

            if editdistance.eval(ls, rs) > 2:
                continue

            self.ft[ls].add(rs)
            self.ft[rs].add(ls)
            #print(ls,"|||",rs)

        print("%s articles have some connection to others in a group" %
              len(self.ft))

        # assigns group-ids based on the relational structure derived thus far
        # the code propagates ids through the network, assuming transitivity of equality

        for i, k in enumerate(articles):
            if k in self.groups:
                continue

            self.traverse(k, new_gid)
            new_gid += 1

        # this line will break execution if there aren't as many groups assigned as we have articles and books
        assert (len(articles) + len(books) == len(self.groups))

        print("%s books and %s articles total" % (len(books), len(articles)))

        # saving the variable for later
        self.dataset.save_variable("groups", self.groups)
        self.dataset.save_variable("group_reps", self.get_reps())
Example #10
def string_matcher(col1, col2, sim_thresh=0.95):
    from string_grouper import match_strings, match_most_similar, group_similar_strings, StringGrouper
    matches = match_strings(col1, col2, min_similarity=sim_thresh)
    return matches
punti_search = func.test_search_functions(search_functions_to_test, possibilities, src.list_of_searches, src.categories, saveresults=True, savelog=True)
# %% codecell
# Plot the result graphs
folder_log = os.path.join(folder,"log_ricerca")
func.plot_figure(folder_log, src.categories)
# %%
# Test for StringGrouper
import pandas as pd
import numpy as np
from string_grouper import match_strings, match_most_similar, group_similar_strings, StringGrouper
company_names = os.path.join(os.getcwd(),'sec_edgar_company_info.csv')
# We only look at the first 50k as an example
companies = pd.read_csv(company_names)[0:50000]
c =  companies['Company Name']
# Create all matches:
matches = match_strings(companies['Company Name'])
# Look at only the non-exact matches:
matches[matches.left_side != matches.right_side].head()

# Create a small set of artificial company names
duplicates = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar', 'B4UTRADE COM CORP'])
# Create all matches:
matches = match_strings(companies['Company Name'], duplicates)
matches
# Create a small set of artificial company names
new_companies = pd.Series(['S MEDIA GROUP', '012 SMILE.COMMUNICATIONS', 'foo bar', 'B4UTRADE COM CORP'])
# Create all matches:
matches = match_most_similar(companies['Company Name'], new_companies)
# Display the results:
pd.DataFrame({'new_companies': new_companies, 'duplicates': matches})
#%%