Ejemplo n.º 1
0
def main(data, output, index, col_headers, prep, threshold):
    """Console script for disamby."""
    # NOTE(review): the docstring above is left as-is because command-line
    # frameworks commonly surface it as help text; the decorators (if any)
    # are not visible from here -- confirm before expanding it.
    #
    # Flow: read the CSV at `data` (indexed by `index`), translate the
    # one-character codes in `prep` into preprocessing steps, disambiguate the
    # comma-separated `col_headers` columns at `threshold`, and write the
    # resulting groups to `output` as JSON.
    frame = pd.read_csv(data, index_col=index)

    def ngram_of(size):
        # Build a step that hands its input to pre.ngram with a fixed size.
        return lambda text: pre.ngram(text, size)

    def ngram_and_words(text):
        # 4-grams of the first 33 characters, followed by the word split.
        return pre.ngram(text[:33], 4) + pre.split_words(text)

    step_for_code = {
        'A': pre.compact_abbreviations,
        'P': pre.remove_punctuation,
        'W': pre.normalize_whitespace,
        '3': ngram_of(3),
        '4': ngram_of(4),
        '5': ngram_of(5),
        'S': pre.split_words,
        'X': ngram_and_words,
    }
    selected = col_headers.split(',')
    # An unknown code in `prep` raises KeyError here.
    steps = [step_for_code[code] for code in prep]
    disambiguator = Disamby(data=frame[selected], preprocessors=steps)
    groups = disambiguator.disambiguated_sets(
        threshold, smoother='offset', offset=100)

    # Each group is keyed by the first selected column's value for the first
    # element of list(group). Groups that share a label overwrite each other,
    # so only the last one survives in the output.
    label_column = selected[0]
    members_by_label = {}
    for group in groups:
        ids = list(group)
        label = frame.loc[ids[0], label_column]
        members_by_label[label] = ids

    with open(output, 'w') as sink:
        json.dump(members_by_label, sink)
Ejemplo n.º 2
0
def test_find(company_df):
    """Search for the smallest address key and check hit count and top score.

    With a threshold of 0 and weights of 0.2 (name) / 0.8 (address), the
    search over 100 generated companies is expected to return 31 results,
    the best of which scores approximately 1.
    """
    frame = company_df(100)
    dis = Disamby(frame, pipeline)
    address_keys = sorted(dis.records['address'].keys())
    field_weights = {'name': .2, 'address': .8}
    matches = dis.find(address_keys[0], threshold=0, weights=field_weights)

    assert len(matches) == 31
    best_score = max(match.score for match in matches)
    assert best_score == pytest.approx(1)
Ejemplo n.º 3
0
def test_disambiguated_tests(company_df):
    """Check that the largest disambiguated set has exactly two members.

    Uses 200 generated companies, a threshold of 0.7, and weights of
    0.99 (name) / 0.01 (address), with verbose output enabled.
    """
    frame = company_df(200)
    disambiguator = Disamby(frame, preprocessors=pipeline)
    field_weights = {'name': .99, 'address': .01}
    groups = disambiguator.disambiguated_sets(
        verbose=True, threshold=.7, weights=field_weights)
    largest = max(len(group) for group in groups)
    assert largest == 2
Ejemplo n.º 4
0
    def analisar(self):
        """Run the disambiguation workflow on ``self.df_coluna``.

        Calls ``self.load_dataset()``, disambiguates ``self.df_coluna`` with a
        threshold of 0.5, stores the resulting sets in ``self.lista_de_sets``,
        then calls ``self.criar_lista_de_posicoes()`` and
        ``self.generate_csv()`` in that order.
        """
        self.load_dataset()

        def trigrams_and_words(text):
            # Final pipeline step: trigrams followed by the word split.
            return pre.trigram(text) + pre.split_words(text)

        steps = [
            pre.normalize_whitespace,
            pre.remove_punctuation,
            trigrams_and_words,
        ]
        self.lista_de_sets = Disamby(self.df_coluna, steps).disambiguated_sets(
            threshold=0.5)

        self.criar_lista_de_posicoes()
        self.generate_csv()
Ejemplo n.º 5
0
def test_sparse_find(size, company_df, benchmark):
    """Benchmark ``find`` on the first name key and check the top score.

    ``dis.find`` is run through the ``benchmark`` callable with 0.7 as its
    second positional argument; the best returned score must be
    approximately 1.
    """
    frame = company_df(size)
    dis = Disamby(frame, pipeline)
    name_keys = list(dis.records['name'].keys())
    matches = benchmark(dis.find, name_keys[0], .7)
    best_score = max(match.score for match in matches)
    assert best_score == pytest.approx(1)
Ejemplo n.º 6
0
def test_two_identical_columns(company_df):
    """Building ``Disamby`` from the same column twice should raise KeyError."""
    df = company_df(20)
    with pytest.raises(KeyError):
        # The instance is never used: only the raised exception matters, so
        # the result is deliberately not bound to a name.
        Disamby(df[['name', 'name']], pipeline)
Ejemplo n.º 7
0
def test_instant_instantiation(company_df):
    """Smoke test: construct ``Disamby`` directly and query address counts.

    There are no assertions; the test passes when construction from 500
    generated companies and the ``most_common()`` call complete without
    raising.
    """
    frame = company_df(500)
    disambiguator = Disamby(frame, pipeline)
    address_counts = disambiguator.field_freq['address']
    address_counts.most_common()
Ejemplo n.º 8
0
def disamby_fitted_instance(fake_names):
    """Return a ``Disamby`` instance fitted on 90 generated names.

    The names are wrapped in a pandas Series and passed to ``fit`` together
    with the module-level ``pipeline`` and the string ``'streets'``.
    """
    series = pd.Series(fake_names(90))
    fitted = Disamby()
    fitted.fit(series, pipeline, 'streets')
    return fitted
Ejemplo n.º 9
0
# Keep only the names that split into more than one whitespace-separated part.
all_two_part_names = [name for name in all_names if len(name.split()) > 1]
# %%
# Wrap the kept names in a DataFrame; its index labels identify the records
# in the sets looked up with df.loc below.
df = pd.DataFrame(all_two_part_names)
# %%
# Define the pipeline to process the strings; note that the last step must
# return a tuple of strings.
pipeline = [
    pre.normalize_whitespace,
    pre.remove_punctuation,
    lambda x: pre.trigram(x) + pre.split_words(
        x)  # any Python function is allowed
]

# Instantiate the Disamby object; it applies the given pre-processing pipeline
# and computes the frequency of the resulting tokens.
dis = Disamby(df, pipeline)
# %%
# Group the records into disambiguated sets using a threshold of 0.80.
sets = dis.disambiguated_sets(threshold=0.80)
# %%
# Only sets with more than one member contain candidates to merge.
mergeable_sets = [s for s in sets if len(s) > 1]
# %%
# Print the names belonging to each mergeable set.
# NOTE(review): the loop variable reuses the name `sets`, so once the loop has
# run, `sets` refers to the last mergeable set rather than the full result from
# disambiguated_sets -- rename it if `sets` is needed afterwards.
for sets in mergeable_sets:
    print([df.loc[s].item() for s in sets])
"""
['gen paul frieichs', 'paul frieichs']
['debra goff', 'debra a goff']
['chris cuomo', 'anew cuomo']
['david spiegel', 'paul spiegel']
['andy pekosz', 'anew pekosz']
['sharon goldfarb', 'anna goldfarb']
['george findlay', 'james findlay']