import json

import pandas as pd

import disamby.preprocessors as pre
from disamby import Disamby


def main(data, output, index, col_headers, prep, threshold):
    """Console script for disamby."""
    names_df = pd.read_csv(data, index_col=index)
    # Map each single-character action code in `prep` to a preprocessing step.
    prep_dict = {
        'A': pre.compact_abbreviations,
        'P': pre.remove_punctuation,
        'W': pre.normalize_whitespace,
        '3': lambda x: pre.ngram(x, 3),
        '4': lambda x: pre.ngram(x, 4),
        '5': lambda x: pre.ngram(x, 5),
        'S': pre.split_words,
        'X': lambda x: pre.ngram(x[:33], 4) + pre.split_words(x)
    }
    columns = col_headers.split(',')
    pipeline = [prep_dict[action] for action in prep]
    dis = Disamby(data=names_df[columns], preprocessors=pipeline)
    components = dis.disambiguated_sets(threshold, smoother='offset', offset=100)
    # Use the first member of each component as its representative name.
    comp_to_id = {}
    for comp in components:
        members = list(comp)
        representative = members[0]
        name = names_df.loc[representative, columns[0]]
        comp_to_id[name] = members
    with open(output, 'w') as f:
        json.dump(comp_to_id, f)
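# A minimal sketch (not part of the original module) of what a decoded
# pipeline does to one raw name. 'WP3' is a hypothetical prep spec selecting
# normalize_whitespace, remove_punctuation, then 3-grams; as the example
# script below notes, the last step must return a tuple of string tokens.
def demo_prep(text):
    text = pre.normalize_whitespace(text)  # 'W': collapse runs of whitespace
    text = pre.remove_punctuation(text)    # 'P': strip punctuation
    return pre.ngram(text, 3)              # '3': final step yields 3-gram tokens

# e.g. demo_prep('ACME,  Inc.') produces the 3-grams of the cleaned string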
import pandas as pd
import pytest

from disamby import Disamby

# `company_df` and `fake_names` are fixtures producing fake company data;
# `pipeline` is the module-level list of preprocessors shared by these tests.


def test_find(company_df):
    df = company_df(100)
    dis = Disamby(df, pipeline)
    # Search for the lexicographically first indexed address term.
    term = sorted(dis.records['address'].keys())[0]
    results = dis.find(term, threshold=0,
                       weights={'name': .2, 'address': .8})
    assert len(results) == 31
    # The searched record must match itself with a score of ~1.
    score_of_searched = max(x.score for x in results)
    assert score_of_searched == pytest.approx(1)
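# Sketch of using find() outside the test; it assumes only what the test
# above already relies on: find() returns result objects exposing a .score
# attribute, with a record matching itself at ~1.
def top_matches(dis, term, n=5):
    results = dis.find(term, threshold=0, weights={'name': .2, 'address': .8})
    return sorted(results, key=lambda r: r.score, reverse=True)[:n]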
def test_disambiguated_sets(company_df):
    df = company_df(200)
    dis = Disamby(df, preprocessors=pipeline)
    # Weight the name field almost exclusively so that only near-identical
    # names land in the same component.
    components = dis.disambiguated_sets(
        verbose=True,
        threshold=.7,
        weights={'name': .99, 'address': .01}
    )
    assert max(len(c) for c in components) == 2
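# Sketch: materializing components into row groups for manual inspection.
# Assumes disambiguated_sets() yields sets of dataframe index labels, which
# is what the CLI in main() above also relies on.
def component_frames(df, components):
    return [df.loc[list(comp)] for comp in components if len(comp) > 1]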
def analisar(self):
    """Disambiguate the loaded name column and write the result to CSV."""
    self.load_dataset()
    pipeline = [
        pre.normalize_whitespace,
        pre.remove_punctuation,
        # The last step must return a tuple of strings.
        lambda x: pre.trigram(x) + pre.split_words(x)
    ]
    dis = Disamby(self.df_coluna, pipeline)
    self.lista_de_sets = dis.disambiguated_sets(threshold=0.5)
    self.criar_lista_de_posicoes()
    self.generate_csv()
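# Hypothetical helper (not from the original class): sweep the threshold to
# see how aggressively records get merged before settling on the 0.5 used in
# analisar(). Higher thresholds merge fewer, more similar records.
def sweep_thresholds(dis, thresholds=(0.3, 0.5, 0.7, 0.9)):
    for t in thresholds:
        merged = [c for c in dis.disambiguated_sets(threshold=t) if len(c) > 1]
        print(f'threshold={t}: {len(merged)} merged sets')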
def test_sparse_find(size, company_df, benchmark):
    df = company_df(size)
    dis = Disamby(df, pipeline)
    # Benchmark a lookup of the first indexed name term.
    idx = list(dis.records['name'].keys())[0]
    results = benchmark(dis.find, idx, .7)
    score_of_searched = max(x.score for x in results)
    assert score_of_searched == pytest.approx(1)
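# The benchmark test above expects `size` to come from pytest parametrization
# and `benchmark` from the pytest-benchmark plugin; a typical wiring (the
# sizes are illustrative, not from the original) would be:
#
#     @pytest.mark.parametrize('size', [100, 1000])
#     def test_sparse_find(size, company_df, benchmark): ...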
def test_two_identical_columns(company_df):
    df = company_df(20)
    # Duplicate column names are rejected at construction time.
    with pytest.raises(KeyError):
        Disamby(df[['name', 'name']], pipeline)
def test_instant_instantiation(company_df):
    df = company_df(500)
    dis = Disamby(df, pipeline)
    # Passing the dataframe to the constructor fits immediately, so the
    # per-field frequency counters are already populated.
    dis.field_freq['address'].most_common()
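# Sketch: field_freq maps each field to a Counter-like object of token
# frequencies (the test above relies on its .most_common()), so the rarest
# and therefore most distinctive tokens sit at the tail of that list.
def rarest_tokens(dis, field, n=10):
    return dis.field_freq[field].most_common()[-n:]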
@pytest.fixture
def disamby_fitted_instance(fake_names):
    names = fake_names(90)
    data_series = pd.Series(names)
    dis = Disamby()
    # Fit a single field named 'streets' from the series of fake names.
    dis.fit(data_series, pipeline, 'streets')
    return dis
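# Sketch of a test consuming the fixture above; 'streets' is the field name
# passed to fit() in the fixture, so it should appear among the fitted fields.
def test_fitted_instance_has_streets_field(disamby_fitted_instance):
    assert 'streets' in disamby_fitted_instance.field_freq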
import pandas as pd

import disamby.preprocessors as pre
from disamby import Disamby

# `all_names` is the list of raw person names assembled earlier in the script.
all_two_part_names = [name for name in all_names if len(name.split()) > 1]

# %%
df = pd.DataFrame(all_two_part_names)

# %%
# Define the pipeline to process the strings; note that the last step must
# return a tuple of strings.
pipeline = [
    pre.normalize_whitespace,
    pre.remove_punctuation,
    lambda x: pre.trigram(x) + pre.split_words(x)  # any python function is allowed
]

# Instantiate the disamby object; it applies the given pre-processing pipeline
# and computes the resulting token frequencies.
dis = Disamby(df, pipeline)

# %%
sets = dis.disambiguated_sets(threshold=0.80)

# %%
mergeable_sets = [s for s in sets if len(s) > 1]

# %%
for merged in mergeable_sets:  # renamed loop variable to avoid shadowing `sets`
    print([df.loc[idx].item() for idx in merged])

"""
['gen paul frieichs', 'paul frieichs']
['debra goff', 'debra a goff']
['chris cuomo', 'anew cuomo']
['david spiegel', 'paul spiegel']
['andy pekosz', 'anew pekosz']
['sharon goldfarb', 'anna goldfarb']
['george findlay', 'james findlay']
"""