def test_getsbs(esconnectornew):
    """Check getsbs returns a side-by-side frame aligned on the connector index.

    Every column of the source frame must reappear suffixed with '_source'.
    """
    df_source = getsource(nrows=50)
    Xst = esconnectornew.fit_transform(X=df_source)
    ix = Xst.index
    X_sbs = esconnectornew.getsbs(X=df_source, on_ix=ix)
    # The side-by-side frame must be aligned on the requested pair index
    assert X_sbs.index.equals(ix)
    for c in df_source.columns:
        assert c + '_source' in X_sbs.columns
    # Removed `return True`: a pytest test must return None; a non-None
    # return value triggers PytestReturnNotNoneWarning.
def test_search_record(esconnectornew):
    """Run search_record on one random row and sanity-check the unpacked hits."""
    source = getsource(nrows=100)
    row = source.sample().iloc[0]
    raw = esconnectornew.search_record(record=row)
    hits = unpack_allhits(raw)
    # The connector never yields more hits than its configured size
    assert len(hits) <= esconnectornew.size
    assert isinstance(hits, list)
    assert isinstance(hits[0], dict)
    print(raw)
    print(hits)
def test_scorecols_datacols(esconnectornew):
    """Verify the unpacked hits expose the es_rank and es_score columns."""
    df_source = getsource(nrows=10)
    for c in df_source.sample(1).index:
        record = df_source.loc[c]
        res = esconnectornew.search_record(record=record)
        score = unpack_allhits(res)
        df = pd.DataFrame.from_dict(score, orient='columns').rename(
            columns={'ix': 'ix_target'})
        scorecols = pd.Index(['es_rank', 'es_score'])
        # Index.contains() was deprecated in pandas 0.25 and removed in 1.0;
        # membership is tested with the `in` operator instead.
        assert scorecols[0] in df.columns
        assert scorecols[1] in df.columns
def test_pipe_df():
    """End-to-end scoring pipeline on the DataFrame connector (100 x 100 rows).

    Builds a similarity matrix with DfConnector, enriches it with
    side-by-side scores, then cross-validates a classifier against y_true.
    """
    # The `pd.datetime` alias was deprecated in pandas 1.0 and removed in 2.0;
    # use the standard-library datetime instead.
    from datetime import datetime

    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=100)
    assert df_source.columns.equals(df_target.columns)
    print(datetime.now(), ' | ', 'number of rows on left:{}'.format(df_source.shape[0]))
    print(datetime.now(), ' | ', 'number of rows on right:{}'.format(df_target.shape[0]))
    scorer = FeatureUnion(transformer_list=[
        ('name_char', VectorizerConnector(on='name', analyzer='char')),
        ('street_char', VectorizerConnector(on='street', analyzer='char')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ])
    dfcon = DfConnector(scorer=scorer)
    # Xsm is the similarity matrix indexed by (ix_source, ix_target) pairs
    Xsm = dfcon.fit_transform(X=[df_source, df_target])
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    # Side-by-side comparison frame, scored further by scorer_sbs
    Xsbs = dfcon.getsbs(X=[df_source, df_target], on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further, index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    scores_further = pd.concat([Xsm, scores_further], axis=1, ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe, X=X, y=y_true, scoring=scoring, cv=5)
    for c in scoring:
        print(datetime.now(), ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
def test_pipe_es():
    """End-to-end scoring pipeline on the Elasticsearch connector.

    Queries the 'right' ES index for each source row, enriches the resulting
    similarity matrix with side-by-side scores, then cross-validates a
    classifier against y_true.
    """
    # The `pd.datetime` alias was deprecated in pandas 1.0 and removed in 2.0;
    # use the standard-library datetime instead.
    from datetime import datetime

    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=None)
    assert df_source.columns.equals(df_target.columns)
    print(datetime.now(), ' | ', 'number of rows on left:{}'.format(df_source.shape[0]))
    print(datetime.now(), ' | ', 'number of rows on right:{}'.format(df_target.shape[0]))
    esclient = elasticsearch.Elasticsearch()
    scoreplan = {
        'name': {'type': 'FreeText'},
        'street': {'type': 'FreeText'},
        'city': {'type': 'FreeText'},
        'duns': {'type': 'Exact'},
        'postalcode': {'type': 'FreeText'},
        'countrycode': {'type': 'Exact'}
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=10
    )
    # Xsm is the similarity matrix
    Xsm = escon.fit_transform(X=df_source)
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    Xsbs = escon.getsbs(X=df_source, on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further, index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    # Only the raw ES score is kept from the similarity matrix
    scores_further = pd.concat([Xsm[['es_score']], scores_further], axis=1,
                               ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe, X=X, y=y_true, scoring=scoring, cv=5)
    for c in scoring:
        print(datetime.now(), ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
def test_getresults(esconnectornew):
    """Build an ES query for one random record and run it against the client."""
    source = getsource(nrows=10)
    rec = source.sample().iloc[0]
    query = esconnectornew._write_es_query(rec)
    result = esconnectornew.client.search(body=query, index="right")
    print(result)
def test_create_query(esconnectornew):
    """Smoke test: _write_es_query builds a query from a random record."""
    source = getsource(nrows=10)
    rec = source.sample().iloc[0]
    query = esconnectornew._write_es_query(rec)
    print(query)
def test_transform(esconnectornew):
    """fit_transform on the source frame yields a two-column DataFrame."""
    source = getsource(nrows=100)
    result = esconnectornew.fit_transform(X=source)
    assert isinstance(result, pd.DataFrame)
    assert result.shape[1] == 2
from tutorial.main.stepbystep.stepbysteputils.pgconnector import create_engine_ready
from suricate.data.companies import getsource, gettarget
import pandas as pd
import numpy as np

# Database engine used to read the saved y_true labels
engine = create_engine_ready()

# filefolder = '~/'
# leftpath = 'source.csv'
# rightpath = 'target.csv'
# df_source = pd.read_csv(filefolder + leftpath, index_col=0, sep='|', encoding='utf-8')
# df_target = pd.read_csv(filefolder + rightpath, index_col=0, sep='|', encoding='utf-8')

# Raw company data: 500 source rows, all target rows
df_source_raw = getsource(nrows=500)
df_target_raw = gettarget(nrows=None)

from sklearn.model_selection import train_test_split


def rebuild_ytrue(ix):
    """Rebuild the full y_true label series for a given pair index.

    Positive pairs (y_true = 1) are read from the `y_true` table; every
    other pair in `ix` is labelled 0.

    Args:
        ix: pair index (presumably a MultiIndex of (ix_source, ix_target)
            tuples — TODO confirm against callers)

    Returns:
        pd.Series: named 'y_true', indexed by `ix`, 1.0 for saved positive
        pairs and 0.0 elsewhere
    """
    y_true_saved = pd.read_sql(
        sql="SELECT * FROM y_true WHERE y_true.y_true = 1",
        con=engine).set_index(['ix_source', 'ix_target'], drop=True)['y_true']
    # Start from all zeros, then overwrite the pairs known to be positive
    y = pd.Series(index=ix, data=np.zeros(shape=len(ix)), name='y_true')
    ix_common = y_true_saved.index.intersection(ix)
    y.loc[ix_common] = y_true_saved.loc[ix_common]
    return y


def prepare_source(df):
    """
from sklearn.pipeline import make_union
from suricate.dftransformers.vectorizer import VectorizerConnector
from suricate.data.base import ix_names
from suricate.data.companies import getsource, gettarget, getXst, getytrue

# Shared fixtures: 100 rows per side and the matching true labels
left = getsource(nrows=100)
right = gettarget(nrows=100)
X_lr = getXst(nrows=100)
y_true = getytrue(Xst=X_lr)


def test_loaddata():
    """Smoke test: the data fixtures load and expose their index name."""
    print(ix_names['ixname'])
    print(left.shape[0])
    print(right.shape[0])
    assert True


def test_tfidf():
    """Cartesian scoring: the score matrix has one row per (left, right) pair."""
    expected_shape = left.shape[0] * right.shape[0]
    stages = [
        VectorizerConnector(on='name', analyzer='char', pruning=False),
        VectorizerConnector(on='street', analyzer='char', pruning=False),
    ]
    scorer = make_union(*stages)
    scorer.fit(X=X_lr)
    X_score = scorer.transform(X=X_lr)
    assert X_score.shape[0] == expected_shape
    # Removed a redundant trailing `pass` statement after the assert.
import pandas as pd
from tutorial.main.stepbystep.stepbysteputils.pgconnector import create_engine_ready

# Database engine for the step-by-step tutorial
engine = create_engine_ready()

# filefolder = '~/'
# leftpath = 'source.csv'
# rightpath = 'target.csv'
# df_source = pd.read_csv(filefolder + leftpath, index_col=0, sep='|', encoding='utf-8')
# df_target = pd.read_csv(filefolder + rightpath, index_col=0, sep='|', encoding='utf-8')

from suricate.data.companies import getsource, gettarget

# Load the complete source and target company frames
df_source_raw = getsource(nrows=None)
df_target_raw = gettarget(nrows=None)


def prepare_source(df):
    """
    Args:
        df:

    Returns:
        pd.DataFrame
    """
    # Currently a pass-through: no transformation is applied
    df2 = df
    return df2


def prepare_target(df):
    """
    Args:
def test_load_source():
    """getsource with default arguments yields a pandas DataFrame."""
    frame = getsource()
    print(frame.sample(10))
    assert isinstance(frame, pd.DataFrame)