def fixture_scores():
    """Build the scoring pipeline fixture.

    The pipeline has three steps, in this order:

    * ``'scores'``: a ``FeatureUnion`` of vectorizer connectors (on name,
      street and city) and exact connectors (on postalcode, duns and
      countrycode);
    * ``'imputer'``: a ``SimpleImputer`` filling with the constant 0;
    * ``'Scaler'``: a ``Normalizer``.

    Returns:
        The ``Pipeline`` instance; nothing is fitted here.
    """
    connectors = [
        ('name_vecword',
         VectorizerConnector(on='name', analyzer='word', ngram_range=(1, 2))),
        ('name_vecchar',
         VectorizerConnector(on='name', analyzer='char', ngram_range=(1, 3))),
        ('street_vecword',
         VectorizerConnector(on='street', analyzer='word', ngram_range=(1, 2))),
        ('street_vecchar',
         VectorizerConnector(on='street', analyzer='char', ngram_range=(1, 3))),
        ('city_vecchar',
         VectorizerConnector(on='city', analyzer='char', ngram_range=(1, 3))),
        ('postalcode_exact', ExactConnector(on='postalcode')),
        ('duns_exact', ExactConnector(on='duns')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ]
    pipeline_steps = [
        ('scores', FeatureUnion(connectors)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
    ]
    return Pipeline(steps=pipeline_steps)
def test_lrmodel():
    """Fit a ``PipeDfClf`` on 100 rows and print its score.

    The transformer is a union of char vectorizer connectors (name, street)
    and exact connectors (countrycode, postalcode, duns), followed by a
    constant-0 imputer. The classifier is a default ``Classifier()``.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    scorer = make_union(
        VectorizerConnector(on='name', analyzer='char'),
        VectorizerConnector(on='street', analyzer='char'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='postalcode'),
        ExactConnector(on='duns'),
    )
    imp = SimpleImputer(strategy='constant', fill_value=0)
    transformer = make_pipeline(scorer, imp)

    mypipe = PipeDfClf(transformer=transformer, classifier=Classifier())

    # Exercise the transformer on its own first; the scores are not inspected.
    mypipe.transformer.fit_transform(X=X_lr)

    mypipe.fit(X=X_lr, y=y_true)
    print(mypipe.score(X=X_lr, y=y_true))
def test_tfidf():
    """Check the row count of a union of connectors built with ``pruning=False``.

    The number of rows returned by ``transform`` must equal
    ``left.shape[0] * right.shape[0]``.

    NOTE(review): ``left``, ``right`` and ``X_lr`` are read from the enclosing
    module; their definitions are not visible from this function.
    """
    n_pairs = left.shape[0] * right.shape[0]
    union = make_union(
        VectorizerConnector(on='name', analyzer='char', pruning=False),
        VectorizerConnector(on='street', analyzer='char', pruning=False),
        ExactConnector(on='duns', pruning=False),
    )
    scores = union.transform(X=X_lr)
    assert scores.shape[0] == n_pairs
def test_pipeModel():
    """Fit a two-stage ``PruningDfSbsClf`` on 100 rows and print its score.

    Stage one (``PipeDfClf``) scores pairs with word vectorizer / exact
    connectors and classifies them with a hand-written rule. Stage two
    (``PipeSbsClf``) scores with ``SbsApplyComparator`` features and a default
    ``Classifier()``. Both stages are fitted against the same ``y_true``.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    # --- first stage: connector scores + rule-based classifier ---
    union1 = make_union(
        VectorizerConnector(on='name', analyzer='word'),
        VectorizerConnector(on='street', analyzer='word'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='duns'),
    )
    imp1 = SimpleImputer(strategy='constant', fill_value=0)
    transformer1 = make_pipeline(union1, imp1)

    def myfunc(X):
        # Column indices mirror the order in which the connectors are listed
        # above (name, street, countrycode, duns) -- assumes each connector
        # contributes exactly one column; TODO confirm.
        name_score = X[:, 0]
        street_score = X[:, 1]
        country_score = X[:, 2]
        duns_score = X[:, 3]
        text_hit = np.logical_or(name_score > 0.3, street_score > 0.3)
        country_and_text = np.logical_and(country_score == 1, text_hit)
        return np.logical_or(duns_score == 1, country_and_text)

    clf1 = FunctionClassifier(func=myfunc)
    lrmodel = PipeDfClf(transformer=transformer1, classifier=clf1)

    # --- second stage: side-by-side comparators + default classifier ---
    union2 = make_union(
        SbsApplyComparator(on='name', comparator='simple'),
        SbsApplyComparator(on='name', comparator='token'),
        SbsApplyComparator(on='street', comparator='simple'),
        SbsApplyComparator(on='city', comparator='simple'),
        SbsApplyComparator(on='postalcode', comparator='simple'),
    )
    imp2 = SimpleImputer(strategy='constant', fill_value=0)
    transformer2 = make_pipeline(union2, imp2)
    clf = Classifier()
    sbsmodel = PipeSbsClf(transformer=transformer2, classifier=clf)

    # --- combined model ---
    totalpipe = PruningDfSbsClf(lrmodel=lrmodel, sbsmodel=sbsmodel)
    totalpipe.fit(X=X_lr, y_lr=y_true, y_sbs=y_true)
    print(totalpipe.score(X=X_lr, y=y_true))
def test_get_feature_names():
    """Smoke-test ``get_feature_names`` on a union of connectors and comparators.

    Builds a union of two vectorizer connectors, two ``DfApplyComparator``
    and one ``ExactConnector`` (all on ``'name'``), then prints the feature
    names and how many there are. The test passes as long as
    ``get_feature_names`` does not raise.
    """
    # The former unused local (``X = X_lr``) and the vacuous ``assert True``
    # were dead code and have been removed; no stronger assertion is made
    # because the expected names are not specified here.
    stages = [
        VectorizerConnector(on='name', analyzer='char'),
        VectorizerConnector(on='name', analyzer='word'),
        DfApplyComparator(on='name', comparator='simple'),
        DfApplyComparator(on='name', comparator='token'),
        ExactConnector(on='name'),
    ]
    pipe = make_union(*stages)
    scorecols = pipe.get_feature_names()
    print(scorecols)
    print(len(scorecols))
def test_makeunion():
    """Check that a union of two connectors returns one row per source/target pair.

    Both connectors work on ``'name'`` and take their index naming
    (``ixname``, ``source_suffix``, ``target_suffix``) from the module-level
    ``ix_names`` mapping. The expected row count is
    ``X_lr[0].shape[0] * X_lr[1].shape[0]``.
    """
    print('\n', 'starting test_makeunion')

    # Only these three keys are forwarded; ix_names may hold more.
    naming = {
        'ixname': ix_names['ixname'],
        'source_suffix': ix_names['source_suffix'],
        'target_suffix': ix_names['target_suffix'],
    }
    union = make_union(
        VectorizerConnector(on='name', analyzer='char', **naming),
        ExactConnector(on='name', **naming),
    )
    X_score = union.fit_transform(X=X_lr)

    n_source = X_lr[0].shape[0]
    n_target = X_lr[1].shape[0]
    assert X_score.shape[0] == n_source * n_target

    print('\n test_makeunion successful', '\n\n')
def test_pipe_df():
    """Run connector scoring, side-by-side scoring and a cross-validated classifier.

    Steps:

    1. Load 100 source and 100 target rows and check their columns match.
    2. Score pairs with a ``DfConnector`` wrapping a ``FeatureUnion`` of
       connectors (name, street, countrycode).
    3. For the index returned by the connector, build the side-by-side frame
       and compute further scores with ``scorer_sbs``.
    4. Concatenate both score sets column-wise and cross-validate (5 folds)
       an impute / normalize / PCA / gradient-boosting pipeline, printing the
       average precision, recall and accuracy.

    NOTE(review): ``scorer_sbs`` is read from the enclosing module; its
    definition is not visible from this function -- confirm it exists.
    """
    # ``pd.datetime`` was an alias of the standard-library class; it was
    # deprecated in pandas 1.0 and removed in pandas 2.0. Import the real
    # class locally so the timestamps keep working on current pandas.
    from datetime import datetime

    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=100)
    assert df_source.columns.equals(df_target.columns)
    print(datetime.now(), ' | ', 'number of rows on left:{}'.format(df_source.shape[0]))
    print(datetime.now(), ' | ', 'number of rows on right:{}'.format(df_target.shape[0]))

    scorer = FeatureUnion(transformer_list=[
        ('name_char', VectorizerConnector(on='name', analyzer='char')),
        ('street_char', VectorizerConnector(on='street', analyzer='char')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ])
    dfcon = DfConnector(scorer=scorer)
    Xsm = dfcon.fit_transform(X=[df_source, df_target])

    # Everything downstream is aligned on the index produced by the connector.
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    Xsbs = dfcon.getsbs(X=[df_source, df_target], on_ix=ix_con)

    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(
        data=scores_further,
        index=ix_con,
        columns=[c[0] for c in _sbs_score_list],
    )
    scores_further = pd.concat([Xsm, scores_further], axis=1, ignore_index=False)
    X = scores_further

    scoring = ['precision', 'recall', 'accuracy']
    print(datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe, X=X, y=y_true, scoring=scoring, cv=5)
    for c in scoring:
        print(datetime.now(), ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
def test_exact():
    """Check ``ExactConnector`` on ``'name'`` against the module-level ``X_lr``.

    Asserts that the NaN-ignoring sum of the transformed scores is 2, calls
    ``show_pairs`` once, then checks that a freshly built connector returns
    ``X_lr[0].shape[0] * X_lr[1].shape[0]`` rows.
    """
    print('\n', 'starting test_exact')

    # Index naming shared by both connector instances below.
    naming = {
        'ixname': ix_names['ixname'],
        'source_suffix': ix_names['source_suffix'],
        'target_suffix': ix_names['target_suffix'],
    }

    first = ExactConnector(on='name', **naming)
    ## Show side by side
    y = first.transform(X=X_lr)
    assert np.nansum(y) == 2
    # Called only to exercise the method; the returned frame is not inspected.
    first.show_pairs(X=X_lr)

    second = ExactConnector(on='name', **naming)
    score = second.transform(X=X_lr)
    n_source = X_lr[0].shape[0]
    n_target = X_lr[1].shape[0]
    assert score.shape[0] == n_source * n_target

    print('\n test_exact successful', '\n\n')
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from suricate.preutils import createmultiindex
# ESCONNECTOR
from suricate.dbconnectors import EsConnector
import elasticsearch
from suricate.metrics.metrics import get_commonscores

# (name, transformer) pairs for the connector-based scores: word vectorizers
# on name and street, a char vectorizer on city, and exact connectors on
# countrycode, duns and postalcode.
_lr_score_list = [
    ('name_vecword',
     VectorizerConnector(on='name', analyzer='word', ngram_range=(1, 2))),
    ('street_vecword',
     VectorizerConnector(on='street', analyzer='word', ngram_range=(1, 2))),
    ('city_vecchar',
     VectorizerConnector(on='city', analyzer='char', ngram_range=(1, 3))),
    ('countrycode_exact', ExactConnector(on='countrycode')),
    ('duns_exact', ExactConnector(on='duns')),
    ('postalcode_exact', ExactConnector(on='postalcode')),
]

# Score names only, in the same order as _lr_score_list.
_lr_score_cols = [score_name for score_name, _transformer in _lr_score_list]

# (name, transformer) pairs for the side-by-side comparator scores.
_sbs_score_list = [
    ('name_fuzzy', SbsApplyComparator(on='name', comparator='simple')),
    ('street_fuzzy', SbsApplyComparator(on='street', comparator='simple')),
    ('name_token', SbsApplyComparator(on='name', comparator='token')),
    ('street_token', SbsApplyComparator(on='street', comparator='token')),
    ('city_fuzzy', SbsApplyComparator(on='city', comparator='simple')),
    ('postalcode_fuzzy', SbsApplyComparator(on='postalcode', comparator='simple')),
    ('postalcode_contains', SbsApplyComparator(on='postalcode', comparator='contains')),
]