Example #1
0
def fixture_scores():
    """Build the unfitted scoring pipeline.

    The pipeline has three steps: ``'scores'`` (a FeatureUnion of vectorizer
    and exact-match connectors), ``'imputer'`` (constant fill with 0) and
    ``'Scaler'`` (a Normalizer).

    Returns:
        The assembled ``Pipeline`` instance.
    """
    # (name, transformer) pairs, listed in the order they enter the union.
    connectors = [
        ('name_vecword',
         VectorizerConnector(on='name', analyzer='word', ngram_range=(1, 2))),
        ('name_vecchar',
         VectorizerConnector(on='name', analyzer='char', ngram_range=(1, 3))),
        ('street_vecword',
         VectorizerConnector(on='street', analyzer='word', ngram_range=(1, 2))),
        ('street_vecchar',
         VectorizerConnector(on='street', analyzer='char', ngram_range=(1, 3))),
        ('city_vecchar',
         VectorizerConnector(on='city', analyzer='char', ngram_range=(1, 3))),
        ('postalcode_exact', ExactConnector(on='postalcode')),
        ('duns_exact', ExactConnector(on='duns')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ]
    pipeline_steps = [
        ('scores', FeatureUnion(connectors)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
    ]
    return Pipeline(steps=pipeline_steps)
Example #2
0
def test_lrmodel():
    """Fit a PipeDfClf on sample data and print its score on that same data.

    The data is requested with ``nrows=100``; the transformer is a union of
    char vectorizers and exact connectors followed by a constant imputer.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    connectors = [
        VectorizerConnector(on='name', analyzer='char'),
        VectorizerConnector(on='street', analyzer='char'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='postalcode'),
        ExactConnector(on='duns'),
    ]
    transformer = make_pipeline(
        make_union(*connectors),
        SimpleImputer(strategy='constant', fill_value=0),
    )
    mypipe = PipeDfClf(transformer=transformer, classifier=Classifier())

    # The result is not used afterwards, but the call is kept because it fits
    # the transformer before the full pipeline is fitted.
    X_score = mypipe.transformer.fit_transform(X=X_lr)
    mypipe.fit(X=X_lr, y=y_true)
    print(mypipe.score(X=X_lr, y=y_true))
Example #3
0
def test_tfidf():
    """Check that an unpruned union scores every left/right row combination.

    Relies on the module-level ``left``, ``right`` and ``X_lr`` objects.
    """
    n_pairs = left.shape[0] * right.shape[0]
    scorer = make_union(
        VectorizerConnector(on='name', analyzer='char', pruning=False),
        VectorizerConnector(on='street', analyzer='char', pruning=False),
        ExactConnector(on='duns', pruning=False),
    )
    scored = scorer.transform(X=X_lr)
    # With pruning disabled, one output row is expected per (left, right) pair.
    assert scored.shape[0] == n_pairs
Example #4
0
def test_pipeModel():
    """Fit the two-stage PruningDfSbsClf pipeline and print its score.

    Stage one (``lrmodel``) scores pairs with word vectorizers and exact
    connectors and classifies them with a hand-written rule; stage two
    (``sbsmodel``) applies side-by-side comparators and a ``Classifier``.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    # --- stage one: connector scores + rule-based classifier ---
    lr_union = make_union(
        VectorizerConnector(on='name', analyzer='word'),
        VectorizerConnector(on='street', analyzer='word'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='duns'),
    )
    lr_transformer = make_pipeline(
        lr_union, SimpleImputer(strategy='constant', fill_value=0))

    def myfunc(X):
        # Column indices follow the order of the union above:
        # 0 = name, 1 = street, 2 = countrycode, 3 = duns.
        duns_match = X[:, 3] == 1
        country_match = X[:, 2] == 1
        text_close = np.logical_or(X[:, 0] > 0.3, X[:, 1] > 0.3)
        # Match on duns alone, or on country plus a close name or street.
        return np.logical_or(duns_match,
                             np.logical_and(country_match, text_close))

    lrmodel = PipeDfClf(transformer=lr_transformer,
                        classifier=FunctionClassifier(func=myfunc))

    # --- stage two: side-by-side comparators + learned classifier ---
    sbs_union = make_union(
        SbsApplyComparator(on='name', comparator='simple'),
        SbsApplyComparator(on='name', comparator='token'),
        SbsApplyComparator(on='street', comparator='simple'),
        SbsApplyComparator(on='city', comparator='simple'),
        SbsApplyComparator(on='postalcode', comparator='simple'),
    )
    sbs_transformer = make_pipeline(
        sbs_union, SimpleImputer(strategy='constant', fill_value=0))
    sbsmodel = PipeSbsClf(transformer=sbs_transformer, classifier=Classifier())

    totalpipe = PruningDfSbsClf(lrmodel=lrmodel, sbsmodel=sbsmodel)
    totalpipe.fit(X=X_lr, y_lr=y_true, y_sbs=y_true)
    print(totalpipe.score(X=X_lr, y=y_true))
def test_get_feature_names():
    """Print the feature names (and their count) exposed by a mixed union.

    The final assertion is trivially true, so the test only fails if building
    the union or calling ``get_feature_names`` raises.
    """
    X = X_lr  # not used below; kept so the module-level X_lr is still read
    pipe = make_union(
        VectorizerConnector(on='name', analyzer='char'),
        VectorizerConnector(on='name', analyzer='word'),
        DfApplyComparator(on='name', comparator='simple'),
        DfApplyComparator(on='name', comparator='token'),
        ExactConnector(on='name'),
    )
    feature_names = pipe.get_feature_names()
    print(feature_names)
    print(len(feature_names))
    assert True
Example #6
0
def test_makeunion():
    """Check that a union of two connectors scores every source/target pair.

    Relies on the module-level ``ix_names`` mapping and ``X_lr`` pair of frames.
    """
    print('\n', 'starting test_makeunion')
    # Index naming options shared by both connectors.
    ix_kwargs = {
        'ixname': ix_names['ixname'],
        'source_suffix': ix_names['source_suffix'],
        'target_suffix': ix_names['target_suffix'],
    }
    union = make_union(
        VectorizerConnector(on='name', analyzer='char', **ix_kwargs),
        ExactConnector(on='name', **ix_kwargs),
    )
    X_score = union.fit_transform(X=X_lr)
    # One scored row is expected per (source row, target row) combination.
    assert X_score.shape[0] == X_lr[0].shape[0] * X_lr[1].shape[0]
    print('\n test_makeunion successful', '\n\n')
Example #7
0
def test_pipe_df():
    """Cross-validate a classifier on connector scores plus side-by-side scores.

    Steps:
      1. Load source and target frames (requested with ``nrows=100``).
      2. Score candidate pairs with a ``DfConnector`` wrapping a FeatureUnion.
      3. Compute further scores on the side-by-side view of those pairs.
      4. Run 5-fold cross-validation of an impute / normalize / PCA /
         gradient-boosting pipeline and print the averaged metrics.

    Timestamps are taken with the standard library ``datetime`` because the
    ``pd.datetime`` alias was deprecated in pandas 1.0 and removed in 2.0.
    """
    # Local import keeps this block self-contained; behaves exactly like the
    # removed ``pd.datetime`` alias, which pointed at the same class.
    from datetime import datetime

    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=100)
    assert df_source.columns.equals(df_target.columns)
    print(datetime.now(), ' | ',
          'number of rows on left:{}'.format(df_source.shape[0]))
    print(datetime.now(), ' | ',
          'number of rows on right:{}'.format(df_target.shape[0]))
    scorer = FeatureUnion(transformer_list=[
        ('name_char', VectorizerConnector(on='name', analyzer='char')),
        ('street_char', VectorizerConnector(on='street', analyzer='char')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ])
    dfcon = DfConnector(scorer=scorer)
    Xsm = dfcon.fit_transform(X=[df_source, df_target])

    # Restrict the ground truth to the pairs the connector actually returned.
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    Xsbs = dfcon.getsbs(X=[df_source, df_target], on_ix=ix_con)
    # NOTE(review): scorer_sbs is not defined in the visible source; presumably
    # a module-level union built from _sbs_score_list - confirm.
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further,
                                  index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    # Put the connector scores and the side-by-side scores in one frame.
    scores_further = pd.concat([Xsm, scores_further],
                               axis=1,
                               ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor',
         GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe,
                            X=X,
                            y=y_true,
                            scoring=scoring,
                            cv=5)
    for c in scoring:
        print(datetime.now(),
              ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
Example #8
0
def test_exact():
    """Exercise ExactConnector on the 'name' column of the module-level X_lr.

    Checks that the NaN-ignoring sum of the scores equals 2, calls
    ``show_pairs``, and verifies that a second, freshly built connector
    returns one row per source/target combination.
    """
    print('\n', 'starting test_exact')
    # Same constructor arguments for both connector instances.
    connector_kwargs = {
        'on': 'name',
        'ixname': ix_names['ixname'],
        'source_suffix': ix_names['source_suffix'],
        'target_suffix': ix_names['target_suffix'],
    }

    # First instance: score the pairs and show them side by side.
    first_connector = ExactConnector(**connector_kwargs)
    y = first_connector.transform(X=X_lr)
    assert np.nansum(y) == 2
    sbs = first_connector.show_pairs(X=X_lr)

    # Second, independent instance: only the output length is checked.
    second_connector = ExactConnector(**connector_kwargs)
    score = second_connector.transform(X=X_lr)
    assert score.shape[0] == X_lr[0].shape[0] * X_lr[1].shape[0]
    print('\n test_exact successful', '\n\n')
Example #9
0
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score,  balanced_accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from suricate.preutils import createmultiindex

# ESCONNECTOR
from suricate.dbconnectors import EsConnector
import elasticsearch
from suricate.metrics.metrics import get_commonscores

# Named connectors used to score candidate pairs: (score name, transformer).
_lr_score_list = [
    (
        'name_vecword',
        VectorizerConnector(on='name', analyzer='word', ngram_range=(1, 2)),
    ),
    (
        'street_vecword',
        VectorizerConnector(on='street', analyzer='word', ngram_range=(1, 2)),
    ),
    (
        'city_vecchar',
        VectorizerConnector(on='city', analyzer='char', ngram_range=(1, 3)),
    ),
    ('countrycode_exact', ExactConnector(on='countrycode')),
    ('duns_exact', ExactConnector(on='duns')),
    ('postalcode_exact', ExactConnector(on='postalcode')),
]
# Score names only, in the same order as the list above.
_lr_score_cols = [score_name for score_name, _ in _lr_score_list]
# Named side-by-side comparators: (score name, transformer).
_sbs_score_list = [
    (
        'name_fuzzy',
        SbsApplyComparator(on='name', comparator='simple'),
    ),
    (
        'street_fuzzy',
        SbsApplyComparator(on='street', comparator='simple'),
    ),
    (
        'name_token',
        SbsApplyComparator(on='name', comparator='token'),
    ),
    (
        'street_token',
        SbsApplyComparator(on='street', comparator='token'),
    ),
    (
        'city_fuzzy',
        SbsApplyComparator(on='city', comparator='simple'),
    ),
    (
        'postalcode_fuzzy',
        SbsApplyComparator(on='postalcode', comparator='simple'),
    ),
    (
        'postalcode_contains',
        SbsApplyComparator(on='postalcode', comparator='contains'),
    ),
]