Example #1
def test_load_ytrue():
    ix_all = createmultiindex(X=getXst())
    y = getytrue()
    assert y.shape[0] == ix_all.shape[0]
    assert unique(y).shape[0] == 2
    print(y.sample(10))
    assert isinstance(y, pd.Series)
Example #2
def test_pruningpipe():
    print('start', pd.datetime.now())
    n_rows = 500
    n_cluster = 25
    n_simplequestions = 50
    n_pointedquestions = 50
    Xst = getXst(nrows=n_rows)
    ixc = createmultiindex(X=Xst)
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(pd.datetime.now(), 'data loaded')
    pipe = PruningPipe(
        connector=DfConnector(
            scorer=Pipeline(steps=[
                ('scores', FeatureUnion(_lr_score_list)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=0))]
            )
        ),
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=Xst, y=y_true)
    y_pred = pipe.predict(X=Xst)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
Example #3
def test_esconnector():
    print('start', pd.datetime.now())
    n_rows = 500
    n_cluster = 25
    Xst = getXst(nrows=n_rows)
    left = Xst[0]
    esclient = elasticsearch.Elasticsearch()
    scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=20
    )
    ixc = createmultiindex(X=Xst)
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(pd.datetime.now(), 'data loaded')
    pipe = PruningPipe(
        connector=escon,
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=left, y=y_true)
    y_pred = pipe.predict(X=left)
    scores = get_commonscores(y_pred=y_pred, y_true=y_true)
    precision = scores['precision']
    recall = scores['recall']
    accuracy = scores['balanced_accuracy']
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
Example #4
def test_sbsmodel():
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)
    df_sbs = DfVisualSbs().fit_transform(X=X_lr)
    df_sbs = df_sbs.loc[y_true.index]
    transformer = make_union(*[
        SbsApplyComparator(on='name', comparator='simple'),
        SbsApplyComparator(on='name', comparator='token'),
        SbsApplyComparator(on='street', comparator='simple')
    ])
    imp = SimpleImputer(strategy='constant', fill_value=0)
    transformer = make_pipeline(*[transformer, imp])
    clf = Classifier()
    mypipe = PipeSbsClf(transformer=transformer, classifier=clf)
    mypipe.fit(X=df_sbs, y=y_true)
    print(mypipe.score(X=df_sbs, y=y_true))
Example #5
def test_lrmodel():
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)
    scorer = make_union(*[
        VectorizerConnector(on='name', analyzer='char'),
        VectorizerConnector(on='street', analyzer='char'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='postalcode'),
        ExactConnector(on='duns')
    ])
    imp = SimpleImputer(strategy='constant', fill_value=0)
    transformer = make_pipeline(*[scorer, imp])
    clf = Classifier()
    mypipe = PipeDfClf(transformer=transformer, classifier=clf)
    X_score = mypipe.transformer.fit_transform(X=X_lr)
    mypipe.fit(X=X_lr, y=y_true)
    print(mypipe.score(X=X_lr, y=y_true))
Example #6
def test_explorer():
    print(pd.datetime.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200
    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.datetime.now(), 'data loaded')
    connector = DfConnector(scorer=Pipeline(steps=[
        ('scores', FeatureUnion(_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0))
    ]))
    explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                        n_simple=n_simplequestions,
                        n_hard=n_hardquestions)
    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.datetime.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index
    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc),
                                    fit_cluster=True)
    print(pd.datetime.now(),
          'length of ix_simple {}'.format(ix_simple.shape[0]))
    sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
    print('***** SBS SIMPLE ******')
    print(sbs_simple.sample(5))
    print('*****')
    y_simple = y_true.loc[ix_simple]
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc),
                                y=y_simple)
    print(pd.datetime.now(), 'length of ix_hard {}'.format(ix_hard.shape[0]))
    sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
    print(sbs_hard.sample(5))
    print('*****')
    y_train = y_true.loc[ix_simple.union(ix_hard)]
    print('length of y_train: {}'.format(y_train.shape[0]))
    explorer.fit(X=pd.DataFrame(data=Xsm, index=ixc),
                 y=y_train,
                 fit_cluster=True)
    print('results of pred:\n',
          pd.Series(explorer.predict(X=Xsm)).value_counts())
    print('****')
Example #7
def test_pruning():
    print('start', pd.datetime.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200
    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.datetime.now(), 'data loaded')
    connector = DfConnector(scorer=Pipeline(steps=[
        ('scores', FeatureUnion(_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0))
    ]))
    explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                        n_simple=n_simplequestions,
                        n_hard=n_hardquestions)
    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.datetime.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index
    y_true = y_true.loc[ixc]

    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc),
                                    fit_cluster=True)
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc),
                                y=y_true.loc[ix_simple])
    ix_train = ix_simple.union(ix_hard)
    print('number of training samples:{}'.format(ix_train.shape[0]))
    X_train = pd.DataFrame(data=Xsm, index=ixc).loc[ix_train]
    y_train = y_true.loc[ix_train]

    explorer.fit(X=X_train, y=y_train, fit_cluster=True)
    y_pruning = explorer.predict(X=Xsm)
    y_pruning = pd.Series(data=y_pruning, name='y_pruning', index=ixc)
    y_pred = (y_pruning > 0).astype(int)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\npruning scores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.
          format(precision, recall, accuracy))
Example #8
def test_hardquestions(fixture_data, fixture_scores):
    n_clusters = 5
    n_questions = 6
    X_lr = fixture_data
    y_true = getytrue()
    scorer = fixture_scores
    X_score = scorer.fit_transform(X=X_lr)
    cluster = KMeans(n_clusters=n_clusters)
    y_cluster = pd.Series(
        data=cluster.fit_predict(X=X_score),
        index=createmultiindex(X=X_lr)
    )
    questions = HardQuestions(n_questions=n_questions)
    ix_questions = questions.fit_transform(X=y_cluster, y=y_true)
    assert ix_questions.ndim == 1
    assert ix_questions.shape[0] <= n_questions * n_clusters
    assert ix_questions.shape[0] > 0
    X_sbs = DfVisualSbs().fit_transform(X=X_lr)
    X_questions = X_sbs.loc[ix_questions]
    assert X_questions.shape[0] == ix_questions.shape[0]
Example #9
def test_pipe_df():
    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=100)
    assert df_source.columns.equals(df_target.columns)
    print(pd.datetime.now(), ' | ',
          'number of rows on left:{}'.format(df_source.shape[0]))
    print(pd.datetime.now(), ' | ',
          'number of rows on right:{}'.format(df_target.shape[0]))
    scorer = FeatureUnion(transformer_list=[
        ('name_char', VectorizerConnector(on='name', analyzer='char')),
        ('street_char', VectorizerConnector(on='street', analyzer='char')),
        ('countrycode_exact', ExactConnector(on='countrycode'))
    ])
    dfcon = DfConnector(scorer=scorer)
    Xsm = dfcon.fit_transform(X=[df_source, df_target])

    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    Xsbs = dfcon.getsbs(X=[df_source, df_target], on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further,
                                  index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    scores_further = pd.concat([Xsm, scores_further],
                               axis=1,
                               ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(pd.datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5))
    ])
    scores = cross_validate(estimator=pipe,
                            X=X,
                            y=y_true,
                            scoring=scoring,
                            cv=5)
    for c in scoring:
        print(pd.datetime.now(),
              ' | {} score: {}'.format(c, np.average(scores['test_' + c])))
Example #10
def test_pipeModel():
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)
    transformer1 = make_union(*[
        VectorizerConnector(on='name', analyzer='word'),
        VectorizerConnector(on='street', analyzer='word'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='duns')
    ])
    imp1 = SimpleImputer(strategy='constant', fill_value=0)
    transformer1 = make_pipeline(*[transformer1, imp1])

    def myfunc(X):
        y_name = X[:, 0]
        y_street = X[:, 1]
        y_country = X[:, 2]
        y_duns = X[:, 3]
        y_return = np.logical_or(
            y_duns == 1,
            np.logical_and(y_country == 1,
                           np.logical_or(y_name > 0.3, y_street > 0.3)))
        return y_return

    clf1 = FunctionClassifier(func=myfunc)
    lrmodel = PipeDfClf(transformer=transformer1, classifier=clf1)
    transformer2 = make_union(*[
        SbsApplyComparator(on='name', comparator='simple'),
        SbsApplyComparator(on='name', comparator='token'),
        SbsApplyComparator(on='street', comparator='simple'),
        SbsApplyComparator(on='city', comparator='simple'),
        SbsApplyComparator(on='postalcode', comparator='simple'),
    ])
    imp2 = SimpleImputer(strategy='constant', fill_value=0)
    transformer2 = make_pipeline(*[transformer2, imp2])
    clf = Classifier()
    sbsmodel = PipeSbsClf(transformer=transformer2, classifier=clf)
    totalpipe = PruningDfSbsClf(lrmodel=lrmodel, sbsmodel=sbsmodel)
    totalpipe.fit(X=X_lr, y_lr=y_true, y_sbs=y_true)
    print(totalpipe.score(X=X_lr, y=y_true))
Example #11
def test_clusterclassifier(fixture_scores, fixture_data):
    n_clusters = 10
    n_questions = 200
    X_lr = fixture_data
    y_true = getytrue(Xst=X_lr)
    X_raw = fixture_scores.fit_transform(X=X_lr)
    X_reduced = PCA(n_components=3).fit_transform(X_raw)
    cluster = KMeans(n_clusters=n_clusters)
    y_cluster = pd.Series(data=cluster.fit_predict(X=X_reduced),
                          index=createmultiindex(X=X_lr))
    questions = SimpleQuestions(n_questions=n_questions)
    ix_questions = questions.fit_transform(X=y_cluster)
    y_true = y_true.loc[y_cluster.index.intersection(y_true.index)]
    print('number of labelled rows found: {}'.format(len(y_true)))
    clf = ClusterClassifier(cluster=cluster)
    clf.fit(X=y_cluster, y=y_true)
    print('all match: {}'.format(clf.allmatch))
    print('no match: {}'.format(clf.nomatch))
    print('mixed match: {}'.format(clf.mixedmatch))
    print('not found: {}'.format(clf.notfound))
    y_pred = clf.predict(X=y_cluster)
    res = pd.Series(y_pred).value_counts()
    print(res)
Example #12
def test_pipe_es():
    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=None)
    assert df_source.columns.equals(df_target.columns)
    print(pd.datetime.now(), ' | ',
          'number of rows on left:{}'.format(df_source.shape[0]))
    print(pd.datetime.now(), ' | ',
          'number of rows on right:{}'.format(df_target.shape[0]))
    esclient = elasticsearch.Elasticsearch()
    scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
    escon = EsConnector(client=esclient,
                        scoreplan=scoreplan,
                        index="right",
                        explain=False,
                        size=10)
    # Xsm is the similarity matrix
    Xsm = escon.fit_transform(X=df_source)
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    Xsbs = escon.getsbs(X=df_source, on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further,
                                  index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    scores_further = pd.concat([Xsm[['es_score']], scores_further],
                               axis=1,
                               ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(pd.datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5))
    ])

    scores = cross_validate(estimator=pipe,
                            X=X,
                            y=y_true,
                            scoring=scoring,
                            cv=5)
    for c in scoring:
        print(pd.datetime.now(),
              ' | {} score: {}'.format(c, np.average(scores['test_' + c])))
Example #13
    ('postalcode_fuzzy', SbsApplyComparator(on='postalcode', comparator='simple')),
    ('postalcode_contains', SbsApplyComparator(on='postalcode', comparator='contains')),
]
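
# NOTE (illustrative assumption): the top of this example, including the actual
# definition of the _lr_score_list used below, is not shown in this excerpt.
# A minimal sketch, assuming dataframe-level connectors like those used in the
# other examples, could look like:
_lr_score_list = [
    ('name_char', VectorizerConnector(on='name', analyzer='char')),
    ('street_char', VectorizerConnector(on='street', analyzer='char')),
    ('countrycode_exact', ExactConnector(on='countrycode')),
    ('duns_exact', ExactConnector(on='duns')),
]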

n_rows = 500 # Number of rows to compare in each dataset
n_cluster = 10 # Number of clusters used in the exploratory step
n_simplequestions = 100 # Number of simple questions per cluster
n_pointedquestions = 100 # Number of additional questions for clusters with mixed matches


## Load the data
print('start', pd.datetime.now())
Xst = getXst(nrows=n_rows)
ixc = createmultiindex(X=Xst)
# Load the ground-truth labels corresponding to the pairs in Xst
y_true = getytrue().loc[ixc]
print(y_true.value_counts())
print(pd.datetime.now(), 'data loaded')

## Explore the data:
connector = DfConnector(
        scorer=Pipeline(steps=[
            ('scores', FeatureUnion(_lr_score_list)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=0))]
        )
    )
### Fit the cluster (unsupervised)
explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster), n_simple=n_simplequestions, n_hard=n_pointedquestions)
Xst = connector.fit_transform(X=Xst)
explorer.fit_cluster(X=Xst)
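# The excerpt stops after fitting the clusterer. A hedged sketch of how the
# exploration step would typically continue, following the pattern of
# Examples #6 and #7 above (not part of the original script; fit_cluster=False
# is assumed to be valid here since the clusterer was already fitted above):
ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xst, index=ixc), fit_cluster=False)
ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xst, index=ixc), y=y_true.loc[ix_simple])
ix_train = ix_simple.union(ix_hard)
explorer.fit(X=pd.DataFrame(data=Xst, index=ixc).loc[ix_train], y=y_true.loc[ix_train], fit_cluster=False)
y_pred = pd.Series(data=explorer.predict(X=Xst), index=ixc, name='y_pred')
print(y_pred.value_counts())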
Example #14
from sklearn.pipeline import make_union

from suricate.dftransformers.vectorizer import VectorizerConnector
from suricate.data.base import ix_names
from suricate.data.companies import getsource, gettarget, getXst, getytrue
left = getsource(nrows=100)
right = gettarget(nrows=100)
X_lr = getXst(nrows=100)
y_true = getytrue(Xst=X_lr)


def test_loaddata():
    print(ix_names['ixname'])
    print(left.shape[0])
    print(right.shape[0])
    assert True


def test_tfidf():
    expected_shape = left.shape[0] * right.shape[0]
    stages = [
        VectorizerConnector(on='name', analyzer='char', pruning=False),
        VectorizerConnector(on='street', analyzer='char', pruning=False),
    ]
    scorer = make_union(*stages)
    scorer.fit(X=X_lr)
    X_score = scorer.transform(X=X_lr)
    assert X_score.shape[0] == expected_shape
    pass