Esempio n. 1
0
def test_getsbs(esconnectornew):
    """The side-by-side view must be indexed by the similarity-matrix index
    and contain a '<col>_source' column for every column of the source."""
    source = getsource(nrows=50)
    sim_matrix = esconnectornew.fit_transform(X=source)
    pairs_ix = sim_matrix.index
    sbs = esconnectornew.getsbs(X=source, on_ix=pairs_ix)
    assert sbs.index.equals(pairs_ix)
    for col in source.columns:
        assert col + '_source' in sbs.columns
    return True
Esempio n. 2
0
def test_search_record(esconnectornew):
    """Searching a sampled source record should return at most `size` hits,
    unpacked as a non-empty list of dicts."""
    source = getsource(nrows=100)
    sample_record = source.sample().iloc[0]
    raw_response = esconnectornew.search_record(record=sample_record)
    hits = unpack_allhits(raw_response)
    assert len(hits) <= esconnectornew.size
    assert isinstance(hits, list)
    assert isinstance(hits[0], dict)
    print(raw_response)
    print(hits)
Esempio n. 3
0
def test_scorecols_datacols(esconnectornew):
    """The unpacked ES hits, once loaded into a DataFrame, must expose the
    'es_rank' and 'es_score' score columns.

    Fix: ``Index.contains`` was deprecated in pandas 0.25 and removed in
    pandas 1.0 — membership is tested with the ``in`` operator instead.
    """
    df_source = getsource(nrows=10)
    for c in df_source.sample(1).index:
        record = df_source.loc[c]
        res = esconnectornew.search_record(record=record)
        score = unpack_allhits(res)
        df = pd.DataFrame.from_dict(score, orient='columns').rename(
                columns={
                    'ix': 'ix_target'
                })
        scorecols = pd.Index(['es_rank', 'es_score'])
        # `in` replaces the removed Index.contains method.
        assert scorecols[0] in df.columns
        assert scorecols[1] in df.columns
Esempio n. 4
0
def test_pipe_df():
    """End-to-end scoring pipeline on the DataFrame connector: build the
    similarity matrix, enrich it with side-by-side scores, then
    cross-validate a classifier against the ground-truth labels.

    Fix: ``pd.datetime`` was deprecated in pandas 0.25 and removed in
    pandas 1.0 — the stdlib ``datetime`` class is used for timestamps.
    """
    from datetime import datetime  # pd.datetime no longer exists in pandas >= 1.0

    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=100)
    # Both sides must share the same schema for side-by-side comparison.
    assert df_source.columns.equals(df_target.columns)
    print(datetime.now(), ' | ',
          'number of rows on left:{}'.format(df_source.shape[0]))
    print(datetime.now(), ' | ',
          'number of rows on right:{}'.format(df_target.shape[0]))
    # Similarity features: char-level tf-idf on name/street, exact country match.
    scorer = FeatureUnion(transformer_list=[
        ('name_char', VectorizerConnector(on='name', analyzer='char')),
        ('street_char', VectorizerConnector(on='street', analyzer='char')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ])
    dfcon = DfConnector(scorer=scorer)
    # Xsm is the similarity matrix over candidate pairs.
    Xsm = dfcon.fit_transform(X=[df_source, df_target])

    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    # Side-by-side view feeds the second-stage (sbs) scorer.
    Xsbs = dfcon.getsbs(X=[df_source, df_target], on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further,
                                  index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    scores_further = pd.concat([Xsm, scores_further],
                               axis=1,
                               ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor',
         GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe,
                            X=X,
                            y=y_true,
                            scoring=scoring,
                            cv=5)
    for c in scoring:
        print(datetime.now(),
              ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
Esempio n. 5
0
def test_pipe_es():
    """End-to-end scoring pipeline on the Elasticsearch connector: query ES
    for candidate pairs, enrich the similarity matrix with side-by-side
    scores, then cross-validate a classifier against the ground truth.

    Fix: ``pd.datetime`` was deprecated in pandas 0.25 and removed in
    pandas 1.0 — the stdlib ``datetime`` class is used for timestamps.
    """
    from datetime import datetime  # pd.datetime no longer exists in pandas >= 1.0

    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=None)
    # Both sides must share the same schema for side-by-side comparison.
    assert df_source.columns.equals(df_target.columns)
    print(datetime.now(), ' | ',
          'number of rows on left:{}'.format(df_source.shape[0]))
    print(datetime.now(), ' | ',
          'number of rows on right:{}'.format(df_target.shape[0]))
    esclient = elasticsearch.Elasticsearch()
    # Per-column matching strategy used to build the ES query.
    scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
    escon = EsConnector(client=esclient,
                        scoreplan=scoreplan,
                        index="right",
                        explain=False,
                        size=10)
    # Xsm is the similarity matrix
    Xsm = escon.fit_transform(X=df_source)
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]
    # Side-by-side view feeds the second-stage (sbs) scorer.
    Xsbs = escon.getsbs(X=df_source, on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    scores_further = pd.DataFrame(data=scores_further,
                                  index=ix_con,
                                  columns=[c[0] for c in _sbs_score_list])
    # Keep only the ES relevance score from the similarity matrix.
    scores_further = pd.concat([Xsm[['es_score']], scores_further],
                               axis=1,
                               ignore_index=False)
    X = scores_further
    scoring = ['precision', 'recall', 'accuracy']
    print(datetime.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor',
         GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])

    scores = cross_validate(estimator=pipe,
                            X=X,
                            y=y_true,
                            scoring=scoring,
                            cv=5)
    for c in scoring:
        print(datetime.now(),
              ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
Esempio n. 6
0
def test_getresults(esconnectornew):
    """Smoke-test a raw Elasticsearch search built from a sampled record."""
    sample_record = getsource(nrows=10).sample().iloc[0]
    query = esconnectornew._write_es_query(sample_record)
    response = esconnectornew.client.search(body=query, index="right")
    print(response)
Esempio n. 7
0
def test_create_query(esconnectornew):
    """Smoke-test building an ES query from a sampled source record."""
    sample_record = getsource(nrows=10).sample().iloc[0]
    query = esconnectornew._write_es_query(sample_record)
    print(query)
Esempio n. 8
0
def test_transform(esconnectornew):
    """fit_transform on the source must yield a two-column DataFrame."""
    source = getsource(nrows=100)
    result = esconnectornew.fit_transform(X=source)
    assert isinstance(result, pd.DataFrame)
    assert result.shape[1] == 2
Esempio n. 9
0
from tutorial.main.stepbystep.stepbysteputils.pgconnector import create_engine_ready
from suricate.data.companies import getsource, gettarget
import pandas as pd
import numpy as np

# Database engine used below to load the ground-truth labels.
engine = create_engine_ready()

# filefolder = '~/'
# leftpath = 'source.csv'
# rightpath = 'target.csv'
# df_source = pd.read_csv(filefolder + leftpath, index_col=0, sep='|', encoding='utf-8')
# df_target = pd.read_csv(filefolder + rightpath, index_col=0, sep='|', encoding='utf-8')
# Raw fixture data: 500 source rows, full target table.
df_source_raw = getsource(nrows=500)
df_target_raw = gettarget(nrows=None)

from sklearn.model_selection import train_test_split


def rebuild_ytrue(ix):
    """Reconstruct the ground-truth label series for the given pairs index.

    Positive labels are read from the ``y_true`` table in the database;
    every pair in ``ix`` not found there is labelled 0.

    Args:
        ix: pairs index (ix_source, ix_target) to label.

    Returns:
        pd.Series named 'y_true', indexed by ``ix``.
    """
    positives = pd.read_sql(
        sql="SELECT * FROM y_true WHERE y_true.y_true = 1",
        con=engine
    ).set_index(['ix_source', 'ix_target'], drop=True)['y_true']
    labels = pd.Series(data=np.zeros(shape=len(ix)), index=ix, name='y_true')
    overlap = positives.index.intersection(ix)
    labels.loc[overlap] = positives.loc[overlap]
    return labels


def prepare_source(df):
    """
Esempio n. 10
0
from sklearn.pipeline import make_union

from suricate.dftransformers.vectorizer import VectorizerConnector
from suricate.data.base import ix_names
from suricate.data.companies import getsource, gettarget, getXst, getytrue
# Module-level fixtures shared by the tests below: 100-row source and
# target frames, their candidate-pair representation, and the matching
# ground-truth labels.
left = getsource(nrows=100)
right = gettarget(nrows=100)
X_lr = getXst(nrows=100)
y_true = getytrue(Xst=X_lr)


def test_loaddata():
    """Print the fixture index name and row counts; reaching the final
    assert means the module-level data loading succeeded."""
    print(ix_names['ixname'])
    print(left.shape[0])
    print(right.shape[0])
    assert True


def test_tfidf():
    """The char-level tf-idf union must produce one score row per
    (source, target) pair, i.e. the full cartesian product."""
    n_pairs = left.shape[0] * right.shape[0]
    transformers = [
        VectorizerConnector(on='name', analyzer='char', pruning=False),
        VectorizerConnector(on='street', analyzer='char', pruning=False),
    ]
    union = make_union(*transformers)
    union.fit(X=X_lr)
    scored = union.transform(X=X_lr)
    assert scored.shape[0] == n_pairs
Esempio n. 11
0
import pandas as pd
from tutorial.main.stepbystep.stepbysteputils.pgconnector import create_engine_ready

# Database engine for the step-by-step tutorial.
engine = create_engine_ready()
# filefolder = '~/'
# leftpath = 'source.csv'
# rightpath = 'target.csv'
# df_source = pd.read_csv(filefolder + leftpath, index_col=0, sep='|', encoding='utf-8')
# df_target = pd.read_csv(filefolder + rightpath, index_col=0, sep='|', encoding='utf-8')

from suricate.data.companies import getsource, gettarget

# Raw fixture data: full source and target tables (nrows=None loads everything).
df_source_raw=getsource(nrows=None)
df_target_raw = gettarget(nrows=None)

def prepare_source(df):
    """Prepare the source dataframe for matching.

    Currently a pass-through placeholder: the input is returned unchanged.

    Args:
        df: source dataframe.

    Returns:
        pd.DataFrame
    """
    return df

def prepare_target(df):
    """

    Args:
Esempio n. 12
0
def test_load_source():
    """Loading the source data with default arguments must return a DataFrame."""
    frame = getsource()
    print(frame.sample(10))
    assert isinstance(frame, pd.DataFrame)