Example #1
0
def test_explorer():
    """Run the Explorer workflow end to end on a small sample and print results.

    Steps, in order:
      1. load the input data and the ground-truth labels;
      2. fit a DfConnector and transform the data into a score matrix;
      3. ask simple questions, then hard questions using the simple answers;
      4. fit the Explorer on the union of both question sets;
      5. print the value counts of the Explorer predictions.

    Progress is printed with timestamps. Nothing is asserted or returned.
    """
    # NOTE: pd.datetime was deprecated in pandas 1.0 and removed in 2.0;
    # pd.Timestamp.now() prints in the same 'YYYY-MM-DD HH:MM:SS.ffffff' form.
    print(pd.Timestamp.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200
    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.Timestamp.now(), 'data loaded')
    connector = DfConnector(scorer=Pipeline(
        steps=[('scores', FeatureUnion(_score_list)
                ), ('imputer',
                    SimpleImputer(strategy='constant', fill_value=0))]))
    explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                        n_simple=n_simplequestions,
                        n_hard=n_hardquestions)
    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.Timestamp.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index

    # First round: simple questions (also fits the clusterer).
    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc),
                                    fit_cluster=True)
    print(pd.Timestamp.now(),
          'length of ix_simple {}'.format(ix_simple.shape[0]))
    sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
    print('***** SBS SIMPLE ******')
    print(sbs_simple.sample(5))
    print('*****')

    # Second round: hard questions, driven by the answers to the simple ones.
    y_simple = y_true.loc[ix_simple]
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc),
                                y=y_simple)
    print(pd.Timestamp.now(),
          'length of ix_hard {}'.format(ix_hard.shape[0]))
    sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
    print(sbs_hard.sample(5))
    print('*****')

    # Train on every answered question (simple and hard together).
    y_train = y_true.loc[ix_simple.union(ix_hard)]
    print('length of y_train: {}'.format(y_train.shape[0]))
    explorer.fit(X=pd.DataFrame(data=Xsm, index=ixc),
                 y=y_train,
                 fit_cluster=True)
    print('results of pred:\n',
          pd.Series(explorer.predict(X=Xsm)).value_counts())
    print('****')
Example #2
0
def test_pruning():
    """Fit an Explorer as a pruning classifier and print its scores.

    Steps, in order:
      1. load the input data and the ground-truth labels;
      2. fit a DfConnector and transform the data into a score matrix;
      3. ask simple and hard questions to collect a training index;
      4. fit the Explorer on those samples and predict on the full matrix;
      5. binarise the prediction (> 0 becomes 1) and print precision, recall
         and balanced accuracy against the ground truth.

    Nothing is asserted or returned.
    """
    # NOTE: pd.datetime was deprecated in pandas 1.0 and removed in 2.0;
    # pd.Timestamp.now() prints in the same 'YYYY-MM-DD HH:MM:SS.ffffff' form.
    print('start', pd.Timestamp.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200
    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.Timestamp.now(), 'data loaded')
    connector = DfConnector(scorer=Pipeline(
        steps=[('scores', FeatureUnion(_score_list)
                ), ('imputer',
                    SimpleImputer(strategy='constant', fill_value=0))]))
    explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                        n_simple=n_simplequestions,
                        n_hard=n_hardquestions)
    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.Timestamp.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index
    # Restrict the ground truth to the rows present in the score matrix.
    y_true = y_true.loc[ixc]

    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc),
                                    fit_cluster=True)
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc),
                                y=y_true.loc[ix_simple])
    # Training set = every answered question (simple and hard together).
    ix_train = ix_simple.union(ix_hard)
    print('number of training samples:{}'.format(ix_train.shape[0]))
    X_train = pd.DataFrame(data=Xsm, index=ixc).loc[ix_train]
    y_train = y_true.loc[ix_train]

    explorer.fit(X=X_train, y=y_train, fit_cluster=True)
    y_pruning = explorer.predict(X=Xsm)
    y_pruning = pd.Series(data=y_pruning, name='y_pruning', index=ixc)
    # Any strictly positive pruning output counts as a positive prediction.
    y_pred = (y_pruning > 0).astype(int)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\npruning scores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.
          format(precision, recall, accuracy))
### Fit the clusterer without labels (unsupervised step)
# Relies on n_cluster, n_simplequestions, n_pointedquestions, connector, Xst
# and y_true being defined earlier in the script (not visible in this excerpt).
explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster), n_simple=n_simplequestions, n_hard=n_pointedquestions)
# NOTE(review): Xst is rebound here to the connector's transformed output, so
# the later connector.getsbs(X=Xst, ...) and pipe.fit(X=Xst, ...) calls receive
# the transformed data rather than the original input -- confirm this is intended.
Xst = connector.fit_transform(X=Xst)
explorer.fit_cluster(X=Xst)

### Ask simple questions
# ix_simple is used below as a .loc indexer on y_true and supports .union().
ix_simple = explorer.ask_simple(X=Xst)
Sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
y_simple = y_true.loc[ix_simple]

### Fit the explorer with supervised data (labels of the simple questions)
# fit_cluster=False: the clusterer was already fitted by fit_cluster() above.
explorer.fit(X=Xst, y=y_simple, fit_cluster=False)

### Ask hard (pointed) questions
ix_hard = explorer.ask_hard(X=Xst, y=y_simple)
Sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
y_hard = y_true.loc[ix_hard]

### Obtain the labels for all asked questions (hard and simple together)
y_questions = y_true.loc[ix_hard.union(ix_simple)]


## Define the pruning pipe
# Combines the connector, the fitted explorer (as pruning classifier), a
# FeatureUnion built from _sbs_score_list and a LogisticRegressionCV classifier.
pipe = PruningPipe(
    connector=connector,
    pruningclf=explorer,
    sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
    classifier=LogisticRegressionCV()
)
# Fit the whole pipe using only the labels collected from the questions.
pipe.fit(X=Xst, y=y_questions)
Example #4
0
# Relies on X_cluster, engine, exp, Xst, Xsbs, y_true and y_cluster being
# defined earlier in the script (not visible in this excerpt).
# Index the cluster output by 'ix' (in place), keep a fixed column order and
# write it to the 'cluster_output' SQL table.
X_cluster.set_index('ix', inplace=True)
X_cluster = X_cluster[[
    'ix_source', 'ix_target', 'avg_score', 'y_cluster', 'y_true'
]]
# presumably a pandas DataFrame: if_exists='replace' then overwrites any
# existing table of that name -- TODO confirm
X_cluster.to_sql('cluster_output', con=engine, if_exists='replace')

### Ask simple questions
ix_simple = exp.ask_simple(X=Xst)
Sbs_simple = Xsbs.loc[ix_simple]
# y_true is indexed by row first, then its 'y_true' column is selected.
y_simple = y_true.loc[ix_simple]['y_true']

### Fit the explorer with supervised data (labels of the simple questions)
# fit_cluster=False: the clusterer is not refitted here; presumably it was
# fitted earlier in the script -- TODO confirm
exp.fit(X=Xst, y=y_simple, fit_cluster=False)

### Ask hard (pointed) questions
ix_hard = exp.ask_hard(X=Xst, y=y_simple)
Sbs_hard = Xsbs.loc[ix_hard]
y_hard = y_true.loc[ix_hard]['y_true']

### Obtain the labels for all asked questions (hard and simple together)
y_questions = y_true.loc[ix_hard.union(ix_simple)]['y_true']
# Copy so that adding columns below does not modify Xsbs.
X_questions = Xsbs.loc[y_questions.index].copy()
# NOTE(review): y_cluster is defined outside this excerpt; if it is a pandas
# Series the assignment aligns on the index -- confirm.
X_questions['y_cluster'] = y_cluster
X_questions['y_true'] = y_questions
# Move the current index into columns, then re-index the frame by 'ix'.
X_questions.reset_index(inplace=True, drop=False)
X_questions.set_index('ix', inplace=True)
# Reorder columns: the original Xsbs columns (minus 'ix', now the index)
# first, then the two label columns.
X_questions = X_questions[[c for c in Xsbs.columns if c != 'ix'] +
                          ['y_cluster', 'y_true']]
# Write the labelled questions to the 'questions' SQL table.
X_questions.to_sql('questions', con=engine, if_exists='replace')