# Load the ground-truth labels restricted to the rows selected by `ixc`
# (the same rows as Xst, per the original note) and show the class balance.
y_true = getytrue().loc[ixc]
print(y_true.value_counts())
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0;
# `pd.Timestamp.now()` prints the same timestamp format without a new import.
print(pd.Timestamp.now(), 'data loaded')

## Explore the data:
# The connector scores the input with a union of the scorers in
# `_lr_score_list`, then replaces missing scores with the constant 0.
connector = DfConnector(
    scorer=Pipeline(steps=[
        ('scores', FeatureUnion(_lr_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ])
)

### Fit the clusterer on unlabelled data (unsupervised step)
explorer = Explorer(
    clustermixin=KBinsCluster(n_clusters=n_cluster),
    n_simple=n_simplequestions,
    n_hard=n_pointedquestions,
)
# From here on Xst holds the connector's transformed output, not the raw input.
Xst = connector.fit_transform(X=Xst)
explorer.fit_cluster(X=Xst)

### Ask simple questions
# `ask_simple` returns index labels usable with `.loc` on y_true.
ix_simple = explorer.ask_simple(X=Xst)
# NOTE(review): `getsbs` presumably builds a side-by-side view of the selected
# pairs -- confirm against DfConnector.
Sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
y_simple = y_true.loc[ix_simple]

### Fit the explorer with the supervised (labelled) answers
# fit_cluster=False: the call is told not to refit the clustering done above.
explorer.fit(X=Xst, y=y_simple, fit_cluster=False)

### Ask hard (pointed) questions
ix_hard = explorer.ask_hard(X=Xst, y=y_simple)
Sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
y_hard = y_true.loc[ix_hard]

### Obtain the results of the labels
# Example #2
# 0
# Rebuild y_true: read only the positive pairs (y_true = 1) from SQL, indexed
# by (ix_source, ix_target), then expand them to one row per pair in Xst.
y_true = pd.read_sql(sql="SELECT * FROM y_true WHERE y_true.y_true = 1",
                     con=engine).set_index(['ix_source', 'ix_target'],
                                           drop=True)
# Take an explicit copy so the assignments below write to an independent frame
# instead of a selection derived from Xst (avoids SettingWithCopyWarning).
y_truetemp = Xst[['ix']].copy()
# Every pair defaults to 0; pairs found in the SQL result are overwritten next.
y_truetemp['y_true'] = 0
# Pairs present in both the SQL result and Xst (computed once, used twice).
common_ix = y_true.index.intersection(Xst.index)
y_truetemp.loc[common_ix, 'y_true'] = y_true.loc[common_ix, 'y_true']
# y_truetemp is already an independent copy, so rebinding the name suffices.
y_true = y_truetemp
del y_truetemp
### y_true now shares Xst's index and has two columns: 'ix' and 'y_true'

## Fit the clusterer on unlabelled data (unsupervised step)
exp = Explorer(n_simple=n_questions, n_hard=n_questions)
exp.fit_cluster(X=Xst[['es_score']])
# NOTE(review): the clusterer is fitted on Xst[['es_score']] but predicts on
# the full Xst (which also carries 'ix') -- confirm this is intended.
y_cluster = pd.Series(data=exp.pred_cluster(X=Xst),
                      index=Xst.index,
                      name='y_cluster')
# Assemble the output table: cluster label, score, true label and pair id.
X_cluster = pd.DataFrame(y_cluster)
# Row-wise mean over a single column, i.e. the es_score itself.
X_cluster['avg_score'] = Xst[['es_score']].mean(axis=1)
X_cluster['y_true'] = y_true['y_true']
X_cluster['ix'] = Xst['ix']
# Move the index levels (ix_source, ix_target) into columns and index by 'ix'.
X_cluster = X_cluster.reset_index(drop=False).set_index('ix')
X_cluster = X_cluster[[
    'ix_source', 'ix_target', 'avg_score', 'y_cluster', 'y_true'
]]
# Persist the result, replacing any existing 'cluster_output' table.
X_cluster.to_sql('cluster_output', con=engine, if_exists='replace')

### Ask simple questions