コード例 #1
0
# 2.2 conclusion - visualise the PU-learning bagging scores for the rows
# with `label=0`, i.e. the `unlabeled` data.
# Shows how the unlabeled set is scored after the bagging approach.
unlabeled = y == 0
unlabeled_points = X[unlabeled]
# Marker styling shared by both scatter plots below.
marker_style = dict(linewidth=0, s=50, alpha=0.5, cmap='jet_r')

plt.scatter(unlabeled_points.feature1,
            unlabeled_points.feature2,
            c=results[unlabeled].output_bag,
            **marker_style)
plt.colorbar(label='Unlabeled样本的预测分值')
plt.title('PU Bagging')
plt.show()

# 3.1 Using `BaggingClassifierPU`
label_sum = sum(y)  # used as the per-bag sample count
bc = BaggingClassifierPU(
    DecisionTreeClassifier(),
    n_estimators=1000,
    max_samples=label_sum,
    n_jobs=-1,
)
bc.fit(X, y)
# Keep column 1 of the out-of-bag decision function as this approach's score.
oob_scores = bc.oob_decision_function_
results['output_skb'] = oob_scores[:, 1]
# Visualize the approach's result
plt.scatter(unlabeled_points.feature1,
            unlabeled_points.feature2,
            c=results[unlabeled].output_skb,
            **marker_style)
plt.colorbar(label='Scores given to unlabeled points')
plt.title(r'Using ${\tt BaggingClassifierPU}$')
plt.show()
コード例 #2
0
)



tfidf = seq_vectorizer(ngram_max=4, downsample=40000)

print(f"FOR BaggingClassifierPU MODELS:\t Sequences: {sequences.shape};\t Binding: {binding.shape};\t Number of ligand id values: {len(lig_id_vals)}. ")

# Fit one BaggingClassifierPU per ligand id and collect the fitted models in
# `models`, keyed by ligand id. A ligand whose data preparation or fit fails
# is skipped (best effort), but the failure is reported instead of being
# silently discarded.
spinner = Spinner()
models = {}
for lig_id in lig_id_vals:
    try:
        X, y = fitter_df_maker(lig_id)
        sys.stdout.write(f"Models dict now populated to length {len(models)}; ")
        bc = BaggingClassifierPU(DecisionTreeClassifier(),
                                 n_estimators=estimators,
                                 #n_jobs=-1,
                                 max_samples=int(sum(y.values)))

        sys.stdout.write(f"next, fitting on ligand #{lig_id}")
        spinner.start()
        try:
            bc.fit(X, y)
        finally:
            # Always stop the spinner, even when fit() raises, so it does not
            # keep running over the next iteration's output.
            spinner.stop()
        models[lig_id] = bc
        # '\r' moves the cursor back to the start of the line so the next
        # status message overwrites this one. See
        # https://stackoverflow.com/questions/23138413/clearing-old-data-from-sys-stdout-in-python
        sys.stdout.write('\r')
    except Exception as exc:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still abort the run.
        sys.stderr.write(f"\nSkipping ligand #{lig_id}: {exc!r}\n")

#with open('models.pickle', 'wb') as mp:
#    pickle.dump(models, mp)
コード例 #3
0
        print(
            print_cm(sklearn.metrics.confusion_matrix(y_orig,
                                                      model.predict(X)),
                     labels=['negative', 'positive']))
        print('')
        print('Precision: ', precision_score(y_orig, model.predict(X)))
        print('Recall: ', recall_score(y_orig, model.predict(X)))
        print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
        print('f1_score: ', f1_score(y_orig, model.predict(X)))

        f1_orig.append(f1_score(y_orig, model.predict(X)))

        print('Training bagging classifier...')
        pu_start = time.perf_counter()
        model = BaggingClassifierPU(xgb.XGBClassifier(),
                                    n_estimators=50,
                                    n_jobs=-1,
                                    max_samples=sum(y1))
        model.fit(X, y1)
        pu_end = time.perf_counter()
        print('Done!')
        print('Time:', pu_end - pu_start)

        # train data
        print('---- {} ----'.format('PU Bagging'))
        print(
            print_cm(sklearn.metrics.confusion_matrix(y_orig,
                                                      model.predict(X)),
                     labels=['negative', 'positive']))
        print('')
        print('Precision: ', precision_score(y_orig, model.predict(X)))
        print('Recall: ', recall_score(y_orig, model.predict(X)))
# Relabel `hidden_size` randomly chosen rows that currently have label 1 as 0.
positive_index = y[y == 1].index
hidden_index = np.random.choice(positive_index, replace=False, size=hidden_size)
y.loc[hidden_index] = 0

# Check the new contents of the set
print('%d positive out of %d total' % (sum(y), len(y)))

# Plot the data set, as the models will see it
unlabeled_rows = X[y == 0]
positive_rows = X[y == 1]
plt.scatter(unlabeled_rows.feature1, unlabeled_rows.feature2,
            c='k', marker='.', linewidth=1, s=10, alpha=0.5,
            label='Unlabeled')
plt.scatter(positive_rows.feature1, positive_rows.feature2,
            c='b', marker='o', linewidth=0, s=50, alpha=0.5,
            label='Positive')
plt.legend()
plt.title('Data set (as seen by the classifiers)')
plt.show()


bc = BaggingClassifierPU(
    DecisionTreeClassifier(),
    n_estimators=1000,   # 1000 trees as usual
    max_samples=sum(y),  # Balance the positives and unlabeled in each bag
    n_jobs=-1,           # Use all cores
)
bc.fit(X, y)

# Store the scores assigned by this approach
results = pd.DataFrame(
    {
        'truth': y_orig,  # The true labels
        'label': y,       # The labels to be shown to models in experiment
    },
    columns=['truth', 'label'],
)
# Column 1 of the out-of-bag decision function is kept as the score.
results['output_bag_tree'] = bc.oob_decision_function_[:, 1]

# Visualize this approach's results
plt.scatter(
    X[y==0].feature1, X[y==0].feature2,
    c = results[y==0].output_bag_tree, linewidth = 0, s = 50, alpha = 0.5,
コード例 #5
0
def run(data_train, data_test, clf_name):
    """Fit the classifier selected by ``clf_name`` and score it on the test set.

    Args:
        data_train: Raw training data, handed to ``train_data_process``.
        data_test: Raw test data, handed to ``test_data_process``.
        clf_name: Name of the classifier to use; must be one of the keys of
            the ``factories`` table below.

    Returns:
        A dict with the keys ``'train samples'``, ``'defective train
        samples'``, ``'precision'``, ``'recall'``, ``'pf'``, ``'F-measure'``,
        ``'accuracy'`` and ``'AUC'``. ``'pf'`` is ``FP / (FP + TN)`` or an
        explanatory string when there are no negative samples. ``'AUC'`` is
        the score, or the error message when ``roc_auc_score`` raises
        ``ValueError``. If fitting, predicting or one of the other metrics
        raises ``ValueError``, the error message string is returned instead
        of the dict.

    Raises:
        KeyError: If ``clf_name`` is not a known classifier name.
    """
    X_train, y_train = train_data_process(data_train)
    X_test, y_true = test_data_process(data_test)

    # Zero-argument factories: only the requested estimator is constructed,
    # instead of building all of them on every call.
    factories = {
        "XGBOD": lambda: XGBOD(random_state=0),
        "KNeighborsClassifier": lambda: KNeighborsClassifier(3),
        "SVC": lambda: SVC(random_state=0),
        "GaussianProcessClassifier":
            lambda: GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DecisionTreeClassifier":
            lambda: DecisionTreeClassifier(random_state=0),
        "RandomForestClassifier":
            lambda: RandomForestClassifier(random_state=0),
        "MLPClassifier": lambda: MLPClassifier(random_state=0),
        "AdaBoostClassifier": lambda: AdaBoostClassifier(),
        "GaussianNB": lambda: GaussianNB(),
        "QuadraticDiscriminantAnalysis":
            lambda: QuadraticDiscriminantAnalysis(),
        "BaggingClassifierPU": lambda: BaggingClassifierPU(
            DecisionTreeClassifier(),
            n_estimators=1000,  # 1000 trees as usual
            # Balance the positives and unlabeled in each bag. Cast to int so
            # a float label sum is treated as a sample count rather than a
            # fraction (scikit-learn's bagging reads a float max_samples as a
            # fraction; presumably BaggingClassifierPU does too - verify).
            max_samples=int(sum(y_train)),
            n_jobs=-1,  # Use all cores
        ),
    }

    clf = factories[clf_name]()
    try:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Confusion-matrix counts based on the truthiness of each label.
        TP = FN = FP = TN = 0
        for label, predicted in zip(y_true, y_pred):
            if label:
                if predicted:
                    TP += 1
                else:
                    FN += 1
            elif predicted:
                FP += 1
            else:
                TN += 1

        # False-positive rate; undefined when the test set has no negatives.
        if (FP + TN) == 0:
            pf = "no negative samples."
        else:
            pf = FP / (FP + TN)

        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError as e:
            # Report the reason in place of the score.
            auc = str(e)
        return {
            'train samples': str(X_train.shape[0]),
            'defective train samples': str(np.sum(y_train)),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'pf': pf,
            'F-measure': f1_score(y_true, y_pred),
            'accuracy': accuracy_score(y_true, y_pred),
            'AUC': auc
        }
    except ValueError as e:
        return str(e)
def train_val_PU(max_a, val, tr1, tr2, loc='./log', start_ep=0, end_ep=45):
    """Train a PU bagging model on two LID sequences and validate on a third.

    This function trains a triple (a combination of 3 random seeds).

    Args:
        max_a: the maximum alpha used to train the PU model.
        val: the random seed number that produces the LID sequence for test.
        tr1: the 1st random seed number that produces the LID sequence for
            training.
        tr2: the 2nd random seed number that produces the LID sequence for
            training.
        loc: the LID sequences' location.
        start_ep: the starting epoch of the LID sequence.
        end_ep: the ending epoch of the LID sequence.

    Returns:
        A tuple ``(records, header)``. ``header`` is ``['train 1, 2 val',
        'recall', bl_label, *other_labels]``, where ``bl_label`` is a
        validation label greater than 100 and ``other_labels`` are the
        remaining distinct validation labels in sorted order. ``records`` is
        aligned with it: ``[[tr1, tr2, val], recall, count(bl_label),
        count(other_label), ...]``. Each count is the number of validation
        rows with that label whose predicted class-1 probability is at least
        0.5, and ``recall`` is ``count(bl_label)`` divided by the sum of the
        validation set's last column.

    Raises:
        ValueError: If a predicted probability is NaN, or if no validation
            label is greater than 100.
        ZeroDivisionError: If the validation set's last column sums to zero.
    """
    # Local import keeps this block self-contained; pandas is already needed
    # for the DataFrame operations below.
    import pandas as pd

    records = [
        [tr1, tr2, val],
    ]
    train1 = LID_assmb(tr1, max_a, start_ep, end_ep, loc)
    train2 = LID_assmb(tr2, max_a, start_ep, end_ep, loc)
    # DataFrame.append was removed in pandas 2.0; concat keeps the same
    # row order (train2 first, then train1).
    total = pd.concat([train2, train1])
    total = total.sample(frac=1).reset_index(drop=True)  # shuffle everything
    # The last column is the target passed to fit(); the feature columns are
    # the start_ep:end_ep slice.
    loss_labels = total.iloc[:, -1].to_numpy()
    total = total.iloc[:, start_ep:end_ep]
    bc = BaggingClassifierPU(DecisionTreeClassifier(),
                             n_estimators=1000,
                             max_samples=int(sum(loss_labels)),
                             n_jobs=-1)
    bc.fit(total, loss_labels)

    v_total = LID_assmb(val, max_a, start_ep, end_ep, loc)
    v_total = v_total.sample(frac=1).reset_index(drop=True)
    v_labels = v_total.iloc[:, -2].to_numpy()
    v_loss_labels = v_total.iloc[:, -1].to_numpy()
    v_total = v_total.iloc[:, start_ep:end_ep]
    pred = bc.predict_proba(v_total)

    distinct_labels = set(v_labels)
    v_summary = dict.fromkeys(distinct_labels, 0.0)
    # A label above 100 is singled out as `bl_label`. If several qualify, the
    # last one met in set iteration order wins (same as before).
    bl_label = None
    for label in distinct_labels:
        if float(label) > 100:
            bl_label = label

    # Count, per label, the rows predicted as class 1 (probability >= 0.5).
    for label, proba in zip(v_labels, pred):
        score = proba[1]
        if np.isnan(score):
            raise ValueError(
                'prediction has illegal value nan, please check the model and data!'
            )
        if score >= 0.5:
            v_summary[label] += 1

    if bl_label is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError(
            'no validation label greater than 100 was found, '
            'cannot compute recall')

    recall = v_summary[bl_label] / float(sum(v_loss_labels))
    records.extend([recall, v_summary[bl_label]])

    other_labels = sorted(distinct_labels - {bl_label})
    header = ['train 1, 2 val', 'recall', bl_label] + other_labels
    records.extend(v_summary[label] for label in other_labels)
    return (records, header)
    '''