Beispiel #1
0
    def label_spread(self, X_train, y_train, gamma = None, max_iter = None):
        """
        Train a Label Spreading model (rbf kernel) from scikit-learn

        Parameters
        ----------
        X_train: Scaled training data
        y_train: Class label
        gamma: Parameter for rbf kernel; when None, scikit-learn's own
            default is used
        max_iter: Maximum number of iterations allowed; when None,
            scikit-learn's own default is used

        Returns
        -------
        Tuple ``(predicted_labels, predicted_proba)``: the fitted model's
        ``transduction_`` labels and the class probabilities predicted for
        X_train
        """
        # Forward only the hyper-parameters that were actually supplied:
        # LabelSpreading expects numeric gamma / max_iter, so passing the
        # None placeholders through would not select its defaults.
        params = {'kernel': 'rbf', 'n_jobs': -1}
        if gamma is not None:
            params['gamma'] = gamma
        if max_iter is not None:
            params['max_iter'] = max_iter
        model = LabelSpreading(**params)

        # Fit the training set
        model.fit(X_train, y_train)

        # Labels assigned to every training point during fitting
        predicted_labels = model.transduction_

        # Class probabilities for the same training points
        predicted_proba = model.predict_proba(X_train)
        return predicted_labels, predicted_proba
Beispiel #2
0
class LabelSpreadingModel(SupervisedW2VModel):
    """Label-spreading classifier over context vectors (kNN kernel)."""

    def fit_with_test(self, test_data):
        """Fit label spreading on the stored context vectors plus test contexts.

        Every answer in ``self.context_vectors`` gets an integer class id
        (its position in ``self.ans_mapping``); each context in
        ``test_data`` is vectorised with ``self.cv`` and added with the
        label -1.
        """
        features, targets = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            features.extend(cvs)
            try:
                class_idx = self.ans_mapping.index(ans)
            except ValueError:
                # First time this answer is seen: register a new class id.
                class_idx = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            for _ in cvs:
                targets.append(class_idx)
        for ctx in test_data:
            features.append(self.cv(ctx))
            targets.append(-1)  # unlabeled
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(features, targets)

    def __call__(self, x, ans=None, with_confidence=False):
        """Return the most probable answer for context ``x``.

        With ``with_confidence`` set, a ``(answer, 0.0)`` tuple is returned;
        the confidence is a placeholder. ``ans`` is accepted but not used.
        """
        vec = self.cv(x)
        class_probs = self.ls_clf.predict_proba([vec])[0]
        best = class_probs.argmax()
        answer = self.ans_mapping[best]
        # TODO - get confidence as difference between probs[pred] and next
        if with_confidence:
            return (answer, 0.0)
        return answer
Beispiel #3
0
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    """Fit LabelSpreading on the training set, relabel it and log statistics.

    The fitted model's probabilities for ``xTrain`` are passed through
    ``normalize`` together with ``yTrain``; each resulting row is mapped to
    ``benign`` or ``malware`` depending on which of its first two entries is
    larger. The statistics returned by ``stats`` are written as one CSV row.

    Relies on module-level names not passed in: ``benign``, ``malware``,
    ``yExpect``, ``day_one``, ``rate``, ``normalize``, ``stats`` and
    ``write_csv``.
    """
    model = LabelSpreading(kernel=kernel,
                           n_neighbors=k,
                           gamma=g,
                           alpha=a,
                           max_iter=MI,
                           n_jobs=-1)
    model.fit(xTrain, yTrain)
    probabilities = model.predict_proba(xTrain)

    labels = []
    for scores in normalize(yTrain, probabilities):
        first, second = scores[0], scores[1]
        # NOTE(review): rows with equal scores add no label, so `labels`
        # can end up shorter than yTrain - confirm this is intended.
        if first > second:
            labels.append(benign)
        elif first < second:
            labels.append(malware)

    (lm_to_b, lb_to_m, tp, tn, fp, fn,
     pred_day1, missed_day1) = stats(yTrain, labels, yExpect, day_one)

    results = ['SC', kernel, k, g, a] + [
        lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1
    ]

    file_name = ''.join(['SC_CMN_5per_', str(rate), '.csv'])
    write_csv(file_name, results)
 def doLabelSpreading(self,X,y,**kwargs):
     """Fit LabelSpreading(**kwargs) on (X, y) and return its probabilities for X.

     When ``self.verbose`` exceeds 2, the input shapes and histograms of y
     and of the fitted predictions are printed.
     """
     spreader = LabelSpreading(**kwargs)
     if self.verbose > 2:
         print("X, y shapes: ",X.shape,y.shape)
         print(" y hist: ",np.histogram(y))
     spreader.fit(X, y)
     if self.verbose > 2:
         fitted_labels = spreader.predict(X)
         print("ls_predict:",np.histogram(fitted_labels) )
     probabilities = spreader.predict_proba(X)
     return probabilities
 def label_spreading(self, X_train, y, X_test):
     """Fit LabelSpreading on train+test rows and predict the test rows.

     Parameters
     ----------
     X_train, X_test: sparse matrices (converted with ``.toarray()``)
     y: labels passed to ``fit`` together with the stacked matrix, so it
         needs one entry per stacked row (presumably -1 for the test rows;
         verify against the caller)

     Returns
     -------
     Tuple ``(final_labels, clf)``: predicted labels for X_test and the
     fitted model.
     """
     clf = LabelSpreading()
     # .toarray() gives plain ndarrays; .todense() would give np.matrix,
     # which current scikit-learn versions refuse as estimator input.
     X = np.concatenate((X_train.toarray(), X_test.toarray()), axis=0)
     print("X shape now ", X.shape)
     print("Y shape now ", y.shape)
     clf.fit(X, y)
     final_labels = clf.predict(X_test)
     label_prob = clf.predict_proba(X_test)
     print(compare_labels_probabilities().compare(label_prob, final_labels))
     return final_labels, clf
def propagate_labels(features, labels):
    """Fit LabelSpreading with the ``construct_graph`` kernel and return
    the class probabilities it predicts for ``features``.

    The fitted model's ``classes_`` are written to the debug log.
    """
    spreader = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    spreader.fit(features, labels)
    logger.debug(spreader.classes_)
    # Probabilities (not hard predictions) are what callers receive.
    return spreader.predict_proba(features)
Beispiel #7
0
class LabelSpreadingImpl:
    """Thin wrapper that delegates fit/predict to an ``Op`` instance."""

    def __init__(self, **hyperparams):
        """Store the hyper-parameters and build the wrapped ``Op`` model."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (without ``y`` when it is None); return self."""
        estimator = self._wrapped_model
        if y is None:
            estimator.fit(X)
        else:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        estimator = self._wrapped_model
        return estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        estimator = self._wrapped_model
        return estimator.predict_proba(X)
Beispiel #8
0
class LPLearner(Learner):
    """Learner backed by a default-configured LabelSpreading model."""

    def __init__(self, K, seed):
        """Keep ``K`` and create the model; ``seed`` is accepted but not used."""
        #TODO: It does not seem that fix the random state can remove the randomness
        self.K = K
        self.lp = LabelSpreading()

    def fit(self, X, y):
        """Fit the underlying LabelSpreading model on (X, y)."""
        self.lp.fit(X, y)

    def predict_proba(self, X):
        """Return the model's probabilities after ``adjust_prob`` has been
        applied with the classes the model has seen."""
        raw_prob = self.lp.predict_proba(X)
        return self.adjust_prob(raw_prob, self.lp.classes_)

    def predict(self, X):
        """Return the argmax over axis 1 of the adjusted probabilities."""
        return self.predict_proba(X).argmax(1)
Beispiel #9
0
def MyLabelSpreading(option, neighbor):
    """Run kNN-kernel label spreading on stored embeddings and report metrics.

    Loads the raw data bundle and the embedding matrix named by the
    configuration for ``option``, fits ``LabelSpreading`` with
    ``n_neighbors=neighbor``, prints accuracy / precision / recall / F-score
    for the test indices and returns those four values.
    """
    config = GetConfig(option)

    (word2idx, vocabulary, X, y, X_train, X_test, y_train, y_test,
     inds_train, inds_test, inds_all) = joblib.load(config['RAW_DATA'])
    embeddings = joblib.load(config['TENSOR_EMBEDDING'])

    # propagation
    classes = np.unique(y)
    label_matrix = np.zeros((y.shape[0], classes.shape[0]))

    # One-hot rows for the training samples ...
    for row, val in enumerate(y_train):
        label_matrix[row][int(val)] = 1.0

    # ... followed by all -1 rows, one per test sample.
    # NOTE(review): stock scikit-learn LabelSpreading.fit expects a 1-D y;
    # confirm the LabelSpreading in scope accepts this 2-D encoding.
    offset = y_train.shape[0]
    for row, _ in enumerate(y_test):
        label_matrix[row + offset] = -1

    spreader = LabelSpreading(kernel='knn', n_neighbors=neighbor)
    spreader.fit(embeddings, label_matrix)

    probabilities = spreader.predict_proba(embeddings)
    predicted = classes[np.argmax(probabilities, axis=1)].ravel()
    test_predictions = predicted[inds_test]

    accuracy = accuracy_score(y_test, test_predictions)
    prf = precision_recall_fscore_support(y_test,
                                          test_predictions,
                                          average='binary')
    precision, recall, fscore = prf[0], prf[1], prf[2]

    print('Accuracy:%f' % accuracy)
    print('Precision:%f' % precision)
    print('Recall:%f' % recall)
    print('Fscore:%f' % fscore)

    return accuracy, precision, recall, fscore

# Train model and print statistics (use 'knn' as kernel)

from sklearn.semi_supervised import LabelSpreading

model = LabelSpreading(kernel = 'knn', n_neighbors = 10, max_iter=1000).fit(X_train, Y_train)

print("Percentage of correct predictions = {}".format(round(100*model.score(X_test, Y_test),2)))

# Predict once and reuse the result for both the summary and the table below.
predictions = model.predict(X_test)
pred = predictions == Y_test
print("Correct: {}".format(np.count_nonzero(pred)),"/",
      "Incorrect: {}".format(np.count_nonzero(np.logical_not(pred))))

# Table of predicted label, actual label and rounded class probabilities.
Z1 = predictions.reshape(Y_test.size,1)
Z2 = np.asarray(Y_test).reshape(Y_test.size,1)
Z3 = np.around(model.predict_proba(X_test),decimals=2)
data = np.concatenate((Z1,Z2,Z3),axis=1)
outcome = pd.DataFrame(data, columns = ["Predicted Label", 
                                        "Actual Label", 
                                        "Prob. Label = 0.0", 
                                        "Prob. Label = 1.0"])
# Keep only the rows where the prediction disagrees with the actual label.
indicesToKeep = outcome["Predicted Label"] != outcome["Actual Label"]

print("False predictions with associated class probabilities:\n{}".format(outcome[indicesToKeep]))

# Plot predictions

import matplotlib.pyplot as plt
# A single figure: a bare plt.figure() before this call would only leave an
# extra empty figure open.
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
Beispiel #11
0
    def __call__(self, *args, **kwargs):
        """ Augment the labels

            Writes the test labels file, then, for every network in
            ``self.net_names``, every training percentage and every
            algorithm, builds a label set for all observations and writes
            it with ``create_relabeled_file``.

            Inputs:
            tr_percs: percentage of splitting between labeled and unlabeled observations
            algs: methods to perform the label propagation ('gtg', 'svm',
                  'label_propagation', 'label_spreading', 'harmonic',
                  'labels_only')
            max_iter: parameter for 'gtg': number of iterations

            Raises:
            ValueError: for an unrecognized algorithm name, or when an
                        algorithm leaves some observation without a label
        """
        tr_percs = kwargs.pop('tr_percs', [0.02, 0.05, 0.1])
        algs = kwargs.pop('algs', ['gtg', 'svm', 'labels_only'])
        max_iter = kwargs.pop('max_iter', 25)

        if not osp.exists(self.label_dir):
            os.makedirs(self.label_dir)

        with open(osp.join(self.label_dir, 'test_labels.txt'), 'w') as dst:
            loader = prepare_loader(
                osp.join(self.splitting_dir, 'test.txt'),
                img_root=self.dset['src'],
                stats=self.dset['stats'],
                batch_size=1,
                shuffle=False,
            )

            # One "path,label" line per test observation.
            for _, label, path in loader:
                dst.write(path[0] + ',' + str(label.item()) + '\n')

        for net_name in self.net_names:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only feature files produced by this project should be
            # read here.
            with open(osp.join(self.feat_dir, 'train', net_name + '.pickle'),
                      'rb') as pkl:
                net_name, labels, features, fnames = pickle.load(pkl)
                labels = labels.ravel()

                # uncomment to debug code
                # labels = labels[:5000]
                # features = features[:5000]
                # fnames = fnames[:5000]

            # Similarity matrix for this network's features; built lazily
            # the first time an algorithm needs it and reused afterwards.
            W = None

            for tr_perc in tr_percs:
                labeled, unlabeled = equiclass_mapping(labels, tr_perc)
                for alg in algs:
                    print(net_name + ' - ' + str(self.dset['nr_classes']) +
                          ' classes')

                    # generate alg label file name
                    alg_path = osp.join(self.label_dir, alg, net_name,
                                        'labels_{}.txt'.format(tr_perc))

                    # Start from the known labels only: -1 (hard labels) or
                    # an all-zero row (soft labels) marks "not labeled yet".
                    if self.hard_labels:
                        alg_labels = np.full(labels.shape[0], -1)
                        alg_labels[labeled] = labels[labeled]
                    else:
                        alg_labels = np.zeros(
                            (len(labels), self.dset['nr_classes']))
                        alg_labels[labeled,
                                   labels[labeled].ravel().astype(int)] = 1.0

                    if alg == 'gtg':
                        # predict labels with gtg
                        if W is None:
                            W = gtg.sim_mat(features, verbose=True)

                        ps = init_rand_probability(labels, labeled, unlabeled)
                        res = gtg.gtg(W,
                                      ps,
                                      max_iter=max_iter,
                                      labels=labels,
                                      U=unlabeled,
                                      L=labeled)

                        if self.hard_labels:
                            alg_labels[unlabeled] = res[unlabeled].argmax(
                                axis=1)
                        else:
                            alg_labels[unlabeled] = res[unlabeled]

                    elif alg == 'svm':
                        # predict labels with a linear SVM
                        lin_svm = svm.LinearSVC()

                        if self.hard_labels:
                            lin_svm.fit(features[labeled, :], labels[labeled])
                            svm_labels = lin_svm.predict(
                                features[unlabeled]).astype(int)
                        else:
                            # Folds for calibration: at most 3, and never
                            # more than the size of the rarest class.
                            cv = min(
                                np.unique(labels[labeled],
                                          return_counts=True)[1].min(), 3)
                            clf = CalibratedClassifierCV(lin_svm, cv=cv)
                            clf.fit(features[labeled, :], labels[labeled])

                            svm_labels = clf.predict_proba(features[unlabeled])

                        alg_labels[unlabeled] = svm_labels

                    elif alg == 'label_propagation':
                        # predict labels with a label propagation model
                        label_propagation = LabelPropagation(kernel='rbf',
                                                             gamma=0.05,
                                                             max_iter=4000)
                        # Mask a copy: writing -1 into `labels` itself would
                        # destroy the ground truth used by the remaining
                        # algorithms and by the next tr_perc split.
                        masked_labels = labels.copy()
                        masked_labels[unlabeled] = -1
                        label_propagation.fit(features, masked_labels)
                        if self.hard_labels:
                            label_propagation_labels = label_propagation.predict(
                                features[unlabeled]).astype(int)
                        else:
                            label_propagation_labels = label_propagation.predict_proba(
                                features[unlabeled])

                        alg_labels[unlabeled] = label_propagation_labels

                    elif alg == 'label_spreading':
                        # predict labels with a label spreading model
                        label_spreading = LabelSpreading(kernel='rbf',
                                                         gamma=0.05)
                        # Same as above: never mask the shared array in place.
                        masked_labels = labels.copy()
                        masked_labels[unlabeled] = -1
                        label_spreading.fit(features, masked_labels)
                        if self.hard_labels:
                            label_spreading_labels = label_spreading.predict(
                                features[unlabeled]).astype(int)
                        else:
                            label_spreading_labels = label_spreading.predict_proba(
                                features[unlabeled])

                        alg_labels[unlabeled] = label_spreading_labels

                    elif alg == 'harmonic':
                        if W is None:
                            W = gtg.sim_mat(features, verbose=True)
                        soft_labels, hard_labels = harmonic_function(
                            W, labels, labeled, unlabeled)
                        if self.hard_labels:
                            label_harmonic = hard_labels
                        else:
                            label_harmonic = soft_labels

                        alg_labels[unlabeled] = label_harmonic

                    elif alg == 'labels_only':
                        # generate labeled only file
                        alg_labels = alg_labels[labeled]

                        if not osp.exists(osp.dirname(alg_path)):
                            os.makedirs(osp.dirname(alg_path))

                        if (self.hard_labels and (alg_labels == -1).sum() > 0) or \
                                (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                            raise ValueError(
                                'There is some unlabeled observation, check \''
                                + alg + '\' algorithm,')

                        create_relabeled_file([fnames[i] for i in labeled],
                                              alg_path,
                                              alg_labels,
                                              sep=',')
                        # NOTE(review): this leaves the algorithm loop, so
                        # any algorithm listed after 'labels_only' is not
                        # run for this tr_perc - confirm that is intended.
                        break
                    else:
                        raise ValueError('algorithm \'' + alg +
                                         '\' not recognized.')

                    if not osp.exists(osp.dirname(alg_path)):
                        os.makedirs(osp.dirname(alg_path))

                    # Every observation must have received a label by now.
                    if (self.hard_labels and (alg_labels == -1).sum() > 0) or\
                        (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                        raise ValueError('There is some unlabeled observation,'
                                         'check \'' + alg + '\' algorithm,')

                    create_relabeled_file(fnames,
                                          alg_path,
                                          alg_labels,
                                          sep=',')

            # Drop the similarity matrix before the next network is loaded.
            W = None
def baseline_labelspreading_new(data_path, bad_sample_num, good_sample_num,
                                reject_sample_num, random_state_for_each_epoch,
                                classifier, resampling_model):
    """
    Label the rejected samples with LabelSpreading, then train and evaluate
    ``classifier`` on the accepted + pseudo-labelled data.

    :param data_path: CSV file with an ``ID`` index column and a ``label``
        column (1 = bad, 0 = good, -1 = reject)
    :param bad_sample_num: number of ``label == 1`` rows to sample
    :param good_sample_num: number of ``label == 0`` rows to sample
    :param reject_sample_num: number of ``label == -1`` rows to sample
    :param random_state_for_each_epoch: random state used for the sampling
    :param classifier: estimator exposing ``fit``, ``predict`` and
        ``predict_proba``
    :param resampling_model: accepted but not used by this function
    :return: tuple (accuracy, precision, recall, f1, log loss, Cohen kappa,
        Brier score, KS value, AUC) computed on the held-out test split
    """
    # Data input
    # NOTE: this silences warnings for the whole process, not only this call.
    warnings.filterwarnings("ignore")
    raw_data_train = pd.read_csv(data_path, index_col='ID')

    data_bad = raw_data_train[raw_data_train['label'] == 1]
    data_good = raw_data_train[raw_data_train['label'] == 0]
    data_reject = raw_data_train[raw_data_train['label'] == -1]

    data_bad_sampling = data_bad.sample(
        n=bad_sample_num, random_state=random_state_for_each_epoch)
    data_good_sampling = data_good.sample(
        n=good_sample_num, random_state=random_state_for_each_epoch)
    data_train = pd.concat([data_bad_sampling, data_good_sampling], axis=0)

    # Shuffle the accepted samples with a fixed seed.
    np.random.seed(0)
    sampler = np.random.permutation(len(data_train))
    data_train_randomized = data_train.take(sampler)

    # `.values` replaces DataFrame.as_matrix(), which was removed from pandas.
    y = data_train_randomized['label'].values
    X = data_train_randomized.drop(['label'], axis=1).values

    # Split train/test data sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.2,
                                                        random_state=123)

    data_reject_sampling = data_reject.sample(
        n=reject_sample_num, random_state=random_state_for_each_epoch)

    X_reject = data_reject_sampling.drop(['label'], axis=1).values
    y_reject = data_reject_sampling['label'].values

    X_train_and_reject = np.r_[X_train, X_reject]
    y_train_and_reject = np.r_[y_train, y_reject]

    # Semi-supervised step: rejected rows carry label -1 (unlabeled).
    # Other settings previously tried:
    #   kernel='knn', n_neighbors=20, alpha=0.5, max_iter=100, tol=0.1
    #   kernel='rbf', gamma=10, alpha=0.7, max_iter=500, tol=0.001
    #   kernel='knn', n_neighbors=5, alpha=0.7, max_iter=400, tol=0.1
    ls_semi = LabelSpreading(kernel='rbf',
                             gamma=5,
                             alpha=0.5,
                             max_iter=100,
                             tol=0.001,
                             n_jobs=-1)

    ls_semi.fit(X_train_and_reject, y_train_and_reject)
    y_reject_predict = ls_semi.predict(X_reject)
    # NOTE: the original author observed that the predicted probabilities
    # sometimes contain NaN; only the hard predictions are used here.

    # Replace the -1 placeholders with the labels inferred for the rejects.
    y_train_and_reject_1 = np.r_[y_train, y_reject_predict]

    # Supervised step: fit once and take both outputs from the same fitted
    # model, so probabilities and hard predictions are consistent.
    fitted = classifier.fit(X_train_and_reject, y_train_and_reject_1)
    y_proba = fitted.predict_proba(X_test)
    y_predict = fitted.predict(X_test)

    # AUC and ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1])
    auc_result = auc(fpr, tpr)
    # Accuracy
    accuracy_result = accuracy_score(y_test, y_predict)
    # Precision
    precision_result = precision_score(y_test, y_predict)
    # Recall
    recall_result = recall_score(y_test, y_predict)
    # F1
    f1_result = f1_score(y_test, y_predict)
    # Log loss
    log_loss_result = log_loss(y_test, y_proba[:, 1])
    # Cohen-Kappa
    cohen_kappa_result = cohen_kappa_score(y_test, y_predict)
    # Brier score
    brier_result = brier_score_loss(y_test, y_proba[:, 1])
    # K-S value: largest gap between the TPR and FPR curves
    ks_result = max(tpr - fpr)

    return accuracy_result, precision_result, recall_result, f1_result, log_loss_result, cohen_kappa_result, brier_result, ks_result, auc_result
# Features start at column 2; column 1 holds the target of the training set.
train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1].astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# Output frame: the id column only, prediction columns are appended below.
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

# Only 'knn' is enabled; 'rbf' is left out (original note: taking too much time).
kernels = ['knn']

for kernel in kernels:
    print('LabelSpreading kernel****************', kernel)
    ls = LabelSpreading(kernel=kernel)
    print('fitting****************')
    ls_train = ls.fit(X, y)
    print('predicting on train****************')
    ls_X_prediction = ls.predict_proba(X)[:, 1]
    print('predicting on test****************')
    ls_X_test_prediction = ls.predict_proba(X_test)[:, 1]
    # Train predictions first, then test predictions, as one column.
    tr_te_concatenated = np.concatenate(
        (ls_X_prediction, ls_X_test_prediction))
    column_name = ''.join(['label_spreading_', kernel, '_kernel'])
    df[column_name] = tr_te_concatenated

print('final tr_te shape', df.shape)
df.to_csv('label_spreading_tr_te.csv', index=False)
print(df.head())