Ejemplo n.º 1
0
    def testing_predictions(self,
                            test_data,
                            model,
                            num_pcs,
                            gamma=False,
                            max_iter=1000000,
                            mean=False):

        pca_data = self.principal_components(test_data, self.pca, num_pcs)
        if mean == False:
            return np.array([p[1] for p in model.predict_proba(pca_data)])

        train_pca_data = self.principal_components(self.X, self.pca, num_pcs)

        predicted_probs = ""
        for seed in self.seeds:
            np.random.seed(seed)

            model = LabelPropagation(kernel='rbf',
                                     gamma=gamma,
                                     max_iter=max_iter)
            model.fit(train_pca_data, self.Y)

            predicted_prob = np.array(
                [p[1] for p in model.predict_proba(pca_data)])
            if predicted_probs == "":
                predicted_probs = predicted_prob
            else:
                predicted_probs = np.vstack((predicted_probs, predicted_prob))

        #get mean of each run:
        mean_probs = np.mean(predicted_probs, axis=0)
        return mean_probs
Ejemplo n.º 2
0
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    """Fit label propagation on the training set and append its stats to a CSV.

    The model is fitted on ``xTrain`` / ``yTrain``, its probabilities for
    ``xTrain`` are passed through ``normalize`` and turned into labels, and
    the values returned by ``stats`` are written with ``write_csv``.

    Args:
        kernel: kernel forwarded to ``LabelPropagation``.
        k: forwarded as ``n_neighbors``.
        xTrain: training samples.
        yTrain: training labels.
        MI: forwarded as ``max_iter``.
        g: forwarded as ``gamma``.

    Relies on the module-level names ``benign``, ``malware``, ``yExpect``,
    ``day_one`` and ``rate``.
    """
    propagator = LabelPropagation(kernel=kernel,
                                  n_neighbors=k,
                                  gamma=g,
                                  max_iter=MI,
                                  n_jobs=-1)
    propagator.fit(xTrain, yTrain)
    normalized = normalize(yTrain, propagator.predict_proba(xTrain))

    labels = []
    for row in normalized:
        first, second = row[0], row[1]
        if first > second:
            labels.append(benign)
        elif first < second:
            labels.append(malware)
        # Rows where neither comparison holds contribute no label.

    (lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
     missed_day1) = stats(yTrain, labels, yExpect, day_one)

    results = ['HC', kernel, k, g]
    results += [lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1]

    file_name = ''.join(['HC_CMN_5per_', str(rate), '.csv'])
    write_csv(file_name, results)
 def doLabelPropagation(self,X,y,**kwargs):
     """Fit a ``LabelPropagation`` model on ``X``/``y`` and return its probabilities.

     ``kwargs`` are forwarded unchanged to the ``LabelPropagation``
     constructor.  When ``self.verbose`` is greater than 2, the input shapes,
     a histogram of ``y`` and a histogram of the predicted labels are printed.

     Returns the result of ``predict_proba(X)`` on the fitted model.
     """
     propagator = LabelPropagation(**kwargs)
     chatty = self.verbose > 2
     if chatty:
         print("X, y shapes: ", X.shape, y.shape)
         print(" y hist: ", np.histogram(y))
     propagator.fit(X, y)
     if chatty:
         predicted = propagator.predict(X)
         print("lp_predict:", np.histogram(predicted))
     return propagator.predict_proba(X)
    def label_propagation(self, X_train, y, X_test):
        """Fit ``LabelPropagation`` on train+test features and label the test rows.

        Args:
            X_train: sparse training feature matrix.
            y: labels passed to ``fit`` together with the stacked matrix.
                NOTE(review): presumably one entry per stacked row, with -1
                for the test rows -- confirm against the caller.
            X_test: sparse test feature matrix.

        Returns:
            Tuple ``(final_labels, clf)``: the predicted labels for ``X_test``
            and the fitted classifier.
        """
        clf = LabelPropagation()
        print("X_train Shape :", X_train.shape, type(X_train))
        print("X_test shape : ", X_test.shape, type(X_test))
        print("y shape : ", y.shape)

        # toarray() yields plain ndarrays; todense() yields np.matrix, which
        # recent scikit-learn releases reject with a TypeError.
        X = np.concatenate((X_train.toarray(), X_test.toarray()), axis=0)
        print("X shape now ", X.shape)
        print("Y shape now ", y.shape)
        clf.fit(X, y)
        final_labels = clf.predict(X_test)
        label_prob = clf.predict_proba(X_test)
        print(compare_labels_probabilities().compare(label_prob, final_labels))
        return final_labels, clf
Ejemplo n.º 5
0
class _LabelPropagationImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
Ejemplo n.º 6
0
 def fit(self, X,y, method='self-training', treshold=0.7):
     """Fit an LDA classifier, optionally pseudo-labelling unlabeled rows first.

     Rows whose entry in ``y`` equals -1 are treated as unlabeled.  ``y`` is
     duplicated with ``copy`` and only the duplicate is written to.

     Parameters:
         X: feature matrix, indexed here with boolean row masks.
         y: label array; -1 marks an unlabeled row.
         method: 'supervised', 'self-training' or 'label-propagation'.
             Forced to 'supervised' when ``y`` contains no -1.
         treshold: a pseudo-label is accepted only when some class
             probability exceeds this value; otherwise the row stays -1.

     Raises:
         ValueError: if ``method`` is not one of the supported names.

     Sets ``self.classifier``, ``self.means_`` and ``self.covariance_``;
     the 'label-propagation' method also sets ``self.propagated_labels``.
     """
     def getLabel(p):
         # Index of the first probability above the threshold, else -1.
         above = np.where(p > treshold)[0]
         return above[0] if above.size else -1

     yp = copy(y)

     def labeled_mask():
         # True for every row of yp that currently carries a label.
         current = np.ones(len(y), dtype=bool)
         current[np.where(yp == -1)[0]] = False
         return current

     mask = labeled_mask()

     lda = LinearDiscriminantAnalysis(solver='svd',store_covariance=True, n_components=10)

     # Nothing to pseudo-label: fall back to plain supervised training.
     if mask.all():
         method = 'supervised'

     if method == 'supervised':
         lda.fit(X[mask,:],yp[mask])  # train with all labeled data

     elif method == 'self-training':
         counter = 0
         while True:
             lda.fit(X[mask,:],yp[mask])
             if len(yp[~mask]) == 0 or counter == self.max_iter:
                 break
             probs = lda.predict_proba(X[~mask])
             yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
             counter += 1
             mask = labeled_mask()

     elif method == 'label-propagation':
         # LabelPropagation no longer accepts an ``alpha`` argument; passing
         # it raises a TypeError on current scikit-learn.
         label_prop_model = LabelPropagation(kernel='knn',n_neighbors=10)
         label_prop_model.fit(X,yp)
         probs = label_prop_model.predict_proba(X[~mask])
         yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
         self.propagated_labels = yp

         # Refresh the mask so the newly propagated labels are actually used
         # for training, as the self-training branch does.
         mask = labeled_mask()
         lda.fit(X[mask,:],yp[mask])

     else:
         # Raising a bare string is itself a TypeError; raise a real exception.
         raise ValueError('No valid method was given!')
     self.classifier, self.means_, self.covariance_ = lda, lda.means_, lda.covariance_
Ejemplo n.º 7
0
    def evaluate_model(self, X, Y, gamma, seed, max_iter=100000):
        """Train an RBF ``LabelPropagation`` model and score it on a held-out split.

        The data is split 80/20 (stratified on ``Y``) with ``seed`` as the
        random state.  Only held-out rows whose true label is 0 or 1 take part
        in the evaluation.

        Args:
            X: feature matrix.
            Y: labels.
            gamma: forwarded to ``LabelPropagation``.
            seed: seeds ``np.random`` and the train/test split.
            max_iter: forwarded to ``LabelPropagation``.

        Returns:
            Tuple ``(accuracy, precision, auc, confusion_matrix)``.
        """
        # set random seed:
        np.random.seed(seed)

        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            stratify=Y,
                                                            test_size=0.20,
                                                            random_state=seed)

        lp_model = LabelPropagation(kernel='rbf',
                                    gamma=gamma,
                                    max_iter=max_iter)

        lp_model.fit(X_train, Y_train)

        # test model on validation data
        predicted_labels = lp_model.predict(X_test)
        predicted_prob = lp_model.predict_proba(X_test)

        # Keep just the labeled testing data.  A positional boolean mask is
        # used because Y_test[i] is label-based on a pandas Series, whose
        # index no longer matches row positions after the shuffled split.
        y_test = np.asarray(Y_test)
        is_labeled = np.isin(y_test, [0, 1])
        labeled_prob = predicted_prob[is_labeled, 1]
        labels = predicted_labels[is_labeled]
        true_labels = y_test[is_labeled]

        # evaluation
        accuracy = metrics.accuracy_score(true_labels, labels)
        precision = metrics.precision_score(true_labels, labels)
        auc = metrics.roc_auc_score(true_labels, labeled_prob)
        conf = metrics.confusion_matrix(true_labels, labels)

        return accuracy, precision, auc, conf
# Column 1 of the training array holds the target; cast it to int.
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
# Test features start at column 2.
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:
# Start the output frame with the id column only.
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

kernels = [
    'knn',
]  #, 'rbf'] - taking too much time on knn - so only one model

for kernel in kernels:
    print('label_propagation begins on kernel****************', kernel)
    lp = LabelPropagation(kernel=kernel)
    print('fitting****************')
    lp_train = lp.fit(X, y)
    print('predicting on train****************')
    lp_X_prediction = lp.predict_proba(X)[:, 1]
    print('predicting on test****************')
    lp_X_test_prediction = lp.predict_proba(X_test)[:, 1]
    # Train predictions first, then test predictions, as one column.
    tr_te_concatenated = np.concatenate(
        [lp_X_prediction, lp_X_test_prediction])
    # The original referenced the undefined name ``kenel`` here (NameError).
    df['label_propagation_' + kernel + '_kernel'] = tr_te_concatenated

print('final tr_te shape', df.shape)
df.to_csv('label_propagation_tr_te.csv', index=False)
print(df.head())
Ejemplo n.º 9
0
# Build the model with gamma=70 and fit it on X_norm / Ytrain.
lspr = LP(gamma=70)
lspr.fit(X_norm, Ytrain)


# In[15]:

print('nofClasses: ', lspr.classes_)


# In[16]:

# Count the training predictions that are strictly positive.
pred = lspr.predict(X_norm)
notN = [1 for label in pred if label > 0.0]
print(len(notN))


# In[12]:

Y_pred = lspr.predict_proba(X_test)


# In[13]:

print(Y_pred.shape)


# In[ ]:



Ejemplo n.º 10
0
# For every class id in 0..112 that is absent from the training labels, copy
# the first matching test sample into the pending additions.
for i in range(113):
    if i not in unique_y_train:
        idx = (Y_test == i).nonzero()[0][0]
        add_X.append(X_test[idx])
        add_Y.append(Y_test[idx])

if len(add_X) > 0:
    X_train = np.r_[X_train, np.array(add_X)]
    Y_train = np.r_[Y_train, np.array(add_Y)]

n_train_classes = len(np.unique(Y_train))
n_test_classes = len(np.unique(Y_test))
print('train unique Y:{} test uniuqe Y:{}'.format(n_train_classes,
                                                  n_test_classes))

classifier = LabelPropagation(kernel='rbf',
                              n_jobs=50,
                              max_iter=200,
                              gamma=0.25)
# Y_train[int(len(Y_train)*0.8):] = -1
print('-' * 15)
print(X_train.shape, Y_train.shape, len(np.unique(Y_train)))
classifier.fit(X_train, Y_train)

# --- testing
y_prob = classifier.predict_proba(X_test)

# --- report
# One-hot encode the test labels: one row per test sample, one column per
# distinct value in Y.
n_test = X_test.shape[0]
one_hot_Y = np.zeros((n_test, len(np.unique(Y))))
one_hot_Y[np.arange(n_test), Y_test] = 1
test_metrics = metric_report(one_hot_Y, y_prob)
print(test_metrics)
collect_report(METHOD_NAME, args.data_ratio, test_metrics['pr'])
Ejemplo n.º 11
0
    def __call__(self, *args, **kwargs):
        """Augment the labels and write one label file per net/split/algorithm.

        First writes ``test_labels.txt`` (one ``path,label`` line per item
        yielded by the test loader) into ``self.label_dir``.  Then, for each
        name in ``self.net_names``, loads the pickled training features and,
        for every ``tr_perc`` / ``alg`` combination, fills in the labels of
        the unlabeled observations and writes them with
        ``create_relabeled_file``.

        Inputs (keyword arguments):
            tr_percs: percentage of splitting between labeled and unlabeled
                observations (default ``[0.02, 0.05, 0.1]``)
            algs: methods to perform the label propagation; recognized values
                are 'gtg', 'svm', 'label_propagation', 'label_spreading',
                'harmonic' and 'labels_only'
                (default ``['gtg', 'svm', 'labels_only']``)
            max_iter: parameter for 'gtg': number of iterations (default 25)

        Raises:
            ValueError: if an algorithm name is not recognized, or if some
                observation is still unlabeled after an algorithm has run.
        """
        tr_percs = kwargs.pop('tr_percs', [0.02, 0.05, 0.1])
        algs = kwargs.pop('algs', ['gtg', 'svm', 'labels_only'])
        max_iter = kwargs.pop('max_iter', 25)

        os.makedirs(self.label_dir, exist_ok=True)

        with open(osp.join(self.label_dir, 'test_labels.txt'), 'w') as dst:
            loader = prepare_loader(
                osp.join(self.splitting_dir, 'test.txt'),
                img_root=self.dset['src'],
                stats=self.dset['stats'],
                batch_size=1,
                shuffle=False,
            )

            for _, label, path in loader:
                dst.write(path[0] + ',' + str(label.item()) + '\n')

        for net_name in self.net_names:
            # Similarity matrix, computed lazily once per network and shared
            # by the 'gtg' and 'harmonic' algorithms.
            W = None

            # NOTE: pickle.load can execute arbitrary code; the feature files
            # must come from a trusted source.
            with open(osp.join(self.feat_dir, 'train', net_name + '.pickle'),
                      'rb') as pkl:
                net_name, labels, features, fnames = pickle.load(pkl)
                labels = labels.ravel()

            for tr_perc in tr_percs:
                labeled, unlabeled = equiclass_mapping(labels, tr_perc)
                for alg in algs:
                    print(net_name + ' - ' + str(self.dset['nr_classes']) +
                          ' classes')

                    # generate alg label file name
                    alg_path = osp.join(self.label_dir, alg, net_name,
                                        'labels_{}.txt'.format(tr_perc))

                    if self.hard_labels:
                        # -1 marks an observation that has no label yet
                        alg_labels = np.full(labels.shape[0], -1)
                        alg_labels[labeled] = labels[labeled]
                    else:
                        # one-hot rows for labeled data, all-zero otherwise
                        alg_labels = np.zeros(
                            (len(labels), self.dset['nr_classes']))
                        alg_labels[labeled,
                                   labels[labeled].ravel().astype(int)] = 1.0

                    if alg == 'gtg':
                        # predict labels with gtg
                        if W is None:
                            W = gtg.sim_mat(features, verbose=True)

                        ps = init_rand_probability(labels, labeled, unlabeled)
                        res = gtg.gtg(W,
                                      ps,
                                      max_iter=max_iter,
                                      labels=labels,
                                      U=unlabeled,
                                      L=labeled)

                        if self.hard_labels:
                            alg_labels[unlabeled] = res[unlabeled].argmax(
                                axis=1)
                        else:
                            alg_labels[unlabeled] = res[unlabeled]

                    elif alg == 'svm':
                        # predict labels with a linear SVM
                        lin_svm = svm.LinearSVC()

                        if self.hard_labels:
                            lin_svm.fit(features[labeled, :], labels[labeled])
                            svm_labels = lin_svm.predict(
                                features[unlabeled]).astype(int)
                        else:
                            # at most 3 folds, fewer if the rarest class has
                            # fewer labeled samples
                            cv = min(
                                np.unique(labels[labeled],
                                          return_counts=True)[1].min(), 3)
                            clf = CalibratedClassifierCV(lin_svm, cv=cv)
                            clf.fit(features[labeled, :], labels[labeled])

                            svm_labels = clf.predict_proba(features[unlabeled])

                        alg_labels[unlabeled] = svm_labels

                    elif alg == 'label_propagation':
                        # predict labels with a label propagation model
                        label_propagation = LabelPropagation(kernel='rbf',
                                                             gamma=0.05,
                                                             max_iter=4000)
                        # Mask the unlabeled rows on a copy: writing -1 into
                        # ``labels`` itself would corrupt the ground truth for
                        # every later algorithm and split.
                        masked_labels = labels.copy()
                        masked_labels[unlabeled] = -1
                        label_propagation.fit(features, masked_labels)
                        if self.hard_labels:
                            label_propagation_labels = label_propagation.predict(
                                features[unlabeled]).astype(int)
                        else:
                            label_propagation_labels = label_propagation.predict_proba(
                                features[unlabeled])

                        alg_labels[unlabeled] = label_propagation_labels

                    elif alg == 'label_spreading':
                        # predict labels with a label spreading model
                        label_spreading = LabelSpreading(kernel='rbf',
                                                         gamma=0.05)
                        # Same as above: never overwrite the shared labels.
                        masked_labels = labels.copy()
                        masked_labels[unlabeled] = -1
                        label_spreading.fit(features, masked_labels)
                        if self.hard_labels:
                            label_spreading_labels = label_spreading.predict(
                                features[unlabeled]).astype(int)
                        else:
                            label_spreading_labels = label_spreading.predict_proba(
                                features[unlabeled])

                        alg_labels[unlabeled] = label_spreading_labels

                    elif alg == 'harmonic':
                        if W is None:
                            W = gtg.sim_mat(features, verbose=True)
                        soft_labels, hard_labels = harmonic_function(
                            W, labels, labeled, unlabeled)
                        if self.hard_labels:
                            label_harmonic = hard_labels
                        else:
                            label_harmonic = soft_labels

                        alg_labels[unlabeled] = label_harmonic

                    elif alg == 'labels_only':
                        # generate labeled only file
                        alg_labels = alg_labels[labeled]

                        os.makedirs(osp.dirname(alg_path), exist_ok=True)

                        if (self.hard_labels and (alg_labels == -1).sum() > 0) or \
                                (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                            raise ValueError(
                                'There is some unlabeled observation, check \''
                                + alg + '\' algorithm,')

                        create_relabeled_file([fnames[i] for i in labeled],
                                              alg_path,
                                              alg_labels,
                                              sep=',')
                        # NOTE(review): this leaves the algs loop, so any
                        # algorithm listed after 'labels_only' is skipped for
                        # this split -- confirm that is intended.
                        break
                    else:
                        raise ValueError('algorithm \'' + alg +
                                         '\' not recognized.')

                    os.makedirs(osp.dirname(alg_path), exist_ok=True)

                    if (self.hard_labels and (alg_labels == -1).sum() > 0) or\
                        (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                        raise ValueError('There is some unlabeled observation,'
                                         'check \'' + alg + '\' algorithm,')

                    create_relabeled_file(fnames,
                                          alg_path,
                                          alg_labels,
                                          sep=',')

            # Drop the reference so the similarity matrix can be freed before
            # the next network's features are loaded.
            W = None
Ejemplo n.º 12
0
from sklearn.semi_supervised import LabelPropagation
import numpy as np
from random import sample

# Load the CSV, then drop its first and last rows and keep columns 1 and 2.
X = np.genfromtxt('/Users/dgy/Desktop/385project/grouped_GPS.csv',
                  delimiter=',')
X = X[1:-1, 1:3]

rows = X.shape[0]
# 500 rows drawn without replacement serve as seeds; the label pattern
# 0,0,0,0,1 is expanded to 100 copies of each entry (400 zeros, 100 ones).
seedsX = X[sample(range(rows), 500)]
seedsY = np.repeat([0, 0, 0, 0, 1], 100)

lp = LabelPropagation(gamma=10000)
lp.fit(seedsX, seedsY)

Y = lp.predict_proba(X)
p = Y[:rows, 1]
# Notebook-style inspection expressions; their values are discarded.
np.where(p > 0.5)[0].shape
np.count_nonzero(np.isnan(p))
np.savetxt("utility.txt", Y, delimiter=" ", fmt="%s")
Ejemplo n.º 13
0
                p.append(0)
            for i in range(1, len(arr)):
                p.extend(dp[arr[i]])
            res.append(p)
    return np.array(res)


# Column 0 of the decoded data is the target; the remaining columns are
# the features.
data = hex2bin(path)
Y, X = data[:, 0], data[:, 1:]
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.7)

# Mark training rows as unlabeled (-1) wherever a uniform draw from a
# generator seeded with 42 falls below 0.4.
rng = np.random.RandomState(42)
unlabeled_point = rng.rand(len(train_Y)) < 0.4
train_Y[unlabeled_point] = -1

clf = LabelPropagation(n_jobs=8, gamma=0.6)
clf.fit(train_X, train_Y)

prob = clf.predict_proba(test_X)
positive_prob = prob[:, 1]
score = round(roc_auc_score(test_Y, positive_prob), 2)
fpr, tpr, _ = roc_curve(test_Y, positive_prob)

# ROC curve in red plus the dashed diagonal.
plt.plot(fpr, tpr, color='r')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.text(0.5, 0.4, "AUC=" + str(score), fontsize=15)
plt.show()

print(score)