Example n. 1
def semiLabelSpreding(feature_extractor, generator, val_generator, kernel,
                      neighbors, gamma, alpha):
    semi = LabelSpreading(kernel=kernel,
                          n_neighbors=neighbors,
                          gamma=gamma,
                          alpha=alpha,
                          tol=0.001,
                          max_iter=1000000)

    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples /
                                                   generator.batch_size,
                                                   verbose=1)

    classes = generator.classes

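    # samples whose filename starts with 'N' are treated as unlabeled (-1)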
    for i in range(0, generator.samples):
        if (generator.filenames[i][0] == 'N'):
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
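Example n. 2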
def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    #pca = randomized_PCA(X_train)
    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(tr_images, tr_labels, test_size=0.3)
    #X = pca.transform(X)
    #val_images = pca.transform(val_images)
    #y= y[:]

    X_train = X_train[:, :]
    y_train = y_train[:]
    Xunlabelled = Xunlabelled[:10000,:]

    #import ipdb; ipdb.set_trace()

    X_both = np.vstack((X_train, Xunlabelled))

    y_both = np.append(y_train, -np.ones((Xunlabelled.shape[0],)))


    label_prop_model = LabelSpreading(max_iter=100)
    #random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(y_train)))
    #labels = np.copy(y_train)
    #labels[random_unlabeled_points] = -1
    label_prop_model.fit(np.copy(X_both), np.copy(y_both))
    y_pred = label_prop_model.predict(np.copy(X_both))
    print(y_pred)
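The function above receives X_test and y_test but never uses them, and it only prints the transductive predictions. A minimal sketch (same variable names assumed) of reading the spread labels back through transduction_ and scoring the held-out split:

import numpy as np
from sklearn.semi_supervised import LabelSpreading

def label_spreading_eval(X_train, y_train, Xunlabelled, X_test, y_test):
    # stack labeled and unlabeled rows; -1 marks "unlabeled" for scikit-learn
    X_both = np.vstack((X_train, Xunlabelled))
    y_both = np.append(y_train, -np.ones(Xunlabelled.shape[0]))
    model = LabelSpreading(max_iter=100).fit(X_both, y_both)
    inferred = model.transduction_[len(y_train):]  # labels inferred for the unlabeled rows
    return inferred, model.score(X_test, y_test)   # inductive accuracy on the held-out split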
Example n. 3
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy/10
    avg_f1 = avg_f1/10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1), "KNN BOW runtime voc Avg acc: "
                     + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
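Example n. 4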
 def doLabelSpreading(self,X,y,**kwargs):
     label_spread_model = LabelSpreading(**kwargs)
     if self.verbose>2: 
         print("X, y shapes: ",X.shape,y.shape)
         print(" y hist: ",np.histogram(y))
     label_spread_model.fit(X, y)
     if self.verbose>2: print("ls_predict:",np.histogram(label_spread_model.predict(X)) )
     return label_spread_model.predict_proba(X)
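Example n. 5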
 def label_spreading(self, X_train, y, X_test):
     clf = LabelSpreading()
     X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
     print("X shape now ", X.shape)
     print("Y shape now ", y.shape)
     clf.fit(X, y)
     final_labels = clf.predict(X_test)
     label_prob = clf.predict_proba(X_test)
     print(compare_labels_probabilities().compare(label_prob, final_labels))
     return final_labels, clf
Example n. 6
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression 
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    #
    return pd.DataFrame(
        [{ 
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,

        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO':acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO':acc_ss_spreading_INFO,
        'acc_LR':accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)       

        }]
    )
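Example n. 7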
 def _semi_supervised_learning(self, data_matrix, target):
     n_classes = len(set(target))
     # bail out and return the original target unless, besides the -1
     # "unlabeled" marker, there are at least two real classes; otherwise
     # one cannot meaningfully spread the information of only one class
     if n_classes > 2:
         semi_supervised_estimator = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors)
         semi_supervised_estimator.fit(data_matrix, target)
         predicted_target = semi_supervised_estimator.predict(data_matrix)
         predicted_target = self._clamp(target, predicted_target)
         return predicted_target
     else:
         return target
Example n. 9
def semi_supervised_learning(data_matrix, target):
    if -1 in list(target):
        # if -1 is present in target do label spreading
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
        label_prop_model.fit(data_matrix, target)
        pred_target = label_prop_model.predict(data_matrix)
        extended_target = []
        for pred_label, label in zip(pred_target, target):
            if label != -1 and pred_label != label:
                extended_target.append(label)
            else:
                extended_target.append(pred_label)
    else:
        extended_target = target
    return np.array(extended_target)
Example n. 11
    def augment_instances(self, X_train, y_train):

        if self.args.num_unlabeled == 0:
            return X_train, y_train

        X_unlabeled = self.dataset.X_train_unlabeled
        y_unlabeled = self.dataset.y_train_unlabeled

        X_unlabeled = X_unlabeled.values
        y_unlabeled = y_unlabeled.values

        X_train_text = X_train[:, self.args.text_col]
        self.fit_text(X_train_text, y_train)
        X_train_rep = self.transform_text(X_train_text)
        X_train_rep = self.augment_features(X_train_rep, X_train)

        chunk_size = 1000
        num_instances = X_unlabeled.shape[0]
        num_cols = y_train.shape[1]
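        # spread labels chunk by chunk over rows and one label column at a
        # time, so each LabelSpreading fit stays small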
        for row in tqdm(range(0, self.args.num_unlabeled, chunk_size),
                        desc='spreading labels in rows',
                        total=int(self.args.num_unlabeled / chunk_size)):
            end_row = row + chunk_size
            end_row = np.minimum(end_row, num_instances)
            for col in tqdm(range(num_cols),
                            desc='spreading labels in cols',
                            leave=False):

                X_unlabeled_rep = self.transform_text(
                    X_unlabeled[row:end_row, self.args.text_col])
                X_unlabeled_rep = self.augment_features(
                    X_unlabeled_rep, X_unlabeled[row:end_row, :])

                X_spread = np.append(X_train_rep, X_unlabeled_rep, axis=0)
                y_spread = np.append(y_train[:, col],
                                     y_unlabeled[row:end_row, col],
                                     axis=0)

                labeling = LabelSpreading()
                labeling.fit(X_spread, y_spread)
                y_unlabeled[row:end_row,
                            col] = labeling.predict(X_unlabeled_rep)

        X_train = np.append(X_train, X_unlabeled[:row + chunk_size], axis=0)
        y_train = np.append(y_train, y_unlabeled[:row + chunk_size], axis=0)
        return X_train, y_train
Example n. 12
class LabelSpreadingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
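A minimal usage sketch for the wrapper above, assuming Op is an alias for scikit-learn's LabelSpreading (the original import is not shown in this snippet):

import numpy as np
from sklearn.semi_supervised import LabelSpreading as Op  # assumed alias

X = np.random.RandomState(0).rand(20, 2)
y = np.r_[np.zeros(5), np.ones(5), -np.ones(10)]  # -1 = unlabeled
impl = LabelSpreadingImpl(kernel='knn', n_neighbors=7)
impl.fit(X, y)
print(impl.predict(X[:3]))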
Example n. 13
class LP:
    def __init__(self, lmnn=False, max_iter=1000, lm_num=200):
        # self.clf =  LabelPropagation(kernel='knn',max_iter=1000,n_jobs=10,n_neighbors=25)
        self.clf = LabelSpreading(kernel='knn',
                                  n_neighbors=25,
                                  max_iter=max_iter,
                                  alpha=0.2,
                                  n_jobs=-1)
        self.lmnn = lmnn
        self.lm_num = lm_num
        if lmnn:
            self.ml = LMNN(use_pca=False, max_iter=2000)

    def fit(self, X, y):
        if self.lmnn:
            nonzero_index = np.nonzero(y)
            index = random.sample(list(nonzero_index[0]), self.lm_num)
            X_ = X[index]
            y_ = y[index]
            print('ml fitting')
            self.ml.fit(X_, y_)
            print('transform')
            X = self.ml.transform(X)
        print('lp fitting')
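        # scikit-learn reserves -1 for "unlabeled", so remap before fitting:
        # 0 (unknown) -> -1, and the true class -1 -> 2; predict() undoes this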
        zero_index = np.nonzero(y == 0)
        negative_index = np.nonzero(y == -1)
        positive_index = np.nonzero(y == 1)
        y[zero_index] = -1
        y[negative_index] = 2
        print(zero_index[0].shape, negative_index[0].shape,
              positive_index[0].shape)
        self.clf.fit(X, y)

    def predict(self, X):
        print('lp predict')
        if self.lmnn:
            X = self.ml.transform(X)
        y_pred = self.clf.predict(X)
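        # undo the remapping from fit(): predicted -1 -> 0, predicted 2 -> -1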
        negative_index = np.nonzero(y_pred == -1)
        two_index = np.nonzero(y_pred == 2)
        y_pred[negative_index] = 0
        y_pred[two_index] = -1
        return y_pred
Example n. 14
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    spread = LabelSpreading(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            alpha=a,
                            max_iter=MI,
                            n_jobs=-1)
    spread.fit(xTrain, yTrain)
    evaledY = spread.predict(xTrain)

    #def stats(trainY,evaledY,expectedY,day_one): return
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, evaledY, yExpect, day_one)

    results = [
        'SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'SC.csv'
    write_csv(file_name, results)
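Example n. 15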
def getLabelPropa(yale):
    n = len(yale)
    yale['labels'] = yale['Rank']
    yale['labels'].loc[yale['Town'].isin(
        ['Greenwich', 'Westport', 'Fairfield', 'Trumbull', 'Ridgefield'])] = 1
    #print(yale['labels'])
    label = yale['labels']
    yale = yale.select_dtypes(include=['float64', 'int64'])
    label_prop_model = LabelSpreading(alpha=0.1,
                                      kernel='rbf',
                                      n_neighbors=3,
                                      max_iter=300,
                                      gamma=2)
    yale = yale.drop(['labels'], axis=1)
    yale = preprocessing.normalize(yale, axis=0, norm='max')
    label_prop_model.fit(yale, label)
    label = label_prop_model.predict(yale)
    ##print(label_prop_model.predict(yale))
    #print(label_prop_model)
    #print(label_prop_model.predict_proba(yale))
    return label
Example n. 16
def run_lp_tfidf(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy/10
    avg_f1 = avg_f1/10
    str_list.extend(["KNN TF-IDF Avg f1: " + str(avg_f1), "KNN TF-IDF Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Example n. 17
class propgate_lables_predictor():
    def __init__(self,X_unlabled,predictor):
        self.X_unlabled=X_unlabled
        self.prop_model = LabelSpreading(kernel='rbf',gamma=0.1,max_iter=1000,tol=0.001,n_jobs=-1,alpha=0.2)
        self.predictor = predictor


    def fit(self,X,y):
        unlabled = self.X_unlabled  # .head(500)
        new_x = pd.concat([pd.DataFrame(X), pd.DataFrame(unlabled.values)])
        new_y_pre = pd.concat([y, pd.DataFrame([-1] * len(unlabled))])
        scale = StandardScaler()
        self.prop_model.fit(scale.fit_transform(new_x), np.array(new_y_pre).ravel())

        new_y_post = self.prop_model.predict(scale.transform(new_x))
        pred_entropies = pd.Series(scipy.stats.entropy(self.prop_model.label_distributions_.T))


        X_final = new_x.reset_index(drop=True)
        pred_entropies.index = X_final.index


        #pred_entropies[new_y_pre==-1] = 0 #they are known, making sure they are in
        y_final = pd.concat([pd.Series(y), pd.Series(new_y_post[len(y):])])
        y_final.index = X_final.index

        cond = (~pred_entropies.isna()) & (pred_entropies < pred_entropies.iloc[len(y):].mean())

        X_final = X_final.loc[cond,:]
        y_final = y_final[cond]

        print(len(X),'final amount of instances:',len(X_final))
        self.predictor.fit(X_final,np.array(y_final).ravel())

    def predict(self,X):
        return self.predictor.predict(X)

    def predict_proba(self,X):
        return self.predictor.predict_proba(X)
Example n. 18
def JSFS(X: np.ndarray, y: np.ndarray, test_X: np.ndarray, test_y: np.ndarray,
         name: str):
    # reference: Jiang, Bingbing, et al.
    # "Joint semi-supervised feature selection and classification through Bayesian approach."
    # Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 33. 2019.
    print('========== JSFS ==========')
    # --- Input & Initialize---
    # np.set_printoptions(threshold=np.inf)
    n = len(X)
    d = len(X[0])
    y.resize((n, 1))
    testSize = len(test_X)
    # labeled sample ratio
    # labelRatio = 0.5
    labelRatio = CONFIG[name]['labelRatio']
    l = int(n * labelRatio)
    u = n - l
    # γ and µ are hyperparameters
    # Gamma = 0.001
    Gamma = CONFIG[name]['Gamma']
    # Mu = 0.9
    Mu = CONFIG[name]['Mu']
    # Beta = 0.005
    Beta = 5
    Omega = np.zeros((d, 1))
    Omega[:] = 0.5
    Lambda_vector = np.zeros((u, 1))
    Lambda_vector[:] = 0.5
    A = np.zeros((d, d))
    for i in range(d):
        A[i, i] = 0.001
    C = np.zeros((u, u))
    for i in range(u):
        C[i, i] = 0.001
    # --- Construct the affinity matrix S and graph Laplacian L via KNN ---
    print('Construct the affinity matrix S and graph Laplacian L via KNN')
    trainData_X = X
    trainData_Y = y.ravel()  # y and trainData_Y address the same memory
    # replace the original -1 label with 0, because in this method -1 means no label
    for i in range(n):
        trainData_Y[i] = 0 if trainData_Y[i] == -1 else trainData_Y[i]
    trainData_Y[l:] = -1
    KNN = KNeighborsClassifier(n_neighbors=5)
    KNN.fit(trainData_X[:l], trainData_Y[:l])
    S = np.zeros((n, n))
    D = np.zeros((n, n))
    L = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            if trainData_Y[i] == trainData_Y[j] and trainData_Y[i] != -1:
                S[i][j] = 10
            elif (trainData_Y[i] == -1
                  and trainData_Y[j] > -1) and (KNN.predict(
                      trainData_X[i:i + 1]) == trainData_Y[j]):
                S[i][j] = 1
            elif (trainData_Y[j] == -1
                  and trainData_Y[i] > -1) and (KNN.predict(
                      trainData_X[j:j + 1]) == trainData_Y[i]):
                S[i][j] = 1
            else:
                S[i][j] = 0
            S[j][i] = S[i][j]
        D[i, i] = sum(S[i, :])
        percent = 100 * (float((2 * n - i) * (i + 1)) / ((n + 1) * n))
        show_str = ('[%%-%ds]' % 50) % (int(50 * percent / 100) * "#")
        print('\r%s %d%%' % (show_str, percent), end='')
    L = D - S
    # --- Obtain the pseudo label vector y_u via label propagation ---
    print('\nObtain the pseudo label vector y_u via label propagation')
    LGC_rbf = LabelSpreading(kernel='knn',
                             gamma=20,
                             n_neighbors=7,
                             max_iter=150)
    LGC_rbf.fit(trainData_X, trainData_Y)
    trainData_Y[l:] = LGC_rbf.predict(trainData_X[l:])
    # change 0 back to the -1
    """ for i in range(n):
        trainData_Y[i] = -1 if trainData_Y[i] == 0 else trainData_Y[i] """
    # --- Data preprocessing - Normalized for X, y ---
    # min_max_scaler = preprocessing.MinMaxScaler((0, 0.0001))
    min_max_scaler = preprocessing.MinMaxScaler(
        (0, CONFIG[name]['xMaxScaler']))
    X = min_max_scaler.fit_transform(X)
    test_X = min_max_scaler.transform(test_X)
    # --- Convergence ---
    B = Gamma * np.dot(np.dot(X.T, L), X)
    Lambda = np.matlib.identity(n)
    Sigma = np.zeros((n, 1))
    E = np.zeros((n, n))
    P = np.zeros((u, u))
    k_lambda = np.zeros((u, 1))
    Eu = np.zeros((u, u))
    O = np.zeros((u, u))
    Omega_old = np.ones((d, 1))
    Lambda_vector_old = np.zeros((u, 1))
    g_omega = np.zeros((d, 1))
    H_omega = np.zeros((d, d))
    Sig_omega = np.zeros((d, d))
    g_lambda = np.zeros((u, 1))
    H_lambda = np.zeros((u, u))
    Sig_lambda = np.zeros((u, u))
    G = np.zeros((d, d))
    cnt = 0
    while np.linalg.norm(Omega - Omega_old, ord=np.inf) > 0.001:
        print('--------', cnt + 1, '--------')
        for i in range(n):
            if (i < l):
                Sigma[i, 0] = 1 / (1 + np.exp(-1 * np.dot(X[i, :], Omega)))
                E[i, i] = Sigma[i, 0] * (1 - Sigma[i, 0])
            else:
                Sigma[i, 0] = 1 / \
                    (1 + np.exp(-1 *
                                Lambda_vector[i-l, 0] * np.dot(X[i, :], Omega)))
                E[i, i] *= Mu * Lambda_vector[i-l, 0] * \
                    Lambda_vector[i-l, 0] * Sigma[i, 0] * (1 - Sigma[i, 0])
                Lambda[i, i] = Mu * Lambda_vector[i - l, 0]
                P[i - l, i - l] = np.dot(X[i, :], Omega)
                k_lambda[i-l, 0] = Beta * \
                    (1 - (1 / (1 + np.exp(-(Beta * Lambda_vector[i-l, 0])))))
                Eu[i - l, i - l] = Sigma[i, 0] * (1 - Sigma[i, 0])
                O[i - l, i - l] = Beta * Beta * (
                    1 / (1 + np.exp(-(Beta * Lambda_vector[i - l, 0])))) * (
                        1 - (1 /
                             (1 + np.exp(-(Beta * Lambda_vector[i - l, 0])))))
        if (np.linalg.norm(g_omega[:, 0], ord=2) / d) < 0.001:
            g_omega = np.dot(np.dot(X.T, Lambda), (y - Sigma)) - \
                np.dot((A + B), Omega)
            H_omega = -1 * (np.dot(np.dot(X.T, E), X) + A + B)
            Sig_omega = -1 * np.linalg.inv(H_omega)
            Omega_old = Omega.copy()
            Omega = Omega - np.dot(np.linalg.inv(H_omega), g_omega)
            print('gw:', np.mean(g_omega[:, 0]), ' gw_judge:',
                  (np.linalg.norm(g_omega[:, 0], ord=2) / d), 'w_max',
                  np.max(Omega, axis=0), 'w_min', np.min(Omega, axis=0))
        for i in range(d):
            if (Omega[i, 0] != 0) and (abs(Omega[i, 0]) < 0.001):
                Omega[i, 0] = 0
        if (np.linalg.norm(g_lambda[:, 0], ord=2) / u) < 0.001:
            g_lambda = Mu * np.dot(P, (y[l:] - Sigma[l:])) - \
                np.dot(C, Lambda_vector) + k_lambda
            H_lambda = -1 * ((Mu * np.dot(np.dot(P.T, Eu), P)) + C + O)
            Sig_lambda = -1 * np.linalg.inv(H_lambda)
            Lambda_vector_old = Lambda_vector.copy()
            Lambda_vector = Lambda_vector - \
                np.dot(np.linalg.inv(H_lambda), g_lambda)
            print('gl:', np.mean(g_lambda[:, 0]), ' gl_judge:',
                  (np.linalg.norm(g_lambda[:, 0], ord=2) / u), 'l_max',
                  np.max(Lambda_vector, axis=0), 'l_min',
                  np.min(Lambda_vector, axis=0))
        for i in range(u):
            if (Lambda_vector[i, 0] != 0) and (abs(Lambda_vector[i, 0]) <
                                               0.001):
                Lambda_vector[i, 0] = 0
        G = np.dot(
            np.dot(
                np.dot(np.linalg.inv(A), B),
                np.linalg.inv(
                    np.matlib.identity(d) + np.dot(np.linalg.inv(A), B))),
            np.linalg.inv(A))
        for i in range(d):
            A[i,
              i] = 1 / (Omega[i, 0] * Omega[i, 0] + G[i, i] + Sig_omega[i, i])
        for i in range(u):
            C[i, i] = 1 / (Lambda_vector[i, 0] * Lambda_vector[i, 0] +
                           Sig_lambda[i, i])
        print('max_lambda_new-old',
              np.linalg.norm(Lambda_vector - Lambda_vector_old, ord=np.inf))
        print('max_omega_new-old', np.linalg.norm(Omega - Omega_old,
                                                  ord=np.inf))
        cnt += 1
        if cnt == 50:
            break
    # --- Test ---
    predict_y = np.zeros(testSize)
    predict_vector_y = np.dot(test_X, Omega).flatten()
    predict_vector_y *= CONFIG[name]['yScaler']
    threshold = CONFIG[name]['threshold']
    for i in range(testSize):
        if predict_vector_y[0, i] < threshold:
            predict_y[i] = -1
        else:
            predict_y[i] = 1
    print('predict_y:', predict_vector_y[0, :10])
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    for idx in range(len(test_y)):
        if test_y[idx] == 1 and predict_y[idx] == 1:
            tp += 1
        elif test_y[idx] == 1 and predict_y[idx] == -1:
            fn += 1
        elif test_y[idx] == -1 and predict_y[idx] == 1:
            fp += 1
        elif test_y[idx] == -1 and predict_y[idx] == -1:
            tn += 1
    p = tp / (fp + tp)
    pf = fp / (fp + tn)
    pd = tp / (tp + fn)
    F_measure = 2 * pd * p / (pd + p)
    """ print('precision:', 100 * p, '%')
    print('recall:', 100 * recall_score(test_y, predict_y), '%')
    print('pf:', 100 * pf, '%')
    print('F-measure:', 100 * F_measure, '%')
    print('accuracy:', 100 * accuracy_score(test_y, predict_y), '%')
    print('AUC:', 100 * roc_auc_score(test_y, predict_y), '%') """
    print('precision:', p)
    print('recall:', recall_score(test_y, predict_y))
    print('pf:', pf)
    print('F-measure:', F_measure)
    print('accuracy:', accuracy_score(test_y, predict_y))
    print('AUC:', roc_auc_score(test_y, predict_y))
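Example n. 19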
# -*- coding: utf-8 -*-
"""
http://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.LabelSpreading.html
Created on Fri Sep 14 16:13:06 2018

@author: Akitaka
"""

import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading
label_prop_model = LabelSpreading()
iris = datasets.load_iris()
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
labels = np.copy(iris.target)
labels[random_unlabeled_points] = -1
label_prop_model.fit(iris.data, labels)
print(labels)
print(iris.target)
print(label_prop_model.transduction_)
print(label_prop_model.predict(iris.data))
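Example n. 20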
  4.  1.  6. -1.  4.  4.  1.  1.  6.  6. -1.  4.  4.  4.  3.  2.  6. -1.
  1.  6.  4.  4.  4.  5.  6. -1. -1.  5.  2.  6.  1.  6.  3.  2.  6.  3.
  3.  1.  2.  5.  2. -1. -1.  1.  6.  6. -1.  6.  6.  6.  4.  6. -1.  2.
  3.  2.  5.  4.  4.  6.  4. -1.  4.  2.  6.  1.  1.  2. -1.  5.  2.  4.
  3. -1.  6.  2.  5.  2.  2.  5.  5.  4.  2.  1. -1.  1.]
(500, 100)
(500,)
"""

from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
label_propagation_model.fit(X, y)

# make predictions for first twenty samples (some will be known, some unknown)
for i in range(20): print('y: ', y[i], '\t', 'y_hat: ', label_propagation_model.predict(X[i].reshape(1,-1)))
"""
y:  6.0     y_hat:  [6.]
y:  6.0     y_hat:  [6.]
y:  2.0     y_hat:  [2.]
y:  1.0     y_hat:  [1.]
y:  -1.0    y_hat:  [6.]    *
y:  2.0     y_hat:  [2.]
y:  6.0     y_hat:  [6.]
y:  4.0     y_hat:  [4.]
y:  3.0     y_hat:  [3.]
y:  5.0     y_hat:  [5.]
y:  6.0     y_hat:  [6.]
y:  4.0     y_hat:  [4.]
y:  3.0     y_hat:  [3.]
y:  3.0     y_hat:  [3.]
Example n. 21
            label_prop_model = LabelPropagation(kernel=p_ss_kern,
                                                gamma=p_gamma,
                                                n_neighbors=p_neighbors,
                                                alpha=p_alpha,
                                                max_iter=70)
        else:
            label_prop_model = dic_ss_mod[p_ss_mod](kernel=p_ss_kern,
                                                    gamma=p_gamma,
                                                    n_neighbors=p_neighbors)
        print('Start to fit. Run for shelter!')
        label_prop_model.fit(X_tot, y_tot)
        temp_acc = label_prop_model.score(X_valid_lab, y_valid)
        print('{} / {} :accuracy = {}'.format(i, p_manyfit, temp_acc))
        RESULT_ACC_SS += temp_acc
    y_tot = label_prop_model.transduction_
    y_submit = label_prop_model.predict(X_submit)
    save_to_csv(X_tot, y_tot, X_valid_lab, y_valid)
    RESULT_ACC_SS /= p_manyfit
    json_dict['ss_accuracy'] = RESULT_ACC_SS
    print('accuracy obtained on the test set of the ss algo:', RESULT_ACC_SS)
else:
    init_variables()
    #PCA preprocessing
    if (PCA_MODE):
        pca_preprocess()
    X_tot, y_tot, X_valid, y_valid = load_xy()

##############################NEURAL NETWORK PART ##################################

if (USING_NN):
    model = build_model()
Example n. 22
def build_models(trainX, trainY, testX, testY, source_pos, target_pos, window):
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = checkAccuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = checkAccuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = checkAccuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = checkAccuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = checkAccuracy(testY, predND)
    #
    print("WITHOUT TL ACC_LR:", accLR, " ACC_DT:", accDT, " ACC_NB:", accNB)
    ########################
    #### WITH TL ########
    ########################

    ####################################################
    ### Kernel Mean Matching (Huang et al., 2006)
    ###
    # Decision Tree
    print("\n Kernel Mean Matching (Huang et al., 2006) ")
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_KMM, acc_DT_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_KMM)
    # Logistic Regression
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_KMM, acc_LR_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_KMM)
    # Naive Bayes Bernoulli
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_KMM, acc_NB_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_KMM)
    ####################################################
    ### Nearest-neighbour-based weighting (Loog, 2015)
    ###
    # Decision Tree
    print("\n Nearest-neighbour-based weighting (Loog, 2015)    ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_NN, acc_DT_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_NN)
    # Logistic Regression
    print("\n Nearest-neighbour-based weighting (Loog, 2015)    ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_NN, acc_LR_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_NN)
    # Naive Bayes Bernoulli
    print("\n Nearest-neighbour-based weighting (Loog, 2015)    ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_NN, acc_NB_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_NN)

    ####################################################
    ### Transfer Component Analysis (Pan et al, 2009)
    ###
    # Decision Tree
    print("\n Transfer Component Analysis (Pan et al, 2009)")
    classifier = TransferComponentClassifier(loss="dtree", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_TCA, acc_DT_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_TCA)
    # Logistic Regression
    classifier = TransferComponentClassifier(loss="logistic", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_TCA, acc_LR_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_TCA)
    # Naive Bayes Bernoulli
    classifier = TransferComponentClassifier(loss="berno", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_TCA, acc_NB_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_TCA)

    ####################################################
    ### Subspace Alignment (Fernando et al., 2013)
    ###
    # Decision Tree
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_SA, acc_DT_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_SA)
    # Logistic Regression
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_SA, acc_LR_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_SA)
    # Naive Bayes Bernoulli
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_SA, acc_NB_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_SA)
    #################################
    ############# ENSEMBLE ##########
    #################################
    classifier_SA_DT = SubspaceAlignedClassifier(loss="dtree")
    classifier_SA_LR = SubspaceAlignedClassifier(loss="logistic")
    classifier_SA_NB = SubspaceAlignedClassifier(loss="berno")
    classifier_TCA_DT = TransferComponentClassifier(loss="dtree")
    classifier_TCA_LR = TransferComponentClassifier(loss="logistic")
    classifier_TCA_NB = TransferComponentClassifier(loss="berno")
    classifier_NN_DT = ImportanceWeightedClassifier(iwe='nn', loss="dtree")
    classifier_NN_LR = ImportanceWeightedClassifier(iwe='nn', loss="logistic")
    classifier_NN_NB = ImportanceWeightedClassifier(iwe='nn', loss="berno")
    classifier_KMM_DT = ImportanceWeightedClassifier(iwe='kmm', loss="dtree")
    classifier_KMM_LR = ImportanceWeightedClassifier(iwe='kmm',
                                                     loss="logistic")
    classifier_KMM_NB = ImportanceWeightedClassifier(iwe='kmm', loss="berno")
    #
    eclf = EnsembleClassifier(
        clfs=[classifier_TCA_DT, classifier_NN_DT, classifier_KMM_DT])
    eclf.fit(trainX, trainY, testX)
    pred = eclf.predict_v2(testX)
    acc_ENSEMBLE, acc_ENSEMBLE_INFO = checkAccuracy(testY, pred)

    ########################
    #### RETURN ########
    ########################
    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_ENSEMBLE': acc_ENSEMBLE,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO),
        'acc_LR_KMM': acc_LR_KMM,
        'acc_LR_KMM_INFO': str(acc_LR_KMM_INFO),
        'acc_LR_NN': acc_LR_NN,
        'acc_LR_NN_INFO': str(acc_LR_NN_INFO),
        'acc_LR_TCA': acc_LR_TCA,
        'acc_LR_TCA_INFO': str(acc_LR_TCA_INFO),
        'acc_LR_SA': acc_LR_SA,
        'acc_LR_SA_INFO': str(acc_LR_SA_INFO),
        'acc_DT_KMM': acc_DT_KMM,
        'acc_DT_KMM_INFO': str(acc_DT_KMM_INFO),
        'acc_DT_NN': acc_DT_NN,
        'acc_DT_NN_INFO': str(acc_DT_NN_INFO),
        'acc_DT_TCA': acc_DT_TCA,
        'acc_DT_TCA_INFO': str(acc_DT_TCA_INFO),
        'acc_DT_SA': acc_DT_SA,
        'acc_DT_SA_INFO': str(acc_DT_SA_INFO),
        'acc_NB_KMM': acc_NB_KMM,
        'acc_NB_KMM_INFO': str(acc_NB_KMM_INFO),
        'acc_NB_NN': acc_NB_NN,
        'acc_NB_NN_INFO': str(acc_NB_NN_INFO),
        'acc_NB_TCA': acc_NB_TCA,
        'acc_NB_TCA_INFO': str(acc_NB_TCA_INFO),
        'acc_NB_SA': acc_NB_SA,
        'acc_NB_SA_INFO': str(acc_NB_SA_INFO)
    }])
Example n. 23
# hold out part of the data for testing
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=20)

# build a dataset that contains unlabeled samples
rng = np.random.RandomState(0)
random_unlabeled = rng.rand(len(y_train)) < 0.8
# labels of unlabeled samples are set to -1
y_train[random_unlabeled] = -1

# tuning gamma
# for i in [0.005, 0.01, 0.1, 0.5, 1]:
#     model = LabelPropagation(kernel='rbf', gamma=i)
#     model.fit(x_train, y_train)
#     print(i, accuracy_score(y_test, model.predict(x_test)))

model = LabelSpreading(kernel='rbf', gamma=0.01)
# model = LabelPropagation(kernel='rbf', gamma=0.01)
model.fit(x_train, y_train)

print('===========y===============')
print(y_test)
print('===========y_pred===============')
y_pred = model.predict(x_test)
print(y_pred)
print('=======confusion_matrix=======')
print(confusion_matrix(y_test, y_pred))
print('accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print(model.label_distributions_)
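The commented-out loop above sweeps gamma for LabelPropagation; the same sweep works unchanged for LabelSpreading (a sketch reusing x_train, y_train, x_test and y_test from above):

for g in [0.005, 0.01, 0.1, 0.5, 1]:
    m = LabelSpreading(kernel='rbf', gamma=g)
    m.fit(x_train, y_train)
    print(g, accuracy_score(y_test, m.predict(x_test)))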
Example n. 24
def labelspread(train_data, semi_data, train_label, semi_label, train_name,
                semi_name, lib, libname):
    print("===========================")
    train_d = []
    train_l = []
    semi_d = []
    semi_l = []
    name = []

    for i in range(len(train_data)):
        if train_label[i] in lib:
            train_d.append(train_data[i])
            train_l.append(train_label[i])
            name.append(train_name[i])

    for i in range(len(semi_data)):
        if semi_label[i] in lib:
            semi_d.append(semi_data[i])
            semi_l.append(semi_label[i])
            name.append(semi_name[i])

    train_d = np.array(train_d) / -80.
    train_l = np.array(train_l)
    semi_d = np.array(semi_d) / -80.
    semi_l = np.array(semi_l)
    name = np.array(name)

    print(libname, ' all num: ', train_d.shape[0] + semi_d.shape[0])
    print(libname, ' train num: ', train_d.shape[0])
    print(libname, ' ratio: ',
          train_d.shape[0] / (train_d.shape[0] + semi_d.shape[0]))
    #print('PCA...')
    #data = PCA(n_components=439,whiten=True,svd_solver="full",random_state=0).fit_transform(data)
    #semi_sp_data = SVD.transform(semi_sp_data)
    #print('data PCA size: ',data.shape)
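    # mark every semi-supervised sample as unlabeled (-1) for LabelSpreading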
    semi_unl = np.full(semi_l.shape[0], -1)
    label = np.concatenate((train_l, semi_unl), axis=0).astype('int')
    data = np.concatenate((train_d, semi_d), axis=0).astype('float')
    print('label size: ', label.shape)
    print('data size: ', data.shape)
    print('label propagation...')
    #print(label)
    #model = LabelPropagation(kernel='knn',n_neighbors=5,max_iter=10000,tol=0.001,n_jobs=-1)
    model = LabelSpreading(kernel='rbf',
                           gamma=20,
                           alpha=0.2,
                           n_neighbors=5,
                           max_iter=100000,
                           tol=0.001,
                           n_jobs=20)
    model.fit(data, label)
    oursemi_l = model.predict(semi_d)

    ourlabel = np.concatenate((train_l, oursemi_l), axis=0)
    csvlabel = np.concatenate((train_l, semi_l), axis=0)
    print('our... ', ourlabel)
    print('csv... ', csvlabel)
    similarity = 0
    for i in range(len(ourlabel)):
        if ourlabel[i] == csvlabel[i]:
            similarity += 1
    print('new train num: ', similarity)
    print('ratio: ', similarity / len(ourlabel))
    return data, name, ourlabel, csvlabel
Example n. 25
    test_labels = np.argmax(test_labels_one_hot, 1)

    x_all = np.concatenate(
        (train_data, test_data
         ))  # concatenate the train and test data (for structure exploitation)
    test_labels_none = -1 * np.ones([
        test_labels.shape[0],
    ])  # the label of the test_data is set to -1
    y_all = np.concatenate(
        (train_labels,
         test_labels_none))  # concatenate the train labels and -1 test labels

    consist_model = LabelSpreading(gamma=4, max_iter=60)
    consist_model.fit(x_all, y_all)
    clf.evaluate_sub('consistency model', test_labels,
                     consist_model.predict(test_data))

    lgr_model = clf.classifier('LGR', train_data, train_labels)
    clf.evaluate('LGR', lgr_model, test_data, test_labels)

    knn_model = clf.classifier('KNN', train_data, train_labels)
    clf.evaluate('KNN', knn_model, test_data, test_labels)

    bnb_model = clf.classifier('BNB', train_data, train_labels)
    clf.evaluate('BNB', bnb_model, test_data, test_labels)

    svm_model = clf.classifier('SVM', train_data, train_labels)
    clf.evaluate('SVM', svm_model, test_data, test_labels)

    dtc_model = clf.classifier('DTC', train_data, train_labels)
    clf.evaluate('DTC', dtc_model, test_data, test_labels)
Example n. 26
 def fit_with_clustering(self, X_l, y_l, X_u, y_u=None):
     """
     Initialize the parameters using both labeled and unlabeled data.
     Classes are assigned to the unlabeled data via their similarity with the labeled data.
     Treating the unlabeled classes as missing values, apply EM on the unlabeled data to refine the classifier.
     Label propagation only accepts dense matrices, so this step is quite time-consuming.
     """
     n_ul_docs = X_u.shape[0]  # number of unlabeled samples
     n_l_docs = X_l.shape[0]  # number of labeled samples
     # initialization (n_docs = n_ul_docs):
     # assign class to unlabeled data using similarity with labeled data if y_u is not given
     if y_u is None:
         label_prop_model = LabelSpreading(kernel='rbf',
                                           max_iter=5,
                                           n_jobs=-1)
         y_u = np.array([-1.0] * n_ul_docs)
         X = vstack([X_l, X_u])
         y = np.concatenate((y_l, y_u), axis=0)
         label_prop_model.fit(X.toarray(), y)
         y_u = label_prop_model.predict(X_u.toarray())
     y = np.concatenate((y_l, y_u), axis=0)
     clf = deepcopy(self.clf)  # build new copy of classifier
     clf.fit(X, y)  # initialize classifier parameters on labeled + pseudo-labeled data
     prev_log_lkh = self.log_lkh  # record log likelihood of previous EM iteration
     lp_w_c = clf.feature_log_prob_  # log CP of word given class [n_classes, n_words]
     b_w_d = (X_u > 0)  # words in each document [n_docs, n_words]
     lp_d_c = get_blas_funcs(
         "gemm", [lp_w_c, b_w_d.T.toarray()
                  ])  # log CP of doc given class [n_classes, n_docs]
     lp_d_c = lp_d_c(alpha=1.0, a=lp_w_c, b=b_w_d.T.toarray())
     lp_c = np.matrix(
         clf.class_log_prior_).T  # log prob of classes [n_classes, 1]
     lp_c = np.repeat(lp_c, n_ul_docs,
                      axis=1)  # repeat for each doc [n_classes, n_docs]
     lp_dc = lp_d_c + lp_c  # joint prob of doc and class [n_classes, n_docs]
     p_c_d = clf.predict_proba(
         X_u)  # weight of each class in each doc [n_docs, n_classes]
     expectation = get_blas_funcs(
         "gemm",
         [p_c_d, lp_dc
          ])  # expectation of log likelihood over all unlabeled docs
     expectation = expectation(alpha=1.0, a=p_c_d, b=lp_dc).trace()
     self.clf = deepcopy(clf)
     self.log_lkh = expectation
     if self.print_log_lkh:
         print("Initial expected log likelihood = %0.3f\n" % expectation)
     # Loop until log likelihood does not improve
     iter_count = 0  # count EM iteration
     while (self.log_lkh - prev_log_lkh >= self.tol
            and iter_count < self.max_iter):
         # while (iter_count<self.max_iter):
         iter_count += 1
         if self.print_log_lkh:
             print("EM iteration #%d" % iter_count)  # debug
         # E-step: Estimate class membership of unlabeled documents
         y_u = clf.predict(X_u)
         # M-step: Re-estimate classifier parameters
         X = vstack([X_l, X_u])
         y = np.concatenate((y_l, y_u), axis=0)
         clf.fit(X, y)
         # check convergence: update log likelihood
         p_c_d = clf.predict_proba(X_u)
         lp_w_c = clf.feature_log_prob_  # log CP of word given class [n_classes, n_words]
         b_w_d = (X_u > 0)  # words in each document
         lp_d_c = get_blas_funcs(
             "gemm", [lp_w_c, b_w_d.transpose().toarray()
                      ])  # log CP of doc given class [n_classes, n_docs]
         lp_d_c = lp_d_c(alpha=1.0, a=lp_w_c, b=b_w_d.transpose().toarray())
         lp_c = np.matrix(
             clf.class_log_prior_).T  # log prob of classes [n_classes, 1]
         lp_c = np.repeat(lp_c, n_ul_docs,
                          axis=1)  # repeat for each doc [n_classes, n_docs]
         lp_dc = lp_d_c + lp_c  # joint prob of doc and class [n_classes, n_docs]
         expectation = get_blas_funcs(
             "gemm",
             [p_c_d, lp_dc
              ])  # expectation of log likelihood over all unlabeled docs
         expectation = expectation(alpha=1.0, a=p_c_d, b=lp_dc).trace()
         if self.print_log_lkh:
             print("\tExpected log likelihood = %0.3f" % expectation)
         if (expectation - self.log_lkh >= self.tol):
             prev_log_lkh = self.log_lkh
             self.log_lkh = expectation
             self.clf = deepcopy(clf)
         else:
             break
     self.feature_log_prob_ = self.clf.feature_log_prob_
     self.coef_ = self.clf.coef_
     return self
Example n. 27
# training the scaler
scaler = StandardScaler(with_mean=True, with_std=True)
scaler = scaler.fit(x_train_labeled)

# scaling the training and test data
x_train_labeled_scaled = scaler.transform(x_train_labeled)
# x_test_scaled = scaler.transform(x_test)

# stratified ten fold cross validation
cv = StratifiedKFold(n_splits=10, shuffle=False)  # random_state only applies when shuffle=True

# setup the model
for train_index, val_index in cv.split(x_train_labeled_scaled, y_train_labeled):
    # create training and validation splits
    x_train, x_val = x_train_labeled_scaled[train_index], x_train_labeled_scaled[val_index]
    y_train, y_val = y_train_labeled[train_index], y_train_labeled[val_index]

    # my_kernel = polynomial_kernel(x_train, y_train, degree=5, gamma=None, coef0=1)

    # create model and fit data
    model = LabelSpreading(kernel=polynomial_kernel, gamma=20, alpha=0.2, max_iter=1, tol=0.001, n_jobs=1)
    model = model.fit(x_train, y_train)

    # evaluate model
    y_pred = model.predict(x_val)
    acc = accuracy_score(y_val, y_pred)
    print("Model Result: Split {} - Acc: {}".format(train_index, acc))


Example n. 28
test_svm(x_all, y_all)

# make a more select dataset
# Filter the rest of the data
x_obs, y_obs, x_nuls = load_data()
keep = list(best.k_feature_idx_)
np.save('sfs_features', keep)
# keep = np.load('sfs_features.npy')
x_obs = x_obs[:, keep]
x_nuls = x_nuls[:, keep]

# apply LabelSpreading
label_spread = LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(x_obs, y_obs)
x_all = np.concatenate([x_obs, x_nuls], axis=0)
y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

x, y = shuffle(x_all, y_all, random_state=42)
smpnum = min([sum(y == i) for i in range(1, 6)])
y_btr = y[y == 1][:smpnum]
x_btr = x[y == 1][:smpnum]
for i in range(2, 6):
    x_btr = np.concatenate([x_btr, x[y == i][:smpnum]])
    y_btr = np.concatenate([y_btr, y[y == i][:smpnum]])

x_tr, x_te, y_tr, y_te = train_test_split(x_btr, y_btr, test_size=0.20)

mod = svm.SVC(kernel='rbf')
mod.fit(x_tr, y_tr)
mod.score(x_te, y_te)
Example n. 29

nb_samples = 5000
nb_unlabeled = 1000


if __name__ == '__main__':
    # Create the dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, random_state=100)
    Y[nb_samples - nb_unlabeled:nb_samples] = -1

    # Create and fit a LabelSpreading instance
    ls = LabelSpreading(kernel='rbf', gamma=10.0, alpha=0.2)
    ls.fit(X, Y)

    Y_final = ls.predict(X)

    # Show the final result
    fig, ax = plt.subplots(1, 2, figsize=(18, 8))

    ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1], color='#88d7f0', marker='s', s=100)
    ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1], color='#55ffec', marker='o', s=100)
    ax[0].scatter(X[Y == -1, 0], X[Y == -1, 1], color='r', marker='x', s=20)

    ax[0].set_xlabel(r'$x_0$')
    ax[0].set_ylabel(r'$x_1$')
    ax[0].set_title('Dataset')
    ax[0].grid()

    ax[1].scatter(X[Y_final == 0, 0], X[Y_final == 0, 1], color='#88d7f0', marker='s', s=100)
    ax[1].scatter(X[Y_final == 1, 0], X[Y_final == 1, 1], color='#55ffec', marker='o', s=100)
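Example n. 30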
hist, bins = np.histogram(
    lables,
    bins=[-0.1, 0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1])
print(hist)
print(bins)

print(train_labeled.shape)
print(train_labeled[:, 0])

train_unlabeled = sklearn.preprocessing.scale(train_unlabeled)
features = sklearn.preprocessing.scale(features)

lp = LabelSpreading(kernel='knn',
                    gamma=20,
                    n_neighbors=7,
                    alpha=0.2,
                    max_iter=50,
                    tol=0.01,
                    n_jobs=-1)

y = lables
for i in range(21000):
    y = np.concatenate((y, np.array([-1])), axis=0)

all_data = np.concatenate((features, train_unlabeled), axis=0)

lp.fit(all_data, y)
Yresult = lp.predict(all_data)
print(lp.score(all_data, Yresult))

np.savetxt('semiLabelsOfUnlabeled2.csv', Yresult, delimiter=",")
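Example n. 31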
print(cutdown_labels)
'''
[ 0  0  0  0 -1 -1 -1  0  0  0 -1  0  0 -1 -1 -1  0  0  0 -1  0 -1 -1  0  0
  0 -1  0  0 -1  0 -1 -1  0  0  0  0 -1  0  0 -1  0 -1  0 -1  0  0  0  0 -1
  1  1  1  1  1  1 -1 -1 -1  1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1 -1  1  1
  1  1 -1  1 -1  1  1  1 -1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1 -1 -1
 -1  2  2  2  2 -1  2  2 -1 -1 -1 -1  2  2  2  2  2 -1  2  2  2  2  2 -1 -1
  2  2  2 -1  2  2 -1 -1  2  2  2  2  2  2  2  2 -1  2  2 -1 -1  2  2 -1 -1]
'''

# fit LabelSpreading model
label_propagation_model.fit(iris['data'], cutdown_labels)

# quick test
print('y: ', full_labels[-1])
print('y_hat: ', label_propagation_model.predict(iris['data'][-1:]))
'''
y:  2
y_hat:  [2]
'''

# overall accuracy
correct = 0.0
for i in range(len(iris['data'])):

	if label_propagation_model.predict(iris['data'][i:i + 1])[0] == full_labels[i]:
		correct += 1

print('Overall accuracy: ', correct / len(iris['data']))
'''
Overall accuracy:  0.98
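Example n. 32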
y = np.asarray(Y_train)
for i in idxs:
    y[i] = -1

Y_train = y


# Train model and print statistics (use 'knn' as kernel)

from sklearn.semi_supervised import LabelSpreading

model = LabelSpreading(kernel = 'knn', n_neighbors = 10, max_iter=1000).fit(X_train, Y_train)

print("Percentage of correct predictions = {}".format(round(100*model.score(X_test, Y_test),2)))
pred = model.predict(X_test) == Y_test
print("Correct: {}".format(np.count_nonzero(pred==True)),"/",
      "Incorrect: {}".format(np.count_nonzero(pred==False)))

Z1 = model.predict(X_test).reshape(Y_test.size,1)
Z2 = np.asarray(Y_test).reshape(Y_test.size,1)
Z3 = np.around(model.predict_proba(X_test),decimals=2)
data = np.concatenate((Z1,Z2,Z3),axis=1)
outcome = pd.DataFrame(data, columns = ["Predicted Label", 
                                        "Actual Label", 
                                        "Prob. Label = 0.0", 
                                        "Prob. Label = 1.0"])
indicesToKeep = outcome["Predicted Label"] != outcome["Actual Label"]

print("False predictions with associated class probabilities:\n{}".format(outcome[indicesToKeep]))
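Example n. 33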
server.sendmail("*****@*****.**", "*****@*****.**", msg)
server.quit()

# In[15]:

targets

# # Measuring effectiveness.
#
#

# In[26]:

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

t_pred = label_prop_model.predict(x_test)

print("Metrics based on 50 hold-out points")

print("Macro")
print("accuracy: %f" % accuracy_score(t_test, t_pred))
print("precision: %f" % precision_score(t_test, t_pred, average='macro'))
print("recall: %f" % recall_score(t_test, t_pred, average='macro'))
print("f1: %f" % f1_score(t_test, t_pred, average='macro'))
print("\n\nMicro")
print("accuracy: %f" % accuracy_score(t_test, t_pred))
print("precision: %f" % precision_score(t_test, t_pred, average='micro'))
print("recall: %f" % recall_score(t_test, t_pred, average='micro'))
print("f1: %f" % f1_score(t_test, t_pred, average='micro'))

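Example n. 34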
from sklearn import metrics
def main(argv):
    trainFile = None
    testFile = None
    outFile = None

    try:
        opts, args = getopt.getopt(argv, "hi:t:o:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == '-i':
            trainFile = arg
        elif opt == '-t':
            testFile = arg
        elif opt == '-o':
            outFile = arg
        else:
            usage()
            print('Invalid argument %s' % opt)
            sys.exit(2)

    if trainFile is None or testFile is None or outFile is None:
        print("Missing arguments")
        usage()
        sys.exit(2)

    facialData = pd.read_csv(trainFile)
    testData = pd.read_csv(testFile)

    testData.drop(columns=['id'], inplace=True)
    testData.reset_index(inplace=True, drop=True)

    labels = testData['class']
    classLabels = []
    for i in range(len(labels)):
        classLabels.append(1 if (labels[i] == 'deceptive') else 0)
    testData.drop(columns=['class'], inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(testData,
                                                        classLabels,
                                                        test_size=0.2,
                                                        stratify=classLabels,
                                                        random_state=42)

    X_train.insert(1, "class", y_train)
    sns.countplot(x="class", data=X_train)
    X_train = X_train.drop(columns=['class'])

    # Label Propagation
    modelLabelProp = LabelPropagation()
    labels = [-1] * len(facialData[:10000])
    labels.extend(y_train)
    inputData = pd.concat([facialData[:10000], X_train],
                          sort=False,
                          ignore_index=True,
                          copy=False)
    modelLabelProp.fit(inputData, labels)
    yPred = modelLabelProp.predict(X_test)
    print("LABEL PROPAGATION:")
    metricNPlot(modelLabelProp, X_test, y_test, yPred)

    with open(outFile, 'w') as f:
        f.write("Label Propagation prediction\n")
        for item in yPred:
            f.write("%s\n" % item)

    # Label Spreading
    modelLabelSpread = LabelSpreading(kernel='knn', n_neighbors=15)
    labels = [-1] * len(facialData[:10000])
    labels.extend(y_train)
    inputData = pd.concat([facialData[:10000], X_train],
                          sort=False,
                          ignore_index=True,
                          copy=False)
    modelLabelSpread.fit(inputData, labels)
    yPred = modelLabelSpread.predict(X_test)
    print("LABEL SPREADING:")
    metricNPlot(modelLabelSpread, X_test, y_test, yPred)

    with open(outFile, 'a') as f:
        f.write("Label Spreading prediction\n")
        for item in yPred:
            f.write("%s\n" % item)

    height = [0.8, 0.68]
    bars = ('Label Propagation', 'Label Spreading')
    y_pos = np.arange(len(bars))
    plt.title("Performance Comparison")
    plt.bar(y_pos, height, color=['cyan', 'red'])
    plt.xticks(y_pos, bars)
    plt.show()
Esempio n. 35
0
    def __call__(self, *args, **kwargs):
        """ Augment the labels

            Inputs:
            tr_percs: fractions of observations to keep as labeled data
            algs: methods to perform the label propagation
            max_iter: parameter for 'gtg': number of iterations
        """
        tr_percs = kwargs.pop('tr_percs', [0.02, 0.05, 0.1])
        algs = kwargs.pop('algs', ['gtg', 'svm', 'labels_only'])
        max_iter = kwargs.pop('max_iter', 25)
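        # Hypothetical usage (the wrapper name below is illustrative, not the
        # project's actual class):
        #   augmenter = LabelAugmenter(...)
        #   augmenter(tr_percs=[0.05], algs=['gtg', 'svm'], max_iter=50)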

        if not osp.exists(self.label_dir):
            os.makedirs(self.label_dir)

        with open(osp.join(self.label_dir, 'test_labels.txt'), 'w') as dst:
            loader = prepare_loader(
                osp.join(self.splitting_dir, 'test.txt'),
                img_root=self.dset['src'],
                stats=self.dset['stats'],
                batch_size=1,
                shuffle=False,
            )

            for _, label, path in loader:
                dst.write(path[0] + ',' + str(label.item()) + '\n')

        for net_name in self.net_names:
            with open(osp.join(self.feat_dir, 'train', net_name + '.pickle'),
                      'rb') as pkl:
                net_name, labels, features, fnames = pickle.load(pkl)
                labels = labels.ravel()

                # uncomment to debug on a smaller subset
                # labels = labels[:5000]
                # features = features[:5000]
                # fnames = fnames[:5000]

            for tr_perc in tr_percs:
                labeled, unlabeled = equiclass_mapping(labels, tr_perc)
                for alg in algs:
                    print(net_name + ' - ' + str(self.dset['nr_classes']) +
                          ' classes')

                    # generate alg label file name
                    alg_path = osp.join(self.label_dir, alg, net_name,
                                        'labels_{}.txt'.format(tr_perc))

                    if self.hard_labels:
                        alg_labels = np.full(labels.shape[0], -1)
                        alg_labels[labeled] = labels[labeled]
                    else:
                        alg_labels = np.zeros(
                            (len(labels), self.dset['nr_classes']))
                        alg_labels[labeled,
                                   labels[labeled].ravel().astype(int)] = 1.0
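                        # e.g. with 4 classes, a labeled sample of class 2 gets
                        # the soft-label row [0., 0., 1., 0.]; unlabeled rows
                        # stay all-zero until an algorithm fills them in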

                    if alg == 'gtg':
                        # predict labels with gtg
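                        # building the similarity matrix is expensive, so W is
                        # cached in this scope (hence the `'W' not in locals()`
                        # check) and reused across algorithms and tr_percs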
                        if 'W' not in locals():
                            W = gtg.sim_mat(features, verbose=True)

                        ps = init_rand_probability(labels, labeled, unlabeled)
                        res = gtg.gtg(W,
                                      ps,
                                      max_iter=max_iter,
                                      labels=labels,
                                      U=unlabeled,
                                      L=labeled)

                        if self.hard_labels:
                            alg_labels[unlabeled] = res[unlabeled].argmax(
                                axis=1)
                        else:
                            alg_labels[unlabeled] = res[unlabeled]

                    elif alg == 'svm':
                        # predict labels with a linear SVM
                        lin_svm = svm.LinearSVC()
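                        # note: LinearSVC has no predict_proba, so soft labels
                        # are obtained below via CalibratedClassifierCV, with
                        # cv capped by the rarest class count so each fold
                        # sees every class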

                        if self.hard_labels:
                            lin_svm.fit(features[labeled, :], labels[labeled])
                            svm_labels = lin_svm.predict(
                                features[unlabeled]).astype(int)
                        else:
                            cv = min(
                                np.unique(labels[labeled],
                                          return_counts=True)[1].min(), 3)
                            clf = CalibratedClassifierCV(lin_svm, cv=cv)
                            clf.fit(features[labeled, :], labels[labeled])

                            svm_labels = clf.predict_proba(features[unlabeled])

                        alg_labels[unlabeled] = svm_labels

                    elif alg == 'label_propagation':
                        # predict labels with a label propagation model;
                        # mask unlabeled entries on a copy so the ground-truth
                        # labels survive for the remaining algorithms
                        label_propagation = LabelPropagation(kernel='rbf',
                                                             gamma=0.05,
                                                             max_iter=4000)
                        lp_input = labels.copy()
                        lp_input[unlabeled] = -1
                        label_propagation.fit(features, lp_input)
                        if self.hard_labels:
                            label_propagation_labels = label_propagation.predict(
                                features[unlabeled]).astype(int)
                        else:
                            label_propagation_labels = label_propagation.predict_proba(
                                features[unlabeled])

                        alg_labels[unlabeled] = label_propagation_labels

                    elif alg == 'label_spreading':
                        # predict labels with a label spreading model;
                        # again mask the unlabeled entries on a copy
                        label_spreading = LabelSpreading(kernel='rbf',
                                                         gamma=0.05)
                        ls_input = labels.copy()
                        ls_input[unlabeled] = -1
                        label_spreading.fit(features, ls_input)
                        if self.hard_labels:
                            label_spreading_labels = label_spreading.predict(
                                features[unlabeled]).astype(int)
                        else:
                            label_spreading_labels = label_spreading.predict_proba(
                                features[unlabeled])

                        alg_labels[unlabeled] = label_spreading_labels

                    elif alg == 'harmonic':
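                        # presumably the Gaussian-fields harmonic-function
                        # solution of Zhu et al. (2003), solved on the same
                        # cached similarity graph W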
                        if 'W' not in locals():
                            W = gtg.sim_mat(features, verbose=True)
                        soft_labels, hard_labels = harmonic_function(
                            W, labels, labeled, unlabeled)
                        if self.hard_labels:
                            label_harmonic = hard_labels
                        else:
                            label_harmonic = soft_labels

                        alg_labels[unlabeled] = label_harmonic

                    elif alg == 'labels_only':
                        # generate labeled only file
                        alg_labels = alg_labels[labeled]

                        if not osp.exists(osp.dirname(alg_path)):
                            os.makedirs(osp.dirname(alg_path))

                        if (self.hard_labels and (alg_labels == -1).sum() > 0) or \
                                (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                            raise ValueError(
                                'Some observations are still unlabeled; check '
                                'the \'' + alg + '\' algorithm.')

                        create_relabeled_file([fnames[i] for i in labeled],
                                              alg_path,
                                              alg_labels,
                                              sep=',')
                        break
                    else:
                        raise ValueError('algorithm \'' + alg +
                                         '\' not recognized.')

                    if not osp.exists(osp.dirname(alg_path)):
                        os.makedirs(osp.dirname(alg_path))

                    if (self.hard_labels and (alg_labels == -1).sum() > 0) or\
                        (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                        raise ValueError('Some observations are still unlabeled; '
                                         'check the \'' + alg + '\' algorithm.')

                    create_relabeled_file(fnames,
                                          alg_path,
                                          alg_labels,
                                          sep=',')

            # free the cached similarity matrix before the next network
            if 'W' in locals():
                del W