Example #1
def semiLabelPropagation(feature_extractor, generator, val_generator, kernel,
                         neighbors, gamma):
    semi = LabelPropagation(kernel=kernel,
                            n_neighbors=neighbors,
                            gamma=gamma,
                            alpha=None,
                            tol=0.001,
                            max_iter=1000000)

    # steps must be an integer; ceil keeps the final partial batch
    # (assumes numpy is imported as np, as in the other examples)
    features = feature_extractor.predict_generator(
        generator,
        steps=int(np.ceil(generator.samples / generator.batch_size)),
        verbose=1)

    classes = generator.classes

    # samples whose filename starts with 'N' are treated as unlabeled (-1)
    for i in range(generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=int(np.ceil(val_generator.samples / val_generator.batch_size)),
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
Example #2
def sklearn_lp(X, y,
            output=None,
            kernel='knn', 
            gamma=None,
            n_neighbors=10, 
            alpha=1, 
            max_iter=1000, 
            tol=0.00001):

    # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3)
    label_prop_model = LabelPropagation(kernel=kernel, 
                                        gamma=gamma, 
                                        n_neighbors=n_neighbors, 
                                        alpha=alpha, 
                                        max_iter=max_iter, 
                                        tol=tol)
    label_prop_model.fit(X_train, y_train)

    y_predict = label_prop_model.predict(X_test)
    print('y_train: ', y_train)
    print('y_predict: ', y_predict)

    print('+--------------------------------------------------------+')
    print('|                         Report                         |')
    print('+--------------------------------------------------------+')
    print(classification_report(y_test, y_predict))
    print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
    print('\n\n')
Example #3
def test_LabelPropagation(*data):
    '''
    Test the usage of LabelPropagation
    '''
    X, y, unlabeled_indices, XPredict, yTrue = data
    #print("get ytrue")
    #print(yTrue)
    # must copy: y is needed again later
    y_train = np.copy(y)
    # labels of unlabeled samples are set to -1
    y_train[unlabeled_indices] = -1
    print(y_train)
    #clf = LabelPropagation(max_iter=1000, kernel='rbf', gamma=0.1)
    clf = LabelPropagation(max_iter=5, kernel='knn', n_neighbors=3, tol=1e-5)
    #clf = LabelPropagation.LabelSpreading(gamma = 0.25, max_iter = 20)
    clf.fit(X, y_train)
    ### 获取预测准确率
    # 预测标记
    predicted_labels = clf.predict(XPredict)
    print(XPredict)
    #predicted_labels = clf.transduction_[unlabeled_indices]
    # true labels
    #yTrue
    #true_labels = y[unlabeled_indices]
    print("Accuracy:%f" % metrics.accuracy_score(yTrue, predicted_labels))
Example #4
def lb_prop_classify(network, labels):
	kf = StratifiedKFold(n_splits=10)
	scores = []
	cms = []

	# note: the larger fold is used as the test set here, so the model
	# is trained on a single contiguous fold (~10% of the data) each round
	for test_index, train_index in kf.split(network, labels):
		first_train_index, last_train_index = min(train_index), max(train_index)

		train_dataset = network[first_train_index:last_train_index]
		train_labels = labels[first_train_index:last_train_index]

		test_dataset = np.delete(network, np.s_[first_train_index:last_train_index], 0)
		test_labels = np.delete(labels, np.s_[first_train_index:last_train_index], 0)

		# despite the variable name, this is LabelPropagation, not LabelSpreading
		label_spreading_model = LabelPropagation()
		label_spreading_model.fit(train_dataset, train_labels)
		scores.append(label_spreading_model.score(test_dataset, test_labels))

		prediction = label_spreading_model.predict(test_dataset)
		cms.append(confusion_matrix(test_labels, prediction, labels=label_spreading_model.classes_))

	print('label propagation mean {}'.format(np.average(scores)))
	print('label propagation standard deviation {}'.format(np.std(scores)))
	print('label propagation confusion matrix')
	print(get_percentile_cm(get_average_cm(cms)))
	print('\n')

	return scores
def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    x_va = x[4977:4977+3000]
    y_va = y[4977:4977+3000]
    x = np.concatenate((x[:4977],x[4977+3000:]))
    y = np.concatenate((y[:4977],y[4977+3000:]))
    
    
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    
    # apply Label Propagation to impute the missing stage labels
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x = np.concatenate([x_obs, x_nuls], axis=0)
    y = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size = 0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('./feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    x_va = x[[i in [8, 9] for i in o.values]]
    y_va = y[[i in [8, 9] for i in o.values]]
    x = x[[i not in [8, 9] for i in o.values]]
    y = y[[i not in [8, 9] for i in o.values]]

    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]

    # apply Label Propagation to impute the missing stage labels
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

    # Over sample the stages
    zen = SMOTE(random_state=8675309)
    x, y = zen.fit_resample(x_all, y_all)
    x, y = shuffle(x, y, random_state=42)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
Example #7
def _label_propagation(df):
    X = _generate_features(df)
    labels = _generate_labels(df)
    # for some reason pandas returns NaN for -1 values
    labels = labels.fillna(-1)
    label_prop_model = LabelPropagation()
    label_prop_model.fit(X.toarray(), labels)
    return label_prop_model.predict(X.toarray())
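A note on the NaN comment above: pandas represents missing labels as NaN (for example after a merge or reindex), while LabelPropagation expects unlabeled points to be marked with the integer -1, hence the fillna(-1). A tiny illustration with a hypothetical label column:

import pandas as pd

labels = pd.Series([0, 1, None, 1, None])  # gaps as a merge would leave them
print(labels.fillna(-1).astype(int).tolist())  # [0, 1, -1, 1, -1]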
    def doLabelPropagation(self, X, y, **kwargs):
        label_prop_model = LabelPropagation(**kwargs)
        if self.verbose > 2:
            print("X, y shapes: ", X.shape, y.shape)
            print(" y hist: ", np.histogram(y))
        label_prop_model.fit(X, y)
        if self.verbose > 2:
            print("lp_predict:", np.histogram(label_prop_model.predict(X)))
        return label_prop_model.predict_proba(X)
Example #9
def ss_test(images, labels, unlabeled_images, test_images):
    all_images = np.vstack((images, unlabeled_images))
    neg_ones = -np.ones((unlabeled_images.shape[0],))
    all_labels = np.concatenate((labels, neg_ones), axis = 0)

    model = LabelPropagation()
    model.fit(all_images, all_labels)

    test_labels = model.predict(test_images)
    create_submission(test_labels)
Example #11
def LP(source_train, target_test, label1, label3):
    label_prop_model = LabelPropagation()
    label_prop_model.fit(source_train, label1)
    source_predict = label_prop_model.predict(target_test)
    # evaluation metrics
    accuracy = metrics.accuracy_score(label3, source_predict)
    recall = metrics.recall_score(label3, source_predict, average='weighted')
    f1 = metrics.f1_score(label3, source_predict, average='weighted')
    precision = metrics.precision_score(label3, source_predict, average='weighted')
    print("LP:", accuracy, recall, f1, precision)
    return accuracy, recall, f1, precision
Example #12
def semi_shuffle_estimator(n_splits=10, test_size=0.6, seed=0, gamma=4, n_neighbors=6, max_iter=1000):
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
    i = 0
    testsize_list.append(test_size)
    train_scores = []
    test_scores = []
    for label_index, unlabel_index in sss.split(X, Y):
        i += 1
        X_train = X.iloc[label_index]
        Y_train = Y.iloc[label_index]
        X_test = X.iloc[unlabel_index]
        Y_test = Y.iloc[unlabel_index]

        Y_unlabel = copy.deepcopy(Y_test)
        Y_unlabel['Class'] = -1

        X_new = pd.concat([X_train, X_test])
        Y_new = pd.concat([Y_train, Y_unlabel])


        shuffle_index = np.random.permutation(X.index)
        X_new_shuffle = X_new.take(shuffle_index)
        Y_new_shuffle = Y_new.take(shuffle_index)

        lp = LabelPropagation(gamma=gamma, n_neighbors=n_neighbors, max_iter=max_iter)
        lp.fit(X_new_shuffle, Y_new_shuffle.values.ravel())

        Y_predict_train = lp.predict(X_train)
        Y_predict_test = lp.predict(X_test)
        train_scores.append(accuracy_score(Y_train, Y_predict_train))
        test_scores.append(accuracy_score(Y_test, Y_predict_test))
        # print("-------Cross_validation epoch {}--------".format(i))
        # print("The accuracy in train set:", accuracy_score(Y_train, Y_predict_train))
        # print("The accuracy in test set:", accuracy_score(Y_test, Y_predict_test))
    mean_train_score = np.array(train_scores).mean()
    mean_test_score = np.array(test_scores).mean()
    print("For test size {}, the mean accuracy in train set is {}".format(test_size, mean_train_score))
    print("For test size {}, the mean accuracy in test set is {}".format(test_size, mean_test_score))
    train_socres_list.append(mean_train_score)
    test_scores_list.append(mean_test_score)
Example #13
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    #######################
    ### SEMI-SUPERVISED ###
    #######################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression 
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    #
    return pd.DataFrame(
        [{ 
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,

        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO':acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO':acc_ss_spreading_INFO,
        'acc_LR':accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)       

        }]
    )
Example #14
def do_evaluation(X, y, 
                kernel='knn',
                output=None, 
                gamma=None,
                n_neighbors=10, 
                alpha=1, 
                max_iter=1000, 
                tol=0.00001):
    # from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score
    import random

    size = len(X)

    random_seeds = np.random.randint(1, 1000, size=10)
    for i in range(len(random_seeds)):
        
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=random_seeds[i])
        labels = np.copy(y)
        tmp = np.arange(size)
        np.random.shuffle(tmp)
        train_test_split_rate = int(size*.9)
        random_unlabeled_points = tmp[:train_test_split_rate]
        labeled_points = tmp[train_test_split_rate:]
        random_unlabeled_points.sort()
        X_test = [X[_] for _ in range(size) if _ in random_unlabeled_points]
        y_test = [y[_] for _ in range(size) if _ in random_unlabeled_points]
        y_train = [y[_] for _ in range(size) if _ in labeled_points]

        labels[random_unlabeled_points] = -1

        label_prop_model = LabelPropagation(kernel=kernel, 
                                            gamma=gamma, 
                                            n_neighbors=n_neighbors, 
                                            alpha=alpha, 
                                            max_iter=max_iter, 
                                            tol=tol)
        label_prop_model.fit(X, labels)

        y_predict = label_prop_model.predict(X_test)
        
        print('+--------------------------------------------------------+')
        print('|                         Report                         |')
        print('+--------------------------------------------------------+')
        print('test round:', (i + 1), ' with random seed: ', random_seeds[i])
        print('training label: ', y_train)
        print('training post id: ', [_ + 1 for _ in labeled_points])
        print('predict label: ', y_predict)
        print(classification_report(y_test, y_predict))
        print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
        print('\n\n')
Example #15
    def process(self, n_components):
        X_train, y_train, X_test, y_test = self.preprocess(n_components)
        label_prop_model = LabelPropagation(n_jobs=-1)
        label_prop_model.fit(X_train, y_train)
        y_pred = label_prop_model.predict(X_test)
        mean_acc = label_prop_model.score(X_test, y_test)
        plot_confusion_matrix(y_test,
                              y_pred,
                              self.labels,
                              normalize=False,
                              figname=('lp_comps_%d.png' % n_components))
        self.m_acc.append(mean_acc)
        print(label_prop_model.get_params())
Example #16
    def execute(self, function_context: FunctionContext,
                input_list: List) -> List:
        x_train = input_list[0]
        y_label = input_list[1]
        input_dim = 512

        x_train_columns = list()
        x_train_columns.append('face_id')
        for i in range(1, input_dim + 1):
            x_train_columns.append('col' + str(i))
        trainDf = pd.DataFrame(x_train, columns=x_train_columns)
        labelDf = pd.DataFrame(y_label, columns=('face_id', 'label'))

        trainDf = pd.merge(trainDf,
                           labelDf,
                           on=['face_id'],
                           how='inner',
                           suffixes=('_x', '_y'))
        y_label = trainDf['label'].values.astype(int)
        trainDf = trainDf.drop('face_id', axis=1)
        x_train = trainDf.drop('label', axis=1).values

        label_prop_model = None
        score = 0.0
        while score < 0.95:
            print('before train ACC:', score)
            random_unlabeled_points = np.random.rand(len(y_label))
            random_unlabeled_points = random_unlabeled_points < 0.3  # uniform [0, 1) draws; True (unlabeled) where below 0.3
            Y = y_label[random_unlabeled_points]  # original labels before masking
            y_label[random_unlabeled_points] = -1  # reset the selected labels to -1 (unlabeled)

            label_prop_model = LabelPropagation()
            label_prop_model.fit(x_train, y_label)

            Y_pred = label_prop_model.predict(x_train)
            Y_pred = Y_pred[random_unlabeled_points]
            score = accuracy_score(Y, Y_pred)

            y_label[random_unlabeled_points] = Y

        model_path = os.path.dirname(os.path.abspath(__file__)) + '/model'
        print('Save trained model to {}'.format(model_path))
        if not os.path.exists(model_path):
            joblib.dump(label_prop_model, model_path)

        model_meta: ModelMeta = function_context.node_spec.output_model
        # Register model version to notify that cluster serving is ready to start loading the registered model version.
        register_model_version(model=model_meta, model_path=model_path)
        return []
    def label_propagation(self, X_train, y, X_test):

        clf = LabelPropagation()
        print("X_train Shape :", X_train.shape, type(X_train))
        print("X_test shape : ", X_test.shape, type(X_test))
        print("y shape : ", y.shape)

        X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
        print("X shape now ", X.shape)
        print("Y shape now ", y.shape)
        clf.fit(X, y)
        final_labels = clf.predict(X_test)
        label_prob = clf.predict_proba(X_test)
        print(compare_labels_probabilities().compare(label_prob, final_labels))
        return final_labels, clf
def label_prop():

    labels = df9.loc[df9['Leak Found'].notnull(), ['Leak Found']]
    model = LabelPropagation(kernel=rbf_kernel_safe)
    model.fit(df10, labels.values.ravel())
    pred = np.array(model.predict(df12))
    df13 = pd.DataFrame(pred, columns=['Prediction'])
    df14 = pd.concat([df12, df13], axis=1)
    print(df14[['ID', 'Prediction']])
    # print(df14.loc[df14['Prediction'] == 'Y'])
    plt.style.use('seaborn')
    df14['Prediction'].value_counts().plot(kind='bar')
    plt.xticks([0, 1, 2], ['NO', 'YES', 'N-PRV'])
    plt.ylabel('Number of occurrences after prediction by RBF algorithm')
    plt.show()
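rbf_kernel_safe above is a user-defined callable: besides the 'knn' and 'rbf' strings, LabelPropagation's kernel parameter accepts any callable that takes two feature arrays and returns an affinity matrix. A plausible sketch of such a kernel (the gamma value and the flooring constant are illustrative assumptions, not taken from the original):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def rbf_kernel_safe(X, Y):
    # standard RBF affinities, floored so distant points never
    # produce an all-zero row in the propagation matrix
    W = rbf_kernel(X, Y, gamma=20)
    return np.clip(W, 1e-10, None)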
Example #19
class _LabelPropagationImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
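A possible wiring for this wrapper, assuming Op is bound to scikit-learn's LabelPropagation (the snippet does not show where Op is imported from):

from sklearn.datasets import load_iris
from sklearn.semi_supervised import LabelPropagation as Op

X, y = load_iris(return_X_y=True)
impl = _LabelPropagationImpl(kernel='knn', n_neighbors=7)
impl.fit(X, y)
print(impl.predict(X[:5]))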
def iris_semi():
    X, y = load_iris(return_X_y=True)
    print('data shape: {}'.format(X.shape))

    # reduce to 2 dimensions for easier visualization
    pca = PCA(n_components=2)
    X = pca.fit_transform(X)

    # set up the figure and colormaps
    from matplotlib.colors import ListedColormap
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    fig = plt.figure()
    for i, threshold in enumerate([0.3, 0.5, 0.8, 1]):
        new_y = y.copy()
        if threshold < 1:
            rng = np.random.RandomState(0)
            random_unlabeled = rng.rand(
                len(y)) <= threshold  # uniform [0, 1) draws; True where <= threshold
            # set labels of unlabeled samples to -1
            new_y[random_unlabeled] = -1

            model_name = 'LabelPropagation'
            model = LabelPropagation(kernel='rbf', gamma=20)
        else:
            model_name = 'SVC'
            model = SVC()
        model.fit(X, new_y)

        # generate grid points
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.05),
                             np.arange(y_min, y_max, 0.05))
        new_x = np.c_[xx.ravel(), yy.ravel()]
        z = model.predict(new_x)

        # plot predictions over the grid
        ax = fig.add_subplot(2, 2, i + 1)
        ax.pcolormesh(xx, yy, z.reshape(xx.shape), cmap=cmap_light, alpha=0.5)
        # plot the true data distribution
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        ax.set_title('{}, {}% data'.format(model_name, threshold * 100))

    plt.show()
Example #21
def do_label_propagation(input_data,
                        input_label,
                        output=None,
                        kernel='knn', 
                        gamma=None,
                        n_neighbors=10, 
                        alpha=1, 
                        max_iter=30, 
                        tol=0.001):
    n_neighbors += 1

    # input label (text mode so the lines parse as str, not bytes)
    input_label_fh = open(input_label, 'r')
    label_lines = input_label_fh.readlines()
    label_lines = [int(_.strip()) for _ in label_lines]
    y = np.array(label_lines)
    input_label_fh.close()

    size = len(y)

    # input data
    input_data_fh = open(input_data, 'r')
    data_lines = input_data_fh.readlines()[:size]
    data_lines = [_.strip() for _ in data_lines]
    X = np.array(np.mat(';'.join(data_lines)))
    input_data_fh.close()

    label_prop_model = LabelPropagation(kernel=kernel, 
                                        gamma=gamma, 
                                        n_neighbors=n_neighbors, 
                                        alpha=alpha, 
                                        max_iter=max_iter, 
                                        tol=tol)
    label_prop_model.fit(X, y)

    prediction = label_prop_model.predict(X)

    if output:
        output_fh = open(output, 'w')
        for p in prediction:
            output_fh.write(str(p)+'\n')
        output_fh.close()

    return label_prop_model
def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    x = x.values
    x = normalize(x)
    y = y.values
    x_va = x[4977:4977 + 3000]
    y_va = y[4977:4977 + 3000]
    x = np.concatenate((x[:4977], x[4977 + 3000:]))
    y = np.concatenate((y[:4977], y[4977 + 3000:]))

    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    x_nuls = x[nul(y)]

    # Undersample the stages
    x_obs, y_obs = shuffle(x_obs, y_obs, random_state=42)
    smpnum = min([sum(y_obs == i) for i in range(1, 6)])
    # sample from the shuffled labeled subset (x_obs, y_obs), not from x/y,
    # which still contain the unlabeled NaN rows
    y_obs_us = y_obs[y_obs == 1][:smpnum]
    x_obs_us = x_obs[y_obs == 1][:smpnum]
    for i in range(2, 6):
        x_obs_us = np.concatenate([x_obs_us, x_obs[y_obs == i][:smpnum]])
        y_obs_us = np.concatenate([y_obs_us, y_obs[y_obs == i][:smpnum]])

    # apply Label Propagation to impute the missing stage labels
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs_us, y_obs_us)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

    # Undersample the stages, now drawing from the imputed set (x_all, y_all)
    x, y = shuffle(x_all, y_all, random_state=42)
    smpnum = min([sum(y == i) for i in range(1, 6)])
    y_btr = y[y == 1][:smpnum]
    x_btr = x[y == 1][:smpnum]
    for i in range(2, 6):
        x_btr = np.concatenate([x_btr, x[y == i][:smpnum]])
        y_btr = np.concatenate([y_btr, y[y == i][:smpnum]])
    x_tr, x_te, y_tr, y_te = train_test_split(x_btr, y_btr, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
Example #23
class LabelPropagationClassifier(Classifier):
	
	def __init__(self, matrixdatabase):
		self._matrix_database = matrixdatabase
		self._has_fit = False
		self._lbl = LabelPropagation()

	def learn(self, ingredients, cuisine):
		return

	def classify(self, ingredients):
		if not self._has_fit:
			matrix, classes = self._matrix_database.make_train_matrix()
			matrix = matrix.toarray()
			self._lbl = self._lbl.fit(matrix, classes)
			print('Fitting complete...')
			self._has_fit = True
		output = self._lbl.predict(self._matrix_database.make_row_from_recipe(ingredients).toarray())
		return output[0]
Example #24
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    prop = LabelPropagation(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            max_iter=MI,
                            n_jobs=-1)
    prop.fit(xTrain, yTrain)
    evaledY = prop.predict(xTrain)
    #def stats(trainY,evaledY,expectedY,day_one): return
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, evaledY, yExpect, day_one)

    results = [
        'HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'HC.csv'
    write_csv(file_name, results)
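The 'HC' tag stands for hard clamping: LabelPropagation resets the originally labeled points to their given labels at every iteration. LabelSpreading relaxes this with a clamping factor alpha; a companion sketch under the same call shape (the alpha value is illustrative):

from sklearn.semi_supervised import LabelSpreading

def soft_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6, alpha=0.2):
    # alpha in (0, 1): the fraction of each node's label taken from its
    # neighbors per iteration; the remainder is kept from the initial label
    spread = LabelSpreading(kernel=kernel, n_neighbors=k, gamma=g,
                            alpha=alpha, max_iter=MI, n_jobs=-1)
    spread.fit(xTrain, yTrain)
    return spread.predict(xTrain)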
Example #25
def semi_modelling(X_train, X_l, X_u, y_l, y_train, label_column):
    lp_model = LabelPropagation(gamma=1,
                                kernel='rbf',
                                max_iter=100000,
                                n_jobs=-1,
                                n_neighbors=7,
                                tol=0.001)
    tt = TriTraining([
        ExtraTreesClassifier(max_depth=None,
                             max_features='sqrt',
                             min_samples_leaf=1,
                             min_samples_split=10,
                             n_estimators=100,
                             random_state=200),
        RandomForestClassifier(max_depth=50,
                               max_features='sqrt',
                               min_samples_leaf=1,
                               min_samples_split=2,
                               n_estimators=100),
        XGBClassifier(max_depth=50, n_estimators=50, random_state=200)
    ])
    pseudo = PseudoLabeler(ExtraTreesClassifier(max_depth=None,
                                                max_features='log2',
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                n_estimators=100,
                                                random_state=200),
                           X_u,
                           X_u.columns,
                           label_column,
                           sample_rate=0.3)

    lp_model.fit(X_train.values, y_train)
    tt.fit(X_l.values, y_l.values, X_u.values)
    pseudo.seed = 42
    pseudo.fit(X_l, y_l)

    lp_predict = lp_model.predict(X_u.values)
    tt_predict = tt.predict(X_u.values)
    pse_predict = pseudo.predict(X_u)
    prediction_combine = np.vstack((lp_predict, tt_predict, pse_predict)).T
    return prediction_combine
Example #26
def tryLabelPropagation(goFast):
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.semi_supervised import LabelPropagation
  from sklearn.metrics import accuracy_score
  # sklearn.grid_search was removed; ParameterGrid now lives in model_selection
  from sklearn.model_selection import ParameterGrid

  propOperator = LabelPropagation(gamma=150)

  propOperator.fit(training_data[:3000],training_labels[:3000])
  score = accuracy_score(validation_labels, propOperator.predict(validation_data))
  print(score)
Example #27
    def evaluate_model(self, X, Y, gamma, seed, max_iter=100000):
        #set random seed:
        np.random.seed(seed)

        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            stratify=Y,
                                                            test_size=0.20,
                                                            random_state=seed)

        lp_model = LabelPropagation(kernel='rbf',
                                    gamma=gamma,
                                    max_iter=max_iter)

        lp_model.fit(X_train, Y_train)

        #test model on validation data
        predicted_labels = lp_model.predict(X_test)
        predicted_prob = lp_model.predict_proba(X_test)

        #get just the labeled testing data:
        labeled_prob = [
            p[1] for i, p in enumerate(predicted_prob) if Y_test[i] in [0, 1]
        ]
        labels = [
            p for i, p in enumerate(predicted_labels) if Y_test[i] in [0, 1]
        ]
        true_labels = [l for l in Y_test if l in [0, 1]]

        #evaluation
        accuracy = metrics.accuracy_score(true_labels, labels)
        precision = metrics.precision_score(true_labels, labels)
        auc = metrics.roc_auc_score(true_labels, labeled_prob)
        conf = metrics.confusion_matrix(true_labels, labels)

        return accuracy, precision, auc, conf
Example #28
def simple_test():
    # read and split data
    read_data = ReadDataset()
    x, y = read_data.read("dna")
    x_l, x_u, y_l, y_u = train_test_split(x,
                                          y,
                                          test_size=0.99,
                                          random_state=40)
    print("shape of labeled part:")
    print(x_l.shape, y_l.shape)
    print("shape of unlabeled part:")
    print(x_u.shape, y_u.shape)
    print("class distribution of labeled examples:")
    print([np.sum(y_l == i) for i in range(len(np.unique(y)))])
    print("class distribution of unlabeled examples:")
    print([np.sum(y_u == i) for i in range(len(np.unique(y)))])
    print()

    # partially labeled view
    x_train, y_train, y_u_shuffled = partially_labeled_view(x_l, y_l, x_u, y_u)

    # purely supervised classification
    print("random forest:")
    t0 = time.time()
    model = RandomForestClassifier(n_estimators=200,
                                   oob_score=True,
                                   n_jobs=-1,
                                   random_state=40)
    model.fit(x_l, y_l)
    y_pred = model.predict(x_u)
    acc = [accuracy_score(y_u, y_pred)]
    f1 = [f1_score(y_u, y_pred, average="weighted")]
    print("accuracy:", acc[0])
    print("f1-score:", f1[0])
    t1 = time.time()
    print("random forest is done")
    print("time:", t1 - t0, "seconds")
    print()

    # label propagation
    print("label propagation:")
    t0 = time.time()
    label_prop_model = LabelPropagation(gamma=0.01, n_jobs=-1, tol=1e-3)
    label_prop_model.fit(x_train, y_train)
    y_pred = label_prop_model.predict(x_train[y_train == -1, :])
    acc.append(accuracy_score(y_u_shuffled, y_pred))
    f1.append(f1_score(y_u_shuffled, y_pred, average="weighted"))
    print("accuracy:", acc[1])
    print("f1-score:", f1[1])
    t1 = time.time()
    print("label propagation is done!")
    print("time:", t1 - t0, "seconds")
    print()

    # tsvm
    print("tsvm:")
    t0 = time.time()
    y_u_shuffled, y_pred = tsvm.ova_tsvm(x_l,
                                         y_l,
                                         x_u,
                                         y_u,
                                         db_name="dna",
                                         timeout=None)
    acc.append(accuracy_score(y_u_shuffled, y_pred))
    f1.append(f1_score(y_u_shuffled, y_pred, average="weighted"))
    print("accuracy:", acc[2])
    print("f1-score:", f1[2])
    t1 = time.time()
    print("tsvm is done!")
    print("time:", t1 - t0, "seconds")

    # multi-class self-learning algorithm with fixed theta
    theta = 0.7
    max_iter = 10
    print("fsla with theta={}:".format(theta))
    t0 = time.time()
    model = sl.fsla(x_l, y_l, x_u, theta, max_iter, random_state=40)
    y_pred = model.predict(x_u)
    acc.append(accuracy_score(y_u, y_pred))
    f1.append(f1_score(y_u, y_pred, average="weighted"))
    print("accuracy:", acc[3])
    print("f1-score:", f1[3])
    t1 = time.time()
    print("fsla is done!")
    print("time:", t1 - t0, "seconds")
    print()

    # multi-class self-learning algorithm
    print("msla:")
    t0 = time.time()
    model, thetas = sl.msla(x_l, y_l, x_u, random_state=40)
    y_pred = model.predict(x_u)
    print("optimal theta at each step:")
    print(thetas)
    acc.append(accuracy_score(y_u, y_pred))
    f1.append(f1_score(y_u, y_pred, average="weighted"))
    print("accuracy:", acc[4])
    print("f1-score:", f1[4])
    t1 = time.time()
    print("msla is done!")
    print("time:", t1 - t0, "seconds")
    print()

    # plot a graph
    plot_graph(acc, f1)
Example #29
        #########clf = LogisticRegression(multi_class='auto', solver='lbfgs').fit(X_train, y_train)
        ######### clf = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)


        X = X[idx,:]
        y = y[idx]
        z = z[idx]
        X_train = X
        X_test = X[n_train:]
        y_train = np.concatenate([y[:n_train], -1*np.ones([n-n_train])])
        y_test = y[n_train:]
        z_test = z[n_train:]
        g = np.mean(pairwise_distances(X))
        clf = LabelPropagation(gamma=g).fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        res = 100 * np.sum(y_pred == y_test) / y_test.shape[0]

        idx_1 = (z_test == 1)
        res_1 = 100 * np.sum(y_pred[idx_1] == y_test[idx_1]) / np.sum(idx_1)

        idx_0 = (z_test == 0)
        res_0 = 100 * np.sum(y_pred[idx_0] == y_test[idx_0]) / np.sum(idx_0)


        res_diff = np.abs(res_1 - res_0)
        res_var = np.var([res_1, res_0])

        res_total.append(res)
        res_1_total.append(res_1)
class SentimentAnalysis(object):
    '''
    Identify a sentiment for each document.
    '''


    def __init__(self):
        self.model = LabelPropagation()#(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()
    
    def readingDatabase(self):
        da = DocumentsAccess()
        
        filePos = "Database/Sentiment/Sentiment/PoliceRelations/positive.xlsx"
        sheet = "Sheet1"
        posData = da.readingDatabaseTetum(filePos, sheet)
        posData = posData[0].tolist()
        print(len(posData))
        print(posData[0])

        
 
        fileNeg = "Database/Sentiment/Sentiment/PoliceRelations/negative.xlsx"
        sheet = "Sheet1"
        negData = da.readingDatabaseTetum(fileNeg, sheet)  
        negData = negData[0].tolist() 
        print(len(negData))
        print(negData[0])
       
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist() 
        print(len(unlabeledData))
        
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist() 
        print(len(unlabeledData2))
        
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist() 
        
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        ''' 

        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        
        print(len(unlabeledData))
        print(unlabeledData[0])

       
        return (posData, negData, unlabeledData) 
    
    def preprocessData(self, X):   
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)       
    def train(self, X, Y):
        '''
            Goal: Batch training (Use only one time at the beginning. After that, use updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)    
    def test(self, X):
        '''
            Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)   
        return predictedY      
       
    def updateNewInformation(self, x1, y1):
        '''
            Goal: Update the information from the new data (Online Learning)
            Run re-train model at weekend
        '''
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable
    
    def featureExtractionPredict(self, X):
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result    
       
    def run(self):
        # Reading data
        (posData, negData, unlabeledData)  = self.readingDatabase()
        print "Finish reading database."
           
        # Divide training and test data
        cut = 10
        posDataTrain = posData[:cut]
        negDataTrain = negData[:cut]
        posDataTest = posData[cut:]
        negDataTest = negData[cut:]        
        
        random.seed(123456)
        # Training
        X_train = posDataTrain + negDataTrain + unlabeledData
        Y_train = np.ones((len(posDataTrain)), dtype=int).tolist() + np.zeros((len(negDataTrain)), dtype=int).tolist() + (-1 * np.ones((len(unlabeledData)), dtype=int)).tolist()
        z = list(zip(X_train, Y_train))  # list() so it can be shuffled under Python 3
        random.shuffle(z)
        X_train, Y_train = zip(*z)
        self.train(X_train, Y_train)
        
        # Testing
        X_test = posDataTest + negDataTest
        Y_test = np.ones((len(posDataTest)), dtype=int).tolist() + np.zeros((len(negDataTest)), dtype=int).tolist()
        z = list(zip(X_test, Y_test))
        random.shuffle(z)
        X_test, Y_test = zip(*z)
        Y_predicted = self.test(X_test)
        print(Y_predicted)

        # evaluation() returns four ("metric", value) pairs
        (accuracy, precision, recall, f1) = self.evaluation(Y_test, Y_predicted)
        print(accuracy, precision, recall, f1)
Example #31

# In[17]:

lspr = LP(gamma=70)  # LP is presumably an alias for LabelPropagation
lspr.fit(X_norm,Ytrain)


# In[15]:

print('nofClasses: ',lspr.classes_)


# In[16]:

pred = lspr.predict(X_norm)
notN = [1 for i in pred if i>0.0]
print(sum(notN))


# In[12]:

Y_pred = lspr.predict_proba(X_test)


# In[13]:

print(Y_pred.shape)


# In[ ]:
Example #32
            date_time = '{0:02d}_{1:02d}_{2:02d}_{3:02d}_{4:02d}'.format((now.year%2000),
                                                                         now.month, now.day,
                                                                         now.hour, now.minute)

            #classification Type
            #clf = OneVsOneClassifier(clf)
            #clf = OneVsRestClassifier(clf)

            logging.info("start with training ")
            clf.fit(X_train, y_train)
            #y_pred = clf.predict(X_valid)
            #print("min:{0}  max:{0}".format(y_pred.min(),y_pred.max()))
            #score = accuracy_score(y_valid, y_pred, True)

            print("found classes are {0}".format(clf.classes_))

            y_test = clf.predict(X_test)
            y_test = y_test.astype(np.uint32)

            lib_IO.write_Y("Data/pr4/{0}_{1}_{2}_{3}".format(name,param["kernel"],extra_param,date_time),y_test,Ids=ids)
            #Gridsearch
            #grid_search = GridSearchCV(clf, param, scoring='accuracy',cv=10, n_jobs=-1, verbose=1)
            #grid_search.fit(X_train, y_train)

            #clf_tmp = grid_search.best_estimator_
            #score = grid_search.best_score_
            #best_param = grid_search.best_params_

            #lib_IO.log_best_param_score(date_time,name,score,param)
            clf = None
            #time.sleep(30)
Example #33
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
labels = np.copy(iris.target)
random_unlabeled_points = np.random.rand(len(iris.target))
random_unlabeled_points = random_unlabeled_points < 0.7
Y = labels[random_unlabeled_points]
labels[random_unlabeled_points] = -1
print("Unlabeled Number:", list(labels).count(-1))

from sklearn.semi_supervised import LabelPropagation
label_prop_model = LabelPropagation()
label_prop_model.fit(iris.data, labels)
Y_pred = label_prop_model.predict(iris.data)
Y_pred = Y_pred[random_unlabeled_points]
from sklearn.metrics import accuracy_score, recall_score, f1_score
print("ACC:", accuracy_score(Y, Y_pred))
print("REC:", recall_score(Y, Y_pred, average="micro"))
print("F-Score", f1_score(Y, Y_pred, average="micro"))
Example #34
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
import numpy as np
 
#K nearest neighbors model ensures we don't run over our memory
#rbf kernel needs a complete graph, so it requires feature selection
lp_model = LabelPropagation(kernel='knn')  # Label Propagation model
Xtr = np.genfromtxt("data/Kaggle.X1.train.txt", delimiter=',')  # Get X training data
Ytr_labels = np.genfromtxt("data/Kaggle.Y.labels.train.txt", delimiter=',')  # Get classification data


#Unlabeled points - random size for now.
#np.random.random_integers is deprecated; randint(0, 2) draws the same {0, 1} values
unlabeled_points = np.where(np.random.randint(0, 2, size=len(Ytr_labels)))
labels = np.copy(Ytr_labels) #Save training labels for testing
labels[unlabeled_points] = -1  #Set unlabeled value, classes : 0, 1
lp_model.fit(Xtr,labels) #Train

#############################################
#   Models use n_neighbors and max_iteration to control kernel
#############################################

#############################################
#   Test Functions
#############################################
#Mean squared Error
yhat = lp_model.predict(Xtr)
mse = metrics.mean_squared_error(Ytr_labels, yhat)

###############################################
# Cross Validation
###############################################
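As the banner above says, the knn kernel is controlled mainly through n_neighbors and max_iter. A minimal sketch of sweeping those two knobs with ParameterGrid (grid values are illustrative; the fit/score lines are left as comments because they depend on the Kaggle files loaded above):

from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({'n_neighbors': [3, 7, 15], 'max_iter': [500, 1000]})
for params in grid:
    model = LabelPropagation(kernel='knn', **params)
    # model.fit(Xtr, labels); metrics.mean_squared_error(Ytr_labels, model.predict(Xtr))
    print(params)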
Example #35
def build_models(trainX, trainY, testX, testY, source_pos, target_pos, window):
    #######################
    ### SEMI-SUPERVISED ###
    #######################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = checkAccuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = checkAccuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = checkAccuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = checkAccuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = checkAccuracy(testY, predND)
    #
    print("WITHOUT TL ACC_LR:", accLR, " ACC_DT:", accDT, " ACC_NB:", accNB)
    ########################
    #### WITH TL ########
    ########################

    ####################################################
    ### Kernel Mean Matching (Huang et al., 2006)
    ###
    # Decision Tree
    print("\n Kernel Mean Matching (Huang et al., 2006) ")
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_KMM, acc_DT_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_KMM)
    # Logistic Regression
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_KMM, acc_LR_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_KMM)
    # Naive Bayes Bernoulli
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_KMM, acc_NB_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_KMM)
    ####################################################
    ### Nearest-neighbour-based weighting (Loog, 2015)
    ###
    # Decision Tree
    print("\n Nearest-neighbour-based weighting (Loog, 2015)    ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_NN, acc_DT_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_NN)
    # Logistic Regression
    print("\n Nearest-neighbour-based weighting (Loog, 2015)    ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_NN, acc_LR_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_NN)
    # Naive Bayes Bernoulli
    print("\n Nearest-neighbour-based weighting (Loog, 2015)    ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_NN, acc_NB_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_NN)

    ####################################################
    ### Transfer Component Analysis (Pan et al, 2009)
    ###
    # Decision Tree
    print("\n Transfer Component Analysis (Pan et al, 2009)")
    classifier = TransferComponentClassifier(loss="dtree", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_TCA, acc_DT_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_TCA)
    # Logistic Regression
    classifier = TransferComponentClassifier(loss="logistic", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_TCA, acc_LR_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_TCA)
    # Naive Bayes Bernoulli
    classifier = TransferComponentClassifier(loss="berno", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_TCA, acc_NB_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_TCA)

    ####################################################
    ### Subspace Alignment (Fernando et al., 2013)
    ###
    # Decision Tree
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_SA, acc_DT_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_SA)
    # Logistic Regression
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_SA, acc_LR_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_SA)
    # Naive Bayes Bernoulli
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_SA, acc_NB_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_SA)
    #################################
    ############# ENSEMBLE ##########
    #################################
    classifier_SA_DT = SubspaceAlignedClassifier(loss="dtree")
    classifier_SA_LR = SubspaceAlignedClassifier(loss="logistic")
    classifier_SA_NB = SubspaceAlignedClassifier(loss="berno")
    classifier_TCA_DT = TransferComponentClassifier(loss="dtree")
    classifier_TCA_LR = TransferComponentClassifier(loss="logistic")
    classifier_TCA_NB = TransferComponentClassifier(loss="berno")
    classifier_NN_DT = ImportanceWeightedClassifier(iwe='nn', loss="dtree")
    classifier_NN_LR = ImportanceWeightedClassifier(iwe='nn', loss="logistic")
    classifier_NN_NB = ImportanceWeightedClassifier(iwe='nn', loss="berno")
    classifier_KMM_DT = ImportanceWeightedClassifier(iwe='kmm', loss="dtree")
    classifier_KMM_LR = ImportanceWeightedClassifier(iwe='kmm',
                                                     loss="logistic")
    classifier_KMM_NB = ImportanceWeightedClassifier(iwe='kmm', loss="berno")
    #
    eclf = EnsembleClassifier(
        clfs=[classifier_TCA_DT, classifier_NN_DT, classifier_KMM_DT])
    eclf.fit(trainX, trainY, testX)
    pred = eclf.predict_v2(testX)
    acc_ENSEMBLE, acc_ENSEMBLE_INFO = checkAccuracy(testY, pred)

    ########################
    #### RETURN ########
    ########################
    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_ENSEMBLE': acc_ENSEMBLE,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO),
        'acc_LR_KMM': acc_LR_KMM,
        'acc_LR_KMM_INFO': str(acc_LR_KMM_INFO),
        'acc_LR_NN': acc_LR_NN,
        'acc_LR_NN_INFO': str(acc_LR_NN_INFO),
        'acc_LR_TCA': acc_LR_TCA,
        'acc_LR_TCA_INFO': str(acc_LR_TCA_INFO),
        'acc_LR_SA': acc_LR_SA,
        'acc_LR_SA_INFO': str(acc_LR_SA_INFO),
        'acc_DT_KMM': acc_DT_KMM,
        'acc_DT_KMM_INFO': str(acc_DT_KMM_INFO),
        'acc_DT_NN': acc_DT_NN,
        'acc_DT_NN_INFO': str(acc_DT_NN_INFO),
        'acc_DT_TCA': acc_DT_TCA,
        'acc_DT_TCA_INFO': str(acc_DT_TCA_INFO),
        'acc_DT_SA': acc_DT_SA,
        'acc_DT_SA_INFO': str(acc_DT_SA_INFO),
        'acc_NB_KMM': acc_NB_KMM,
        'acc_NB_KMM_INFO': str(acc_NB_KMM_INFO),
        'acc_NB_NN': acc_NB_NN,
        'acc_NB_NN_INFO': str(acc_NB_NN_INFO),
        'acc_NB_TCA': acc_NB_TCA,
        'acc_NB_TCA_INFO': str(acc_NB_TCA_INFO),
        'acc_NB_SA': acc_NB_SA,
        'acc_NB_SA_INFO': str(acc_NB_SA_INFO)
    }])
class IndicatorIdentifier(object):
    '''
    Identify an indicator for a document
    '''


    def __init__(self):
        self.model = LabelPropagation() #(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()
    
    def readingDatabase(self):
        da = DocumentsAccess()
        
        labeledFile = "Database/Indicator/Indicators.xlsx"
        sheet = "Sheet1"
        df = da.readingDatabaseTetum(labeledFile, sheet, head= 0)


        cut = int(0.8*df.shape[0])    
        # re-duplicate the data => Result: one document has one label only
        columns = df.columns.tolist()
        columns.remove("Content")
        print(columns)
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []
        for index, row in df.iterrows():
            labels = list(set([row[col] for col in columns if not pd.isnull(row[col])]))
            content = row["Content"]
            if index < cut: # training part
                for label in labels:
                    X_train.append(content)
                    Y_train.append(label)
            else:
                X_test.append(content)
                Y_test.append(labels)
                
                
           
       
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist() 
        print(len(unlabeledData))
        
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist() 
        print(len(unlabeledData2))
        
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist() 
        
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        ''' 

        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        
        print(len(unlabeledData))
        #print unlabeledData[0]
        
        return (X_train, Y_train, X_test, Y_test, unlabeledData) 
    
    def preprocessData(self, X):   
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)       
    def train(self, X, Y):
        '''
            Goal: Batch training (Use only one time at the beginning. After that, use updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)    
    def test(self, X):
        '''
            Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)   
        return predictedY      
       
    def updateNewInformation(self, x1, y1):
        '''
            Goal: Update the information from the new data (Online Learning)
            Run re-train model at weekend
        '''
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable
    
    def featureExtractionPredict(self, X):
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result    
       
    def run(self):
        # Reading data
        (X_train, Y_train, X_test, Y_test, unlabeledData)   = self.readingDatabase()
        print "Training size: " + str(len(X_train))
        print "Test size: " + str(len(X_test))
        '''
        X_train = X_train[:100]
        Y_train = Y_train[:100]
        X_test = X_test[:100]
        Y_test = Y_test[:100]
        '''
        print "Finish reading database."
        #print FreqDist(indicators).most_common()
        k = 0
        dictLabel = FreqDist(Y_train)
        for key in dictLabel:
            dictLabel[key] = k
            k+=1
        Y_train = [dictLabel[ind] for ind in Y_train]
        Y_test = [[dictLabel[ind] for ind in labels] for labels in Y_test]
        
        '''       
        random.seed(123456)
        # Training
        z = zip(labeledData, indicators)
        random.shuffle(z)
        labeledData, indicators = zip(*z)
        
        X_train = list(labeledData[:cut])
        Y_train = list(indicators[:cut])
        X_test = list(labeledData[cut:])
        Y_test = list(indicators[cut:])
        '''
        X_train += unlabeledData
        Y_train += (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()

        #pprint(X_train)
        #print Y_train
        
        #print X_train[cut-2:cut+2]
        #print Y_train[cut-2:cut+2]
        print "Training..."
        self.train(X_train, Y_train)
        
        # Testing
        print "Testing..."
        Y_predicted = self.test(X_test)

        print Y_predicted

        # A prediction counts as correct if it matches ANY of the sample's true labels
        for i in range(len(Y_predicted)):
            lab = Y_predicted[i]
            if lab in Y_test[i]:
                Y_test[i] = lab
            else:
                Y_test[i] = -1
        ((_, accuracy), _, _, _) = self.evaluation(Y_test, Y_predicted)
        print accuracy  
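# Not in the original: a tiny self-contained illustration of the scoring trick
# used in run() above. A prediction counts as correct if it matches ANY of a
# sample's admissible labels; otherwise the true label is forced to -1 so that
# the pair disagrees.
from sklearn import metrics as _demo_metrics

Y_pred_demo = [0, 2, 1]
Y_true_demo = [[0, 1], [1], [1, 2]]  # each sample may have several true labels
for i in range(len(Y_pred_demo)):
    Y_true_demo[i] = Y_pred_demo[i] if Y_pred_demo[i] in Y_true_demo[i] else -1
print(_demo_metrics.accuracy_score(Y_true_demo, Y_pred_demo))  # 2 of 3 correct -> 0.666...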
def rses_model(neighbor_delta, labeled_data, unlabeled_data, labels, name,
               alpha):  # note: alpha is accepted but never used in this snippet
    def compute_NDER(dataset, radius, A, test_x, test_y):
        """

        :param A: feature set
        :param X: index of samples
        :param d:
        :return:
        """
        if not len(A):
            return 1

        clf = Neighborhood_Classifiers(dataset, x, y, A, radius)
        pre = []
        for s in test_x:
            pre.append(clf.predict(s))
        cnt = np.sum(np.not_equal(pre, test_y[test_x]).astype(int), axis=0)
        NDER = cnt / l
        return NDER

    # Generate pseudo-labels for the unlabelled data via label propagation (LPA)
    lp_model = LabelPropagation()
    lp_model.fit(labeled_data, np.reshape(labels, len(labels)))
    y_inductive = lp_model.predict(unlabeled_data)

    # Merge labelled and pseudo-labelled data
    x = np.concatenate((labeled_data, unlabeled_data), axis=0)
    y_labeled = np.reshape(labels, (len(labels)))
    y = np.concatenate((y_labeled, y_inductive), axis=0)
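    # Side note (sketch, not in the original): scikit-learn's canonical
    # transductive pattern would mark the unlabelled rows with -1 and read the
    # inferred labels back from the fitted model's `transduction_` attribute:
    #   lp = LabelPropagation()
    #   lp.fit(x, np.concatenate((y_labeled, np.full(len(unlabeled_data), -1))))
    #   y = lp.transduction_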

    # Rough-set-based semi-supervised feature selection via an ensemble selector.
    # n is the number of decision classes.
    n = np.max(labels) + 1
    l, c = x.shape
    AT = set(range(c))  # all attribute (column) indices
    A = set()           # current reduct
    # Baseline NDER over the full attribute set
    X = list(range(len(x)))
    NDER = compute_NDER(name, neighbor_delta, AT, X, y)

    # Group sample indices by class
    Xi = {i: np.where(y == i)[0].tolist() for i in range(n)}

    while True:
        # One best candidate attribute per class is collected in C; a list
        # (not a set) so that Counter below can count repeated selections.
        C = []
        for i in range(n):
            print("Round {}".format(i))
            max_phi = -1
            b = -1
            # l_nder1 does not depend on a, so compute it once per class
            l_nder1 = compute_NDER(name, neighbor_delta, A, Xi[i], y)
            for a in AT.difference(A):
                A.add(a)
                l_nder2 = compute_NDER(name, neighbor_delta, A, Xi[i], y)
                A.remove(a)
                phi = l_nder1 - l_nder2
                print("phi of attribute {} = {}".format(a, phi))
                if phi > max_phi:
                    max_phi = phi
                    b = a
            C.append(b)
            print("Selected attribute {}".format(b))
        counter = Counter(C)
        b, t = counter.most_common(1)[0]  # most frequently chosen attribute; ties keep first-seen order
        print("{} occurs most often".format(b))
        if b == -1:
            break
        A.add(b)
        nder_A = compute_NDER(name, neighbor_delta, A, X, y)
        if nder_A <= NDER:
            break
        print("red={}".format(A))
    return A, 0
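# Hypothetical usage of rses_model (not in the original; it depends on the
# external Neighborhood_Classifiers class, so this is only a sketch with
# assumed data shapes and names):
#   red, _ = rses_model(neighbor_delta=0.2,
#                       labeled_data=X[:100], unlabeled_data=X[100:],
#                       labels=y[:100], name='wine', alpha=0.5)
#   print("selected feature indices:", red)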
def main(argv):
    trainFile = None
    testFile = None
    outFile = None

    try:
        opts, args = getopt.getopt(argv, "hi:t:o:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == '-i':
            trainFile = arg
        elif opt == '-t':
            testFile = arg
        elif opt == '-o':
            outFile = arg
        else:
            usage()
            print('Invalid argument %s' % opt)
            sys.exit(2)

    if trainFile is None or testFile is None or outFile is None:
        print("Missing arguments")
        usage()
        sys.exit(2)

    facialData = pd.read_csv(trainFile)
    testData = pd.read_csv(testFile)

    testData.drop(columns=['id'], inplace=True)
    testData.reset_index(inplace=True, drop=True)

    labels = testData['class']
    # Binarise: 'deceptive' -> 1, everything else -> 0
    classLabels = [1 if label == 'deceptive' else 0 for label in labels]
    testData.drop(columns=['class'], inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(testData,
                                                        classLabels,
                                                        test_size=0.2,
                                                        stratify=classLabels,
                                                        random_state=42)

    X_train.insert(1, "class", y_train)
    sns.countplot(x="class", data=X_train)
    X_train = X_train.drop(columns=['class'])

    # Label Propagation
    modelLabelProp = LabelPropagation()
    labels = [-1] * len(facialData[:10000])
    labels.extend(y_train)
    inputData = pd.concat([facialData[:10000], X_train],
                          sort=False,
                          ignore_index=True,
                          copy=False)
    modelLabelProp.fit(inputData, labels)
    yPred = modelLabelProp.predict(X_test)
    print("LABEL PROPAGATION:")
    metricNPlot(modelLabelProp, X_test, y_test, yPred)

    with open(outFile, 'w') as f:
        f.write("Label Propagation prediction\n")
        for item in yPred:
            f.write("%s\n" % item)

    # Label Spreading
    modelLabelSpread = LabelSpreading(kernel='knn', n_neighbors=15)
    labels = [-1] * len(facialData[:10000])
    labels.extend(y_train)
    inputData = pd.concat([facialData[:10000], X_train],
                          sort=False,
                          ignore_index=True,
                          copy=False)
    modelLabelSpread.fit(inputData, labels)
    yPred = modelLabelSpread.predict(X_test)
    print("LABEL SPREADING:")
    metricNPlot(modelLabelSpread, X_test, y_test, yPred)

    with open(outFile, 'a') as f:
        f.write("Label Spreading prediction\n")
        for item in yPred:
            f.write("%s\n" % item)

    height = [0.8, 0.68]  # hard-coded summary scores, not computed from the models above (see the sketch after this example)
    bars = ('Label Propagation', 'Label Spreading')
    y_pos = np.arange(len(bars))
    plt.title("Performance Comparison")
    plt.bar(y_pos, height, color=['cyan', 'red'])
    plt.xticks(y_pos, bars)
    plt.show()
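# Sketch (not in the original): the bar heights above are hard-coded. They
# could instead be derived from the fitted models with standard sklearn API:
#   from sklearn.metrics import accuracy_score
#   height = [accuracy_score(y_test, modelLabelProp.predict(X_test)),
#             accuracy_score(y_test, modelLabelSpread.predict(X_test))]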
Beispiel #39
0
# Imports and nb_samples are not shown in this snippet; the values below are
# assumptions added so it runs stand-alone.
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.semi_supervised import LabelPropagation

nb_samples = 1000  # assumed total sample count (only nb_unlabeled was given)
nb_unlabeled = 750

if __name__ == '__main__':
    # Create the dataset
    X, Y = make_classification(n_samples=nb_samples,
                               n_features=2,
                               n_informative=2,
                               n_redundant=0,
                               random_state=100)
    Y[nb_samples - nb_unlabeled:nb_samples] = -1

    # Create a LabelPropagation instance and fit it
    lp = LabelPropagation(kernel='rbf', gamma=10.0)
    lp.fit(X, Y)

    Y_final = lp.predict(X)

    # Show the final result
    sns.set()
    fig, ax = plt.subplots(1, 2, figsize=(18, 8))

    ax[0].scatter(X[Y == 0, 0],
                  X[Y == 0, 1],
                  color='#88d7f0',
                  marker='s',
                  s=100,
                  label="Class 0")
    ax[0].scatter(X[Y == 1, 0],
                  X[Y == 1, 1],
                  color='#55ffec',
                  marker='o',
                  s=100,
                  label="Class 1")  # trailing arguments completed to mirror the Class 0 call; the original snippet breaks off here
Beispiel #40
0
tempind[['PassengerId', 'Ticket', 'Cabin']] = False  # mask out identifier columns

tempage = tempage.loc[:,tempind]

tempage_labeled = tempage[tempage['Age'].notnull()]
newlabels = tempage_labeled['Age'].astype(int)
plabels =  tempage_labeled.loc[:,tempage_labeled.columns != 'Age'].astype(int)


tempage_unlabeled = tempage[tempage['Age'].isnull()]
unlabeled =  tempage_unlabeled.loc[:,tempage_unlabeled.columns != 'Age'].astype(int)

# LabelPropagation is a classifier, so each integer age is treated as a discrete class
label_prop_modelage = LabelPropagation()
label_prop_modelage.fit(plabels,newlabels)
badind = temp[temp['Age'].isnull()]['Age'].index
newages = pd.Series(label_prop_modelage.predict(unlabeled),index = badind)
temp['Age'].fillna(newages, inplace = True)

temp['Embarked'].fillna('S', inplace = True)

temp.drop('Cabin', axis = 'columns',inplace = True)
print(temp.info())



h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]