def load_all_data():
    """Load the feature matrix, impute missing stage labels, and split.

    Reads ``feature_stage_data_all.ftr``, holds out rows 4977..7976 as a
    fixed validation slice, fills unlabeled (NaN) stages via KNN label
    propagation, then does an 80/20 train/test split.

    Returns:
        (x_tr, y_tr, x_te, y_te, x_va, y_va) as numpy arrays.
    """
    # Read and partition the matrix; the first 3 columns are metadata.
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = normalize(data[data.columns[3:]].values)
    y = data['stage'].values

    # Fixed contiguous validation slice.
    # NOTE(review): the 4977/3000 offsets look dataset-specific — confirm.
    x_va = x[4977:4977 + 3000]
    y_va = y[4977:4977 + 3000]
    x = np.concatenate((x[:4977], x[4977 + 3000:]))
    y = np.concatenate((y[:4977], y[4977 + 3000:]))

    # Split into observed (labeled) and NaN (unlabeled) rows.
    labeled = np.invert(np.isnan(y))
    x_obs = x[labeled]
    y_obs = y[labeled]
    x_nuls = x[np.isnan(y)]

    # Impute the missing stage labels with KNN label propagation.
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x = np.concatenate([x_obs, x_nuls], axis=0)
    y = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
Exemple #2
0
    def testing_predictions(self,
                            test_data,
                            model,
                            num_pcs,
                            gamma=False,
                            max_iter=1000000,
                            mean=False):
        """Return P(class 1) for ``test_data`` projected onto ``num_pcs`` PCs.

        If ``mean`` is falsy, the supplied fitted ``model`` is used directly.
        Otherwise a fresh rbf LabelPropagation is refit once per seed in
        ``self.seeds`` and the per-sample mean probability is returned.
        """
        pca_data = self.principal_components(test_data, self.pca, num_pcs)
        if not mean:
            return np.array([p[1] for p in model.predict_proba(pca_data)])

        train_pca_data = self.principal_components(self.X, self.pca, num_pcs)

        # Accumulate one probability vector per seed. The original used a
        # "" sentinel and compared a numpy array against "" after the first
        # vstack, which is an unreliable elementwise comparison; collect
        # rows in a list and stack once instead.
        runs = []
        for seed in self.seeds:
            np.random.seed(seed)

            model = LabelPropagation(kernel='rbf',
                                     gamma=gamma,
                                     max_iter=max_iter)
            model.fit(train_pca_data, self.Y)

            runs.append(
                np.array([p[1] for p in model.predict_proba(pca_data)]))

        # Mean across runs for each sample.
        return np.mean(np.vstack(runs), axis=0)
Exemple #3
0
def test_LabelPropagation_rbf(*data):
    """Plot LabelPropagation rbf-kernel score vs gamma, one curve per alpha.

    ``data`` unpacks to (samples, labels, indices of unlabeled samples);
    unlabeled samples are marked with -1 before fitting.
    """
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One distinct RGB color per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0),
              (0, 0.5, 0.5), (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0),
              (0, 0.6, 0.4), (0.5, 0.3, 0.2))

    # Fit one model per (alpha, gamma) pair; score on the held-out labels.
    for alpha, color in zip(alphas, colors):
        scores = [
            LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha,
                             kernel='rbf')
            .fit(x, y_train)
            .score(x[unlabeled_indices], y[unlabeled_indices])
            for gamma in gammas
        ]
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure cosmetics.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Exemple #4
0
def test_LabelPropagation_rbf(*data):
    '''
    Test LabelPropagation with the rbf kernel: how prediction performance
    on the unlabeled subset varies with gamma.

    ``data`` unpacks to (samples, labels, indices of unlabeled samples).
    '''
    X, y, unlabeled_indices = data
    # Copy first: y itself is still needed for scoring below.
    y_train = np.copy(y)
    # Unlabeled samples are marked with -1.
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    gammas = np.logspace(-2, 2, num=50)

    scores = []
    for gamma in gammas:
        clf = LabelPropagation(max_iter=100, gamma=gamma, kernel='rbf')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(gammas, scores)

    ### Figure cosmetics.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    # NOTE(review): no artist carries a label, so this legend is a no-op.
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Exemple #5
0
def semiLabelPropagation(feature_extractor, generator, val_generator, kernel,
                         neighbors, gamma):
    """Semi-supervised classification of generator images via LabelPropagation.

    Extracts deep features with ``feature_extractor``, marks every sample
    whose filename starts with 'N' as unlabeled (-1), fits label
    propagation, and returns predicted classes for the validation set.
    """
    semi = LabelPropagation(kernel=kernel,
                            n_neighbors=neighbors,
                            gamma=gamma,
                            alpha=None,
                            tol=0.001,
                            max_iter=1000000)

    # NOTE(review): steps is samples/batch_size, which may be fractional —
    # confirm the generator API tolerates a float here.
    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples /
                                                   generator.batch_size,
                                                   verbose=1)

    # Copy so flagging unlabeled samples does not mutate the generator's
    # own ``classes`` array in place (the original aliased it directly).
    classes = np.copy(generator.classes)

    # Filenames beginning with 'N' denote unlabeled samples.
    for i in range(0, generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
Exemple #6
0
def test_LabelPropagation(*data):
    '''
    Demonstrate LabelPropagation: fit a knn model with unlabeled samples
    marked -1, then report accuracy on a separate prediction set.
    '''
    X, y, unlabeled_indices, XPredict, yTrue = data

    # Work on a copy: y itself must stay intact.
    y_train = np.copy(y)
    # Unlabeled samples carry the -1 marker.
    y_train[unlabeled_indices] = -1
    print(y_train)

    clf = LabelPropagation(max_iter=5, kernel='knn', n_neighbors=3, tol=1e-5)
    clf.fit(X, y_train)

    ### Accuracy on the held-out prediction set.
    predicted_labels = clf.predict(XPredict)
    print(XPredict)
    print("Accuracy:%f" % metrics.accuracy_score(yTrue, predicted_labels))
Exemple #7
0
def test_LabelPropagation_knn(*data):
    '''
    Test LabelPropagation with the knn kernel: how prediction performance
    on the unlabeled subset varies with n_neighbors.

    ``data`` unpacks to (samples, labels, indices of unlabeled samples).
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy: y itself is needed for scoring below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]

    scores = []
    for K in Ks:
        clf = LabelPropagation(max_iter=100, n_neighbors=K, kernel='knn')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(Ks, scores)

    ### Figure cosmetics.
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    # NOTE(review): no artist carries a label, so this legend is a no-op.
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
Exemple #8
0
def sklearn_lp(X, y,
               output=None,
               kernel='knn',
               gamma=None,
               n_neighbors=10,
               alpha=1,
               max_iter=1000,
               tol=0.00001):
    """Fit LabelPropagation on a 10% train split and print test metrics.

    Parameters mirror sklearn's LabelPropagation; ``output`` is accepted
    for interface compatibility but unused.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; the
    # modern home of train_test_split is sklearn.model_selection.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3)
    label_prop_model = LabelPropagation(kernel=kernel,
                                        gamma=gamma,
                                        n_neighbors=n_neighbors,
                                        alpha=alpha,
                                        max_iter=max_iter,
                                        tol=tol)
    label_prop_model.fit(X_train, y_train)

    y_predict = label_prop_model.predict(X_test)
    # Python 2 print statements converted to the py3 function form.
    print('y_train: ', y_train)
    print('y_predict: ', y_predict)

    print('+--------------------------------------------------------+')
    print('|                         Report                         +')
    print('+--------------------------------------------------------+')
    print(classification_report(y_test, y_predict))
    print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
    print('\n\n')
Exemple #9
0
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    """Hard-clamped label propagation over the training set.

    Fits LabelPropagation, converts per-class probabilities into
    benign/malware labels, computes stats, and appends a result row to a
    CSV file.

    NOTE(review): ``benign``, ``malware``, the 2-argument ``normalize``,
    ``stats``, ``yExpect``, ``day_one``, ``rate`` and ``write_csv`` are
    all resolved from module scope — confirm they exist at call time.
    """
    prop = LabelPropagation(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            max_iter=MI,
                            n_jobs=-1)
    prop.fit(xTrain, yTrain)
    # Class probabilities for the training samples themselves.
    predY = prop.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)
    labels = []
    # NOTE(review): when i[0] == i[1] nothing is appended, so ``labels``
    # can end up shorter than yTrain and misalign inside stats() —
    # confirm ties cannot occur or handle them explicitly.
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)

    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)

    results = [
        'HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'HC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def load_all_data():
    """Load the feature matrix, impute missing stage labels, oversample, split.

    Reads ``feature_stage_data_all.ftr``, holds out observations 8 and 9
    as the validation set, fills unlabeled (NaN) stages via KNN label
    propagation, balances classes with SMOTE, shuffles, and finally does
    an 80/20 train/test split.

    Returns:
        (x_tr, y_tr, x_te, y_te, x_va, y_va) as numpy arrays.
    """
    # Read and partition the matrix; the first 3 columns are metadata.
    data = pd.read_feather('./feature_stage_data_all.ftr')
    x = normalize(data[data.columns[3:]].values)
    y = data['stage'].values
    o = data.observation

    # Observations 8 and 9 form the validation set; everything else trains.
    x_va = x[[i in [8, 9] for i in o.values]]
    y_va = y[[i in [8, 9] for i in o.values]]
    x = x[[i not in [8, 9] for i in o.values]]
    y = y[[i not in [8, 9] for i in o.values]]
    # (Removed a dead ``o.unique()`` call whose result was discarded.)

    # Split into observed (labeled) and NaN (unlabeled) rows.
    labeled = np.invert(np.isnan(y))
    x_obs = x[labeled]
    y_obs = y[labeled]
    x_nuls = x[np.isnan(y)]

    # Impute the missing stage labels with KNN label propagation.
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

    # Over sample the stages to balance classes, then shuffle.
    zen = SMOTE(random_state=8675309)
    x, y = zen.fit_resample(x_all, y_all)
    x, y = shuffle(x, y, random_state=42)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
Exemple #11
0
def _label_propagation(df):
    """Fit LabelPropagation on features/labels derived from ``df`` and
    return transductive predictions for every row."""
    X = _generate_features(df)
    labels = _generate_labels(df)
    # for some reason pandas returns NaN for -1 values
    labels = labels.fillna(-1)
    # Densify once; the original called .toarray() separately for both
    # fit and predict, materializing the matrix twice.
    X_dense = X.toarray()
    label_prop_model = LabelPropagation()
    label_prop_model.fit(X_dense, labels)
    return label_prop_model.predict(X_dense)
Exemple #12
0
def test_LabelPropagation(*data):
    """Fit an rbf LabelPropagation with unlabeled points set to -1 and
    print the accuracy on those points."""
    X, y, unlabeled_indices = data

    # Work on a copy: the true labels in y are needed for scoring.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, y_train)

    true_labels = y[unlabeled_indices]
    print('Accuracy : %.2f' % clf.score(X[unlabeled_indices], true_labels))
Exemple #13
0
def test_LabelPropagation(*data):
    """Score an rbf-kernel LabelPropagation on the unlabeled subset."""
    x, y, unlabeled_indices = data
    # Copy first: the ground-truth labels are needed again below.
    y_train = np.copy(y)
    # Mark the unlabeled samples with -1.
    y_train[unlabeled_indices] = -1

    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)

    # Report accuracy against the true labels of the unlabeled points.
    true_labels = y[unlabeled_indices]
    print("Accuracy: %f" % clf.score(x[unlabeled_indices], true_labels))
Exemple #14
0
def ss_test(images, labels, unlabeled_images, test_images):
    """Fit LabelPropagation on labeled plus unlabeled images (the latter
    marked -1) and submit predictions for ``test_images``."""
    # Stack labeled and unlabeled rows; unlabeled rows get the -1 marker.
    all_images = np.vstack((images, unlabeled_images))
    all_labels = np.concatenate(
        (labels, -np.ones((unlabeled_images.shape[0], ))), axis=0)

    model = LabelPropagation()
    model.fit(all_images, all_labels)

    create_submission(model.predict(test_images))
Exemple #15
0
def ss_test(images, labels, unlabeled_images, test_images):
    """Semi-supervised run: propagate labels into the unlabeled pool,
    then predict on the test images and write a submission."""
    # -1 marks each unlabeled image for the propagation model.
    unlabeled_markers = -np.ones((unlabeled_images.shape[0],))

    model = LabelPropagation()
    model.fit(np.vstack((images, unlabeled_images)),
              np.concatenate((labels, unlabeled_markers), axis=0))

    test_labels = model.predict(test_images)
    create_submission(test_labels)
Exemple #16
0
def LP(source_train, target_test, label1, label3):
    """Train LabelPropagation on the source domain and evaluate on the
    target domain.

    Returns (accuracy, recall, f1, precision), all weighted averages.
    """
    clf = LabelPropagation()
    clf.fit(source_train, label1)
    source_predict = clf.predict(target_test)

    # Evaluation metrics (weighted for the multi-class case).
    accuracy = metrics.accuracy_score(label3, source_predict)
    recall = metrics.recall_score(label3, source_predict, average='weighted')
    f1 = metrics.f1_score(label3, source_predict, average='weighted')
    precision = metrics.precision_score(label3, source_predict,
                                        average='weighted')
    print("LP:", accuracy, recall, f1, precision)
    return accuracy, recall, f1, precision
def create_label_prop(dataset):
    """Fit LabelPropagation on labeled vectors plus unlabeled query/test
    rows (marked -1) and return the fitted model.

    NOTE(review): the "accuracy" print below is a dangling header — no
    accuracy value is ever computed or printed after it.
    """
    vectors, labels = make_vectors(dataset)

    # Q and test_X rows are unlabeled: give them the -1 marker.
    Q_labels = -1 * np.ones(dataset.Q.shape[0] + dataset.test_X.shape[0])
    labels = np.concatenate((labels, Q_labels))
    vectors = np.concatenate((vectors, dataset.Q, dataset.test_X))

    label_prop = LabelPropagation()
    label_prop.fit(vectors, labels)
    print("\tLabel Propogation accuracy:")

    return label_prop
Exemple #18
0
def do_evaluation(X, y,
                  kernel='knn',
                  output=None,
                  gamma=None,
                  n_neighbors=10,
                  alpha=1,
                  max_iter=1000,
                  tol=0.00001):
    """Run 10 random 90/10 unlabeled/labeled splits of (X, y), fit
    LabelPropagation transductively on each, and print a report.

    ``output`` is accepted for interface compatibility but unused.
    """
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score

    size = len(X)

    random_seeds = np.random.randint(1, 1000, size=10)
    for i in range(len(random_seeds)):
        labels = np.copy(y)
        tmp = np.arange(size)
        np.random.shuffle(tmp)
        train_test_split_rate = int(size * .9)
        random_unlabeled_points = tmp[:train_test_split_rate]
        labeled_points = tmp[train_test_split_rate:]
        random_unlabeled_points.sort()

        # Set membership is O(1); the original tested `_ in array`, a
        # linear scan per element (O(n^2) overall).
        unlabeled_set = set(random_unlabeled_points)
        labeled_set = set(labeled_points)
        X_test = [X[_] for _ in range(size) if _ in unlabeled_set]
        y_test = [y[_] for _ in range(size) if _ in unlabeled_set]
        y_train = [y[_] for _ in range(size) if _ in labeled_set]

        labels[random_unlabeled_points] = -1

        label_prop_model = LabelPropagation(kernel=kernel,
                                            gamma=gamma,
                                            n_neighbors=n_neighbors,
                                            alpha=alpha,
                                            max_iter=max_iter,
                                            tol=tol)
        # Transductive fit: the model sees all of X, with -1 labels on the
        # held-out points.
        label_prop_model.fit(X, labels)

        y_predict = label_prop_model.predict(X_test)

        # Python 2 print statements converted to the py3 function form.
        print('+--------------------------------------------------------+')
        print('|                         Report                         |')
        print('+--------------------------------------------------------+')
        print('test round:', (i + 1), ' with random seed: ', random_seeds[i])
        print('training label: ', y_train)
        print('training post id: ', [_ + 1 for _ in labeled_points])
        print('predict label: ', y_predict)
        print(classification_report(y_test, y_predict))
        print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
        print('\n\n')
Exemple #19
0
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    """Baselines without transfer learning.

    Trains two semi-supervised models (label propagation / spreading) and
    three plain classifiers on (trainX, trainY), scores each on
    (testX, testY), and returns a one-row DataFrame of accuracies plus
    their INFO payloads.
    """
    def _fit_score(model):
        # Fit, predict, and score one estimator on the shared split.
        model.fit(trainX, trainY)
        return check_accuracy(testY, model.predict(testX))

    #######################
    ### SEMI-SUPERVISED ###
    ########################
    acc_ss_propagation, acc_ss_propagation_INFO = _fit_score(
        LabelPropagation(kernel='knn'))
    acc_ss_spreading, acc_ss_spreading_INFO = _fit_score(
        LabelSpreading(kernel='knn'))

    ########################
    #### WITHOUT TL ########
    ########################
    accLR, acc_LR_INFO = _fit_score(LogisticRegression())
    accDT, acc_DT_INFO = _fit_score(tree.DecisionTreeClassifier())
    accNB, acc_NB_INFO = _fit_score(BernoulliNB())

    return pd.DataFrame(
        [{
            'window': window,
            'source_position': source_pos,
            'target_position': target_pos,

            'acc_SS_propagation': acc_ss_propagation,
            'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
            'acc_SS_spreading': acc_ss_spreading,
            'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
            'acc_LR': accLR,
            'acc_LR_INFO': str(acc_LR_INFO),
            'acc_DT': accDT,
            'acc_DT_INFO': str(acc_DT_INFO),
            'acc_NB': accNB,
            'acc_NB_INFO': str(acc_NB_INFO)
        }]
    )
Exemple #20
0
def khren3(G):
    """Seed two labels at the endpoints of the cross-label pair with the
    most common neighbors, run rbf LabelPropagation on G's adjacency
    matrix, and print the score against the ground-truth labels.

    Returns (result_s, result_d): common-neighbor sets for same-label and
    different-label adjacent pairs respectively.
    """
    result_s = {}  # same-ground_label edge -> common-neighbor set
    result_d = {}  # different-ground_label edge -> common-neighbor set
    passed_set = []

    list_neighbrs = {}

    # Precompute neighbor sets for fast intersection below.
    for v in G.nodes:
        list_neighbrs.update({v: set(nx.neighbors(G, v))})

    # For each edge (u, v), visited once, split the common-neighbor set by
    # whether the endpoints share a ground-truth label.
    for u in G.nodes:
        passed_set.append(u)
        for v in nx.neighbors(G, u):
            if not v in passed_set:
                cmn_nmbr = list_neighbrs[u] & list_neighbrs[v]
                # dist = nx.shortest_path_length(G,u,v)
                # if dist == 2:
                # cmn_nmbr = G.distance(u,v)
                if G.nodes[u]["ground_label"] == G.nodes[v]['ground_label']:
                    result_s.update({(u, v): cmn_nmbr})
                else:
                    result_d.update({(u, v): cmn_nmbr})

    # max_s = max(len(result_s.values()))
    min_s = len(min(result_s.values(), key=len))
    min_d = len(min(result_d.values(), key=len))
    max_d = len(max(result_d.values(), key=len))

    # First different-label pair whose common-neighbor set has the
    # maximal size (guaranteed to exist since max_d came from result_d).
    for (pair, vertex_list) in result_d.items():
        if len(vertex_list) == max_d:
            max_pair = pair
            break

    print(min_s, min_d)

    adj_matrix = nx.adjacency_matrix(G).toarray()
    labels = [-1 for node in G.nodes]
    true_labels = [G.nodes[node]['ground_label'] for node in G.nodes]
    # labels[[0]] = 0
    # NOTE(review): indexing ``labels`` by node id assumes nodes are ints
    # 0..n-1 matching iteration order — confirm for this graph.
    labels[max_pair[0]] = 0
    labels[max_pair[1]] = 1
    # labels[0:10] = [0 for i in range(10)]
    # labels[900:910] = [1 for i in range(10)]

    lp = LabelPropagation(kernel='rbf', gamma=0.7, max_iter=1000)
    lp.fit(adj_matrix, labels)
    print(lp.score(adj_matrix, true_labels))

    return (result_s, result_d)
Exemple #21
0
    def execute(self, function_context: FunctionContext,
                input_list: List) -> List:
        """Train a LabelPropagation face classifier until self-consistency
        reaches 95%, persist it, and register the model version.

        input_list[0]: rows of (face_id, 512 embedding columns).
        input_list[1]: rows of (face_id, label).
        """
        x_train = input_list[0]
        y_label = input_list[1]
        input_dim = 512

        # Column names: face_id followed by col1..col512.
        x_train_columns = list()
        x_train_columns.append('face_id')
        for i in range(1, input_dim + 1):
            x_train_columns.append('col' + str(i))
        trainDf = pd.DataFrame(x_train, columns=x_train_columns)
        labelDf = pd.DataFrame(y_label, columns=('face_id', 'label'))

        # Inner-join features with labels on face_id.
        trainDf = pd.merge(trainDf,
                           labelDf,
                           on=['face_id'],
                           how='inner',
                           suffixes=('_x', '_y'))
        y_label = trainDf['label'].values.astype(int)
        # Use the axis= keyword: passing axis positionally to drop() was
        # deprecated and removed in pandas 2.0.
        trainDf = trainDf.drop('face_id', axis=1)
        x_train = trainDf.drop('label', axis=1).values

        label_prop_model = None
        score = 0.0
        # Retry until the propagated labels reproduce the hidden subset
        # with at least 95% accuracy.
        while score < 0.95:
            print('before train ACC:', score)
            random_unlabeled_points = np.random.rand(len(y_label))
            # Boolean mask: roughly 30% of points are hidden each round.
            random_unlabeled_points = random_unlabeled_points < 0.3
            Y = y_label[random_unlabeled_points]  # labels before masking
            y_label[random_unlabeled_points] = -1  # mark as unlabeled

            label_prop_model = LabelPropagation()
            label_prop_model.fit(x_train, y_label)

            Y_pred = label_prop_model.predict(x_train)
            Y_pred = Y_pred[random_unlabeled_points]
            score = accuracy_score(Y, Y_pred)

            # Restore the hidden labels for the next round.
            y_label[random_unlabeled_points] = Y

        model_path = os.path.dirname(os.path.abspath(__file__)) + '/model'
        print('Save trained model to {}'.format(model_path))
        # NOTE(review): the model is only dumped when the path does NOT
        # already exist — confirm that skipping overwrites is intentional.
        if not os.path.exists(model_path):
            joblib.dump(label_prop_model, model_path)

        model_meta: ModelMeta = function_context.node_spec.output_model
        # Register model version to notify that cluster serving is ready to start loading the registered model version.
        register_model_version(model=model_meta, model_path=model_path)
        return []
def label_prop():
    """Predict 'Leak Found' for the unlabeled rows (module-level df10/df12)
    and plot the distribution of predictions."""
    # Rows of df9 with an actual 'Leak Found' value provide the labels.
    labels = df9.loc[df9['Leak Found'].notnull(), ['Leak Found']]

    model = LabelPropagation(kernel=rbf_kernel_safe)
    model.fit(df10, labels.values.ravel())

    predictions = pd.DataFrame(np.array(model.predict(df12)),
                               columns=['Prediction'])
    combined = pd.concat([df12, predictions], axis=1)
    print(combined[['ID', 'Prediction']])

    # Bar chart of predicted label counts.
    plt.style.use('seaborn')
    combined['Prediction'].value_counts().plot(kind='bar')
    plt.xticks([0, 1, 2], ['NO', 'YES', 'N-PRV'])
    plt.ylabel('Number of occurrences after prediction by RBF algorithm')
    plt.show()
def propagate_labels(X_u, y_u, X_l, num_unlabeled):
    """Propagate labels from the labeled set onto the unlabeled set.

    Stacks X_l (labeled) and X_u (unlabeled, given -1 placeholders), fits
    LabelPropagation, and returns (X_train_lda, y_train_lda): the stacked
    samples and the transduced labels for all of them.

    NOTE(review): the original body referenced an undefined ``y_l``; the
    only label array in scope is the ``y_u`` parameter (aligned with X_l
    by position in the concatenation), so it is used here — confirm the
    parameter naming with the caller.
    """
    # unlabeled samples are represented by -1 in labelprop
    y_u_placeholder = np.zeros(num_unlabeled) - 1

    X_train_prop = np.concatenate((X_l, X_u), axis=0)
    y_train_prop = np.concatenate((y_u, y_u_placeholder), axis=0)

    prop = LabelPropagation(gamma=15)
    prop.fit(X_train_prop, y_train_prop)

    # Transduced labels for every stacked sample.
    y_train_lda = prop.transduction_

    # Same stacking as X_train_prop; kept as a separate name to mirror
    # the original return contract.
    X_train_lda = np.concatenate((X_l, X_u), axis=0)

    return X_train_lda, y_train_lda
Exemple #24
0
def test_LabelPropagation(*data):
    '''
    Demonstrate basic LabelPropagation usage.

    :param data: tuple of (samples, labels, indices of unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Copy first: the true labels in y are needed for the accuracy check.
    y_train = np.copy(y)
    # Mark unlabeled samples with -1.
    y_train[unlabeled_indices] = -1

    model = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(X, y_train)

    ### Accuracy on the unlabeled subset via the transduced labels.
    predicted_labels = model.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    print("Accuracy:%f" %
          metrics.accuracy_score(true_labels, predicted_labels))
Exemple #25
0
    def fit(self, x_input, y_input):
        """Standardize, PCA-reduce, and grid-search a LabelPropagation model.

        Optionally scales the input (``self.standardize_flag``), sizes the
        PCA to reach ``config.PCA_VAR_THR`` cumulative explained variance,
        then cross-validates ``LabelPropagation`` over ``self.model_params``
        and stores the best estimator and its validation score.
        """
        x = x_input.copy()
        y = y_input.copy()

        # standardize input
        if self.standardize_flag:
            self.scaler.fit(x)
            x = self.scaler.transform(x)

        # apply PCA
        if config.PCA_VAR_THR < 1:
            # First call only: fit a full-rank PCA to discover how many
            # components reach the variance threshold, then replace it
            # with a whitened PCA of that size before the final fit below.
            if self.PCA.n_components is None:
                self.PCA.n_components = x.shape[1]
                self.PCA.fit(x)
                n_components = np.where(self.PCA.explained_variance_ratio_.
                                        cumsum() > config.PCA_VAR_THR)[0][0]
                self.PCA = decomposition.PCA(n_components=n_components,
                                             whiten=True)
            self.PCA.fit(x)
            x = self.PCA.transform(x)

        # Cross-validated hyper-parameter search for LabelPropagation.
        cv = GridSearchCV(LabelPropagation(),
                          self.model_params,
                          scoring=self.metric_key,
                          cv=self.num_splits,
                          n_jobs=config.N_JOBS)

        cv.fit(x, y)

        self.estimator = cv.best_estimator_
        self.best_val_score = cv.cv_results_["mean_test_score"].max()
Exemple #26
0
 def __init__(self,
              method="spreading",
              kernel="knn",
              alpha=0.2,
              gamma=20,
              n_neighbors=7,
              **kwargs):
     """Build one semi-supervised model per region in ``self.regions``.

     ``method`` selects "propagation" (LabelPropagation) or "spreading"
     (LabelSpreading); anything else raises InitializationError. The
     remaining kwargs are forwarded to the LabSP base initializer.
     """
     super(LabSP, self).__init__(**kwargs)
     if method.lower() == "propagation":
         self.regressors = [
             LabelPropagation(kernel=kernel,
                              alpha=alpha,
                              gamma=gamma,
                              n_neighbors=n_neighbors)
             for _ in range(len(self.regions))
         ]
     elif method.lower() == "spreading":
         self.regressors = [
             LabelSpreading(kernel=kernel,
                            alpha=alpha,
                            gamma=gamma,
                            n_neighbors=n_neighbors)
             for _ in range(len(self.regions))
         ]
     else:
         raise InitializationError("Method %s not valid" % method)
    def start(self):
        """ 01. Initialise the data paths and transformation functions.

        Sets featuretools primitive lists, feature-window sizes, and the
        roster of candidate estimators, then advances the flow to
        ``load_raw_data``.
        """
        self.data_dir = '../data/raw_data'
        # Featuretools transform / aggregation primitives for DFS.
        self.trans_primitives = ['weekday', 'hour', 'time_since_previous']
        self.agg_primitives = [
            'mean', 'max', 'min', 'std', 'count', 'percent_true', 'last',
            'time_since_last', 'mode'
        ]
        # Columns excluded from feature synthesis.
        self.ignore_cols = [
            'num_contacts', 'num_referrals', 'num_successful_referrals'
        ]
        # Rolling feature windows, in days.
        self.feature_windows = [10, 30, 60, 90]  #[10,20,30]
        self.max_feature_depth = 2

        # list of estimators to use
        self.estimators = [
            ('cbc', CatBoostClassifier()), ('lgbmc', LGBMClassifier()),
            ('gbc',
             GradientBoostingClassifier(validation_fraction=0.15,
                                        n_iter_no_change=50)),
            ('et', ExtraTreeClassifier()), ('abc', AdaBoostClassifier()),
            ('rfc', RandomForestClassifier()), ('bc', BaggingClassifier()),
            ('etc', ExtraTreesClassifier()), ('gnb', GaussianNB()),
            ('mlpc', MLPClassifier()), ('gpc', GaussianProcessClassifier()),
            ('dtc', DecisionTreeClassifier()),
            ('qda', QuadraticDiscriminantAnalysis()),
            ('lr', LogisticRegression()), ('knn3', KNeighborsClassifier(3)),
            ('knn6', KNeighborsClassifier(6)),
            ('knn12', KNeighborsClassifier(12)), ('nc', NearestCentroid()),
            ('rnc', RadiusNeighborsClassifier()), ('lp', LabelPropagation()),
            ('pac', PassiveAggressiveClassifier()), ('rc', RidgeClassifier()),
            ('sgdc', SGDClassifier()), ('svg', SVC()),
            ('ngbc', NGBClassifier(Dist=Bernoulli))
        ]
        self.next(self.load_raw_data)
Exemple #28
0
def test_LabelPropagation_rbf(*data):
    '''
    LabelPropagation with the rbf kernel: plot how the unlabeled-set score
    changes with gamma, one curve per alpha.

    :param data: tuple of (samples, labels, indices of unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Copy first: y is still needed for scoring below.
    y_train = np.copy(y)
    # Unlabeled samples carry the -1 marker.
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One distinct RGB color per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0),
              (0, 0.5, 0.5), (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0),
              (0, 0.6, 0.4), (0.5, 0.3, 0.2))

    ## Train and plot one curve per alpha.
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            model = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha,
                                     kernel='rbf')
            model.fit(X, y_train)
            scores.append(model.score(X[unlabeled_indices],
                                      y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ### Figure cosmetics.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Exemple #29
0
def test_LabelPropagation_knn(*data):
    '''
    LabelPropagation with the knn kernel: plot how the unlabeled-set score
    changes with n_neighbors, one curve per alpha.

    :param data: tuple of (samples, labels, indices of unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Copy first: y is still needed for scoring below.
    y_train = np.copy(y)
    # Unlabeled samples carry the -1 marker.
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One distinct RGB color per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0),
              (0, 0.5, 0.5), (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0),
              (0, 0.6, 0.4), (0.5, 0.3, 0.2))

    ## Train and plot one curve per alpha.
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            model = LabelPropagation(max_iter=100, n_neighbors=K,
                                     alpha=alpha, kernel='knn')
            model.fit(X, y_train)
            scores.append(model.score(X[unlabeled_indices],
                                      y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ### Figure cosmetics.
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
def label_propagation_classification(kernel='rbf',
                                     gamma=20,
                                     n_neighbors=7,
                                     max_iter=30,
                                     tol=1e-3,
                                     n_jobs=None):
    '''
    Take in the parameters for label propagation. Kernel can be a string or a dict.
    If it's a dict, it must have exactly two keys: a "name" for the kernel and a
    "method" that is a callable function.

    :param kernel: string (rbf, graph, knn) or dict
    :param gamma: used when kernel is rbf
    :param n_neighbors: used when kernel is knn
    :param max_iter: max number of iterations, default is 30
    :param tol: tolerance for convergence, default is 1e-3
    :param n_jobs: number of cores
    :return: th_model
    :raises ValueError: if a dict kernel does not have exactly the keys
        'name' and 'method'
    '''
    if isinstance(kernel, dict):
        if 'name' not in kernel.keys() or 'method' not in kernel.keys() or len(
                kernel.keys()) > 2:
            # BUG FIX: the ValueError was previously constructed but never
            # raised, so a malformed kernel dict silently fell through and
            # crashed later with a confusing KeyError.
            raise ValueError("input dictionary must have two keys: name and method")
        kernel_method = kernel['method']
        kernel = kernel['name']
    elif kernel == 'graph':
        kernel_method = _graph_kernel
    else:
        kernel_method = kernel

    # Build the sklearn label-propagation estimator.
    lp = LabelPropagation(kernel=kernel_method,
                          gamma=gamma,
                          n_neighbors=n_neighbors,
                          max_iter=max_iter,
                          tol=tol,
                          n_jobs=n_jobs)

    # Wrap it in the SklearnClassification TestHarnessModel subclass, with a
    # description mentioning only the hyperparameters relevant to the kernel.
    if kernel == 'rbf':
        th_model = SklearnClassification(model=lp, model_author='Mohammed', \
                                         model_description="Label Propagation: kernel={0}, \
                                         gamma={1},  max_iter={2},\
                                         tol={3},n_jobs={4}".format(
                                         kernel, gamma,  max_iter,tol,n_jobs))
    elif kernel == 'knn':
        th_model = SklearnClassification(model=lp, model_author='Mohammed', \
                                         model_description="Label Propagation: kernel={0}, \
                                         n_neighbors={1},  max_iter={2},\
                                         tol={3},n_jobs={4}".format(
                                             kernel, n_neighbors,  max_iter, tol, n_jobs))
    else:
        th_model = SklearnClassification(model=lp, model_author='Mohammed', \
                                         model_description="Label Propagation: kernel={0}, \
                                              max_iter={1},\
                                             tol={2},n_jobs={3}".format(
                                             kernel,  max_iter, tol, n_jobs))
    return th_model
Exemple #31
0
    def label_spr(self):
        '''
        Run the configured semi-supervised model self.manyfit times, averaging
        validation accuracy, then keep the transduced labels and submission
        predictions from the last fit. Stores the mean accuracy in
        self.json_dict['ss_accuracy'].
        '''
        RESULT_ACC_SS = 0

        for i in range(self.manyfit):

            # Re-initialise the data splits for this repetition.
            self.init_variables()

            # Optional PCA preprocessing.
            if (self.PCA_MODE): self.pca_preprocess(self.pca)

            # Pick the semi-supervised estimator from the configuration.
            if (self.ss_mod == 'LabSpr' and self.ss_kern == 'knn'):
                self.label_prop_model = LabelSpreading(
                    kernel='knn',
                    gamma=self.gamma,
                    n_neighbors=self.neighbors,
                    alpha=self.alpha)

            elif (self.ss_mod == 'LabProp' and self.ss_kern == 'rbf'):
                self.label_prop_model = LabelPropagation(
                    kernel='rbf',
                    gamma=self.gamma,
                    n_neighbors=self.neighbors,
                    alpha=self.alpha,
                    max_iter=10)
            else:
                # BUG FIX: was misspelled "LabelPropagtion", which raised a
                # NameError whenever this fallback branch was reached.
                self.label_prop_model = LabelPropagation(
                    kernel=self.ss_kern,
                    gamma=self.gamma,
                    n_neighbors=self.neighbors)

            print('Starting to fit. Run for shelter!')

            self.label_prop_model.fit(self.X_tot, self.y_tot)

            temp_acc = self.label_prop_model.score(self.X_valid_lab,
                                                   self.y_valid)

            print('{} / {} :accuracy = {}'.format(i, self.manyfit, temp_acc))

            RESULT_ACC_SS += temp_acc

        # Keep the transduced labels of the last fit as the working label set.
        self.y_tot = self.label_prop_model.transduction_

        self.y_submit = self.label_prop_model.predict(self.X_submit)

        if (self.datastate == "save"):
            self.save_to_csv(self.X_tot, self.y_tot, self.X_valid_lab,
                             self.y_valid)

        RESULT_ACC_SS /= self.manyfit

        self.json_dict['ss_accuracy'] = RESULT_ACC_SS

        print('accuracy obtained on the test set of the ss algo:',
              RESULT_ACC_SS)
Exemple #32
0
def ssl_label_prop(unlabel, clfs, true, x, y, test):
    '''
    Fit sklearn LabelPropagation on labeled data mixed (via shuffle.run) with
    unlabeled points (label -1), then score on the held-out test rows.

    :param unlabel: unlabeled feature rows
    :param clfs: unused here (kept for a uniform ssl_* signature)
    :param true: unused here (kept for a uniform ssl_* signature)
    :param x: labeled feature rows
    :param y: labels for x
    :param test: rows whose first column is the label, remainder the features
    :return: mean accuracy on the test rows
    '''
    # BUG FIX: the original `for row in y: row = int(row)` only rebound the
    # loop variable and left y unchanged; build the int label list explicitly.
    y = [int(row) for row in y]
    df_noise_x, df_noise_y, noisy_labels = shuffle.run(unlabel,
                                                       [-1] * len(unlabel), x,
                                                       y)
    # Split test rows into ground-truth labels (col 0) and feature vectors.
    ground = []
    point = []
    for row in test:
        ground.append(row[0])
        point.append(row[1:])
    # sklearn algo
    label_prop_model = LabelPropagation(kernel='knn',
                                        n_neighbors=2,
                                        max_iter=400,
                                        tol=0.01)
    label_prop_model.fit(df_noise_x, df_noise_y)
    return label_prop_model.score(point, ground)
Exemple #33
0
def test_LabelPropagation(*data):
    '''
    Exercise the basic LabelPropagation workflow on a partially labeled set.
    '''
    X, y, unlabeled_indices = data
    # Copy first -- the original labels are needed for scoring afterwards.
    masked = np.copy(y)
    # sklearn treats label -1 as "unlabeled".
    masked[unlabeled_indices] = -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, masked)
    # Accuracy of the inferred (transduced) labels on the hidden samples.
    predicted_labels = clf.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    print("Accuracy:%f" %
          metrics.accuracy_score(true_labels, predicted_labels))
Exemple #34
0
def do_label_propagation(input_data,
                        input_label,
                        output=None,
                        kernel='knn',
                        gamma=None,
                        n_neighbors=10,
                        alpha=1,
                        max_iter=30,
                        tol=0.001):
    '''
    Fit LabelPropagation on a matrix read from text files and optionally write
    the per-row predictions to an output file.

    :param input_data: path to a file with one matrix row per line
    :param input_label: path to a file with one integer label per line
                        (length defines how many data rows are read)
    :param output: optional path to write one prediction per line
    :param kernel: kernel name passed to LabelPropagation
    :param gamma: rbf kernel width
    :param n_neighbors: knn neighbor count (incremented by one below)
    :param alpha: clamping factor
    :param max_iter: maximum iterations
    :param tol: convergence tolerance
    :return: the fitted LabelPropagation model
    '''
    # sklearn's knn graph counts the sample itself, so add one neighbor.
    n_neighbors += 1

    # BUG FIX: files were opened in binary mode ('rb'/'wb') but the contents
    # were processed as str (';'.join, write(str(p))), which raises TypeError
    # on Python 3; open in text mode and use context managers so the handles
    # are always closed.
    with open(input_label, 'r') as input_label_fh:
        y = np.array([int(line.strip()) for line in input_label_fh])

    size = len(y)

    # Read only as many data rows as there are labels.
    with open(input_data, 'r') as input_data_fh:
        data_lines = [line.strip() for line in input_data_fh.readlines()[:size]]
    X = np.array(np.mat(';'.join(data_lines)))

    label_prop_model = LabelPropagation(kernel=kernel,
                                        gamma=gamma,
                                        n_neighbors=n_neighbors,
                                        alpha=alpha,
                                        max_iter=max_iter,
                                        tol=tol)
    label_prop_model.fit(X, y)

    prediction = label_prop_model.predict(X)

    if output:
        with open(output, 'w') as output_fh:
            for p in prediction:
                output_fh.write(str(p) + '\n')

    return label_prop_model
Exemple #35
0
def tryLabelPropagation(goFast):
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.semi_supervised import LabelPropagation
  from sklearn.metrics import accuracy_score
  from sklearn.grid_search import ParameterGrid

  propOperator = LabelPropagation(gamma=150)

  propOperator.fit(training_data[:3000],training_labels[:3000])
  score = accuracy_score(validation_labels, propOperator.predict(validation_data))
  print str(score)
 def __init__(self):
     # Default LabelPropagation (rbf kernel); a knn/alpha configuration was
     # tried previously and is kept here for reference.
     self.model = LabelPropagation() #(kernel='knn', alpha=1.0)
class IndicatorIdentifier(object):
    '''
    Identify an indicator (topic label) for a document via semi-supervised
    label propagation over TF-IDF features. (Python 2 code: print statements.)
    '''


    def __init__(self):
        # rbf-kernel LabelPropagation by default; knn variant kept for reference.
        self.model = LabelPropagation() #(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()
    
    def readingDatabase(self):
        '''
        Load the labeled indicator spreadsheet plus three unlabeled corpora.

        Returns (X_train, Y_train, X_test, Y_test, unlabeledData): the first
        80% of labeled rows become training pairs (one (content, label) pair
        per label on a document); the rest become test docs with their full
        label lists.
        '''
        da = DocumentsAccess()  # project-local spreadsheet reader
        
        labeledFile = "Database/Indicator/Indicators.xlsx"
        sheet = "Sheet1"
        df = da.readingDatabaseTetum(labeledFile, sheet, head= 0)


        cut = int(0.8*df.shape[0])    
        # re-duplicate the data => Result: one document has one label only
        columns = df.columns.tolist()
        columns.remove("Content")
        print columns
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []
        for index, row in df.iterrows():
            # All distinct non-null labels attached to this document.
            labels = list(set([row[col] for col in columns if not pd.isnull(row[col])]))
            content = row["Content"]
            if index < cut: # training part
                for label in labels:
                    X_train.append(content)
                    Y_train.append(label)
            else:
                X_test.append(content)
                Y_test.append(labels)
                
                
           
       
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist() 
        print len(unlabeledData)
        
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist() 
        print len(unlabeledData2)
        
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist() 
        
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        ''' 

        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        
        print len(unlabeledData)
        #print unlabeledData[0]
        
        return (X_train, Y_train, X_test, Y_test, unlabeledData) 
    
    def preprocessData(self, X):   
        # tp.preprocess_dataset: project-local text cleaner -- presumably
        # case-folds, drops specials and tokens shorter than 2; confirm in tp.
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)       
    def train(self, X, Y):
        '''
            Goal: Batch training (Use only one time at the beginning. After that, use updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()  # densify: LabelPropagation cannot fit a sparse matrix
        self.model.fit(X, Y)    
    def test(self, X):
        '''
            Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)   
        return predictedY      
       
    def updateNewInformation(self, x1, y1):
        '''
            Goal: Update the information from the new data (Online Learning)
            Run re-train model at weekend
        '''
        # Not implemented: the partial_fit call below is commented out.
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        # Fit the vector-space model (TF-IDF) and keep it for later prediction.
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable
    
    def featureExtractionPredict(self, X):
        # Reuse the vector-space model fitted in featureExtractionTrain.
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        '''
        Return [("Accuracy", a), ("Precision", p), ("Recall", r), ("f1", f)]
        with each metric rounded to 4 decimals (weighted averaging).
        '''
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result    
       
    def run(self):
        '''
        End-to-end driver: load data, map string labels to int ids, append the
        unlabeled pool (label -1), train, predict on the test docs, and print
        accuracy. A prediction counts as correct if it matches ANY of the
        document's true labels.
        '''
        # Reading data
        (X_train, Y_train, X_test, Y_test, unlabeledData)   = self.readingDatabase()
        print "Training size: " + str(len(X_train))
        print "Test size: " + str(len(X_test))
        '''
        X_train = X_train[:100]
        Y_train = Y_train[:100]
        X_test = X_test[:100]
        Y_test = Y_test[:100]
        '''
        print "Finish reading database."
        #print FreqDist(indicators).most_common()
        # Build a string-label -> integer-id mapping from training labels.
        k = 0
        dictLabel = FreqDist(Y_train)
        for key in dictLabel:
            dictLabel[key] = k
            k+=1
        Y_train = [dictLabel[ind] for ind in Y_train]
        Y_test = [[dictLabel[ind] for ind in labels] for labels in Y_test]
        
        '''       
        random.seed(123456)
        # Training
        z = zip(labeledData, indicators)
        random.shuffle(z)
        labeledData, indicators = zip(*z)
        
        X_train = list(labeledData[:cut])
        Y_train = list(indicators[:cut])
        X_test = list(labeledData[cut:])
        Y_test = list(indicators[cut:])
        '''
        # Unlabeled documents get the sentinel label -1 for semi-supervision.
        X_train += unlabeledData
        Y_train += (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()

        #pprint(X_train)
        #print Y_train
        
        #print X_train[cut-2:cut+2]
        #print Y_train[cut-2:cut+2]
        print "Training..."
        self.train(X_train, Y_train)
        
        # Testing
        print "Testing..."
        Y_predicted = self.test(X_test)

        print Y_predicted

        # The Y_predicted only need to be one of the true labels in order to be calculated as correctness
        for i in range(len(Y_predicted)):
            lab = Y_predicted[i]
            if lab in Y_test[i]:
                Y_test[i] = lab
            else:
                Y_test[i] = -1
        # NOTE(review): evaluation() returns (name, value) pairs, so `accuracy`
        # here is the tuple ("Accuracy", value), not the bare number.
        (accuracy,_, _, _) = self.evaluation(Y_test,Y_predicted)
        print accuracy  
    max_iter: 30
        Complexity control for knn
    n_neighbors: 7
        Parameter for knn, how many neighbors to consider
    alpha: float
        Clamping factor    
    tol: 0.001
        Convergence tolerance: threshold to consider the system at steady state
"""
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
import numpy as np
 
# K nearest neighbors model ensures we don't run over our memory.
# The rbf kernel needs a complete graph, so it would require feature selection.
lp_model = LabelPropagation(kernel='knn')  # Label Propagation model
Xtr = np.genfromtxt("data/Kaggle.X1.train.txt", delimiter=',')  # X training data
# BUG FIX: the keyword was misspelled "delimter", which makes np.genfromtxt
# raise TypeError before any data is read.
Ytr_labels = np.genfromtxt("data/Kaggle.Y.labels.train.txt", delimiter=',')  # classification labels


# Unlabeled points - random size for now.
# np.random.random_integers was removed from NumPy; randint(0, 2) draws the
# same inclusive {0, 1} values.
unlabeled_points = np.where(np.random.randint(0, 2, size=len(Ytr_labels)))
labels = np.copy(Ytr_labels)  # Save training labels for testing
labels[unlabeled_points] = -1  # Set unlabeled value, classes: 0, 1
lp_model.fit(Xtr, labels)  # Train

#############################################
#   Models use n_neighbors and max_iteration to control kernel
#############################################

#############################################
Exemple #39
0
# Count how many labels are missing (-1) versus present.
missing = [1 for lbl in Y_train if lbl == -1]
present = [1 for lbl in Y_train if lbl != -1]
print('unlabeled: ', np.sum(missing))
print('labeled: ', np.sum(present))


# In[8]:

from sklearn.semi_supervised import LabelPropagation as LP
from sklearn.semi_supervised import LabelSpreading as LS


# In[17]:

# Fit label propagation with an rbf kernel width of 70.
lspr = LP(gamma=70)
lspr.fit(X_norm, Ytrain)


# In[15]:

print('nofClasses: ', lspr.classes_)


# In[16]:

# Predict over the full normalized feature matrix and count positive labels.
pred = lspr.predict(X_norm)
positives = [1 for p in pred if p > 0.0]
print(sum(positives))

class SentimentAnalysis(object):
    '''
    Identify a sentiment (positive = 1 / negative = 0) for each document via
    semi-supervised label propagation over TF-IDF features. (Python 2 code.)
    '''


    def __init__(self):
        # rbf-kernel LabelPropagation by default; knn variant kept for reference.
        self.model = LabelPropagation()#(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()
    
    def readingDatabase(self):
        '''
        Load the positive/negative labeled spreadsheets plus three unlabeled
        corpora. Returns (posData, negData, unlabeledData) as document lists.
        '''
        da = DocumentsAccess()  # project-local spreadsheet reader
        
        filePos = "Database/Sentiment/Sentiment/PoliceRelations/positive.xlsx"
        sheet = "Sheet1"
        posData = da.readingDatabaseTetum(filePos, sheet)
        posData = posData[0].tolist()
        print len(posData)
        print posData[0]

        
 
        fileNeg = "Database/Sentiment/Sentiment/PoliceRelations/negative.xlsx"
        sheet = "Sheet1"
        negData = da.readingDatabaseTetum(fileNeg, sheet)  
        negData = negData[0].tolist() 
        print len(negData)
        print negData[0]
       
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist() 
        print len(unlabeledData)
        
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist() 
        print len(unlabeledData2)
        
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist() 
        
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        ''' 

        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        
        print len(unlabeledData)
        print unlabeledData[0]

       
        return (posData, negData, unlabeledData) 
    
    def preprocessData(self, X):   
        # tp.preprocess_dataset: project-local text cleaner -- presumably
        # case-folds, drops specials and tokens shorter than 2; confirm in tp.
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)       
    def train(self, X, Y):
        '''
            Goal: Batch training (Use only one time at the beginning. After that, use updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()  # densify: LabelPropagation cannot fit a sparse matrix
        self.model.fit(X, Y)    
    def test(self, X):
        '''
            Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)   
        return predictedY      
       
    def updateNewInformation(self, x1, y1):
        '''
            Goal: Update the information from the new data (Online Learning)
            Run re-train model at weekend
        '''
        # Not implemented: the partial_fit call below is commented out.
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        # Fit the vector-space model (TF-IDF) and keep it for later prediction.
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable
    
    def featureExtractionPredict(self, X):
        # Reuse the vector-space model fitted in featureExtractionTrain.
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        '''
        Return [("Accuracy", a), ("Precision", p), ("Recall", r), ("f1", f)]
        with each metric rounded to 4 decimals (weighted averaging).
        '''
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result    
       
    def run(self):
        '''
        End-to-end driver: the first 10 positive and 10 negative docs form the
        labeled training seed (labels 1/0), the unlabeled pool gets -1, and the
        remaining labeled docs form the test set.
        '''
        # Reading data
        (posData, negData, unlabeledData)  = self.readingDatabase()
        print "Finish reading database."
           
        # Divide training and test data
        cut = 10
        posDataTrain = posData[:cut]
        negDataTrain = negData[:cut]
        posDataTest = posData[cut:]
        negDataTest = negData[cut:]        
        
        random.seed(123456)
        # Training
        X_train = posDataTrain + negDataTrain + unlabeledData
        Y_train = np.ones((len(posDataTrain)), dtype = int).tolist() + np.zeros((len(negDataTrain)), dtype = int).tolist() + (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()
        # NOTE: zip() returning a shuffleable list is Python 2 behavior.
        z = zip(X_train, Y_train)
        random.shuffle(z)
        X_train, Y_train = zip(*z)
        self.train(X_train, Y_train)
        
        # Testing
        X_test = posDataTest + negDataTest
        Y_test = np.ones((len(posDataTest)), dtype = int).tolist() + np.zeros((len(negDataTest)), dtype = int).tolist()
        z = zip(X_test, Y_test)
        random.shuffle(z)
        X_test, Y_test = zip(*z)
        Y_predicted = self.test(X_test)
        print Y_predicted

        # NOTE(review): evaluation() returns (name, value) pairs, so these four
        # variables are tuples like ("Accuracy", value), not bare numbers.
        (accuracy,precision,recall,f1) = self.evaluation(Y_test,Y_predicted)
        print (accuracy,precision,recall,f1)  
 def __init__(self, P):
   # Initialize the LabelPropagation base class with a knn kernel, then keep P.
   LabelPropagation.__init__(self, kernel='knn')
   self.P = P
Exemple #42
0
             ]



names = [
        #"propagation",
        "spreading",
        ]

for grid in params:
    param_grid = list(ParameterGrid(grid))
    for param in param_grid:
        for name in names:
            if param["kernel"] == 'rbf':
                if name == "propagation":
                    clf = LabelPropagation(kernel=param["kernel"],
                                           gamma=param["gamma"])
                else:
                    clf = LabelSpreading(kernel=param["kernel"],
                                           gamma=param["gamma"])
                extra_param = param["gamma"]
            else:
                if name == "propagation":
                    clf = LabelPropagation(kernel=param["kernel"],
                                           n_neighbors=param["n_neighbors"])
                else:
                    clf = LabelSpreading(kernel=param["kernel"],
                                           n_neighbors=param["n_neighbors"])
                extra_param = param["n_neighbors"]

            now = datetime.datetime.now()
            date_time = '{0:02d}_{1:02d}_{2:02d}_{3:02d}_{4:02d}'.format((now.year%2000),