コード例 #1
0
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = braid_AL.get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = braid_AL.get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    length = len(rec_api_test)
    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 40
        for idx in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx/10)
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = braid_AL.get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # 用反馈数据学习过后的模型来预测测试数据
    for query_idx in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        except ValueError:
            predict = [0.0 for n in range(length)]
        else:
            predict.append(float(y_pre[0, 1]))

    return predict, X, new_X_feedback, new_y_feedback
コード例 #2
0
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        sel_idx, sel_label = [], []
        for idx in range(n_queries):
            # query_idx, query_instance = learner.query(X=X_train)
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx/10)
            # print(idx, len(X_train))
            # print('uncertain', query_idx, X_train[query_idx], y_train[query_idx])
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])
            # learner.teach(
            #     X=new_X_train.reshape(1, -1),
            #     y=new_y_train.reshape(1, )
            # )
            # print(unlabel_query[idx], unlabel_query[idx], rec_api_unlabel[idx*10:idx*10+10], rec_api_unlabel[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # 用反馈数据学习过后的模型来预测测试数据
    for query_idx in range(400):
        y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))
        # predict.append(math.log(float(y_pre[0, 1])+1))
        # predict.extend(y_pre.tolist())
        x = X_test[query_idx].reshape(1, -1)
    # print(predict)
    # print('new_choose', len(choose_query), len(choose_answer))
    # fw = open('../data/add_FR.csv', 'a+', newline='')
    # writer = csv.writer(fw)
    # for i, fr_q in enumerate(choose_query):
    #     writer.writerow((fr_q, choose_answer[i]))
    # fw.close()

    return predict, X, new_X_feedback, new_y_feedback #sorted(sel_query)