def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query,
                   choose_query, choose_answer, unlabel_query, unlabel_answer,
                   rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    """Active-learning prediction (KNN variant).

    Queries the unlabeled pool via uncertainty sampling, moves each queried
    question into the labeled feedback repository, retrains the learner on the
    enlarged repository, and finally scores every test recommendation.

    NOTE(review): mutates the caller's lists in place — ``choose_*`` grow and
    ``unlabel_*`` shrink as questions are queried.  Each question appears to
    own 10 consecutive rows (one per recommended API) — TODO confirm.

    Returns:
        predict: positive-class probability per test recommendation
            (length == len(rec_api_test)), or all zeros if prediction fails.
        X: feature matrix built for the test queries.
        new_X_feedback, new_y_feedback: training data of the final learner.
    """
    unlabel_feedback_info = feedback.get_feedback_inf(
        unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(
        choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = braid_AL.get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = braid_AL.get_active_data(label_feedback_info, choose_feature)

    # Initialize the active learner on the currently-labeled feedback data.
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        X_training=X_feedback,
        y_training=y_feedback,
    )

    length = len(rec_api_test)
    predict = []
    # BUGFIX: these were only assigned inside the pool-sampling branch, so an
    # empty unlabeled pool raised NameError at the return statement.  With an
    # empty pool the learner keeps its initial training data.
    new_X_feedback, new_y_feedback = X_feedback, y_feedback

    if len(unlabel_query) > 0:
        # Pool-based sampling: each of the n_queries rounds labels one
        # question, i.e. one block of 10 feature rows.
        n_queries = 40
        for _ in range(n_queries):
            query_idx, _ = uncertainty_sampling(classifier=learner, X=X_train)
            # Question index that owns the most-uncertain row.
            q = int(query_idx // 10)
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, ),
            )
            # Add the queried question into the feedback repository (FR).
            choose_query.append(unlabel_query[q])
            choose_answer.append(unlabel_answer[q])
            rec_api_choose.extend(rec_api_unlabel[q * 10:q * 10 + 10])
            choose_feature.extend(unlabel_feature[q * 10:q * 10 + 10])
            # Remove all 10 rows of the queried question from the pool; the
            # delete index stays q*10 because rows shift left after each delete.
            for _i in range(10):
                X_train = np.delete(X_train, q * 10, axis=0)
                y_train = np.delete(y_train, q * 10)
            del unlabel_query[q]
            del unlabel_answer[q]
            del rec_api_unlabel[q * 10:q * 10 + 10]
            del unlabel_feature[q * 10:q * 10 + 10]
            if len(X_train) == 0:
                break
        # Retrain from scratch on the enlarged feedback repository.
        add_label_feedback_info = feedback.get_feedback_inf(
            choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
        new_X_feedback, new_y_feedback = braid_AL.get_active_data(
            add_label_feedback_info, choose_feature)
        learner = ActiveLearner(
            estimator=KNeighborsClassifier(n_neighbors=4),
            X_training=new_X_feedback,
            y_training=new_y_feedback,
        )

    feedback_info = feedback.get_feedback_inf(
        test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)
    # Score the test data with the model trained on the feedback data.
    for row in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[row].reshape(1, -1))
        except ValueError:
            # BUGFIX: the fallback used to overwrite `predict` and keep
            # looping, so later successes appended past `length`.  Fall back
            # to all-zero scores once and stop.
            predict = [0.0] * length
            break
        else:
            predict.append(float(y_pre[0, 1]))
    return predict, X, new_X_feedback, new_y_feedback
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query,
                   choose_query, choose_answer, unlabel_query, unlabel_answer,
                   rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    """Active-learning prediction (logistic-regression variant).

    Queries the unlabeled pool via uncertainty sampling, folds each queried
    question into the labeled feedback repository, retrains the learner on the
    enlarged repository, and finally scores every test recommendation.

    NOTE(review): mutates the caller's lists in place — ``choose_*`` grow and
    ``unlabel_*`` shrink as questions are queried.  Each question appears to
    own 10 consecutive rows (one per recommended API) — TODO confirm.

    Returns:
        predict: positive-class probability per test recommendation
            (length == len(rec_api_test)), or all zeros if prediction fails.
        X: feature matrix built for the test queries.
        new_X_feedback, new_y_feedback: training data of the final learner.
    """
    unlabel_feedback_info = feedback.get_feedback_inf(
        unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(
        choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # Initialize the active learner on the currently-labeled feedback data.
    learner = ActiveLearner(
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback,
        y_training=y_feedback,
    )

    # BUGFIX: the scoring loop below was hard-coded to range(400); use the
    # actual test size so any test-set length works.
    length = len(rec_api_test)
    predict = []
    # BUGFIX: these were only assigned inside the pool-sampling branch, so an
    # empty unlabeled pool raised NameError at the return statement.  With an
    # empty pool the learner keeps its initial training data.
    new_X_feedback, new_y_feedback = X_feedback, y_feedback

    if len(unlabel_query) > 0:
        # Pool-based sampling: each of the n_queries rounds labels one
        # question, i.e. one block of 10 feature rows.
        n_queries = 100
        for _ in range(n_queries):
            query_idx, _ = uncertainty_sampling(classifier=learner, X=X_train)
            # Question index that owns the most-uncertain row.
            q = int(query_idx // 10)
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, ),
            )
            # Add the queried question into the feedback repository (FR).
            choose_query.append(unlabel_query[q])
            choose_answer.append(unlabel_answer[q])
            rec_api_choose.extend(rec_api_unlabel[q * 10:q * 10 + 10])
            choose_feature.extend(unlabel_feature[q * 10:q * 10 + 10])
            # Remove all 10 rows of the queried question from the pool; the
            # delete index stays q*10 because rows shift left after each delete.
            for _i in range(10):
                X_train = np.delete(X_train, q * 10, axis=0)
                y_train = np.delete(y_train, q * 10)
            del unlabel_query[q]
            del unlabel_answer[q]
            del rec_api_unlabel[q * 10:q * 10 + 10]
            del unlabel_feature[q * 10:q * 10 + 10]
            if len(X_train) == 0:
                break
        # Retrain from scratch on the enlarged feedback repository.
        add_label_feedback_info = feedback.get_feedback_inf(
            choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
        new_X_feedback, new_y_feedback = get_active_data(
            add_label_feedback_info, choose_feature)
        learner = ActiveLearner(
            estimator=LogisticRegression(penalty='l1', solver='liblinear'),
            X_training=new_X_feedback,
            y_training=new_y_feedback,
        )

    feedback_info = feedback.get_feedback_inf(
        test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)
    # Score the test data with the model trained on the feedback data.
    # Guard predict_proba like the sibling KNN variant does, so a degenerate
    # single-class training set yields all-zero scores instead of crashing.
    for row in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[row].reshape(1, -1))
        except ValueError:
            predict = [0.0] * length
            break
        else:
            predict.append(float(y_pre[0, 1]))
    return predict, X, new_X_feedback, new_y_feedback