Example #1
0
        return self.model.predict(X)

    def predict_pro(self, X):
        """Return the wrapped model's ``predict_proba`` output for ``X``."""
        estimator = self.model
        return estimator.predict_proba(X)

    def get_feature_names(self):
        """Return the ``classes_`` attribute of the wrapped model.

        NOTE(review): despite the method name, the value handed back is
        ``self.model.classes_`` -- presumably class labels rather than
        feature names; confirm against callers before relying on the name.
        """
        fitted = self.model
        return fitted.classes_

    def load(self):
        """Placeholder: loading is not implemented; always returns ``None``."""
        return None

    def dump(self):
        """Placeholder: dumping is not implemented; always returns ``None``."""
        return None

# Manual smoke test: fit the SVM wrapper on a tiny hand-written matrix and
# print its predictions and class probabilities.
if __name__ == "__main__":
    # Make the sibling ``tfidf`` directory importable before the wildcard import.
    sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../tfidf/')
    from tfidf_model import *
    # Whitespace-separated sample titles; only consumed by the commented-out
    # tf-idf path below.
    data = ['这是 一个 测试 项目 标题','测试 标题', '项目 标题']
    tfidf = tfidf_model()
    # Three labels paired with three toy 3-dimensional rows.
    Y = [1, 2, 3]
    X = np.array([[1,2,3], [2,3,4], [3,4,5]])
    # Alternative input: features produced by the tf-idf model.
    # X = tfidf.train(data, Y)

    model = svm_model()
    # NOTE(review): 'linear' is presumably the kernel name -- confirm against
    # svm_model.train.
    model.train(X, Y, 'linear')

    #print X
    # Predict on the training rows themselves (Python 2 print statements).
    print model.predict(X)
    print model.predict_pro(X)
    #print model.get_feature_names()
def main(group_id):

    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-filtered.txt'
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    # set the pre-computed popularity level
    # 未来的最大评论数可能超过pop_level的最大值
    # 注意:这里将最小的popularity值,即0,忽略了
    #pop_level = [8, 13, 23, 43, float('inf')]  # group: zhuangb
    pop_level = [25, 50, float('inf')]  # group: zhuangb
    #pop_level = [25, 50, float('inf')]      # group: buybook
    #pop_level = [30, float('inf')]      # group: buybook
    
    # prediction_date 的含义为:在帖子发布 prediction_date 时间后,开始预测
    # target_date 的含义为:预测在 target_date 处的评论数量
    # 以上两个参数可以调节
    # 设置采样的间隔
    gaptime = timedelta(hours=5)
    prediction_date = timedelta(hours=10*5)
    response_time = timedelta(hours=50)
    target_date = prediction_date + response_time
    
    # 计算每个topic在prediction_date前会有多少个interval
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    
    alpha = 1.5
    percentage_threshold = 0.7
    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # 保存那些经过筛选的topic id
    #save_filtered_topics(group_id, dataset)
    #print 'Ploting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return 
    
    # 调整所有帖子的顺序
    # 在调试阶段,暂且不shuffle dataset,避免每次结果都不一样
    #shuffle(dataset)
    
    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)
    
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0] , category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
    #num_level = len(pop_level)
    #raw_input()
    
    #import ipdb
    #ipdb.set_trace()
        
    print 'The proposed model:'
    k = 3
    num_level = 2
    num_factor = len(train_set[0][1][1])
    
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list = classify(train_set, test_set, k, num_level)
    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name = 'num_authors')
    #save_predictions(prediction_list, y_true, factor_name = 'all')
    
    comment_RSE_evaluation(comment_true, comment_pred)
    
    #print 'The class prior:', prior_score
    
    from svm_model import svm_model
    print 'Building a svm model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # 查看对于不同的factor,它们在不同的ratio上的预测结果
    #from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')
    
    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some intances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    
    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nkNN method:'
    k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    
    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
Example #3
0
    
    # NOTE(review): this is the body of an enclosing block (presumably the
    # per-fold cross-validation loop) whose header is not visible here.
    # Slice out the training rows, the rows to classify and the matching
    # reference values for the current window.
    pred_crossval = predictor[train_start:train_stop]
    test_crossval = test[test_start:test_stop]
    etalon_crossval = etalon[test_start:test_stop]

    # For each classifier: predict on the test slice, then append weighted
    # F1 / recall / precision (scored against etalon_crossval) to that
    # classifier's running lists.
    test_knn = knn.knn_model(pred_crossval, test_crossval)
    knn_f1.append(f1_score(etalon_crossval, test_knn, average='weighted'))
    knn_recall.append(recall_score(etalon_crossval, test_knn, average='weighted'))
    knn_precision.append(precision_score(etalon_crossval, test_knn, average='weighted'))

    test_nb = nb.nb_model(pred_crossval, test_crossval)
    nb_f1.append(f1_score(etalon_crossval, test_nb, average='weighted'))
    nb_recall.append(recall_score(etalon_crossval, test_nb, average='weighted'))
    nb_precision.append(precision_score(etalon_crossval, test_nb, average='weighted'))

    test_svm = svm.svm_model(pred_crossval, test_crossval)
    svm_f1.append(f1_score(etalon_crossval, test_svm, average='weighted'))
    svm_recall.append(recall_score(etalon_crossval, test_svm, average='weighted'))
    svm_precision.append(precision_score(etalon_crossval, test_svm, average='weighted'))

    test_tree = tree.tree_model(pred_crossval, test_crossval)
    tree_f1.append(f1_score(etalon_crossval, test_tree, average='weighted'))
    tree_recall.append(recall_score(etalon_crossval, test_tree, average='weighted'))
    tree_precision.append(precision_score(etalon_crossval, test_tree, average='weighted'))

# Replace each accumulated score list with its mean along axis 0, converted
# back to plain Python value(s) via ``tolist()``.
knn_f1 = np.mean(knn_f1, axis=0).tolist()
knn_recall = np.mean(knn_recall, axis=0).tolist()
knn_precision = np.mean(knn_precision, axis=0).tolist()

nb_f1 = np.mean(nb_f1, axis=0).tolist()
nb_recall = np.mean(nb_recall, axis=0).tolist()
Example #4
0
def main(group_id, topic_list, threshold_p, prediction_date_tr, response_time_delta, gaptime_n, best_k):
    
    # 设置两个自由参数值,由外部传入
    percentage_threshold = threshold_p
    prediction_date = timedelta(hours=prediction_date_tr)
    response_time = timedelta(hours=response_time_delta)

    # set the pre-computed popularity level
    # 未来的最大评论数可能超过pop_level的最大值
    # 注意:这里将最小的popularity值,即0,忽略了
    #pop_level = [8, 13, 23, 43, float('inf')]  # group: zhuangb
    pop_level = [25, 50, float('inf')]  # group: zhuangb
    #pop_level = [25, 50, float('inf')]      # group: buybook
    #pop_level = [30, float('inf')]      # group: buybook
    
    # prediction_date 的含义为:在帖子发布 prediction_date 时间后,开始预测
    # target_date 的含义为:预测在 target_date 处的评论数量
    # 以上两个参数可以调节
    # 设置采样的间隔
    gaptime = timedelta(hours=gaptime_n)
    #prediction_date = timedelta(hours=10*3)
    #response_time = timedelta(hours=24) # 已经作为参数传递
    target_date = prediction_date + response_time
    
    # 计算每个topic在prediction_date前会有多少个interval
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    
    #percentage_threshold = 0.7
    alpha = 1/percentage_threshold
    
    #"""
    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # 保存那些经过筛选的topic id
    #save_filtered_topics(group_id, dataset)
    #print 'Ploting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return 
    
    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)
    
    # 调整所有帖子的顺序
    # 在调试阶段,暂且不shuffle dataset,避免每次结果都不一样
    #shuffle(dataset)
    
    # 注意:每次使用的数据集是不同的
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0] , category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
    #num_level = len(pop_level)
    #save_filtered_topics(group_id, dataset)
    #raw_input()
    
    from MDT_method import prepare_MDT_dataset
    #prepare_MDT_dataset(train_set, 'MDT_train.pickle')
    #prepare_MDT_dataset(test_set, 'MDT_test.pickle')
    #return    
    
    k = best_k
    num_level = 2
    num_factor = len(train_set[0][1][1])
    
    print 'The proposed model:'
    #print 'Caculating class prior score...'
    #prior_score = np.ones((num_factor, num_level)) # 初始化
    #prior_score = caculate_class_prior_confidence_score(train_set, k, num_level = 2)
    #print prior_score; raw_input()
    
    print 'Caculating instance prior score...'
    prior_score = -1
    mutual_knn_graph_list = None
    #prior_score = caculate_instance_prior_confidence_score(train_set, k, num_level = 2) # for instance_prior_weighting3.py
    topic_popularity, prior_score, mutual_knn_graph_list = caculate_instance_prior_confidence_score(train_set, test_set, k, num_factor, num_level = 2) # for IPW_mutual_knn.py
    
    # 保存prior-score,train dataset,test-dataset
    #save_intermediate_results(train_set, test_set, comment_count_dataset, Bao_dataset, category_count_list, topic_popularity, prior_score, mutual_knn_graph_list)
    #"""
    
    #print 'Loading train_set, test_set, comment_count_dataset, ... and prior_score...'
    #train_set, test_set, comment_count_dataset, Bao_dataset, category_count_list, topic_popularity, prior_score, mutual_knn_graph_list = load_intermediate_results()
    #train_cnt = len(train_set)
    #k = best_k; num_level=2; num_factor = len(train_set[0][1][1])
    #factor_name_list = ['current_comment_count', 'num_authors', 'tree_density', 'reply_density'] # 需要考察的factor变量
    #factor_propagation_plot(group_id, train_set+test_set, num_feature, category_count_list, range(4), factor_name_list)
    #return 
    
    print 'Parameter set:'
    print 'Gap time: ', gaptime
    print 'Prediction date(in hours):', prediction_date.total_seconds() / 3600
    print 'Response time(in hours):', response_time.total_seconds() / 3600
    print 'percentage_threshold: ', percentage_threshold
    print 'k = ', k

    # TODO:测试是否过拟合
    #test_set = train_set  # 将训练集作为测试集,查看是否过拟合
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list, factor_prediction = \
        classify(train_set, test_set, k, num_factor, num_level, prior_score, topic_popularity, mutual_knn_graph_list)
    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    IPW_acc = classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name = 'fourfactor')
    #save_predictions(prediction_list, y_true, factor_name = 'all')
    
    comment_RSE_evaluation(comment_true, comment_pred)
    
    #print 'The class prior:', prior_score
    
    print 'Single factor and simple vote prediction result:'
    single_factor_acc = single_factor_prediction(y_true, factor_prediction)
    
    from svm_model import svm_model
    print 'Building a svm model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # 查看对于不同的factor,它们在不同的ratio上的预测结果
    from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')
    
    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some intances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    
    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nkNN method:'
    knn_k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, knn_k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    
    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    print 'With link density:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha, version = 1)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    print 'With diffusion depth:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha, version = 2)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    return IPW_acc, single_factor_acc # 返回正确率