def find_effective_factor(train_set, k):
    """ 找出每个训练样本在哪些dynamic factor下能够分类成功
    """
    # import ipdb; ipdb.set_trace()
    num_factor = len(train_set[0][1][1])  # get the number of dynamic factors
    train_count = len(train_set)

    effective_factors = dict()  # topic_id ==> effective factor set
    for topic_id, ins, true_level in train_set:
        effective_factor_set = set()
        for findex in range(num_factor):
            distance_list = [0] * (train_count - 1)
            index = 0
            for train_topic_id, train_ins, train_true_level in train_set:
                if train_topic_id == topic_id:  # 不考虑自身
                    continue
                dis = get_instance_distance(ins, train_ins, findex)
                distance_list[index] = [train_topic_id, dis, train_true_level]
                index += 1

            distance_list = distance_list[:index]
            distance_list.sort(key=operator.itemgetter(1), reverse=False)
            level_list, nn_list = get_knn_level_list(distance_list, k)
            pred_level = trusted_vote(level_list, num_level=2, majority_threshold=0.66)
            if pred_level == true_level:
                effective_factor_set.add(findex)

        print "Effecitive factors for %s: %r" % (topic_id, effective_factor_set)
        effective_factors[topic_id] = effective_factor_set

    return effective_factors
# Example #2 (artifact from the code-example page this snippet was scraped from)
def find_nearest_neighbor_level(test_ins, train_set, k):
    """ Simple version: collect the levels of the k nearest neighbours,
    considering a single feature only (findex=1).

    Return: the levels of the k nearest neighbours.
    """
    nearest_neighbors = []  # each entry looks like [similarity, level]
    for neighbor_topic_id, neighbor_ins, neighbor_level in train_set:
        similarity = get_instance_distance(test_ins, neighbor_ins, findex=1)
        insert_neighbor(nearest_neighbors, k, similarity, neighbor_level)

    # extract the level (second field) of each of the k closest entries
    return [nearest_neighbors[i][1] for i in range(k)]
def effective_factor_knn(train_set, test_ins, k, effective_factors):
    """ Classify a test instance using only the "effective" dynamic factors.

    For each dynamic factor, nearest neighbours are searched among the
    training samples for which that factor is known to be effective
    (see find_effective_factor), and each factor casts one trusted vote.

    Args:
        train_set: list of (topic_id, instance, level) tuples.
        test_ins: test instance; test_ins[1] holds the per-factor features.
        k: number of nearest neighbours per factor.
        effective_factors: dict topic_id ==> set of effective factor indices.

    Returns:
        (pred_level_list, "", 0): one trusted prediction per factor;
        factors whose vote was inconclusive (trusted_vote returned -1)
        are skipped, so an empty list means no factor produced a trusted
        vote. The last two items are placeholders kept so the return
        shape stays compatible with callers.

    NOTE(review): removed the unreachable weighted-average code that
    followed the unconditional return (it referenced an undefined
    `knn_list` and could never run), plus the unused `topic_popularity`
    dict (which could raise ZeroDivisionError for nothing),
    `topic_score_list` and `test_topic_id` locals, and the redundant
    empty-list early return.
    """
    train_count = len(train_set)
    num_factor = len(test_ins[1])  # get the number of dynamic factors

    # Look up nearest neighbours separately within each dynamic factor,
    # then combine the per-factor trusted votes.
    pred_level_list = []
    for findex in range(num_factor):
        distance_list = [0] * train_count
        index = 0
        for train_topic_id, train_ins, level in train_set:
            # only neighbours for which this factor is effective may vote
            if not findex in effective_factors[train_topic_id]:
                continue
            dis = get_instance_distance(test_ins, train_ins, findex)
            distance_list[index] = [train_topic_id, dis, level]
            index += 1

        distance_list = distance_list[:index]
        # sort ascending by distance: nearest neighbours first
        distance_list.sort(key=operator.itemgetter(1), reverse=False)
        level_list, nn_list = get_knn_level_list(distance_list, k)

        pred_level = trusted_vote(level_list, num_level=2, majority_threshold=0.66)
        if pred_level != -1:
            pred_level_list.append(pred_level)

    return pred_level_list, "", 0