Example #1
import operator

def factor_knn(findex, topic_popularity, dataset, num_neigh):
    """ For each dynamic factor, compute the knn neighbours of every topic separately.
    """
    num_level = 2
    # topic_id ==> set of knn neighbours' topic ids
    factor_knn_graph = dict()
    total = len(dataset)
    for topic_id, ins, level in dataset:
        #print 'Finding knn for topic: ', topic_id
        distance_comment_list = [0] * total
        index = 0
        for topic_id_other, ins_other, level_other in dataset:
            if topic_id_other == topic_id:
                continue
            # bottleneck of the whole procedure: computing the distance between two time series
            dis = get_instance_distance(ins, ins_other, findex)
            if dis == 0:
                dis = 1e-6
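            # popularity statistics of the neighbour topic (this reuses the
            # name `level` from the outer loop for the neighbour's level)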
            level                    = topic_popularity[topic_id_other][0]
            target_comment_count     = topic_popularity[topic_id_other][1]
            prediction_comment_count = topic_popularity[topic_id_other][2]
            ratio                    = topic_popularity[topic_id_other][3]
            
            distance_comment_list[index] = [topic_id_other, dis, prediction_comment_count, target_comment_count, level, ratio]
            index += 1
            
        distance_comment_list = distance_comment_list[:index]
        # sort by distance (dis) in ascending order
        distance_comment_list.sort(key=operator.itemgetter(1))
        # record all of the nearest neighbours; make sure knn_level_list
        # contains samples from both levels
        knn_level_list, knn_list, level_count_list = get_knn_level_list_old(distance_comment_list, num_neigh, num_level)
        
        factor_knn_graph[topic_id] = set()
        k = len(knn_list)
        for i in range(k):
            neighbour_topic_id = knn_list[i][0]
            factor_knn_graph[topic_id].add(neighbour_topic_id)
    
    return factor_knn_graph
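Example #1 returns a directed kNN graph (each topic maps to the set of its nearest neighbours under one dynamic factor), while Example #2 consumes a mutual_knn_graph. The code that turns the former into the latter is not shown here; the sketch below is only a hypothetical illustration of one common way to do it, keeping an edge when the two topics appear in each other's neighbour sets (the helper name mutual_knn and the toy graph are made up):

def mutual_knn(knn_graph):
    """Keep only mutual edges: two topics stay connected when each one
    appears in the other's knn neighbour set."""
    mutual = {}
    for tid, neighbours in knn_graph.items():
        mutual[tid] = set(n for n in neighbours if tid in knn_graph.get(n, set()))
    return mutual

# toy per-factor knn graph, purely illustrative
knn_graph = {'t1': {'t2', 't3'}, 't2': {'t1'}, 't3': {'t2'}}
print(mutual_knn(knn_graph))   # {'t1': {'t2'}, 't2': {'t1'}, 't3': set()}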
Example #2
import math
import numpy as np

def factor_score_knn(findex, mutual_knn_graph, target_topic_id, topic_popularity, num_level, prior_score = -1, gamma = 1):
    """ Compute the confidence score and the level score of each topic.
    """
    # flag: whether prior information is taken into account (prior_score passed as a dict)
    with_prior_flag = isinstance(prior_score, dict)
    
    num_mutual_neighbour = len(mutual_knn_graph[target_topic_id])
    neighbour_topic_id = list(mutual_knn_graph[target_topic_id])
    
    level_confidence_score = np.zeros((num_level,), float)
    level_prior_score = np.array([0] * num_level, float)
    
    if num_mutual_neighbour == 0:
        print('Topic %s in factor %d does not have any mutual knn neighbours.' % (target_topic_id, findex))
        return level_confidence_score, level_prior_score
        
    # normalize the distance
    dis_list = [0] * num_mutual_neighbour
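    # topic_popularity[tid] layout, as indexed below: [0] level,
    # [4] time-series instance, [-1] whether the topic is in the training set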
    for i in range(num_mutual_neighbour):
        topic_id = neighbour_topic_id[i]
        ins = topic_popularity[target_topic_id][4]
        ins_other = topic_popularity[topic_id][4]
        dis_list[i] = get_instance_distance(ins, ins_other, findex)
    
    # optionally, the distances could first be min-max normalized:
    # dis_list = my_min_max_scaler(dis_list)
    Z = [0.0] * num_level
    for i in range(num_mutual_neighbour):
        topic_id = neighbour_topic_id[i]
        # skip this topic_id if it is not in the training set
        if not topic_popularity[topic_id][-1]:
            continue
            
        level = topic_popularity[topic_id][0]
        dis = dis_list[i]
        
        # TODO: the weight computed here may well override the prior
        try:
            weight = math.exp(-gamma * dis)
        except OverflowError:
            print('Error in math.exp:', -gamma * dis_list[i])
            continue
        
        Z[level] += weight
        if with_prior_flag: # prior information has been passed in
            level_confidence_score[level] += weight
            # level prior score of this instance under the current factor
            level_prior = prior_score[topic_id]
            level_prior_score[level] += (weight * level_prior[findex])
        else:
            level_confidence_score[level] += weight
    
    # normalize
    if sum(Z) > 0:
        level_confidence_score /= sum(Z)
    else:
        print('Warning: Topic(%s) in factor(%d) does not have any usable mutual knn neighbours.' % (target_topic_id, findex))
        level_confidence_score[:] = 1.0 / num_level
        
    # fold the level_prior_score information into the per-factor level confidence
    if with_prior_flag:
        for lv in range(num_level):
            if Z[lv] > 0:
                level_prior_score[lv] /= Z[lv]
            else:
                level_prior_score[lv] = 0
        
        if np.sum(level_prior_score) > 0:
            # normalize: level_prior_score now plays the same role as
            # level_confidence_score, except that it also carries the prior information
            level_prior_score /= np.sum(level_prior_score)
        else:
            level_prior_score[:] = 1.0 / num_level
        
    #print 'Level confidence score:', level_confidence_score
    
    return level_confidence_score, level_prior_score
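At its core, factor_score_knn performs a Gaussian-kernel weighted vote over the mutual neighbours: each neighbour contributes exp(-gamma * dis) to its own level, and the per-level sums are normalized into a distribution. The stripped-down sketch below reproduces only that weighting step, with made-up (distance, level) pairs, so the normalization is easy to follow; it is not the repository's code.

import math
import numpy as np

# made-up (distance, level) pairs for the mutual neighbours of one topic
neighbours = [(0.2, 0), (0.5, 1), (1.3, 0), (2.0, 1)]
gamma, num_level = 1.0, 2

score = np.zeros(num_level)
for dis, level in neighbours:
    score[level] += math.exp(-gamma * dis)   # closer neighbours vote with larger weights
score /= score.sum()                         # normalize into a distribution over levels
print(score)                                 # -> approximately [0.595 0.405]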