def extract_features(eid, tree, label, max_level=10):
    """Aggregate per-level user and social statistics of a propagation tree.

    Every node whose depth lies in ``1..max_level`` contributes to the
    bucket of its level; the root (depth 0) and deeper nodes are ignored.
    For each level that holds at least one node, every accumulated value
    except ``'count'`` is replaced by its per-node mean rounded to two
    decimals. Levels without nodes keep all-zero integer values.

    Args:
        eid: Event identifier. Not used by this function; kept for the
            existing call signature.
        tree: Object exposing ``all_nodes_itr()`` and ``depth(node=...)``
            (presumably a ``treelib.Tree`` - confirm against the caller).
            Each node's ``data`` must provide ``reposts_count``,
            ``comments_count``, ``bi_followers_count``, ``friends_count``,
            ``verified``, ``followers_count``, ``statuses_count``,
            ``gender`` and ``favourites_count``.
        label: Event label. Not used by this function; kept for the
            existing call signature.
        max_level: Deepest tree level to collect features for. Defaults to
            10, the limit that was previously hard-coded.

    Returns:
        dict mapping each level ``1..max_level`` to a dict with the keys
        ``count``, ``rep_count``, ``comments_count``,
        ``bi_followers_count``, ``friends_count``, ``verified_count``,
        ``followers_count``, ``statuses_count``, ``male_count``,
        ``female_count`` and ``favourites_count``.
    """
    # Feature keys in their output order. Text and sentiment features
    # (text length, POS ratios, sentiment counts, ...) were only sketched as
    # commented-out placeholders in the original and are not implemented.
    feature_keys = (
        'count',
        # Social-network features.
        'rep_count',
        'comments_count',
        # User features.
        'bi_followers_count',
        'friends_count',
        'verified_count',
        'followers_count',
        'statuses_count',
        'male_count',  # gender == 'm'
        'female_count',  # gender == 'f'
        'favourites_count',
    )
    # Feature key -> attribute of node.data that is summed into it.
    summed_attrs = (
        ('rep_count', 'reposts_count'),
        ('comments_count', 'comments_count'),
        ('bi_followers_count', 'bi_followers_count'),
        ('friends_count', 'friends_count'),
        ('followers_count', 'followers_count'),
        ('statuses_count', 'statuses_count'),
        ('favourites_count', 'favourites_count'),
    )

    features = {
        level: dict.fromkeys(feature_keys, 0)
        for level in range(1, max_level + 1)
    }

    # Accumulate raw sums level by level.
    for node in tree.all_nodes_itr():
        level = tree.depth(node=node)
        if not 0 < level <= max_level:
            continue  # Skip the root and anything deeper than max_level.

        data = node.data
        bucket = features[level]
        bucket['count'] += 1
        for key, attr in summed_attrs:
            bucket[key] += getattr(data, attr)

        # Equality (not truthiness) is kept on purpose so that values such
        # as non-empty strings are not counted as verified.
        if data.verified == True:  # noqa: E712
            bucket['verified_count'] += 1

        if data.gender == 'm':
            bucket['male_count'] += 1
        elif data.gender == 'f':
            bucket['female_count'] += 1

    # Turn the sums of every populated level into per-node averages.
    for bucket in features.values():
        node_count = bucket['count']
        if node_count != 0:
            for key in feature_keys:
                if key != 'count':
                    bucket[key] = round(bucket[key] / node_count, 2)

    return features
def cal_node_level_count(type, data_dir='D:/chenjiao/SinaWeibo/datasets2'):
    """Print how many nodes sit on each level of the propagation trees.

    Reads the tab-separated index file ``<data_dir>/Weibo.txt`` (first
    column ``eid:<id>``, second column ``label:<0|1>``), builds one tree
    per selected event from ``<data_dir>/Weibo/<id>.json`` and prints:

    * a ``----- <id>`` progress line per event,
    * the dict of total node counts per level over all selected events,
    * one ``<level> <average nodes per event>`` line per level.

    Args:
        type: ``'fake'`` keeps only rows labelled ``label:1``, ``'real'``
            only rows labelled ``label:0``; any other value keeps every
            row. (The name shadows the builtin but is part of the existing
            interface.)
        data_dir: Directory holding ``Weibo.txt`` and the ``Weibo/``
            folder of per-event JSON files. Defaults to the location that
            was previously hard-coded.

    Returns:
        None. Results are only printed.
    """
    data = pd.read_csv('{}/Weibo.txt'.format(data_dir), sep='\t', header=None)
    if type == 'fake':
        data = data.loc[data[1] == 'label:1']
    elif type == 'real':
        data = data.loc[data[1] == 'label:0']

    event_count = len(data)
    level_totals = {}

    # Iterate the id column directly; DataFrame.as_matrix() no longer exists
    # in current pandas releases.
    for raw_eid in data[0]:
        eid = str(raw_eid).replace('eid:', '')
        json_path = '{}/Weibo/{}.json'.format(data_dir, eid)
        with open(json_path, 'r', encoding='utf-8') as load_f:
            json_data = json.load(load_f)
        print('-----', eid)

        # The first record is the source post and becomes the root.
        tree = Tree()
        root_mid = json_data[0].get("mid")
        tree.create_node(root_mid, root_mid)
        for post in json_data[1:]:
            mid = post.get("mid")
            try:
                tree.create_node(mid, mid, parent=post.get("parent"))
            except Exception:
                # Best effort: posts that cannot be attached (presumably an
                # unknown parent or a duplicate mid - confirm) are skipped.
                continue

        # Count the nodes on every level of this tree.
        for node in tree.all_nodes_itr():
            level = tree.depth(node=node)
            level_totals[level] = level_totals.get(level, 0) + 1

    # Report levels in ascending order.
    tree_levels_count_list = {
        level: level_totals[level] for level in sorted(level_totals)
    }
    print(tree_levels_count_list)
    for level, total in tree_levels_count_list.items():
        print(level, total / event_count)