def cal_sim_pos(sent_a, sent_b):
    """Compute the part-of-speech (POS) similarity between two sentences.

    Each sentence is mapped to a normalized frequency vector over its POS
    tags (as returned by ``sentence_utils.get_pos``), and the cosine
    similarity of the two vectors is returned.

    :param sent_a: first sentence (passed through to ``sentence_utils.get_pos``)
    :param sent_b: second sentence
    :return: cosine similarity in [0, 1]; 0 when either vector is empty/zero
    """

    def __cal_vecmod(pos_dict):
        # Euclidean norm of the frequency vector.
        return math.sqrt(sum(v ** 2 for v in pos_dict.values()))

    def __build_pos_dist(pos_pairs):
        # Build {pos_tag: relative_frequency} for one sentence.
        # Guard: an empty POS list would otherwise cause ZeroDivisionError.
        dist = {}
        if not pos_pairs:
            return dist
        weight = 1.0 / len(pos_pairs)
        for pair in pos_pairs:
            # pair is (token, pos_tag); index 1 is the POS tag.
            dist[pair[1]] = dist.get(pair[1], 0.0) + weight
        return dist

    pos_a_dict = __build_pos_dist(sentence_utils.get_pos(sent_a))
    pos_b_dict = __build_pos_dist(sentence_utils.get_pos(sent_b))

    # Dot product over the tags the two sentences share.
    fenzi = 0.0
    for key_a in pos_a_dict:
        if key_a in pos_b_dict:
            fenzi += pos_a_dict[key_a] * pos_b_dict[key_a]

    # Product of vector norms; zero means at least one empty vector.
    fenmu = __cal_vecmod(pos_a_dict) * __cal_vecmod(pos_b_dict)
    if fenmu == 0:
        return 0
    return fenzi / fenmu
def get_graph_map_by_json(data_json, is_train=False):
    """Build an undirected adjacency map from a JSON graph description.

    For every edge, both endpoints get an entry keyed by node id with:
    ``content`` (text after the first '-' in the node label), ``neighbors``
    (list of adjacent node ids; duplicates possible for multi-edges),
    ``label`` (randomly chosen '0'/'1' — placeholder until real labels
    exist), ``datetime`` (parsed from ``data_json['datetimes']``), and
    ``feature.pos`` (POS tags of the node's own content).

    :param data_json: dict with 'edges', 'nodes', and 'datetimes' keys
    :param is_train: currently unused; kept for interface compatibility
    :return: dict mapping node id -> node info dict
    """
    # Single datetime format used throughout the input JSON.
    datetime_fmt = u'YYYY年MM月DD日 HH:mm'

    def __node_entry(node_id, neighbor_id):
        # Build a fresh entry for node_id with neighbor_id as first neighbor.
        content = data_json['nodes'][int(node_id)]['label'].split('-')[1]
        return {
            'content': content,
            'neighbors': [neighbor_id],
            'label': random.choice(['0', '1']),
            # BUGFIX: the original target branch parsed the *source* node's
            # datetime and omitted tzinfo, yielding wrong + naive datetimes.
            # Both branches now use the node's own datetime, localized.
            'datetime': arrow.get(data_json['datetimes'][node_id],
                                  datetime_fmt, tzinfo=tz.tzlocal()),
            # BUGFIX: the original target branch computed POS features from
            # the *source* node's content; use the node's own content.
            'feature': {'pos': get_pos(content)},
        }

    graph_map = {}
    for edge in data_json['edges']:
        source = edge['source']
        target = edge['target']
        if source in graph_map:
            graph_map[source]['neighbors'].append(target)
        else:
            graph_map[source] = __node_entry(source, target)
        if target in graph_map:
            graph_map[target]['neighbors'].append(source)
        else:
            graph_map[target] = __node_entry(target, source)
    return graph_map