def gen_local_data(pids, labels, idf_threshold=10):
    """
    Generate local data (paper features and the paper network) for each associated name.
    :param idf_threshold: threshold for determining whether there exists an edge
        between two papers (for this demo we set 29)
    """
    # Load the IDF value of each feature: {word: idf}.
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    # Inner embeddings of papers after triplet training: (pid-j, y).
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    # Raw author features: (pid-j, author_feature).
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    # Create the output directory for the local model data.
    graph_dir = join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)

    name = "Name"
    wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
    shuffle(pids)  # shuffle the papers
    for i, pid in enumerate(pids):
        cur_pub_emb = lc_inter.get(pid)  # fetch the paper embedding y
        if cur_pub_emb is not None:
            cur_pub_emb = list(map(str, cur_pub_emb))  # convert the embedding to strings
            wf_content.write('{}\t'.format(pid))      # paper id
            wf_content.write('\t'.join(cur_pub_emb))  # embedding y
            wf_content.write('\t{}\n'.format(pid))    # pid
        else:
            print("ERROR: not found embedding y for pid:%s\n" % pid)
    wf_content.close()  # lines of: pid-j, y, aid

    # Generate the paper network: one edge (pid-j, pid-j) per line.
    n_pubs = len(pids)
    wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
    edges_num = 0
    for i in range(n_pubs - 1):  # enumerate paper i
        # Raw features of paper i: (pid-j, author_feature).
        author_feature1 = set(lc_feature.get(pids[i]))
        for j in range(i + 1, n_pubs):  # enumerate every later paper j
            author_feature2 = set(lc_feature.get(pids[j]))  # raw features of paper j
            common_features = author_feature1.intersection(author_feature2)  # shared features
            idf_sum = 0
            for f in common_features:  # sum the IDF values of the shared features
                idf_sum += idf.get(f, idf_threshold)
                # print(f, idf.get(f, idf_threshold))
            if idf_sum >= idf_threshold:  # the sum reaches the threshold
                # Add an edge (pid-j, pid-j) to the network file.
                wf_network.write('{}\t{}\n'.format(pids[i], pids[j]))
                edges_num = edges_num + 1
    print('n_edges', edges_num)
    wf_network.close()
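# Hypothetical sketch (not part of the repo): gen_local_data connects two papers when the
# summed IDF of their shared author features reaches idf_threshold. The helper below isolates
# that rule; the feature strings and IDF values are made up for illustration only.
def has_edge(features_a, features_b, idf, idf_threshold=10):
    """Return True if the IDF-weighted feature overlap reaches the threshold."""
    common = set(features_a) & set(features_b)
    idf_sum = sum(idf.get(f, idf_threshold) for f in common)
    return idf_sum >= idf_threshold

# Toy usage: a single strong shared feature is enough to pass the default threshold.
# has_edge(['org:mit', 'coauthor:li_wei'], ['coauthor:li_wei'], {'coauthor:li_wei': 12.5})  -> True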
def dump_inter_emb():
    """
    Dump hidden embeddings from the trained global model for the local model to use.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')

    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
def test(idf_threshold):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)

    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content (e.g. 286 hongbin_li_pubs_content.txt)
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)

        # generate network1
        all_idf_sum = 0
        pathCnt = 0
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        for i in range(n_pubs - 1):
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                all_idf_sum += idf_sum
                if idf_sum >= idf_threshold:
                    pathCnt = pathCnt + 1
        if name == "kexin_xu":
            print("all_idf_sum: ", all_idf_sum)
            print("pathCnt: ", pathCnt)
def dump_inter_emb():
    """
    Dump hidden embeddings from the trained global model for the local model to use.
    """
    Res = defaultdict(list)
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    # print(name_to_pubs_test)
    for name in name_to_pubs_test:
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
            Res[pid_].append(inter_embs[i])

    # the same as the train data
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')
    for name in name_to_pubs_train:
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def test_prepare_local_data(Name):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    # cnt = 0
    wf_contents = []
    for i, name in enumerate(name_to_pubs_test):
        if name != Name:
            continue
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            # if len(items) < 5:
            #     continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)
                wf_contents.append({'pid': pid, 'label': pids2label[pid]})
                # cur_pub_emb = list(map(str, cur_pub_emb))
                # wf_content.write('{}\t'.format(pid))
                # wf_content.write('\t'.join(cur_pub_emb))
                # wf_content.write('\t{}\n'.format(pids2label[pid]))

    PidsLabels = [x['label'] for x in wf_contents]
    print(len(set(PidsLabels)))
def process_by_name(pids):
    ### preprocessing
    print('n_papers: ', len(pids))
    if len(pids) < 10:
        print("too few papers, continue...")
        return

    ### prepare_local_data
    IDF_THRESHOLD = 32
    dump_inter_emb(pids)
    gen_local_data(idf_threshold=IDF_THRESHOLD, pids=pids, labels=None)

    ### count_size
    LMDB_NAME = "author_100.emb.weighted"  # (pid-j, x^-)
    lc = LMDBClient(LMDB_NAME)  # weighted-average author feature embeddings (x^-)
    k = 300
    test_x = []
    x = []
    # Sample k paper features x^- under this name and collect them into one list.
    sampled_points = [pids[p] for p in np.random.choice(len(pids), k, replace=True)]
    for p in sampled_points:
        x.append(lc.get(p))  # fetch the feature x^- from the database
    test_x.append(np.stack(x))
    test_x = np.stack(test_x)

    model_dir = join(settings.OUT_DIR, 'model')  # model directory
    rf = open(join(model_dir, 'model-count.json'), 'r')  # load the model architecture
    model_json = rf.read()
    rf.close()
    loaded_model = model_from_json(model_json)
    loaded_model.load_weights(join(model_dir, 'model-count.h5'))  # load the model weights
    kk = loaded_model.predict(test_x)
    print('num_pred:', kk)

    ### local\gae\train
    ret = gae_for_na('Name', int(kk[0][0]))
    return ret
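# Hypothetical driver (not part of the repo): process_by_name expects the list of paper ids
# sharing one ambiguous author name, dumps their triplet embeddings, builds the local graph,
# predicts a cluster count, and runs the GAE clustering. The file name 'name_to_pids.json'
# and this call shape are assumptions for illustration; only process_by_name comes from above.
if __name__ == '__main__':
    name_to_pids = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pids.json')
    for name, pids in name_to_pids.items():
        clusters = process_by_name(pids)  # returns None when fewer than 10 papers
        print(name, clusters)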
def dump_inter_emb(pids):
    """
    Dump the hidden-layer embeddings of the trained global model for the local model to use.
    """
    # Weighted-average embeddings (x^-) of the 100-dim Word2Vec author features: (pid-j, x^-).
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    # Inner embedding database; the new embeddings y are written here: (pid-j, y).
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)  # instantiate the global model
    trained_global_model = global_model.load_triplets_model()  # load the trained global model

    embs_input = []
    valid_pids = []  # keep pids aligned with embs_input when some embeddings are missing
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            print("ERROR: not found embedding x for pid:%s\n" % pid)
            continue
        embs_input.append(cur_emb)
        valid_pids.append(pid)
    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    for i, pid in enumerate(valid_pids):
        lc_inter.set(pid, inter_embs[i])
def gen_local_data(idf_threshold):
    """
    Generate local data (including paper features and paper network) for each associated name.
    :param idf_threshold: threshold for determining whether there exists an edge between
        two papers (for this demo we set 29)
    """
    AuthorSocial = inputData.loadAuthorSocial()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)

    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content (e.g. 286 hongbin_li_pubs_content.txt)
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}'.format(pids2label[pid]))
                LabelId = getLabelId(pid[:IDLength], name)
                wf_content.write('\t{}\n'.format(LabelId))
        wf_content.close()

        # generate network1
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
        for i in range(n_pubs - 1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()

        def CountNumber(A, B):
            res = 0
            for x in A:
                for y in B:
                    if x == y:
                        res = res + 1
            return res

        # generate network2 (edges from shared social/co-author links)
        wf_network = open(join(graph_dir, '{}_pubs_network2.txt'.format(name)), 'w')
        for i in range(n_pubs - 1):
            for j in range(i + 1, n_pubs):
                Graph1Socials = AuthorSocial[pids_filter[i]]
                Graph2Socials = AuthorSocial[pids_filter[j]]
                if CountNumber(Graph1Socials, Graph2Socials) >= 1:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()
def dump_emb():
    """
    Dump hidden embeddings from the trained global model for the local model to use.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=10000000)
    trained_global_model = global_model.load_triplets_model()

    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train.json')
    for name in name_to_pubs_train:
        if name == "roger_williams":
            continue
        # print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    # print("pid emb is null: ", pid)
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test.json')
    for name in name_to_pubs_test:
        if name == "roger_williams" or name == "j_yu":
            continue
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    print("pid emb is null: ", pid)
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def prepareData(type='train'):
    # SNALabelDict, numberofCluss = preprocessSNALabels()
    # TestLabelDict, TestLabelNumberofCluss = preprocessTestLabels()
    if type == 'train':
        LabelDict, numberofCluss = preprocessLabels()
    else:
        LabelDict, numberofCluss = preprocessSNALabels()
    print("LabelDict : ", LabelDict)
    TrainPids = np.array(list(LabelDict.keys()))
    AllPids = np.array(TrainPids)
    print("AllPids : ", list(AllPids))
    if type == 'train':
        TrainPids, ValidPids = train_test_split(AllPids, stratify=list(LabelDict.values()), random_state=42)
    else:
        TrainPids, ValidPids = train_test_split(AllPids, random_state=42)
    # TrainPids, ValidPids = train_test_split(TrainPids, test_size=0.1, random_state=42)

    LMDB_NAME_EMB = "publication.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    AllX = []
    Ally = []
    TrainX = []
    TrainY = []
    ValidX = []
    ValidY = []
    Allpids = []
    for pid in TrainPids:
        emb = lc_emb.get(pid)
        label = LabelDict[pid]
        # print("pid: ", pid, ", label: ", label, ', emb: ', emb)
        if emb is None:
            continue
        Allpids.append(pid)
        AllX.append(emb)
        TrainX.append(emb)
        TrainY.append(label)
        Ally.append(label)
    for pid in ValidPids:
        emb = lc_emb.get(pid)
        label = LabelDict[pid]
        # print("pid: ", pid, ", label: ", label, ', emb: ', emb)
        if emb is None:
            continue
        Allpids.append(pid)
        AllX.append(emb)
        ValidX.append(emb)
        ValidY.append(label)
        Ally.append(label)
    return (np.array(TrainX), np.array(TrainY), np.array(ValidX), np.array(ValidY),
            numberofCluss, AllX, Ally, Allpids)
def disambiguate(name_pubs, number):
    print('Run task (%s)...' % (os.getpid()))
    start1 = time.time()
    lc1 = LMDBClient('sci_all_data')
    result = {}
    for n, name in enumerate(name_pubs):
        pubs = name_pubs[name]  # all papers under this author name
        print(n, name, len(pubs))
        if len(pubs) == 0:
            result[name] = []
            continue
        result1 = []
        if len(pubs) <= 5:
            result[name] = []
            for i, pid in enumerate(pubs):
                result1.append(pid[0])
            result[name].append(result1)
            continue

        ###############################################################
        # Save relations.
        name_pubs_raw = {}
        for i, pid in enumerate(pubs):
            paper = lc1.get(pid[0])
            paper['org_name'] = pid[1]
            paper.pop('abstract')
            paper.pop('uid')
            name_pubs_raw[pid[0]] = paper
        save_relation(name_pubs_raw, name, number)
        # print('save features down')

        ###############################################################
        # Meta-path walk generator.
        mpg = MetaPathGenerator()
        mpg.read_data("gene", number)
        # print('path down')

        ###############################################################
        # Relation embeddings of the papers, averaged over several walk runs (bagging-style).
        all_embs = []
        rw_num = 3
        cp = set()  # isolated nodes
        for k in range(rw_num):
            mpg.generate_WMRW("gene/RW_{}.txt".format(number), 5, 10)  # generate the walk corpus
            sentences = word2vec.Text8Corpus(r'gene/RW_{}.txt'.format(number))
            model = word2vec.Word2Vec(sentences, size=100, negative=25, min_count=1, window=10)
            embs = []
            for i, pid in enumerate(pubs):
                if pid[0] in model.wv:
                    embs.append(model.wv[pid[0]])
                else:
                    cp.add(i)
                    embs.append(np.zeros(100))
            all_embs.append(embs)
        all_embs = np.array(all_embs)
        # print('real emb down')

        ###############################################################
        # Text embeddings of the papers (currently disabled).
        # ptext_emb = load_data('gene', 'ptext_emb_{}.pkl'.format(number))
        # tcp = load_data('gene', 'tcp_{}.pkl'.format(number))
        # tembs = []
        # for i, pid in enumerate(pubs):
        #     tembs.append(ptext_emb[pid[0]])
        # # print('paper emb down')

        ###############################################################
        # Similarity of the network embeddings.
        sk_sim = np.zeros((len(pubs), len(pubs)), dtype='float16')
        for k in range(rw_num):
            sk_sim = sk_sim + pairwise_distances(all_embs[k], metric="cosine")
        sk_sim = sk_sim / rw_num

        # Text similarity (currently disabled).
        # t_sim = pairwise_distances(tembs, metric="cosine")
        # if tcp >= len(pubs) / 2:
        sim = np.array(sk_sim)
        # else:
        #     w = 1  # fusion weight for the two similarity matrices
        #     sim = (np.array(sk_sim) + w * np.array(t_sim)) / (1 + w)

        ###############################################################
        # Disambiguation clustering.
        pre = DBSCAN(eps=0.2, min_samples=1, metric="precomputed").fit_predict(sim)

        # Outlier nodes.
        outlier = set()
        for i in range(len(pre)):
            if pre[i] == -1:
                outlier.add(i)
        for i in cp:
            outlier.add(i)

        # Label each outlier node based on the tanimoto similarity matrix.
        paper_pair = generate_pair(pubs, outlier, number)
        paper_pair1 = paper_pair.copy()
        K = len(set(pre))
        for i in range(len(pre)):
            if i not in outlier:
                continue
            j = np.argmax(paper_pair[i])
            while j in outlier:
                paper_pair[i][j] = -1
                j = np.argmax(paper_pair[i])
            if paper_pair[i][j] >= 1.5:
                pre[i] = pre[j]
            else:
                pre[i] = K
                K = K + 1

        # Check whether outlier nodes share the same label: match outliers against each other,
        # using 1.5 as the similarity threshold.
        for ii, i in enumerate(outlier):
            for jj, j in enumerate(outlier):
                if jj <= ii:
                    continue
                else:
                    if paper_pair1[i][j] >= 1.5:
                        pre[j] = pre[i]

        # Store the disambiguation result.
        result[name] = []
        for lab in set(pre):
            sameauthor = []
            for index, lab1 in enumerate(pre):
                if lab == lab1:
                    sameauthor.append(pubs[index][0])
            result[name].append(sameauthor)
        print('number of authors after disambiguation: %d' % (len(result[name])))

    dump_json(result, "output", "sci_result_1_10_all_{}.json".format(number), indent=4)
    print('task %s run %0.2f seconds.' % (os.getpid(), (time.time() - start1)))
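# Minimal sketch (not part of the repo; toy random vectors, eps copied from above): averaging
# cosine-distance matrices over several walk-embedding runs and clustering the precomputed
# matrix with DBSCAN, mirroring what disambiguate() does for one author name.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

embs_run1 = np.random.rand(6, 100)  # stand-in for one random-walk embedding run
embs_run2 = np.random.rand(6, 100)  # stand-in for another run
sim = (pairwise_distances(embs_run1, metric="cosine") +
       pairwise_distances(embs_run2, metric="cosine")) / 2
labels = DBSCAN(eps=0.2, min_samples=1, metric="precomputed").fit_predict(sim)
print(labels)  # papers with the same label are assigned to the same author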
cur_author = name_to_pubs_train[name]
pids = []
labels = []
rf = []
tf = []
attentionf = []
for aid in cur_author:
    if len(cur_author[aid]) < 5:
        continue
    for pid in cur_author[aid]:
        pids.append(pid)
        labels.append(aid)
        rf.append(rawFeature.get(pid))
        tf.append(tripletFeature.get(pid))
        attentionf.append(lc_emb.get(pid))

labels = encode_labels(labels)
numberofLabels = len(set(labels))

def clusterTest(embedding, numberofLabels):
    clusters_pred = clustering(embedding, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    return [prec, rec, f1]

tSNEAnanlyse(rf, labels, join(settings.PIC_DIR, "FINALResult", "%s_rawFeature.png" % (name)))
tSNEAnanlyse(tf, labels, join(settings.PIC_DIR, "FINALResult", "%s_tripletFeature.png" % (name)))
def genPAPandPSP(self, authorName="hongbin_li", idf_threshold=10):
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    raw_word2vec = 'author_100.emb.weighted'
    lc_emb = LMDBClient(raw_word2vec)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    cur_person_dict = self.name2pubs_train[authorName]
    pids_set = set()
    pids = []
    pids2label = {}
    print("pass0")
    graph_dir = join(settings.DATA_DIR, 'AttentionNetwork', 'graph-{}'.format(idf_threshold))

    # generate content
    wf_content = open(join(graph_dir, '{}_feature_and_label.txt'.format(authorName)), 'w')
    for i, aid in enumerate(cur_person_dict):
        personPids = cur_person_dict[aid]
        # Skip authors with fewer than 5 papers.
        if len(personPids) < 5:
            continue
        print("aid: ", aid, ", pids: ", pids)
        for pid in personPids:
            pids2label[str(pid)] = str(aid)
            pids.append(pid)
    print("pass1")

    for pid in pids:
        # Use the raw feature rather than the triplet-loss embedding.
        cur_pub_emb = lc_emb.get(pid)
        # cur_pub_emb = lc_inter.get(pid)
        if cur_pub_emb is not None:
            cur_pub_emb = list(map(str, cur_pub_emb))
            pids_set.add(pid)
            wf_content.write('{}\t'.format(pid))
            wf_content.write('\t'.join(cur_pub_emb))
            wf_content.write('\t{}\n'.format(pids2label[pid]))
    wf_content.close()
    print("pass2")

    # generate network1
    pids_filter = list(pids_set)
    n_pubs = len(pids_filter)
    print('n_pubs', n_pubs)
    wf_network = open(join(graph_dir, '{}_PAP.txt'.format(authorName)), 'w')
    for i in range(n_pubs - 1):
        if i % 10 == 0:
            print(i)
        author_feature1 = set(lc_feature.get(pids_filter[i]))
        for j in range(i + 1, n_pubs):
            author_feature2 = set(lc_feature.get(pids_filter[j]))
            # print('author_feature2: ', author_feature2)
            common_features = author_feature1.intersection(author_feature2)
            idf_sum = 0
            for f in common_features:
                idf_sum += idf.get(f, idf_threshold)
                # print(f, idf.get(f, idf_threshold))
            if idf_sum >= idf_threshold:
                wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
    wf_network.close()

    def CountNumber(A, B):
        res = 0
        for x in A:
            for y in B:
                if x == y:
                    res = res + 1
        return res

    print("pass3")
    wf_network = open(join(graph_dir, '{}_PSP.txt'.format(authorName)), 'w')
    for i in range(n_pubs - 1):
        for j in range(i + 1, n_pubs):
            Graph1Socials = self.AuthorSocial[pids_filter[i]]
            Graph2Socials = self.AuthorSocial[pids_filter[j]]
            if CountNumber(Graph1Socials, Graph2Socials) >= Author_THRESHOLD:
                wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
    wf_network.close()
def gen_local_data(idf_threshold=10):
    """
    Generate local data (including paper features and paper network) for each associated name.
    :param idf_threshold: threshold for determining whether there exists an edge between
        two papers (for this demo we set 29)
    """
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)

    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        # generate network
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
        for i in range(n_pubs - 1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()
from utils import data_utils
from utils import settings
import codecs
from os.path import join
from datetime import datetime
from utils import feature_utils
from global_.embedding import EmbeddingModel
from utils.cache import LMDBClient
from collections import defaultdict as dd
import math

pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
LMDB_NAME_EMB = "publication.emb.weighted"
lc_emb = LMDBClient(LMDB_NAME_EMB)
cnt = 0
for i, pid in enumerate(pubs_dict):
    if i % 1000 == 0:
        print("idx: %d" % (i))
    emb = lc_emb.get(pid)
    if emb is None:
        print("%s emb is null" % (pid))
        cnt = cnt + 1
print("cnt: %d" % (cnt))
"gang_yin", "gang_zou", "guo_hua_zhang", "h_hu", "hai_yan_chen", "hai_yang_li" ] for name in names: name_data = name_to_pubs_train[name] res_embs = [] embs_input = [] labels = [] pids = [] for i, aid in enumerate(name_data.keys()): if len(name_data[aid]) < 5: # n_pubs of current author is too small continue for pid in name_data[aid]: cur_emb = lc_input.get(pid) if cur_emb is None: continue embs_input.append(cur_emb) pids.append(pid) labels.append(aid) embs_input = np.stack(embs_input) inter_embs = get_hidden_output(trained_global_model, embs_input) labels = encode_labels(labels) for i, pid_ in enumerate(pids): res_embs.append(inter_embs[i]) # Clustering and save the result tSNEAnanlyse(