def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def dump_author_embs():
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')  # IDF values computed by the previous function: {feature: idf}
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)  # author-feature lmdb
    LMDB_NAME_EMB = "author_100.emb.weighted"  # (pid-j, x^-)
    lc_emb = LMDBClient(LMDB_NAME_EMB)  # author-embedding lmdb
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # iterate over feature records
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # decode the paper id (pid-j)
            features = data_utils.deserialize_embedding(k[1])  # deserialize the author-feature object
            cur_emb = emb_model.project_embedding(features, idf)  # IDF-weighted average embedding x^-
            if cur_emb is not None:
                lc_emb.set(pid_order, cur_emb)  # save (pid-j, x^-) into the author_100.emb.weighted lmdb
            else:
                print(pid_order)
def testDataRun():
    cnt = 0
    metrics = np.zeros(3)
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'), 'w', encoding='utf-8')
    LMDB_NAME_EMB = "graph_auto_encoder_embedding"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    name_to_pubs_test = load_test_names()
    for name in name_to_pubs_test:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(name=name, needtSNE=True)
        print(name, prec, rec, f1)
        wf.write('{0},{1:.5f},{2:.5f},{3:.5f}\n'.format(name, prec, rec, f1))
        wf.flush()
        metrics[0] += prec
        metrics[1] += rec
        metrics[2] += f1
        cnt += 1
        for pid, embedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, embedding)
    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = eval_utils.cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(macro_prec, macro_rec, macro_f1))
    wf.close()
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
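# `get_hidden_output` is called above but not defined in this file. A minimal sketch of one way
# it could work, assuming the trained global model is a Keras model and that the hidden layer of
# interest is the model's third-to-last layer (both are assumptions for illustration, not taken
# from this code):
from keras import backend as K

def get_hidden_output_sketch(model, inp):
    # Build a backend function from the model's input to an intermediate layer's output,
    # then run it in test mode (learning_phase = 0, i.e. dropout disabled).
    get_activations = K.function(model.inputs[:1] + [K.learning_phase()],
                                 [model.layers[-3].output])
    activations = get_activations([inp, 0])
    return activations[0]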
def dump_author_embs():
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    LMDB_NAME_EMB = "author_100.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')
            # print("pid_order: ", pid_order)
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                # print("pid_order: is not none", pid_order)
                lc_emb.set(pid_order, cur_emb)
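# `emb_model.project_embedding(features, idf)` above is the step the docstring describes: an
# IDF-weighted average of the word vectors of an author's feature tokens. A minimal sketch of that
# computation, assuming a gensim-style keyed-vector lookup `word_vectors` and an `idf` dict mapping
# token -> IDF weight (names are illustrative, not the project's actual API):
import numpy as np

def project_embedding_sketch(word_vectors, features, idf):
    vecs, weights = [], []
    for token in features:
        if token in word_vectors and token in idf:
            vecs.append(word_vectors[token])
            weights.append(idf[token])
    if not vecs:
        return None  # mirrors the `cur_emb is not None` check in dump_author_embs
    vecs = np.asarray(vecs, dtype=float)
    weights = np.asarray(weights, dtype=float)
    # Weighted average: sum_i(idf_i * vec_i) / sum_i(idf_i)
    return weights.dot(vecs) / weights.sum()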
def dump_file_todict():
    # Read the CSV in chunks, mainly via the chunksize/iterator parameters.
    yy = pd.read_csv('/home/wss/sites/disamb/sci_process/data/t_018_sci_disamb_string_precess.csv',
                     usecols=['uid', 'author', 'title', 'abstract', 'keyword', 'org_name', 'pubyear', 'source'],
                     sep=',', iterator=True, encoding='utf-8')
    # df = yy.get_chunk(1)
    # print(len(df))
    # print(df.columns)
    # print(df.head)
    loop = True
    chunkSize = 5000
    cnt = 0
    lc = LMDBClient('sci_all_data')
    while loop:
        try:
            chunk = yy.get_chunk(chunkSize)
            cnt += 1
            print('18M SCI papers: stored %0.2f x 10k rows so far' % (cnt * 0.5))
            dataset_dict = chunk.to_dict(orient='records')
            for pap in dataset_dict:
                pap['author'] = pap['author'].split('|')
                pid_order = pap['uid']
                lc.set(pid_order, pap)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
    lc.db.close()
    print("Chunked load into the lmdb database took: %0.2f s" % (time.time() - start))
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
def testHAN():
    LMDB_NAME_EMB = "lc_attention_network_embedding2"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    name_to_pubs_train = load_train_names()
    for name in name_to_pubs_train:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(name=name, ispretrain=True, needtSNE=False)
        for pid, attentionEmbedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, attentionEmbedding)
        print(name, prec, rec, f1)
def dump_author_features_to_cache():
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'), 'r', encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            author_features = items[1].split()
            lc.set(pid_order, author_features)
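# The loader above expects one record per line in author_features.txt: a paper-author key, a tab,
# then space-separated feature tokens. A hypothetical line (identifiers invented for illustration)
# and how it is parsed:
line = "paper123-2\t__NAME__jane_doe __ORG__example_university __TITLE__graph"
items = line.rstrip().split('\t')
pid_order = items[0]                # -> "paper123-2" (paper id, 2nd author)
author_features = items[1].split()  # -> ['__NAME__jane_doe', '__ORG__example_university', '__TITLE__graph']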
def dump_author_features_to_cache():
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'publication_IslandLoss.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(join(settings.ISLAND_LOSS_DIR, 'paper_features.txt'), 'r', encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            # print("pid_order: ", pid_order, items)
            author_features = items[1].split()
            lc.set(pid_order, author_features)
def dump_author_features_to_cache():  # dump author features into the local lmdb cache
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(LMDB_NAME)
    # The previous function wrote the features to author_features.txt; read them back here.
    with codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'), 'r', encoding='utf-8') as rf:
        for i, line in enumerate(rf):  # one author feature per line, formatted as pid-j\tauthor_feature
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')  # strip trailing whitespace, then split into pid-j and author_feature
            pid_order = items[0]  # document index pid-j (paper id, j-th author), matching the previous function's output format
            author_features = items[1].split()  # author features, space-separated into a list
            lc.set(pid_order, author_features)  # write to the lmdb database
def dump_author_features_to_cache(dataset_name):
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(dataset_name, LMDB_NAME)
    with codecs.open(join(settings.get_global_data_dir(dataset_name), 'author_features.txt'), 'r', encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            if len(items) > 1:
                author_features = items[1].split()
            else:
                author_features = []
            lc.set(pid_order, author_features)
def dump_features_to_cache():
    """
    generate author features by raw publication data and dump to cache
    """
    lc = LMDBClient('sci_all_data')
    lm = LMDBClient('sci_all_data_feature')
    cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            cnt += 1
            pid = k[0].decode()
            paper = data_utils.deserialize_embedding(k[1])
            if len(paper["author"]) > 100:
                print(cnt, pid, len(paper["author"]))
                continue
            features = extract_author_features(paper)
            if cnt % 10000 == 0:
                print('Extracted features for %d x 10k papers' % (cnt / 10000))
            lm.set(pid, features)
    lm.db.close()
    lc.db.close()
def dump_inter_emb(pids):
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"  # weighted-average embeddings x^- of 100-dim word2vec author features, keyed (pid-j, x^-)
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'  # (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)  # inner-embedding lmdb; the new embedding y of each test author is written here
    global_model = GlobalTripletModel(data_scale=1000000)  # instantiate a global model
    trained_global_model = global_model.load_triplets_model()  # load the trained global model
    embs_input = []
    valid_pids = []  # keep pids aligned with embs_input when some embeddings are missing
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            print("ERROR: not found embedding x for pid:%s\n" % pid)
            continue
        embs_input.append(cur_emb)
        valid_pids.append(pid)
    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    for i, pid in enumerate(valid_pids):
        lc_inter.set(pid, inter_embs[i])
    # name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')  # load the test set: name -> aid -> pid-j
for name in name_to_test:
    for aid in name_to_test[name]:
        if len(name_to_test[name][aid]) < 5:
            continue
        for pid in name_to_test[name][aid]:
            TestDataPids.append(pid)


def getRawEmbedding(pids):
    rawEmbedding = []
    for pid in pids:
        rawEmbedding.append(rawFeature.get(pid))
    rawEmbedding = np.array(rawEmbedding)
    return rawEmbedding


TestDataEmbedding = getRawEmbedding(TestDataPids)
transformEmbedding = raw2localTrans.predict(TestDataEmbedding)
LMDB_NAME_EMB = "raw_transform_local_embedding"
lc_emb = LMDBClient(LMDB_NAME_EMB)
# print(transformEmbedding)
for idx, pid in enumerate(TestDataPids):
    print(idx, pid)
    embedd = transformEmbedding[idx]
    lc_emb.set(pid, embedd)
print("done")
feature = tf.keras.layers.Lambda(l2Norm, name='norm_layer', output_shape=[64])(layer2)
logits = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(feature)
export_path = join(settings.ISLAND_LOSS_DIR, "feature_model")
saver = tf.train.Saver()
with tf.Session() as sess:
    # vali_acc = "0.47049925"
    vali_acc = "0.63398695"
    path = join(settings.ISLAND_LOSS_DIR, "200", "vali_acc_%s" % vali_acc, "feature_model")
    saver.restore(sess, path)
    # saver.restore(sess, join(settings.ISLAND_LOSS_DIR, "feature_model"))
    Features = sess.run(feature, feed_dict={input_images: AllX - MeanAllX, labels: Ally})
    # EmbedingCheck.check(Features, Ally, name="train2_embedding.jpg")
    for idx, pid in enumerate(pids):
        emb = Features[idx]
        lc.set(pid, emb)
    # EmbedingCheck.check(Features, TestY, name="train2_embedding.jpg")
def dump_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=10000000)
    trained_global_model = global_model.load_triplets_model()

    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train.json')
    for name in name_to_pubs_train:
        if name == "roger_williams":
            continue
        # print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    # print("pid emb is null: ", pid)
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test.json')
    for name in name_to_pubs_test:
        if name == "roger_williams" or name == "j_yu":
            continue
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    print("pid emb is null: ", pid)
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])