def dump_author_embs():
    """Dump author embeddings to LMDB.

    Each author embedding x^- is the IDF-weighted average of the word
    vectors of that author's features (computed by
    ``EmbeddingModel.project_embedding``).
    """
    emb_model = EmbeddingModel.Instance()
    # IDF table produced by the previous step (cal_feature_idf): {feature: idf}
    idf = data_utils.load_data(
        settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # key: pid-j, value: author_feature
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)  # connect to the author-feature LMDB
    LMDB_NAME_EMB = "author_100.emb.weighted"  # key: pid-j, value: x^-
    lc_emb = LMDBClient(LMDB_NAME_EMB)  # connect to the author-embedding LMDB
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # iterate over every stored author feature
            if cnt % 1000 == 0:
                # progress heartbeat; start_time is a module-level timestamp
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # decode key bytes -> paper id "pid-j"
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize the stored author-feature object
            cur_emb = emb_model.project_embedding(
                features, idf)  # IDF-weighted average embedding x^-
            if cur_emb is not None:
                lc_emb.set(
                    pid_order, cur_emb
                )  # store (pid-j, x^-) into the author_100.emb.weighted LMDB
            else:
                print(pid_order)  # no embedding could be computed for this key
def dump_author_embs():
    """Dump author embeddings to LMDB.

    Each author embedding is the IDF-weighted average of the word
    vectors of that author's features (via
    ``EmbeddingModel.project_embedding``).
    """
    emb_model = EmbeddingModel.Instance()
    # IDF table produced by cal_feature_idf(): {feature: idf}
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # key: pid-j, value: author features
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    LMDB_NAME_EMB = "author_100.emb.weighted"  # key: pid-j, value: weighted embedding
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                # progress heartbeat; start_time is a module-level timestamp
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # key bytes -> paper id "pid-j"
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                lc_emb.set(pid_order, cur_emb)
            else:
                # Log keys whose embedding could not be computed instead of
                # dropping them silently (matches the sibling variant of this
                # routine elsewhere in the project).
                print(pid_order)
# NOTE(review): this chunk starts mid-stream — the statements before the
# __main__ guard read like the tail of dump_author_embs() (emb_model is used
# but never defined here). Confirm their real enclosing scope against the
# full file before relying on top-level placement.
idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
print('idf loaded')
LMDB_NAME_FEATURE = 'pub_authors.feature'  # key: pid-j, value: author features
lc_feature = LMDBClient(LMDB_NAME_FEATURE)
LMDB_NAME_EMB = "author_100.emb.weighted"  # key: pid-j, value: weighted embedding
lc_emb = LMDBClient(LMDB_NAME_EMB)
cnt = 0
with lc_feature.db.begin() as txn:
    for k in txn.cursor():
        if cnt % 1000 == 0:
            print('cnt', cnt, datetime.now() - start_time)
        cnt += 1
        pid_order = k[0].decode('utf-8')  # key bytes -> paper id "pid-j"
        features = data_utils.deserialize_embedding(k[1])
        cur_emb = emb_model.project_embedding(features, idf)
        if cur_emb is not None:
            lc_emb.set(pid_order, cur_emb)


if __name__ == '__main__':
    """
    some pre-processing
    """
    dump_author_features_to_file()    # write author features to a local file
    dump_author_features_to_cache()   # write author features into the LMDB cache
    emb_model = EmbeddingModel.Instance()
    emb_model.train('aminer')  # training word embedding model
    cal_feature_idf()          # compute per-feature inverse document frequency
    dump_author_embs()         # dump author embeddings into LMDB
    print('done', datetime.now() - start_time)
# NOTE(review): this chunk starts mid-stream — the loop before the __main__
# guard reads like the tail of dump_author_embs() (lc_feature, lc_emb,
# emb_model and idf are used but never defined here). Confirm the real
# enclosing scope against the full file.
cnt = 0
with lc_feature.db.begin() as txn:
    for k in txn.cursor():  # iterate over every stored author feature
        if cnt % 1000 == 0:
            print('cnt', cnt, datetime.now() - start_time)
        cnt += 1
        pid_order = k[0].decode('utf-8')  # decode key bytes -> paper id "pid-j"
        features = data_utils.deserialize_embedding(
            k[1])  # deserialize the stored author-feature object
        cur_emb = emb_model.project_embedding(
            features, idf)  # IDF-weighted average embedding x^-
        if cur_emb is not None:
            lc_emb.set(
                pid_order, cur_emb
            )  # store (pid-j, x^-) into the author_100.emb.weighted LMDB
        else:
            print(pid_order)  # no embedding could be computed for this key


if __name__ == '__main__':
    """
    some pre-processing
    """
    dump_author_features_to_file()    # write author features to a local file
    dump_author_features_to_cache()   # write author features into the cache
    emb_model = EmbeddingModel.Instance()  # instantiate the embedding model
    emb_model.train('aminer')  # training word embedding model Word2Vec
    cal_feature_idf()          # compute per-feature inverse document frequency
    dump_author_embs()         # dump author embeddings into the database
    print('done', datetime.now() - start_time)
# NOTE(review): this chunk starts mid-stream — the statements before the
# __main__ guard read like the tail of a dataset-parameterized
# dump_author_embs() (lc_feature, emb_model, idf and dataset_name are used
# but never defined here). Confirm the real enclosing scope against the
# full file.
LMDB_NAME_EMB = "author_100.emb.weighted"  # key: pid-j, value: weighted embedding
lc_emb = LMDBClient(dataset_name, LMDB_NAME_EMB)
cnt = 0
with lc_feature.db.begin() as txn:
    for k in txn.cursor():
        if cnt % 1000 == 0:
            print('cnt', cnt, datetime.now() - start_time)
        cnt += 1
        pid_order = k[0].decode('utf-8')  # key bytes -> paper id "pid-j"
        features = data_utils.deserialize_embedding(k[1])
        cur_emb = emb_model.project_embedding(features, idf)
        if cur_emb is not None:
            lc_emb.set(pid_order, cur_emb)


if __name__ == '__main__':
    """
    some pre-processing
    """
    # dataset to process; default matches the project's whoiswho_new layout
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_name", default="whoiswho_new", type=str)
    args = parser.parse_args()
    dataset_name = args.dataset_name
    dump_author_features_to_file(dataset_name)
    dump_author_features_to_cache(dataset_name)
    emb_model = EmbeddingModel.Instance(dataset_name)
    emb_model.train()  # training word embedding model
    cal_feature_idf()
    dump_author_embs()
    print('done', datetime.now() - start_time)