def dump_feature_id_to_file():
    """
    Transform each publication into a set of author and word feature IDs and dump them to file.
    """
    model = EmbeddingModel.Instance()
    author_emb_model = model.load_author_name_emb()
    author_emb_file = "author_emb.array"
    word_emb_model = model.load_word_name_emb()
    word_emb_file = "word_emb.array"
    dump_emb_array(author_emb_model, author_emb_file)
    dump_emb_array(word_emb_model, word_emb_file)

    features = data_utils.load_data('Essential_Embeddings/', "pub.features")
    author_idfs = data_utils.load_data('Essential_Embeddings/global/', 'author_feature_idf.pkl')
    word_idfs = data_utils.load_data('Essential_Embeddings/global/', 'word_feature_idf.pkl')

    index = 0
    feature_dict = {}
    for pub_index in range(len(features)):
        pub_features = features[pub_index]
        if pub_features is None:
            continue
        for author_index in range(len(pub_features)):
            aid, author_features, word_features = pub_features[author_index]
            if index % 100000 == 0:
                print(index, author_features, word_features)
            index += 1
            author_id_list, author_idf_list = get_feature_ids_idfs_for_one_pub(
                author_features, author_emb_model, author_idfs)
            word_id_list, word_idf_list = get_feature_ids_idfs_for_one_pub(
                word_features, word_emb_model, word_idfs)
            if author_id_list is not None or word_id_list is not None:
                feature_dict[aid] = (author_id_list, author_idf_list, word_id_list, word_idf_list)
    data_utils.dump_data(feature_dict, 'Essential_Embeddings/emb/', "pub_feature.ids")

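# A minimal sketch (not from the original source) of what the
# get_feature_ids_idfs_for_one_pub() helper used above might look like,
# assuming a gensim (<4.0) embedding model whose wv.vocab maps a token to an
# object exposing .index; the names and the default IDF of 1.0 are assumptions.
def get_feature_ids_idfs_for_one_pub(feature_list, emb_model, idfs):
    if feature_list is None:
        return None, None
    id_list, idf_list = [], []
    for f in feature_list:
        if f in emb_model.wv.vocab:  # keep only in-vocabulary features
            id_list.append(emb_model.wv.vocab[f].index)
            idf_list.append(idfs.get(f, 1.0))
    if not id_list:
        return None, None
    return id_list, idf_list
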
def cal_feature_idf():  # compute inverse document frequency
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')  # feature output directory
    counter = dd(int)  # defaultdict: unlike {}, a missing key returns int(0)
    cnt = 0
    LMDB_NAME = 'pub_authors.feature'  # (pid-j, author_feature)
    lc = LMDBClient(LMDB_NAME)  # connect to LMDB
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():  # iterate over the LMDB entries
            # deserialize to get the feature object; k[0] is the id, k[1] is the author_feature
            features = data_utils.deserialize_embedding(k[1])
            if author_cnt % 10000 == 0:
                # features[0] is a coauthor name feature such as "__NAME__yanjun_zhang"
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1  # author counter
            for f in features:
                cnt += 1  # total count of feature occurrences
                counter[f] += 1  # occurrence count of feature f
    idf = {}
    for k in counter:  # compute the IDF of feature k
        idf[k] = math.log(cnt / counter[k])
    # write {feature: idf} to feature_idf.pkl
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")

def cal_feature_idf():
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')
    counter = dd(int)
    cnt = 0
    LMDB_NAME = 'sci_all_data_feature'
    lc = LMDBClient(LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            # print(k[0])
            features = data_utils.deserialize_embedding(k[1])
            # print(features)
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1
    idf = {}
    for k in counter:
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")

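# Toy illustration (not part of the source) of the formula used above: with
# cnt = 10 total feature occurrences and counter['__NAME__jane_doe'] = 2,
# idf['__NAME__jane_doe'] = math.log(10 / 2) ≈ 1.609.
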
def dump_emb_array(emb_model, output_name):
    global _emb_model
    _emb_model = emb_model
    # transform the feature embeddings from raw embeddings to (id, embedding) pairs
    res = multithread_utils.processed_by_multi_thread(get_feature_index,
                                                      range(len(_emb_model.wv.vocab)))
    sorted_embeddings = sorted(res, key=lambda x: x[0])
    word_embeddings = list(list(zip(*sorted_embeddings))[1])
    data_utils.dump_data(np.array(word_embeddings), 'Essential_Embeddings/emb/', output_name)

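# A hedged sketch (assumption, not from the source) of the get_feature_index()
# worker used above: it maps a vocabulary position to a (position, vector)
# pair so that the multithreaded results can be re-sorted into vocabulary
# order. Assumes a gensim (<4.0) model exposing wv.index2word.
def get_feature_index(i):
    word = _emb_model.wv.index2word[i]
    return i, _emb_model.wv[word]
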
def dump_pub_features_to_file():
    """
    Generate author features from raw publication data and dump them to file.
    Author features are defined by his/her paper attributes, excluding the author's own name.
    """
    global _pubs_dict
    # load the raw publications
    _pubs_dict = data_utils.load_json('./OAG_WhoIsWho_data', 'your_pub_file_name')
    res = multithread_utils.processed_by_multi_thread(get_pub_feature, range(len(_pubs_dict)))
    data_utils.dump_data(res, "Essential_Embeddings/", "pub.features")

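# A rough sketch (assumption, not from the source) of the get_pub_feature()
# worker used above. Based on how pub.features is consumed elsewhere, it is
# expected to return one (aid, author_features, word_features) tuple per
# author of the i-th publication; the field names 'authors', 'name' and
# 'title' are hypothetical.
def get_pub_feature(i):
    pid = list(_pubs_dict.keys())[i]  # i-th publication id (simple but not efficient)
    pub = _pubs_dict[pid]
    if not pub.get('authors'):
        return None
    results = []
    for j, author in enumerate(pub['authors']):
        aid = '{}-{}'.format(pid, j)
        # coauthor-name features, excluding the author's own name
        author_features = ['__NAME__' + a['name'].lower().replace(' ', '_')
                           for k, a in enumerate(pub['authors']) if k != j]
        # word features taken from the title
        word_features = pub.get('title', '').lower().split()
        results.append((aid, author_features, word_features))
    return results
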
def prepare_network_input(self, role, fold):
    """
    prepare cnn model input
    :param role: 'train' or 'test'
    :param fold: cross-validation fold
    :return: constructed matrices
    """
    pos_pairs = self.paper_data_utils.construct_positive_paper_pairs(role, fold)
    logger.info('positive paper pairs built')
    neg_pairs = self.paper_data_utils.load_train_neg_paper_pairs(fold)
    logger.info('negative paper pairs loaded')
    pos_title_matrices, pos_author_matrices = self.pairs2multiple_matrices(pos_pairs)
    data_utils.dump_data(pos_author_matrices, self.matrices_dir,
                         'pos_author_matrices_{}.pkl'.format(fold))
    data_utils.dump_data(pos_title_matrices, self.matrices_dir,
                         'pos_title_matrices_{}.pkl'.format(fold))
    neg_title_matrices, neg_author_matrices = self.pairs2multiple_matrices(neg_pairs)
    data_utils.dump_data(neg_author_matrices, self.matrices_dir,
                         'neg_author_matrices_{}.pkl'.format(fold))
    data_utils.dump_data(neg_title_matrices, self.matrices_dir,
                         'neg_title_matrices_{}.pkl'.format(fold))
    return pos_title_matrices, pos_author_matrices, neg_title_matrices, neg_author_matrices

def idf_calc():
    df = defaultdict(int)  # document frequency of each word
    lc = LMDBClient(LMDB_AUTHOR)
    with lc.db.begin() as txn:
        n_doc = txn.stat()['entries']  # number of stored author-feature records
        for cnt, raw in enumerate(txn.cursor()):
            if (cnt + 1) % 10000 == 0:
                print('idf_calc %d' % (cnt + 1))
            author_feature = deserialize_embedding(raw[1])
            for word in author_feature:
                df[word] += 1
    # idf(word) = log(n_doc / df(word))
    idf_dict = defaultdict(float,
                           [(word, math.log(n_doc / freq)) for word, freq in df.items()])
    dump_data(idf_dict, WORD_IDF)

def cal_feature_idf():
    """
    calculate author and word feature IDF (Inverse document frequency) using publication data
    """
    features = data_utils.load_data('Essential_Embeddings/', "pub.features")
    feature_dir = join('Essential_Embeddings/', 'global')
    index = 0
    author_counter = dd(int)
    author_cnt = 0
    word_counter = dd(int)
    word_cnt = 0
    none_count = 0
    for pub_index in range(len(features)):
        pub_features = features[pub_index]
        # print(pub_features)
        if pub_features is None:
            none_count += 1
            continue
        for author_index in range(len(pub_features)):
            aid, author_features, word_features = pub_features[author_index]
            if index % 100000 == 0:
                print(index, aid)
            index += 1
            for af in author_features:
                author_cnt += 1
                author_counter[af] += 1
            for wf in word_features:
                word_cnt += 1
                word_counter[wf] += 1
    author_idf = {}
    for k in author_counter:
        author_idf[k] = math.log(author_cnt / author_counter[k])
    word_idf = {}
    for k in word_counter:
        word_idf[k] = math.log(word_cnt / word_counter[k])
    data_utils.dump_data(dict(author_idf), feature_dir, "author_feature_idf.pkl")
    data_utils.dump_data(dict(word_idf), feature_dir, "word_feature_idf.pkl")
    print("None count: ", none_count)

def dump_triplets(self, role='train'):
    triplets = self.gen_triplets_mp(role)
    if role == 'train':
        out_dir = join(settings.OUT_DIR, 'triplets-{}'.format(self.save_size))
    else:
        out_dir = join(settings.OUT_DIR, 'test-triplets')
    os.makedirs(out_dir, exist_ok=True)
    anchor_embs = []
    pos_embs = []
    neg_embs = []
    f_idx = 0
    for i, t in enumerate(triplets):
        if i % 100 == 0:
            print(i, datetime.now() - start_time)
        emb_anc, emb_pos, emb_neg = t[0], t[1], t[2]
        anchor_embs.append(emb_anc)
        pos_embs.append(emb_pos)
        neg_embs.append(emb_neg)
        if len(anchor_embs) == self.batch_size:
            data_utils.dump_data(anchor_embs, out_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
            data_utils.dump_data(pos_embs, out_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
            data_utils.dump_data(neg_embs, out_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
            f_idx += 1
            anchor_embs = []
            pos_embs = []
            neg_embs = []
    if anchor_embs:
        data_utils.dump_data(anchor_embs, out_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
        data_utils.dump_data(pos_embs, out_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
        data_utils.dump_data(neg_embs, out_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
    print('dumped')

def prepare_LSH_parameters(self, role, fold):
    proj = np.random.normal(size=(self.vectors_dim, self.title_bit))
    fname = 'LSH_proj_matrix_{}_{}.pkl'.format(role, fold)
    if not self.without_inner_results:
        data_utils.dump_data(proj, self.para_dir, fname)
    return proj

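# Hedged usage note (not from the source): a random Gaussian projection like
# the one above is typically used for locality-sensitive hashing by taking the
# sign of the projected vector, e.g. bits = (vec @ proj) > 0, which yields
# self.title_bit hash bits per title vector.
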
def dump_triplets(self, role='train'):
    # build the triplet set of embedding representations (x^-), using multiprocessing
    triplets = self.gen_triplets_mp(role)
    if role == 'train':  # choose the output directory
        out_dir = join(settings.OUT_DIR, 'triplets-{}'.format(self.save_size))
    else:
        out_dir = join(settings.OUT_DIR, 'test-triplets')
    os.makedirs(out_dir, exist_ok=True)  # create the directory
    anchor_embs = []
    pos_embs = []
    neg_embs = []
    f_idx = 0
    for i, t in enumerate(triplets):  # enumerate the triplets t
        if i % 100 == 0:
            print(i, datetime.now() - start_time)
        emb_anc, emb_pos, emb_neg = t[0], t[1], t[2]  # unpack the embedding vectors x^-
        anchor_embs.append(emb_anc)  # append each to its corresponding list
        pos_embs.append(emb_pos)
        neg_embs.append(emb_neg)
        if len(anchor_embs) == self.batch_size:  # the batch is full, write it to files
            data_utils.dump_data(anchor_embs, out_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
            data_utils.dump_data(pos_embs, out_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
            data_utils.dump_data(neg_embs, out_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
            f_idx += 1  # batch counter
            anchor_embs = []  # reset the buffers
            pos_embs = []
            neg_embs = []
    if anchor_embs:  # flush the last partial batch, if any
        data_utils.dump_data(anchor_embs, out_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
        data_utils.dump_data(pos_embs, out_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
        data_utils.dump_data(neg_embs, out_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
    print('dumped')

from utils.cache import LMDBClient

dataset_names = [
    "whoiswho_new",
    "aminerv1",
    "aminerv2",
    "aminerv3",
    "citeseerx",
]

# counter and cnt accumulate feature statistics over all datasets so that a
# single combined IDF table can be computed at the end
counter = dd(int)
cnt = 0
for dataset_name in dataset_names:
    overall_feature_dir = settings.get_overall_feature_dir()
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(dataset_name, LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            features = data_utils.deserialize_embedding(k[1])
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1

idf = {}
for k in counter:
    idf[k] = math.log(cnt / counter[k])
data_utils.dump_data(dict(idf), settings.get_overall_feature_dir(), "feature_idf.pkl")

def pid2index(rfpath, wfpath):
    # build a {author name: array of publication ids} index from a parquet file
    pubs = pd.read_parquet(rfpath)
    index = {}
    for name, pub in pubs.groupby('name'):
        index[name] = pub.loc[:, 'id'].values
    dump_data(index, wfpath=wfpath)

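# Example invocation (hypothetical file names), assuming the parquet file has
# 'name' and 'id' columns:
# pid2index('data/pubs.parquet', 'data/name_to_pids.pkl')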