def cal_feature_idf():
    """
    Calculate word IDF (Inverse document frequency) using publication data.

    Walks every author-feature record in the 'pub_authors.feature' LMDB,
    tallies how often each feature token appears, then writes
    {feature: idf} to feature_idf.pkl under DATA_DIR/global.

    NOTE: `total` is the total number of feature-token occurrences, not the
    number of documents, so idf = log(total_occurrences / feature_count).
    """
    feature_dir = join(settings.DATA_DIR, 'global')  # output dir for the pickle
    occur = dd(int)   # feature token -> occurrence count (defaultdict(int))
    total = 0         # total feature-token occurrences over all authors
    LMDB_NAME = 'pub_authors.feature'  # stores (pid-j, author_feature)
    lc = LMDBClient(LMDB_NAME)
    n_authors = 0
    with lc.db.begin() as txn:
        for item in txn.cursor():
            # item[0] is the id, item[1] the serialized author feature list
            features = data_utils.deserialize_embedding(item[1])
            if n_authors % 10000 == 0:
                # features[0] looks like "__NAME__yanjun_zhang" (co-author name feature)
                print(n_authors, features[0], occur.get(features[0]))
            n_authors += 1
            for feat in features:
                total += 1
                occur[feat] += 1
    idf = {}
    for feat in occur:
        idf[feat] = math.log(total / occur[feat])
    # persist {feature: idf} to feature_idf.pkl
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")
def get(self, key):
    """Fetch *key* from the LMDB store; return the deserialized object, or None if absent."""
    with self.db.begin() as txn:
        raw = txn.get(key.encode())
        if not raw:
            return None
        return data_utils.deserialize_embedding(raw)
def dump_author_embs():
    """
    Dump author embeddings to LMDB.

    Each author embedding is the IDF-weighted average of the author's
    feature word vectors, computed by EmbeddingModel.project_embedding.
    Reads features from 'pub_authors.feature' and writes results to
    'author_100.emb.weighted'. Entries whose projection is None are skipped.
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    lc_feature = LMDBClient('pub_authors.feature')   # (pid-j, author_feature)
    lc_emb = LMDBClient("author_100.emb.weighted")   # (pid-j, weighted embedding)
    n_done = 0
    with lc_feature.db.begin() as txn:
        for entry in txn.cursor():
            if n_done % 1000 == 0:
                # start_time is a module-level timestamp -- presumably set at script start; confirm
                print('cnt', n_done, datetime.now() - start_time)
            n_done += 1
            pid_order = entry[0].decode('utf-8')
            features = data_utils.deserialize_embedding(entry[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                lc_emb.set(pid_order, cur_emb)
def dump_author_embs():
    """
    Dump author embeddings to LMDB; each embedding is the IDF-weighted
    average of the author's feature word vectors.

    Reads {feature: idf} produced by cal_feature_idf, projects every
    author-feature record through the embedding model, and stores
    (pid-j, x^-) pairs in 'author_100.emb.weighted'. Records whose
    projection is None are reported by printing their pid.
    """
    emb_model = EmbeddingModel.Instance()
    # {feature: idf} computed by cal_feature_idf
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'       # (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)      # author-feature LMDB
    LMDB_NAME_EMB = "author_100.emb.weighted"       # (pid-j, x^-)
    lc_emb = LMDBClient(LMDB_NAME_EMB)              # author-embedding LMDB
    n_done = 0
    with lc_feature.db.begin() as txn:
        for entry in txn.cursor():
            if n_done % 1000 == 0:
                # start_time is a module-level timestamp -- presumably set at script start; confirm
                print('cnt', n_done, datetime.now() - start_time)
            n_done += 1
            pid_order = entry[0].decode('utf-8')    # paper id
            features = data_utils.deserialize_embedding(entry[1])
            # IDF-weighted average embedding x^-
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                lc_emb.set(pid_order, cur_emb)
            else:
                print(pid_order)
def cal_feature_idf():
    """
    Calculate word IDF (Inverse document frequency) using publication data
    from the 'sci_all_data_feature' LMDB, writing {feature: idf} to
    feature_idf.pkl under DATA_DIR/global.

    NOTE: the denominator base is the total number of feature-token
    occurrences, not the number of documents.
    """
    feature_dir = join(settings.DATA_DIR, 'global')
    occur = dd(int)   # feature token -> occurrence count
    total = 0         # total feature-token occurrences
    lc = LMDBClient('sci_all_data_feature')
    n_authors = 0
    with lc.db.begin() as txn:
        for item in txn.cursor():
            features = data_utils.deserialize_embedding(item[1])
            if n_authors % 10000 == 0:
                print(n_authors, features[0], occur.get(features[0]))
            n_authors += 1
            for feat in features:
                total += 1
                occur[feat] += 1
    idf = {}
    for feat in occur:
        idf[feat] = math.log(total / occur[feat])
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")
def get_batch(self, keys):
    """
    Fetch several keys in one transaction.

    Returns the deserialized values for the keys that exist; missing keys
    are silently skipped, so the result may be shorter than *keys*.
    """
    with self.db.begin() as txn:
        # lmdb returns copied bytes by default, so deserializing after the
        # transaction closes is safe
        raw_values = [txn.get(key.encode()) for key in keys]
    return [data_utils.deserialize_embedding(raw) for raw in raw_values if raw]
def idf_calc():
    """
    Compute per-word IDF over every author-feature record in the
    LMDB_AUTHOR store and dump the result to WORD_IDF.

    idf(word) = log(n_doc / df(word)), where n_doc is the number of LMDB
    entries and df(word) the number of occurrences counted below.
    """
    doc_freq = defaultdict(int)  # word -> document frequency
    lc = LMDBClient(LMDB_AUTHOR)
    with lc.db.begin() as txn:
        n_doc = txn.stat()['entries']  # total number of records
        for i, raw in enumerate(txn.cursor()):
            if (i + 1) % 10000 == 0:
                print('idf_calc %d' % (i + 1))
            for word in deserialize_embedding(raw[1]):
                doc_freq[word] += 1
    idf_dict = defaultdict(float)
    for word, freq in doc_freq.items():
        idf_dict[word] = math.log(n_doc / freq)
    dump_data(idf_dict, WORD_IDF)
def train(self, wf_name, size=EMB_DIM):
    """
    Train a Word2Vec model on shuffled author-feature sequences and save it.

    Loads every author-feature list from 'pub_authors.feature', shuffles
    each list in place, trains Word2Vec on the corpus, and saves the model
    as '<wf_name>.emb' under EMB_DATA_DIR.
    """
    corpus = []
    lc = LMDBClient('pub_authors.feature')
    n_authors = 0
    with lc.db.begin() as txn:
        for entry in txn.cursor():
            author_feature = data_utils.deserialize_embedding(entry[1])
            if n_authors % 10000 == 0:
                print(n_authors, author_feature[0])
            n_authors += 1
            # shuffle so word order carries no signal
            random.shuffle(author_feature)
            corpus.append(author_feature)
    # size: embedding dim, window: max distance, min_count: frequency cutoff,
    # workers: training parallelism
    self.model = Word2Vec(corpus, size=size, window=5, min_count=5, workers=20)
    self.model.save(join(settings.EMB_DATA_DIR, '{}.emb'.format(wf_name)))
def dump_features_to_cache():
    """
    Generate author features from raw publication data and dump them to cache.

    Reads papers from the 'sci_all_data' LMDB, extracts author features for
    each, and stores them in 'sci_all_data_feature'. Papers with more than
    100 authors are reported and skipped.
    """
    src = LMDBClient('sci_all_data')           # raw papers
    dst = LMDBClient('sci_all_data_feature')   # extracted features
    n_seen = 0
    with src.db.begin() as txn:
        for entry in txn.cursor():
            n_seen += 1
            pid = entry[0].decode()
            paper = data_utils.deserialize_embedding(entry[1])
            if len(paper["author"]) > 100:
                # implausibly large author list -- report and skip
                print(n_seen, pid, len(paper["author"]))
                continue
            features = extract_author_features(paper)
            if n_seen % 10000 == 0:
                print('已经提取:%d 万篇论文'%(n_seen/10000))
            dst.set(pid, features)
    dst.db.close()
    src.db.close()
def train(self, wf_name, size=EMB_DIM):
    """
    Train the word-embedding model.

    Iterates the 'pub_authors.feature' LMDB (built from author_feature.txt),
    shuffles every author-feature list, trains Word2Vec on the collected
    corpus, and saves the model to EMB_DATA_DIR/<wf_name>.emb.
    """
    data = []
    LMDB_NAME = 'pub_authors.feature'  # (pid-j, author_feature)
    lc = LMDBClient(LMDB_NAME)
    n_authors = 0  # number of author records processed
    with lc.db.begin() as txn:
        for entry in txn.cursor():
            # deserialize the author-feature list from entry[1]
            author_feature = data_utils.deserialize_embedding(entry[1])
            if n_authors % 10000 == 0:
                print(n_authors, author_feature[0])
            n_authors += 1
            random.shuffle(author_feature)  # remove word-order signal
            data.append(author_feature)
    self.model = Word2Vec(
        data,
        size=size,        # embedding dimensionality
        window=5,         # max distance between current and predicted word
        min_count=5,      # ignore words rarer than this
        workers=20,       # training threads
    )
    # persist the trained model (e.g. aminer.emb)
    self.model.save(join(settings.EMB_DATA_DIR, '{}.emb'.format(wf_name)))
from utils.cache import LMDBClient

# Datasets whose author features are pooled into a single overall IDF table.
dataset_names = [
    "whoiswho_new",
    "aminerv1",
    "aminerv2",
    "aminerv3",
    "citeseerx",
]

counter = dd(int)  # feature -> occurrence count across ALL datasets
# BUGFIX: the total occurrence count must accumulate across all datasets,
# matching `counter`; previously it was reset to 0 inside the per-dataset
# loop, so the final IDF mixed last-dataset totals with all-dataset counts.
cnt = 0
LMDB_NAME = 'pub_authors.feature'
for dataset_name in dataset_names:
    lc = LMDBClient(dataset_name, LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            features = data_utils.deserialize_embedding(k[1])
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1

# idf = log(total_occurrences / feature_count), pooled over every dataset
idf = {}
for k in counter:
    idf[k] = math.log(cnt / counter[k])
data_utils.dump_data(dict(idf), settings.get_overall_feature_dir(), "feature_idf.pkl")
def __iter__(self):
    """Yield every cached author-feature list, shuffled in place, one per record."""
    with self.lc.db.begin() as txn:
        for entry in txn.cursor():
            feature_words = data_utils.deserialize_embedding(entry[1])
            random.shuffle(feature_words)
            yield feature_words