def dump_feature_id_to_file():
    """
    transform each publication into sets of author and word feature IDs (with IDF weights) and dump them to a file
    """
    model = EmbeddingModel.Instance()
    author_emb_model = model.load_author_name_emb()
    author_emb_file = "author_emb.array"
    word_emb_model = model.load_word_name_emb()
    word_emb_file = "word_emb.array"
    dump_emb_array(author_emb_model, author_emb_file)
    dump_emb_array(word_emb_model, word_emb_file)

    features = data_utils.load_data('Essential_Embeddings/', "pub.features")
    author_idfs = data_utils.load_data('Essential_Embeddings/global/', 'author_feature_idf.pkl')
    word_idfs = data_utils.load_data('Essential_Embeddings/global/', 'word_feature_idf.pkl')
    index = 0
    feature_dict = {}
    for pub_index in range(len(features)):
        pub_features = features[pub_index]
        if pub_features is None:
            continue
        for author_index in range(len(pub_features)):
            aid, author_features, word_features = pub_features[author_index]
            if index % 100000 == 0:
                print(index, author_features, word_features)
            index += 1
            author_id_list, author_idf_list = get_feature_ids_idfs_for_one_pub(author_features, author_emb_model, author_idfs)
            word_id_list, word_idf_list = get_feature_ids_idfs_for_one_pub(word_features, word_emb_model, word_idfs)

            if author_id_list is not None or word_id_list is not None:
                feature_dict[aid] = (author_id_list, author_idf_list, word_id_list, word_idf_list)
    data_utils.dump_data(feature_dict, 'Essential_Embeddings/emb/', "pub_feature.ids")
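For a quick sanity check, the dumped mapping can be read back with the same helpers. The sketch below is not part of the original example; it assumes the project's data_utils.load_data mirrors dump_data, as it does elsewhere on this page.

# Hedged sketch: reload the dumped {aid: (author_ids, author_idfs, word_ids, word_idfs)}
# mapping and inspect one entry; assumes data_utils.load_data mirrors dump_data.
feature_dict = data_utils.load_data('Essential_Embeddings/emb/', "pub_feature.ids")
some_aid = next(iter(feature_dict))
author_ids, author_idfs, word_ids, word_idfs = feature_dict[some_aid]
print(some_aid, len(author_ids or []), len(word_ids or []))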
Example #2
def cal_feature_idf():  # compute inverse document frequency
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')  # feature directory
    counter = dd(int)  # like a dict, but missing keys default to int(0)
    cnt = 0
    LMDB_NAME = 'pub_authors.feature'  # (pid-j, author_feature)
    lc = LMDBClient(LMDB_NAME)  # connect to LMDB
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():  # iterate over the LMDB entries
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize the feature object; k[0] is the id, k[1] is author_feature
            if author_cnt % 10000 == 0:
                print(
                    author_cnt, features[0], counter.get(features[0])
                )  # features[0] is a co-author name feature such as "__NAME__yanjun_zhang"
            author_cnt += 1  # count authors
            for f in features:
                cnt += 1  # total number of feature occurrences
                counter[f] += 1  # occurrences of feature f
    idf = {}
    for k in counter:  # compute the idf for feature k
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(
        dict(idf), feature_dir,
        "feature_idf.pkl")  # write {feature: idf} to feature_idf.pkl
def cal_feature_idf():
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')
    counter = dd(int)
    cnt = 0
    LMDB_NAME = 'sci_all_data_feature'
    lc = LMDBClient(LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
#            print(k[0])
            features = data_utils.deserialize_embedding(k[1])
#            print(features)
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1
    idf = {}
    for k in counter:
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")
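As implemented in both variants above, the IDF weight is the log of the total number of feature occurrences (cnt) over each feature's own count. A tiny self-contained illustration with made-up counts:

import math

# Toy illustration of the IDF formula used above; the feature counts are made up.
counter = {"__NAME__yanjun_zhang": 3, "geophysics": 1}
cnt = sum(counter.values())  # total feature occurrences (4)
idf = {k: math.log(cnt / counter[k]) for k in counter}
print(idf)  # the rarer feature gets the larger weight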
def dump_emb_array(emb_model, output_name):
    global _emb_model
    _emb_model = emb_model
    # transform the feature embeddings into (id, embedding) pairs and store them sorted by id
    res = multithread_utils.processed_by_multi_thread(get_feature_index, range(len(_emb_model.wv.vocab)))
    sorted_embeddings = sorted(res, key=lambda x: x[0])
    word_embeddings = list(list(zip(*sorted_embeddings))[1])
    data_utils.dump_data(np.array(word_embeddings), 'Essential_Embeddings/emb/', output_name)
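get_feature_index is not shown on this page; the following single-threaded sketch of the same transformation is an assumption, based on a gensim-style model that exposes wv.vocab (word -> Vocab with an .index field) and wv[word]:

import numpy as np

# Hedged single-threaded sketch of what dump_emb_array computes; assumes a
# gensim-style embedding model, not the project's multithreaded helper.
def emb_model_to_array(emb_model):
    vocab = emb_model.wv.vocab
    words = sorted(vocab, key=lambda w: vocab[w].index)  # rows ordered by vocabulary index
    return np.array([emb_model.wv[w] for w in words])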
def dump_pub_features_to_file():
    """
    generate author features by raw publication data and dump to files
    author features are defined by his/her paper attributes excluding the author's name
    """
    global _pubs_dict

    # Load publication features
    _pubs_dict = data_utils.load_json('./OAG_WhoIsWho_data', 'your_pub_file_name')
    res = multithread_utils.processed_by_multi_thread(get_pub_feature, range(len(_pubs_dict)))
    data_utils.dump_data(res, "Essential_Embeddings/", "pub.features")
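get_pub_feature is likewise not shown here. Judging by how "pub.features" is consumed in the IDF example further down (each entry is either None or a list of (aid, author_features, word_features) tuples), a hypothetical skeleton could look like this, with the actual feature extraction left as a stub:

# Hypothetical skeleton of get_pub_feature, inferred from how "pub.features" is
# unpacked later on this page; the aid format and feature extraction are assumptions.
def get_pub_feature(pub_index):
    pub = _pubs_dict[pub_index]  # assumes _pubs_dict is indexable by position
    if not pub.get('authors'):
        return None
    result = []
    for j, author in enumerate(pub['authors']):
        aid = '{}-{}'.format(pub['id'], j)  # e.g. "pid-j", as in the LMDB key comment above
        author_features = []  # co-author name features, e.g. "__NAME__yanjun_zhang"
        word_features = []    # title/keyword word features
        result.append((aid, author_features, word_features))
    return result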
Example #6
 def prepare_network_input(self, role, fold):
     """
     prepare cnn model input
     :param role: 'train' or 'test'
     :param fold: cross validation fold
     :return: constructed matrices
     """
     pos_pairs = self.paper_data_utils.construct_positive_paper_pairs(
         role, fold)
     logger.info('positive paper pairs built')
     neg_pairs = self.paper_data_utils.load_train_neg_paper_pairs(fold)
     logger.info('negative paper pairs loaded')
     pos_title_matrices, pos_author_matrices = self.pairs2multiple_matrices(
         pos_pairs)
     data_utils.dump_data(pos_author_matrices, self.matrices_dir,
                          'pos_author_matrices_{}.pkl'.format(fold))
     data_utils.dump_data(pos_title_matrices, self.matrices_dir,
                          'pos_title_matrices_{}.pkl'.format(fold))
     neg_title_matrices, neg_author_matrices = self.pairs2multiple_matrices(
         neg_pairs)
     data_utils.dump_data(neg_author_matrices, self.matrices_dir,
                          'neg_author_matrices_{}.pkl'.format(fold))
     data_utils.dump_data(neg_title_matrices, self.matrices_dir,
                          'neg_title_matrices_{}.pkl'.format(fold))
     return pos_title_matrices, pos_author_matrices, neg_title_matrices, neg_author_matrices
def idf_calc():
    df = defaultdict(int)

    lc = LMDBClient(LMDB_AUTHOR)
    with lc.db.begin() as txn:
        n_doc = txn.stat()['entries']
        for cnt, raw in enumerate(txn.cursor()):
            if (cnt + 1) % 10000 == 0:
                print('idf_calc %d' % (cnt + 1))
            author_feature = deserialize_embedding(raw[1])
            for word in author_feature:
                df[word] += 1

    idf_dict = defaultdict(float, [(word, math.log(n_doc / cnt))
                                   for word, cnt in df.items()])
    dump_data(idf_dict, WORD_IDF)
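Unlike cal_feature_idf above, idf_calc uses the number of LMDB entries (n_doc, i.e. documents) rather than the total feature-occurrence count as the numerator, which matches the textbook IDF definition. A toy check with made-up numbers:

import math

# Toy check of the document-frequency IDF used in idf_calc (numbers are made up).
n_doc, df_word = 1000, 10
print(math.log(n_doc / df_word))  # ~4.6: a word appearing in few documents gets a high weight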
def cal_feature_idf():
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    features = data_utils.load_data('Essential_Embeddings/', "pub.features")
    feature_dir = join('Essential_Embeddings/', 'global')
    index = 0
    author_counter = dd(int)
    author_cnt = 0
    word_counter = dd(int)
    word_cnt = 0
    none_count = 0
    for pub_index in range(len(features)):
        pub_features = features[pub_index]
        # print(pub_features)
        if pub_features is None:
            none_count += 1
            continue
        for author_index in range(len(pub_features)):
            aid, author_features, word_features = pub_features[author_index]

            if index % 100000 == 0:
                print(index, aid)
            index += 1
            
            for af in author_features:
                author_cnt += 1
                author_counter[af] += 1

            for wf in word_features:
                word_cnt += 1
                word_counter[wf] += 1

    author_idf = {}
    for k in author_counter:
        author_idf[k] = math.log(author_cnt / author_counter[k])

    word_idf = {}
    for k in word_counter:
        word_idf[k] = math.log(word_cnt / word_counter[k])

    data_utils.dump_data(dict(author_idf), feature_dir, "author_feature_idf.pkl")
    data_utils.dump_data(dict(word_idf), feature_dir, "word_feature_idf.pkl")
    print("None count: ", none_count)
Example #9
 def dump_triplets(self, role='train'):
     triplets = self.gen_triplets_mp(role)
     if role == 'train':
         out_dir = join(settings.OUT_DIR,
                        'triplets-{}'.format(self.save_size))
     else:
         out_dir = join(settings.OUT_DIR, 'test-triplets')
     os.makedirs(out_dir, exist_ok=True)
     anchor_embs = []
     pos_embs = []
     neg_embs = []
     f_idx = 0
     for i, t in enumerate(triplets):
         if i % 100 == 0:
             print(i, datetime.now() - start_time)
         emb_anc, emb_pos, emb_neg = t[0], t[1], t[2]
         anchor_embs.append(emb_anc)
         pos_embs.append(emb_pos)
         neg_embs.append(emb_neg)
         if len(anchor_embs) == self.batch_size:
             data_utils.dump_data(
                 anchor_embs, out_dir,
                 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
             data_utils.dump_data(pos_embs, out_dir,
                                  'pos_embs_{}_{}.pkl'.format(role, f_idx))
             data_utils.dump_data(neg_embs, out_dir,
                                  'neg_embs_{}_{}.pkl'.format(role, f_idx))
             f_idx += 1
             anchor_embs = []
             pos_embs = []
             neg_embs = []
     if anchor_embs:
         data_utils.dump_data(anchor_embs, out_dir,
                              'anchor_embs_{}_{}.pkl'.format(role, f_idx))
         data_utils.dump_data(pos_embs, out_dir,
                              'pos_embs_{}_{}.pkl'.format(role, f_idx))
         data_utils.dump_data(neg_embs, out_dir,
                              'neg_embs_{}_{}.pkl'.format(role, f_idx))
     print('dumped')
Example #10
 def prepare_LSH_parameters(self, role, fold):
     proj = np.random.normal(size=(self.vectors_dim, self.title_bit))
     fname = 'LSH_proj_matrix_{}_{}.pkl'.format(role, fold)
     if not self.without_inner_results:
         data_utils.dump_data(proj, self.para_dir, fname)
     return proj
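The matrix generated above is the random-projection half of locality-sensitive hashing. The sketch below is not taken from this codebase; it only illustrates how such a matrix is typically applied to turn vectors into binary signatures:

import numpy as np

# Minimal random-hyperplane LSH sketch (not from this codebase): project each
# vector onto the random matrix and keep the sign pattern as its bit signature.
def lsh_signature(vectors, proj):
    # vectors: (n, vectors_dim), proj: (vectors_dim, title_bit)
    return (vectors @ proj > 0).astype(np.uint8)

proj = np.random.normal(size=(64, 16))  # made-up dimensions
sig = lsh_signature(np.random.randn(5, 64), proj)
print(sig.shape)  # (5, 16) binary codes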
Example #11
 def dump_triplets(self, role='train'):
     triplets = self.gen_triplets_mp(role)  # build the triplets of embeddings (x^-) using multiprocessing
     if role == 'train':  # choose the output directory
         out_dir = join(settings.OUT_DIR, 'triplets-{}'.format(self.save_size))
     else:
         out_dir = join(settings.OUT_DIR, 'test-triplets')
     os.makedirs(out_dir, exist_ok=True)  # create the output directory
     anchor_embs = []
     pos_embs = []
     neg_embs = []
     f_idx = 0
     for i, t in enumerate(triplets):  # iterate over the triplets t
         if i % 100 == 0:
             print(i, datetime.now() - start_time)
         emb_anc, emb_pos, emb_neg = t[0], t[1], t[2]  # the anchor/positive/negative embedding vectors x^-
         anchor_embs.append(emb_anc)  # collect them into the current batch
         pos_embs.append(emb_pos)
         neg_embs.append(emb_neg)
         if len(anchor_embs) == self.batch_size:  # a full batch has been collected; write it to files
             data_utils.dump_data(anchor_embs, out_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))  # a batch of x^-
             data_utils.dump_data(pos_embs, out_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
             data_utils.dump_data(neg_embs, out_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
             f_idx += 1  # batch counter
             anchor_embs = []  # reset the buffers
             pos_embs = []
             neg_embs = []
     if anchor_embs:  # flush the final partial batch, if any
         data_utils.dump_data(anchor_embs, out_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
         data_utils.dump_data(pos_embs, out_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
         data_utils.dump_data(neg_embs, out_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
     print('dumped')
import math
from collections import defaultdict as dd

from utils.cache import LMDBClient

dataset_names = [
    "whoiswho_new",
    "aminerv1",
    "aminerv2",
    "aminerv3",
    "citeseerx",
]

counter = dd(int)
cnt = 0  # total feature occurrences, accumulated across all datasets
for dataset_name in dataset_names:
    overall_feature_dir = settings.get_overall_feature_dir()
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(dataset_name, LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            features = data_utils.deserialize_embedding(k[1])
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1
idf = {}
for k in counter:
    idf[k] = math.log(cnt / counter[k])
data_utils.dump_data(dict(idf), settings.get_overall_feature_dir(),
                     "feature_idf.pkl")
def pid2index(rfpath, wfpath):
    pubs = pd.read_parquet(rfpath)
    index = {}
    for name, pub in pubs.groupby('name'):
        index[name] = pub.loc[:, 'id'].values
    dump_data(index, wfpath=wfpath)
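A small illustration (with made-up data) of the name -> paper-id index that pid2index builds via the pandas groupby:

import pandas as pd

# Made-up data showing the structure of the index built by pid2index.
pubs = pd.DataFrame({'name': ['li_ming', 'li_ming', 'j_smith'],
                     'id': ['p1', 'p2', 'p3']})
index = {name: pub.loc[:, 'id'].values for name, pub in pubs.groupby('name')}
print(index['li_ming'])  # array(['p1', 'p2'], dtype=object)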