def cal_feature_idf():  # compute inverse document frequency
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')  # output directory for the IDF file
    counter = dd(int)  # defaultdict(int): missing keys default to int() == 0
    cnt = 0
    LMDB_NAME = 'pub_authors.feature'  # (pid-j, author_feature)
    lc = LMDBClient(LMDB_NAME)  # connect to the LMDB store
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():  # iterate over all LMDB records
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize the feature object; k[0] is the id, k[1] the author_feature
            if author_cnt % 10000 == 0:
                print(
                    author_cnt, features[0], counter.get(features[0])
                )  # features[0] is a coauthor name feature such as "__NAME__yanjun_zhang"
            author_cnt += 1  # count authors
            for f in features:
                cnt += 1  # total number of feature tokens
                counter[f] += 1  # occurrence count of feature f
    idf = {}
    for k in counter:  # compute the IDF of feature k
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(
        dict(idf), feature_dir,
        "feature_idf.pkl")  # dump {feature: idf} to feature_idf.pkl
Example #2
 def get(self, key):
     with self.db.begin() as txn:
         value = txn.get(key.encode())
     if value:
         return data_utils.deserialize_embedding(value)
     else:
         return None
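A brief usage sketch for get (the key is hypothetical): it returns the deserialized record, or None when the key is absent.

lc = LMDBClient('pub_authors.feature')
features = lc.get('pid-0')  # 'pid-0' is an illustrative key, not from the source
if features is not None:
    print(features[0])  # e.g. a name token such as '__NAME__yanjun_zhang'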
Example #3
def dump_author_embs():  # dump author embeddings (IDF-weighted averages of word vectors) to LMDB
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    start_time = datetime.now()  # presumably module-level in the source; defined here so the snippet runs
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(
        settings.GLOBAL_DATA_DIR,
        'feature_idf.pkl')  # load the {feature: idf} dict computed by cal_feature_idf
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)  # connect to the author-feature LMDB
    LMDB_NAME_EMB = "author_100.emb.weighted"  # (pid-j, x^-)
    lc_emb = LMDBClient(LMDB_NAME_EMB)  # connect to the author-embedding LMDB
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # iterate over all author features
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # decode the paper id
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize the author-feature object
            cur_emb = emb_model.project_embedding(
                features, idf)  # IDF-weighted average embedding x^-
            if cur_emb is not None:
                lc_emb.set(
                    pid_order, cur_emb
                )  # store (pid-j, x^-) in the author_100.emb.weighted LMDB
            else:
                print(pid_order)
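EmbeddingModel.project_embedding itself is not shown in this listing. A hedged sketch of what the docstring describes, an IDF-weighted average of word vectors, with every name in the sketch assumed rather than taken from the source:

import numpy as np

def project_embedding_sketch(features, idf, word_vectors, default_idf=1.0):
    # word_vectors: assumed mapping from feature token to its word vector
    vecs, weights = [], []
    for f in features:
        if f in word_vectors:
            vecs.append(word_vectors[f])
            weights.append(idf.get(f, default_idf))
    if not vecs:
        return None  # mirrors the `cur_emb is None` branch above
    return np.average(np.stack(vecs), axis=0, weights=weights)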
def cal_feature_idf():
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')
    counter = dd(int)
    cnt = 0
    LMDB_NAME = 'sci_all_data_feature'
    lc = LMDBClient(LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            # print(k[0])
            features = data_utils.deserialize_embedding(k[1])
            # print(features)
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1
    idf = {}
    for k in counter:
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")
Example #6
 def get_batch(self, keys):
     values = []
     with self.db.begin() as txn:
         for key in keys:
             value = txn.get(key.encode())
             if value:
                 values.append(data_utils.deserialize_embedding(value))
     return values
def idf_calc():
    df = defaultdict(int)

    lc = LMDBClient(LMDB_AUTHOR)
    with lc.db.begin() as txn:
        n_doc = txn.stat()['entries']
        for cnt, raw in enumerate(txn.cursor()):
            if (cnt + 1) % 10000 == 0:
                print('idf_calc %d' % (cnt + 1))
            author_feature = deserialize_embedding(raw[1])
            for word in author_feature:
                df[word] += 1

    idf_dict = defaultdict(float, [(word, math.log(n_doc / cnt))
                                   for word, cnt in df.items()])
    dump_data(idf_dict, WORD_IDF)
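Note the numerator: idf_calc divides by the LMDB entry count n_doc, i.e. the textbook idf(w) = log(n_doc / df(w)), whereas cal_feature_idf above divides by the total token count. A toy comparison (numbers illustrative):

import math

n_doc, total_tokens, df = 1000, 50000, 10
print(math.log(n_doc / df))         # ~4.61, document-count variant (idf_calc)
print(math.log(total_tokens / df))  # ~8.52, token-count variant (cal_feature_idf)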
Example #8
 def train(self, wf_name, size=EMB_DIM):
     data = []
     LMDB_NAME = 'pub_authors.feature'
     lc = LMDBClient(LMDB_NAME)
     author_cnt = 0
     with lc.db.begin() as txn:
         for k in txn.cursor():
             author_feature = data_utils.deserialize_embedding(k[1])
             if author_cnt % 10000 == 0:
                 print(author_cnt, author_feature[0])
             author_cnt += 1
             random.shuffle(author_feature)
             # print(author_feature)
             data.append(author_feature)
     self.model = Word2Vec(
         data, size=size, window=5, min_count=5, workers=20,
     )
     self.model.save(join(settings.EMB_DATA_DIR, '{}.emb'.format(wf_name)))
def dump_features_to_cache():
    '''
    generate author features by raw publication data and dump to cache
    '''
    lc = LMDBClient('sci_all_data')
    lm = LMDBClient('sci_all_data_feature')
    cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            cnt += 1
            pid = k[0].decode()
            paper = data_utils.deserialize_embedding(k[1])
            if len(paper["author"]) > 100:  # skip papers with an implausibly long author list
                print(cnt, pid, len(paper["author"]))
                continue
            features = extract_author_features(paper)
            if cnt % 10000 == 0:
                print('extracted features for %d0k papers' % (cnt // 10000))
            lm.set(pid, features)
    lm.db.close()
    lc.db.close()
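extract_author_features is not included in this listing. Judging by tokens seen elsewhere in it (e.g. '__NAME__yanjun_zhang' in the IDF snippet), author features are flat lists of string tokens; a purely hypothetical stand-in:

def extract_author_features_sketch(paper):
    # hypothetical: one '__NAME__' token per author, lower-cased and underscored
    return ['__NAME__' + a['name'].lower().replace(' ', '_')
            for a in paper.get('author', [])
            if a.get('name')]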
Example #10
 def train(self, wf_name, size=EMB_DIM):  # train the word-embedding model
     data = []
     LMDB_NAME = 'pub_authors.feature'  # database populated from author_feature.txt
     lc = LMDBClient(LMDB_NAME)  # connect to the (pid-j, author_feature) database
     author_cnt = 0
     with lc.db.begin() as txn:
         for k in txn.cursor():  # iterate via a cursor
             author_feature = data_utils.deserialize_embedding(
                 k[1])  # deserialize k[1] into the author-feature list
             if author_cnt % 10000 == 0:
                 print(author_cnt, author_feature[0])
             author_cnt += 1  # count authors
             random.shuffle(author_feature)  # shuffle the features within an author
             # print(author_feature)
             data.append(author_feature)  # append to the training corpus
     self.model = Word2Vec(
         data,
         size=size,
         window=5,
         min_count=5,
         workers=20,
     )  # corpus, vector dimensionality, window (max distance between current and target word), min word frequency, worker threads
     self.model.save(join(settings.EMB_DATA_DIR,
                          '{}.emb'.format(wf_name)))  # save the trained model, e.g. to aminer.emb
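A hedged usage sketch for the saved model (the path follows the save comment above; the query token is illustrative, and the size keyword implies gensim < 4.0):

from gensim.models import Word2Vec

model = Word2Vec.load('aminer.emb')  # path assumed from the save comment above
if '__NAME__yanjun_zhang' in model.wv:  # present only if it passed min_count
    print(model.wv['__NAME__yanjun_zhang'].shape)  # (EMB_DIM,)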
from collections import defaultdict as dd
import math

from utils.cache import LMDBClient
from utils import data_utils, settings  # module paths assumed to mirror utils.cache

dataset_names = [
    "whoiswho_new",
    "aminerv1",
    "aminerv2",
    "aminerv3",
    "citeseerx",
]

counter = dd(int)  # feature counts aggregated across all datasets
for dataset_name in dataset_names:
    overall_feature_dir = settings.get_overall_feature_dir()
    cnt = 0
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(dataset_name, LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            features = data_utils.deserialize_embedding(k[1])
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1
idf = {}
for k in counter:
    idf[k] = math.log(cnt / counter[k])
data_utils.dump_data(dict(idf), settings.get_overall_feature_dir(),
                     "feature_idf.pkl")
 def __iter__(self):
     with self.lc.db.begin() as txn:
         for k in txn.cursor():
             author_feature = data_utils.deserialize_embedding(k[1])
             random.shuffle(author_feature)
             yield author_feature
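Because __iter__ restarts the cursor on every call, the owning class is a restartable iterable, which lets gensim stream the corpus straight from LMDB instead of materializing it in a list as train does above. A minimal sketch of that pattern (class name and wiring assumed; relies on the same LMDBClient and data_utils context as the snippets above):

import random

class AuthorFeatureCorpus:
    def __init__(self, lc):
        self.lc = lc  # an LMDBClient, as above

    def __iter__(self):
        with self.lc.db.begin() as txn:
            for k in txn.cursor():
                author_feature = data_utils.deserialize_embedding(k[1])
                random.shuffle(author_feature)
                yield author_feature

# Word2Vec scans the corpus once for the vocabulary and again per epoch,
# which is why a restartable iterable (not a one-shot generator) is required:
# model = Word2Vec(AuthorFeatureCorpus(lc), size=100, window=5, min_count=5, workers=20)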