Esempio n. 1
0
def dump_author_embs():
    """
    Dump author embeddings into LMDB.

    Each author embedding is the IDF-weighted average of the word vectors
    of that author's features (the x^- vector keyed by pid-j).
    """
    emb_model = EmbeddingModel.Instance()
    # IDF table produced by the previous preprocessing step: {feature: idf}
    idf = data_utils.load_data(
        settings.GLOBAL_DATA_DIR,
        'feature_idf.pkl')
    print('idf loaded')
    feature_db = LMDBClient('pub_authors.feature')    # (pid-j, author_feature)
    emb_db = LMDBClient("author_100.emb.weighted")    # (pid-j, x^-)
    processed = 0
    with feature_db.db.begin() as txn:
        # lmdb cursors iterate as (key, value) byte pairs
        for item in txn.cursor():
            if processed % 1000 == 0:
                # progress heartbeat with elapsed wall time
                print('cnt', processed, datetime.now() - start_time)
            processed += 1
            pid_order = item[0].decode('utf-8')
            features = data_utils.deserialize_embedding(item[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is None:
                # projection failed for this record; report its id
                print(pid_order)
            else:
                emb_db.set(pid_order, cur_emb)
Esempio n. 2
0
def dump_author_embs():
    """
    Dump author embeddings into LMDB.

    An author embedding is the IDF-weighted average of the word vectors
    for that author's features.
    """
    emb_model = EmbeddingModel.Instance()
    # {feature: idf} table computed by an earlier step
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    feature_store = LMDBClient('pub_authors.feature')
    emb_store = LMDBClient("author_100.emb.weighted")
    counter = 0
    with feature_store.db.begin() as txn:
        # lmdb cursors yield (key, value) byte pairs
        for raw_pid, raw_feature in txn.cursor():
            if counter % 1000 == 0:
                # periodic progress report with elapsed time
                print('cnt', counter, datetime.now() - start_time)
            counter += 1
            pid_order = raw_pid.decode('utf-8')
            features = data_utils.deserialize_embedding(raw_feature)
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                emb_store.set(pid_order, cur_emb)
Esempio n. 3
0
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    LMDB_NAME_EMB = "author_100.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                lc_emb.set(pid_order, cur_emb)


if __name__ == '__main__':
    """
    some pre-processing
    """
    dump_author_features_to_file()  # write author features out to a file
    dump_author_features_to_cache()  # load author features into the LMDB cache
    emb_model = EmbeddingModel.Instance()
    emb_model.train('aminer')  # training word embedding model
    cal_feature_idf()  # compute per-feature inverse document frequency
    dump_author_embs()  # IDF-weighted author embeddings -> LMDB
    print('done', datetime.now() - start_time)
Esempio n. 4
0
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # 遍历 特征
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # 解码获得 文章 编号
            features = data_utils.deserialize_embedding(
                k[1])  # 反序列化 得 对应 作者特征 对象
            cur_emb = emb_model.project_embedding(
                features, idf)  # 获得 对应 加权平均IDF 的 嵌入 x^-
            if cur_emb is not None:
                lc_emb.set(
                    pid_order, cur_emb
                )  # 结果 保存 到 作者 嵌入lmdb author_100.emb.weigthed 中  (pid-j, x^-)
            else:
                print(pid_order)


if __name__ == '__main__':
    """
    some pre-processing
    """
    dump_author_features_to_file()  # write author features out to a local file
    dump_author_features_to_cache()  # load author features into the cache
    emb_model = EmbeddingModel.Instance()  # obtain the shared embedding model
    emb_model.train('aminer')  # training word embedding model Word2Vec
    cal_feature_idf()  # compute feature -> inverse document frequency
    dump_author_embs()  # dump author embeddings into the database
    print('done', datetime.now() - start_time)
Esempio n. 5
0
    LMDB_NAME_EMB = "author_100.emb.weighted"
    lc_emb = LMDBClient(dataset_name, LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                lc_emb.set(pid_order, cur_emb)


if __name__ == '__main__':
    """
    some pre-processing
    """
    # Select the dataset to preprocess via --dataset_name.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--dataset_name", default="whoiswho_new", type=str)
    dataset_name = arg_parser.parse_args().dataset_name
    # Pipeline: features -> cache -> train embeddings -> IDF -> author embeddings.
    dump_author_features_to_file(dataset_name)
    dump_author_features_to_cache(dataset_name)
    emb_model = EmbeddingModel.Instance(dataset_name)
    emb_model.train()  # training word embedding model
    cal_feature_idf()
    dump_author_embs()
    print('done', datetime.now() - start_time)