Example #1
def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print ("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
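Note: every example in this collection goes through the project's LMDBClient wrapper, whose implementation is not shown here. A minimal sketch of the interface these snippets rely on (string keys, pickled values), assuming the lmdb package; the class name LMDBClientSketch and the map_size default are placeholders, not the project's actual code:

import lmdb
import pickle

class LMDBClientSketch:
    """Hypothetical stand-in for LMDBClient: string keys, pickled values."""

    def __init__(self, name, map_size=1 << 30):
        self.db = lmdb.open(name, map_size=map_size)

    def get(self, key):
        with self.db.begin() as txn:
            value = txn.get(key.encode('utf-8'))
        return pickle.loads(value) if value is not None else None

    def set(self, key, value):
        with self.db.begin(write=True) as txn:
            txn.put(key.encode('utf-8'), pickle.dumps(value))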
Example #2
def dump_author_embs():  # write author embeddings to lmdb; each embedding is the IDF-weighted average of its word vectors
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(
        settings.GLOBAL_DATA_DIR,
        'feature_idf.pkl')  # load the idf computed by the previous step: {feature: idf}
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)  # connect to the author-feature lmdb
    LMDB_NAME_EMB = "author_100.emb.weighted"  # (pid-j, x^-)
    lc_emb = LMDBClient(LMDB_NAME_EMB)  # connect to the author-embedding lmdb
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # iterate over all author features
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # decode to get the paper id (pid-j)
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize into the corresponding author-feature object
            cur_emb = emb_model.project_embedding(
                features, idf)  # embedding x^-: IDF-weighted average of word vectors
            if cur_emb is not None:
                lc_emb.set(
                    pid_order, cur_emb
                )  # save the result to the author-embedding lmdb author_100.emb.weighted as (pid-j, x^-)
            else:
                print(pid_order)
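Note: project_embedding is defined elsewhere; per the docstring above it computes the IDF-weighted average of the word vectors of an author's feature tokens. A minimal numpy sketch under that assumption (word_vectors is a hypothetical dict mapping feature tokens to 100-dim vectors):

import numpy as np

def project_embedding_sketch(features, idf, word_vectors, dim=100):
    """IDF-weighted average of the word vectors of the given feature tokens."""
    vec_sum = np.zeros(dim)
    weight_sum = 0.0
    for f in features:
        if f not in word_vectors:
            continue
        w = idf.get(f, 1.0)                  # assumed default weight for unseen features
        vec_sum += w * np.asarray(word_vectors[f])
        weight_sum += w
    if weight_sum == 0.0:
        return None                          # mirrors the `cur_emb is None` branch above
    return vec_sum / weight_sum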
Example #3
def testDataRun():
    cnt = 0
    metrics = np.zeros(3)
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'),
                     'w',
                     encoding='utf-8')
    LMDB_NAME_EMB = "graph_auto_encoder_embedding"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    name_to_pubs_test = load_test_names()
    for name in name_to_pubs_test:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(
            name=name, needtSNE=True)
        print(name, prec, rec, f1)
        wf.write('{0},{1:.5f},{2:.5f},{3:.5f}\n'.format(name, prec, rec, f1))
        wf.flush()

        metrics[0] = metrics[0] + prec
        metrics[1] = metrics[1] + rec
        metrics[2] = metrics[2] + f1
        cnt += 1

        for pid, embedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, embedding)

    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = eval_utils.cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))
    wf.close()
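Note: eval_utils.cal_f1 is not shown; since it combines the macro-averaged precision and recall above, it is presumably the harmonic mean, along the lines of:

def cal_f1_sketch(prec, rec):
    """Harmonic mean of precision and recall; 0 when both are 0."""
    return 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0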
Example #4
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
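Note: get_hidden_output runs the stacked input embeddings through the trained triplet network and returns an intermediate-layer activation. One common way to do that with a TF1-era Keras model is a backend function over the anchor input; the layer index below is a placeholder, not the project's actual value:

import numpy as np
from keras import backend as K

def get_hidden_output_sketch(model, inputs, layer_index=3):
    """Feed `inputs` into the model's first (anchor) input and return the
    output of `model.layers[layer_index]` in inference mode."""
    get_activations = K.function(model.inputs[:1] + [K.learning_phase()],
                                 [model.layers[layer_index].output])
    return get_activations([np.asarray(inputs), 0])[0]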
Example #5
def dump_author_embs():
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    LMDB_NAME_EMB = "author_100.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')
            # print ("pid_order: ", pid_order)
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                # print ("pid_order: is not none", pid_order)
                lc_emb.set(pid_order, cur_emb)
Example #6
def dump_file_todict():
    ## read the CSV in chunks, mainly via the chunksize and iterator parameters (commonly used)
    yy = pd.read_csv('/home/wss/sites/disamb/sci_process/data/t_018_sci_disamb_string_precess.csv',
                       usecols = ['uid','author','title','abstract','keyword','org_name','pubyear','source'],sep = ',',iterator=True,encoding ='utf-8')

    # df = yy.get_chunk(1)
    # print(len(df))
    # print(df.columns)
    # print(df.head)
    loop = True
    chunkSize = 5000
    cnt = 0
    lc = LMDBClient('sci_all_data')
    while loop:
        try:
            chunk = yy.get_chunk(chunkSize)
            cnt += 1
            print('18M SCI papers: stored %0.2f * 10,000 rows so far' % (cnt * 0.5))
            dataset_dict = chunk.to_dict(orient = 'records')
            for pap in dataset_dict:
                pap['author'] = pap['author'].split('|')
                pid_order = pap['uid']
                lc.set(pid_order, pap)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
    lc.db.close()
    print("分块处理存取进lmdb数据库用时为:%0.2f s"%(time.time()-start))
Example #7
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
Example #8
def testHAN():
    LMDB_NAME_EMB = "lc_attention_network_embedding2"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)

    name_to_pubs_train = load_train_names()
    for name in name_to_pubs_train:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(
            name=name, ispretrain=True, needtSNE=False)
        for pid, attentionEmbedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, attentionEmbedding)
        print(name, prec, rec, f1)
Example #9
def dump_author_features_to_cache():
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'),
                     'r',
                     encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            author_features = items[1].split()
            lc.set(pid_order, author_features)
Example #10
def dump_author_features_to_cache():
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'publication_IslandLoss.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(join(settings.ISLAND_LOSS_DIR, 'paper_features.txt'),
                     'r',
                     encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            # print ("pid_order: ", pid_order, items)
            author_features = items[1].split()
            lc.set(pid_order, author_features)
Example #11
def dump_author_features_to_cache():  # write author features to the local lmdb cache
    """
    dump author features to lmdb 
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(
            join(settings.GLOBAL_DATA_DIR, 'author_features.txt'),
            'r',
            encoding='utf-8') as rf:  # the features were written to author_features.txt earlier; read them back here
        for i, line in enumerate(
                rf):  # one record per line: pid-j\tauthor_feature
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split(
                '\t')  # strip trailing whitespace, then split on '\t' into pid-j and author_feature
            pid_order = items[0]  # document index pid-j (paper id, j-th author), matching the previous function's output format
            author_features = items[1].split()  # the author's features, split on spaces into a list
            lc.set(pid_order, author_features)  # write into the database
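Note: the comments above describe the input format: one record per line, the pid-j key (paper id plus author index) and its space-separated feature tokens joined by a tab. A made-up line (both the id and the tokens are purely illustrative) and the parsing it goes through:

line = "paper0001-0\t__NAME__li_ming org_tsinghua_university title_graph_embedding\n"

items = line.rstrip().split('\t')   # -> ['paper0001-0', '__NAME__li_ming org_tsinghua_university ...']
pid_order = items[0]                # 'paper0001-0'
author_features = items[1].split()  # ['__NAME__li_ming', 'org_tsinghua_university', ...]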
Example #12
def dump_author_features_to_cache(dataset_name):
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(dataset_name, LMDB_NAME)
    with codecs.open(join(settings.get_global_data_dir(dataset_name),
                          'author_features.txt'),
                     'r',
                     encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            # print(line)
            pid_order = items[0]
            if len(items) > 1:
                author_features = items[1].split()
            else:
                author_features = []
            lc.set(pid_order, author_features)
Example #13
def dump_features_to_cache():
    '''
    generate author features from raw publication data and dump to cache
    '''
    lc = LMDBClient('sci_all_data')
    lm = LMDBClient('sci_all_data_feature')
    cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            cnt += 1
            pid = k[0].decode()
            paper = data_utils.deserialize_embedding(k[1])
            if len(paper["author"]) > 100:
                print(cnt, pid, len(paper["author"]))
                continue
            features = extract_author_features(paper)
            if cnt % 10000 == 0:
                print('extracted features for %d * 10,000 papers' % (cnt / 10000))
            lm.set(pid,features)
    lm.db.close()
    lc.db.close()
Example #14
def dump_inter_emb(pids):  # take the hidden-layer output of the trained global model for the local model to use
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"  # 连接数据库 这是 作者特征经过Word2Vec处理为100维向量后加权平均后的 嵌入(x^-) (pid-j, x^-)
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'  # (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)  # 内层嵌入 数据库 将测试集的作者的新嵌入 y 写入其中
    global_model = GlobalTripletModel(data_scale=1000000)  # 实例化一个全局模型
    trained_global_model = global_model.load_triplets_model()  # 加载一个训练好的全局模型

    embs_input = []
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            print("ERROR: not found embedding x for pid:%s\n" % (pid))
            continue
        embs_input.append(cur_emb)
    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    for i, pid in enumerate(pids):
        lc_inter.set(pid, inter_embs[i])
    # name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')  # load the test set: name -> aid -> pid-j
Example #15
for name in name_to_test:
    for aid in name_to_test[name]:
        if len(name_to_test[name][aid]) < 5:
            continue
        for pid in name_to_test[name][aid]:
            TestDataPids.append(pid)


def getRawEmbedding(pids):
    rawEmbedding = []
    for pid in pids:
        rawEmbedding.append(rawFeature.get(pid))
    rawEmbedding = np.array(rawEmbedding)  # shape: (n_pids, emb_dim)
    return rawEmbedding

TestDataEmbedding = getRawEmbedding(TestDataPids)
transformEmbedding = raw2localTrans.predict(TestDataEmbedding)

LMDB_NAME_EMB = "raw_transform_local_embedding"
lc_emb = LMDBClient(LMDB_NAME_EMB)

# print (transformEmbedding)

for idx, pid in enumerate(TestDataPids):
    print (idx, pid)
    embedd = transformEmbedding[idx]
    lc_emb.set(pid, embedd)

print ("done")
Example #16
feature = tf.keras.layers.Lambda(l2Norm, name='norm_layer',
                                 output_shape=[64])(layer2)
logits = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(feature)

export_path = join(settings.ISLAND_LOSS_DIR, "feature_model")

saver = tf.train.Saver()
with tf.Session() as sess:
    # vali_acc = "0.47049925"
    vali_acc = "0.63398695"
    path = join(settings.ISLAND_LOSS_DIR, "200", "vali_acc_%s" % (vali_acc),
                "feature_model")

    saver.restore(sess, path)
    # saver.restore(sess, join(settings.ISLAND_LOSS_DIR, "feature_model"))

    Features = sess.run(feature,
                        feed_dict={
                            input_images: AllX - MeanAllX,
                            labels: Ally
                        })

    # EmbedingCheck.check(Features, Ally, name="train2_embedding.jpg")

    for idx, pid in enumerate(pids):
        emb = Features[idx]
        lc.set(pid, emb)

    # EmbedingCheck.check(Features, TestY, name="train2_embedding.jpg")
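Note: l2Norm is defined elsewhere; as used in the Lambda layer above it presumably normalizes each 64-dimensional feature vector to unit length. A plausible body, shown here under a sketch name:

import tensorflow as tf

def l2Norm_sketch(x):
    """L2-normalize each row of the batch (candidate body for the Lambda layer above)."""
    return tf.nn.l2_normalize(x, axis=-1)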
Example #17
def dump_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=10000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train.json')
    for name in name_to_pubs_train:
        if name == "roger_williams":
            continue
        # print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    # print ("pid emb is null: ", pid)
                    continue

                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test.json')
    for name in name_to_pubs_test:
        if name == "roger_williams" or name == "j_yu":
            continue
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    print ("pid emb is null: ", pid)
                    continue

                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print ("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
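Note: a caveat shared by several of the dump functions above: if every pid of a name gets filtered out (missing input embedding, or fewer than 5 pubs per author), embs_input stays empty and np.stack([]) raises a ValueError. A defensive variant of the per-name loop, sketched with the same helpers used above:

import numpy as np

def dump_hidden_for_pids(lc_input, lc_inter, trained_global_model, pids):
    """Skip a name entirely when none of its pids has an input embedding."""
    embs_input, kept_pids = [], []
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            continue
        embs_input.append(cur_emb)
        kept_pids.append(pid)
    if not embs_input:  # np.stack would fail on an empty list
        return
    inter_embs = get_hidden_output(trained_global_model, np.stack(embs_input))
    for pid, emb in zip(kept_pids, inter_embs):
        lc_inter.set(pid, emb)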