Example #1
def dump_author_embs():  # dump author embeddings into LMDB; an author embedding is the IDF-weighted average of word vectors
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(
        settings.GLOBAL_DATA_DIR,
        'feature_idf.pkl')  # load the IDF computed by the previous function: {feature: idf}
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)  # connect to the author-feature LMDB
    LMDB_NAME_EMB = "author_100.emb.weighted"  # (pid-j, x^-)
    lc_emb = LMDBClient(LMDB_NAME_EMB)  # connect to the author-embedding LMDB
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # iterate over the features
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # decode to get the paper id (pid-j)
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize to get the corresponding author-feature object
            cur_emb = emb_model.project_embedding(
                features, idf)  # compute the IDF-weighted average embedding x^-
            if cur_emb is not None:
                lc_emb.set(
                    pid_order, cur_emb
                )  # store the result in the author-embedding LMDB author_100.emb.weighted (pid-j, x^-)
            else:
                print(pid_order)
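
EmbeddingModel.project_embedding is not shown in these examples. A minimal sketch of an IDF-weighted average of word vectors, assuming a gensim-style word-vector lookup (all names below are hypothetical):

import numpy as np

def project_embedding_sketch(features, idf, word_vectors, default_idf=1.0):
    # hypothetical re-implementation: average the word vectors of the author's
    # features, weighting each feature by its IDF value
    vectors, weights = [], []
    for f in features:
        if f in word_vectors:
            vectors.append(word_vectors[f])
            weights.append(idf.get(f, default_idf))
    if not vectors:
        return None  # mirrors the `cur_emb is None` branch above
    return np.average(np.asarray(vectors), axis=0, weights=np.asarray(weights))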
Example #2
def dump_file_todict():
    ## Read the CSV in chunks, using the chunksize / iterator parameters (commonly used options)
    yy = pd.read_csv('/home/wss/sites/disamb/sci_process/data/t_018_sci_disamb_string_precess.csv',
                       usecols = ['uid','author','title','abstract','keyword','org_name','pubyear','source'],sep = ',',iterator=True,encoding ='utf-8')

    # df = yy.get_chunk(1)
    # print(len(df))
    # print(df.columns)
    # print(df.head)
    loop = True
    chunkSize = 5000
    cnt = 0
    lc = LMDBClient('sci_all_data')
    while loop:
        try:
            chunk = yy.get_chunk(chunkSize)
            cnt += 1
            print('SCI (18M papers): stored %0.2f x 10k rows so far' % (cnt * 0.5))
            dataset_dict = chunk.to_dict(orient = 'records')
            for pap in dataset_dict:
                pap['author'] = pap['author'].split('|')
                pid_order = pap['uid']
                lc.set(pid_order, pap)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
    lc.db.close()
    print("Chunked load into the LMDB database took %0.2f s" % (time.time() - start))
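
A side note on the chunked read above: pandas can also hand back the chunk iterator directly via chunksize, which removes the manual get_chunk/StopIteration loop. A small sketch using the same file and columns:

import pandas as pd

reader = pd.read_csv('/home/wss/sites/disamb/sci_process/data/t_018_sci_disamb_string_precess.csv',
                     usecols=['uid', 'author', 'title', 'abstract', 'keyword',
                              'org_name', 'pubyear', 'source'],
                     sep=',', chunksize=5000, encoding='utf-8')
for chunk in reader:  # each chunk is a DataFrame of up to 5000 rows
    for pap in chunk.to_dict(orient='records'):
        pap['author'] = pap['author'].split('|')
        # lc.set(pap['uid'], pap)  # same LMDB write as in dump_file_todict above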
Example #3
def test_prepare_local_data(Name):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    # cnt = 0
    wf_contents = []
    for i, name in enumerate(name_to_pubs_test):
        if name != Name: continue
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            # if len(items) < 5:
            #     continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)
                wf_contents.append({'pid': pid, 'label': pids2label[pid]})
                # cur_pub_emb = list(map(str, cur_pub_emb))
                # wf_content.write('{}\t'.format(pid))
                # wf_content.write('\t'.join(cur_pub_emb))
                # wf_content.write('\t{}\n'.format(pids2label[pid]))
    PidsLabels = [x['label'] for x in wf_contents]
    print(len(set(PidsLabels)))
Example #4
def dump_author_embs():
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    LMDB_NAME_EMB = "author_100.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')
            # print ("pid_order: ", pid_order)
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                # print ("pid_order: is not none", pid_order)
                lc_emb.set(pid_order, cur_emb)
Example #5
def testDataRun():
    cnt = 0
    metrics = np.zeros(3)
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'),
                     'w',
                     encoding='utf-8')
    LMDB_NAME_EMB = "graph_auto_encoder_embedding"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    name_to_pubs_test = load_test_names()
    for name in name_to_pubs_test:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(
            name=name, needtSNE=True)
        print(name, prec, rec, f1)
        wf.write('{0},{1:.5f},{2:.5f},{3:.5f}\n'.format(name, prec, rec, f1))
        wf.flush()

        metrics[0] = metrics[0] + prec
        metrics[1] = metrics[1] + rec
        metrics[2] = metrics[2] + f1
        cnt += 1

        for pid, embedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, embedding)

    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = eval_utils.cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))
    wf.close()
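
eval_utils.cal_f1 is not defined in these snippets; presumably it is just the harmonic mean of the macro precision and recall, roughly:

def cal_f1_sketch(prec, rec):
    # F1 as the harmonic mean of precision and recall, guarding against division by zero
    return 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0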
Example #6
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    Res = defaultdict(list)
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    # print(name_to_pubs_test)
    for name in name_to_pubs_test:
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]
                   ) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
            Res[pid_].append(inter_embs[i])

    # the same as the train data
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                              'name_to_pubs_train_500.json')
    for name in name_to_pubs_train:
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]
                   ) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
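
get_hidden_output is also not shown. A plausible sketch, assuming the trained global model is a Keras model and the desired hidden embedding is the output of one of its intermediate layers (the layer index here is an assumption):

from keras import backend as K

def get_hidden_output_sketch(model, inputs, layer_index=-2):
    # build a backend function that maps the model input to an intermediate layer's output
    hidden_fn = K.function([model.input], [model.layers[layer_index].output])
    return hidden_fn([inputs])[0]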
Example #7
def testHAN():
    LMDB_NAME_EMB = "lc_attention_network_embedding2"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)

    name_to_pubs_train = load_train_names()
    for name in name_to_pubs_train:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(
            name=name, ispretrain=True, needtSNE=False)
        for pid, attentionEmbedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, attentionEmbedding)
        print(name, prec, rec, f1)
Example #8
def dumpPublicationLabel():
    PUBLICATION_LABEL = 'Publication.label'
    lc_publication_label = LMDBClient(PUBLICATION_LABEL)
    with open(join(settings.TRAIN_PUB_DIR, "train_author.json"), "r") as fp:
        train_author = json.load(fp)
        fp.close()
    classes_dict = encode_labels(train_author)

    for name in train_author.keys():
        for aid in train_author[name].keys():
            for pid in train_author[name][aid]:
                print("%s : %s" % (pid, classes_dict[aid]))
                lc_publication_label.set_label(pid, classes_dict[aid])
Example #9
def dump_author_features_to_cache():
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'),
                     'r',
                     encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            author_features = items[1].split()
            lc.set(pid_order, author_features)
Example #10
def cal_feature_idf():
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')
    counter = dd(int)
    cnt = 0
    LMDB_NAME = 'sci_all_data_feature'
    lc = LMDBClient(LMDB_NAME)
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
#            print(k[0])
            features = data_utils.deserialize_embedding(k[1])
#            print(features)
            if author_cnt % 10000 == 0:
                print(author_cnt, features[0], counter.get(features[0]))
            author_cnt += 1
            for f in features:
                cnt += 1
                counter[f] += 1
    idf = {}
    for k in counter:
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(dict(idf), feature_dir, "feature_idf.pkl")
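
Note that the IDF above is log(total feature occurrences / occurrences of feature f), computed over feature occurrences rather than documents. A tiny worked example with made-up counts (the feature names are illustrative):

import math

counter = {'__NAME__yanjun_zhang': 3, '__WORD__neural': 7}
cnt = sum(counter.values())                        # 10 feature occurrences in total
idf = {f: math.log(cnt / counter[f]) for f in counter}
# idf['__NAME__yanjun_zhang'] = log(10 / 3) ≈ 1.20, idf['__WORD__neural'] = log(10 / 7) ≈ 0.36
# rarer features get larger IDF and therefore dominate the weighted-average embedding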
Example #11
def cal_feature_idf():  # compute inverse document frequency
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    feature_dir = join(settings.DATA_DIR, 'global')  # feature directory
    counter = dd(int)  # defaultdict: unlike {}, a missing key returns int(0)
    cnt = 0
    LMDB_NAME = 'pub_authors.feature'  # (pid-j, author_feature)
    lc = LMDBClient(LMDB_NAME)  # connect to the LMDB
    author_cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():  # iterate over the LMDB
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize to get the feature object; k[0] is the id, k[1] is the author_feature
            if author_cnt % 10000 == 0:
                print(
                    author_cnt, features[0], counter.get(features[0])
                )  # features[0] is a co-author name feature such as "__NAME__yanjun_zhang"
            author_cnt += 1  # count author records
            for f in features:
                cnt += 1  # total number of feature occurrences
                counter[f] += 1  # occurrence count of feature f
    idf = {}
    for k in counter:  # compute the IDF of feature k
        idf[k] = math.log(cnt / counter[k])
    data_utils.dump_data(
        dict(idf), feature_dir,
        "feature_idf.pkl")  # write {feature: idf} to feature_idf.pkl
Example #12
def test():
    LMDB_NAME_FEATURE = 'pub_authors.feature'
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            pid_order = k[0].decode('utf-8')
            print("pid order", pid_order)
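
LMDBClient itself is not included in these examples. Judging from the get/set/db.begin usage, it is a thin wrapper around an lmdb environment, presumably with pickle-style value serialization; a minimal sketch under those assumptions (not the project's actual class):

import lmdb
import os
import pickle
from os.path import join

class LMDBClientSketch:
    def __init__(self, name, lmdb_dir='lmdb', map_size=1 << 40):
        path = join(lmdb_dir, name)
        os.makedirs(path, exist_ok=True)   # lmdb keeps its data files inside this directory
        self.db = lmdb.open(path, map_size=map_size)

    def get(self, key):
        with self.db.begin() as txn:
            value = txn.get(key.encode('utf-8'))
        return pickle.loads(value) if value is not None else None

    def set(self, key, value):
        with self.db.begin(write=True) as txn:
            txn.put(key.encode('utf-8'), pickle.dumps(value))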
Example #13
def test(name):
    LMDB_NAME_EMB = "lc_attention_network_embedding"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    prec, rec, f1, pids, embedding = han.prepare_and_train(name=name,
                                                           ispretrain=False,
                                                           needtSNE=True)
    print(name, prec, rec, f1)
Example #14
def dump_author_features_to_cache():
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'publication_IslandLoss.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(join(settings.ISLAND_LOSS_DIR, 'paper_features.txt'),
                     'r',
                     encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            pid_order = items[0]
            # print ("pid_order: ", pid_order, items)
            author_features = items[1].split()
            lc.set(pid_order, author_features)
Example #15
def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print ("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
Example #16
def gen_local_data(pids,
                   labels,
                   idf_threshold=10
                   ):  # for each author name, generate local data: paper features and the paper network; the parameter is the similarity threshold above which two papers are connected by an edge
    """
    generate local data (including paper features and paper network) for each associated name
    :param idf_threshold: threshold for determining whether there exists an edge between two papers (for this demo we set 29)
    """
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR,
                               'feature_idf.pkl')  # load the feature IDF values {word: idf}
    INTER_LMDB_NAME = 'author_triplets.emb'  # author embeddings after triplet training (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"  # raw author features (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))  # create the directory for saving the local-model data
    os.makedirs(graph_dir, exist_ok=True)

    name = "Name"

    wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
    shuffle(pids)  # shuffle the papers

    for i, pid in enumerate(pids):
        cur_pub_emb = lc_inter.get(pid)  # get the paper embedding y
        if cur_pub_emb is not None:
            cur_pub_emb = list(map(str, cur_pub_emb))  # convert cur_pub_emb to strings
            wf_content.write('{}\t'.format(pid))  # paper id
            wf_content.write('\t'.join(cur_pub_emb))  # embedding y
            wf_content.write('\t{}\n'.format(pid))  # pid
        else:
            print("ERROR: embedding y not found for pid: %s\n" % (pid))

    wf_content.close()  # pid-j, y, aid

    # generate network
    n_pubs = len(pids)
    wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)),
                      'w')  # network file for this author name (pid-j, pid-j)
    edges_num = 0
    for i in range(n_pubs - 1):  # enumerate paper i
        author_feature1 = set(lc_feature.get(
            pids[i]))  # raw features of paper i (pid-j, author_feature)
        for j in range(i + 1, n_pubs):  # enumerate every later paper j
            author_feature2 = set(lc_feature.get(pids[j]))  # raw features of paper j
            common_features = author_feature1.intersection(
                author_feature2)  # shared features
            idf_sum = 0
            for f in common_features:  # enumerate each shared feature f
                idf_sum += idf.get(f, idf_threshold)  # accumulate the IDF sum
                # print(f, idf.get(f, idf_threshold))
            if idf_sum >= idf_threshold:  # the sum reaches the threshold
                wf_network.write('{}\t{}\n'.format(
                    pids[i], pids[j]))  # add an edge and write it to the network file (pid-j, pid-j)
                edges_num = edges_num + 1
    print('n_edges', edges_num)
    wf_network.close()
    ''' name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json') # load the test set name->aid->pid-j
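
To make the edge rule above concrete: two papers are linked exactly when the summed IDF of their shared author features reaches idf_threshold. A tiny illustration with made-up IDF values:

idf_threshold = 32
idf = {'__NAME__hongbin_li': 20.5, '__ORG__some_university': 14.2}   # made-up values
author_feature1 = {'__NAME__hongbin_li', '__ORG__some_university', '__WORD__laser'}
author_feature2 = {'__NAME__hongbin_li', '__ORG__some_university'}
common_features = author_feature1.intersection(author_feature2)
idf_sum = sum(idf.get(f, idf_threshold) for f in common_features)    # 20.5 + 14.2 = 34.7
# 34.7 >= 32, so the pair (pid_i, pid_j) is written as an edge into *_pubs_network.txt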
Example #17
def dump_author_features_to_cache():  # dump the author features into the local LMDB cache
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(LMDB_NAME)
    with codecs.open(
            join(settings.GLOBAL_DATA_DIR, 'author_features.txt'),
            'r',
            encoding='utf-8') as rf:  # the features were written to author_features.txt by an earlier step; read them back here
        for i, line in enumerate(
                rf):  # enumerate line i; each line holds one author_feature in the form pid-j\tauthor_feature
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split(
                '\t')  # strip trailing whitespace, then split on '\t' into pid-j and author_feature
            pid_order = items[0]  # paper index in the format written by the previous function: pid-j (paper id, j-th author)
            author_features = items[1].split()  # author features, split on spaces into a list
            lc.set(pid_order, author_features)  # store in the database
Example #18
def test(idf_threshold):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # 286 hongbin_li_pubs_content.txt
        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)

        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)

        # generate network1
        all_idf_sum = 0
        pathCnt = 0
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        for i in range(n_pubs - 1):
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                all_idf_sum += idf_sum
                if idf_sum >= idf_threshold:
                    pathCnt = pathCnt + 1

        if name == "kexin_xu":
            print("all_idf_sum: ", all_idf_sum)
            print("pathCnt: ", pathCnt)
Example #19
def dump_author_features_to_cache(dataset_name):
    """
    dump author features to lmdb
    """
    LMDB_NAME = 'pub_authors.feature'
    lc = LMDBClient(dataset_name, LMDB_NAME)
    with codecs.open(join(settings.get_global_data_dir(dataset_name),
                          'author_features.txt'),
                     'r',
                     encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if i % 1000 == 0:
                print('line', i)
            items = line.rstrip().split('\t')
            # print(line)
            pid_order = items[0]
            if len(items) > 1:
                author_features = items[1].split()
            else:
                author_features = []
            lc.set(pid_order, author_features)
Example #20
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    TRAIN_NAME_LIST, TEST_NAME_LIST = settings.get_split_name_list(
        test_dataset_name)
    for case_name in TRAIN_NAME_LIST + TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name),
            "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]
                   ) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        if len(embs_input) <= 1:
            embs_input = []
            pids = []
            for i, aid in enumerate(name_data.keys()):
                if len(name_data[aid]
                       ) < 1:  # n_pubs of current author is too small
                    continue
                for pid in name_data[aid]:
                    cur_emb = lc_input.get(pid)
                    if cur_emb is None:
                        continue
                    embs_input.append(cur_emb)
                    pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
Example #21
def process_by_name(pids):
    ### preprocessing
    print('n_papers: ', len(pids))
    if len(pids) < 10:
        print("too few papers, continue...")
        return

    ### prepare_local_data
    IDF_THRESHOLD = 32
    dump_inter_emb(pids)
    gen_local_data(idf_threshold=IDF_THRESHOLD, pids=pids, labels=None)

    ### count_size
    LMDB_NAME = "author_100.emb.weighted"  # (pid-j, x^-)
    lc = LMDBClient(LMDB_NAME)  # IDF-weighted average author feature embeddings (x^-)

    k = 300
    test_x = []
    x = []  # sample k paper features x^- for this name and put them into one list
    sampled_points = [
        pids[p] for p in np.random.choice(len(pids), k, replace=True)
    ]  # sample k papers (with replacement) from the paper set
    for p in sampled_points:
        x.append(lc.get(p))  # fetch the feature x^- from the database
    test_x.append(np.stack(x))
    test_x = np.stack(test_x)

    model_dir = join(settings.OUT_DIR, 'model')  # model directory
    rf = open(join(model_dir, 'model-count.json'), 'r')  # load the model architecture
    model_json = rf.read()
    rf.close()
    loaded_model = model_from_json(model_json)
    loaded_model.load_weights(join(model_dir, 'model-count.h5'))  # load the model weights

    kk = loaded_model.predict(test_x)
    print('num_pred:', kk)

    ### local\gae\train
    ret = gae_for_na('Name', int(kk[0][0]))
    return ret
Example #22
def dump_features_to_cache():
    '''
    generate author features by raw publication data and dump to cache
    
    '''
    lc = LMDBClient('sci_all_data')
    lm = LMDBClient('sci_all_data_feature')
    cnt = 0
    with lc.db.begin() as txn:
        for k in txn.cursor():
            cnt += 1
            pid = k[0].decode()
            paper = data_utils.deserialize_embedding(k[1])
            if len(paper["author"]) > 100:
                print(cnt, pid, len(paper["author"]))
                continue
            features = extract_author_features(paper)
            if cnt % 10000 == 0:
                print('Extracted %d x 10k papers so far' % (cnt / 10000))
            lm.set(pid,features)
    lm.db.close()
    lc.db.close()
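
extract_author_features is not shown either. Judging from the prefixed feature strings that cal_feature_idf prints (e.g. "__NAME__yanjun_zhang"), it presumably flattens one paper record into prefixed string tokens; a purely hypothetical sketch:

def extract_author_features_sketch(paper):
    # hypothetical: turn one paper record into a flat list of prefixed string features
    features = []
    for author in paper.get('author', []):
        features.append('__NAME__' + author.strip().lower().replace(' ', '_'))
    if paper.get('org_name'):
        features.append('__ORG__' + str(paper['org_name']).strip().lower().replace(' ', '_'))
    for word in str(paper.get('title', '')).lower().split():
        features.append('__WORD__' + word)
    return features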
Example #23
def testHAN():
    LMDB_NAME_EMB = "lc_attention_network_embedding"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    start = False

    name_to_pubs_train = load_train_names()
    for name in name_to_pubs_train:
        if name == 'din_ping_tsai':
            start = True

        if start:
            prec, rec, f1 = han.prepare_and_train(name=name, ispretrain=True)
            print(name, prec, rec, f1)
Example #24
 def train(self, wf_name, size=EMB_DIM):
     data = []
     LMDB_NAME = 'pub_authors.feature'
     lc = LMDBClient(LMDB_NAME)
     author_cnt = 0
     with lc.db.begin() as txn:
         for k in txn.cursor():
             author_feature = data_utils.deserialize_embedding(k[1])
             if author_cnt % 10000 == 0:
                 print(author_cnt, author_feature[0])
             author_cnt += 1
             random.shuffle(author_feature)
             # print(author_feature)
             data.append(author_feature)
     self.model = Word2Vec(
         data, size=size, window=5, min_count=5, workers=20,
     )
     self.model.save(join(settings.EMB_DATA_DIR, '{}.emb'.format(wf_name)))
Example #25
 def train(self, wf_name, size=EMB_DIM):  # train the word-embedding model
     data = []
     LMDB_NAME = 'pub_authors.feature'  # populated earlier from author_features.txt
     lc = LMDBClient(LMDB_NAME)  # connect to the database (pid-j, author_feature)
     author_cnt = 0
     with lc.db.begin() as txn:
         for k in txn.cursor():  # iterate via the cursor
             author_feature = data_utils.deserialize_embedding(
                 k[1])  # deserialize k[1] to get the author-feature object
             if author_cnt % 10000 == 0:
                 print(author_cnt, author_feature[0])
             author_cnt += 1  # count author records
             random.shuffle(author_feature)  # shuffle the author features
             # print(author_feature)
             data.append(author_feature)  # append to the training corpus
     self.model = Word2Vec(
         data,
         size=size,
         window=5,
         min_count=5,
         workers=20,
     )  # corpus, vector dimension, window size (max distance between current and predicted word), minimum word frequency, number of worker threads
     self.model.save(join(settings.EMB_DATA_DIR,
                          '{}.emb'.format(wf_name)))  # save the trained model, e.g. to aminer.emb
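
A short usage note for the model trained above (the size= keyword implies a pre-4.0 gensim; the file name aminer.emb comes from the comment, and settings is the project's own module):

from os.path import join
from gensim.models import Word2Vec
from utils import settings

model = Word2Vec.load(join(settings.EMB_DATA_DIR, 'aminer.emb'))
vec = model.wv['__NAME__yanjun_zhang']  # 100-d vector, assuming this token survived min_count=5
print(vec.shape)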
Example #26
def dump_inter_emb(pids):  # extract the hidden-layer embedding from the trained global model for the local model to use
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"  # author features mapped to 100-d Word2Vec vectors and IDF-weight-averaged: embedding x^- (pid-j, x^-)
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'  # (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)  # inner-embedding database; the new embeddings y are written here
    global_model = GlobalTripletModel(data_scale=1000000)  # instantiate a global model
    trained_global_model = global_model.load_triplets_model()  # load a trained global model

    embs_input = []
    kept_pids = []
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            print("ERROR: embedding x not found for pid: %s\n" % (pid))
            continue
        embs_input.append(cur_emb)
        kept_pids.append(pid)  # keep only pids that actually had an input embedding, so indices stay aligned
    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    for i, pid in enumerate(kept_pids):
        lc_inter.set(pid, inter_embs[i])
    ''' name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json') # load the test set name->aid->pid-j
Example #27
flags.DEFINE_integer('hidden2', 64, 'Number of units in hidden layer 2.')  # 16
flags.DEFINE_float('weight_decay', 0., 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')

flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
flags.DEFINE_string('name', 'hui_fang', 'Dataset string.')
# flags.DEFINE_integer('features', 1, 'Whether to use features (1) or not (0).')
flags.DEFINE_integer('is_sparse', 0, 'Whether input features are sparse.')

model_str = FLAGS.model
name_str = FLAGS.name
start_time = time.time()

from utils.cache import LMDBClient
INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
lc_inter = LMDBClient(INTER_LMDB_NAME)

RAW_INTER_NAME = 'author_100.emb.weighted'
lc_inter_raw = LMDBClient(RAW_INTER_NAME)  # use the raw-embedding store defined above

tripleteLossLMDBName = 'author_triplets.emb'
tripletFeature = LMDBClient(tripleteLossLMDBName)


RAWFEATURE = "rawfeature"
ATTENTIONFEATURE = "attention_feature"
TRIPLETFEATURE = "triplet_feature"

def encode_labels(labels):
    classes = set(labels)
    classes_dict = {c: i for i, c in enumerate(classes)}
Example #28
from utils.cache import LMDBClient
import tensorflow as tf
from os.path import join
from utils import settings
import numpy as np
from IslandLoss.prepareTrainData import prepareData
from utils import EmbedingCheck
from keras import backend as K

LMDB_NAME = "author_IslandLoss.emb.weighted"
lc = LMDBClient(LMDB_NAME)

TrainX, TrainY, TestX, TestY, NumberOfClass, AllX, Ally, pids = prepareData(
    'all')
MeanAllX = np.mean(AllX, axis=0)

Embedding = 100
NUM_CLASSES = NumberOfClass
CENTER_LOSS_ALPHA = 0.0001
Island_Loss_ALPHA = 1.0
ratio = 0.0001
epochs = 3000


def l2Norm(x):
    return K.l2_normalize(x, axis=-1)


with tf.name_scope('input'):
    input_images = tf.placeholder(tf.float32,
                                  shape=(None, Embedding),
Example #29
from utils import data_utils
from utils import settings
import codecs
from os.path import join
from datetime import datetime
from utils import feature_utils
from global_.embedding import EmbeddingModel
from utils.cache import LMDBClient
from collections import defaultdict as dd

import math

pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
LMDB_NAME_EMB = "publication.emb.weighted"
lc_emb = LMDBClient(LMDB_NAME_EMB)
cnt = 0

for i, pid in enumerate(pubs_dict):
    if i % 1000 == 0:
        print("idx: %d" % (i))
    emb = lc_emb.get(pid)  # check every pid; only the progress print is throttled
    if emb is None:
        print("%s emb is null" % (pid))
        cnt = cnt + 1

print("cnt: %d" % (cnt))
Example #30
from os.path import join
import numpy as np
import keras.backend as K
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from utils.cache import LMDBClient
from utils import data_utils
from utils import settings

LMDB_NAME = "author_100.emb.weighted"
lc = LMDBClient(LMDB_NAME)

data_cache = {}


def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))


def root_mean_log_squared_error(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), np.inf) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), np.inf) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log), axis=-1))


def create_model():
    model = Sequential()
    model.add(Bidirectional(LSTM(64), input_shape=(300, 100)))
    model.add(Dropout(0.5))
    model.add(Dense(1))
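
The snippet is cut off at this point; presumably the model is compiled with one of the loss functions defined above and used to predict a cluster count from stacked (300, 100) inputs like the test_x built in Example #21. A hedged sketch of that final step (the optimizer choice is an assumption):

def create_and_compile_model_sketch():
    # same architecture as create_model above, compiled with the RMSE loss defined earlier
    model = Sequential()
    model.add(Bidirectional(LSTM(64), input_shape=(300, 100)))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.compile(loss=root_mean_squared_error, optimizer='rmsprop')
    return model

# usage sketch: estimate the number of clusters for one name from 300 sampled 100-d embeddings
# test_x = np.stack([np.stack(sampled_embeddings)])     # shape (1, 300, 100)
# k_pred = create_and_compile_model_sketch().predict(test_x)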