def gen_local_data(pids,
                   labels,
                   idf_threshold=10
                   ):  # for each author name, generate local data (paper features and the paper network); the parameter is the threshold: an edge is added only when the IDF similarity of two papers reaches it
    """
    generate local data (including paper features and paper network) for each associated name
    :param idf_threshold: threshold for determining whether there exists an edge between two papers (for this demo we set 29)
    """
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR,
                               'feature_idf.pkl')  # load the feature idf values {word: idf}
    INTER_LMDB_NAME = 'author_triplets.emb'  # inner embeddings of papers after triplet training (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"  # raw author features (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))  # create the directory where the local data is saved
    os.makedirs(graph_dir, exist_ok=True)

    name = "Name"

    wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
    shuffle(pids)  # shuffle the paper ids

    for i, pid in enumerate(pids):
        cur_pub_emb = lc_inter.get(pid)  # fetch the paper embedding y
        if cur_pub_emb is not None:
            cur_pub_emb = list(map(str, cur_pub_emb))  # convert cur_pub_emb to a string representation
            wf_content.write('{}\t'.format(pid))  # paper id
            wf_content.write('\t'.join(cur_pub_emb))  # embedding y
            wf_content.write('\t{}\n'.format(pid))  # pid
        else:
            print("ERROR: embedding y not found for pid: %s\n" % (pid))

    wf_content.close()  # pid-j, y, aid

    # generate network
    n_pubs = len(pids)
    wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)),
                      'w')  # network file for this author name (pid-j, pid-j)
    edges_num = 0
    for i in range(n_pubs - 1):  # enumerate paper i
        author_feature1 = set(lc_feature.get(
            pids[i]))  # raw features of paper i (pid-j, author_feature)
        for j in range(i + 1, n_pubs):  # enumerate every later paper j
            author_feature2 = set(lc_feature.get(pids[j]))  # raw features of paper j
            common_features = author_feature1.intersection(
                author_feature2)  # features shared by the two papers
            idf_sum = 0
            for f in common_features:  # for each shared feature f
                idf_sum += idf.get(f, idf_threshold)  # accumulate the idf sum
                # print(f, idf.get(f, idf_threshold))
            if idf_sum >= idf_threshold:  # the sum reaches the threshold
                wf_network.write('{}\t{}\n'.format(
                    pids[i], pids[j]))  # add an edge: write the pair to the network file (pid-j, pid-j)
                edges_num = edges_num + 1
    print('n_edges', edges_num)
    wf_network.close()
    # name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')  # load the test set: name -> aid -> pid-j
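# The edge rule above can be read as a small pairwise similarity: two papers are
# linked when the IDF-weighted overlap of their author features reaches the
# threshold. A minimal sketch of that rule in isolation (idf_overlap is a
# hypothetical helper, not part of the original code; `idf` is the {feature: idf}
# dict loaded above):
def idf_overlap(features_a, features_b, idf, default_idf):
    common = set(features_a) & set(features_b)  # shared author features
    return sum(idf.get(f, default_idf) for f in common)
# usage: add an edge iff idf_overlap(f1, f2, idf, idf_threshold) >= idf_threshold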
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
def test(idf_threshold):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # 286 hongbin_li_pubs_content.txt
        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)

        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)

        # generate network1
        all_idf_sum = 0
        pathCnt = 0
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        for i in range(n_pubs - 1):
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                all_idf_sum += idf_sum
                if idf_sum >= idf_threshold:
                    pathCnt = pathCnt + 1

        if name == "kexin_xu":
            print("all_idf_sum: ", all_idf_sum)
            print("pathCnt: ", pathCnt)
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    Res = defaultdict(list)
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    # print(name_to_pubs_test)
    for name in name_to_pubs_test:
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
            Res[pid_].append(inter_embs[i])

    # the same as the train data
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                              'name_to_pubs_train_500.json')
    for name in name_to_pubs_train:
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
Example #5
def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print ("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def test_prepare_local_data(Name):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    # cnt = 0
    wf_contents = []
    for i, name in enumerate(name_to_pubs_test):
        if name != Name: continue
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            # if len(items) < 5:
            #     continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)
                wf_contents.append({'pid': pid, 'label': pids2label[pid]})
                # cur_pub_emb = list(map(str, cur_pub_emb))
                # wf_content.write('{}\t'.format(pid))
                # wf_content.write('\t'.join(cur_pub_emb))
                # wf_content.write('\t{}\n'.format(pids2label[pid]))
    PidsLabels = [x['label'] for x in wf_contents]
    print(len(set(PidsLabels)))
Example #7
def process_by_name(pids):
    ### preprocessing
    print('n_papers: ', len(pids))
    if len(pids) < 10:
        print("too few parpers, continue...")
        return

    ### prepare_local_data
    IDF_THRESHOLD = 32
    dump_inter_emb(pids)
    gen_local_data(idf_threshold=IDF_THRESHOLD, pids=pids, labels=None)

    ### count_size
    LMDB_NAME = "author_100.emb.weighted"  #(pid-j, x^-)
    lc = LMDBClient(LMDB_NAME)  # weighted-average author-feature embeddings (x^-)

    k = 300
    test_x = []
    x = []  # sample k paper feature vectors x^- for this name into one list
    sampled_points = [
        pids[p] for p in np.random.choice(len(pids), k, replace=True)
    ]  # randomly sample k papers (with replacement) from the paper set
    for p in sampled_points:
        x.append(lc.get(p))  # fetch the feature x^- from the database
    test_x.append(np.stack(x))
    test_x = np.stack(test_x)

    model_dir = join(settings.OUT_DIR, 'model')  # model directory
    rf = open(join(model_dir, 'model-count.json'), 'r')  # load the model architecture
    model_json = rf.read()
    rf.close()
    loaded_model = model_from_json(model_json)
    loaded_model.load_weights(join(model_dir, 'model-count.h5'))  # load the model weights

    kk = loaded_model.predict(test_x)
    print('num_pred:', kk)

    ### local\gae\train
    ret = gae_for_na('Name', int(kk[0][0]))
    return ret
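# A hypothetical driver for the pipeline above (a sketch only; the name-to-pid
# mapping is a placeholder, not part of the original code): collect the paper ids
# of each candidate author name and run preprocessing, local-graph generation,
# cluster-count prediction and GAE clustering in one call.
def run_all_names(name_to_pids):
    # name_to_pids: assumed dict mapping an author name to its list of pid-j
    results = {}
    for name, pids in name_to_pids.items():
        results[name] = process_by_name(pids)  # returns None when the name has < 10 papers
    return results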
def dump_inter_emb(pids):  # dump the hidden-layer output of the trained global model for the local model to use
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"  # 连接数据库 这是 作者特征经过Word2Vec处理为100维向量后加权平均后的 嵌入(x^-) (pid-j, x^-)
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'  # (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)  # 内层嵌入 数据库 将测试集的作者的新嵌入 y 写入其中
    global_model = GlobalTripletModel(data_scale=1000000)  # 实例化一个全局模型
    trained_global_model = global_model.load_triplets_model()  # 加载一个训练好的全局模型

    embs_input = []
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            print("ERROR: not found embedding x for pid:%s\n" % (pid))
            continue
        embs_input.append(cur_emb)
    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    for i, pid in enumerate(pids):
        lc_inter.set(pid, inter_embs[i])
    # name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')  # load the test set: name -> aid -> pid-j
def gen_local_data(idf_threshold):
    """
    generate local data (including paper features and paper network) for each associated name
    :param idf_threshold: threshold for determining whether there exists an edge between two papers (for this demo we set 29)
    """

    AuthorSocial = inputData.loadAuthorSocial()

    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # 286 hongbin_li_pubs_content.txt
        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)),
                          'w')
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)

        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}'.format(pids2label[pid]))
                LabelId = getLabelId(pid[:IDLength], name)
                wf_content.write('\t{}\n'.format(LabelId))
        wf_content.close()

        # generate network1
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)),
                          'w')
        for i in range(n_pubs - 1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))

                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i],
                                                       pids_filter[j]))
        wf_network.close()

        def CountNumber(A, B):
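            # count how many (x, y) pairs between the two social-neighbor lists match,
            # i.e. the size of their overlap counted with multiplicity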
            res = 0
            for x in A:
                for y in B:
                    if x == y:
                        res = res + 1

            return res

        wf_network = open(join(graph_dir, '{}_pubs_network2.txt'.format(name)),
                          'w')

        for i in range(n_pubs - 1):
            for j in range(i + 1, n_pubs):
                Graph1Socials = AuthorSocial[pids_filter[i]]
                Graph2Socials = AuthorSocial[pids_filter[j]]
                if CountNumber(Graph1Socials, Graph2Socials) >= 1:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i],
                                                       pids_filter[j]))

        wf_network.close()
Example #10
def dump_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=10000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train.json')
    for name in name_to_pubs_train:
        if name == "roger_williams":
            continue
        # print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    # print ("pid emb is null: ", pid)
                    continue

                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test.json')
    for name in name_to_pubs_test:
        if name == "roger_williams" or name == "j_yu":
            continue
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    print ("pid emb is null: ", pid)
                    continue

                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print ("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def prepareData(type='train'):
    # SNALabelDict, numberofCluss = preprocessSNALabels()
    # TestLabelDict, TestLabelNumberofCluss = preprocessTestLabels()
    if type == 'train':
        LabelDict, numberofCluss = preprocessLabels()
    else:
        LabelDict, numberofCluss = preprocessSNALabels()

    print("LabelDict : ", LabelDict)

    TrainPids = np.array(list(LabelDict.keys()))
    AllPids = np.array(TrainPids)
    print("AllPids : ", list(AllPids))

    if type == 'train':
        TrainPids, ValidPids = train_test_split(AllPids,
                                                stratify=list(
                                                    LabelDict.values()),
                                                random_state=42)
    else:
        TrainPids, ValidPids = train_test_split(AllPids, random_state=42)

    # TrainPids, ValidPids = train_test_split(TrainPids, test_size=0.1, random_state=42)

    LMDB_NAME_EMB = "publication.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)

    AllX = []
    Ally = []
    TrainX = []
    TrainY = []
    ValidX = []
    ValidY = []
    Allpids = []

    for pid in TrainPids:
        emb = lc_emb.get(pid)
        label = LabelDict[pid]
        # print ("pid: ", pid, ", label: ", label, ', emb: ', emb)
        if emb is None:
            continue
        Allpids.append(pid)
        AllX.append(emb)
        TrainX.append(emb)
        TrainY.append(label)
        Ally.append(label)

    for pid in ValidPids:
        emb = lc_emb.get(pid)
        label = LabelDict[pid]
        # print ("pid: ", pid, ", label: ", label, ', emb: ', emb)
        if emb is None:
            continue
        Allpids.append(pid)
        AllX.append(emb)
        ValidX.append(emb)
        ValidY.append(label)
        Ally.append(label)

    return np.array(TrainX), np.array(TrainY), np.array(ValidX), np.array(
        ValidY), numberofCluss, AllX, Ally, Allpids
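# A minimal usage sketch (hypothetical, not part of the original code): unpack the
# eight return values of prepareData and inspect the train/valid split sizes.
if __name__ == '__main__':
    TrainX, TrainY, ValidX, ValidY, numberofCluss, AllX, Ally, Allpids = prepareData('train')
    print('train:', TrainX.shape, 'valid:', ValidX.shape, 'clusters:', numberofCluss)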
Example #12
def disambiguate(name_pubs, number):
    print('Run task (%s)...' % (os.getpid()))
    start1 = time.time()
    lc1 = LMDBClient('sci_all_data')
    result = {}
    for n, name in enumerate(name_pubs):
        pubs = name_pubs[name]  ## all papers under this author name
        print(n, name, len(pubs))

        if len(pubs) == 0:
            result[name] = []
            continue
        result1 = []
        if len(pubs) <= 5:
            result[name] = []
            for i, pid in enumerate(pubs):
                result1.append(pid[0])
            result[name].append(result1)
            continue

        ## save relations
        ###############################################################
        name_pubs_raw = {}
        for i, pid in enumerate(pubs):
            paper = lc1.get(pid[0])
            paper['org_name'] = pid[1]
            paper.pop('abstract')
            paper.pop('uid')
            name_pubs_raw[pid[0]] = paper
        save_relation(name_pubs_raw, name, number)
        #         print('save features down')
        ###############################################################

        ## meta-path random-walk generator
        mpg = MetaPathGenerator()
        mpg.read_data("gene", number)
        #         print('path down')
        ###############################################################

        ## paper relation embeddings (relation-feature embeddings), built with a bagging-style ensemble
        all_embs = []
        rw_num = 3
        cp = set()  ## isolated nodes
        for k in range(rw_num):
            mpg.generate_WMRW("gene/RW_{}.txt".format(number), 5, 10)  # generate the walk corpus
            sentences = word2vec.Text8Corpus(r'gene/RW_{}.txt'.format(number))
            model = word2vec.Word2Vec(sentences,
                                      size=100,
                                      negative=25,
                                      min_count=1,
                                      window=10)
            embs = []
            for i, pid in enumerate(pubs):
                if pid[0] in model.wv:
                    embs.append(model.wv[pid[0]])
                else:
                    cp.add(i)
                    embs.append(np.zeros(100))
            all_embs.append(embs)
        all_embs = np.array(all_embs)
        #         print('real emb down')
        ###############################################################

        ## paper text embeddings
        ###############################################################
        #         ptext_emb=load_data('gene','ptext_emb_{}.pkl'.format(number))
        #         tcp=load_data('gene','tcp_{}.pkl'.format(number))
        #         tembs=[]
        #         for i,pid in enumerate(pubs):
        #             tembs.append(ptext_emb[pid[0]])
        # #         print('paper emb down')
        ###############################################################

        ## similarity of the network embeddings
        sk_sim = np.zeros((len(pubs), len(pubs)), dtype='float16')
        for k in range(rw_num):
            sk_sim = sk_sim + pairwise_distances(all_embs[k], metric="cosine")
        sk_sim = sk_sim / rw_num

        ## text similarity
        #         t_sim = pairwise_distances(tembs,metric="cosine")
        #         if tcp >= len(pubs)/2:
        sim = np.array(sk_sim)
        #         else:
        #             w=1  # fusion weight for combining the similarity matrices
        #             sim = (np.array(sk_sim) + w*np.array(t_sim))/(1+w)

        ## disambiguation clustering
        ###############################################################
        pre = DBSCAN(eps=0.2, min_samples=1,
                     metric="precomputed").fit_predict(sim)

        ## outliers
        outlier = set()
        for i in range(len(pre)):
            if pre[i] == -1:
                outlier.add(i)
        for i in cp:
            outlier.add(i)

        ## (label each outlier node, based on the Tanimoto similarity matrix)
        paper_pair = generate_pair(pubs, outlier, number)
        paper_pair1 = paper_pair.copy()
        K = len(set(pre))
        for i in range(len(pre)):
            if i not in outlier:
                continue
            j = np.argmax(paper_pair[i])
            while j in outlier:
                paper_pair[i][j] = -1
                j = np.argmax(paper_pair[i])
            if paper_pair[i][j] >= 1.5:
                pre[i] = pre[j]
            else:
                pre[i] = K
                K = K + 1

        ## check whether nodes in the outlier set share the same label:
        ## give matched outliers the same label via pairwise similarity (threshold 1.5)
        for ii, i in enumerate(outlier):
            for jj, j in enumerate(outlier):
                if jj <= ii:
                    continue
                else:
                    if paper_pair1[i][j] >= 1.5:
                        pre[j] = pre[i]

        ## store the disambiguation predictions
        result[name] = []
        for lab in set(pre):
            sameauthor = []
            for index, lab1 in enumerate(pre):
                if lab == lab1:
                    sameauthor.append(pubs[index][0])
            result[name].append(sameauthor)
        print('number of authors after disambiguation: %d' % (len(result[name])))
    dump_json(result,
              "output",
              "sci_result_1_10_all_{}.json".format(number),
              indent=4)
    print('task %s run %0.2f seconds.' % (os.getpid(), (time.time() - start1)))
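# The clustering core of disambiguate() in isolation: average cosine-distance
# matrices from several embedding runs, then cluster with DBSCAN on the
# precomputed distances. A standalone sketch with random stand-in data (eps=0.2,
# min_samples=1 and rw_num=3 mirror the values above; everything else here is
# illustrative only).
def _dbscan_sketch():
    import numpy as np
    from sklearn.cluster import DBSCAN
    from sklearn.metrics import pairwise_distances

    runs = [np.random.rand(20, 100) for _ in range(3)]  # stand-in for all_embs
    dist = sum(pairwise_distances(e, metric="cosine") for e in runs) / len(runs)
    labels = DBSCAN(eps=0.2, min_samples=1, metric="precomputed").fit_predict(dist)
    # with min_samples=1 every point is a core point, so no -1 (noise) labels appear
    return labels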
Example #13
cur_author = name_to_pubs_train[name]
pids = []
labels = []
rf = []
tf = []
attentionf = []

for aid in cur_author:
    if len(cur_author[aid]) < 5:
        continue

    for pid in cur_author[aid]:
        pids.append(pid)
        labels.append(aid)
        rf.append(rawFeature.get(pid))
        tf.append(tripletFeature.get(pid))
        attentionf.append(lc_emb.get(pid))

labels = encode_labels(labels)
numberofLabels = len(set(labels))


def clusterTest(embedding, numberofLabels):
    clusters_pred = clustering(embedding, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    return [prec, rec, f1]


tSNEAnanlyse(rf, labels, join(settings.PIC_DIR, "FINALResult", "%s_rawFeature.png" % (name)))
tSNEAnanlyse(tf, labels, join(settings.PIC_DIR, "FINALResult", "%s_tripletFeature.png" % (name)))
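# Hypothetical evaluation calls (not in the original snippet): score each embedding
# variant with the clusterTest helper defined above.
print(clusterTest(rf, numberofLabels))          # raw features
print(clusterTest(tf, numberofLabels))          # triplet features
print(clusterTest(attentionf, numberofLabels))  # attention-network features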
    def genPAPandPSP(self, authorName="hongbin_li", idf_threshold=10):
        idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
        raw_word2vec = 'author_100.emb.weighted'
        lc_emb = LMDBClient(raw_word2vec)
        LMDB_AUTHOR_FEATURE = "pub_authors.feature"
        lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
        cur_person_dict = self.name2pubs_train[authorName]
        pids_set = set()
        pids = []
        pids2label = {}

        print ("pass0")
        graph_dir = join(settings.DATA_DIR, 'AttentionNetwork' , 'graph-{}'.format(idf_threshold))
        # generate content
        wf_content = open(join(graph_dir, '{}_feature_and_label.txt'.format(authorName)), 'w')
        for i, aid in enumerate(cur_person_dict):
            personPids = cur_person_dict[aid]
            # no need to use authors with fewer than 5 papers
            if len(personPids) < 5:
                continue
            print ("aid: ", aid, ", pids: ", pids)
            for pid in personPids:
                pids2label[str(pid)] = str(aid)
                pids.append(pid)

        print ("pass1")
        for pid in pids:
            # use raw feature rather than Triplet Loss
            cur_pub_emb = lc_emb.get(pid)
            # cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        print ("pass2")
        # generate network1
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_PAP.txt'.format(authorName)), 'w')
        for i in range(n_pubs-1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i+1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()

        def CountNumber(A, B):
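            # overlap of the two social-neighbor lists, counted with multiplicity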
            res = 0
            for x in A:
                for y in B:
                    if x == y:
                        res = res + 1

            return res

        print ("pass3")
        wf_network = open(join(graph_dir, '{}_PSP.txt'.format(authorName)), 'w')

        for i in range(n_pubs-1):
            for j in range(i + 1, n_pubs):
                Graph1Socials = self.AuthorSocial[pids_filter[i]]
                Graph2Socials = self.AuthorSocial[pids_filter[j]]
                if CountNumber(Graph1Socials, Graph2Socials) >= Author_THRESHOLD:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))

        wf_network.close()
Example #15
def gen_local_data(idf_threshold=10):
    """
    generate local data (including paper features and paper network) for each associated name
    :param idf_threshold: threshold for determining whether there exists an edge between two papers (for this demo we set 29)
    """
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)),
                          'w')
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        # generate network
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)),
                          'w')
        for i in range(n_pubs - 1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i],
                                                       pids_filter[j]))
        wf_network.close()
from utils import data_utils
from utils import settings
import codecs
from os.path import join
from datetime import datetime
from utils import feature_utils
from global_.embedding import EmbeddingModel
from utils.cache import LMDBClient
from collections import defaultdict as dd

import math

pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
LMDB_NAME_EMB = "publication.emb.weighted"
lc_emb = LMDBClient(LMDB_NAME_EMB)
cnt = 0

for i, pid in enumerate(pubs_dict):
    if i % 1000 == 0:
        print("idx: %d" % (i))
    emb = lc_emb.get(pid)
    if emb is None:
        print("%s emb is null" % (pid))
        cnt = cnt + 1

print("cnt: %d" % (cnt))
    "gang_yin", "gang_zou", "guo_hua_zhang", "h_hu", "hai_yan_chen",
    "hai_yang_li"
]

for name in names:
    name_data = name_to_pubs_train[name]

    res_embs = []
    embs_input = []
    labels = []
    pids = []
    for i, aid in enumerate(name_data.keys()):
        if len(name_data[aid]) < 5:  # n_pubs of current author is too small
            continue
        for pid in name_data[aid]:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
            labels.append(aid)

    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    labels = encode_labels(labels)

    for i, pid_ in enumerate(pids):
        res_embs.append(inter_embs[i])

    # Clustering and save the result
    tSNEAnanlyse(