Example 1
    def __init__(
        self,
        dtype='train',
        maxk=500,
    ):
        self.dtype = dtype
        self.seq_len = maxk
        self.maxk = maxk

        self.clusters = []

        # convert pub_emb into clusters

        # for author, author_dict in self.author_dict.items():
        #     for author_id, author_id_list in author_dict.items():
        #         for article in author_id_list:
        #             self.author.append([author, author_id, article])
        #
        # with open('./data/train_pub_new.json') as f:
        #     self.pub = json.loads(f.read())
        if dtype == 'train':
            pub_emb = load_json(rfdir='../data/', rfname='pub_emb.json')
            authors = load_json(rfdir='../data/',
                                rfname='train_set_author.json')

        for author in authors:
            for nameid in authors[author]:
                doc_set = []
                for pid in authors[author][nameid]:
                    doc_set.append(pub_emb[pid])
                self.clusters.append(doc_set)
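A minimal sketch of the structure this constructor builds, assuming (as in the loop above) that pub_emb.json maps paper ids to embedding vectors and train_set_author.json maps name -> author id -> paper ids; the toy values below are hypothetical:

# Toy inputs mirroring pub_emb.json and train_set_author.json (hypothetical values).
pub_emb = {"p1": [0.1, 0.2], "p2": [0.3, 0.4], "p3": [0.5, 0.6]}
authors = {"jing_zhang": {"a1": ["p1", "p2"], "a2": ["p3"]}}

clusters = []
for author in authors:
    for nameid in authors[author]:
        # one cluster per (name, author id): the embeddings of that author's papers
        clusters.append([pub_emb[pid] for pid in authors[author][nameid]])

# clusters == [[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]]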
Example 2
    def prepare_data(self):
        self.name2pubs_train = data_utils.load_json(
            settings.GLOBAL_DATA_DIR,
            'name_to_pubs_train_500.json')  # for test
        self.name2pubs_test = data_utils.load_json(
            settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
        self.names_train = self.name2pubs_train.keys()
        print('names train', len(self.names_train))
        self.names_test = self.name2pubs_test.keys()
        print('names test', len(self.names_test))
        assert not set(self.names_train).intersection(set(self.names_test))
        for name in self.names_train:
            name_pubs_dict = self.name2pubs_train[name]
            for aid in name_pubs_dict:
                self.pids_train += name_pubs_dict[aid]
        random.shuffle(self.pids_train)
        self.n_pubs_train = len(self.pids_train)
        print('pubs2train', self.n_pubs_train)

        for name in self.names_test:
            name_pubs_dict = self.name2pubs_test[name]
            for aid in name_pubs_dict:
                self.pids_test += name_pubs_dict[aid]
        random.shuffle(self.pids_test)
        self.n_pubs_test = len(self.pids_test)
        print('pubs2test', self.n_pubs_test)
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    def prepare_data(self):
        self.name2pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')  # for test
        self.name2pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
        self.names_train = self.name2pubs_train.keys()
        self.names_test = self.name2pubs_test.keys()
        assert not set(self.names_train).intersection(set(self.names_test))

        for authorName in self.names_train:
            self.genPAPandPSP(authorName=authorName, idf_threshold=IDF_THRESHOLD)
def dump_inter_emb():
    """
    dump hidden embedding via trained global_ model for local model to use
    """
    Res = defaultdict(list)
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    # print(name_to_pubs_test)
    for name in name_to_pubs_test:
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
            Res[pid_].append(inter_embs[i])

    # the same as the train data
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                              'name_to_pubs_train_500.json')
    for name in name_to_pubs_train:
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
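A consumer-side sketch, not part of the original code: it only relies on the LMDBClient get interface already used above, and 'hypothetical_pid' is a placeholder key.

# Read back one of the intermediate embeddings dumped above (sketch only).
lc_inter = LMDBClient('author_triplets.emb')
emb = lc_inter.get('hypothetical_pid')  # returns None if this pid was never dumped
if emb is not None:
    print(len(emb))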
Example 6
def crop_to_size(dataset_type="STB", crop_size=320):
    """crop the dataset to crop_size"""
    half_crop_size = crop_size / 2
    dataset_path = data_path + dataset_path_dict[dataset_type]
    image_names_json = dataset_json_dict[dataset_type][2]
    image_names = load_json(dataset_path + "/" + image_names_json)
    image_type = ".png" if dataset_type in ["RHD", "STB"] else ".jpg"

    save_dir = dataset_path + "_" + str(crop_size)

    begin = time.time()
    print("crop {}:{} to size {}".format(dataset_type, len(image_names),
                                         crop_size))
    for _, image_name in enumerate(image_names):
        img_raw = cv.imread(dataset_path + "/" + image_name + image_type)
        #img_raw = cv.cvtColor(img_raw, cv.COLOR_BGR2RGB)
        anno_infos = load_json(dataset_path + "/" + image_name + ".json")
        joints = np.array(anno_infos['hand_pts']).astype("float32")

        img_h, img_w, _ = img_raw.shape
        crop_center = joints[:, :2][12].astype(int)
        half_size = min(crop_center[0], crop_center[1], half_crop_size,
                        img_w - crop_center[0], img_h - crop_center[1])

        x0, y0 = (crop_center - half_size).astype(int)
        x1, y1 = (crop_center + half_size).astype(int)
        img_crop = img_raw[y0:y1, x0:x1]
        joints[:, :2] -= [x0, y0]

        scale = half_crop_size / half_size
        if scale > 1:
            img_crop = cv.resize(img_crop, (0, 0),
                                 fx=scale,
                                 fy=scale,
                                 interpolation=cv.INTER_CUBIC)
            joints[:, :2] *= scale

        if not os.path.exists(save_dir + "/" + image_name.split("/")[0]):
            print("Make dirs:", save_dir + "/" + image_name.split("/")[0])
            os.makedirs(save_dir + "/" + image_name.split("/")[0])
        cv.imwrite(save_dir + "/" + image_name + image_type, img_crop)
        anno_infos = {}
        anno_infos['img_name'] = image_name
        anno_infos['hand_pts'] = joints.tolist()
        store_json(save_dir + "/" + image_name + ".json", anno_infos)

    print("Done! Cost time:", time.time() - begin)
    store_json(save_dir + "/" + image_names_json, image_names)
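A usage sketch, assuming the module-level globals referenced above (data_path, dataset_path_dict, dataset_json_dict) are already configured for the target dataset:

# Crop STB images to 320x320 patches centred on joint 12 and write the resized
# crops plus updated keypoint annotations to "<dataset_path>_320".
crop_to_size(dataset_type="STB", crop_size=320)
# Other dataset keys, e.g. "RHD", could be processed the same way.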
Example 7
def gen_test(k=300, flatten=False):  # sample k papers per name from the test set
    name_to_pubs_test = data_utils.load_json(
        settings.GLOBAL_DATA_DIR,
        'name_to_pubs_test_100.json')  # test set: name -> aid -> pid-j
    #xs = []
    xs, ys = [], []  # features and labels
    names = []
    for name in name_to_pubs_test:  # for each name, sample k papers with replacement
        names.append(name)  # record the name
        num_clusters = len(name_to_pubs_test[name])  # true number of clusters under this name
        x = []  # holds the k sampled paper features x^- for this name
        items = []
        ''' for item in name_to_pubs_test[name]: # the paper ids belonging to this person
            items.append(item) '''
        for c in name_to_pubs_test[name]:  # one person: author entity c under this name
            for item in name_to_pubs_test[name][c]:  # the paper ids belonging to c
                items.append(item)  # collect them into the paper list
        sampled_points = [
            items[p] for p in np.random.choice(len(items), k, replace=True)
        ]  # sample k papers from the paper set, with replacement
        for p in sampled_points:
            if p in data_cache:  # feature x^- already in the cache
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))  # otherwise fetch feature x^- from the LMDB store
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:  # in practice this branch is the one taken
            xs.append(np.stack(x))  # stack the sampled features and append to xs
        ys.append(num_clusters)  # ys stores the label, i.e. the true cluster count
    # stack once more: xs holds one (k, 100) block of feature vectors x^- per name
    xs = np.stack(xs)
    ys = np.stack(ys)  # ys = array([cluster size 1, cluster size 2, ...])
    return names, xs, ys  # names, paper features (x^-), cluster sizes
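A shape sketch for the arrays returned above, assuming the LMDB store and data_cache are populated and the cached features are 100-dimensional as the comments state; the shapes are illustrative, not measured output:

# Illustrative only: with n test names and k sampled papers per name,
#   xs.shape == (n, k, 100)   # k sampled 100-dim paper features x^- per name
#   ys.shape == (n,)          # true cluster count per name
names, xs, ys = gen_test(k=300)
print(len(names), xs.shape, ys.shape)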
Example 8
def gen_test(dataset_name, k=300, flatten=False):
    name_to_pubs_test = {}
    _, TEST_NAME_LIST = settings.get_split_name_list(dataset_name)
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name),
            "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                items.append(item)
        sampled_points = [
            items[p] for p in np.random.choice(len(items), k, replace=True)
        ]
        for p in sampled_points:
            if p in data_cache:
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
Example 9
def run_rnn(dataset_name, k=300, seed=1106):
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _, _ = settings.get_split_name_list(dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name),
            "assignments.json")
    # name_to_pubs_train = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_train_500.json')
    test_names, test_x, test_y = gen_test(dataset_name, k)
    np.random.seed(seed)
    clusters = []
    for domain in name_to_pubs_train.values():
        for cluster in domain.values():
            clusters.append(cluster)
    for i, c in enumerate(clusters):
        if i % 100 == 0:
            print(i, len(c), len(clusters))
        for pid in c:
            data_cache[pid] = lc.get(pid)
    model = create_model()
    # print(model.summary())
    model.fit_generator(gen_train(clusters, k=300, batch_size=1000),
                        steps_per_epoch=100,
                        epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.get_out_dir(dataset_name), 'n_clusters_rnn.txt'),
              'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
Example 10
def run_rnn(k=300, seed=1106, split=0.9):
    np.random.seed(seed)
    name_to_pubs = data_utils.load_json(settings.ASSIGNMENT_JSON)
    names = list(name_to_pubs.keys())
    num_train = int(len(names) * split)
    names_train = names[:num_train]
    name_to_pubs_test = dict((name, item)
                             for name, item in name_to_pubs.items()
                             if name not in names_train)

    clusters = []
    for name, pubs in name_to_pubs.items():
        if name not in names_train:
            continue
        clusters.extend(pubs)
    # for i, c in enumerate(clusters):
    #     if i % 100 == 0:
    #         print(i, len(c), len(clusters))
    #     for pid in c:
    #         v = lc.get(pid)
    #         if not v:
    #             data_cache[pid] = v
    # print(model.summary())

    model = create_model(k=k)
    test_names, test_x, test_y = gen_test(name_to_pubs_test, k=k)
    model.fit_generator(gen_train(clusters, k=k, batch_size=1000),
                        steps_per_epoch=100,
                        epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.CLUSTER_SIZE), 'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
Example 11
def dump_author_features_to_file():
    """
    generate author features by raw publication data and dump to files
    author features are defined by his/her paper attributes excluding the author's name
    """
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    print('n_papers', len(pubs_dict))
    wf = codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'),
                     'w',
                     encoding='utf-8')
    for i, pid in enumerate(pubs_dict):
        if i % 1000 == 0:
            print(i, datetime.now() - start_time)
        paper = pubs_dict[pid]
        if "title" not in paper or "authors" not in paper:
            continue
        if len(paper["authors"]) > 30:
            print(i, pid, len(paper["authors"]))
        if len(paper["authors"]) > 100:
            continue
        n_authors = len(paper.get('authors', []))
        for j in range(n_authors):
            author_feature = feature_utils.extract_author_features(paper, j)
            aid = '{}-{}'.format(pid, j)
            wf.write(aid + '\t' + ' '.join(author_feature) + '\n')
    wf.close()
Example 12
def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print ("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
Example 13
def gen_test(k=300, flatten=False):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test.json')
    xs, ys = [], []
    names = []
    # print (name_to_pubs_test)
    for name in name_to_pubs_test:
        # print ("name: ", name)
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                if lc.get(item) is None:
                    continue
                items.append(item)
        if len(items) < k:
            continue

        sampled_points = [
            items[p] for p in np.random.choice(len(items), k, replace=True)
        ]
        for p in sampled_points:
            x.append(lc.get(p))
        # print ("name: ", name,  "x: ", x)
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
Example 14
def gen_test(k=300, flatten=False):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                items.append(item)
        sampled_points = [
            items[p] for p in np.random.choice(len(items), k, replace=True)
        ]
        for p in sampled_points:
            if p in data_cache:
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
Example 15
def dump_author_features_to_file():  # extract author features and dump them to a file
    """
    generate author features by raw publication data and dump to files
    author features are defined by his/her paper attributes excluding the author's name
    """
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                     'pubs_raw.json')  # raw publication data pubs_raw.json
    print('n_papers', len(pubs_dict))  # number of papers
    wf = codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'),
                     'w',
                     encoding='utf-8')  # features are written to author_features.txt
    for i, pid in enumerate(pubs_dict):  # enumerate papers: i is the index, pid the paper id
        if i % 1000 == 0:
            print(i, datetime.now() - start_time)
        paper = pubs_dict[pid]  # the record of this paper
        if "title" not in paper or "authors" not in paper:
            continue
        if len(paper["authors"]) > 30:  # number of co-authors
            print(i, pid, len(paper["authors"]))
        if len(paper["authors"]) > 100:
            continue
        # number of authors of this paper; dict.get(key, default=None) returns default when key is missing
        n_authors = len(paper.get('authors', []))
        for j in range(n_authors):  # enumerate each author of the paper
            if 'id' not in paper['authors'][j]:
                continue
            author_feature = feature_utils.extract_author_features(
                paper, j)  # extract the features of author j in this paper: __$f_name$_$word$
            aid = '{}-{}'.format(pid, j)  # aid: pid-j
            wf.write(aid + '\t' + ' '.join(author_feature) +
                     '\n')  # write one line per author: aid\t author_feature\n
    wf.close()
Example 16
def convert_to_TFRecords(dataset_list, state="train"):
    """Convert dataset in the list to TFRecords"""
    dataset_list = dataset_list.split("/")
    tfrecords_filename = data_path + "TFRecords/" + state + "_of_" + "_".join(
        dataset_list) + "_num.tfrecords"
    writer = tf.python_io.TFRecordWriter(tfrecords_filename)
    print("Writing into ", tfrecords_filename)

    samples = 0
    begin = time.time()
    for _, dataset_type in enumerate(dataset_list):
        mid = time.time()
        print("Process dataset:", dataset_type)
        dataset_path = data_path + dataset_path_dict[dataset_type]
        image_names_json = (dataset_json_dict[dataset_type][0]
                            if state == "train"
                            else dataset_json_dict[dataset_type][1])
        image_names = load_json(dataset_path + "/" + image_names_json)
        image_type = ".png" if dataset_type in ["RHD", "STB", "STB_320"] else ".jpg"
        for _, image_name in enumerate(image_names):
            write_one_example(dataset_path, image_name, image_type, writer)
            samples += 1
        print("Cost time:", time.time() - mid)

    writer.close()
    os.rename(tfrecords_filename,
              tfrecords_filename.replace("num", str(samples)))
    print("ALL Done! Cost total time:", time.time() - begin)
def test_prepare_local_data(Name):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    # cnt = 0
    wf_contents = []
    for i, name in enumerate(name_to_pubs_test):
        if name != Name: continue
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            # if len(items) < 5:
            #     continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)
                wf_contents.append({'pid': pid, 'label': pids2label[pid]})
                # cur_pub_emb = list(map(str, cur_pub_emb))
                # wf_content.write('{}\t'.format(pid))
                # wf_content.write('\t'.join(cur_pub_emb))
                # wf_content.write('\t{}\n'.format(pids2label[pid]))
    PidsLabels = [x['label'] for x in wf_contents]
    print(len(set(PidsLabels)))
Example 18
def gen_sna(k=300):
    name_to_pubs_test = data_utils.load_json(settings.SNA_PUB_DIR,
                                             'sna_valid_author_raw.json')
    xs = []
    names = []

    for name in name_to_pubs_test:
        names.append(name)
        x = []
        items = []
        for pid in name_to_pubs_test[name]:
            if lc.get(pid) is not None:
                items.append(pid)

        if len(items) == 0:
            continue

        sampled_points = [
            items[p] for p in np.random.choice(len(items), k, replace=True)
        ]
        for p in sampled_points:
            emb = lc.get(p)
            # print ("emb: len :", len(emb))
            x.append(emb)
        xs.append(np.stack(x))
    xs = np.stack(xs)
    return names, xs
Example 19
def run_rnn(k=300, seed=1106):
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                              'name_to_pubs_train_500.json')
    test_names, test_x, test_y = gen_test(k)
    np.random.seed(seed)
    clusters = []
    for domain in name_to_pubs_train.values():
        for cluster in domain.values():
            clusters.append(cluster)
    for i, c in enumerate(clusters):
        if i % 100 == 0:
            print(i, len(c), len(clusters))
        for pid in c:
            data_cache[pid] = lc.get(pid)
    model = create_model()
    # print(model.summary())
    model.fit_generator(gen_train(clusters, k=300, batch_size=1000),
                        steps_per_epoch=100,
                        epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.OUT_DIR, 'n_clusters_rnn.txt'), 'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
Example 20
def check_labeled_zfj():
    pairs = data_utils.load_json(settings.AFF_DATA_DIR,
                                 "mag_aminer_hard_correct_zfj_copy.json")
    n_label_zfj = 0
    for pair in pairs:
        if pair["label_zfj"]:
            n_label_zfj += 1
    print("labeled until now", n_label_zfj)
Example 21
    def load_id2papers(self, fold):
        fname = 'clean-id2paper-test-{}.json'.format(fold)
        if os.path.isfile(join(self.pairs_dir, fname)):
            return data_utils.load_json(self.paper_dir, fname)
        else:
            return self.gen_id2papers(fold)
def test(idf_threshold):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # 286 hongbin_li_pubs_content.txt
        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)

        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)

        # generate network1
        all_idf_sum = 0
        pathCnt = 0
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        for i in range(n_pubs - 1):
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                all_idf_sum += idf_sum
                if idf_sum >= idf_threshold:
                    pathCnt = pathCnt + 1

        if name == "kexin_xu":
            print("all_idf_sum: ", all_idf_sum)
            print("pathCnt: ", pathCnt)
Example 23
def dump_pub_features_to_file():
    """
    generate author features by raw publication data and dump to files
    author features are defined by his/her paper attributes excluding the author's name
    """
    global _pubs_dict

    # Load publication features
    _pubs_dict = data_utils.load_json('./OAG_WhoIsWho_data', 'your_pub_file_name')
    res = multithread_utils.processed_by_multi_thread(get_pub_feature, range(len(_pubs_dict)))
    data_utils.dump_data(res, "Essential_Embeddings/", "pub.features")
Example 24
def getPids():
    name2pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')  # for test
    cntpapers = []
    for name in name2pubs_train:
        papers = name2pubs_train[name]
        for aid in papers:
            if len(papers[aid]) < 5:
                continue
            for pid in papers[aid]:
                cntpapers.append(pid)
    return cntpapers
Example 25
    def prepare_data(self):
        self.name2pubs_train = {}
        # self.name2pubs_val = {}
        self.name2pubs_test = {}
        TRAIN_NAME_LIST, TEST_NAME_LIST = settings.get_split_name_list(self.dataset_name)
        for case_name in TRAIN_NAME_LIST:
            self.name2pubs_train[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(self.dataset_name), case_name),
                                                                   "assignments.json")
        # for case_name in VAL_NAME_LIST:
        #     self.name2pubs_val[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(self.dataset_name), case_name),
        #                                                          "assignments.json")
        for case_name in TEST_NAME_LIST:
            self.name2pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(self.dataset_name), case_name),
                                                                  "assignments.json")
        # self.name2pubs_train = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_train_500.json')  # for test
        # self.name2pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
        # self.names_train = self.name2pubs_train.keys()
        # print('names train', len(self.names_train))
        # self.names_test = self.name2pubs_test.keys()
        # print('names test', len(self.names_test))
        self.names_train, self.names_test = settings.get_split_name_list(self.dataset_name)

        assert not set(self.names_train).intersection(set(self.names_test))
        # assert not set(self.names_train).intersection(set(self.names_val))
        # assert not set(self.names_val).intersection(set(self.names_test))

        for name in self.names_train:
            name_pubs_dict = self.name2pubs_train[name]
            for aid in name_pubs_dict:
                self.pids_train += name_pubs_dict[aid]
        random.shuffle(self.pids_train)
        self.n_pubs_train = len(self.pids_train)
        print('pubs2train', self.n_pubs_train)

        for name in self.names_test:
            name_pubs_dict = self.name2pubs_test[name]
            for aid in name_pubs_dict:
                self.pids_test += name_pubs_dict[aid]
        random.shuffle(self.pids_test)
        self.n_pubs_test = len(self.pids_test)
        print('pubs2test', self.n_pubs_test)
Example 26
def prepro_tacos(configs):

    if not os.path.exists(configs.save_dir):
        os.makedirs(configs.save_dir)

    # train/test data format: (video_id, start_time, end_time, duration, words)
    train_data, val_data, test_data = read_tacos_data(
        configs.root, configs.max_position_length)

    # load features and sample feature shapes if possible
    features_path = os.path.join(
        configs.root,
        "tacos_features_{}/feature_shapes.json".format(configs.feature))
    feature_shapes = dict()
    for vid, length in load_json(features_path).items():
        if configs.max_position_length is not None and length > configs.max_position_length:
            length = configs.max_position_length
        feature_shapes[vid] = length

    # generate token dicts and load pre-trained vectors
    word_counter, char_counter = Counter(), Counter()
    for data in [train_data, val_data, test_data]:
        for record in data:
            words = record[-1]
            for word in words:
                word_counter[word] += 1
                for char in list(word):
                    char_counter[char] += 1
    word_dict, char_dict, word_vectors = create_vocabularies(
        configs, word_counter, char_counter)

    # generate datasets
    train_set = generate_dataset(train_data, feature_shapes, word_dict,
                                 char_dict, "train")
    val_set = generate_dataset(val_data, feature_shapes, word_dict, char_dict,
                               "val")
    test_set = generate_dataset(test_data, feature_shapes, word_dict,
                                char_dict, "test")

    # save to directory
    write_json(word_dict,
               save_path=os.path.join(configs.save_dir, "word_dict.json"))
    write_json(char_dict,
               save_path=os.path.join(configs.save_dir, "char_dict.json"))
    np.savez_compressed(os.path.join(configs.save_dir, "word_vectors.npz"),
                        vectors=word_vectors)
    write_json(train_set,
               save_path=os.path.join(configs.save_dir, "train_set.json"))
    write_json(val_set,
               save_path=os.path.join(configs.save_dir, "val_set.json"))
    write_json(test_set,
               save_path=os.path.join(configs.save_dir, "test_set.json"))
Example 27
def load_aff_data():
    file_dir = settings.AFF_DATA_DIR
    pos_pairs = data_utils.load_json(file_dir,
                                     "label_data_aff_zhoushao.json")[:600]
    pos_pairs = [({
        "name": p["affiliation"]
    }, {
        "DisplayName": p["label"]
    }) for p in pos_pairs if p["label"] != "[NIF]"]
    neg_pairs = data_utils.load_json(file_dir,
                                     'train_negative_affi_clean.json')[:600]
    neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]
    pairs_add = data_utils.load_json(file_dir,
                                     "mag_aminer_hard_correct_zfj_copy.json")
    print("add pairs", len(pairs_add))
    pos_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                  if p["label_zfj"] == "1"]
    neg_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                  if p["label_zfj"] == "0"]
    pos_pairs = pos_pairs[-len(neg_pairs):]
    labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pairs = pos_pairs + neg_pairs  # label balanced is important
    return pairs, labels
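A hedged follow-up sketch: one way the balanced pairs and labels could be split for training a matcher. train_test_split comes from scikit-learn and is not part of the code above.

from sklearn.model_selection import train_test_split

# 80/20 split that preserves the positive/negative balance of the pairs.
pairs, labels = load_aff_data()
pairs_train, pairs_test, y_train, y_test = train_test_split(
    pairs, labels, test_size=0.2, random_state=42, stratify=labels)
print(len(pairs_train), len(pairs_test))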
Example 28
def json2dataframe(rfpath, wfpath):
    pubs = load_json(rfpath=rfpath)
    names = []
    values = []
    for k, v in pubs.items():
        names.extend([k] * len(v))
        values.extend(v)
    values = json_normalize(values)
    values['name'] = names
    pubs = values
    pubs['org'] = pubs.authors.map(lambda x: list(map(lambda x: x['org'], x)))
    pubs['authors'] = pubs.authors.map(
        lambda x: list(map(lambda x: x['name'], x)))
    pubs.to_parquet(wfpath, engine='fastparquet')
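A usage sketch with placeholder file names; the parquet write above requires the fastparquet engine to be installed:

# Flatten the per-name publication records into one DataFrame and persist it.
json2dataframe(rfpath='pubs_raw.json', wfpath='pubs.parquet')  # placeholder paths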
Example 29
    def prepare_corpus(self):
        train_corpus_analyzed = []
        analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
        train_corpus = data_utils.load_json(self.train_data_dir, self.train_data_fname)
        print('training documents loaded')
        print('documents number: {}'.format(len(train_corpus)))
        for i, text in enumerate(train_corpus):
            if i % 10000 == 0:
                print(i)
            words = data_utils.get_words(text)
            tags = [i]
            train_corpus_analyzed.append(analyzedDocument(words=words, tags=tags))
            # if i > 100000:
            #     break
        return train_corpus_analyzed
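A sketch of one way the analyzed corpus could feed a gensim Doc2Vec model; the namedtuple above already exposes the words/tags attributes gensim expects. The hyperparameters and the 'trainer' owner object are illustrative assumptions, not the original pipeline.

from gensim.models.doc2vec import Doc2Vec

corpus = trainer.prepare_corpus()  # 'trainer' is a hypothetical instance of the class above
model = Doc2Vec(documents=corpus, vector_size=100, min_count=2, epochs=10, workers=4)
vec = model.infer_vector(data_utils.get_words("some unseen text"))  # embed a new document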
Example 30
def filter_aff_neg_pairs():
    neg_pairs = data_utils.load_json(settings.AFF_DATA_DIR,
                                     'train_negative_affi.json')
    neg_pairs_cleaned = []
    for i, pair in enumerate(neg_pairs):
        if i % 100 == 0:
            print("pair", i)
        mag_aff = pair["mag_affi"]
        aminer_aff = pair["aminer_affi"]
        aff1 = mag_aff["NormalizedName"].split()
        aff2 = aminer_aff["main_body"].split()
        common = set(aff1).intersection(aff2)
        if len(common) > 1:
            neg_pairs_cleaned.append(pair)
    print("after cleaned", len(neg_pairs_cleaned))
    data_utils.dump_json(neg_pairs_cleaned, settings.AFF_DATA_DIR,
                         "train_negative_affi_clean.json")