Example #1
    def __init__(self, entity_type, role, sample_num=None, seed=0):

        data_dir = settings.DOM_ADAPT_DIR
        fname = "{}_{}.pkl".format(entity_type, role)
        data_dict = data_utils.load_large_obj(data_dir, fname)
        self.x1 = np.array(data_dict["x1"], dtype="float32")
        self.x2 = np.array(data_dict["x2"], dtype="float32")
        self.y = np.array(data_dict["y"], dtype=int)

        self.N = len(self.y)

        if sample_num is not None:
            self.x1, self.x2, self.y = sklearn.utils.shuffle(self.x1,
                                                             self.x2,
                                                             self.y,
                                                             random_state=seed)

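            # draw a class-balanced subsample: at most sample_num // 2 positives and sample_num // 2 negatives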
            n_sample_half = int(sample_num / 2)
            pos_flag = False
            neg_flag = False
            x1 = []
            x2 = []
            y = []
            n_pos = 0
            n_neg = 0
            for i in range(self.N):
                if pos_flag and neg_flag:
                    break
                cur_y = self.y[i]
                if cur_y == 1 and n_pos < n_sample_half:
                    x1.append(self.x1[i])
                    x2.append(self.x2[i])
                    y.append(cur_y)
                    n_pos += 1
                    if n_pos == n_sample_half:
                        pos_flag = True
                elif cur_y == 0 and n_neg < n_sample_half:
                    x1.append(self.x1[i])
                    x2.append(self.x2[i])
                    y.append(cur_y)
                    n_neg += 1
                    if n_neg == n_sample_half:
                        neg_flag = True
            self.x1 = np.array(x1)
            self.x2 = np.array(x2)
            self.y = np.array(y)

        self.x1 = torch.from_numpy(self.x1)
        self.x2 = torch.from_numpy(self.x2)
        self.y = torch.from_numpy(self.y)

        self.N = len(self.y)
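This snippet shows only `__init__`; the class is presumably a `torch.utils.data.Dataset` with `__len__`/`__getitem__` so it can be served by a `DataLoader`. A minimal, self-contained sketch of that pattern (the class name `PairDataset` and the dummy tensors are illustrative, not taken from the original code):

import torch
from torch.utils.data import Dataset, DataLoader

class PairDataset(Dataset):  # hypothetical stand-in for the class above
    def __init__(self, x1, x2, y):
        self.x1, self.x2, self.y = x1, x2, y
        self.N = len(self.y)

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        # one sample: both feature tensors plus the label
        return self.x1[idx], self.x2[idx], self.y[idx]

ds = PairDataset(torch.randn(8, 4), torch.randn(8, 4), torch.zeros(8, dtype=torch.long))
loader = DataLoader(ds, batch_size=4, shuffle=True)
for x1, x2, y in loader:
    print(x1.shape, x2.shape, y.shape)  # torch.Size([4, 4]) torch.Size([4, 4]) torch.Size([4])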
Example #2
    def __init__(self, entity_type, role, domain=0):

        data_dir = settings.DOM_ADAPT_DIR
        fname = "{}_{}.pkl".format(entity_type, role)
        data_dict = data_utils.load_large_obj(data_dir, fname)
        self.x1 = np.array(data_dict["x1"], dtype="float32")
        self.x2 = np.array(data_dict["x2"], dtype="float32")
        self.x1 = torch.from_numpy(self.x1)
        self.x2 = torch.from_numpy(self.x2)

        self.y = torch.LongTensor([domain] * self.x1.shape[0])

        self.N = self.y.size()[0]
Example #3
    def __init__(self, entity_type, role):

        data_dir = settings.DOM_ADAPT_DIR
        fname = "{}_{}.pkl".format(entity_type, role)
        data_dict = data_utils.load_large_obj(data_dir, fname)
        self.x1 = np.array(data_dict["x1"], dtype="float32")
        self.x2 = np.array(data_dict["x2"], dtype="float32")
        self.y = np.array(data_dict["y"], dtype=int)

        self.N = len(self.y)

        self.x1 = torch.from_numpy(self.x1)
        self.x2 = torch.from_numpy(self.x2)
        self.y = torch.from_numpy(self.y)
Example #4
    def __init__(self,
                 file_dir,
                 matrix_size1,
                 matrix_size2,
                 seed,
                 shuffle,
                 args,
                 use_emb=True):
        self.file_dir = file_dir

        self.matrix_size_1_long = matrix_size1
        self.matrix_size_2_short = matrix_size2

        self.use_emb = use_emb
        if self.use_emb:
            self.pretrain_emb = torch.load(
                os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
        self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR,
                                                   "tokenizer_all_domain.pkl")

        self.train_data = json.load(
            open(join(settings.VENUE_DATA_DIR, 'train_filter.txt'), 'r'))

        n_pos_set = int((args.train_num + 2 * args.test_num) / 2)

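        # keep the last n_pos_set positive pairs and the same number of negative pairs so the classes stay balanced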
        neg_pairs = [p for p in self.train_data if p[0] == 0]
        pos_pairs = [p for p in self.train_data if p[0] == 1][-n_pos_set:]
        n_pos = len(pos_pairs)
        neg_pairs = neg_pairs[-n_pos:]
        self.train_data = pos_pairs + neg_pairs

        self.train_data = sklearn.utils.shuffle(self.train_data,
                                                random_state=37)

        self.mag = [nltk.word_tokenize(p[1]) for p in self.train_data]
        self.aminer = [nltk.word_tokenize(p[2]) for p in self.train_data]
        self.labels = [p[0] for p in self.train_data]

        self.calc_keyword_seqs()

        n_matrix = len(self.labels)
        self.X_long = np.zeros(
            (n_matrix, self.matrix_size_1_long, self.matrix_size_1_long))
        self.X_short = np.zeros(
            (n_matrix, self.matrix_size_2_short, self.matrix_size_2_short))
        self.Y = np.zeros(n_matrix, dtype=np.int64)  # np.long was removed in recent NumPy
        count = 0
        for i, cur_y in enumerate(self.labels):
            if i % 100 == 0:
                print('pairs to matrices', i)
            v1 = self.mag[i]
            v1 = " ".join([str(v) for v in v1])
            v2 = self.aminer[i]
            v2 = " ".join([str(v) for v in v2])
            v1_key = self.mag_venue_keywords[i]
            v1_key = " ".join([str(v) for v in v1_key])
            v2_key = self.aminer_venue_keywords[i]
            v2_key = " ".join([str(v) for v in v2_key])
            matrix1 = self.sentences_long_to_matrix(v1, v2)
            # print("mat1", matrix1)
            self.X_long[count] = feature_utils.scale_matrix(matrix1)
            matrix2 = self.sentences_short_to_matrix(v1_key, v2_key)
            # print("mat2", matrix2)
            self.X_short[count] = feature_utils.scale_matrix(matrix2)
            self.Y[count] = cur_y
            count += 1

        self.N = len(self.Y)

        n_train = args.train_num
        n_test = args.test_num
        # n_train = self.N - 2*n_test

        train_data = {}
        train_data["x1"] = self.X_long[:n_train]
        train_data["x2"] = self.X_short[:n_train]
        train_data["y"] = self.Y[:n_train]
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1"] = self.X_long[n_train:(n_train + n_test)]
        test_data["x2"] = self.X_short[n_train:(n_train + n_test)]
        test_data["y"] = self.Y[n_train:(n_train + n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1"] = self.X_long[n_train + n_test:(n_train + n_test * 2)]
        valid_data["x2"] = self.X_short[n_train + n_test:(n_train +
                                                          n_test * 2)]
        valid_data["y"] = self.Y[n_train + n_test:(n_train + n_test * 2)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "venue_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "venue_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "venue_valid.pkl")
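The exact record layout of `train_filter.txt` is not shown here; judging from the indexing above (`p[0]` as the label, `p[1]` and `p[2]` as the two venue names), each entry appears to be a `[label, mag_name, aminer_name]` triple. A small illustration of the positive/negative filtering with made-up records under that assumption:

# hypothetical records; only the structure is inferred from the code above
train_data = [
    [1, "knowledge discovery and data mining", "acm sigkdd conference"],
    [0, "neural information processing systems", "journal of botany"],
    [1, "world wide web conference", "the web conference"],
]
neg_pairs = [p for p in train_data if p[0] == 0]
pos_pairs = [p for p in train_data if p[0] == 1]
print(len(pos_pairs), len(neg_pairs))  # 2 1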
Example #5
    def __init__(self,
                 file_dir,
                 matrix_size1,
                 matrix_size2,
                 seed,
                 shuffle,
                 args,
                 use_emb=True):

        self.file_dir = file_dir

        self.matrix_size_1_long = matrix_size1
        self.matrix_size_2_short = matrix_size2

        self.use_emb = use_emb
        if self.use_emb:
            self.pretrain_emb = torch.load(
                os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
        self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR,
                                                   "tokenizer_all_domain.pkl")

        # load training pairs
        # pos_pairs = data_utils.load_json(file_dir, 'train_positive_affi.json')
        # pos_pairs = [(p['aminer_affi'], p['mag_affi']) for p in pos_pairs]
        pos_pairs = data_utils.load_json(file_dir,
                                         "label_data_aff_zhoushao.json")[:600]
        pos_pairs = [({
            "name": p["affiliation"]
        }, {
            "DisplayName": p["label"]
        }) for p in pos_pairs if p["label"] != "[NIF]"]
        # neg_pairs = data_utils.load_json(file_dir, 'train_negative_affi.json')
        neg_pairs = data_utils.load_json(
            file_dir, 'train_negative_affi_clean.json')[:600]
        neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]
        pairs_add = data_utils.load_json(
            file_dir, "mag_aminer_hard_correct_zfj_copy.json")
        print("add pairs", len(pairs_add))
        pos_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                      if p["label_zfj"] == "1"]
        neg_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                      if p["label_zfj"] == "0"]
        n_pos = len(pos_pairs)
        # labels = [1] * len(pos_pairs) + [0] * len(pos_pairs)
        labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
        # pairs = pos_pairs + [neg_pairs[x] for x in range(n_pos)]  # label balanced is important
        pairs = pos_pairs + neg_pairs  # label balanced is important

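        # turn each affiliation name pair into a long and a short match matrix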
        n_matrix = len(pairs)
        self.X_long = np.zeros(
            (n_matrix, self.matrix_size_1_long, self.matrix_size_1_long))
        self.X_short = np.zeros(
            (n_matrix, self.matrix_size_2_short, self.matrix_size_2_short))
        self.Y = np.zeros(n_matrix, dtype=np.int64)  # np.long was removed in recent NumPy
        count = 0
        for i, pair in enumerate(pairs):
            if i % 100 == 0:
                print('pairs to matrices', i)
            item_a, item_m = pair
            cur_y = labels[i]
            matrix1 = self.sentences_long_to_matrix(item_a['name'],
                                                    item_m['DisplayName'])
            # print("matrix1", matrix1)
            # print(item_a['name'])
            # print(item_m['DisplayName'])
            self.X_long[count] = feature_utils.scale_matrix(matrix1)
            # matrix2 = self.sentences_short_to_matrix(item_a['main_body'], item_m['NormalizedName'])
            matrix2 = self.sentences_short_to_matrix_2(item_a['name'],
                                                       item_m['DisplayName'])
            # print("matrix2", matrix2)
            self.X_short[count] = feature_utils.scale_matrix(matrix2)
            self.Y[count] = cur_y
            count += 1

            # # transpose
            # self.X_long[count] = feature_utils.scale_matrix(matrix1.transpose())
            # self.X_short[count] = feature_utils.scale_matrix(matrix2.transpose())
            # self.Y[count] = cur_y
            # count += 1

        print("shuffle", shuffle)
        if shuffle:
            self.X_long, self.X_short, self.Y = sklearn.utils.shuffle(
                self.X_long, self.X_short, self.Y, random_state=seed)

        self.N = len(self.Y)

        n_train = int(self.N * 0.6)
        n_test = int(self.N * 0.2)

        train_data = {}
        train_data["x1"] = self.X_long[:n_train]
        train_data["x2"] = self.X_short[:n_train]
        train_data["y"] = self.Y[:n_train]
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1"] = self.X_long[n_train:(n_train + n_test)]
        test_data["x2"] = self.X_short[n_train:(n_train + n_test)]
        test_data["y"] = self.Y[n_train:(n_train + n_test)]
        print("test labels", len(test_data["y"]), test_data["y"])

        valid_data = {}
        valid_data["x1"] = self.X_long[n_train + n_test:(n_train + n_test * 2)]
        valid_data["x2"] = self.X_short[n_train + n_test:(n_train +
                                                          n_test * 2)]
        valid_data["y"] = self.Y[n_train + n_test:(n_train + n_test * 2)]
        print("valid labels", len(valid_data["y"]), valid_data["y"])

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "aff_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "aff_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "aff_valid.pkl")
Example #6
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed,
                 args):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len
        self.train_data = json.load(
            open(join(settings.VENUE_DATA_DIR, 'train_filter.txt'), 'r'))

        n_pos_set = int((args.train_num + 2 * args.test_num) / 2)

        neg_pairs = [p for p in self.train_data if p[0] == 0]
        pos_pairs = [p for p in self.train_data if p[0] == 1][-n_pos_set:]
        n_pos = len(pos_pairs)
        print("n_pos", n_pos)
        neg_pairs = neg_pairs[-n_pos:]
        self.train_data = pos_pairs + neg_pairs

        self.train_data = sklearn.utils.shuffle(self.train_data,
                                                random_state=37)

        t = data_utils.load_large_obj(settings.OUT_DIR,
                                      "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)
        self.load_stop_words()  #TODO

        self.mag = t.texts_to_sequences([p[1] for p in self.train_data])
        self.aminer = t.texts_to_sequences([p[2] for p in self.train_data])
        self.labels = [p[0] for p in self.train_data]
        mag_sum_before = sum([len(x[1].split()) for x in self.train_data])
        mag_sum_after = sum([len(x) for x in self.mag])
        print(mag_sum_before, mag_sum_after, mag_sum_before - mag_sum_after)

        self.calc_keyword_seqs()
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
        self.mag_venue_keywords = pad_sequences(self.mag_venue_keywords,
                                                maxlen=self.max_seq2_len)
        self.aminer_venue_keywords = pad_sequences(self.aminer_venue_keywords,
                                                   maxlen=max_seq2_len)

        self.N = len(self.labels)

        n_train = args.train_num
        n_test = args.test_num
        # n_train = self.N - 2*n_test

        train_data = {}
        train_data["x1_seq1"] = self.mag[:n_train]
        train_data["x1_seq2"] = self.mag_venue_keywords[:n_train]
        train_data["x2_seq1"] = self.aminer[:n_train]
        train_data["x2_seq2"] = self.aminer_venue_keywords[:n_train]
        train_data["y"] = self.labels[:n_train]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[n_train:(n_train + n_test)]
        test_data["x1_seq2"] = self.mag_venue_keywords[n_train:(n_train +
                                                                n_test)]
        test_data["x2_seq1"] = self.aminer[n_train:(n_train + n_test)]
        test_data["x2_seq2"] = self.aminer_venue_keywords[n_train:(n_train +
                                                                   n_test)]
        test_data["y"] = self.labels[n_train:(n_train + n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[n_train + n_test:(n_train +
                                                           n_test * 2)]
        valid_data["x1_seq2"] = self.mag_venue_keywords[n_train +
                                                        n_test:(n_train +
                                                                n_test * 2)]
        valid_data["x2_seq1"] = self.aminer[n_train + n_test:(n_train +
                                                              n_test * 2)]
        valid_data["x2_seq2"] = self.aminer_venue_keywords[n_train +
                                                           n_test:(n_train +
                                                                   n_test * 2)]
        valid_data["y"] = self.labels[n_train + n_test:(n_train + n_test * 2)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "venue_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "venue_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "venue_rnn_valid.pkl")
Example #7
    def __init__(self, file_dir, matrix_size1, matrix_size2, build_index_window, seed, shuffle, args, use_emb=True):
        self.file_dir = file_dir
        self.build_index_window = build_index_window

        self.matrix_title_size = matrix_size1
        self.matrix_author_size = matrix_size2

        self.use_emb = use_emb
        if self.use_emb:
            self.pretrain_emb = torch.load(os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
        self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

        # load training pairs
        pos_pairs = data_utils.load_json(file_dir, 'pos-pairs-train.json')
        pos_pairs = [(p['c'], p['n']) for p in pos_pairs]
        neg_pairs = data_utils.load_json(file_dir, 'neg-pairs-train.json')
        neg_pairs = [(p['c'], p['n']) for p in neg_pairs]
        labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
        pairs = pos_pairs + neg_pairs

        n_matrix = len(pairs)
        self.X_title = np.zeros((n_matrix, self.matrix_title_size, self.matrix_title_size))
        self.X_author = np.zeros((n_matrix, self.matrix_author_size, self.matrix_author_size))
        self.Y = np.zeros(n_matrix, dtype=np.int64)  # np.long was removed in recent NumPy
        count = 0
        for i, pair in enumerate(pairs):
            if i % 100 == 0:
                print('pairs to matrices', i)
            cpaper, npaper = pair
            cur_y = labels[i]
            matrix1 = self.titles_to_matrix(cpaper['title'], npaper['title'])
            self.X_title[count] = feature_utils.scale_matrix(matrix1)
            matrix2 = self.authors_to_matrix(cpaper['authors'], npaper['authors'])
            self.X_author[count] = feature_utils.scale_matrix(matrix2)
            self.Y[count] = cur_y
            count += 1

        print("shuffle", shuffle)
        if shuffle:
            self.X_title, self.X_author, self.Y = sklearn.utils.shuffle(
                self.X_title, self.X_author, self.Y,
                random_state=seed
            )

        self.N = len(self.Y)

        # valid_start = int(self.N * args.train_ratio / 100)
        # test_start = int(self.N * (args.train_ratio + args.valid_ratio) / 100)
        valid_start = 800
        test_start = 200 + valid_start
        end_point = 200 + test_start
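        # fixed split: first 800 pairs for training, the next 200 for validation, the following 200 for testing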

        train_data = {}
        train_data["x1"] = self.X_title[:valid_start]
        train_data["x2"] = self.X_author[:valid_start]
        train_data["y"] = self.Y[:valid_start]
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1"] = self.X_title[test_start: end_point]
        test_data["x2"] = self.X_author[test_start: end_point]
        test_data["y"] = self.Y[test_start: end_point]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1"] = self.X_title[valid_start:test_start]
        valid_data["x2"] = self.X_author[valid_start:test_start]
        valid_data["y"] = self.Y[valid_start:test_start]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "paper_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "paper_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "paper_valid.pkl")
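`sklearn.utils.shuffle` is used throughout these examples to shuffle several parallel arrays with a single permutation, which keeps features and labels aligned. A tiny standalone demo (data made up):

import numpy as np
import sklearn.utils

X_title = np.arange(6).reshape(3, 2)
X_author = np.arange(6, 12).reshape(3, 2)
Y = np.array([0, 1, 0])

# all three arrays are permuted with the same random order
X_title, X_author, Y = sklearn.utils.shuffle(X_title, X_author, Y, random_state=42)
print(X_title, X_author, Y)  # rows stay aligned across the three arrays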
Example #8
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed,
                 args):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len

        # load training pairs
        # pos_pairs = data_utils.load_json(file_dir, 'train_positive_affi.json')
        # pos_pairs = [(p['aminer_affi'], p['mag_affi']) for p in pos_pairs]
        # neg_pairs = data_utils.load_json(file_dir, 'train_negative_affi.json')
        # neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]
        # n_pos = len(pos_pairs)
        pos_pairs = data_utils.load_json(file_dir,
                                         "label_data_aff_zhoushao.json")[:600]
        pos_pairs = [({
            "name": p["affiliation"]
        }, {
            "DisplayName": p["label"]
        }) for p in pos_pairs if p["label"] != "[NIF]"]
        # neg_pairs = data_utils.load_json(file_dir, 'train_negative_affi.json')
        neg_pairs = data_utils.load_json(
            file_dir, 'train_negative_affi_clean.json')[:600]
        neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]
        pairs_add = data_utils.load_json(
            file_dir, "mag_aminer_hard_correct_zfj_copy.json")
        print("add pairs", len(pairs_add))
        pos_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                      if p["label_zfj"] == "1"]
        neg_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                      if p["label_zfj"] == "0"]

        self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
        pairs = pos_pairs + neg_pairs  # label balanced is important

        # corpus = []
        # for item in pairs:
        #     corpus.append(item[0]["name"].lower())
        #     corpus.append(item[1]["DisplayName"].lower())
        #
        # t = Tokenizer(num_words=9999)
        # t.fit_on_texts(corpus)

        t = data_utils.load_large_obj(settings.OUT_DIR,
                                      "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)
        # print("tokenizer", t.word_index)

        self.mag = t.texts_to_sequences([p[1]["DisplayName"] for p in pairs])
        for mag_aff in self.mag:
            for word_idx in mag_aff:
                assert word_idx <= 100000
        self.aminer = t.texts_to_sequences([p[0]["name"] for p in pairs])
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

        # self.mag_keywords = t.texts_to_sequences([p[1]["NormalizedName"] for p in pairs])
        # self.aminer_keywords = t.texts_to_sequences([p[0]["main_body"] for p in pairs])
        self.calc_keyword_seqs()
        # self.mag_keywords = t.texts_to_sequences(self.mag_keywords)
        # self.aminer_keywords = t.texts_to_sequences(self.aminer_keywords)

        self.mag_keywords = pad_sequences(self.mag_keywords,
                                          maxlen=max_seq2_len)
        self.aminer_keywords = pad_sequences(self.aminer_keywords,
                                             maxlen=max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = sklearn.utils.shuffle(
                self.mag,
                self.aminer,
                self.mag_keywords,
                self.aminer_keywords,
                self.labels,
                random_state=seed)

        self.N = len(self.labels)

        n_train = int(self.N * 0.6)
        n_test = int(self.N * 0.2)

        train_data = {}
        train_data["x1_seq1"] = self.mag[:n_train]
        train_data["x1_seq2"] = self.mag_keywords[:n_train]
        train_data["x2_seq1"] = self.aminer[:n_train]
        train_data["x2_seq2"] = self.aminer_keywords[:n_train]
        train_data["y"] = self.labels[:n_train]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[n_train:(n_train + n_test)]
        test_data["x1_seq2"] = self.mag_keywords[n_train:(n_train + n_test)]
        test_data["x2_seq1"] = self.aminer[n_train:(n_train + n_test)]
        test_data["x2_seq2"] = self.aminer_keywords[n_train:(n_train + n_test)]
        test_data["y"] = self.labels[n_train:(n_train + n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[n_train + n_test:(n_train +
                                                           n_test * 2)]
        valid_data["x1_seq2"] = self.mag_keywords[n_train +
                                                  n_test:(n_train +
                                                          n_test * 2)]
        valid_data["x2_seq1"] = self.aminer[n_train + n_test:(n_train +
                                                              n_test * 2)]
        valid_data["x2_seq2"] = self.aminer_keywords[n_train +
                                                     n_test:(n_train +
                                                             n_test * 2)]
        valid_data["y"] = self.labels[n_train + n_test:(n_train + n_test * 2)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "aff_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "aff_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "aff_rnn_valid.pkl")
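The RNN-style examples rely on a pickled Keras `Tokenizer` plus `pad_sequences`. A short demo of that preprocessing with a freshly fitted tokenizer instead of `tokenizer_all_domain.pkl` (import paths assume the `tensorflow.keras` namespace):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

t = Tokenizer()
t.fit_on_texts(["tsinghua university", "university of toronto"])
seqs = t.texts_to_sequences(["tsinghua university", "university of toronto"])
print(seqs)                           # word indices, e.g. [[2, 1], [1, 3, 4]]
print(pad_sequences(seqs, maxlen=4))  # zero-padded on the left to length 4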
Example #9
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len
        self.train_data = json.load(open(join(settings.VENUE_DATA_DIR, 'train.txt'), 'r'))
        # self.tokenizer = _pickle.load(open(join(settings.DATA_DIR, 'venues', "tokenizer"), "rb"))
        # print(self.tokenizer)

        # corpus = []
        # for item in self.train_data:
        #     corpus.append(item[1].lower())
        #     corpus.append(item[2].lower())
        # vectorizer = CountVectorizer()
        # X = vectorizer.fit_transform(corpus)
        # print(len(vectorizer.vocabulary_), vectorizer.vocabulary_)
        # t = Tokenizer()
        # t.fit_on_texts(corpus)

        t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)
        print("tokenizer", t.word_counts, t.word_index)
        self.load_stop_words()  #TODO

        # self.mag = [nltk.word_tokenize(p[1].lower()) for p in self.train_data]
        # self.aminer = [nltk.word_tokenize(p[2].lower()) for p in self.train_data]
        self.mag = t.texts_to_sequences([p[1] for p in self.train_data])
        # print("mag", self.mag)
        self.aminer = t.texts_to_sequences([p[2] for p in self.train_data])
        self.labels = [p[0] for p in self.train_data]

        self.calc_keyword_seqs()
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
        self.mag_venue_keywords = pad_sequences(self.mag_venue_keywords, maxlen=self.max_seq2_len)
        self.aminer_venue_keywords = pad_sequences(self.aminer_venue_keywords, maxlen=max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_venue_keywords, self.aminer_venue_keywords, self.labels = sklearn.utils.shuffle(
                self.mag, self.aminer, self.mag_venue_keywords, self.aminer_venue_keywords, self.labels,
                random_state=seed
            )

        self.N = len(self.labels)

        n_train = args.train_num
        n_test = args.test_num

        train_data = {}
        train_data["x1_seq1"] = self.mag[:n_train]
        train_data["x1_seq2"] = self.mag_venue_keywords[:n_train]
        train_data["x2_seq1"] = self.aminer[:n_train]
        train_data["x2_seq2"] = self.aminer_venue_keywords[:n_train]
        train_data["y"] = self.labels[:n_train]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[n_train:(n_train+n_test)]
        test_data["x1_seq2"] = self.mag_venue_keywords[n_train:(n_train+n_test)]
        test_data["x2_seq1"] = self.aminer[n_train:(n_train+n_test)]
        test_data["x2_seq2"] = self.aminer_venue_keywords[n_train:(n_train+n_test)]
        test_data["y"] = self.labels[n_train:(n_train+n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[n_train+n_test:(n_train+n_test*2)]
        valid_data["x1_seq2"] = self.mag_venue_keywords[n_train+n_test:(n_train+n_test*2)]
        valid_data["x2_seq1"] = self.aminer[n_train+n_test:(n_train+n_test*2)]
        valid_data["x2_seq2"] = self.aminer_venue_keywords[n_train+n_test:(n_train+n_test*2)]
        valid_data["y"] = self.labels[n_train+n_test:(n_train+n_test*2)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "venue_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "venue_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "venue_rnn_valid.pkl")
Example #10
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len

        # load training pairs
        pos_pairs = data_utils.load_json(file_dir, 'pos-pairs-train.json')
        pos_pairs = [(p['c'], p['n']) for p in pos_pairs]
        neg_pairs = data_utils.load_json(file_dir, 'neg-pairs-train.json')
        neg_pairs = [(p['c'], p['n']) for p in neg_pairs]
        self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
        pairs = pos_pairs + neg_pairs

        t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)

        self.aminer = [pair[0]["title"] for pair in pairs]
        self.mag = [pair[1]["title"] for pair in pairs]
        self.aminer = t.texts_to_sequences(self.aminer)
        self.mag = t.texts_to_sequences(self.mag)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)

        self.aminer_keywords = [" ".join(pair[0]["authors"]) for pair in pairs]
        self.mag_keywords = [" ".join(pair[1]["authors"]) for pair in pairs]
        self.aminer_keywords = t.texts_to_sequences(self.aminer_keywords)
        self.mag_keywords = t.texts_to_sequences(self.mag_keywords)
        self.aminer_keywords = pad_sequences(self.aminer_keywords, maxlen=self.max_seq2_len)
        self.mag_keywords = pad_sequences(self.mag_keywords, maxlen=self.max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = sklearn.utils.shuffle(
                self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels,
                random_state=seed
            )

        self.N = len(self.labels)

        # valid_start = int(self.N * args.train_ratio / 100)
        # test_start = int(self.N * (args.train_ratio + args.valid_ratio) / 100)
        valid_start = 800
        test_start = 200 + valid_start
        end_point = 200 + test_start

        train_data = {}
        train_data["x1_seq1"] = self.mag[:valid_start]
        train_data["x1_seq2"] = self.mag_keywords[:valid_start]
        train_data["x2_seq1"] = self.aminer[:valid_start]
        train_data["x2_seq2"] = self.aminer_keywords[:valid_start]
        train_data["y"] = self.labels[:valid_start]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[test_start: end_point]
        test_data["x1_seq2"] = self.mag_keywords[test_start: end_point]
        test_data["x2_seq1"] = self.aminer[test_start: end_point]
        test_data["x2_seq2"] = self.aminer_keywords[test_start: end_point]
        test_data["y"] = self.labels[test_start: end_point]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[valid_start:test_start]
        valid_data["x1_seq2"] = self.mag_keywords[valid_start:test_start]
        valid_data["x2_seq1"] = self.aminer[valid_start:test_start]
        valid_data["x2_seq2"] = self.aminer_keywords[valid_start:test_start]
        valid_data["y"] = self.labels[valid_start:test_start]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "paper_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "paper_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "paper_rnn_valid.pkl")
Example #11
    def __init__(self,
                 file_dir,
                 matrix_size1,
                 matrix_size2,
                 seed,
                 shuffle,
                 args,
                 use_emb=True,
                 all_train=False):
        self.file_dir = file_dir
        self.matrix_title_size = matrix_size1
        self.matrix_author_size = matrix_size2

        # load training pairs
        pos_pairs = data_utils.load_json(file_dir, 'pos_person_pairs.json')
        neg_pairs = data_utils.load_json(file_dir, 'neg_person_pairs.json')
        pairs = pos_pairs + neg_pairs
        labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)

        self.person_dict = data_utils.load_json(file_dir,
                                                "ego_person_dict.json")

        self.use_emb = use_emb
        if self.use_emb:
            self.pretrain_emb = torch.load(
                os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
        self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR,
                                                   "tokenizer_all_domain.pkl")

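        # build one paper-based matrix (long) and one venue-based matrix (short) per aminer/mag person pair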
        X_long = []
        X_short = []
        nn_pos = 0
        nn_neg = 0
        for i, pair in enumerate(pairs):
            if i % 100 == 0:
                logger.info('pairs to matrices %d %d %d', i, nn_pos, nn_neg)
            aid, mid = pair['aid'], pair['mid']
            aperson = self.person_dict.get(aid, {})
            mperson = self.person_dict.get(mid, {})
            # matrix1, nn1 = self.org_to_matrix(aperson.get('org', ''), mperson.get('org', ''), matrix_size1)
            matrix1, nn1 = self.paper_to_matrix(aperson.get('pubs', []),
                                                mperson.get('pubs', []),
                                                matrix_size1)
            matrix1 = feature_utils.scale_matrix(matrix1)
            X_long.append(matrix1)
            matrix2, nn2 = self.venue_to_matrix(aperson.get('venue', ''),
                                                mperson.get('venue', ''),
                                                matrix_size2)
            # print("matrix2", matrix2)
            matrix2 = feature_utils.scale_matrix(matrix2)
            X_short.append(matrix2)

        self.X_long = X_long
        self.X_short = X_short
        self.Y = labels

        print("shuffle", shuffle)
        if shuffle:
            self.X_long, self.X_short, self.Y = sklearn.utils.shuffle(
                self.X_long, self.X_short, self.Y, random_state=seed)

        self.N = len(self.Y)

        # valid_start = int(self.N * args.train_ratio / 100)
        # test_start = int(self.N * (args.train_ratio + args.valid_ratio) / 100)
        if all_train:
            valid_start = 10000
            test_start = 5000 + valid_start
            end_point = 5000 + test_start
        else:
            valid_start = 800
            test_start = 200 + valid_start
            end_point = 200 + test_start

        train_data = {}
        train_data["x1"] = self.X_long[:valid_start]
        train_data["x2"] = self.X_short[:valid_start]
        train_data["y"] = self.Y[:valid_start]
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1"] = self.X_long[test_start:end_point]
        test_data["x2"] = self.X_short[test_start:end_point]
        test_data["y"] = self.Y[test_start:end_point]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1"] = self.X_long[valid_start:test_start]
        valid_data["x2"] = self.X_short[valid_start:test_start]
        valid_data["y"] = self.Y[valid_start:test_start]
        print("valid labels", len(valid_data["y"]))

        print("train positive samples", sum(train_data["y"]))
        print("test positive samples", sum(test_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "author_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "author_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "author_valid.pkl")
Example #12
    def __init__(self,
                 file_dir,
                 max_seq1_len,
                 max_seq2_len,
                 shuffle,
                 seed,
                 args,
                 all_train=False):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len

        # load training pairs
        pos_pairs = data_utils.load_json(file_dir, 'pos_person_pairs.json')
        neg_pairs = data_utils.load_json(file_dir, 'neg_person_pairs.json')
        pairs = pos_pairs + neg_pairs
        self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)

        self.person_dict = data_utils.load_json(file_dir,
                                                "ego_person_dict.json")
        nn_pos = 0
        nn_neg = 0
        t = data_utils.load_large_obj(settings.OUT_DIR,
                                      "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)

        self.mag = [
            self.person_dict.get(pair["mid"], {}).get("pubs", [])
            for pair in pairs
        ]
        self.aminer = [
            self.person_dict.get(pair["aid"], {}).get("pubs", [])
            for pair in pairs
        ]
        self.mag = t.texts_to_sequences(self.mag)

        self.aminer = t.texts_to_sequences(self.aminer)
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

        self.mag_keywords = []
        self.aminer_keywords = []
        for i, pair in enumerate(pairs):
            if i % 100 == 0:
                logger.info('pairs to matrices %d %d %d', i, nn_pos, nn_neg)
            aid, mid = pair['aid'], pair['mid']
            avenue = [
                item["id"]
                for item in self.person_dict.get(aid, {}).get("venue", [])
            ]
            mvenue = [
                item["id"]
                for item in self.person_dict.get(mid, {}).get("venue", [])
            ]
            self.mag_keywords.append(mvenue)
            self.aminer_keywords.append(avenue)

        self.mag_keywords = t.texts_to_sequences(self.mag_keywords)
        self.aminer_keywords = t.texts_to_sequences(self.aminer_keywords)

        self.mag_keywords = pad_sequences(self.mag_keywords,
                                          maxlen=max_seq2_len)
        self.aminer_keywords = pad_sequences(self.aminer_keywords,
                                             maxlen=max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = sklearn.utils.shuffle(
                self.mag,
                self.aminer,
                self.mag_keywords,
                self.aminer_keywords,
                self.labels,
                random_state=seed)

        self.N = len(self.labels)

        # valid_start = int(self.N * args.train_ratio / 100)
        # test_start = int(self.N * (args.train_ratio + args.valid_ratio) / 100)
        if all_train:
            valid_start = 10000
            test_start = 5000 + valid_start
            end_point = 5000 + test_start
        else:
            valid_start = 800
            test_start = 200 + valid_start
            end_point = 200 + test_start

        train_data = {}
        train_data["x1_seq1"] = self.mag[:valid_start]
        train_data["x1_seq2"] = self.mag_keywords[:valid_start]
        train_data["x2_seq1"] = self.aminer[:valid_start]
        train_data["x2_seq2"] = self.aminer_keywords[:valid_start]
        train_data["y"] = self.labels[:valid_start]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[test_start:end_point]
        test_data["x1_seq2"] = self.mag_keywords[test_start:end_point]
        test_data["x2_seq1"] = self.aminer[test_start:end_point]
        test_data["x2_seq2"] = self.aminer_keywords[test_start:end_point]
        test_data["y"] = self.labels[test_start:end_point]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[valid_start:test_start]
        valid_data["x1_seq2"] = self.mag_keywords[valid_start:test_start]
        valid_data["x2_seq1"] = self.aminer[valid_start:test_start]
        valid_data["x2_seq2"] = self.aminer_keywords[valid_start:test_start]
        valid_data["y"] = self.labels[valid_start:test_start]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "author_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "author_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "author_rnn_valid.pkl")
Example #13
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len

        # load training pairs

        pos_pairs = data_utils.load_json(file_dir, 'pos_person_pairs.json')
        neg_pairs = data_utils.load_json(file_dir, 'neg_person_pairs.json')
        pairs = pos_pairs + neg_pairs
        self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)

        self.person_dict = data_utils.load_json(file_dir, "ego_person_dict.json")

        corpus = []
        nn_pos = 0
        nn_neg = 0

        # for i, pair in enumerate(pairs):
        #     if i % 100 == 0:
        #         logger.info('pairs to matrices %d %d %d', i, nn_pos, nn_neg)
        #     # cpaper, npaper = pair
        #     aid, mid = pair['aid'], pair['mid']
        #     aperson = self.person_dict.get(aid, {})
        #     mperson = self.person_dict.get(mid, {})
        #     corpus.append(aperson.get("pubs", []))
        #     corpus.append(mperson.get("pubs", []))
        #
        #     corpus.append([item["id"] for item in aperson.get("venue", [])])
        #     corpus.append([item["id"] for item in mperson.get("venue", [])])
        #
        # t = Tokenizer(num_words=100000)
        # t.fit_on_texts(corpus)

        t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)
        # print("tokenizer", t.word_index)

        self.mag = [self.person_dict.get(pair["mid"], {}).get("pubs", []) for pair in pairs]
        self.aminer = [self.person_dict.get(pair["aid"], {}).get("pubs", []) for pair in pairs]
        self.mag = t.texts_to_sequences(self.mag)
        # print("self mag", self.mag)

        self.aminer = t.texts_to_sequences(self.aminer)
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

        self.mag_keywords = []
        self.aminer_keywords = []
        for i, pair in enumerate(pairs):
            if i % 100 == 0:
                logger.info('pairs to matrices %d %d %d', i, nn_pos, nn_neg)
            # cpaper, npaper = pair
            aid, mid = pair['aid'], pair['mid']
            avenue = [item["id"] for item in self.person_dict.get(aid, {}).get("venue", [])]
            mvenue = [item["id"] for item in self.person_dict.get(mid, {}).get("venue", [])]
            self.mag_keywords.append(mvenue)
            self.aminer_keywords.append(avenue)


        # self.mag_keywords = [self.person_dict.get(pair["mid"], {}).get("venue", []) for pair in pairs]
        # self.mag_keywords = [venue["id"] for venue in self.mag_keywords]
        # self.aminer_keywords = [self.person_dict.get(pair["aid"], {}).get("venue", []) for pair in pairs]
        # self.aminer_keywords = [venue["id"] for venue in self.aminer_keywords]

        self.mag_keywords = t.texts_to_sequences(self.mag_keywords)
        self.aminer_keywords = t.texts_to_sequences(self.aminer_keywords)

        self.mag_keywords = pad_sequences(self.mag_keywords, maxlen=max_seq2_len)
        self.aminer_keywords = pad_sequences(self.aminer_keywords, maxlen=max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = sklearn.utils.shuffle(
                self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels,
                random_state=seed
            )

        self.N = len(self.labels)

        n_train = args.train_num
        n_test = args.test_num

        train_data = {}
        train_data["x1_seq1"] = self.mag[:n_train]
        train_data["x1_seq2"] = self.mag_keywords[:n_train]
        train_data["x2_seq1"] = self.aminer[:n_train]
        train_data["x2_seq2"] = self.aminer_keywords[:n_train]
        train_data["y"] = self.labels[:n_train]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[n_train:(n_train+n_test)]
        test_data["x1_seq2"] = self.mag_keywords[n_train:(n_train+n_test)]
        test_data["x2_seq1"] = self.aminer[n_train:(n_train+n_test)]
        test_data["x2_seq2"] = self.aminer_keywords[n_train:(n_train+n_test)]
        test_data["y"] = self.labels[n_train:(n_train+n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[n_train+n_test:(n_train+n_test*2)]
        valid_data["x1_seq2"] = self.mag_keywords[n_train+n_test:(n_train+n_test*2)]
        valid_data["x2_seq1"] = self.aminer[n_train+n_test:(n_train+n_test*2)]
        valid_data["x2_seq2"] = self.aminer_keywords[n_train+n_test:(n_train+n_test*2)]
        valid_data["y"] = self.labels[n_train+n_test:(n_train+n_test*2)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "author_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "author_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "author_rnn_valid.pkl")
Example #14
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed,
                 args):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len

        pos_pairs = data_utils.load_json(file_dir,
                                         "label_data_aff_zhoushao.json")[:600]
        pos_pairs = [({
            "name": p["affiliation"]
        }, {
            "DisplayName": p["label"]
        }) for p in pos_pairs if p["label"] != "[NIF]"]
        neg_pairs = data_utils.load_json(
            file_dir, 'train_negative_affi_clean.json')[:600]
        neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]
        pairs_add = data_utils.load_json(
            file_dir, "mag_aminer_hard_correct_zfj_copy.json")
        print("add pairs", len(pairs_add))
        pos_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                      if p["label_zfj"] == "1"]
        neg_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                      if p["label_zfj"] == "0"]

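        # keep only as many positive pairs as there are negative pairs so the classes stay balanced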
        pos_pairs = pos_pairs[-len(neg_pairs):]
        self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
        print("n_pos", len(pos_pairs), "n_neg", len(neg_pairs))
        pairs = pos_pairs + neg_pairs  # label balanced is important

        t = data_utils.load_large_obj(settings.OUT_DIR,
                                      "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)

        self.mag = t.texts_to_sequences([p[1]["DisplayName"] for p in pairs])
        for mag_aff in self.mag:
            for word_idx in mag_aff:
                assert word_idx <= settings.MAX_WORD_TOKEN_NUM + 1
        self.aminer = t.texts_to_sequences([p[0]["name"] for p in pairs])
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

        self.calc_keyword_seqs()

        self.mag_keywords = pad_sequences(self.mag_keywords,
                                          maxlen=max_seq2_len)
        self.aminer_keywords = pad_sequences(self.aminer_keywords,
                                             maxlen=max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = sklearn.utils.shuffle(
                self.mag,
                self.aminer,
                self.mag_keywords,
                self.aminer_keywords,
                self.labels,
                random_state=seed)

        self.N = len(self.labels)

        N = self.N

        n_train = int(self.N * 0.6)
        n_valid = int(self.N * 0.2)
        n_test = N - n_train - n_valid

        # n_train = 800
        # n_valid = 200
        # n_test = 200

        train_data = {}
        train_data["x1_seq1"] = self.mag[:n_train]
        train_data["x1_seq2"] = self.mag_keywords[:n_train]
        train_data["x2_seq1"] = self.aminer[:n_train]
        train_data["x2_seq2"] = self.aminer_keywords[:n_train]
        train_data["y"] = self.labels[:n_train]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[(n_train + n_valid):(n_train +
                                                             n_valid + n_test)]
        test_data["x1_seq2"] = self.mag_keywords[(n_train +
                                                  n_valid):(n_train + n_valid +
                                                            n_test)]
        test_data["x2_seq1"] = self.aminer[(n_train +
                                            n_valid):(n_train + n_valid +
                                                      n_test)]
        test_data["x2_seq2"] = self.aminer_keywords[(n_train +
                                                     n_valid):(n_train +
                                                               n_valid +
                                                               n_test)]
        test_data["y"] = self.labels[(n_train + n_valid):(n_train + n_valid +
                                                          n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[n_train:(n_train + n_valid)]
        valid_data["x1_seq2"] = self.mag_keywords[n_train:(n_train + n_valid)]
        valid_data["x2_seq1"] = self.aminer[n_train:(n_train + n_valid)]
        valid_data["x2_seq2"] = self.aminer_keywords[n_train:(n_train +
                                                              n_valid)]
        valid_data["y"] = self.labels[n_train:(n_train + n_valid)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "aff_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "aff_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "aff_rnn_valid.pkl")
Example #15
    def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed,
                 args):

        self.max_seq1_len = max_seq1_len
        self.max_seq2_len = max_seq2_len

        # load training pairs
        pos_pairs = data_utils.load_json(file_dir, 'pos-pairs-train.json')
        pos_pairs = [(p['c'], p['n']) for p in pos_pairs]
        neg_pairs = data_utils.load_json(file_dir, 'neg-pairs-train.json')
        neg_pairs = [(p['c'], p['n']) for p in neg_pairs]
        self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
        pairs = pos_pairs + neg_pairs

        # corpus = []
        # for i, pair in enumerate(pairs):
        #     if i % 100 == 0:
        #         logger.info('pairs to matrices %d', i)
        #     cpaper, npaper = pair
        #     corpus.append(cpaper["title"])
        #     corpus.append(npaper["title"])
        #     corpus.append(" ".join(cpaper["authors"]))
        #     corpus.append(" ".join(npaper["authors"]))
        #
        # t = Tokenizer(num_words=99999)
        # t.fit_on_texts(corpus)

        t = data_utils.load_large_obj(settings.OUT_DIR,
                                      "tokenizer_all_domain.pkl")

        self.vocab_size = len(t.word_counts)
        print("vocab size", self.vocab_size)
        # print("tokenizer", t.word_index)

        self.aminer = [pair[0]["title"] for pair in pairs]
        self.mag = [pair[1]["title"] for pair in pairs]
        self.aminer = t.texts_to_sequences(self.aminer)
        self.mag = t.texts_to_sequences(self.mag)
        # print("mag", self.mag)
        self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
        self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)

        self.aminer_keywords = [" ".join(pair[0]["authors"]) for pair in pairs]
        self.mag_keywords = [" ".join(pair[1]["authors"]) for pair in pairs]
        self.aminer_keywords = t.texts_to_sequences(self.aminer_keywords)
        self.mag_keywords = t.texts_to_sequences(self.mag_keywords)
        self.aminer_keywords = pad_sequences(self.aminer_keywords,
                                             maxlen=self.max_seq2_len)
        self.mag_keywords = pad_sequences(self.mag_keywords,
                                          maxlen=self.max_seq2_len)

        if shuffle:
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = sklearn.utils.shuffle(
                self.mag,
                self.aminer,
                self.mag_keywords,
                self.aminer_keywords,
                self.labels,
                random_state=seed)

        self.N = len(self.labels)

        n_train = args.train_num
        n_test = args.test_num

        train_data = {}
        train_data["x1_seq1"] = self.mag[:n_train]
        train_data["x1_seq2"] = self.mag_keywords[:n_train]
        train_data["x2_seq1"] = self.aminer[:n_train]
        train_data["x2_seq2"] = self.aminer_keywords[:n_train]
        train_data["y"] = self.labels[:n_train]
        train_data["vocab_size"] = self.vocab_size
        print("train labels", len(train_data["y"]))

        test_data = {}
        test_data["x1_seq1"] = self.mag[n_train:(n_train + n_test)]
        test_data["x1_seq2"] = self.mag_keywords[n_train:(n_train + n_test)]
        test_data["x2_seq1"] = self.aminer[n_train:(n_train + n_test)]
        test_data["x2_seq2"] = self.aminer_keywords[n_train:(n_train + n_test)]
        test_data["y"] = self.labels[n_train:(n_train + n_test)]
        print("test labels", len(test_data["y"]))

        valid_data = {}
        valid_data["x1_seq1"] = self.mag[n_train + n_test:(n_train +
                                                           n_test * 2)]
        valid_data["x1_seq2"] = self.mag_keywords[n_train +
                                                  n_test:(n_train +
                                                          n_test * 2)]
        valid_data["x2_seq1"] = self.aminer[n_train + n_test:(n_train +
                                                              n_test * 2)]
        valid_data["x2_seq2"] = self.aminer_keywords[n_train +
                                                     n_test:(n_train +
                                                             n_test * 2)]
        valid_data["y"] = self.labels[n_train + n_test:(n_train + n_test * 2)]
        print("valid labels", len(valid_data["y"]))

        out_dir = join(settings.DATA_DIR, "dom-adpt")
        os.makedirs(out_dir, exist_ok=True)
        data_utils.dump_large_obj(train_data, out_dir, "paper_rnn_train.pkl")
        data_utils.dump_large_obj(test_data, out_dir, "paper_rnn_test.pkl")
        data_utils.dump_large_obj(valid_data, out_dir, "paper_rnn_valid.pkl")