def __init__(self, entity_type, role, sample_num=None, seed=0):
    """Load a pickled split and optionally keep a label-balanced subsample.

    The pickle ``"<entity_type>_<role>.pkl"`` is read from
    ``settings.DOM_ADAPT_DIR`` and must provide the keys ``"x1"``, ``"x2"``
    and ``"y"``.

    Args:
        entity_type: first component of the pickle file name.
        role: second component of the pickle file name.
        sample_num: when not ``None``, the data is shuffled and at most
            ``int(sample_num / 2)`` rows with label 1 and the same number
            with label 0 are kept, in shuffled order.
        seed: ``random_state`` passed to ``sklearn.utils.shuffle``.
    """
    pickle_name = "{}_{}.pkl".format(entity_type, role)
    payload = data_utils.load_large_obj(settings.DOM_ADAPT_DIR, pickle_name)

    self.x1 = np.array(payload["x1"], dtype="float32")
    self.x2 = np.array(payload["x2"], dtype="float32")
    self.y = np.array(payload["y"], dtype=int)
    self.N = len(self.y)

    if sample_num is not None:
        self.x1, self.x2, self.y = sklearn.utils.shuffle(
            self.x1, self.x2, self.y, random_state=seed)

        quota = int(sample_num / 2)
        taken = {0: 0, 1: 0}  # rows kept so far, per label
        kept_x1, kept_x2, kept_y = [], [], []

        for row in range(self.N):
            # Both classes reached their quota: nothing more can be added.
            if quota > 0 and taken[0] == quota and taken[1] == quota:
                break
            label = self.y[row]
            if label == 1:
                bucket = 1
            elif label == 0:
                bucket = 0
            else:
                # Labels other than 0/1 are never selected.
                continue
            if taken[bucket] >= quota:
                continue
            kept_x1.append(self.x1[row])
            kept_x2.append(self.x2[row])
            kept_y.append(label)
            taken[bucket] += 1

        self.x1 = np.array(kept_x1)
        self.x2 = np.array(kept_x2)
        self.y = np.array(kept_y)

    # Expose everything as torch tensors; N reflects the (possibly reduced) size.
    self.x1 = torch.from_numpy(self.x1)
    self.x2 = torch.from_numpy(self.x2)
    self.y = torch.from_numpy(self.y)
    self.N = len(self.y)
def __init__(self, entity_type, role, domain=0):
    """Load the x1/x2 features of one split and tag every row with a domain id.

    Args:
        entity_type: first component of the pickle file name.
        role: second component of the pickle file name.
        domain: integer written into every entry of ``self.y``.
    """
    pickle_name = "{}_{}.pkl".format(entity_type, role)
    payload = data_utils.load_large_obj(settings.DOM_ADAPT_DIR, pickle_name)

    first = np.array(payload["x1"], dtype="float32")
    second = np.array(payload["x2"], dtype="float32")
    self.x1 = torch.from_numpy(first)
    self.x2 = torch.from_numpy(second)

    # One domain label per row of x1.
    n_rows = self.x1.shape[0]
    self.y = torch.LongTensor([domain] * n_rows)
    self.N = self.y.size()[0]
def __init__(self, entity_type, role):
    """Load the x1/x2/y arrays of one pickled split as torch tensors.

    Args:
        entity_type: first component of the pickle file name.
        role: second component of the pickle file name.
    """
    pickle_name = "{}_{}.pkl".format(entity_type, role)
    payload = data_utils.load_large_obj(settings.DOM_ADAPT_DIR, pickle_name)

    features_1 = np.array(payload["x1"], dtype="float32")
    features_2 = np.array(payload["x2"], dtype="float32")
    targets = np.array(payload["y"], dtype=int)

    self.N = len(targets)
    self.x1 = torch.from_numpy(features_1)
    self.x2 = torch.from_numpy(features_2)
    self.y = torch.from_numpy(targets)
def __init__(self, file_dir, matrix_size1, matrix_size2, seed, shuffle, args, use_emb=True):
    """Build matrix features for venue pairs and dump train/test/valid pickles.

    Reads ``train_filter.txt`` from ``settings.VENUE_DATA_DIR``, keeps a
    label-balanced tail of the pairs, converts each pair into a "long" and
    a "short" matrix, and writes ``venue_{train,test,valid}.pkl`` under
    ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: stored as ``self.file_dir``; not otherwise read here.
        matrix_size1: side length of the square long matrix.
        matrix_size2: side length of the square short matrix.
        seed: unused here (the shuffle uses a fixed ``random_state=37``).
        shuffle: unused here.
        args: must provide integer ``train_num`` and ``test_num``.
        use_emb: when true, the pretrained embedding is loaded.
    """
    self.file_dir = file_dir
    self.matrix_size_1_long = matrix_size1
    self.matrix_size_2_short = matrix_size2
    self.use_emb = use_emb
    if self.use_emb:
        self.pretrain_emb = torch.load(
            os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
    # NOTE(review): the collapsed source does not show whether this load was
    # nested under `if self.use_emb:`; it is kept unconditional -- confirm.
    self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

    # Context manager so the file handle is closed deterministically.
    with open(join(settings.VENUE_DATA_DIR, 'train_filter.txt'), 'r') as rf:
        self.train_data = json.load(rf)

    # Keep the last n_pos_set positives and an equal number of trailing negatives.
    n_pos_set = int((args.train_num + 2 * args.test_num) / 2)
    neg_pairs = [p for p in self.train_data if p[0] == 0]
    pos_pairs = [p for p in self.train_data if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    neg_pairs = neg_pairs[-n_pos:]
    self.train_data = pos_pairs + neg_pairs
    self.train_data = sklearn.utils.shuffle(self.train_data, random_state=37)

    self.mag = [nltk.word_tokenize(p[1]) for p in self.train_data]
    self.aminer = [nltk.word_tokenize(p[2]) for p in self.train_data]
    self.labels = [p[0] for p in self.train_data]
    # Presumably populates self.mag_venue_keywords / self.aminer_venue_keywords,
    # which are read below -- defined outside this block.
    self.calc_keyword_seqs()

    n_matrix = len(self.labels)
    self.X_long = np.zeros(
        (n_matrix, self.matrix_size_1_long, self.matrix_size_1_long))
    self.X_short = np.zeros(
        (n_matrix, self.matrix_size_2_short, self.matrix_size_2_short))
    # `np.long` was removed in NumPy 1.24; the builtin `int` is what it aliased.
    self.Y = np.zeros(n_matrix, dtype=int)

    for i, cur_y in enumerate(self.labels):
        if i % 100 == 0:
            print('pairs to matrices', i)
        v1 = " ".join([str(v) for v in self.mag[i]])
        v2 = " ".join([str(v) for v in self.aminer[i]])
        v1_key = " ".join([str(v) for v in self.mag_venue_keywords[i]])
        v2_key = " ".join([str(v) for v in self.aminer_venue_keywords[i]])
        matrix1 = self.sentences_long_to_matrix(v1, v2)
        self.X_long[i] = feature_utils.scale_matrix(matrix1)
        matrix2 = self.sentences_short_to_matrix(v1_key, v2_key)
        self.X_short[i] = feature_utils.scale_matrix(matrix2)
        self.Y[i] = cur_y

    self.N = len(self.Y)

    n_train = args.train_num
    n_test = args.test_num

    def _split(lo, hi):
        # Slice all three arrays with the same bounds.
        return {
            "x1": self.X_long[lo:hi],
            "x2": self.X_short[lo:hi],
            "y": self.Y[lo:hi],
        }

    train_data = _split(0, n_train)
    print("train labels", len(train_data["y"]))
    test_data = _split(n_train, n_train + n_test)
    print("test labels", len(test_data["y"]))
    # The validation split has the same size as the test split.
    valid_data = _split(n_train + n_test, n_train + n_test * 2)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    data_utils.dump_large_obj(train_data, out_dir, "venue_train.pkl")
    data_utils.dump_large_obj(test_data, out_dir, "venue_test.pkl")
    data_utils.dump_large_obj(valid_data, out_dir, "venue_valid.pkl")
def __init__(self, file_dir, matrix_size1, matrix_size2, seed, shuffle, args, use_emb=True):
    """Build matrix features for affiliation pairs and dump train/test/valid pickles.

    Positive and negative pairs are read from JSON files in ``file_dir``,
    turned into a long and a short matrix each, optionally shuffled, split
    60/20/20 and written as ``aff_{train,test,valid}.pkl`` under
    ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: directory the JSON pair files are loaded from.
        matrix_size1: side length of the square long matrix.
        matrix_size2: side length of the square short matrix.
        seed: ``random_state`` for the optional shuffle.
        shuffle: when truthy, features and labels are shuffled together.
        args: unused in this constructor.
        use_emb: when true, the pretrained embedding is loaded.
    """
    self.file_dir = file_dir
    self.matrix_size_1_long = matrix_size1
    self.matrix_size_2_short = matrix_size2
    self.use_emb = use_emb
    if self.use_emb:
        self.pretrain_emb = torch.load(
            os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
    # NOTE(review): the collapsed source does not show whether this load was
    # nested under `if self.use_emb:`; it is kept unconditional -- confirm.
    self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

    # Positives: first 600 labelled records, dropping those labelled "[NIF]".
    pos_pairs = data_utils.load_json(file_dir, "label_data_aff_zhoushao.json")[:600]
    pos_pairs = [({
        "name": p["affiliation"]
    }, {
        "DisplayName": p["label"]
    }) for p in pos_pairs if p["label"] != "[NIF]"]

    # Negatives: first 600 records of the cleaned negative file.
    neg_pairs = data_utils.load_json(
        file_dir, 'train_negative_affi_clean.json')[:600]
    neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]

    # Extra pairs, routed to either side by their "label_zfj" string.
    pairs_add = data_utils.load_json(
        file_dir, "mag_aminer_hard_correct_zfj_copy.json")
    print("add pairs", len(pairs_add))
    pos_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                  if p["label_zfj"] == "1"]
    neg_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add
                  if p["label_zfj"] == "0"]

    labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pairs = pos_pairs + neg_pairs

    n_matrix = len(pairs)
    self.X_long = np.zeros(
        (n_matrix, self.matrix_size_1_long, self.matrix_size_1_long))
    self.X_short = np.zeros(
        (n_matrix, self.matrix_size_2_short, self.matrix_size_2_short))
    # `np.long` was removed in NumPy 1.24; the builtin `int` is what it aliased.
    self.Y = np.zeros(n_matrix, dtype=int)

    for i, (item_a, item_m) in enumerate(pairs):
        if i % 100 == 0:
            print('pairs to matrices', i)
        matrix1 = self.sentences_long_to_matrix(item_a['name'], item_m['DisplayName'])
        self.X_long[i] = feature_utils.scale_matrix(matrix1)
        matrix2 = self.sentences_short_to_matrix_2(item_a['name'], item_m['DisplayName'])
        self.X_short[i] = feature_utils.scale_matrix(matrix2)
        self.Y[i] = labels[i]

    print("shuffle", shuffle)
    if shuffle:
        self.X_long, self.X_short, self.Y = sklearn.utils.shuffle(
            self.X_long, self.X_short, self.Y, random_state=seed)

    self.N = len(self.Y)

    n_train = int(self.N * 0.6)
    n_test = int(self.N * 0.2)

    def _split(lo, hi):
        # Slice all three arrays with the same bounds.
        return {
            "x1": self.X_long[lo:hi],
            "x2": self.X_short[lo:hi],
            "y": self.Y[lo:hi],
        }

    train_data = _split(0, n_train)
    print("train labels", len(train_data["y"]))
    test_data = _split(n_train, n_train + n_test)
    print("test labels", len(test_data["y"]), test_data["y"])
    # The validation split has the same size as the test split.
    valid_data = _split(n_train + n_test, n_train + n_test * 2)
    print("valid labels", len(valid_data["y"]), valid_data["y"])

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    data_utils.dump_large_obj(train_data, out_dir, "aff_train.pkl")
    data_utils.dump_large_obj(test_data, out_dir, "aff_test.pkl")
    data_utils.dump_large_obj(valid_data, out_dir, "aff_valid.pkl")
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded token sequences for venue pairs and dump train/test/valid pickles.

    Reads ``train_filter.txt`` from ``settings.VENUE_DATA_DIR``, keeps a
    label-balanced tail of the pairs, tokenizes and pads both names and
    their keyword sequences, and writes ``venue_rnn_{train,test,valid}.pkl``
    under ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: unused in this constructor.
        max_seq1_len: ``maxlen`` used when padding the full sequences.
        max_seq2_len: ``maxlen`` used when padding the keyword sequences.
        shuffle: unused here (the shuffle uses a fixed ``random_state=37``).
        seed: unused here.
        args: must provide integer ``train_num`` and ``test_num``.
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Context manager so the file handle is closed deterministically.
    with open(join(settings.VENUE_DATA_DIR, 'train_filter.txt'), 'r') as rf:
        self.train_data = json.load(rf)

    # Keep the last n_pos_set positives and an equal number of trailing negatives.
    n_pos_set = int((args.train_num + 2 * args.test_num) / 2)
    neg_pairs = [p for p in self.train_data if p[0] == 0]
    pos_pairs = [p for p in self.train_data if p[0] == 1][-n_pos_set:]
    n_pos = len(pos_pairs)
    print("n_pos", n_pos)
    neg_pairs = neg_pairs[-n_pos:]
    self.train_data = pos_pairs + neg_pairs
    self.train_data = sklearn.utils.shuffle(self.train_data, random_state=37)

    t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(t.word_counts)
    print("vocab size", self.vocab_size)
    self.load_stop_words()  # TODO

    self.mag = t.texts_to_sequences([p[1] for p in self.train_data])
    self.aminer = t.texts_to_sequences([p[2] for p in self.train_data])
    self.labels = [p[0] for p in self.train_data]

    # Diagnostic: whitespace-token count before vs. sequence length after tokenizing.
    mag_sum_before = sum(len(x[1].split()) for x in self.train_data)
    mag_sum_after = sum(len(x) for x in self.mag)
    print(mag_sum_before, mag_sum_after, mag_sum_before - mag_sum_after)

    # Presumably populates self.mag_venue_keywords / self.aminer_venue_keywords,
    # which are read below -- defined outside this block.
    self.calc_keyword_seqs()

    self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
    self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
    self.mag_venue_keywords = pad_sequences(self.mag_venue_keywords, maxlen=self.max_seq2_len)
    self.aminer_venue_keywords = pad_sequences(self.aminer_venue_keywords, maxlen=max_seq2_len)

    self.N = len(self.labels)

    n_train = args.train_num
    n_test = args.test_num

    def _split(lo, hi):
        # Slice every sequence container and the labels with the same bounds.
        return {
            "x1_seq1": self.mag[lo:hi],
            "x1_seq2": self.mag_venue_keywords[lo:hi],
            "x2_seq1": self.aminer[lo:hi],
            "x2_seq2": self.aminer_venue_keywords[lo:hi],
            "y": self.labels[lo:hi],
        }

    train_data = _split(0, n_train)
    train_data["vocab_size"] = self.vocab_size  # only the train pickle carries it
    print("train labels", len(train_data["y"]))
    test_data = _split(n_train, n_train + n_test)
    print("test labels", len(test_data["y"]))
    # The validation split has the same size as the test split.
    valid_data = _split(n_train + n_test, n_train + n_test * 2)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    data_utils.dump_large_obj(train_data, out_dir, "venue_rnn_train.pkl")
    data_utils.dump_large_obj(test_data, out_dir, "venue_rnn_test.pkl")
    data_utils.dump_large_obj(valid_data, out_dir, "venue_rnn_valid.pkl")
def __init__(self, file_dir, matrix_size1, matrix_size2, build_index_window, seed, shuffle, args, use_emb=True):
    """Build matrix features for paper pairs and dump train/test/valid pickles.

    Positive and negative pairs are read from ``pos-pairs-train.json`` and
    ``neg-pairs-train.json`` in ``file_dir``; each pair yields a title
    matrix and an author matrix. The result is written as
    ``paper_{train,test,valid}.pkl`` under ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: directory the JSON pair files are loaded from.
        matrix_size1: side length of the square title matrix.
        matrix_size2: side length of the square author matrix.
        build_index_window: stored as ``self.build_index_window``.
        seed: ``random_state`` for the optional shuffle.
        shuffle: when truthy, features and labels are shuffled together.
        args: unused in this constructor.
        use_emb: when true, the pretrained embedding is loaded.
    """
    self.file_dir = file_dir
    self.build_index_window = build_index_window
    self.matrix_title_size = matrix_size1
    self.matrix_author_size = matrix_size2
    self.use_emb = use_emb
    if self.use_emb:
        self.pretrain_emb = torch.load(os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
    # NOTE(review): the collapsed source does not show whether this load was
    # nested under `if self.use_emb:`; it is kept unconditional -- confirm.
    self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

    # Each record holds the two papers of a pair under the keys 'c' and 'n'.
    pos_pairs = data_utils.load_json(file_dir, 'pos-pairs-train.json')
    pos_pairs = [(p['c'], p['n']) for p in pos_pairs]
    neg_pairs = data_utils.load_json(file_dir, 'neg-pairs-train.json')
    neg_pairs = [(p['c'], p['n']) for p in neg_pairs]
    labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pairs = pos_pairs + neg_pairs

    n_matrix = len(pairs)
    self.X_title = np.zeros((n_matrix, self.matrix_title_size, self.matrix_title_size))
    self.X_author = np.zeros((n_matrix, self.matrix_author_size, self.matrix_author_size))
    # `np.long` was removed in NumPy 1.24; the builtin `int` is what it aliased.
    self.Y = np.zeros(n_matrix, dtype=int)

    for i, (cpaper, npaper) in enumerate(pairs):
        if i % 100 == 0:
            print('pairs to matrices', i)
        matrix1 = self.titles_to_matrix(cpaper['title'], npaper['title'])
        self.X_title[i] = feature_utils.scale_matrix(matrix1)
        matrix2 = self.authors_to_matrix(cpaper['authors'], npaper['authors'])
        self.X_author[i] = feature_utils.scale_matrix(matrix2)
        self.Y[i] = labels[i]

    print("shuffle", shuffle)
    if shuffle:
        self.X_title, self.X_author, self.Y = sklearn.utils.shuffle(
            self.X_title, self.X_author, self.Y,
            random_state=seed
        )

    self.N = len(self.Y)

    # Fixed split sizes: 800 train, then 200 valid, then 200 test.
    # (A ratio-based split via args.train_ratio / args.valid_ratio is disabled.)
    valid_start = 800
    test_start = 200 + valid_start
    end_point = 200 + test_start

    def _split(lo, hi):
        # Slice all three arrays with the same bounds.
        return {
            "x1": self.X_title[lo:hi],
            "x2": self.X_author[lo:hi],
            "y": self.Y[lo:hi],
        }

    train_data = _split(0, valid_start)
    print("train labels", len(train_data["y"]))
    test_data = _split(test_start, end_point)
    print("test labels", len(test_data["y"]))
    valid_data = _split(valid_start, test_start)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    data_utils.dump_large_obj(train_data, out_dir, "paper_train.pkl")
    data_utils.dump_large_obj(test_data, out_dir, "paper_test.pkl")
    data_utils.dump_large_obj(valid_data, out_dir, "paper_valid.pkl")
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded token sequences for affiliation pairs and dump the splits.

    Pairs come from three JSON files in ``file_dir``. Names are tokenized
    with the shared tokenizer, padded, optionally shuffled, split 60/20/20
    and written as ``aff_rnn_{train,test,valid}.pkl`` under
    ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: directory the JSON pair files are loaded from.
        max_seq1_len: ``maxlen`` used when padding the full name sequences.
        max_seq2_len: ``maxlen`` used when padding the keyword sequences.
        shuffle: when truthy, all sequences and labels are shuffled together.
        seed: ``random_state`` for the optional shuffle.
        args: unused in this constructor.
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Positives: first 600 labelled records, minus those labelled "[NIF]".
    labelled = data_utils.load_json(file_dir, "label_data_aff_zhoushao.json")[:600]
    pos_pairs = [
        ({"name": rec["affiliation"]}, {"DisplayName": rec["label"]})
        for rec in labelled
        if rec["label"] != "[NIF]"
    ]

    # Negatives: first 600 records of the cleaned negative file.
    raw_negatives = data_utils.load_json(
        file_dir, 'train_negative_affi_clean.json')[:600]
    neg_pairs = [(rec['aminer_affi'], rec['mag_affi']) for rec in raw_negatives]

    # Extra pairs are routed to either side by their "label_zfj" string.
    pairs_add = data_utils.load_json(
        file_dir, "mag_aminer_hard_correct_zfj_copy.json")
    print("add pairs", len(pairs_add))
    pos_pairs += [(rec['aminer_affi'], rec['mag_affi'])
                  for rec in pairs_add if rec["label_zfj"] == "1"]
    neg_pairs += [(rec['aminer_affi'], rec['mag_affi'])
                  for rec in pairs_add if rec["label_zfj"] == "0"]

    self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pairs = pos_pairs + neg_pairs

    # A pre-fitted tokenizer is loaded (fitting one on this corpus is disabled).
    t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(t.word_counts)
    print("vocab size", self.vocab_size)

    self.mag = t.texts_to_sequences([pair[1]["DisplayName"] for pair in pairs])
    # Sanity check on the produced word indices.
    assert all(word_idx <= 100000 for seq in self.mag for word_idx in seq)
    self.aminer = t.texts_to_sequences([pair[0]["name"] for pair in pairs])

    self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
    self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

    # Presumably populates self.mag_keywords / self.aminer_keywords,
    # which are read below -- defined outside this block.
    self.calc_keyword_seqs()
    self.mag_keywords = pad_sequences(self.mag_keywords, maxlen=max_seq2_len)
    self.aminer_keywords = pad_sequences(self.aminer_keywords, maxlen=max_seq2_len)

    if shuffle:
        (self.mag, self.aminer, self.mag_keywords,
         self.aminer_keywords, self.labels) = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_keywords,
            self.aminer_keywords, self.labels, random_state=seed)

    self.N = len(self.labels)

    n_train = int(self.N * 0.6)
    n_test = int(self.N * 0.2)
    test_end = n_train + n_test
    valid_end = n_train + n_test * 2

    def _subset(lo, hi):
        # Same bounds applied to every sequence container and the labels.
        return {
            "x1_seq1": self.mag[lo:hi],
            "x1_seq2": self.mag_keywords[lo:hi],
            "x2_seq1": self.aminer[lo:hi],
            "x2_seq2": self.aminer_keywords[lo:hi],
            "y": self.labels[lo:hi],
        }

    train_data = _subset(0, n_train)
    train_data["vocab_size"] = self.vocab_size  # only the train pickle carries it
    print("train labels", len(train_data["y"]))

    test_data = _subset(n_train, test_end)
    print("test labels", len(test_data["y"]))

    valid_data = _subset(test_end, valid_end)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    for payload, target in (
        (train_data, "aff_rnn_train.pkl"),
        (test_data, "aff_rnn_test.pkl"),
        (valid_data, "aff_rnn_valid.pkl"),
    ):
        data_utils.dump_large_obj(payload, out_dir, target)
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded token sequences for all venue pairs in ``train.txt`` and dump the splits.

    Reads ``train.txt`` from ``settings.VENUE_DATA_DIR``, tokenizes and pads
    both names and their keyword sequences, optionally shuffles, and writes
    ``venue_rnn_{train,test,valid}.pkl`` under ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: unused in this constructor.
        max_seq1_len: ``maxlen`` used when padding the full sequences.
        max_seq2_len: ``maxlen`` used when padding the keyword sequences.
        shuffle: when truthy, all sequences and labels are shuffled together.
        seed: ``random_state`` for the optional shuffle.
        args: must provide integer ``train_num`` and ``test_num``.
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Context manager so the file handle is closed deterministically.
    with open(join(settings.VENUE_DATA_DIR, 'train.txt'), 'r') as rf:
        self.train_data = json.load(rf)

    # A pre-fitted tokenizer is loaded (fitting one on this corpus is disabled).
    t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(t.word_counts)
    print("vocab size", self.vocab_size)
    print("tokenizer", t.word_counts, t.word_index)
    self.load_stop_words()  # TODO

    # Each record is read as (label, first name, second name) by position.
    self.mag = t.texts_to_sequences([p[1] for p in self.train_data])
    self.aminer = t.texts_to_sequences([p[2] for p in self.train_data])
    self.labels = [p[0] for p in self.train_data]

    # Presumably populates self.mag_venue_keywords / self.aminer_venue_keywords,
    # which are read below -- defined outside this block.
    self.calc_keyword_seqs()

    self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
    self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
    self.mag_venue_keywords = pad_sequences(self.mag_venue_keywords, maxlen=self.max_seq2_len)
    self.aminer_venue_keywords = pad_sequences(self.aminer_venue_keywords, maxlen=max_seq2_len)

    if shuffle:
        (self.mag, self.aminer, self.mag_venue_keywords,
         self.aminer_venue_keywords, self.labels) = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_venue_keywords,
            self.aminer_venue_keywords, self.labels,
            random_state=seed
        )

    self.N = len(self.labels)

    n_train = args.train_num
    n_test = args.test_num

    def _split(lo, hi):
        # Slice every sequence container and the labels with the same bounds.
        return {
            "x1_seq1": self.mag[lo:hi],
            "x1_seq2": self.mag_venue_keywords[lo:hi],
            "x2_seq1": self.aminer[lo:hi],
            "x2_seq2": self.aminer_venue_keywords[lo:hi],
            "y": self.labels[lo:hi],
        }

    train_data = _split(0, n_train)
    train_data["vocab_size"] = self.vocab_size  # only the train pickle carries it
    print("train labels", len(train_data["y"]))
    test_data = _split(n_train, n_train + n_test)
    print("test labels", len(test_data["y"]))
    # The validation split has the same size as the test split.
    valid_data = _split(n_train + n_test, n_train + n_test * 2)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    data_utils.dump_large_obj(train_data, out_dir, "venue_rnn_train.pkl")
    data_utils.dump_large_obj(test_data, out_dir, "venue_rnn_test.pkl")
    data_utils.dump_large_obj(valid_data, out_dir, "venue_rnn_valid.pkl")
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded token sequences for paper pairs and dump the splits.

    Titles and space-joined author lists of each pair are tokenized with the
    shared tokenizer and padded. The result is optionally shuffled, cut into
    fixed-size splits and written as ``paper_rnn_{train,test,valid}.pkl``
    under ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: directory the JSON pair files are loaded from.
        max_seq1_len: ``maxlen`` used when padding the title sequences.
        max_seq2_len: ``maxlen`` used when padding the author sequences.
        shuffle: when truthy, all sequences and labels are shuffled together.
        seed: ``random_state`` for the optional shuffle.
        args: unused in this constructor.
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Each record holds the two papers of a pair under the keys 'c' and 'n'.
    positives = [(rec['c'], rec['n'])
                 for rec in data_utils.load_json(file_dir, 'pos-pairs-train.json')]
    negatives = [(rec['c'], rec['n'])
                 for rec in data_utils.load_json(file_dir, 'neg-pairs-train.json')]
    self.labels = [1] * len(positives) + [0] * len(negatives)
    pairs = positives + negatives

    t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(t.word_counts)
    print("vocab size", self.vocab_size)

    # Titles: element 0 of a pair feeds `aminer`, element 1 feeds `mag`.
    aminer_titles = [first["title"] for first, _ in pairs]
    mag_titles = [second["title"] for _, second in pairs]
    self.aminer = t.texts_to_sequences(aminer_titles)
    self.mag = t.texts_to_sequences(mag_titles)
    self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)
    self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)

    # Authors: joined into one space-separated string per paper.
    aminer_authors = [" ".join(first["authors"]) for first, _ in pairs]
    mag_authors = [" ".join(second["authors"]) for _, second in pairs]
    self.aminer_keywords = t.texts_to_sequences(aminer_authors)
    self.mag_keywords = t.texts_to_sequences(mag_authors)
    self.aminer_keywords = pad_sequences(self.aminer_keywords, maxlen=self.max_seq2_len)
    self.mag_keywords = pad_sequences(self.mag_keywords, maxlen=self.max_seq2_len)

    if shuffle:
        (self.mag, self.aminer, self.mag_keywords,
         self.aminer_keywords, self.labels) = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_keywords,
            self.aminer_keywords, self.labels,
            random_state=seed
        )

    self.N = len(self.labels)

    # Fixed split sizes: 800 train, then 200 valid, then 200 test.
    # (A ratio-based split via args.train_ratio / args.valid_ratio is disabled.)
    valid_start = 800
    test_start = 200 + valid_start
    end_point = 200 + test_start

    def _subset(lo, hi):
        # Same bounds applied to every sequence container and the labels.
        return {
            "x1_seq1": self.mag[lo:hi],
            "x1_seq2": self.mag_keywords[lo:hi],
            "x2_seq1": self.aminer[lo:hi],
            "x2_seq2": self.aminer_keywords[lo:hi],
            "y": self.labels[lo:hi],
        }

    train_data = _subset(0, valid_start)
    train_data["vocab_size"] = self.vocab_size  # only the train pickle carries it
    print("train labels", len(train_data["y"]))

    test_data = _subset(test_start, end_point)
    print("test labels", len(test_data["y"]))

    valid_data = _subset(valid_start, test_start)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    for payload, target in (
        (train_data, "paper_rnn_train.pkl"),
        (test_data, "paper_rnn_test.pkl"),
        (valid_data, "paper_rnn_valid.pkl"),
    ):
        data_utils.dump_large_obj(payload, out_dir, target)
def __init__(self, file_dir, matrix_size1, matrix_size2, seed, shuffle, args, use_emb=True, all_train=False):
    """Build matrix features for author pairs and dump train/test/valid pickles.

    Person pairs and the person dictionary are read from ``file_dir``; each
    pair yields a publication matrix and a venue matrix. The result is
    written as ``author_{train,test,valid}.pkl`` under
    ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: directory the JSON files are loaded from.
        matrix_size1: size passed to ``paper_to_matrix``.
        matrix_size2: size passed to ``venue_to_matrix``.
        seed: ``random_state`` for the optional shuffle.
        shuffle: when truthy, features and labels are shuffled together.
        args: unused in this constructor.
        use_emb: when true, the pretrained embedding is loaded.
        all_train: selects the 10000/5000/5000 split instead of 800/200/200.
    """
    self.file_dir = file_dir
    self.matrix_title_size = matrix_size1
    self.matrix_author_size = matrix_size2

    positives = data_utils.load_json(file_dir, 'pos_person_pairs.json')
    negatives = data_utils.load_json(file_dir, 'neg_person_pairs.json')
    pairs = positives + negatives
    labels = [1] * len(positives) + [0] * len(negatives)

    self.person_dict = data_utils.load_json(file_dir, "ego_person_dict.json")

    self.use_emb = use_emb
    if self.use_emb:
        self.pretrain_emb = torch.load(
            os.path.join(settings.OUT_DIR, "rnn_init_word_emb.emb"))
    # NOTE(review): the collapsed source does not show whether this load was
    # nested under `if self.use_emb:`; it is kept unconditional -- confirm.
    self.tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")

    long_feats = []
    short_feats = []
    # These counters are only logged; nothing in this constructor updates them.
    nn_pos = 0
    nn_neg = 0
    for idx, pair in enumerate(pairs):
        if idx % 100 == 0:
            logger.info('pairs to matrices %d %d %d', idx, nn_pos, nn_neg)
        # Persons missing from the dictionary fall back to an empty record.
        aperson = self.person_dict.get(pair['aid'], {})
        mperson = self.person_dict.get(pair['mid'], {})

        # The second element of each returned tuple is not used here.
        pub_matrix, _ = self.paper_to_matrix(
            aperson.get('pubs', []), mperson.get('pubs', []), matrix_size1)
        long_feats.append(feature_utils.scale_matrix(pub_matrix))

        venue_matrix, _ = self.venue_to_matrix(
            aperson.get('venue', ''), mperson.get('venue', ''), matrix_size2)
        short_feats.append(feature_utils.scale_matrix(venue_matrix))

    self.X_long = long_feats
    self.X_short = short_feats
    self.Y = labels

    print("shuffle", shuffle)
    if shuffle:
        self.X_long, self.X_short, self.Y = sklearn.utils.shuffle(
            self.X_long, self.X_short, self.Y, random_state=seed)

    self.N = len(self.Y)

    # Fixed split sizes (a ratio-based split via args.train_ratio /
    # args.valid_ratio is disabled): train, then valid, then test.
    if all_train:
        valid_start = 10000
        held_out = 5000
    else:
        valid_start = 800
        held_out = 200
    test_start = held_out + valid_start
    end_point = held_out + test_start

    def _subset(lo, hi):
        # Same bounds applied to both feature lists and the labels.
        return {
            "x1": self.X_long[lo:hi],
            "x2": self.X_short[lo:hi],
            "y": self.Y[lo:hi],
        }

    train_data = _subset(0, valid_start)
    print("train labels", len(train_data["y"]))

    test_data = _subset(test_start, end_point)
    print("test labels", len(test_data["y"]))

    valid_data = _subset(valid_start, test_start)
    print("valid labels", len(valid_data["y"]))

    print("train positive samples", sum(train_data["y"]))
    print("test positive samples", sum(test_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    for payload, target in (
        (train_data, "author_train.pkl"),
        (test_data, "author_test.pkl"),
        (valid_data, "author_valid.pkl"),
    ):
        data_utils.dump_large_obj(payload, out_dir, target)
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args, all_train=False):
    """Build padded token sequences for author pairs and dump the splits.

    For every person pair, the two persons' ``"pubs"`` entries and the
    ``"id"`` values of their ``"venue"`` entries are tokenized with the
    shared tokenizer and padded. The result is optionally shuffled, cut
    into fixed-size splits and written as
    ``author_rnn_{train,test,valid}.pkl`` under ``<settings.DATA_DIR>/dom-adpt``.

    Args:
        file_dir: directory the JSON files are loaded from.
        max_seq1_len: ``maxlen`` used when padding the publication sequences.
        max_seq2_len: ``maxlen`` used when padding the venue sequences.
        shuffle: when truthy, all sequences and labels are shuffled together.
        seed: ``random_state`` for the optional shuffle.
        args: unused in this constructor.
        all_train: selects the 10000/5000/5000 split instead of 800/200/200.
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    positives = data_utils.load_json(file_dir, 'pos_person_pairs.json')
    negatives = data_utils.load_json(file_dir, 'neg_person_pairs.json')
    pairs = positives + negatives
    self.labels = [1] * len(positives) + [0] * len(negatives)

    self.person_dict = data_utils.load_json(file_dir, "ego_person_dict.json")
    # These counters are only logged; nothing in this constructor updates them.
    nn_pos = 0
    nn_neg = 0

    t = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(t.word_counts)
    print("vocab size", self.vocab_size)

    # Persons missing from the dictionary contribute an empty "pubs" list.
    mag_pubs = [self.person_dict.get(pair["mid"], {}).get("pubs", [])
                for pair in pairs]
    aminer_pubs = [self.person_dict.get(pair["aid"], {}).get("pubs", [])
                   for pair in pairs]
    self.mag = t.texts_to_sequences(mag_pubs)
    self.aminer = t.texts_to_sequences(aminer_pubs)
    self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
    self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

    self.mag_keywords = []
    self.aminer_keywords = []
    for idx, pair in enumerate(pairs):
        if idx % 100 == 0:
            logger.info('pairs to matrices %d %d %d', idx, nn_pos, nn_neg)
        a_record = self.person_dict.get(pair['aid'], {})
        m_record = self.person_dict.get(pair['mid'], {})
        avenue = [venue["id"] for venue in a_record.get("venue", [])]
        mvenue = [venue["id"] for venue in m_record.get("venue", [])]
        self.mag_keywords.append(mvenue)
        self.aminer_keywords.append(avenue)

    self.mag_keywords = t.texts_to_sequences(self.mag_keywords)
    self.aminer_keywords = t.texts_to_sequences(self.aminer_keywords)
    self.mag_keywords = pad_sequences(self.mag_keywords, maxlen=max_seq2_len)
    self.aminer_keywords = pad_sequences(self.aminer_keywords, maxlen=max_seq2_len)

    if shuffle:
        (self.mag, self.aminer, self.mag_keywords,
         self.aminer_keywords, self.labels) = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_keywords,
            self.aminer_keywords, self.labels, random_state=seed)

    self.N = len(self.labels)

    # Fixed split sizes (a ratio-based split via args.train_ratio /
    # args.valid_ratio is disabled): train, then valid, then test.
    if all_train:
        valid_start = 10000
        held_out = 5000
    else:
        valid_start = 800
        held_out = 200
    test_start = held_out + valid_start
    end_point = held_out + test_start

    def _subset(lo, hi):
        # Same bounds applied to every sequence container and the labels.
        return {
            "x1_seq1": self.mag[lo:hi],
            "x1_seq2": self.mag_keywords[lo:hi],
            "x2_seq1": self.aminer[lo:hi],
            "x2_seq2": self.aminer_keywords[lo:hi],
            "y": self.labels[lo:hi],
        }

    train_data = _subset(0, valid_start)
    train_data["vocab_size"] = self.vocab_size  # only the train pickle carries it
    print("train labels", len(train_data["y"]))

    test_data = _subset(test_start, end_point)
    print("test labels", len(test_data["y"]))

    valid_data = _subset(valid_start, test_start)
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    for payload, target in (
        (train_data, "author_rnn_train.pkl"),
        (test_data, "author_rnn_test.pkl"),
        (valid_data, "author_rnn_valid.pkl"),
    ):
        data_utils.dump_large_obj(payload, out_dir, target)
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded author-pair sequences and dump train/test/valid pickles.

    Loads positive and negative person pairs and the ego person dictionary
    from ``file_dir``, turns each person's "pubs" and venue ids into
    sequences with the tokenizer loaded from ``tokenizer_all_domain.pkl``,
    pads them, optionally shuffles everything together, and writes three
    consecutive splits under ``settings.DATA_DIR/dom-adpt``.

    Args:
        file_dir: Directory passed to ``data_utils.load_json``.
        max_seq1_len: ``maxlen`` used when padding the "pubs" sequences.
        max_seq2_len: ``maxlen`` used when padding the venue-id sequences.
        shuffle: When truthy, arrays and labels are shuffled in unison.
        seed: ``random_state`` forwarded to ``sklearn.utils.shuffle``.
        args: Provides ``train_num`` (train size) and ``test_num`` (size of
            both the test and the validation split).
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Positive pairs come first, so labels are a run of 1s followed by 0s.
    positives = data_utils.load_json(file_dir, 'pos_person_pairs.json')
    negatives = data_utils.load_json(file_dir, 'neg_person_pairs.json')
    pairs = positives + negatives
    self.labels = [1] * len(positives) + [0] * len(negatives)
    self.person_dict = data_utils.load_json(file_dir, "ego_person_dict.json")

    # These counters are never incremented; they only feed the log line.
    nn_pos = 0
    nn_neg = 0

    # The tokenizer is loaded from disk rather than fitted here.
    tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(tokenizer.word_counts)
    print("vocab size", self.vocab_size)

    def person_field(person_id, field):
        # Unknown persons and missing fields both fall back to an empty list.
        return self.person_dict.get(person_id, {}).get(field, [])

    mag_pubs = [person_field(pair["mid"], "pubs") for pair in pairs]
    aminer_pubs = [person_field(pair["aid"], "pubs") for pair in pairs]
    mag_pubs = tokenizer.texts_to_sequences(mag_pubs)
    aminer_pubs = tokenizer.texts_to_sequences(aminer_pubs)
    self.mag = pad_sequences(mag_pubs, maxlen=self.max_seq1_len)
    self.aminer = pad_sequences(aminer_pubs, maxlen=self.max_seq1_len)

    mag_venues = []
    aminer_venues = []
    for idx, pair in enumerate(pairs):
        if idx % 100 == 0:
            logger.info('pairs to matrices %d %d %d', idx, nn_pos, nn_neg)
        aid, mid = pair['aid'], pair['mid']
        aminer_ids = [venue["id"] for venue in person_field(aid, "venue")]
        mag_ids = [venue["id"] for venue in person_field(mid, "venue")]
        mag_venues.append(mag_ids)
        aminer_venues.append(aminer_ids)

    mag_venues = tokenizer.texts_to_sequences(mag_venues)
    aminer_venues = tokenizer.texts_to_sequences(aminer_venues)
    self.mag_keywords = pad_sequences(mag_venues, maxlen=max_seq2_len)
    self.aminer_keywords = pad_sequences(aminer_venues, maxlen=max_seq2_len)

    if shuffle:
        shuffled = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels,
            random_state=seed
        )
        self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = shuffled

    self.N = len(self.labels)

    n_train = args.train_num
    n_test = args.test_num

    def take(window):
        # Same slice applied to every aligned array and to the labels.
        return {
            "x1_seq1": self.mag[window],
            "x1_seq2": self.mag_keywords[window],
            "x2_seq1": self.aminer[window],
            "x2_seq2": self.aminer_keywords[window],
            "y": self.labels[window],
        }

    train_data = take(slice(None, n_train))
    train_data["vocab_size"] = self.vocab_size
    print("train labels", len(train_data["y"]))

    # Test rows directly follow the training rows.
    test_data = take(slice(n_train, (n_train + n_test)))
    print("test labels", len(test_data["y"]))

    # Validation reuses n_test as its size and follows the test rows.
    valid_data = take(slice(n_train + n_test, (n_train + n_test * 2)))
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    outputs = (
        (train_data, "author_rnn_train.pkl"),
        (test_data, "author_rnn_test.pkl"),
        (valid_data, "author_rnn_valid.pkl"),
    )
    for split, fname in outputs:
        data_utils.dump_large_obj(split, out_dir, fname)
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded affiliation-pair sequences and dump 60/20/20 splits.

    Assembles positive and negative (aminer, mag) affiliation pairs from
    three JSON files in ``file_dir``, tokenizes the affiliation names with
    the tokenizer loaded from ``tokenizer_all_domain.pkl``, pads them, lets
    ``self.calc_keyword_seqs()`` produce the keyword sequences, optionally
    shuffles, and writes train/valid/test pickles under
    ``settings.DATA_DIR/dom-adpt``.

    Args:
        file_dir: Directory passed to ``data_utils.load_json``.
        max_seq1_len: ``maxlen`` used when padding the name sequences.
        max_seq2_len: ``maxlen`` used when padding the keyword sequences.
        shuffle: When truthy, arrays and labels are shuffled in unison.
        seed: ``random_state`` forwarded to ``sklearn.utils.shuffle``.
        args: Not read by this constructor.
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Only the first 600 labelled records are considered; "[NIF]" labels are dropped.
    labelled = data_utils.load_json(file_dir, "label_data_aff_zhoushao.json")[:600]
    pos_pairs = [
        ({"name": rec["affiliation"]}, {"DisplayName": rec["label"]})
        for rec in labelled
        if rec["label"] != "[NIF]"
    ]

    negatives_raw = data_utils.load_json(file_dir, 'train_negative_affi_clean.json')[:600]
    neg_pairs = [(rec['aminer_affi'], rec['mag_affi']) for rec in negatives_raw]

    pairs_add = data_utils.load_json(file_dir, "mag_aminer_hard_correct_zfj_copy.json")
    print("add pairs", len(pairs_add))
    extra_pos = [(rec['aminer_affi'], rec['mag_affi']) for rec in pairs_add if rec["label_zfj"] == "1"]
    extra_neg = [(rec['aminer_affi'], rec['mag_affi']) for rec in pairs_add if rec["label_zfj"] == "0"]
    pos_pairs.extend(extra_pos)
    neg_pairs.extend(extra_neg)

    # Keep at most as many positives (taken from the tail) as there are negatives.
    # NOTE(review): with zero negatives the slice is [-0:], which keeps every
    # positive pair instead of none - confirm this is acceptable.
    n_neg = len(neg_pairs)
    pos_pairs = pos_pairs[-n_neg:]

    self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    print("n_pos", len(pos_pairs), "n_neg", len(neg_pairs))
    pairs = pos_pairs + neg_pairs  # label balanced is important

    tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(tokenizer.word_counts)
    print("vocab size", self.vocab_size)

    self.mag = tokenizer.texts_to_sequences([mag_side["DisplayName"] for _, mag_side in pairs])
    # Sanity check on the token ids produced for the mag side.
    assert all(
        word_idx <= settings.MAX_WORD_TOKEN_NUM + 1
        for mag_aff in self.mag
        for word_idx in mag_aff
    )
    self.aminer = tokenizer.texts_to_sequences([aminer_side["name"] for aminer_side, _ in pairs])
    self.mag = pad_sequences(self.mag, maxlen=self.max_seq1_len)
    self.aminer = pad_sequences(self.aminer, maxlen=self.max_seq1_len)

    # Presumably populates self.mag_keywords / self.aminer_keywords, which are
    # read on the next lines - defined elsewhere in the class.
    self.calc_keyword_seqs()

    self.mag_keywords = pad_sequences(self.mag_keywords, maxlen=max_seq2_len)
    self.aminer_keywords = pad_sequences(self.aminer_keywords, maxlen=max_seq2_len)

    if shuffle:
        shuffled = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels,
            random_state=seed)
        self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = shuffled

    self.N = len(self.labels)

    # 60% train, 20% valid, remainder test.
    N = self.N
    n_train = int(self.N * 0.6)
    n_valid = int(self.N * 0.2)
    n_test = N - n_train - n_valid

    def take(window):
        # Same slice applied to every aligned array and to the labels.
        return {
            "x1_seq1": self.mag[window],
            "x1_seq2": self.mag_keywords[window],
            "x2_seq1": self.aminer[window],
            "x2_seq2": self.aminer_keywords[window],
            "y": self.labels[window],
        }

    train_data = take(slice(None, n_train))
    train_data["vocab_size"] = self.vocab_size
    print("train labels", len(train_data["y"]))

    test_data = take(slice((n_train + n_valid), (n_train + n_valid + n_test)))
    print("test labels", len(test_data["y"]))

    valid_data = take(slice(n_train, (n_train + n_valid)))
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    outputs = (
        (train_data, "aff_rnn_train.pkl"),
        (test_data, "aff_rnn_test.pkl"),
        (valid_data, "aff_rnn_valid.pkl"),
    )
    for split, fname in outputs:
        data_utils.dump_large_obj(split, out_dir, fname)
def __init__(self, file_dir, max_seq1_len, max_seq2_len, shuffle, seed, args):
    """Build padded paper-pair sequences and dump train/test/valid pickles.

    Loads positive and negative paper pairs from ``file_dir``, tokenizes
    each paper's title and space-joined author names with the tokenizer
    loaded from ``tokenizer_all_domain.pkl``, pads them, optionally
    shuffles everything together, and writes three consecutive splits under
    ``settings.DATA_DIR/dom-adpt``.

    Args:
        file_dir: Directory passed to ``data_utils.load_json``.
        max_seq1_len: ``maxlen`` used when padding the title sequences.
        max_seq2_len: ``maxlen`` used when padding the author sequences.
        shuffle: When truthy, arrays and labels are shuffled in unison.
        seed: ``random_state`` forwarded to ``sklearn.utils.shuffle``.
        args: Provides ``train_num`` (train size) and ``test_num`` (size of
            both the test and the validation split).
    """
    self.max_seq1_len = max_seq1_len
    self.max_seq2_len = max_seq2_len

    # Each record's 'c' entry feeds the aminer side and 'n' the mag side below.
    pos_records = data_utils.load_json(file_dir, 'pos-pairs-train.json')
    pos_pairs = [(rec['c'], rec['n']) for rec in pos_records]
    neg_records = data_utils.load_json(file_dir, 'neg-pairs-train.json')
    neg_pairs = [(rec['c'], rec['n']) for rec in neg_records]
    self.labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pairs = pos_pairs + neg_pairs

    # The tokenizer is loaded from disk rather than fitted here.
    tokenizer = data_utils.load_large_obj(settings.OUT_DIR, "tokenizer_all_domain.pkl")
    self.vocab_size = len(tokenizer.word_counts)
    print("vocab size", self.vocab_size)

    aminer_titles = [cpaper["title"] for cpaper, _ in pairs]
    mag_titles = [npaper["title"] for _, npaper in pairs]
    aminer_titles = tokenizer.texts_to_sequences(aminer_titles)
    mag_titles = tokenizer.texts_to_sequences(mag_titles)
    self.aminer = pad_sequences(aminer_titles, maxlen=self.max_seq1_len)
    self.mag = pad_sequences(mag_titles, maxlen=self.max_seq1_len)

    # Author lists are flattened to one space-separated string per paper.
    aminer_authors = [" ".join(cpaper["authors"]) for cpaper, _ in pairs]
    mag_authors = [" ".join(npaper["authors"]) for _, npaper in pairs]
    aminer_authors = tokenizer.texts_to_sequences(aminer_authors)
    mag_authors = tokenizer.texts_to_sequences(mag_authors)
    self.aminer_keywords = pad_sequences(aminer_authors, maxlen=self.max_seq2_len)
    self.mag_keywords = pad_sequences(mag_authors, maxlen=self.max_seq2_len)

    if shuffle:
        shuffled = sklearn.utils.shuffle(
            self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels,
            random_state=seed)
        self.mag, self.aminer, self.mag_keywords, self.aminer_keywords, self.labels = shuffled

    self.N = len(self.labels)

    n_train = args.train_num
    n_test = args.test_num

    def take(window):
        # Same slice applied to every aligned array and to the labels.
        return {
            "x1_seq1": self.mag[window],
            "x1_seq2": self.mag_keywords[window],
            "x2_seq1": self.aminer[window],
            "x2_seq2": self.aminer_keywords[window],
            "y": self.labels[window],
        }

    train_data = take(slice(None, n_train))
    train_data["vocab_size"] = self.vocab_size
    print("train labels", len(train_data["y"]))

    # Test rows directly follow the training rows.
    test_data = take(slice(n_train, (n_train + n_test)))
    print("test labels", len(test_data["y"]))

    # Validation reuses n_test as its size and follows the test rows.
    valid_data = take(slice(n_train + n_test, (n_train + n_test * 2)))
    print("valid labels", len(valid_data["y"]))

    out_dir = join(settings.DATA_DIR, "dom-adpt")
    os.makedirs(out_dir, exist_ok=True)
    outputs = (
        (train_data, "paper_rnn_train.pkl"),
        (test_data, "paper_rnn_test.pkl"),
        (valid_data, "paper_rnn_valid.pkl"),
    )
    for split, fname in outputs:
        data_utils.dump_large_obj(split, out_dir, fname)