def get_submit_test(self, contatenate=0):
    """Load the submission test set and convert its documents to sequences.

    Reads ``test2_id2doc.pkl`` (relative to the current working directory).
    The unpickled object is indexed as ``[0]`` -> ids and ``[1]`` -> docs.
    ``self.opt.word_index`` must already be populated before this is called
    (``get_train`` assigns it).

    Args:
        contatenate: When equal to 1, the sequence length is taken from
            ``self.opt.max_sequence_length_contatenate``; otherwise from
            ``self.opt.max_sequence_length``. The value is also forwarded
            unchanged to ``data_reader.docs_to_sequences_suffix``.

    Returns:
        Tuple ``(x_val, test_ids)``: the sequence data produced by
        ``data_reader.docs_to_sequences_suffix`` and the ids as stored in
        the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open('test2_id2doc.pkl', 'rb') as handle:
        tmp_test = pickle.load(handle)
    test_ids, test_docs = tmp_test[0], tmp_test[1]

    # Pick the sequence length that matches the requested mode.
    if contatenate == 1:
        max_sequence_length = self.opt.max_sequence_length_contatenate
    else:
        max_sequence_length = self.opt.max_sequence_length

    # Turn the raw documents into fixed-length index sequences.
    x_val = data_reader.docs_to_sequences_suffix(
        test_docs, self.opt.word_index, max_sequence_length,
        contatenate=contatenate)
    print('[test] Shape of data tensor:', x_val.shape)
    return x_val, test_ids
def get_test(self, contatenate=0):
    """Load the validation set and convert its documents to sequences.

    Reads ``val_docs2label.pkl`` (relative to the current working
    directory). The unpickled object is indexed as ``[0]`` -> docs and
    ``[1]`` -> labels. ``self.opt.word_index`` must already be populated
    before this is called (``get_train`` assigns it).

    Args:
        contatenate: When equal to 1, the sequence length is taken from
            ``self.opt.max_sequence_length_contatenate``; otherwise from
            ``self.opt.max_sequence_length``. The value is also forwarded
            unchanged to ``data_reader.docs_to_sequences_suffix``.

    Returns:
        Tuple ``(x_val, y_val)``: the sequence data and the labels exactly
        as stored in the pickle file. The labels must expose ``.shape``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open('val_docs2label.pkl', 'rb') as handle:
        valid_temp = pickle.load(handle)
    val_docs, val_labels = valid_temp[0], valid_temp[1]

    # Pick the sequence length that matches the requested mode.
    if contatenate == 1:
        max_sequence_length = self.opt.max_sequence_length_contatenate
    else:
        max_sequence_length = self.opt.max_sequence_length

    # Turn the raw documents into fixed-length index sequences.
    x_val = data_reader.docs_to_sequences_suffix(
        val_docs, self.opt.word_index, max_sequence_length,
        contatenate=contatenate)
    # Labels are used as stored; the original comment suggests they are
    # already one-hot encoded -- TODO confirm against the pickle producer.
    y_val = val_labels
    print('[Val] Shape of data tensor:', x_val.shape)
    print('[Val] Shape of label tensor:', y_val.shape)
    return x_val, y_val
def get_train(self, contatenate=0):
    """Load the training set, convert it to sequences, and set up ``self.opt``.

    Reads ``4kdoc2label_tokens.pkl`` (relative to the current working
    directory). The unpickled object is indexed as ``[0]`` -> word index,
    ``[1]`` -> docs and ``[2]`` -> labels.

    Side effects:
        * ``self.opt.word_index`` is set to the loaded word index.
        * ``self.opt.embedding_matrix`` is set to the result of
          ``self.build_word_embedding_matrix(word_index)``.

    Args:
        contatenate: When equal to 1, the sequence length is taken from
            ``self.opt.max_sequence_length_contatenate``; otherwise from
            ``self.opt.max_sequence_length``. The value is also forwarded
            unchanged to ``data_reader.docs_to_sequences_suffix``.

    Returns:
        Tuple ``(x_train, y_train)``: the sequence data and the labels
        exactly as stored in the pickle file. The labels must expose
        ``.shape``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open('4kdoc2label_tokens.pkl', 'rb') as handle:
        tmp = pickle.load(handle)
    word_index, docs, labels = tmp[0], tmp[1], tmp[2]
    self.opt.word_index = word_index
    print('word_index:', len(word_index))
    print('docs:', len(docs))

    # Pick the sequence length that matches the requested mode.
    if contatenate == 1:
        max_sequence_length = self.opt.max_sequence_length_contatenate
    else:
        max_sequence_length = self.opt.max_sequence_length

    # Turn the raw training documents into fixed-length index sequences.
    x_train = data_reader.docs_to_sequences_suffix(
        docs, word_index, max_sequence_length, contatenate=contatenate)
    # Labels are used as stored; the original comment suggests they are
    # already one-hot encoded -- TODO confirm against the pickle producer.
    y_train = labels
    print('[train] Shape of data tensor:', x_train.shape)
    print('[train] Shape of label tensor:', y_train.shape)

    # It is better not to build the word embedding matrix here.
    self.opt.embedding_matrix = self.build_word_embedding_matrix(word_index)
    return x_train, y_train