Example #1 (0 votes)
    def get_submit_test(self, contatenate=0):
        """Load the submission test set and convert docs to padded sequences.

        Args:
            contatenate: when 1, use the longer
                ``opt.max_sequence_length_contatenate`` limit instead of
                ``opt.max_sequence_length``. (Parameter name keeps the
                original spelling for caller compatibility.)

        Returns:
            Tuple ``(x_val, test_ids)``: the sequence tensor and the
            document ids aligned with its rows.
        """
        # Test data: the pickle holds [ids, docs]. Use a context manager so
        # the file handle is closed deterministically (the original
        # pickle.load(open(...)) leaked it).
        # NOTE(review): unpickling is unsafe on untrusted files -- confirm
        # this file's provenance.
        with open('test2_id2doc.pkl', 'rb') as f:
            tmp_test = pickle.load(f)
        test_ids, test_docs = tmp_test[0], tmp_test[1]

        # Sequentializing test data; pick the length limit matching the
        # concatenation mode.
        max_sequence_length = self.opt.max_sequence_length_contatenate if contatenate == 1 else self.opt.max_sequence_length
        x_val = data_reader.docs_to_sequences_suffix(test_docs,
                                                     self.opt.word_index,
                                                     max_sequence_length,
                                                     contatenate=contatenate)

        print('[test] Shape of data tensor:', x_val.shape)

        return x_val, test_ids
Example #2 (0 votes)
    def get_test(self, contatenate=0):
        """Load the validation set and convert docs to padded sequences.

        Args:
            contatenate: when 1, use the longer
                ``opt.max_sequence_length_contatenate`` limit instead of
                ``opt.max_sequence_length``. (Parameter name keeps the
                original spelling for caller compatibility.)

        Returns:
            Tuple ``(x_val, y_val)``: the sequence tensor and the
            corresponding (already one-hot encoded) label tensor.
        """
        # Validation data: the pickle holds [docs, labels]. Use a context
        # manager so the file handle is closed deterministically (the
        # original pickle.load(open(...)) leaked it).
        # NOTE(review): unpickling is unsafe on untrusted files -- confirm
        # this file's provenance.
        with open('val_docs2label.pkl', 'rb') as f:
            valid_temp = pickle.load(f)
        # (Original had a stray trailing comma here; plain 2-tuple unpack.)
        val_docs, val_labels = valid_temp[0], valid_temp[1]

        # Sequentializing validation data; pick the length limit matching
        # the concatenation mode.
        max_sequence_length = self.opt.max_sequence_length_contatenate if contatenate == 1 else self.opt.max_sequence_length
        x_val = data_reader.docs_to_sequences_suffix(val_docs,
                                                     self.opt.word_index,
                                                     max_sequence_length,
                                                     contatenate=contatenate)
        y_val = val_labels  # one-hot encoding

        print('[Val] Shape of data tensor:', x_val.shape)
        print('[Val] Shape of label tensor:', y_val.shape)

        return x_val, y_val
Example #3 (0 votes)
    def get_train(self, contatenate=0):
        """Load the training set, build sequences, and prime ``self.opt``.

        Side effects: stores ``word_index`` on ``self.opt.word_index`` (the
        val/test loaders read it from there) and builds
        ``self.opt.embedding_matrix``.

        Args:
            contatenate: when 1, use the longer
                ``opt.max_sequence_length_contatenate`` limit instead of
                ``opt.max_sequence_length``. (Parameter name keeps the
                original spelling for caller compatibility.)

        Returns:
            Tuple ``(x_train, y_train)``: the sequence tensor and the
            corresponding (already one-hot encoded) label tensor.
        """
        # Training data: the pickle holds [word_index, docs, labels]. Use a
        # context manager so the file handle is closed deterministically
        # (the original pickle.load(open(...)) leaked it).
        # NOTE(review): unpickling is unsafe on untrusted files -- confirm
        # this file's provenance.
        with open('4kdoc2label_tokens.pkl', 'rb') as f:
            tmp = pickle.load(f)
        word_index, docs, labels = tmp[0], tmp[1], tmp[2]
        self.opt.word_index = word_index
        print('word_index:', len(word_index))
        print('docs:', len(docs))

        # Train data loading; pick the length limit matching the
        # concatenation mode.
        max_sequence_length = self.opt.max_sequence_length_contatenate if contatenate == 1 else self.opt.max_sequence_length
        x_train = data_reader.docs_to_sequences_suffix(docs,
                                                       word_index,
                                                       max_sequence_length,
                                                       contatenate=contatenate)
        y_train = labels  # one-hot label encoding

        print('[train] Shape of data tensor:', x_train.shape)
        print('[train] Shape of label tensor:', y_train.shape)

        self.opt.embedding_matrix = self.build_word_embedding_matrix(
            word_index)
        # It is better not to build the word embedding matrix here.

        return x_train, y_train