Code example #1
    def predict(self, test_file, verbose=False):
        # Load and normalize the test data (split hashtags, keep profiles).
        start = time.time()
        self.test = dh.loaddata(test_file,
                                self._word_file_path,
                                normalize_text=True,
                                split_hashtag=True,
                                ignore_profiles=False)
        end = time.time()
        if verbose:
            print('test resource loading time::', (end - start))

        # Reload the vocabulary built at training time.
        self._vocab = self.load_vocab()

        start = time.time()
        # Vectorize the test data, then pad word, context, and dimension
        # sequences to the fixed lengths the network expects.
        tX, tY, tD, tC, tA = dh.vectorize_word_dimension(
            self.test, self._vocab)
        tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
        tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
        tD = dh.pad_sequence_1d(tD, maxlen=11)

        end = time.time()
        if verbose:
            print('test resource preparation time::', (end - start))

        # Run the loaded model on the [context, text, dimension] inputs.
        self.__predict_model([tC, tX, tD], self.test)
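
A minimal sketch of calling this predict method. The class name test_model, the constructor arguments, and the file paths are assumptions inferred from code example #2 below; only load_trained_model() and predict() appear in the originals.

# Hypothetical driver; test_model and every path are placeholder assumptions.
t = test_model('path/to/word_list.txt',    # word_file_path
               'path/to/model_dir/',       # model_file
               'path/to/vocab_list.txt',   # vocab_file_path
               'path/to/results.txt')      # output_file
t.load_trained_model()
t.predict('path/to/test.txt', verbose=True)
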
Code example #2
    def __init__(self,
                 train_file,
                 validation_file,
                 word_file_path,
                 model_file,
                 vocab_file,
                 output_file,
                 word2vec_path,
                 cross_validation=False,
                 cross_val_ratio=0.2,
                 test_file=None):
        # Base-class init sets shared state such as self._line_maxlen.
        sarcasm_model.__init__(self)

        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file
        self._test_file = test_file

        self.load_train_validation_test_data()

        batch_size = 32

        print(self._line_maxlen)
        # Build the vocabulary from the training data and reserve an extra
        # index for out-of-vocabulary tokens.
        self._vocab = dh.build_vocab(self.train, ignore_context=False)
        self._vocab['unk'] = len(self._vocab.keys()) + 1

        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])

        dh.write_vocab(self._vocab_file_path, self._vocab)

        if cross_validation:
            # Carve the validation set out of the training data instead of
            # using the separate validation file.
            self.train, self.validation = self.split_train_validation(
                self.train, ratio=cross_val_ratio)

        # Vectorize the training (X, C, D) and validation (tX, tC, tD) sets.
        X, Y, D, C, A = dh.vectorize_word_dimension(self.train,
                                                    self._vocab,
                                                    drop_dimension_index=None)

        tX, tY, tD, tC, tA = dh.vectorize_word_dimension(
            self.validation, self._vocab, drop_dimension_index=None)

        X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
        C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
        D = dh.pad_sequence_1d(D, maxlen=11)

        tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
        tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
        tD = dh.pad_sequence_1d(tD, maxlen=11)

        hidden_units = 1280
        dimension_size = 300

        # Load pretrained word2vec weights for the vocabulary (note the
        # hard-coded path); the same matrix initializes the context embedding.
        W = dh.get_word2vec_weight(
            self._vocab,
            n=dimension_size,
            path='/home/word2vec/GoogleNews-vectors-negative300.bin')

        cW = W

        print('Word2vec obtained....')

        # Weight each class inversely to its frequency so the most frequent
        # label gets weight 1 and rarer labels get larger weights.
        ratio = self.calculate_label_ratio(Y)
        ratio = [max(ratio.values()) / value for value in ratio.values()]

        print('ratio', ratio)

        dimension_vocab = numpy.unique(D)
        print(len(dimension_vocab))

        # One-hot encode the labels for categorical training targets.
        Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

        print('train_X', X.shape)
        print('train_C', C.shape)
        print('train_D', D.shape)
        print('train_Y', Y.shape)

        print('validation_X', tX.shape)
        print('validation_C', tC.shape)
        print('validation_D', tD.shape)
        print('validation_Y', tY.shape)

        # Build the network; embeddings are initialized from word2vec and
        # kept frozen (trainable=False).
        model = self._build_network(len(self._vocab.keys()) + 1,
                                    self._line_maxlen,
                                    emb_weights=W,
                                    c_emb_weights=cW,
                                    hidden_units=hidden_units,
                                    trainable=False,
                                    dimension_length=11,
                                    batch_size=batch_size)

        # Persist the architecture as JSON and checkpoint the weights whenever
        # validation loss improves.
        with open(self._model_file + 'model.json', 'w') as f:
            f.write(model.to_json())
        save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5',
                                    save_best_only=True,
                                    monitor='val_loss')
        # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        #                            save_best_only=False)
        # early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
        lr_tuner = ReduceLROnPlateau(monitor='val_loss',
                                     factor=0.1,
                                     patience=10,
                                     verbose=1,
                                     mode='auto',
                                     epsilon=0.0001,
                                     cooldown=0,
                                     min_lr=0.000001)

        model.fit([C, X, D],
                  Y,
                  batch_size=batch_size,
                  epochs=100,
                  validation_data=([tC, tX, tD], tY),
                  shuffle=True,
                  callbacks=[save_best, lr_tuner],
                  class_weight=ratio)

        if cross_validation:
            # Evaluate on the held-out split; vocab_file is the constructor
            # argument name, and input_weight_file_path is assumed to be
            # defined elsewhere in the module.
            t = test_model(word_file_path, model_file, vocab_file,
                           output_file, input_weight_file_path)
            t.load_trained_model()
            t.predict_cross_validation(tC, tX, tD, self.validation)
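
A minimal sketch of driving the training constructor above. The class name train_model and all paths are assumptions for illustration; only the keyword argument names come from the signature, and note that the constructor itself runs the full training loop, so instantiation is enough to train and checkpoint the model.

# Hypothetical driver; train_model is an assumed name for the class whose
# __init__ is shown above, and every path is a placeholder.
if __name__ == '__main__':
    train_model(train_file='path/to/train.txt',
                validation_file='path/to/validation.txt',
                word_file_path='path/to/word_list.txt',
                model_file='path/to/model_dir/',
                vocab_file='path/to/vocab_list.txt',
                output_file='path/to/results.txt',
                word2vec_path='path/to/GoogleNews-vectors-negative300.bin')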