Example #1
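These snippets are constructor bodies lifted from larger training scripts, so they rely on names imported elsewhere in the original files. A minimal set of imports they appear to assume is sketched below; the import path for the project's local data-handler module `dh` is a guess, not taken from the source.

import numpy
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.utils import np_utils

import data_handler as dh  # hypothetical path to the project's data handler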
    def __init__(self,
                 train_file,
                 validation_file,
                 word_file_path,
                 split_word_path,
                 emoji_file_path,
                 model_file,
                 vocab_file,
                 output_file,
                 input_weight_file_path=None):
        sarcasm_model.__init__(self)

        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file
        self._input_weight_file_path = input_weight_file_path

        self.load_train_validation_test_data()

        batch_size = 32

        print(self._line_maxlen)
        self._vocab = dh.build_vocab(self.train, ignore_context=False)
        self._vocab['unk'] = len(self._vocab.keys()) + 1

        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])

        dh.write_vocab(self._vocab_file_path, self._vocab)

        X, Y, D, C, A = dh.vectorize_word_dimension(self.train,
                                                    self._vocab,
                                                    drop_dimension_index=None)

        tX, tY, tD, tC, tA = dh.vectorize_word_dimension(
            self.validation, self._vocab, drop_dimension_index=None)

        X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
        C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
        D = dh.pad_sequence_1d(D, maxlen=11)

        tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
        tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
        tD = dh.pad_sequence_1d(tD, maxlen=11)

        hidden_units = 128
        dimension_size = 300

        # NOTE: word2vec_path is not a parameter of this constructor; it is
        # presumably a module-level constant in the original script.
        W = dh.get_word2vec_weight(self._vocab,
                                   n=dimension_size,
                                   path=word2vec_path)

        cW = W  # the context channel reuses the same embedding weights

        print('Word2vec obtained....')

        ratio = self.calculate_label_ratio(Y)
        # Keras class_weight expects a dict mapping class index to weight
        ratio = {label: max(ratio.values()) / value
                 for label, value in ratio.items()}

        print('ratio', ratio)

        dimension_vocab = numpy.unique(D)
        print(len(dimension_vocab))

        Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

        print('train_X', X.shape)
        print('train_C', C.shape)
        print('train_D', D.shape)
        print('train_Y', Y.shape)

        print('validation_X', tX.shape)
        print('validation_C', tC.shape)
        print('validation_D', tD.shape)
        print('validation_Y', tY.shape)

        model = self._build_network(len(self._vocab.keys()) + 1,
                                    self._line_maxlen,
                                    emb_weights=W,
                                    c_emb_weights=cW,
                                    hidden_units=hidden_units,
                                    trainable=False,
                                    batch_size=batch_size)

        with open(self._model_file + 'model.json', 'w') as f:
            f.write(model.to_json())
        save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5',
                                    save_best_only=True,
                                    monitor='val_loss')
        # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        #                            save_best_only=False)
        # defined but not passed to fit() below
        early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
        lr_tuner = ReduceLROnPlateau(monitor='loss',
                                     factor=0.1,
                                     patience=10,
                                     verbose=1,
                                     mode='auto',
                                     min_delta=0.0001,  # 'epsilon' in older Keras
                                     cooldown=0,
                                     min_lr=0.000001)

        model.fit([C, X],
                  Y,
                  batch_size=batch_size,
                  epochs=100,
                  validation_data=([tC, tX], tY),
                  shuffle=True,
                  callbacks=[save_best, lr_tuner],
                  class_weight=ratio)
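
Every constructor here calls `self.calculate_label_ratio(Y)` and then inverts the counts into class weights. That method is not part of this excerpt; judging purely from how its result is used (a dict whose per-label values are divided into the maximum value), it plausibly counts label occurrences. A hypothetical reconstruction:

from collections import Counter

def calculate_label_ratio(self, labels):
    # Count occurrences of each label; the callers turn these counts
    # into inverse-frequency class weights for model.fit().
    return dict(Counter(labels))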

Example #2

    def __init__(self,
                 train_file,
                 validation_file,
                 word_file_path,
                 split_word_path,
                 emoji_file_path,
                 model_file,
                 vocab_file,
                 output_file,
                 word2vec_path=None):
        sarcasm_model.__init__(self)

        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file

        self.load_train_validation_data()

        print(self._line_maxlen)

        # build vocabulary; words below min_freq are dropped (min_freq=1 keeps all)
        self._vocab = dh.build_vocab(self.train, min_freq=1)
        if 'unk' not in self._vocab:
            self._vocab['unk'] = len(self._vocab.keys()) + 1

        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])

        dh.write_vocab(self._vocab_file_path, self._vocab)

        # prepares input
        X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
        X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

        # prepares input
        tX, tY, tD, tC, tA = dh.vectorize_word_dimension(
            self.validation, self._vocab)
        tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

        # embedding dimension
        dimension_size = 300

        W = dh.get_word2vec_weight(self._vocab,
                                   n=dimension_size,
                                   path=word2vec_path)

        # solving class imbalance
        ratio = self.calculate_label_ratio(Y)
        # Keras class_weight expects a dict mapping class index to weight
        ratio = {label: max(ratio.values()) / value
                 for label, value in ratio.items()}
        print('class ratio::', ratio)

        Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

        print('train_X', X.shape)
        print('train_Y', Y.shape)
        print('validation_X', tX.shape)
        print('validation_Y', tY.shape)

        model = self._build_network(len(self._vocab.keys()) + 1,
                                    self._line_maxlen,
                                    hidden_units=256,
                                    emb_weights=W)

        with open(self._model_file + 'model.json', 'w') as f:
            f.write(model.to_json())
        save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5',
                                    save_best_only=True)
        save_all = ModelCheckpoint(self._model_file +
                                   'weights.{epoch:02d}.hdf5',
                                   save_best_only=False)
        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=20,
                                       verbose=1)

        # training
        model.fit(X,
                  Y,
                  batch_size=64,
                  epochs=100,
                  validation_data=(tX, tY),
                  shuffle=True,
                  callbacks=[save_best, save_all, early_stopping],
                  class_weight=ratio,
                  verbose=2)
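
All of the examples pad the vectorized sequences with `dh.pad_sequence_1d(..., maxlen=...)`. Assuming it is a thin wrapper over Keras's standard sequence padding (an assumption; the real `dh` module is not shown here), a minimal version could look like this:

from keras.preprocessing.sequence import pad_sequences

def pad_sequence_1d(sequences, maxlen):
    # Pad or truncate each list of word indices to a fixed length so the
    # whole batch can be fed to an Embedding layer.
    return pad_sequences(sequences, maxlen=maxlen)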

Example #3

    def __init__(self,
                 train_file,
                 validation_file,
                 word_file_path,
                 split_word_path,
                 emoji_file_path,
                 model_file,
                 vocab_file,
                 output_file,
                 word2vec_path=None,
                 test_file=None,
                 input_weight_file_path=None):

        sarcasm_model.__init__(self)

        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file
        self._input_weight_file_path = input_weight_file_path
        self._test_file = test_file  # needed by the vocabulary check below

        self.load_train_validation_test_data()

        print(self._line_maxlen)

        # build vocabulary over all available splits
        if self._test_file is not None:
            self._vocab = dh.build_vocab(self.train + self.validation +
                                         self.test)
        else:
            self._vocab = dh.build_vocab(self.train + self.validation)

        self._vocab['unk'] = len(self._vocab.keys()) + 1

        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])

        dh.write_vocab(self._vocab_file_path, self._vocab)

        # prepares input
        X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
        X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

        # prepares input
        tX, tY, tD, tC, tA = dh.vectorize_word_dimension(
            self.validation, self._vocab)
        tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

        # embedding dimension: the pretrained vectors are assumed to be
        # 200-dimensional (the original hard-coded n=200 here)
        dimension_size = 200

        W = dh.get_word2vec_weight(self._vocab,
                                   n=dimension_size,
                                   path=word2vec_path)

        # solving class imbalance
        ratio = self.calculate_label_ratio(Y)
        # Keras class_weight expects a dict mapping class index to weight
        ratio = {label: max(ratio.values()) / value
                 for label, value in ratio.items()}
        print('class ratio::', ratio)

        Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

        print('train_X', X.shape)
        print('train_Y', Y.shape)
        print('validation_X', tX.shape)
        print('validation_Y', tY.shape)

        # trainable=True if you want the word2vec weights to be updated
        model = self._build_network(len(self._vocab.keys()) + 1,
                                    self._line_maxlen,
                                    emb_weights=W,
                                    trainable=False)

        with open(self._model_file + 'model.json', 'w') as f:
            f.write(model.to_json())
        save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5',
                                    save_best_only=True)
        # save_all, early_stopping and lr_tuner are defined but not passed
        # to fit() below
        save_all = ModelCheckpoint(self._model_file +
                                   'weights.{epoch:02d}__.hdf5',
                                   save_best_only=False)
        early_stopping = EarlyStopping(monitor='loss', patience=20, verbose=1)
        lr_tuner = ReduceLROnPlateau(monitor='loss',
                                     factor=0.1,
                                     patience=10,
                                     verbose=1,
                                     mode='auto',
                                     min_delta=0.0001,  # 'epsilon' in older Keras
                                     cooldown=0,
                                     min_lr=0.000001)

        # training
        model.fit(X,
                  Y,
                  batch_size=128,
                  epochs=100,
                  validation_data=(tX, tY),
                  shuffle=True,
                  callbacks=[save_best],
                  class_weight=ratio)
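
`dh.get_word2vec_weight(vocab, n, path)` builds the initial embedding matrix from pretrained word2vec vectors. A plausible sketch using gensim is shown below; this is a guess at the original implementation, which is not part of this excerpt:

import numpy
from gensim.models import KeyedVectors

def get_word2vec_weight(vocab, n, path):
    # Row i holds the pretrained vector for the word mapped to index i;
    # words missing from the pretrained model keep an all-zero row.
    word2vec = KeyedVectors.load_word2vec_format(path, binary=True)
    weights = numpy.zeros((len(vocab) + 1, n))
    for word, index in vocab.items():
        if word in word2vec:
            weights[index] = word2vec[word][:n]
    return weights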

Example #4

    def __init__(self,
                 train_file,
                 validation_file,
                 word_file_path,
                 split_word_path,
                 emoji_file_path,
                 model_file,
                 vocab_file,
                 output_file,
                 model_filename=None):
        offensive_content_model.__init__(self)

        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file
        self._model_filename = model_filename

        # self.load_train_validation_data(lowercase=False, at_character=True)
        # self.char_train = self.train
        # self.char_validation = self.validation

        self.load_train_validation_data()

        # batch size
        batch_size = 16
        print('train size::', len(self.train))
        # print('char train size::', len(self.char_train))

        # optional trim so the train size is a multiple of batch_size:
        # self.train = self.train[-len(self.train) % batch_size:]
        # self.char_train = self.char_train[-len(self.char_train) % batch_size:]
        print('train size::', len(self.train))
        # print('char train size::', len(self.char_train))

        print(self._line_maxlen)
        print(self._line_char_maxlen)

        # build vocabulary; words occurring fewer than min_freq=5 times are dropped
        self._vocab = dh.build_vocab(self.train, min_freq=5)
        if 'unk' not in self._vocab:
            self._vocab['unk'] = len(self._vocab.keys()) + 1

        self._char_vocab = {}
        # self._char_vocab = dh.build_vocab(self.char_train)
        # if ('unk' not in self._char_vocab):
        #     self._char_vocab['unk'] = len(self._char_vocab.keys()) + 1

        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])
        # print(len(self._char_vocab.keys()) + 1)
        # print('unk::', self._char_vocab['unk'])

        dh.write_vocab(self._vocab_file_path, self._vocab)
        # dh.write_vocab(self._vocab_file_path + '.char', self._char_vocab)

        # prepares input
        X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
        X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

        # prepares input
        tX, tY, tD, tC, tA = dh.vectorize_word_dimension(
            self.validation, self._vocab)
        tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

        # prepares character input
        # cX, cY, cD, cC, cA = dh.vectorize_word_dimension(self.char_train, self._char_vocab)
        # cX = dh.pad_sequence_1d(cX, maxlen=self._line_char_maxlen)

        # prepares character input
        # ctX, ctY, ctD, ctC, ctA = dh.vectorize_word_dimension(self.char_validation, self._char_vocab)
        # ctX = dh.pad_sequence_1d(ctX, maxlen=self._line_char_maxlen)

        print("X", X.shape)
        # print('cX', cX.shape)

        # hidden units
        hidden_units = 256

        # word2vec dimension (the GoogleNews vectors are 300-dimensional)
        dimension_size = 300
        W = dh.get_word2vec_weight(
            self._vocab,
            n=dimension_size,
            path='/home/striker/word2vec/GoogleNews-vectors-negative300.bin')
        # W = dh.get_glove_weights(self._vocab, n=200, path='/home/striker/word2vec/glove_model_200.txt.bin')
        print('Word2vec obtained....')

        # solving class imbalance
        ratio = self.calculate_label_ratio(Y)
        # Keras class_weight expects a dict mapping class index to weight
        ratio = {label: max(ratio.values()) / value
                 for label, value in ratio.items()}
        print('class ratio::', ratio)

        Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

        print('train_X', X.shape)
        print('train_Y', Y.shape)
        print('validation_X', tX.shape)
        print('validation_Y', tY.shape)

        # trainable=True if you want the word2vec weights to be updated
        model = None
        if model_filename == 'emotion.json':
            model = self._build_emotion_network(len(self._vocab.keys()) + 1,
                                                self._line_maxlen,
                                                emb_weights=W,
                                                hidden_units=hidden_units,
                                                trainable=False)
        elif model_filename == 'offensive.json':
            model = self._build_network(len(self._vocab.keys()) + 1,
                                        len(self._char_vocab.keys()) + 1,
                                        emb_weights=W,
                                        hidden_units=hidden_units,
                                        trainable=False,
                                        batch_size=8)

        with open(self._model_file + self._model_filename, 'w') as f:
            f.write(model.to_json())
        save_best = ModelCheckpoint(self._model_file + self._model_filename +
                                    '.hdf5',
                                    save_best_only=True)
        early_stopping = EarlyStopping(monitor='loss', patience=50, verbose=1)
        lr_tuner = ReduceLROnPlateau(monitor='loss',
                                     factor=0.1,
                                     patience=1,
                                     verbose=1,
                                     mode='auto',
                                     min_delta=0.0001,  # 'epsilon' in older Keras
                                     cooldown=0,
                                     min_lr=0.000001)

        # training
        model.fit([X],
                  Y,
                  batch_size=16,
                  epochs=100,
                  validation_split=0.1,
                  shuffle=True,
                  callbacks=[save_best, early_stopping, lr_tuner],
                  class_weight=ratio,
                  verbose=1)
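
Finally, `dh.build_vocab` appears to map each token to a positive integer index, dropping tokens rarer than `min_freq` (Examples #2 and #4 pass it explicitly). A hypothetical minimal version, simplified to take plain token lists rather than the project's record format:

from collections import Counter

def build_vocab(token_lists, min_freq=1):
    # token_lists: iterable of token sequences
    counts = Counter(token for tokens in token_lists for token in tokens)
    vocab = {}
    for token, freq in counts.items():
        if freq >= min_freq:
            vocab[token] = len(vocab) + 1  # index 0 is reserved for padding
    return vocab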