import time

import numpy
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.utils import np_utils

# `dh` is the project's data-handler module; the exact import path depends on
# the repository layout.
import data_processing.data_handler as dh


def predict(self, test_file, verbose=False):
    # Load and normalize the test data (hashtag splitting on, profile filtering off).
    start = time.time()
    self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True,
                            split_hashtag=True, ignore_profiles=False)
    end = time.time()
    if verbose:
        print('test resource loading time::', (end - start))

    self._vocab = self.load_vocab()

    # Vectorize the test data, then pad the word (tX), context (tC) and
    # dimension (tD) sequences to fixed lengths.
    start = time.time()
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
    tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
    tD = dh.pad_sequence_1d(tD, maxlen=11)
    end = time.time()
    if verbose:
        print('test resource preparation time::', (end - start))

    self.__predict_model([tC, tX, tD], self.test)
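
# For reference, a minimal sketch of what dh.pad_sequence_1d likely does, assuming
# Keras-style left padding/truncation to a fixed length; the project's actual
# data_handler implementation may differ, and the name below is illustrative only.
def pad_sequence_1d_sketch(sequences, maxlen):
    """Pad or truncate each integer sequence to exactly maxlen entries."""
    padded = numpy.zeros((len(sequences), maxlen), dtype='int32')
    for i, seq in enumerate(sequences):
        trimmed = list(seq)[-maxlen:]            # keep at most the last maxlen tokens
        if trimmed:
            padded[i, -len(trimmed):] = trimmed  # left-pad the rest with zeros
    return padded
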
def __init__(self, train_file, validation_file, word_file_path, model_file,
             vocab_file, output_file, word2vec_path, cross_validation=False,
             cross_val_ratio=0.2, test_file=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._test_file = test_file
    self._word2vec_path = word2vec_path

    self.load_train_validation_test_data()

    batch_size = 32
    print(self._line_maxlen)

    # Build the vocabulary from the training data and reserve an index for
    # out-of-vocabulary tokens.
    self._vocab = dh.build_vocab(self.train, ignore_context=False)
    self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])
    dh.write_vocab(self._vocab_file_path, self._vocab)

    if cross_validation:
        self.train, self.validation = self.split_train_validation(
            self.train, ratio=cross_val_ratio)

    # Vectorize and pad the training and validation sets: X/tX are word
    # sequences, C/tC context sequences, D/tD dimension features.
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab,
                                                drop_dimension_index=None)
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab,
                                                     drop_dimension_index=None)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
    C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
    D = dh.pad_sequence_1d(D, maxlen=11)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
    tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
    tD = dh.pad_sequence_1d(tD, maxlen=11)

    hidden_units = 1280
    dimension_size = 300

    # Load pretrained word2vec embedding weights; the context channel reuses the
    # same matrix. (The original hardcoded the GoogleNews path here instead of
    # using the word2vec_path argument.)
    W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
                               path=self._word2vec_path)
    cW = W
    print('Word2vec obtained....')

    # Inverse-frequency class weights to counter label imbalance.
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('ratio', ratio)

    dimension_vocab = numpy.unique(D)
    print(len(dimension_vocab))

    # One-hot encode the labels.
    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_C', C.shape)
    print('train_D', D.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_C', tC.shape)
    print('validation_D', tD.shape)
    print('validation_Y', tY.shape)

    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen,
                                emb_weights=W, c_emb_weights=cW,
                                hidden_units=hidden_units, trainable=False,
                                dimension_length=11, batch_size=batch_size)

    # Persist the architecture, checkpoint on the best validation loss, and
    # decay the learning rate when validation loss plateaus.
    with open(self._model_file + 'model.json', 'w') as f:
        f.write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5',
                                save_best_only=True, monitor='val_loss')
    # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    #                            save_best_only=False)
    # early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
                                 verbose=1, mode='auto', epsilon=0.0001,
                                 cooldown=0, min_lr=0.000001)

    model.fit([C, X, D], Y, batch_size=batch_size, epochs=100,
              validation_data=([tC, tX, tD], tY), shuffle=True,
              callbacks=[save_best, lr_tuner], class_weight=ratio)

    if cross_validation:
        # NOTE: input_weight_file_path is not defined in this scope in the
        # original source and is presumably a module-level setting. The original
        # also referenced an undefined vocab_file_path; the vocab_file argument
        # is used here instead.
        t = test_model(word_file_path, model_file, vocab_file, output_file,
                       input_weight_file_path)
        t.load_trained_model()
        t.predict_cross_validation(tC, tX, tD, self.validation)
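
# A usage sketch, assuming the enclosing class is named train_model (it extends
# sarcasm_model); all paths below are illustrative placeholders, not the
# project's actual resource layout. Note that instantiation runs the full
# training loop, including model.fit.
if __name__ == '__main__':
    tr = train_model(train_file='resource/train.txt',
                     validation_file='resource/dev.txt',
                     word_file_path='resource/word_list.txt',
                     model_file='resource/model/',
                     vocab_file='resource/vocab.txt',
                     output_file='resource/output.txt',
                     word2vec_path='/home/word2vec/GoogleNews-vectors-negative300.bin')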