def add_predictions(self, dataset, name="set", logs=None):
    """Score the model on ``dataset`` and record every metric in ``logs``.

    Predictions are produced in one of three ways:

    * ``self.regression`` is truthy: raw predictions, reshaped to ``y.shape``.
    * ``y`` has more than one dimension (treated as one-hot labels): class
      predictions, compared against ``onehot_to_categories(y)``.
    * otherwise (one label per sample): predictions thresholded at 0.5.

    Each metric in ``self.metrics`` is then called as
    ``metric(y_test, y_pred)``; its squeezed result is stored under the key
    ``"<name>.<metric key>"`` in ``logs`` and the key is appended to
    ``self.params['metrics']``.

    Args:
        dataset: Indexable pair ``(X, y)`` of model inputs and labels.
        name: Prefix used for the generated metric keys.
        logs: Dict updated in place with the scores. When omitted, a fresh
            dict is used (and discarded), so separate calls never share
            state.

    Returns:
        None. Results are delivered through ``logs`` and
        ``self.params['metrics']``.
    """
    # A ``{}`` default would be one shared object mutated by every call that
    # omits ``logs``; build a new dict per call instead.
    if logs is None:
        logs = {}

    X = dataset[0]
    y = dataset[1]

    if self.regression:
        y_pred = self.model.predict(X, batch_size=2048, verbose=0)
        y_pred = numpy.reshape(y_pred, y.shape)
        y_test = y
    elif len(y.shape) > 1:
        # Labels with more than one dimension are handled as one-hot.
        try:
            y_pred = self.model.predict_classes(X, batch_size=2048, verbose=0)
        except Exception:
            # Deliberate best-effort fallback for models where
            # ``predict_classes`` is unavailable or fails: derive the classes
            # from the raw predictions with the object's own helper.
            y_pred = self.predic_classes(
                self.model.predict(X, batch_size=2048, verbose=0))
        y_test = onehot_to_categories(y)
    else:
        y_pred = self.model.predict(X, batch_size=2048, verbose=0)
        # Vectorised threshold. Flattening first means an (N, 1) prediction
        # column yields an (N,) vector of 0/1 without calling ``int()`` on
        # 1-element arrays (deprecated by NumPy).
        y_pred = (numpy.ravel(y_pred) > 0.5).astype(int)
        y_test = y

    for k, metric in self.metrics.items():
        score = numpy.squeeze(metric(y_test, y_pred))
        entry = ".".join([name, k])
        self.params['metrics'].append(entry)
        logs[entry] = score
mode="min", verbose=1, save_best_only=True)
# NOTE(review): the line above is the tail of a call that opens outside this
# view; it is reproduced untouched.

# Callbacks handed to the training call below, in registration order.
_callbacks = []
_callbacks.append(metrics_callback)
_callbacks.append(plotting)
_callbacks.append(weights)

############################################################################
# APPLY CLASS WEIGHTS
############################################################################
# For the "BD" task the labels in training[1] are passed to
# get_class_weights2 as they are, with no smoothing; for every other task
# they are first converted with onehot_to_categories and smoothed by 0.1.
# NOTE(review): presumably "BD" labels are already single-valued rather than
# one-hot - confirm against the data loader.
if TASK == "BD":
    class_weights = get_class_weights2(training[1], smooth_factor=0)
else:
    class_weights = get_class_weights2(onehot_to_categories(training[1]),
                                       smooth_factor=0.1)

# Show the weights keyed through cat_to_class_mapping instead of by the raw
# category keys.
print("Class weights:",
      {cat_to_class_mapping[c]: w for c, w in class_weights.items()})

# Train for 50 epochs with batches of 64. The validation pair is the
# validation split unless FINAL is truthy, in which case the testing split is
# used instead.
# NOTE(review): `nb_epoch` is the Keras 1.x keyword (Keras 2 renamed it to
# `epochs`) - confirm which Keras version this script targets.
history = nn_model.fit(training[0], training[1],
                       validation_data=(validation[0], validation[1])
                       if not FINAL else (testing[0], testing[1]),
                       nb_epoch=50, batch_size=64,
                       class_weight=class_weights, callbacks=_callbacks)
def run(self, train, test, features=None, test_features=None,
        extra_train=None, callbacks=True):
    """Build, train and evaluate the configured model, then save predictions.

    Steps, in order: fit the tokenizer, encode the train / test (and
    optional extra) sets, build the model selected by ``self.model_type``,
    optionally pre-train on ``extra_train``, train on ``train``, predict on
    ``test``, then log the history, print the results and save the predicted
    classes for scoring.

    Args:
        train: Training set; must expose ``.text.values``.
        test: Test set; must expose ``.text.values`` and ``.tweet_id``.
        features: Optional 2-D array of extra per-sample training features.
            When given, the model is fed ``[X_train, features]`` and
            ``features.shape[1]`` is passed on as ``features_dim``.
        test_features: Extra features for the test set; only used when
            ``features`` is given.
        extra_train: Optional additional training set (exposing
            ``.text.values``) that the model is fitted on before ``train``.
        callbacks: When truthy, ``self.get_callbacks()`` is passed to
            ``fit``; otherwise no callbacks are used.

    Returns:
        None. Results are written through ``self.logger``,
        ``self.print_results`` and ``self.save_output_for_scoring``.
    """
    # Build the complete vocabulary BEFORE encoding anything. Keras'
    # ``Tokenizer.fit_on_texts`` rebuilds the frequency-ranked ``word_index``
    # on every call, so encoding ``train``/``test`` first and fitting on
    # ``extra_train`` afterwards can leave those sequences out of step with
    # the final index (and with the embedding matrix derived from it).
    # The final tokenizer state is the same as fitting in the old order.
    self.tokenizer.fit_on_texts(train.text.values)
    if extra_train is not None:
        self.tokenizer.fit_on_texts(extra_train.text.values)

    features_dim = features.shape[1] if features is not None else None

    X_train, Y_train = self.get_features_targets(train)
    # The test set is encoded to the same width as the training set.
    X_test, Y_test = self.get_features_targets(
        test, features_dim=X_train.shape[1])

    if extra_train is not None:
        X_extra_train, Y_extra_train = self.get_features_targets(extra_train)
        extra_width = X_extra_train.shape[1]
        train_width = X_train.shape[1]
        # All three matrices must share one width, because the model is
        # built for ``X_train.shape[1]`` and fitted on both training sets.
        if extra_width > train_width:
            X_train = pad_sequences(X_train, maxlen=extra_width)
            X_test = pad_sequences(X_test, maxlen=extra_width)
        elif extra_width < train_width:
            # Previously unhandled: a narrower extra set would be fitted
            # against a model sized for the wider training set.
            X_extra_train = pad_sequences(X_extra_train, maxlen=train_width)

    vocab_size = len(self.tokenizer.word_index) + 1
    class_count = 3 if self.ternary else 2

    embedding_matrix = None
    if self.use_embeddings:
        embedding_manager = EmbeddingManager()
        embedding_matrix = embedding_manager.get_embedding_matrix(
            self.tokenizer.word_index, self.embedding_dim)

    base_model_params = {
        'input_dim': X_train.shape[1],
        'class_count': class_count,
        'features_dim': features_dim,
        'dropout': self.dropout,
    }

    if self.model_type == "elmo":
        params = {
            **base_model_params,
            'index_word': self.tokenizer.index_word,
        }
        self.model = ElmoModel().compile(**params)
    else:
        # Both embedding-based architectures take the same arguments; only
        # the class differs, with the baseline as the default.
        params = {
            **base_model_params,
            'vocab_size': vocab_size,
            'embedding_matrix': embedding_matrix,
            'embedding_dim': self.embedding_dim,
        }
        if self.model_type == "bid_attent":
            self.model = BidirectionalAttention().compile(**params)
        else:
            self.model = BaselineWithFeatures().compile(**params)

    self.logger.setup(ternary=self.ternary,
                      embeddings=self.use_embeddings,
                      train_set=X_train,
                      test_set=X_test,
                      vocab_size=vocab_size,
                      epochs=self.epochs,
                      batch_size=self.batch_size,
                      dropout=self.dropout,
                      extra_train=extra_train is not None)
    self.model.summary(print_fn=self.logger.write)

    # The class weights come from the main training labels and are reused
    # unchanged for the optional pre-training pass.
    fit_params = {
        'batch_size': self.batch_size,
        'callbacks': self.get_callbacks() if callbacks else [],
        'epochs': self.epochs,
        'validation_split': self.validation_split,
        'verbose': 1,
        'class_weight': get_class_weights2(onehot_to_categories(Y_train),
                                           smooth_factor=0),
    }

    if extra_train is not None:
        # NOTE(review): the pre-training pass feeds ``X_extra_train`` alone,
        # even when ``features`` is given - confirm the two-input model is
        # never combined with ``extra_train``.
        training = self.model.fit(X_extra_train, Y_extra_train, **fit_params)
        self.logger.write_history(training)

    train_input = [X_train, features] if features is not None else X_train
    test_input = [X_test, test_features] if features is not None else X_test

    training = self.model.fit(train_input, Y_train, **fit_params)
    # Predicted class = index of the highest score in each output row.
    pred_classes = self.model.predict(test_input, verbose=1).argmax(axis=1)

    self.logger.write_history(training)
    self.print_results(pred_classes, Y_test, class_count=class_count)
    self.save_output_for_scoring(test.tweet_id, pred_classes)