def evaluate(session, ops, previous_ops, dataset):
    losses = []
    accuracies = []
    f1_scores = []
    masked_predictions = []
    aspects = []
    ground_truths = []
    masks = []
    cm = np.zeros(shape=(args.num_classes, args.num_classes), dtype=np.int32)
    test_metrics = {}

    if args.mode == 'test':
        if args.task == 'semeval16-restaurant':
            aspect_word_index_map = RESTAURANT_ASPECT_WORD_INDEX_MAP
        elif args.task == 'semeval16-laptops':
            # TODO: change this
            aspect_word_index_map = LAPTOPS_ASPECT_WORD_INDEX_MAP
        n_sentiment_classes = args.num_classes
        n_aspect = len(aspect_word_index_map) - 1
        n_total_classes = n_aspect * (n_sentiment_classes - 1) + 1
        n_multilabel_success = 0
        n_multilabel_failure = 0
        n_sentence = 0
        args.batch_size = n_aspect
        per_aspect_sentiments_cm = np.zeros(
            shape=(n_aspect, n_sentiment_classes, n_sentiment_classes),
            dtype=np.int32)
        per_aspect_aspect_detection_cm = np.zeros(
            shape=(n_aspect + 1, 2, 2), dtype=np.int32)
        joint_aspect_sentiment_cm = np.zeros(
            shape=(n_total_classes, 2, 2), dtype=np.int32)

    for x1, x2, y in batch_iterator(dataset, args.batch_size, 1):
        # get feed_dicts
        fd = get_feed_data(x1, x2, y, is_training=False, args=args)
        # get previous feed_dicts
        previous_fd = fd.copy()
        # execute previous model
        (word_level_inputs, aspect_embedded_encoder_output,
         aspect_embedded_sentence_inputs, birnn_output) = session.run(
            [
                previous_ops['word_level_inputs'],
                previous_ops['aspect_embedded_encoder_output'],
                previous_ops['aspect_embedded_sentence_inputs'],
                previous_ops['birnn_output']
            ], previous_fd)
        # feed the previous model's activations to the current model
        fd[PREFIX + PREVIOUS_WORD_LEVEL_INPUTS_TENSOR_NAME] = word_level_inputs
        fd[PREFIX + PREVIOUS_ASPECT_EMBEDDED_ENCODER_OUTPUT_TENSOR_NAME] = aspect_embedded_encoder_output
        fd[PREFIX + PREVIOUS_ASPECT_EMBEDDED_SENTENCE_INPUTS_TENSOR_NAME] = aspect_embedded_sentence_inputs
        fd[PREFIX + PREVIOUS_SENTENCE_ENCODER_OUTPUT_TENSOR_NAME] = birnn_output
        # mirror the shared placeholders under the current model's prefix
        fd[PREFIX + ASPECTS_TENSOR_NAME] = fd[ASPECTS_TENSOR_NAME]
        fd[PREFIX + PADDED_REVIEWS_TENSOR_NAME] = fd[PADDED_REVIEWS_TENSOR_NAME]
        fd[PREFIX + ACTUAL_SENTENCE_COUNT_TENSOR_NAME] = fd[ACTUAL_SENTENCE_COUNT_TENSOR_NAME]
        fd[PREFIX + ACTUAL_WORD_COUNT_TENSOR_NAME] = fd[ACTUAL_WORD_COUNT_TENSOR_NAME]
        fd[PREFIX + SENTENCE_MASK_TENSOR_NAME] = fd[SENTENCE_MASK_TENSOR_NAME]
        fd[PREFIX + WORD_MASK_TENSOR_NAME] = fd[WORD_MASK_TENSOR_NAME]
        fd[PREFIX + PADDED_LABELS_TENSOR_NAME] = fd[PADDED_LABELS_TENSOR_NAME]
        fd[PREFIX + LABLE_WEIGHTS_TENSOR_NAME] = fd[LABLE_WEIGHTS_TENSOR_NAME]
        fd[PREFIX + IS_TRAINING_TENSOR_NAME] = fd[IS_TRAINING_TENSOR_NAME]

        # run evaluation
        val_accuracy, loss, f1_score, confusion_matrix, masked_prediction = session.run(
            [
                ops['accuracy'], ops['loss'], ops['f1_score'],
                ops['confusion_matrix'], ops['masked_predictions']
            ], fd)
        losses.append(loss)
        accuracies.append(val_accuracy)
        f1_scores.append(f1_score)
        cm += confusion_matrix
        masked_predictions.append(masked_prediction)
        aspects.append(x1)
        ground_truths.append(y)
        masks.append(fd[SENTENCE_MASK_TENSOR_NAME])

        if args.mode == 'test':
            eval_results = evaluation_metrics(
                fd[PREFIX + ASPECTS_TENSOR_NAME],
                fd[PREFIX + PADDED_REVIEWS_TENSOR_NAME],
                fd[PREFIX + PADDED_LABELS_TENSOR_NAME],
                masked_prediction, aspect_word_index_map)
            per_aspect_sentiments_cm += eval_results['per_aspect_sentiments_cm']
            per_aspect_aspect_detection_cm += eval_results['per_aspect_aspect_detection_cm']
            joint_aspect_sentiment_cm += eval_results['joint_aspect_sentiment_cm']
            n_multilabel_success += eval_results['n_multilabel_success']
            n_multilabel_failure += eval_results['n_multilabel_failure']
            n_sentence += eval_results['count']
            test_metrics = {
                'per_aspect_sentiments_cm': per_aspect_sentiments_cm,
                'per_aspect_aspect_detection_cm': per_aspect_aspect_detection_cm,
                'joint_aspect_sentiment_cm': joint_aspect_sentiment_cm,
                'n_multilabel_success': n_multilabel_success,
                'n_multilabel_failure': n_multilabel_failure,
                'n_sentence': n_sentence
            }

    results = {
        'loss': losses,
        'accuracy': accuracies,
        'f1_score': f1_scores,
        'confusion_matrix': cm,
        'masked_predictions': masked_predictions,
        'aspects': aspects,
        'ground_truths': ground_truths,
        'masks': masks,
        'test_metrics': test_metrics
    }
    return results
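
# Illustrative sketch (not part of the original pipeline): one way the dict
# returned by evaluate() could be condensed into summary numbers. The helper
# name `summarize_eval` and the confusion-matrix orientation (rows = ground
# truth, columns = predictions) are assumptions, not something the code
# above guarantees.
import numpy as np

def summarize_eval(results):
    cm = results['confusion_matrix'].astype(np.float64)
    tp = np.diag(cm)
    precision = tp / np.maximum(cm.sum(axis=0), 1e-9)  # column sums = predicted counts
    recall = tp / np.maximum(cm.sum(axis=1), 1e-9)     # row sums = true counts
    f1 = 2 * precision * recall / np.maximum(precision + recall, 1e-9)
    return {
        'mean_loss': float(np.mean(results['loss'])),
        'mean_accuracy': float(np.mean(results['accuracy'])),
        'per_class_f1': f1,
        'macro_f1': float(np.mean(f1)),
    }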
def train(config=None, reporter=None):
    """
    Main method to start training.
    Hyperparameter tuning is toggled via args.hyperparam_tune.
    :param config: contains grid-searched values for the hyperparameters to be tuned
    :param reporter: optional callback for reporting values such as accuracy or f1-score
    :return: step number of the best validation checkpoint
    """
    # set values according to the hyperparameter tuner
    if args.hyperparam_tune:
        print('Data dir : ' + DATA_DIR)
        args.lr = config['learning_rate']
        args.batch_size = config['batch_size']
        args.dropout_keep_prob = config['dropout_keep_prob']

    print(args)
    write_experiment_parameters(args)

    # https://stackoverflow.com/questions/44873273/what-do-the-options-in-configproto-like-allow-soft-placement-and-log-device-plac
    # note: `config` is reused here for the session configuration
    config = tf.ConfigProto(allow_soft_placement=True)
    # clears the default graph stack and resets the global default graph
    tf.reset_default_graph()

    with tf.Session(config=config) as session:
        # attach the tf session to keras so keras layers can be used together with tf ops
        K.set_session(session)
        # load the previously trained model
        previous_ops = model.get_previous_model(session=session, args=args)
        # get model and saver instances
        _, saver, ops = model.get_model(session=session, args=args, restore_only=False)
        # get label weights for handling class imbalance
        class_weights = calculate_class_weights()
        # create a training summary writer
        train_writer = tf.summary.FileWriter(TFLOG_DIR, graph=session.graph)

        # initializations
        val_accuracies = []
        val_per_class_accuracies = []
        val_per_class_f1_scores = []
        val_macro_f1_scores = []
        train_accuracies = []
        train_per_class_f1_scores = []
        train_per_class_accuracies = []
        train_macro_f1_scores = []
        train_confusion_matrix = np.zeros(shape=(args.num_classes, args.num_classes), dtype=np.int32)
        best_macro_f1_score = 0
        best_step_number = 0

        # start training
        for i, (x1, x2, y) in enumerate(
                batch_iterator(train_loader(args.epochs), args.batch_size)):
            t0 = time.clock()
            # calculate dynamic class weights
            if args.dynamic_class_weights:
                class_weights = calculate_class_weights(classes=y)
            # get feed_dicts
            fd = get_feed_data(x1, x2, y, class_weights=class_weights,
                               is_training=True, args=args)
            # get previous feed_dicts
            previous_fd = fd.copy()
            previous_fd[IS_TRAINING_TENSOR_NAME] = False
            # execute previous model
            (word_level_inputs, aspect_embedded_encoder_output,
             aspect_embedded_sentence_inputs, birnn_output) = session.run(
                [
                    previous_ops['word_level_inputs'],
                    previous_ops['aspect_embedded_encoder_output'],
                    previous_ops['aspect_embedded_sentence_inputs'],
                    previous_ops['birnn_output']
                ], previous_fd)
            # feed the previous model's activations to the current model
            fd[PREFIX + PREVIOUS_WORD_LEVEL_INPUTS_TENSOR_NAME] = word_level_inputs
            fd[PREFIX + PREVIOUS_ASPECT_EMBEDDED_ENCODER_OUTPUT_TENSOR_NAME] = aspect_embedded_encoder_output
            fd[PREFIX + PREVIOUS_ASPECT_EMBEDDED_SENTENCE_INPUTS_TENSOR_NAME] = aspect_embedded_sentence_inputs
            fd[PREFIX + PREVIOUS_SENTENCE_ENCODER_OUTPUT_TENSOR_NAME] = birnn_output
            # mirror the shared placeholders under the current model's prefix
            fd[PREFIX + ASPECTS_TENSOR_NAME] = fd[ASPECTS_TENSOR_NAME]
            fd[PREFIX + PADDED_REVIEWS_TENSOR_NAME] = fd[PADDED_REVIEWS_TENSOR_NAME]
            fd[PREFIX + ACTUAL_SENTENCE_COUNT_TENSOR_NAME] = fd[ACTUAL_SENTENCE_COUNT_TENSOR_NAME]
            fd[PREFIX + ACTUAL_WORD_COUNT_TENSOR_NAME] = fd[ACTUAL_WORD_COUNT_TENSOR_NAME]
            fd[PREFIX + SENTENCE_MASK_TENSOR_NAME] = fd[SENTENCE_MASK_TENSOR_NAME]
            fd[PREFIX + WORD_MASK_TENSOR_NAME] = fd[WORD_MASK_TENSOR_NAME]
            fd[PREFIX + PADDED_LABELS_TENSOR_NAME] = fd[PADDED_LABELS_TENSOR_NAME]
            fd[PREFIX + LABLE_WEIGHTS_TENSOR_NAME] = fd[LABLE_WEIGHTS_TENSOR_NAME]
            fd[PREFIX + IS_TRAINING_TENSOR_NAME] = fd[IS_TRAINING_TENSOR_NAME]

            # run session
            step, summaries, loss, accuracy, f1_score, f1_score_0, f1_score_1, f1_score_2, f1_score_3, \
                confusion_matrix, labels, predictions, label_weights, _ = session.run(
                    [
                        ops['global_step'], ops['summary_op'], ops['loss'],
                        ops['accuracy'], ops['f1_score'], ops['f1_score_0'],
                        ops['f1_score_1'], ops['f1_score_2'], ops['f1_score_3'],
                        ops['confusion_matrix'], ops['padded_labels'],
                        ops['predictions'], ops['label_weights'], ops['train_op']
                    ], fd)
            train_writer.add_summary(summaries, global_step=step)
            td = time.clock() - t0

            if args.hyperparam_tune:
                reporter(f1_score=f1_score)

            if step % args.print_frequency == 0:
                train_confusion_matrix += confusion_matrix
                print('step %s, loss=%s, accuracy=%s, f1_score=%s, t=%s, inputs=%s' %
                      (step, loss, accuracy, f1_score, round(td, 2),
                       fd[PREFIX + PADDED_REVIEWS_TENSOR_NAME].shape))

            if step != 0 and step % args.eval_frequency == 0:
                # run validation
                val_results = evaluate(session=session, ops=ops,
                                       previous_ops=previous_ops,
                                       dataset=val_loader(epochs=1))
                print_results(val_results, args, 'VALIDATION RESULTS',
                              val_accuracies, val_per_class_accuracies,
                              val_macro_f1_scores, val_per_class_f1_scores)
                # save a checkpoint on the best macro f1 score so far
                if val_macro_f1_scores[-1] >= best_macro_f1_score:
                    best_macro_f1_score = val_macro_f1_scores[-1]
                    best_step_number = step
                    print('Best Macro F1 Score : %.2f' % best_macro_f1_score)
                    print('Best step at : ' + str(best_step_number))
                    saver.save(session, CHECKPOINT_PATH, global_step=step)
                    print('checkpoint saved')
                train_results = {
                    'loss': loss,
                    'accuracy': accuracy,
                    'f1_score': f1_score,
                    'confusion_matrix': train_confusion_matrix
                }
                print_results(train_results, args, 'TRAINING RESULTS',
                              train_accuracies, train_per_class_accuracies,
                              train_macro_f1_scores, train_per_class_f1_scores)
                # reset train confusion matrix
                train_confusion_matrix = np.zeros(shape=(args.num_classes, args.num_classes), dtype=np.int32)

        val_per_class_accuracies = np.asarray(val_per_class_accuracies)
        train_per_class_accuracies = np.asarray(train_per_class_accuracies)
        val_per_class_f1_scores = np.asarray(val_per_class_f1_scores)
        train_per_class_f1_scores = np.asarray(train_per_class_f1_scores)

        plot_accuracy(val_accuracies, train_accuracies, title='Accuracy')
        plot_accuracy(val_per_class_accuracies[:, 0], train_per_class_accuracies[:, 0],
                      title='Accuracy Class 0 Positive Sentiment')
        plot_accuracy(val_per_class_accuracies[:, 1], train_per_class_accuracies[:, 1],
                      title='Accuracy Class 1 Negative Sentiment')
        plot_accuracy(val_per_class_accuracies[:, 2], train_per_class_accuracies[:, 2],
                      title='Accuracy Class 2 Neutral Sentiment')
        plot_accuracy(val_per_class_accuracies[:, 3], train_per_class_accuracies[:, 3],
                      title='Accuracy Class 3 Not Applicable Sentiment')
        plot_f1_score(val_macro_f1_scores, train_macro_f1_scores, title='Macro F1 Score')
        plot_f1_score(val_per_class_f1_scores[:, 0], train_per_class_f1_scores[:, 0],
                      title='F1 Score Class 0 Positive Sentiment')
        plot_f1_score(val_per_class_f1_scores[:, 1], train_per_class_f1_scores[:, 1],
                      title='F1 Score Class 1 Negative Sentiment')
        plot_f1_score(val_per_class_f1_scores[:, 2], train_per_class_f1_scores[:, 2],
                      title='F1 Score Class 2 Neutral Sentiment')
        plot_f1_score(val_per_class_f1_scores[:, 3], train_per_class_f1_scores[:, 3],
                      title='F1 Score Class 3 Not Applicable Sentiment')

    return best_step_number
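
# Minimal usage sketch, not the project's actual entry point: how train()
# might be driven under hyperparameter tuning. The reporter(f1_score=...)
# callback above suggests a tuning framework is used in practice; this
# hand-rolled grid search, the helper name run_grid_search, and the
# candidate values below are all hypothetical.
import itertools

def run_grid_search():
    # assumes args.hyperparam_tune is enabled so train() reads the config
    grid = {
        'learning_rate': [1e-3, 1e-4],
        'batch_size': [16, 32],
        'dropout_keep_prob': [0.5, 0.8],
    }
    reported = []

    def reporter(f1_score=None, **kwargs):
        reported.append(f1_score)

    for values in itertools.product(*grid.values()):
        config = dict(zip(grid.keys(), values))
        best_step = train(config=config, reporter=reporter)
        print('config=%s best_step=%s last_f1=%s' %
              (config, best_step, reported[-1] if reported else None))

# without tuning, train() can simply be called with no arguments: train()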
# GermEval variant of evaluate(): same bookkeeping as the SemEval version
# above, but it runs a single model directly (no stacked previous model)
# and uses the GermEval aspect map.
def evaluate(session, ops, dataset):
    losses = []
    accuracies = []
    f1_scores = []
    masked_predictions = []
    aspects = []
    ground_truths = []
    masks = []
    cm = np.zeros(shape=(args.num_classes, args.num_classes), dtype=np.int32)
    test_metrics = {}

    if args.mode == 'test':
        n_sentiment_classes = args.num_classes
        n_aspect = len(GERMEVAL_ASPECT_WORD_INDEX_MAP) - 1
        n_total_classes = n_aspect * (n_sentiment_classes - 1) + 1
        n_multilabel_success = 0
        n_multilabel_failure = 0
        n_sentence = 0
        args.batch_size = n_aspect
        per_aspect_sentiments_cm = np.zeros(
            shape=(n_aspect, n_sentiment_classes, n_sentiment_classes),
            dtype=np.int32)
        per_aspect_aspect_detection_cm = np.zeros(
            shape=(n_aspect + 1, 2, 2), dtype=np.int32)
        joint_aspect_sentiment_cm = np.zeros(
            shape=(n_total_classes, 2, 2), dtype=np.int32)

    for x1, x2, y in batch_iterator(dataset, args.batch_size, 1):
        # get feed_dicts
        fd = get_feed_data(x1, x2, y, is_training=False, args=args)
        # run evaluation
        val_accuracy, loss, f1_score, confusion_matrix, masked_prediction = session.run(
            [
                ops['accuracy'], ops['loss'], ops['f1_score'],
                ops['confusion_matrix'], ops['masked_predictions']
            ], fd)
        losses.append(loss)
        accuracies.append(val_accuracy)
        f1_scores.append(f1_score)
        cm += confusion_matrix
        masked_predictions.append(masked_prediction)
        aspects.append(x1)
        ground_truths.append(y)
        masks.append(fd[SENTENCE_MASK_TENSOR_NAME])

        if args.mode == 'test':
            eval_results = evaluation_metrics(
                fd[ASPECTS_TENSOR_NAME],
                fd[PADDED_REVIEWS_TENSOR_NAME],
                fd[PADDED_LABELS_TENSOR_NAME],
                masked_prediction, GERMEVAL_ASPECT_WORD_INDEX_MAP)
            per_aspect_sentiments_cm += eval_results['per_aspect_sentiments_cm']
            per_aspect_aspect_detection_cm += eval_results['per_aspect_aspect_detection_cm']
            joint_aspect_sentiment_cm += eval_results['joint_aspect_sentiment_cm']
            n_multilabel_success += eval_results['n_multilabel_success']
            n_multilabel_failure += eval_results['n_multilabel_failure']
            n_sentence += eval_results['count']
            test_metrics = {
                'per_aspect_sentiments_cm': per_aspect_sentiments_cm,
                'per_aspect_aspect_detection_cm': per_aspect_aspect_detection_cm,
                'joint_aspect_sentiment_cm': joint_aspect_sentiment_cm,
                'n_multilabel_success': n_multilabel_success,
                'n_multilabel_failure': n_multilabel_failure,
                'n_sentence': n_sentence
            }

    results = {
        'loss': losses,
        'accuracy': accuracies,
        'f1_score': f1_scores,
        'confusion_matrix': cm,
        'masked_predictions': masked_predictions,
        'aspects': aspects,
        'ground_truths': ground_truths,
        'masks': masks,
        'test_metrics': test_metrics
    }
    return results
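
# Illustrative sketch, not the project's own metric code: micro-averaged F1
# over the per-class binary confusion matrices accumulated in test mode
# (e.g. test_metrics['joint_aspect_sentiment_cm'] above). The [[tn, fp],
# [fn, tp]] layout of each 2x2 matrix is an assumption; swap the indices if
# evaluation_metrics() fills them the other way around.
import numpy as np

def micro_f1(binary_cms):
    cms = np.asarray(binary_cms)
    tp = cms[:, 1, 1].sum()
    fp = cms[:, 0, 1].sum()
    fn = cms[:, 1, 0].sum()
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    return 2 * precision * recall / max(precision + recall, 1e-9)

# e.g. micro_f1(results['test_metrics']['joint_aspect_sentiment_cm'])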