Example #1
import datetime

def eval_on_dataset(model,
                    dataset_path,
                    vocab_dict,
                    num_classes,
                    max_input_length,
                    steps,
                    batch_size=100):
    start_time = datetime.datetime.now()

    _generator = create_batch_generator(dataset_path, vocab_dict, num_classes,
                                        max_input_length, batch_size)
    scores = model.evaluate_generator(_generator, steps)

    end_time = datetime.datetime.now()
    elapsed_time = end_time - start_time
    print('Evaluation time on %d samples: %s' %
          (steps * batch_size, str(elapsed_time)))
    print("Loss: %1.4f. Accuracy: %.2f%% (Chance: %0.2f%%)" %
          (scores[0], scores[1] * 100, 100.0 / num_classes))

    return scores, elapsed_time
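All four examples lean on a create_batch_generator helper that is not shown on this page. The sketch below is only a guess at its contract, inferred from the call sites: it loops forever over a DBpedia-style CSV of (class index, title, body) rows and yields batches of padded token-index matrices with one-hot labels, plus titles or raw text on request. Everything beyond the parameter names visible in the examples is an assumption.

import csv
import numpy as np

def create_batch_generator(dataset_path, vocab_dict, num_classes,
                           max_input_length, batch_size,
                           return_raw_text=False, return_title=False):
    # Loop over the CSV forever, as Keras' *_generator methods expect
    while True:
        with open(dataset_path, 'r') as fi:
            X_batch, y_batch, texts, titles = [], [], [], []
            for class_ind, title, text in csv.reader(fi):
                # Map known tokens to embedding-row indices; pad/cut to length
                tokens = [vocab_dict[w] for w in text.split() if w in vocab_dict]
                tokens = (tokens + [0] * max_input_length)[:max_input_length]
                X_batch.append(tokens)
                # DBpedia class indices are 1-based; one-hot encode them
                y_onehot = np.zeros(num_classes)
                y_onehot[int(class_ind) - 1] = 1.0
                y_batch.append(y_onehot)
                texts.append(text)
                titles.append(title)
                if len(X_batch) == batch_size:
                    out = [np.array(X_batch), np.array(y_batch)]
                    if return_raw_text:
                        out.append(texts)
                    if return_title:
                        out.append(titles)
                    yield tuple(out)
                    X_batch, y_batch, texts, titles = [], [], [], []

Note that when return_title or return_raw_text is set, the extra tuple elements make the generator suitable only for manual iteration, as in Example #2, not for fit_generator or evaluate_generator.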
Example #2
    model = keras.models.load_model(model_checkpoint_path,
                                    custom_objects=_cust_objects)

    ## Run predictions
    if True:  # always-on toggle, presumably left in from debugging
        max_to_pred = 1000
        pred_res = np.zeros([max_to_pred, num_classes])
        act_res = np.zeros(max_to_pred)
        all_text = []
        all_titles = []
        print('{0}: Predicting on {1} samples'.format(datetime.datetime.now(),
                                                      max_to_pred))
        pred_generator = create_batch_generator(test_path,
                                                vocab_dict,
                                                num_classes,
                                                max_input_length,
                                                batch_size,
                                                return_raw_text=False,
                                                return_title=True)
        num_predded = 0
        for pred_inputs in pred_generator:
            X_pred, y_true, obj_title = pred_inputs
            # all_text += raw_text  # disabled: return_raw_text=False
            all_titles += obj_title
            y_preds = model.predict(X_pred)

            offset = num_predded
            num_predded += X_pred.shape[0]

            pred_res[offset:offset + y_preds.shape[0], :] = y_preds
            act_res[offset:offset + y_true.shape[0]] = np.argmax(y_true, axis=1)
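The snippet is cut off mid-loop. A plausible continuation, assumed rather than taken from the source, would break once max_to_pred samples have been scored and then summarize accuracy:

            if num_predded >= max_to_pred:
                break

        # Compare hard predictions against the stored true class indices
        pred_classes = np.argmax(pred_res[:num_predded], axis=1)
        accuracy = np.mean(pred_classes == act_res[:num_predded])
        print('Accuracy on {0} predicted samples: {1:.4f}'.format(
            num_predded, accuracy))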
Example #3
                                 embedding_trainable=embedding_trainable)

        model.compile(loss=loss_, optimizer=optimizer_, metrics=log_metrics)
        #-----------------------#

    print('Model summary')
    print(model.summary())

    ## Training
    if initial_epoch < epochs:
        training_start_time = datetime.datetime.now()
        print('{0}: Starting training at epoch {1}/{2}'.format(
            training_start_time, initial_epoch, epochs))

        train_generator = create_batch_generator(train_path, vocab_dict,
                                                 num_classes, max_input_length,
                                                 batch_size)
        history = model.fit_generator(train_generator,
                                      batches_per_epoch,
                                      epochs,
                                      callbacks=_callbacks,
                                      initial_epoch=initial_epoch)

        training_end_time = datetime.datetime.now()
        print('{0}: Training finished at epoch {1}'.format(
            training_end_time, epochs))
        training_time = training_end_time - training_start_time
        print('{0} elapsed to train {1} epochs'.format(str(training_time),
                                                       epochs - initial_epoch))

    ## Evaluation of final model
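This example and the next both call a build_lstm_model factory that is not shown here. Given the 'cnn_lstm_denovo_trainable_embed' model tag and the word2vec embedding matrix in Example #4, a minimal sketch could look like the following; the Conv1D/LSTM layout and all layer sizes are assumptions:

import keras
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

def build_lstm_model(vocab_size, num_outputs, embedding_matrix,
                     embedding_trainable=True, max_input_length=None):
    # Hypothetical CNN+LSTM text classifier over pre-trained word2vec vectors
    model = keras.models.Sequential()
    model.add(Embedding(vocab_size,
                        embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        input_length=max_input_length,
                        trainable=embedding_trainable))
    model.add(Conv1D(128, 5, activation='relu'))  # local n-gram features
    model.add(MaxPooling1D(2))
    model.add(LSTM(128))  # summarize the sequence
    model.add(Dense(num_outputs, activation='softmax'))
    return model

The model is returned uncompiled, matching Example #4, where compile(loss=loss_, optimizer=optimizer_, metrics=log_metrics) is called on it afterwards.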
Example #4
def main_func(max_input_length,
              batch_size,
              batches_per_epoch,
              epochs,
              loss_,
              optimizer_,
              _config,
              do_final_eval=True):

    # These values would normally be unpacked from the Sacred '_config' dict
    # (see the disabled loop below); they are hard-coded here instead.

    model_tag = 'cnn_lstm_denovo_trainable_embed'
    train_path = './dbpedia_csv/train_shuf.csv'
    test_path = './dbpedia_csv/test_shuf.csv'
    class_labels = './dbpedia_csv/classes.txt'
    google_word2vec = '/home/denys/word2vec-GoogleNews-vectors/GoogleNews-vectors-negative300.bin.gz'

    #string_keys = ['model_tag', 'train_path', 'test_path', 'class_labels', 'google_word2vec']
    # for key in string_keys:
    #    exec('%s = "%s"' % (key, _config[key]))

    # Dynamically created logging directories
    log_dir = './keras_logs_%s' % model_tag
    train_log_dir = '%s/train' % log_dir
    val_log_dir = '%s/val' % log_dir
    custom_log_dir = '%s/custom' % log_dir
    model_dir = 'models_%s' % model_tag
    model_path = os.path.join(model_dir,
                              'word2vec_%s_{epoch:02d}.hdf5' % model_tag)

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # Logging
    # Create callback and logging objects
    log_metrics = [
        'categorical_accuracy', 'categorical_crossentropy', brier_pred,
        brier_true
    ]
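    # brier_pred and brier_true are custom metric callables defined elsewhere
    # in this project, mixed in with Keras' built-in string metric names.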
    model_saver = keras.callbacks.ModelCheckpoint(model_path, verbose=1)
    # Log savers which play reasonably well with Keras
    train_tboard_logger = FilterTensorBoard(log_dir=train_log_dir,
                                            write_graph=False,
                                            write_images=False,
                                            log_regex=r'^(?!val).*')
    val_tboard_logger = FilterTensorBoard(log_dir=val_log_dir,
                                          write_graph=False,
                                          write_images=False,
                                          log_regex=r"^val")
    # Custom saver
    custom_tboard_saver = TensorBoardMod(log_dir=custom_log_dir,
                                         histogram_freq=0,
                                         write_graph=False,
                                         write_images=False,
                                         save_logs=False)
    _callbacks = [
        model_saver, train_tboard_logger, val_tboard_logger,
        custom_tboard_saver
    ]
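    # Net effect of the three loggers: metrics without a 'val' prefix go to
    # train_log_dir, 'val'-prefixed metrics go to val_log_dir, and the
    # TensorBoardMod callback writes its own summaries to custom_log_dir.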

    # Parameters fed using Sacred
    vocab_model = create_vocab_model()

    ## Main training and testing
    embedding_matrix = vocab_model.syn0
    vocab_dict = {
        word: vocab_model.vocab[word].index
        for word in vocab_model.vocab.keys()
    }
    vocab_size = len(vocab_dict)

    # Load class label dictionary
    class_ind_to_label = {}
    with open(class_labels, 'r') as cfi:
        for ind, line in enumerate(cfi):
            class_ind_to_label[ind] = line.rstrip()
    num_classes = len(class_ind_to_label)

    ## Create or load the model
    last_epoch, model_checkpoint_path = find_last_checkpoint(model_dir)
    initial_epoch = 0
    if model_checkpoint_path is not None:
        print('Loading epoch {0:d} from {1:s}'.format(last_epoch,
                                                      model_checkpoint_path))
        _cust_objects = {
            'brier_skill': brier_skill,
            'brier_pred': brier_pred,
            'brier_true': brier_true
        }
        model = keras.models.load_model(model_checkpoint_path,
                                        custom_objects=_cust_objects)
        initial_epoch = last_epoch + 1
    else:
        print('Building new model')
        #----------------------#
        model = build_lstm_model(vocab_size,
                                 num_outputs=num_classes,
                                 embedding_matrix=embedding_matrix)

        model.compile(loss=loss_, optimizer=optimizer_, metrics=log_metrics)
        #-----------------------#

    print('Model summary')
    print(model.summary())

    ## Custom tensorflow logging
    # Placeholder for the true values
    y_true = model.model._feed_targets[0]
    # This is the final softmax output layer of the model
    y_pred = model.outputs[0]

    create_batch_pairwise_metrics(y_true, y_pred)

    ## Training
    if initial_epoch < epochs:
        training_start_time = datetime.datetime.now()
        print('{0}: Starting training at epoch {1}/{2}'.format(
            training_start_time, initial_epoch, epochs))

        train_generator = create_batch_generator(train_path, vocab_dict,
                                                 num_classes, max_input_length,
                                                 batch_size)

        val_size = 1000
        val_generator = create_batch_generator(test_path, vocab_dict,
                                               num_classes, max_input_length,
                                               val_size)
        val_X, val_y = next(val_generator)

        print(len(val_X))
        print(len(val_y[0]))

        model.fit_generator(train_generator,
                            batches_per_epoch,
                            epochs,
                            callbacks=_callbacks,
                            initial_epoch=initial_epoch,
                            verbose=1)
        # validation_data=(val_X, val_y): disabled kwarg of the fit_generator call above

        training_end_time = datetime.datetime.now()
        print('{0}: Training finished at epoch {1}'.format(
            training_end_time, epochs))
        training_time = training_end_time - training_start_time
        print('{0} elapsed to train {1} epochs'.format(str(training_time),
                                                       epochs - initial_epoch))

    ## Evaluation of final model
    if do_final_eval:
        num_test_samples = 1000
        num_test_steps = num_test_samples // batch_size
        num_test_samples = num_test_steps * batch_size
        print('{0}: Starting testing on {1} samples'.format(
            datetime.datetime.now(), num_test_samples))
        test_scores, test_time = eval_on_dataset(model, test_path, vocab_dict,
                                                 num_classes, max_input_length,
                                                 num_test_steps, batch_size)
        time_per_sample = test_time.total_seconds() / num_test_samples
        print("Seconds per sample: %2.2e sec" % time_per_sample)