Exemple #1
0
def benchmark_reader():
    """Benchmark every reader model/type combination and write a CSV report.

    The evaluation documents and labels are indexed once into an
    "elasticsearch" document store; each ``(reader_name, reader_type)`` pair
    from the module-level ``reader_models`` / ``reader_types`` is then
    evaluated against it on ``device="cuda"``.

    A run that raises is recorded as a zeroed row carrying the error message,
    so a single failing model does not abort the whole benchmark.
    ``reader_results.csv`` is rewritten after every run (presumably so that
    partial results are kept if a later run crashes the process).
    """
    # Local import: this block cannot assume a module-level logger exists.
    import logging
    log = logging.getLogger(__name__)

    reader_results = []
    doc_store = get_document_store("elasticsearch")
    docs, labels = eval_data_from_file(data_dir / filename)
    index_to_doc_store(doc_store, docs, None, labels)

    for reader_name in reader_models:
        for reader_type in reader_types:
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                results["passages_per_second"] = (
                    n_passages / results["reader_time"])
                results["reader"] = reader_name
                results["error"] = ""
            except Exception as e:
                # Make the failure visible instead of only burying it in the
                # CSV; the benchmark still continues with the next reader.
                log.exception("Reader run failed - model: %s, type: %s",
                              reader_name, reader_type)
                results = {
                    'EM': 0.,
                    'f1': 0.,
                    'top_n_accuracy': 0.,
                    'top_n': 0,
                    'reader_time': 0.,
                    "passages_per_second": 0.,
                    "seconds_per_query": 0.,
                    'reader': reader_name,
                    # Keep only the message: holding the exception object
                    # would keep its traceback (and every frame local it
                    # references) alive for the rest of the benchmark.
                    "error": str(e),
                }
            reader_results.append(results)

            # Rewrite the report after every run, not just at the end.
            reader_df = pd.DataFrame.from_records(reader_results)
            reader_df.to_csv("reader_results.csv")
Exemple #2
0
def benchmark_reader(ci=False,
                     update_json=False,
                     save_markdown=False,
                     **kwargs):
    """Benchmark every reader model/type combination and save the results.

    Args:
        ci: When true, use the reduced ``reader_models_ci`` list and cap the
            evaluation set at 100 documents; the passage count is then
            estimated proportionally from the full-dataset totals.
        update_json: When true, call ``populate_reader_json()`` after all
            runs have finished.
        save_markdown: When true, also write the results table as markdown
            next to ``results_file`` (``.csv`` replaced by ``.md``).
        **kwargs: Accepted for call compatibility; not read by this function.

    A run that raises is recorded as a zeroed row carrying the error message,
    so a single failing model does not abort the whole benchmark. The result
    files are rewritten after every run.
    """

    def failed_run_record(name, message):
        # Placeholder row with the same columns as a successful run.
        return {
            'EM': 0.,
            'f1': 0.,
            'top_n_accuracy': 0.,
            'top_n': 0,
            'reader_time': 0.,
            "passages_per_second": 0.,
            "seconds_per_query": 0.,
            'reader': name,
            "error": message,
        }

    if ci:
        reader_models = reader_models_ci
        max_docs = 100
        # heuristic to estimate num of passages for the reduced num of docs
        n_passages = n_total_passages * (max_docs / n_total_docs)
    else:
        reader_models = reader_models_full
        max_docs = None
        n_passages = n_total_passages

    reader_results = []
    doc_store = get_document_store("elasticsearch")
    # download squad data
    _download_extract_downstream_data(input_file=data_dir / filename)
    docs, labels = eval_data_from_file(data_dir / filename, max_docs)

    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            logger.info(
                f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### "
            )
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                print(results)
                results["passages_per_second"] = (
                    n_passages / results["reader_time"])
                results["reader"] = reader_name
                results["error"] = ""
            except Exception as e:
                # Make the failure visible in the log; the benchmark still
                # continues with the next reader.
                logger.exception("Reader run failed - model: %s, type: %s",
                                 reader_name, reader_type)
                # Store only the message: holding the exception object would
                # keep its traceback (and every frame local it references)
                # alive for the rest of the benchmark.
                results = failed_run_record(reader_name, str(e))
            reader_results.append(results)

            # Rewrite the report(s) after every run, not just at the end.
            reader_df = pd.DataFrame.from_records(reader_results)
            reader_df.to_csv(results_file)
            if save_markdown:
                md_file = results_file.replace(".csv", ".md")
                with open(md_file, "w") as f:
                    f.write(str(reader_df.to_markdown()))

    if update_json:
        populate_reader_json()
Exemple #3
0
def train_and_eval_from_config(common_config, model_config, model_output_dir):
    """Build, train and save a model described by the given configuration.

    Two models with the same architecture are created: one wired to the
    training input tensors and one wired to the evaluation input tensors.
    The training model is fitted with an ``EvaluateInputTensor`` callback
    that receives the evaluation model. Afterwards the training model's
    architecture (JSON) and weights are written to ``model_output_dir``.

    Args:
        common_config: Mapping read for ``classifier``, ``feature_names``,
            ``feature_sizes``, ``num_classes``, ``frame_features``,
            ``training_data_path``, ``evaluation_data_path``, ``batch_size``,
            ``num_readers``, ``num_epochs``, ``training_steps_per_epoch`` and
            ``eval_steps_per_epoch``; it is also passed to ``create_model``.
        model_config: Mapping whose first key is the model name; it is also
            passed to ``create_model``.
        model_output_dir: Directory that receives ``OUTPUT_MODEL_FILENAME``
            and ``OUTPUT_WEIGHTS_FILENAME``.

    Raises:
        ValueError: If ``common_config['classifier']`` is neither
            ``'multiclass'`` nor ``'binary'``.
    """
    # Loss function for each supported classifier type.
    loss_by_classifier = {
        'multiclass': 'categorical_crossentropy',
        'binary': 'binary_crossentropy',
    }
    classifier = common_config['classifier']
    if classifier not in loss_by_classifier:
        # Fail before building any graph: an unknown classifier would leave
        # both models uncompiled and only surface later inside fit().
        raise ValueError(
            "Unsupported classifier {0!r}; expected one of {1}".format(
                classifier, sorted(loss_by_classifier)))
    loss = loss_by_classifier[classifier]

    # for reproducibility
    tf.set_random_seed(0)

    # Load Data from tfrecord
    # first get the reader based on the config: frame feature reader or
    # aggregated feature reader
    reader = get_reader(common_config['feature_names'],
                        common_config['feature_sizes'],
                        common_config['num_classes'],
                        common_config['frame_features'])

    # get the input data tensors (only the features and labels are used here)
    _video_id, model_input_raw, labels_batch, _num_frames = (
        get_input_training_tensors(
            reader,
            common_config['training_data_path'],
            batch_size=common_config['batch_size'],
            num_readers=common_config['num_readers'],
            num_epochs=common_config['num_epochs']
        ))

    tf.summary.histogram("model/input_raw", model_input_raw)

    # create model
    model_name = next(iter(model_config))
    train_model = ModelFactory().get_model(model_name).create_model(
        model_input_raw, common_config, model_config)

    # compile the model
    # Pass the target tensor `labels_batch` to train_model.compile
    # via the `target_tensors` keyword argument
    train_model.compile(optimizer='adam',
                        loss=loss,
                        metrics=['accuracy'],
                        target_tensors=[labels_batch])
    train_model.summary()

    # get the evaluation data tensors
    _eval_video_id, eval_model_input_raw, eval_labels_batch, _eval_num_frames = (
        get_input_evaluation_tensors(
            reader,
            common_config['evaluation_data_path'],
            batch_size=common_config['batch_size'],
            num_readers=common_config['num_readers']))

    tf.summary.histogram("eval/input_raw", eval_model_input_raw)

    # create a separate model for evaluation
    eval_model = ModelFactory().get_model(model_name).create_model(
        eval_model_input_raw, common_config, model_config)

    # compile the eval model
    # Pass the target tensor `eval_labels_batch` to eval_model.compile
    # via the `target_tensors` keyword argument
    eval_model.compile(optimizer='adam',
                       loss=loss,
                       metrics=['accuracy'],
                       target_tensors=[eval_labels_batch])
    eval_model.summary()

    # get the session..
    from keras import backend as K
    sess = K.get_session()

    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    # Fit the model using data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        try:
            train_model.fit(
                epochs=common_config['num_epochs'],
                steps_per_epoch=common_config['training_steps_per_epoch'],
                callbacks=[EvaluateInputTensor(
                    eval_model,
                    steps=common_config['eval_steps_per_epoch'])],
                verbose=2)
        except tf.errors.OutOfRangeError:
            logging.info("Done training -- epoch limit reached.")

        # save the model
        # 1. serialize model to JSON
        model_json = train_model.to_json()
        model_file_to_save = os.path.join(model_output_dir,
                                          OUTPUT_MODEL_FILENAME)
        with open(model_file_to_save, "w") as json_file:
            json_file.write(model_json)

        # 2. save the model weights
        weights_file_to_save = os.path.join(model_output_dir,
                                            OUTPUT_WEIGHTS_FILENAME)
        train_model.save_weights(weights_file_to_save)
        logging.info("Saved model and weights to " + model_output_dir)
    finally:
        # Clean up the TF session even when training or saving failed, so
        # the queue-runner threads are stopped and the session is cleared.
        coord.request_stop()
        coord.join(threads)
        K.clear_session()
Exemple #4
0
def test_from_saved_model(test_data_path, batch_size, num_readers, num_epochs,
                          feature_names, feature_sizes, num_classes,
                          frame_features, model_output_dir, saved_model_file,
                          saved_weights_file):
    """Load a saved model and report its accuracy on the test data.

    The architecture is read from ``saved_model_file`` (JSON) and the weights
    from ``saved_weights_file``, both located in ``model_output_dir``. The
    model is evaluated repeatedly, 100 steps at a time, on L2-normalized test
    input tensors until the coordinator is asked to stop or the input raises
    ``OutOfRangeError``. The accuracy of the last completed ``evaluate`` call
    is printed.

    Args:
        test_data_path: Location of the test data, passed to
            ``get_input_test_tensors``.
        batch_size: Batch size for the test input tensors.
        num_readers: Number of readers for the test input tensors.
        num_epochs: Not read by this function.
        feature_names, feature_sizes, num_classes, frame_features: Passed to
            ``utils.get_reader`` to build the record reader.
        model_output_dir: Directory containing the saved model files (with or
            without a trailing ``/``).
        saved_model_file: File name of the model JSON.
        saved_weights_file: File name of the model weights.
    """
    # get the session..
    from keras import backend as K
    sess = K.get_session()

    # for reproducibility
    tf.set_random_seed(0)
    # Load Data from tfrecord
    # first get the reader based on the config: frame feature reader or
    # aggregated feature reader
    reader = utils.get_reader(feature_names, feature_sizes, num_classes,
                              frame_features)

    # get input test tensors (only the features and labels are used here)
    _video_id_batch, test_model_input_raw, test_labels_batch, _test_num_frames = (
        get_input_test_tensors(reader,
                               test_data_path,
                               batch_size=batch_size,
                               num_readers=num_readers))
    tf.summary.histogram("test/model_input_raw", test_model_input_raw)

    # Normalize input features along the last axis.
    feature_dim = len(test_model_input_raw.get_shape()) - 1
    test_model_input_raw = tf.nn.l2_normalize(test_model_input_raw,
                                              feature_dim)

    # TODO see if this cast is needed
    test_labels_batch = tf.cast(test_labels_batch, tf.float32)

    # Build the file paths, adding the separator only when it is missing.
    if model_output_dir.endswith('/'):
        output_prefix = model_output_dir
    else:
        output_prefix = model_output_dir + '/'
    model_file_to_load = output_prefix + saved_model_file
    weights_file_to_load = output_prefix + saved_weights_file

    # load json and create model
    with open(model_file_to_load, 'r') as json_file:
        loaded_model_json = json_file.read()
    test_model = model_from_json(loaded_model_json)
    # load weights into new model
    test_model.load_weights(weights_file_to_load)

    # compile the model
    test_model.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    test_model.summary()

    # Only local variables are initialized here; the global initializer was
    # left disabled in the original (presumably because it would overwrite
    # the weights that were just loaded -- confirm before enabling).
    sess.run(tf.local_variables_initializer())
    # evaluate the model using test data from the TFRecord data tensors.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord)

    # Stays None when no evaluate() call completes, so the report below can
    # say so instead of failing on an unbound name.
    acc = None
    try:
        try:
            while not coord.should_stop():
                _loss, acc = test_model.evaluate(test_model_input_raw,
                                                 test_labels_batch,
                                                 steps=100,
                                                 verbose=1)
        except tf.errors.OutOfRangeError:
            logging.info("Done testing -- limit reached.")
    finally:
        # Clean up the TF session even when evaluation failed.
        coord.request_stop()
        coord.join(threads)
        K.clear_session()

    if acc is None:
        logging.warning(
            "No evaluation pass completed; test accuracy is unavailable.")
    else:
        print('\nTest accuracy: {0}'.format(acc))