Example #1
def test_pool_edges_model():
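    # Train a PooledEdgesModel with MultiMarginLoss: keep only questions that
    # have at least one graph with a positive F1 score and use the index of
    # the highest-scoring graph as the classification target.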
    encoder = ConvWordsEncoder(*wordembeddings.shape)
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = PooledEdgesModel(encoder)
    criterion = nn.MultiMarginLoss()

    container = fackel.TorchContainer(torch_model=net,
                                      batch_size=8,
                                      max_epochs=5,
                                      model_checkpoint=False,
                                      early_stopping=5,
                                      criterion=criterion)

    selected_questions = [
        s for s in training_dataset
        if any(scores[2] > 0.0 for g, scores in s.graphs)
    ]
    targets = np.zeros(len(selected_questions), dtype=np.int32)
    for qi, q in enumerate(selected_questions):
        random.shuffle(q.graphs)
        targets[qi] = np.argsort([g.scores[2] for g in q.graphs])[::-1][0]

    train_questions = V.encode_batch_questions(
        selected_questions, word2idx)[..., 0, :]
    train_edges = V.encode_batch_graphs(
        selected_questions, word2idx)[..., 0, :]

    container.train(train=(train_questions, train_edges),
                    train_targets=targets)
Example #2
def test_variable_margin_loss():
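    # Train a OneEdgeModel with the custom VariableMarginLoss: the target for
    # each question is the vector of F1 scores of its first 100 candidate graphs.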
    encoder = ConvWordsEncoder(*wordembeddings.shape)
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = OneEdgeModel(encoder)
    criterion = losses.VariableMarginLoss()

    container = fackel.TorchContainer(torch_model=net,
                                      batch_size=8,
                                      max_epochs=5,
                                      model_checkpoint=False,
                                      early_stopping=5,
                                      criterion=criterion)

    training_dataset = [
        s for s in dataset if any(scores[2] > 0.0 for g, scores in s.graphs)
    ]
    train_questions = V.encode_batch_questions(
        training_dataset, word2idx)[..., 0, :]
    train_edges = V.encode_batch_graphs(
        training_dataset, word2idx)[..., 0, 0, :]
    targets = np.zeros((len(training_dataset), 100))
    for qi, q in enumerate(training_dataset):
        random.shuffle(q.graphs)
        for gi, g in enumerate(q.graphs[:100]):
            targets[qi, gi] = g.scores[2]

    container.train(train=(train_questions, train_edges),
                    train_targets=targets)
Example #3
def test_metrics():
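    # Train with a custom metrics callback that reports top-1 accuracy and, on
    # the validation pass, the average F1 of the graph predicted per question.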
    encoder = ConvWordsEncoder(*wordembeddings.shape)
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = PooledEdgesModel(encoder)
    criterion = nn.MultiMarginLoss()

    def metrics(targets, predictions, validation=False):
        _, predicted_targets = torch.topk(predictions, 1, dim=-1)
        # _, targets = torch.topk(targets, 1, dim=-1)
        predicted_targets = predicted_targets.squeeze(1)
        cur_acc = torch.sum(predicted_targets == targets).float()
        cur_acc /= predicted_targets.size(0)
        cur_f1 = 0.0

        if validation:
            for i, q in enumerate(training_dataset):
                if i < predicted_targets.size(0):
                    idx = predicted_targets.data[i]
                    if idx < len(q.graphs):
                        cur_f1 += q.graphs[idx].scores[2]
            cur_f1 /= targets.size(0)
        return {'acc': cur_acc.data[0], 'f1': cur_f1}

    container = fackel.TorchContainer(torch_model=net,
                                      batch_size=8,
                                      max_epochs=5,
                                      model_checkpoint=False,
                                      early_stopping=5,
                                      criterion=criterion,
                                      metrics=metrics)

    selected_questions = [
        s for s in training_dataset
        if any(scores[2] > 0.0 for g, scores in s.graphs)
    ]
    targets = np.zeros(len(selected_questions), dtype=np.int32)
    for qi, q in enumerate(selected_questions):
        random.shuffle(q.graphs)
        targets[qi] = np.argsort([g.scores[2] for g in q.graphs])[::-1][0]

    train_questions = V.encode_batch_questions(
        selected_questions, word2idx)[..., 0, :]
    train_edges = V.encode_batch_graphs(
        selected_questions, word2idx)[..., 0, :]

    container.train(train=(train_questions, train_edges),
                    train_targets=targets,
                    dev=(train_questions, train_edges),
                    dev_targets=targets)
Example #4
def test_load_parameters():
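    # Save the model to disk, reload it and check that the dropout
    # hyperparameter survives the round trip.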
    encoder = ConvWordsEncoder(*wordembeddings.shape)
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = GNNModel(encoder, hp_dropout=0.2)
    criterion = nn.MultiMarginLoss(margin=0.5)

    container = fackel.TorchContainer(torch_model=net,
                                      batch_size=8,
                                      max_epochs=5,
                                      model_checkpoint=False,
                                      save_to_dir="../trainedmodels/",
                                      early_stopping=5,
                                      criterion=criterion,
                                      init_model_weights=True,
                                      lr_decay=2)
    container.save_model()
    container.reload_from_saved()
    assert container._model._gnn._prop_model._dropout.p == 0.2
Example #5
def test_stagg_model():
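    # Train a STAGGModel that additionally consumes structural features,
    # using CrossEntropyLoss with all-zero dummy targets.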
    encoder = ConvWordsEncoder(*wordembeddings.shape)
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = STAGGModel(encoder)
    criterion = nn.CrossEntropyLoss()

    container = fackel.TorchContainer(torch_model=net,
                                      batch_size=8,
                                      max_epochs=5,
                                      model_checkpoint=False,
                                      early_stopping=5,
                                      criterion=criterion)

    train_questions = V.encode_batch_questions(training_dataset, word2idx)
    train_edges = V.encode_batch_graphs(
        training_dataset, word2idx)[..., 0, :, :]
    train_features = V.encode_structural_features(training_dataset)

    container.train(train=(train_questions, train_edges, train_features),
                    train_targets=np.zeros(len(training_dataset),
                                           dtype=np.int32))
Example #6
def test_gnn():
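    # Train a non-gated GNNModel on the encoded graph structures with
    # MultiMarginLoss and all-zero dummy targets.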
    encoder = ConvWordsEncoder(*wordembeddings.shape)
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = GNNModel(encoder, hp_gated=False)
    criterion = nn.MultiMarginLoss(margin=0.5)

    container = fackel.TorchContainer(torch_model=net,
                                      batch_size=8,
                                      max_epochs=5,
                                      model_checkpoint=False,
                                      early_stopping=5,
                                      criterion=criterion,
                                      init_model_weights=True,
                                      lr_decay=2)

    train_questions = V.encode_batch_questions(
        training_dataset, word2idx)[..., 0, :]
    train_graphs = V.encode_batch_graph_structure(training_dataset, word2idx)
    targets = np.zeros(len(training_dataset), dtype=np.int32)

    container.train(train=(train_questions, *train_graphs),
                    train_targets=targets)
Example #7
def train(config_file_path, seed, gpuid, model_description, experiment_tag):
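    # End-to-end training entry point: load the config and datasets, encode
    # questions and graphs for the configured model type, train with
    # VariableMarginLoss, evaluate on the validation split and append a
    # summary line to the shared results file.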
    config, logger = config_utils.load_config(config_file_path, seed, gpuid)
    if "training" not in config:
        print("Training parameters not in the config file!")
        sys.exit()

    results_logger = None
    if 'log.results' in config['training']:
        results_logger = logging.getLogger("results_logger")
        results_logger.setLevel(logging.INFO)
        fh = logging.FileHandler(filename=config['training']['log.results'])
        fh.setLevel(logging.INFO)
        results_logger.addHandler(fh)
        results_logger.info(str(config))

    # Load data
    if not isinstance(config['training']["path_to_dataset"], list):
        config['training']["path_to_dataset"] = [
            config['training']["path_to_dataset"]
        ]
    training_dataset = []
    for path_to_train in config['training']["path_to_dataset"]:
        with open(path_to_train) as f:
            training_dataset += json.load(f, object_hook=sentence_object_hook)
    logger.info(f"Train: {len(training_dataset)}")
    train_size_available = len(training_dataset)
    dataset_name = config['training']["path_to_dataset"][0].split(
        "/")[-1].split(".")[0]

    if "path_to_validation" not in config['training']:
        config['training']["path_to_validation"] = config['training'][
            "path_to_dataset"][-1]
        logger.info(f"No validation set, using part of the training data.")
    with open(config['training']["path_to_validation"]) as f:
        val_dataset = json.load(f, object_hook=sentence_object_hook)
    logger.info(f"Validation: {len(val_dataset)}")
    val_size_available = len(val_dataset)

    wordembeddings, word2idx = V.extend_embeddings_with_special_tokens(
        *_utils.load_word_embeddings(
            _utils.RESOURCES_FOLDER +
            "../../resources/embeddings/glove/glove.6B.100d.txt"))
    logger.info(f"Loaded word embeddings: {wordembeddings.shape}")

    model_type = config['training']["model_type"]
    logger.info(f"Model type: {model_type}")

    V.MAX_NEGATIVE_GRAPHS = 50
    training_dataset = [
        s for s in training_dataset
        if any(scores[2] > losses.MIN_TARGET_VALUE for g, scores in s.graphs)
    ]
    training_samples, training_targets = pack_data(training_dataset, word2idx,
                                                   model_type)
    logger.info(f"Data encoded: {[m.shape for m in training_samples]}")

    V.MAX_NEGATIVE_GRAPHS = 100
    val_dataset = [
        s for s in val_dataset
        if any(scores[2] > losses.MIN_TARGET_VALUE for g, scores in s.graphs)
    ]
    print(
        f"Val F1 upper bound: {np.average([q.graphs[0].scores[2] for q in val_dataset])}"
    )
    val_samples, val_targets = pack_data(val_dataset, word2idx, model_type)
    logger.info(f"Val data encoded: {[m.shape for m in val_samples]}")

    encoder = models.ConvWordsEncoder(hp_vocab_size=wordembeddings.shape[0],
                                      hp_word_emb_size=wordembeddings.shape[1],
                                      **config['model'])
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = getattr(models, model_type)(encoder, **config['model'])

    def metrics(targets, predictions, validation=False):
        _, predicted_targets = torch.topk(predictions, 1, dim=-1)
        _, targets = torch.topk(targets, 1, dim=-1)
        predicted_targets = predicted_targets.squeeze(1)
        targets = targets.squeeze(1)
        cur_acc = torch.sum(predicted_targets == targets).float()
        cur_acc /= predicted_targets.size(0)
        cur_f1 = 0.0
        if validation:
            for i, q in enumerate(val_dataset):
                if i < predicted_targets.size(0):
                    idx = predicted_targets.data[i]
                    if abs(idx) < len(q.graphs):
                        cur_f1 += q.graphs[idx].scores[2]
            cur_f1 /= predicted_targets.size(0)
        return {
            'acc': cur_acc.data[0],
            'f1': cur_f1,
            'predictions': predicted_targets.data.unsqueeze(0)
        }

    # Save models into model specific directory
    if "save_to_dir" in config['training']:
        now = datetime.datetime.now()
        model_gated = net._gnn.hp_gated if model_type == "GNNModel" else False
        config['training']['save_to_dir'] = config['training']['save_to_dir'] + \
                                            f"{'g' if model_gated else ''}" \
                                            f"{model_type.lower()}s_{now.year}Q{now.month // 4 + 1}/"
        if not os.path.exists(config['training']['save_to_dir']):
            os.makedirs(config['training']['save_to_dir'])
    container = fackel.TorchContainer(
        torch_model=net,
        criterion=losses.VariableMarginLoss(),
        # criterion=nn.MultiMarginLoss(margin=0.5, size_average=False),
        metrics=metrics,
        optimizer_params={
            'weight_decay': 0.05,
            # 'lr': 0.01
        },
        optimizer="Adam",
        logger=logger,
        init_model_weights=True,
        description=model_description,
        **config['training'])

    if results_logger:
        results_logger.info("Model save to: {}".format(
            container._save_model_to))

    log_history = container.train(training_samples,
                                  training_targets,
                                  dev=val_samples,
                                  dev_targets=val_targets)

    for q in val_dataset:
        random.shuffle(q.graphs)
    if container._model_checkpoint:
        container.reload_from_saved()
    val_samples, val_targets = pack_data(val_dataset, word2idx, model_type)
    predictions = container.predict_batchwise(*val_samples)
    results = metrics(*container._torchify_data(True, val_targets),
                      predictions,
                      validation=True)
    _, predictions = torch.topk(predictions, 1, dim=-1)
    print(f"Acc: {results['acc']}, F1: {results['f1']}")
    print(f"Predictions head: {predictions.data[:10].view(1,-1)}")

    model_name = container._save_model_to.name
    model_gated = container._model._gnn.hp_gated if model_type == "GNNModel" else False
    # Append a summary of this training run to the shared results file
    if "add.results.to" in config['training']:
        print(
            f"Adding training results to {config['training']['add.results.to']}"
        )
        with open(config['training']["add.results.to"], 'a+') as results_out:
            results_out.write(",".join([
                model_name, model_type, "Gated" if model_gated else "Simple",
                model_description,
                str(seed), dataset_name,
                f"{len(training_dataset)}/{train_size_available}",
                f"{len(val_dataset)}/{val_size_available}",
                str(len(log_history)),
                str(results['acc']),
                str(results['f1']), experiment_tag
            ]))
            results_out.write("\n")
    # Print out the model path for the evaluation script to pick up
    print(container._save_model_to)
Example #8
def generate(path_to_model, config_file_path):
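    # Run a trained model over the evaluation questions: link entities (or use
    # the gold annotations), generate candidate graphs with beam search, take
    # the first graph with a non-empty denotation and report precision, recall
    # and F1 against the gold answers.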

    config, logger = config_utils.load_config(config_file_path)
    if "evaluation" not in config:
        print("Evaluation parameters not in the config file!")
        sys.exit()

    with open(config['evaluation']['questions']) as f:
        webquestions_questions = json.load(f)

    entitylinker = None
    if 'entity.linking' in config:
        PATH_EL = "../../entity-linking/"
        sys.path.insert(0, PATH_EL)
        from entitylinking import core
        linking_config = config['entity.linking']
        logger.info("Load entity linker")
        entitylinker = getattr(core, linking_config['linker'])(
            logger=logger, **linking_config['linker.options'], pos_tags=True)

    _, word2idx = V.extend_embeddings_with_special_tokens(
        *_utils.load_word_embeddings(
            _utils.RESOURCES_FOLDER +
            "../../resources/embeddings/glove/glove.6B.100d.txt"))
    V.WORD_2_IDX = word2idx

    model_type = path_to_model.split("/")[-1].split("_")[0]
    logger.info(f"Model type: {model_type}")

    logger.info('Loading the model from: {}'.format(path_to_model))

    dummy_net = getattr(models, model_type)()
    container = fackel.TorchContainer(torch_model=dummy_net, logger=logger)
    container.load_from_file(path_to_model)

    graph_queries.FREQ_THRESHOLD = config['evaluation'].get(
        "min.relation.freq", 500)
    logger.debug('Testing')
    global_answers = []
    avg_metrics = np.zeros(3)
    data_iterator = tqdm.tqdm(webquestions_questions, ncols=100, ascii=True)
    for i, q_obj in enumerate(data_iterator):
        q = q_obj.get('utterance', q_obj.get('question'))
        q_index = q_obj['questionid']

        if entitylinker:
            sent = entitylinker.link_entities_in_raw_input(q,
                                                           element_id=q_index)
            if "max.num.entities" in config['evaluation']:
                sent.entities = sent.entities[:config['evaluation']
                                              ["max.num.entities"]]
            sent = sentence.Sentence(input_text=sent.input_text,
                                     tagged=sent.tagged,
                                     entities=sent.entities)
        else:
            tagged = _utils.get_tagged_from_server(q, caseless=q.islower())
            sent = sentence.Sentence(input_text=q,
                                     tagged=tagged,
                                     entities=q_obj['entities'])

        chosen_graphs = staged_generation.generate_with_model(
            sent,
            container,
            beam_size=config['evaluation'].get("beam.size", 10))
        model_answers = []
        g = ({}, )
        if chosen_graphs:
            j = 0
            while not model_answers and j < len(chosen_graphs):
                g = chosen_graphs[j]
                model_answers = graph_queries.get_graph_denotations(g.graph)
                j += 1

        gold_answers = webquestions_io.get_answers_from_question(q_obj)
        metrics = evaluation.retrieval_prec_rec_f1(gold_answers, model_answers)
        global_answers.append((q_index, list(metrics), model_answers, [
            (c_g.graph, float(c_g.scores[2])) for c_g in chosen_graphs[:10]
        ]))
        avg_metrics += metrics
        precision, recall, f1 = tuple(avg_metrics / (i + 1))
        data_iterator.set_postfix(prec=precision, rec=recall, f1=f1)

        if i > 0 and i % 100 == 0:
            with open(config['evaluation']["save.answers.to"],
                      'w') as answers_out:
                json.dump(global_answers,
                          answers_out,
                          sort_keys=True,
                          indent=4,
                          cls=sentence.SentenceEncoder)

    print("Average metrics: {}".format(
        (avg_metrics / (len(webquestions_questions)))))

    logger.debug('Testing is finished')
    with open(config['evaluation']["save.answers.to"], 'w') as answers_out:
        json.dump(global_answers,
                  answers_out,
                  sort_keys=True,
                  indent=4,
                  cls=sentence.SentenceEncoder)
Example #9
def train(config_file_path, seed, gpuid):
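    # Variant of the training entry point above without the model description,
    # experiment tag and results-file bookkeeping.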
    config, logger = config_utils.load_config(config_file_path, seed, gpuid)
    if "training" not in config:
        print("Training parameters not in the config file!")
        sys.exit()

    results_logger = None
    if 'log.results' in config['training']:
        results_logger = logging.getLogger("results_logger")
        results_logger.setLevel(logging.INFO)
        fh = logging.FileHandler(filename=config['training']['log.results'])
        fh.setLevel(logging.INFO)
        results_logger.addHandler(fh)
        results_logger.info(str(config))

    # Load data
    if not isinstance(config['training']["path_to_dataset"], list):
        config['training']["path_to_dataset"] = [config['training']["path_to_dataset"]]
    training_dataset = []
    for path_to_train in config['training']["path_to_dataset"]:
        with open(path_to_train) as f:
            training_dataset += json.load(f, object_hook=sentence_object_hook)
    logger.info(f"Train: {len(training_dataset)}")

    if "path_to_validation" not in config['training']:
        config['training']["path_to_validation"] = config['training']["path_to_dataset"][-1]
        logger.info(f"No validation set, using part of the training data.")
    with open(config['training']["path_to_validation"]) as f:
        val_dataset = json.load(f, object_hook=sentence_object_hook)
    logger.info(f"Validation: {len(val_dataset)}")

    wordembeddings, word2idx = V.extend_embeddings_with_special_tokens(
        *_utils.load_word_embeddings(_utils.RESOURCES_FOLDER + "../../resources/embeddings/glove/glove.6B.100d.txt")
    )
    logger.info(f"Loaded word embeddings: {wordembeddings.shape}")

    model_type = config['training']["model_type"]
    logger.info(f"Model type: {model_type}")

    V.MAX_NEGATIVE_GRAPHS = 50
    training_dataset = [s for s in training_dataset if any(scores[2] > 0.25 for g, scores in s.graphs)]
    training_samples, training_targets = pack_data(training_dataset, word2idx, model_type)
    logger.info(f"Data encoded: {[m.shape for m in training_samples]}")

    V.MAX_NEGATIVE_GRAPHS = 100
    val_dataset = [s for s in val_dataset if any(scores[2] > 0.25 for g, scores in s.graphs)]
    print(f"Val F1 upper bound: {np.average([q.graphs[0].scores[2] for q in val_dataset])}")
    val_samples, val_targets = pack_data(val_dataset, word2idx, model_type)
    logger.info(f"Val data encoded: {[m.shape for m in val_samples]}")

    encoder = models.ConvWordsEncoder(
        hp_vocab_size=wordembeddings.shape[0],
        hp_word_emb_size=wordembeddings.shape[1],
        **config['model']
    )
    encoder.load_word_embeddings_from_numpy(wordembeddings)
    net = getattr(models, model_type)(encoder, **config['model'])

    def metrics(targets, predictions, validation=False):
        _, predicted_targets = torch.topk(predictions, 1, dim=-1)
        _, targets = torch.topk(targets, 1, dim=-1)
        predicted_targets = predicted_targets.squeeze(1)
        targets = targets.squeeze(1)
        cur_acc = torch.sum(predicted_targets == targets).float()
        cur_acc /= predicted_targets.size(0)
        cur_f1 = 0.0
        if validation:
            for i, q in enumerate(val_dataset):
                if i < predicted_targets.size(0):
                    idx = predicted_targets.data[i]
                    if abs(idx) < len(q.graphs):
                        cur_f1 += q.graphs[idx].scores[2]
            cur_f1 /= predicted_targets.size(0)
        return {'acc': cur_acc.data[0], 'f1': cur_f1, 'predictions': predicted_targets.data.unsqueeze(0)}

    container = fackel.TorchContainer(
        torch_model=net,
        criterion=losses.VariableMarginLoss(),
        # criterion=nn.MultiMarginLoss(margin=0.5, size_average=False),
        metrics=metrics,
        optimizer_params={
            'weight_decay': 0.05,
            # 'lr': 0.01
        },
        optimizer="Adam",
        logger=logger,
        init_model_weights=True,
        **config['training']
    )

    if results_logger:
        results_logger.info("Model save to: {}".format(container._save_model_to))

    log_history = container.train(
        training_samples, training_targets,
        dev=val_samples, dev_targets=val_targets
    )

    for q in val_dataset:
        random.shuffle(q.graphs)
    if container._model_checkpoint:
        container.reload_from_saved()
    val_samples, val_targets = pack_data(val_dataset, word2idx, model_type)
    predictions = container.predict_batchwise(*val_samples)
    results = metrics(*container._torchify_data(True, val_targets), predictions, validation=True)
    _, predictions = torch.topk(predictions, 1, dim=-1)
    print(f"Acc: {results['acc']}, F1: {results['f1']}")
    print(f"Predictions: {predictions.data[:10].view(1,-1)}")
Example #10
def generate(path_to_model, config_file_path, seed, gpuid, experiment_tag):
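    # Extended evaluation: like generate() above, but optionally filters
    # answers through a Freebase entity list, breaks the results down by the
    # number of hops and appends them to a shared results file.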
    config, logger = config_utils.load_config(config_file_path,
                                              gpuid=gpuid,
                                              seed=seed)
    if "evaluation" not in config:
        print("Evaluation parameters not in the config file!")
        sys.exit()

    # Get the data set name and load the data set as specified in the config file
    dataset_name = config['evaluation']['questions'].split("/")[-1].split(
        ".")[0]
    logger.info(f"Dataset: {dataset_name}")
    with open(config['evaluation']['questions']) as f:
        webquestions_questions = json.load(f)

    # Load the entity linker if specified, otherwise the entity annotations in the data set will be used
    entitylinker = None
    if 'entity.linking' in config:
        PATH_EL = "../../entity-linking/"
        sys.path.insert(0, PATH_EL)
        from entitylinking import core
        linking_config = config['entity.linking']
        logger.info("Load entity linker")
        entitylinker = getattr(core, linking_config['linker'])(
            logger=logger, **linking_config['linker.options'], pos_tags=True)

    # Load the GloVe word embeddings and embeddings for special tokens
    _, word2idx = V.extend_embeddings_with_special_tokens(
        *_utils.load_word_embeddings(
            _utils.RESOURCES_FOLDER +
            "../../resources/embeddings/glove/glove.6B.100d.txt"))
    # Set the global mapping for words to indices
    V.WORD_2_IDX = word2idx

    # Derive the model type and the full model name from the model file
    model_type = path_to_model.split("/")[-1].split("_")[0]
    model_name = path_to_model.split("/")[-1].replace(".pkl", "")
    logger.info(f"Model type: {model_type}")
    logger.info('Loading the model from: {}'.format(path_to_model))

    # Load the PyTorch model
    dummy_net = getattr(models, model_type)()
    container = fackel.TorchContainer(torch_model=dummy_net, logger=logger)
    container.load_from_file(path_to_model)
    model_gated = container._model._gnn.hp_gated if model_type == "GNNModel" else False

    # Load the Freebase entity set that was used to restrict the answer space
    # in previous work, if specified.
    freebase_entity_set = set()
    if config['evaluation'].get('entities.list', False):
        print(f"Using the Freebase entity list")
        freebase_entity_set = _utils.load_blacklist(_utils.RESOURCES_FOLDER +
                                                    "freebase-entities.txt")

    # Compose a file name for the output file
    save_answer_to = config['evaluation']["save.answers.to"]
    if not save_answer_to.endswith(".json"):
        dir_name = config['evaluation'][
            "save.answers.to"] + f"{dataset_name}/{model_type.lower()}/"
        save_answer_to = dir_name + f"{dataset_name}_predictions_{'g' if model_gated else ''}{model_name.lower()}.json"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
    print(f"Save output to {save_answer_to}")

    # Init the variables to store the results
    logger.debug('Testing')
    graph_queries.FREQ_THRESHOLD = config['evaluation'].get(
        "min.relation.freq", 500)
    global_answers = []
    avg_metrics = np.zeros(4)

    # Iterate over the questions in the dataset
    data_iterator = tqdm.tqdm(webquestions_questions, ncols=100, ascii=True)
    for i, q_obj in enumerate(data_iterator):
        q = q_obj.get('utterance', q_obj.get('question'))
        q_index = q_obj['questionid']

        if entitylinker:
            sent = entitylinker.link_entities_in_raw_input(q,
                                                           element_id=q_index)
            if "max.num.entities" in config['evaluation']:
                sent.entities = sent.entities[:config['evaluation']
                                              ["max.num.entities"]]
            sent = sentence.Sentence(input_text=sent.input_text,
                                     tagged=sent.tagged,
                                     entities=sent.entities)
        else:
            tagged = _utils.get_tagged_from_server(q, caseless=q.islower())
            sent = sentence.Sentence(input_text=q,
                                     tagged=tagged,
                                     entities=q_obj['entities'])

        chosen_graphs = staged_generation.generate_with_model(
            sent,
            container,
            beam_size=config['evaluation'].get("beam.size", 10))
        model_answers = []
        g = ({}, )
        j = -1
        if chosen_graphs:
            j = 0
            valid_answer_set = False
            while not valid_answer_set and j < len(chosen_graphs):
                g = chosen_graphs[j]
                model_answers = graph_queries.get_graph_denotations(g.graph)
                if model_answers:
                    valid_answer_set = True
                    if freebase_entity_set:
                        labeled_answers = {
                            l.lower()
                            for _, labels in queries.get_labels_for_entities(
                                model_answers).items() for l in labels
                        }
                        valid_answer_set = len(
                            labeled_answers
                            & freebase_entity_set) > len(model_answers) - 1
                j += 1

        gold_answers = webquestions_io.get_answers_from_question(q_obj)
        metrics = evaluation.retrieval_prec_rec_f1(gold_answers, model_answers)
        global_answers.append((q_index, list(metrics), model_answers, [
            (c_g.graph, float(c_g.scores[2])) for c_g in chosen_graphs[:10]
        ]))
        avg_metrics += metrics + (j, )
        precision, recall, f1, g_j = tuple(avg_metrics / (i + 1))
        data_iterator.set_postfix(prec=precision, rec=recall, f1=f1, g_j=g_j)

        # Save intermediate results
        if i > 0 and i % 100 == 0:
            with open(save_answer_to, 'w') as answers_out:
                json.dump(global_answers,
                          answers_out,
                          sort_keys=True,
                          indent=4,
                          cls=sentence.SentenceEncoder)

    avg_metrics = avg_metrics / (len(webquestions_questions))
    print("Average metrics: {}".format(avg_metrics))

    # Fine-grained results, if there is a mapping from questions to the number
    # of relations (hops) needed to find the correct answer
    results_by_hops = {}
    if "qid2hop" in config['evaluation']:
        with open(config['evaluation']['qid2hop']) as f:
            q_index2hop = json.load(f)
        print("Results by hop: ")
        hops_dist = Counter([q_index2hop[p[0]] for p in global_answers])
        results_by_hops = {
            i: np.zeros(3)
            for i in range(max(hops_dist.keys()) + 1)
        }
        for p in global_answers:
            metrics = tuple(p[1])
            results_by_hops[q_index2hop[p[0]]] += metrics
        for m in results_by_hops:
            if hops_dist[m] > 0:
                results_by_hops[m] = results_by_hops[m] / hops_dist[m]
        print(results_by_hops)

    # Add results to the results file
    if "add.results.to" in config['evaluation']:
        print(f"Adding results to {config['evaluation']['add.results.to']}")
        with open(config['evaluation']["add.results.to"], 'a+') as results_out:
            results_out.write(",".join([
                model_name, model_type, "Gated" if model_gated else "Simple",
                str(seed), dataset_name, "full",
                "EntityList" if freebase_entity_set else "NoEntityList"
            ] + [str(el) for el in avg_metrics[:3]]))
            results_out.write("\n")
            # Include fine grained results if available
            if results_by_hops:
                for i in range(max(results_by_hops.keys()) + 1):
                    results_out.write(",".join([
                        model_name, model_type, "Gated"
                        if model_gated else "Simple", container.description,
                        str(seed), dataset_name,
                        str(i),
                        "EntityList" if freebase_entity_set else "NoEntityList"
                    ] + [str(el)
                         for el in results_by_hops[i]] + [experiment_tag]))
                    results_out.write("\n")

    # Save final model output
    with open(save_answer_to, 'w') as answers_out:
        json.dump(global_answers,
                  answers_out,
                  sort_keys=True,
                  indent=4,
                  cls=sentence.SentenceEncoder)