Ejemplo n.º 1
0
def main(params):
    """Jointly train two cross-encoder rerankers: a coref-context model
    ("ctxt") and an entity-candidate model ("cand").

    Creates a timestamped run directory under ``params["output_path"]``,
    snapshots the ``blink`` source tree plus a ``rerun.sh`` script for
    reproducibility, trains both models for ``num_train_epochs`` (evaluating
    on the joint validation set after every epoch), and finally copies the
    best-scoring epoch checkpoints into ``<run_dir>/best_epoch/{ctxt,cand}``.

    Args:
        params: dict of hyper-parameters and paths (output_path, data_path,
            train/eval batch sizes, gradient_accumulation_steps, seed,
            max_context_length, num_train_epochs, silent, ...). Mutated in
            place: ``train_batch_size`` is divided by the accumulation steps
            and ``pool_highlighted`` is forced to False for the cand model.
    """
    # create a unique output dir for this run, keyed by start time
    now = datetime.now()
    datetime_str = now.strftime("%Y-%m-%d_%H-%M-%S")
    model_output_path = os.path.join(params["output_path"], datetime_str)
    os.makedirs(model_output_path, exist_ok=True)

    # get logger
    logger = utils.get_logger(model_output_path)

    # copy blink source and create rerun script for reproducibility
    blink_copy_path = os.path.join(model_output_path, "blink")
    copy_directory("blink", blink_copy_path)
    # BUGFIX: build the command from a *copy* of sys.argv. The original code
    # aliased the global list (`cmd = sys.argv`) and then inserted "python"
    # into it, permanently corrupting sys.argv for any later reader.
    cmd = ["python"] + list(sys.argv)
    with open(os.path.join(model_output_path, "rerun.sh"), "w") as f:
        f.write(" ".join(cmd))

    # Init models; both share params except pool_highlighted
    ctxt_reranker = CrossEncoderRanker(params)
    ctxt_model = ctxt_reranker.model
    tokenizer = ctxt_reranker.tokenizer

    params["pool_highlighted"] = False  # only `True` for ctxt
    cand_reranker = CrossEncoderRanker(params)
    cand_model = cand_reranker.model

    device = ctxt_reranker.device
    n_gpu = ctxt_reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, accumulated across `y` micro-batches,
    # is achieved with a per-step batch size of `z = x // y`.
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]

    # Fix the random seeds so runs are reproducible
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if ctxt_reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    context_length = params["max_context_length"]

    # create train dataloaders (MST-style loader for the coref-context model)
    fname = os.path.join(params["data_path"], "joint_train.t7")
    train_data = torch.load(fname)
    gold_coref_clusters = build_gold_coref_clusters(train_data)
    ctxt_train_dataloader = create_mst_dataloader(
        params, gold_coref_clusters, train_data["contexts"],
        train_data["pos_coref_ctxts"], train_data["pos_coref_ctxt_uids"],
        train_data["knn_ctxts"], train_data["knn_ctxt_uids"])

    cand_train_dataloader = create_dataloader(params, train_data["contexts"],
                                              train_data["pos_cands"],
                                              train_data["pos_cand_uids"],
                                              train_data["knn_cands"],
                                              train_data["knn_cand_uids"])

    # validation dataloaders for both models
    fname = os.path.join(params["data_path"], "joint_valid.t7")
    valid_data = torch.load(fname)
    ctxt_valid_dataloader = create_dataloader(
        params,
        valid_data["contexts"],
        valid_data["pos_coref_ctxts"],
        valid_data["pos_coref_ctxt_uids"],
        valid_data["knn_ctxts"],
        valid_data["knn_ctxt_uids"],
        evaluate=True)
    cand_valid_dataloader = create_dataloader(params,
                                              valid_data["contexts"],
                                              valid_data["pos_cands"],
                                              valid_data["pos_cand_uids"],
                                              valid_data["knn_cands"],
                                              valid_data["knn_cand_uids"],
                                              evaluate=True)

    # evaluate both models before training to log a baseline
    evaluate(
        ctxt_reranker,
        ctxt_valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        suffix="ctxt",
        silent=params["silent"],
    )
    evaluate(
        cand_reranker,
        cand_valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        suffix="cand",
        silent=params["silent"],
    )

    time_start = time.time()

    utils.write_to_file(os.path.join(model_output_path, "training_params.txt"),
                        str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    # independent optimizer/scheduler pair per model
    ctxt_optimizer = get_optimizer(ctxt_model, params)
    ctxt_scheduler = get_scheduler(
        params, ctxt_optimizer,
        len(ctxt_train_dataloader) * train_batch_size, logger)

    cand_optimizer = get_optimizer(cand_model, params)
    cand_scheduler = get_scheduler(
        params, cand_optimizer,
        len(cand_train_dataloader) * train_batch_size, logger)

    ctxt_best_epoch_idx = -1
    ctxt_best_score = -1
    cand_best_epoch_idx = -1
    cand_best_score = -1

    num_train_epochs = params["num_train_epochs"]

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        # train ctxt model
        train_one_epoch_mst(ctxt_train_dataloader,
                            ctxt_reranker,
                            ctxt_optimizer,
                            ctxt_scheduler,
                            logger,
                            params,
                            epoch_idx,
                            device=device,
                            suffix='ctxt')

        # train cand model
        train_one_epoch(cand_train_dataloader,
                        cand_reranker,
                        cand_optimizer,
                        cand_scheduler,
                        logger,
                        params,
                        epoch_idx,
                        device=device,
                        suffix='cand')

        # checkpoint both models for this epoch
        logger.info("***** Saving fine - tuned models *****")
        ctxt_epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx), "ctxt")
        utils.save_model(ctxt_model, tokenizer, ctxt_epoch_output_folder_path)
        cand_epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx), "cand")
        utils.save_model(cand_model, tokenizer, cand_epoch_output_folder_path)

        ctxt_results = evaluate(
            ctxt_reranker,
            ctxt_valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            suffix="ctxt",
            silent=params["silent"],
        )
        cand_results = evaluate(
            cand_reranker,
            cand_valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            suffix="cand",
            silent=params["silent"],
        )

        # Track the best epoch per model. Strict '>' keeps the earlier epoch
        # on ties, matching the original np.argmax (first-max) behavior.
        if ctxt_results["normalized_accuracy"] > ctxt_best_score:
            ctxt_best_score = ctxt_results["normalized_accuracy"]
            ctxt_best_epoch_idx = epoch_idx

        if cand_results["normalized_accuracy"] > cand_best_score:
            cand_best_score = cand_results["normalized_accuracy"]
            cand_best_epoch_idx = epoch_idx

        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best models
    logger.info(
        "Best ctxt performance in epoch: {}".format(ctxt_best_epoch_idx))
    best_ctxt_model_path = os.path.join(model_output_path,
                                        "epoch_{}".format(ctxt_best_epoch_idx),
                                        "ctxt")
    logger.info(
        "Best cand performance in epoch: {}".format(cand_best_epoch_idx))
    best_cand_model_path = os.path.join(model_output_path,
                                        "epoch_{}".format(cand_best_epoch_idx),
                                        "cand")

    copy_directory(best_ctxt_model_path,
                   os.path.join(model_output_path, "best_epoch", "ctxt"))
    copy_directory(best_cand_model_path,
                   os.path.join(model_output_path, "best_epoch", "cand"))
Ejemplo n.º 2
0
def main(params):
    """Train a single cross-encoder reranker on precomputed tensors.

    Loads ``train.t7`` / ``valid.t7`` from ``params["data_path"]``, joins
    context and candidate token vectors via ``modify``, runs a
    gradient-accumulated training loop with periodic logging, evaluation,
    and checkpointing, and finally points ``params["path_to_model"]`` at the
    epoch with the best normalized accuracy on the validation set.

    Note: ``params["train_batch_size"]`` is mutated in place (divided by the
    accumulation steps).
    """
    model_output_path = params["output_path"]
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = CrossEncoderRanker(params)
    tokenizer = reranker.tokenizer
    model = reranker.model

    # utils.save_model(model, tokenizer, model_output_path)

    device = reranker.device
    n_gpu = reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, when we are accumulating the gradient accross `y` batches will be achieved by having a batch size of `z = x / y`
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]
    eval_batch_size = params["eval_batch_size"]
    grad_acc_steps = params["gradient_accumulation_steps"]

    # Fix the random seeds
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    max_seq_length = params["max_seq_length"]
    context_length = params["max_context_length"]

    # load precomputed training tensors
    fname = os.path.join(params["data_path"], "train.t7")
    train_data = torch.load(fname)
    context_input = train_data["context_vecs"]
    candidate_input = train_data["candidate_vecs"]
    label_input = train_data["labels"]
    if params["debug"]:
        # keep only a small slice for quick debug runs
        max_n = 200
        context_input = context_input[:max_n]
        candidate_input = candidate_input[:max_n]
        label_input = label_input[:max_n]

    # join each context with its candidates, truncated to max_seq_length
    context_input = modify(context_input, candidate_input, max_seq_length)
    if params["zeshel"]:
        # NOTE(review): 'worlds' presumably carries per-example source-domain
        # ids for zero-shot EL — confirm against the preprocessing step
        src_input = train_data['worlds'][:len(context_input)]
        train_tensor_data = TensorDataset(context_input, label_input,
                                          src_input)
    else:
        train_tensor_data = TensorDataset(context_input, label_input)
    train_sampler = RandomSampler(train_tensor_data)

    train_dataloader = DataLoader(train_tensor_data,
                                  sampler=train_sampler,
                                  batch_size=params["train_batch_size"])

    # load validation tensors (same layout as train)
    fname = os.path.join(params["data_path"], "valid.t7")
    valid_data = torch.load(fname)
    context_input = valid_data["context_vecs"]
    candidate_input = valid_data["candidate_vecs"]
    label_input = valid_data["labels"]
    if params["debug"]:
        max_n = 200
        context_input = context_input[:max_n]
        candidate_input = candidate_input[:max_n]
        label_input = label_input[:max_n]

    context_input = modify(context_input, candidate_input, max_seq_length)
    if params["zeshel"]:
        src_input = valid_data["worlds"][:len(context_input)]
        valid_tensor_data = TensorDataset(context_input, label_input,
                                          src_input)
    else:
        valid_tensor_data = TensorDataset(context_input, label_input)
    valid_sampler = SequentialSampler(valid_tensor_data)

    valid_dataloader = DataLoader(valid_tensor_data,
                                  sampler=valid_sampler,
                                  batch_size=params["eval_batch_size"])

    # evaluate before training to log a pre-finetuning baseline
    results = evaluate(
        reranker,
        valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        silent=params["silent"],
    )

    number_of_samples_per_dataset = {}

    time_start = time.time()

    utils.write_to_file(os.path.join(model_output_path, "training_params.txt"),
                        str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    optimizer = get_optimizer(model, params)
    scheduler = get_scheduler(params, optimizer, len(train_tensor_data),
                              logger)

    model.train()

    best_epoch_idx = -1
    best_score = -1

    num_train_epochs = params["num_train_epochs"]

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        # suppress the per-batch progress bar when running silently
        if params["silent"]:
            iter_ = train_dataloader
        else:
            iter_ = tqdm(train_dataloader, desc="Batch")

        part = 0  # index of in-epoch ("part") checkpoints saved below
        for step, batch in enumerate(iter_):
            batch = tuple(t.to(device) for t in batch)
            context_input = batch[0]
            label_input = batch[1]
            loss, _ = reranker(context_input, label_input, context_length)

            # if n_gpu > 1:
            #     loss = loss.mean() # mean() to average on multi-gpu.

            # scale so the accumulated gradient matches the effective batch
            if grad_acc_steps > 1:
                loss = loss / grad_acc_steps

            tr_loss += loss.item()

            # periodically log the running average loss, then reset it
            if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0:
                logger.info("Step {} - epoch {} average loss: {}\n".format(
                    step,
                    epoch_idx,
                    tr_loss / (params["print_interval"] * grad_acc_steps),
                ))
                tr_loss = 0

            loss.backward()

            # optimizer/scheduler step only on accumulation boundaries
            if (step + 1) % grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               params["max_grad_norm"])
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # periodic in-epoch evaluation + intermediate checkpoint
            if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate(
                    reranker,
                    valid_dataloader,
                    device=device,
                    logger=logger,
                    context_length=context_length,
                    silent=params["silent"],
                )
                logger.info("***** Saving fine - tuned model *****")
                epoch_output_folder_path = os.path.join(
                    model_output_path, "epoch_{}_{}".format(epoch_idx, part))
                part += 1
                utils.save_model(model, tokenizer, epoch_output_folder_path)
                model.train()  # restore train mode after evaluation
                logger.info("\n")

        # end-of-epoch checkpoint and evaluation
        logger.info("***** Saving fine - tuned model *****")
        epoch_output_folder_path = os.path.join(model_output_path,
                                                "epoch_{}".format(epoch_idx))
        utils.save_model(model, tokenizer, epoch_output_folder_path)
        # reranker.save(epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path,
                                        "eval_results.txt")
        results = evaluate(
            reranker,
            valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            silent=params["silent"],
        )

        # keep the best (score, epoch) pair; np.argmax returns the first
        # max, so ties keep the earlier epoch
        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    params["path_to_model"] = os.path.join(model_output_path,
                                           "epoch_{}".format(best_epoch_idx))
Ejemplo n.º 3
0
def main(parameters):
    """Train a reranker on AIDA data and save the best-epoch checkpoint.

    Reads the preprocessed "aida-train" and "aida-A" (dev) datasets, trains
    for ``num_train_epochs`` with gradient accumulation (evaluating on dev
    periodically and after each epoch), then reloads the weights of the best
    epoch by normalized accuracy and saves them to ``model_output_path``,
    optionally running a final evaluation.

    Args:
        parameters: dict of hyper-parameters and paths. Mutated in place:
            ``train_batch_size`` is divided by the accumulation steps and
            ``path_to_model`` is set to the best epoch's directory.
    """
    # Read model
    reranker = utils.get_reranker(parameters)
    tokenizer = reranker.tokenizer
    model = reranker.model

    device = reranker.device
    n_gpu = reranker.n_gpu

    if parameters["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                parameters["gradient_accumulation_steps"]
            )
        )

    # An effective batch size of `x`, accumulated across `y` micro-batches,
    # is achieved with a per-step batch size of `z = x // y`.
    parameters["train_batch_size"] = (
        parameters["train_batch_size"] // parameters["gradient_accumulation_steps"]
    )
    train_batch_size = parameters["train_batch_size"]
    evaluation_batch_size = parameters["evaluation_batch_size"]
    gradient_accumulation_steps = parameters["gradient_accumulation_steps"]

    # Fix the random seeds for reproducibility
    seed = parameters["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    number_of_samples_per_dataset = {}

    time_start = time.time()
    model_output_path = parameters["model_output_path"]

    # Make sure everything is in order with the output directory; offer to
    # wipe a non-empty one interactively rather than overwriting silently.
    if os.path.exists(model_output_path) and os.listdir(model_output_path):
        print(
            "Output directory ({}) already exists and is not empty.".format(
                model_output_path
            )
        )
        answer = input("Would you like to empty the existing directory? [Y/N]\n")
        if answer.strip() == "Y":
            # typo fix: was "Deleteing"
            print("Deleting {}...".format(model_output_path))
            shutil.rmtree(model_output_path)
        else:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(
                    model_output_path
                )
            )

    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)

    utils.write_to_file(
        os.path.join(model_output_path, "training_parameters.txt"), str(parameters)
    )

    logger = utils.get_logger(model_output_path)
    logger.info("Starting training")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}".format(device, n_gpu, False)
    )

    ### Load training data, dropping samples beyond top_k candidates
    train_dataset_name = "aida-train"
    train_samples = utils.read_dataset(
        train_dataset_name, parameters["path_to_preprocessed_json_data"]
    )
    train_samples_filtered = utils.filter_samples(train_samples, parameters["top_k"])
    logger.info(
        "Retained {} out of {} samples".format(
            len(train_samples_filtered), len(train_samples)
        )
    )
    number_of_samples_per_dataset[train_dataset_name] = len(train_samples)

    train_data, train_tensor_data = reranker._process_mentions_for_model(
        parameters["context_key"],
        train_samples_filtered,
        tokenizer,
        parameters["max_seq_length"],
        silent=parameters["silent"],
        logger=logger,
        top_k=parameters["top_k"],
        debug=parameters["debug"],
    )
    train_sampler = RandomSampler(train_tensor_data)
    train_dataloader = DataLoader(
        train_tensor_data, sampler=train_sampler, batch_size=train_batch_size
    )
    ###

    ### Loading dev data
    dev_dataset_name = "aida-A"
    dev_samples = utils.read_dataset(
        dev_dataset_name, parameters["path_to_preprocessed_json_data"]
    )
    dev_samples_filtered = utils.filter_samples(dev_samples, parameters["top_k"])
    logger.info(
        "Retained {} out of {} samples".format(
            len(dev_samples_filtered), len(dev_samples)
        )
    )
    number_of_samples_per_dataset[dev_dataset_name] = len(dev_samples)

    dev_data, dev_tensor_data = reranker._process_mentions_for_model(
        parameters["context_key"],
        # BUGFIX: was train_samples_filtered — the "dev" evaluation was
        # silently running on training data.
        dev_samples_filtered,
        tokenizer,
        parameters["max_seq_length"],
        silent=parameters["silent"],
        logger=logger,
        top_k=parameters["top_k"],
        debug=parameters["debug"],
    )
    dev_sampler = SequentialSampler(dev_tensor_data)
    dev_dataloader = DataLoader(
        dev_tensor_data, sampler=dev_sampler, batch_size=evaluation_batch_size
    )
    ###

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_samples_filtered))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Gradient accumulation steps = %d", gradient_accumulation_steps)

    optimizer, scheduler = reranker.get_scheduler_and_optimizer(
        parameters, train_tensor_data, logger
    )

    best_epoch_idx = -1
    best_score = -1

    num_train_epochs = parameters["num_train_epochs"]

    model.train()

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        for step, batch in enumerate(tqdm(train_dataloader, desc="Batch")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, entity_mask = batch
            loss, _ = model(
                input_ids, segment_ids, input_mask, label_ids, entity_mask=entity_mask
            )

            # if n_gpu > 1:
            #     loss = loss.mean() # mean() to average on multi-gpu.

            # scale so accumulated gradients match the effective batch size
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            tr_loss += loss.item()

            # periodically log the running average loss, then reset it
            if (step + 1) % (
                parameters["print_tr_loss_opt_steps_interval"]
                * parameters["gradient_accumulation_steps"]
            ) == 0:
                logger.info(
                    "Step {} - epoch {} average loss: {}\n".format(
                        step,
                        epoch_idx,
                        tr_loss
                        / (
                            parameters["print_tr_loss_opt_steps_interval"]
                            * gradient_accumulation_steps
                        ),
                    )
                )
                tr_loss = 0

            loss.backward()

            # optimizer/scheduler step only on accumulation boundaries
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), parameters["max_grad_norm"]
                )
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # periodic in-epoch evaluation on the dev set
            if (step + 1) % (
                parameters["dev_evaluation_interval"]
                * gradient_accumulation_steps
                * train_batch_size
            ) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate_model_on_dataset(
                    model,
                    dev_dataloader,
                    dev_dataset_name,
                    device=device,
                    logger=logger,
                    number_of_samples=number_of_samples_per_dataset[dev_dataset_name],
                )
                model.train()  # restore train mode after evaluation
                logger.info("\n")

        # end-of-epoch checkpoint and evaluation
        logger.info("***** Saving fine - tuned model *****")
        epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx)
        )
        utils.save_model(model, tokenizer, epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path, "eval_results.txt")
        results = evaluate_model_on_dataset(
            model,
            dev_dataloader,
            dev_dataset_name,
            device=device,
            logger=logger,
            path_to_file_to_write_results=output_eval_file,
            number_of_samples=number_of_samples_per_dataset[dev_dataset_name],
        )

        # Track the best epoch. Strict '>' keeps the earlier epoch on ties,
        # matching the original np.argmax (first-max) behavior.
        if results["normalized_accuracy"] > best_score:
            best_score = results["normalized_accuracy"]
            best_epoch_idx = epoch_idx
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # reload the best epoch's weights and save them in the parent dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    parameters["path_to_model"] = os.path.join(
        model_output_path, "epoch_{}".format(best_epoch_idx)
    )
    reranker = utils.get_reranker(parameters)
    utils.save_model(reranker.model, tokenizer, model_output_path)

    if parameters["evaluate"]:
        parameters["path_to_model"] = model_output_path
        evaluate(parameters, logger=logger)