Code example #1
0
File: main_dense.py  Project: yyht/BLINK
def _process_biencoder_dataloader(samples, tokenizer, biencoder_params):
    """Tokenize mention samples and wrap them in a sequential DataLoader.

    Args:
        samples: iterable of mention records accepted by
            ``process_mention_data``.
        tokenizer: tokenizer passed through to ``process_mention_data``.
        biencoder_params: dict providing ``max_context_length``,
            ``max_cand_length``, ``debug`` and ``eval_batch_size``.

    Returns:
        A ``DataLoader`` over the tokenized tensors, in original order
        (``SequentialSampler``), batched by ``eval_batch_size``.
    """
    _, tensor_data = process_mention_data(
        samples,
        tokenizer,
        biencoder_params["max_context_length"],
        biencoder_params["max_cand_length"],
        silent=True,
        logger=None,
        debug=biencoder_params["debug"],
    )
    return DataLoader(
        tensor_data,
        sampler=SequentialSampler(tensor_data),
        batch_size=biencoder_params["eval_batch_size"],
    )
Code example #2
0
File: train_biencoder.py  Project: rangell/BLINK
def main(params):
    """Train a bi-encoder entity-linking ranker.

    Reads the "train" and "valid" splits from ``params["data_path"]``,
    trains ``BiEncoderRanker`` with gradient accumulation for
    ``params["num_train_epochs"]`` epochs, evaluates on the validation
    set before training, every ``eval_interval`` effective steps, and at
    the end of each epoch, and saves one checkpoint per epoch under
    ``<output_path>/epoch_<i>``.

    Args:
        params: dict of hyper-parameters and paths — at least
            output_path, data_path, train/eval batch sizes,
            gradient_accumulation_steps, seed, max_context_length,
            max_cand_length, context_key, shuffle, silent, debug,
            zeshel, print_interval, eval_interval, max_grad_norm,
            num_train_epochs, evaluate.

    Raises:
        ValueError: if ``gradient_accumulation_steps`` < 1.
    """
    model_output_path = params["output_path"]
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer
    model = reranker.model

    # utils.save_model(model, tokenizer, model_output_path)

    device = reranker.device
    n_gpu = reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, when we are accumulating the gradient
    # across `y` batches, is achieved by a per-step batch size of `z = x / y`.
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]
    eval_batch_size = params["eval_batch_size"]
    grad_acc_steps = params["gradient_accumulation_steps"]

    # Fix the random seeds for reproducibility (python, numpy, torch, CUDA).
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    # Load train data
    train_samples = utils.read_dataset("train", params["data_path"])
    logger.info("Read %d train samples." % len(train_samples))

    train_data, train_tensor_data = data.process_mention_data(
        train_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params["context_key"],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    if params["shuffle"]:
        train_sampler = RandomSampler(train_tensor_data)
    else:
        train_sampler = SequentialSampler(train_tensor_data)

    train_dataloader = DataLoader(train_tensor_data,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    # Load eval data
    # TODO: reduce duplicated code here
    valid_samples = utils.read_dataset("valid", params["data_path"])
    logger.info("Read %d valid samples." % len(valid_samples))

    valid_data, valid_tensor_data = data.process_mention_data(
        valid_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params["context_key"],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    valid_sampler = SequentialSampler(valid_tensor_data)
    valid_dataloader = DataLoader(valid_tensor_data,
                                  sampler=valid_sampler,
                                  batch_size=eval_batch_size)

    # Evaluate once before training to establish a baseline
    # (the return value is logged inside evaluate(), not used here).
    results = evaluate(
        reranker,
        valid_dataloader,
        params,
        device=device,
        logger=logger,
    )

    # NOTE(review): unused — kept as-is.
    number_of_samples_per_dataset = {}

    time_start = time.time()

    utils.write_to_file(os.path.join(model_output_path, "training_params.txt"),
                        str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    optimizer = get_optimizer(model, params)
    scheduler = get_scheduler(params, optimizer, len(train_tensor_data),
                              logger)

    model.train()

    best_epoch_idx = -1
    best_score = -1

    num_train_epochs = params["num_train_epochs"]
    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        if params["silent"]:
            iter_ = train_dataloader
        else:
            iter_ = tqdm(train_dataloader, desc="Batch")

        for step, batch in enumerate(iter_):
            batch = tuple(t.to(device) for t in batch)
            # Zeshel batches carry two extra fields; both layouts start
            # with (context_input, candidate_input).
            if params["zeshel"]:
                context_input, candidate_input, _, _ = batch
            else:
                context_input, candidate_input, _ = batch
            loss, _ = reranker(context_input, candidate_input)

            # if n_gpu > 1:
            #     loss = loss.mean() # mean() to average on multi-gpu.

            # Scale down so the accumulated gradient matches a single
            # full-batch step.
            if grad_acc_steps > 1:
                loss = loss / grad_acc_steps

            tr_loss += loss.item()

            # tr_loss is reset after each report, so the logged value is
            # the average over the last print window only.
            if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0:
                logger.info("Step {} - epoch {} average loss: {}\n".format(
                    step,
                    epoch_idx,
                    tr_loss / (params["print_interval"] * grad_acc_steps),
                ))
                tr_loss = 0

            loss.backward()

            # Optimizer step only every grad_acc_steps micro-batches.
            if (step + 1) % grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               params["max_grad_norm"])
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate(
                    reranker,
                    valid_dataloader,
                    params,
                    device=device,
                    logger=logger,
                )
                # evaluate() may switch the model to eval mode; restore.
                model.train()
                logger.info("\n")

        logger.info("***** Saving fine - tuned model *****")
        epoch_output_folder_path = os.path.join(model_output_path,
                                                "epoch_{}".format(epoch_idx))
        utils.save_model(model, tokenizer, epoch_output_folder_path)

        # NOTE(review): output_eval_file is computed but never written to.
        output_eval_file = os.path.join(epoch_output_folder_path,
                                        "eval_results.txt")
        results = evaluate(
            reranker,
            valid_dataloader,
            params,
            device=device,
            logger=logger,
        )

        # Keep the better of (previous best, this epoch): np.argmax over
        # the two scores picks the matching epoch index from `li`.
        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    # NOTE(review): path_to_model points at the best epoch's folder, but
    # save_model below writes the *current* (last-epoch) weights to the
    # output root without reloading the best checkpoint — confirm intended.
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    params["path_to_model"] = os.path.join(model_output_path,
                                           "epoch_{}".format(best_epoch_idx))
    utils.save_model(reranker.model, tokenizer, model_output_path)

    if params["evaluate"]:
        params["path_to_model"] = model_output_path
        results = evaluate(
            reranker,
            valid_dataloader,
            params,
            device=device,
            logger=logger,
        )
Code example #3
0
def main(params):
    """Retrieve top-k bi-encoder candidate predictions for a data split.

    Loads (or computes and optionally caches) encodings for the full
    candidate pool, tokenizes the ``params["mode"]`` split, runs nearest-
    neighbor retrieval for the top-k candidates per mention, and — when
    ``save_topk_result`` is set — writes the predictions to
    ``<output_path>/candidates_<mode>_top<k>.t7``.

    Args:
        params: dict of paths and settings — at least output_path,
            data_path, mode, encode_batch_size, top_k, silent, debug,
            context_key, max_context_length, max_cand_length, and
            optionally cand_encode_path / cand_pool_path caches and
            the zeshel / save_topk_result flags.
    """
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer

    device = reranker.device

    cand_encode_path = params.get("cand_encode_path", None)

    # Load/generate the candidate pool; it is needed both for encoding
    # and for mapping retrieved indices back to candidates.
    cand_pool_path = params.get("cand_pool_path", None)
    candidate_pool = load_or_generate_candidate_pool(
        tokenizer,
        params,
        logger,
        cand_pool_path,
    )

    candidate_encoding = None
    if cand_encode_path is not None:
        # Try to load a pre-computed candidate encoding; on any failure
        # fall through and recompute below (best-effort cache).
        try:
            logger.info("Loading pre-generated candidate encode path.")
            candidate_encoding = torch.load(cand_encode_path)
        except Exception:  # narrowed from a bare except; behavior unchanged
            logger.info("Loading failed. Generating candidate encoding.")

    if candidate_encoding is None:
        candidate_encoding = encode_candidate_zeshel(
            reranker,
            candidate_pool,
            params["encode_batch_size"],
            silent=params["silent"],
            logger=logger,
        )

        if cand_encode_path is not None:
            # Save candidate encoding to avoid re-computing it next run.
            logger.info("Saving candidate encoding to file " +
                        cand_encode_path)
            # BUG FIX: torch.save takes (obj, f) — the arguments were
            # swapped, writing the path string into a bogus file instead
            # of caching the encoding.
            torch.save(candidate_encoding, cand_encode_path)

    test_samples = utils.read_dataset(params["mode"], params["data_path"])
    logger.info("Read %d test samples." % len(test_samples))

    test_data, test_tensor_data = data.process_mention_data(
        test_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params['context_key'],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    test_sampler = SequentialSampler(test_tensor_data)
    test_dataloader = DataLoader(test_tensor_data,
                                 sampler=test_sampler,
                                 batch_size=params["encode_batch_size"])

    save_results = params.get("save_topk_result")
    new_data = nnquery.get_topk_predictions(
        reranker,
        test_dataloader,
        candidate_pool,
        candidate_encoding,
        params["silent"],
        logger,
        params["top_k"],
        params.get("zeshel", None),
        save_results,
    )

    if save_results:
        save_data_path = os.path.join(
            params['output_path'],
            'candidates_%s_top%d.t7' % (params['mode'], params['top_k']))
        torch.save(new_data, save_data_path)
Code example #4
0
def main(params):
    """Build joint (coreference + tf-idf) candidate data for a split.

    Tokenizes every entity candidate referenced by the tf-idf candidate
    lists or by a gold label, tokenizes the mention contexts, and
    assembles per-mention tensors: positive coreferent contexts in the
    same document, all within-document contexts, the gold candidate,
    and the tf-idf candidates.  The resulting dict is saved to
    ``<output_path>/joint_candidates_<mode>_tfidf.t7``.

    UID scheme: candidates get uids ``0..num_cands-1`` (order of
    ``cand_ids``); mention contexts get uids ``num_cands..`` (order of
    ``test_samples``).

    Args:
        params: dict of paths and settings — at least output_path,
            data_path, mode, max_cand_length, max_context_length,
            context_key, silent, debug.
    """
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer

    # load entities
    entity_dict, entity_json = load_entity_dict(logger, params)

    # load tfidf candidates
    tfidf_cand_dict = read_tfidf_cands(params["data_path"], params["mode"])

    # load mentions
    test_samples = utils.read_dataset(params["mode"], params["data_path"])
    logger.info("Read %d test samples." % len(test_samples))

    # Only tokenize candidates we actually need: every tf-idf candidate
    # plus every gold label, de-duplicated.
    cand_ids = [c for l in tfidf_cand_dict.values() for c in l]
    cand_ids.extend([x["label_umls_cuid"] for x in test_samples])
    cand_ids = list(set(cand_ids))
    num_cands = len(cand_ids)

    # Tokenize the candidates; cand_uid_map: CUID -> uid in [0, num_cands).
    cand_uid_map = {c : i for i, c in enumerate(cand_ids)}
    candidate_pool = get_candidate_pool_tensor(
        [entity_dict[c] for c in cand_ids],
        tokenizer,
        params["max_cand_length"],
        logger
    )

    # Mention maps: uid (offset by num_cands), gold CUID, and document id
    # per mention; doc_ctxt_map inverts ctxt_doc_map (doc -> mention ids).
    ctxt_uid_map = {x["mm_mention_id"] : i + num_cands
                        for i, x in enumerate(test_samples)}
    ctxt_cand_map = {x["mm_mention_id"] : x["label_umls_cuid"]
                        for x in test_samples}
    ctxt_doc_map = {x["mm_mention_id"] : x["context_doc_id"]
                        for x in test_samples}
    doc_ctxt_map = defaultdict(list)
    for c, d in ctxt_doc_map.items():
        doc_ctxt_map[d].append(c)

    # uid -> raw json, for both candidates and mentions, so downstream
    # evaluation can inspect the original records.
    uid_to_json = {
        uid : entity_json[cuid] for cuid, uid in cand_uid_map.items()
    }
    uid_to_json.update({i+num_cands : x for i, x in enumerate(test_samples)})

    # tokenize the contexts
    test_data, test_tensor_data = data.process_mention_data(
        test_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params['context_key'],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    context_pool = test_data["context_vecs"]

    # create output variables
    contexts = context_pool
    context_uids = torch.LongTensor(list(ctxt_uid_map.values()))

    # Positive coreferent contexts: other mentions in the same document
    # that share this mention's gold candidate.
    pos_coref_ctxts = []
    pos_coref_ctxt_uids = []
    for i, c in enumerate(ctxt_doc_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        doc = ctxt_doc_map[c]
        coref_ctxts = [x for x in doc_ctxt_map[doc]
                          if x != c and ctxt_cand_map[x] == ctxt_cand_map[c]]
        coref_ctxt_uids = [ctxt_uid_map[x] for x in coref_ctxts]
        # Subtract num_cands to convert uids back to context_pool rows.
        coref_ctxt_idxs = [x - num_cands for x in coref_ctxt_uids]
        pos_coref_ctxts.append(context_pool[coref_ctxt_idxs])
        pos_coref_ctxt_uids.append(torch.LongTensor(coref_ctxt_uids))

    # Candidate coreferent contexts: every other mention in the document.
    knn_ctxts = []
    knn_ctxt_uids = []
    for i, c in enumerate(ctxt_doc_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        doc = ctxt_doc_map[c]
        wdoc_ctxts = [x for x in doc_ctxt_map[doc] if x != c]
        wdoc_ctxt_uids = [ctxt_uid_map[x] for x in wdoc_ctxts]
        wdoc_ctxt_idxs = [x - num_cands for x in wdoc_ctxt_uids]
        knn_ctxts.append(context_pool[wdoc_ctxt_idxs])
        knn_ctxt_uids.append(torch.LongTensor(wdoc_ctxt_uids))

    # Gold candidate per mention.
    pos_cands = []
    pos_cand_uids = []
    for i, c in enumerate(ctxt_cand_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        pos_cands.append(candidate_pool[cand_uid_map[ctxt_cand_map[c]]])
        pos_cand_uids.append(torch.LongTensor([cand_uid_map[ctxt_cand_map[c]]]))

    # tf-idf retrieved candidates per mention (may be empty).
    knn_cands = []
    knn_cand_uids = []
    for i, c in enumerate(ctxt_cand_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        tfidf_cands = tfidf_cand_dict.get(c, [])
        tfidf_cand_uids = [cand_uid_map[x] for x in tfidf_cands]
        knn_cands.append(candidate_pool[tfidf_cand_uids])
        knn_cand_uids.append(torch.LongTensor(tfidf_cand_uids))

    tfidf_data = {
        "contexts" : contexts,
        "context_uids":  context_uids,
        "pos_coref_ctxts":  pos_coref_ctxts,
        "pos_coref_ctxt_uids":  pos_coref_ctxt_uids,
        "knn_ctxts":  knn_ctxts,
        "knn_ctxt_uids":  knn_ctxt_uids,
        "pos_cands":  pos_cands,
        "pos_cand_uids":  pos_cand_uids,
        "knn_cands":  knn_cands,
        "knn_cand_uids":  knn_cand_uids,
        "uid_to_json":  uid_to_json,
    }

    save_data_path = os.path.join(
        params['output_path'], 
        'joint_candidates_%s_tfidf.t7' % (params['mode'])
    )
    torch.save(tfidf_data, save_data_path)