Esempio n. 1
0
def main(args):
    """Embed one mention with a BioSyn model and print the requested output.

    Reads the following attributes from ``args``:

    * ``model_dir`` / ``use_cuda`` -- forwarded to ``BioSyn.load_model``.
    * ``mention`` -- raw mention string; preprocessed before embedding.
    * ``show_embeddings`` -- when truthy, the sparse and dense mention
      embeddings are added to the printed output.
    * ``show_predictions`` -- when truthy, the top-5 dictionary entries
      ranked by the hybrid (sparse + dense) score are added to the output.
    * ``dictionary_path`` -- required when ``show_predictions`` is set; if it
      is ``None`` a hint is printed and the function returns early without
      printing the output dict.

    The result is printed to stdout; nothing is returned.
    """
    # load biosyn model
    biosyn = BioSyn().load_model(path=args.model_dir,
                                 max_length=25,
                                 use_cuda=args.use_cuda)
    # preprocess mention
    mention = TextPreprocess().run(args.mention)

    # embed mention
    mention_sparse_embeds = biosyn.embed_sparse(names=[mention])
    mention_dense_embeds = biosyn.embed_dense(names=[mention])
    # The call used to pass the undefined names `self` and `names` (a pasted
    # method signature), which raised NameError on every run. Embed the
    # preprocessed mention like the sparse/dense calls above.
    # NOTE(review): the sentence embedding is computed but not used in the
    # output or in the hybrid score below -- confirm whether it should be.
    mention_sent_embeds = biosyn.embed_sent(names=[mention])

    output = {
        'mention': args.mention,
    }

    if args.show_embeddings:
        # squeeze(0) drops the leading axis that comes from embedding a
        # one-element list of names.
        output['mention_sparse_embeds'] = mention_sparse_embeds.squeeze(0)
        output['mention_dense_embeds'] = mention_dense_embeds.squeeze(0)

    if args.show_predictions:
        if args.dictionary_path is None:
            print('insert the dictionary path')
            return

        # cache or load dictionary
        dictionary, dict_sparse_embeds, dict_dense_embeds = cache_or_load_dictionary(
            biosyn, args.dictionary_path)

        # calculate score matrices and get top 5
        sparse_score_matrix = biosyn.get_score_matrix(
            query_embeds=mention_sparse_embeds, dict_embeds=dict_sparse_embeds)
        dense_score_matrix = biosyn.get_score_matrix(
            query_embeds=mention_dense_embeds, dict_embeds=dict_dense_embeds)
        sparse_weight = biosyn.get_sparse_weight().item()
        # hybrid score = weighted sparse score + dense score
        hybrid_score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix
        hybrid_candidate_idxs = biosyn.retrieve_candidate(
            score_matrix=hybrid_score_matrix, topk=5)

        # get predictions from dictionary; each row is indexed as (name, id)
        predictions = dictionary[hybrid_candidate_idxs].squeeze(0)
        output['predictions'] = [
            {'name': prediction[0], 'id': prediction[1]}
            for prediction in predictions
        ]

    print(output)
Esempio n. 2
0
def main(args):
    """Train the BioSyn rerank model with sparse, sentence and dense candidates.

    Steps, in order:

    1. Initialise logging and the random seed, and create ``args.output_dir``.
    2. Load the training dictionary and queries and keep their name columns.
    3. Load the BERT encoder/tokenizer, fit the sparse encoder on the
       dictionary names, and initialise the sparse and sentence weights.
    4. Compute the sparse and sentence score matrices and top-k candidates
       once, before the training loop.
    5. For every epoch, recompute the dense embeddings and candidates, push
       them into the dataset, and run one training pass.
    6. Save a checkpoint per epoch when ``args.save_checkpoint_all`` is set,
       and always save the model after the last epoch.

    Attributes read from ``args``: ``seed``, ``output_dir``,
    ``train_dictionary_path``, ``train_dir``, ``model_dir``, ``max_length``,
    ``use_cuda``, ``initial_sparse_weight``, ``initial_sent_weight``,
    ``learning_rate``, ``weight_decay``, ``topk``, ``dense_ratio``,
    ``sparse_ratio``, ``train_batch_size``, ``epoch`` and
    ``save_checkpoint_all``.

    Nothing is returned; progress and total training time are logged.
    """
    init_logging()
    init_seed(args.seed)
    print(args)

    # prepare for output
    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`.
    os.makedirs(args.output_dir, exist_ok=True)

    # load dictionary and queries
    train_dictionary = load_dictionary(
        dictionary_path=args.train_dictionary_path)
    train_queries = load_queries(data_dir=args.train_dir,
                                 filter_composite=True,
                                 filter_duplicate=True)

    # filter only names (first column of each array)
    names_in_train_dictionary = train_dictionary[:, 0]
    names_in_train_queries = train_queries[:, 0]

    # load BERT tokenizer, dense_encoder, sparse_encoder
    biosyn = BioSyn()
    encoder, tokenizer = biosyn.load_bert(
        path=args.model_dir,
        max_length=args.max_length,
        use_cuda=args.use_cuda,
    )
    # NOTE(review): the returned sparse encoder is not used directly here;
    # presumably biosyn keeps it for embed_sparse -- confirm.
    sparse_encoder = biosyn.train_sparse_encoder(
        corpus=names_in_train_dictionary)
    sparse_weight = biosyn.init_sparse_weight(
        initial_sparse_weight=args.initial_sparse_weight,
        use_cuda=args.use_cuda)

    # ------ sentence-encoder extension: begin ------

    # load sentence encoder and its weight
    # NOTE(review): sent_encoder is not referenced again in this function;
    # presumably biosyn keeps it for embed_sent -- confirm.
    sent_encoder = biosyn.init_sent_encoder()
    sent_weight = biosyn.init_sent_weight(
        initial_sent_weight=args.initial_sent_weight, use_cuda=args.use_cuda)

    # ------ sentence-encoder extension: end ------

    # load rerank model
    model = RerankNet(encoder=encoder,
                      learning_rate=args.learning_rate,
                      weight_decay=args.weight_decay,
                      sparse_weight=sparse_weight,
                      sent_weight=sent_weight,
                      use_cuda=args.use_cuda)

    # embed sparse representations for query and dictionary
    # Important! This is a one-time process because the sparse representation
    # never changes.
    LOGGER.info("Sparse embedding")
    train_query_sparse_embeds = biosyn.embed_sparse(
        names=names_in_train_queries)
    train_dict_sparse_embeds = biosyn.embed_sparse(
        names=names_in_train_dictionary)
    train_sparse_score_matrix = biosyn.get_score_matrix(
        query_embeds=train_query_sparse_embeds,
        dict_embeds=train_dict_sparse_embeds)
    train_sparse_candidate_idxs = biosyn.retrieve_candidate(
        score_matrix=train_sparse_score_matrix, topk=args.topk)

    # ------ sentence-encoder extension: begin ------

    # sentence embedding: computed once here, outside the epoch loop
    LOGGER.info("Sentence embedding")

    train_query_sent_embeds = biosyn.embed_sent(names=names_in_train_queries)
    train_dict_sent_embeds = biosyn.embed_sent(names=names_in_train_dictionary)
    train_sent_score_matrix = biosyn.get_score_matrix(
        query_embeds=train_query_sent_embeds,
        dict_embeds=train_dict_sent_embeds)
    train_sent_candidate_idxs = biosyn.retrieve_candidate(
        score_matrix=train_sent_score_matrix, topk=args.topk)

    # ------ sentence-encoder extension: end ------

    # prepare the training dataset and its data loader
    train_set = CandidateDataset(queries=train_queries,
                                 dicts=train_dictionary,
                                 tokenizer=tokenizer,
                                 topk=args.topk,
                                 d_ratio=args.dense_ratio,
                                 s_ratio=args.sparse_ratio,
                                 s_score_matrix=train_sparse_score_matrix,
                                 s_candidate_idxs=train_sparse_candidate_idxs,
                                 sent_score_matrix=train_sent_score_matrix,
                                 sent_candidate_idxs=train_sent_candidate_idxs)
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.train_batch_size,
        shuffle=True,
    )

    start = time.time()
    for epoch in range(1, args.epoch + 1):
        # embed dense representations for query and dictionary for train
        # Important! This is an iterative process because the dense
        # representation changes as the model is trained.
        LOGGER.info("Epoch {}/{}".format(epoch, args.epoch))
        LOGGER.info(
            "train_set dense embedding for iterative candidate retrieval")
        train_query_dense_embeds = biosyn.embed_dense(
            names=names_in_train_queries, show_progress=True)
        train_dict_dense_embeds = biosyn.embed_dense(
            names=names_in_train_dictionary, show_progress=True)
        train_dense_score_matrix = biosyn.get_score_matrix(
            query_embeds=train_query_dense_embeds,
            dict_embeds=train_dict_dense_embeds)
        train_dense_candidate_idxs = biosyn.retrieve_candidate(
            score_matrix=train_dense_score_matrix, topk=args.topk)
        # replace dense candidates in the train_set
        train_set.set_dense_candidate_idxs(
            d_candidate_idxs=train_dense_candidate_idxs)

        # train
        train_loss = train(args, data_loader=train_loader, model=model)
        LOGGER.info('loss/train_per_epoch={}/{}'.format(train_loss, epoch))

        # save model every epoch
        if args.save_checkpoint_all:
            checkpoint_dir = os.path.join(args.output_dir,
                                          "checkpoint_{}".format(epoch))
            os.makedirs(checkpoint_dir, exist_ok=True)
            biosyn.save_model(checkpoint_dir)

        # save model last epoch
        if epoch == args.epoch:
            biosyn.save_model(args.output_dir)

    end = time.time()
    # break the elapsed wall-clock seconds into hours / minutes / seconds
    training_time = end - start
    training_hour = int(training_time / 60 / 60)
    training_minute = int(training_time / 60 % 60)
    training_second = int(training_time % 60)
    LOGGER.info("Training Time!{} hours {} minutes {} seconds".format(
        training_hour, training_minute, training_second))