Esempio n. 1
0
def main(args):
    """Evaluate a BioSyn model and log the ``acc1`` / ``acc5`` results.

    Loads the dictionary and queries named in ``args``, builds a ``BioSyn``
    instance, loads its weights, runs ``evaluate`` and logs the ``'acc1'``
    and ``'acc5'`` entries of the returned result. When
    ``args.save_predictions`` is truthy, the whole result is written as
    JSON to ``<args.output_dir>/predictions_eval.json``.

    Args:
        args: Parsed options. Attributes read here: ``dictionary_path``,
            ``data_dir``, ``filter_composite``, ``filter_duplicate``,
            ``max_length``, ``use_cuda``, ``model_name_or_path``, ``topk``,
            ``save_predictions`` and ``output_dir``.
    """
    init_logging()
    print(args)

    # Evaluation inputs: the reference dictionary and the queries.
    dictionary = load_dictionary(dictionary_path=args.dictionary_path)
    queries = load_queries(
        data_dir=args.data_dir,
        filter_composite=args.filter_composite,
        filter_duplicate=args.filter_duplicate,
    )

    # Build the model wrapper first, then load the weights into it.
    model = BioSyn(max_length=args.max_length, use_cuda=args.use_cuda)
    model.load_model(model_name_or_path=args.model_name_or_path)

    scores = evaluate(
        biosyn=model,
        eval_dictionary=dictionary,
        eval_queries=queries,
        topk=args.topk,
    )

    # Same two log lines, in the same order, driven by a small table.
    for template, key in (("acc@1={}", 'acc1'), ("acc@5={}", 'acc5')):
        LOGGER.info(template.format(scores[key]))

    if not args.save_predictions:
        return

    predictions_path = os.path.join(args.output_dir, "predictions_eval.json")
    with open(predictions_path, 'w') as f:
        json.dump(scores, f, indent=2)
Esempio n. 2
0
def main(args):
    """Embed a single mention with BioSyn and print the requested details.

    The mention is preprocessed with ``TextPreprocess`` and embedded with
    both the sparse and the dense encoder. The printed dict always holds
    the raw ``'mention'``. It additionally holds the two embeddings when
    ``args.show_embeddings`` is truthy, and a ``'predictions'`` list of
    ``{'name', 'id'}`` dicts (top 5 hybrid-score candidates) when
    ``args.show_predictions`` is truthy.

    If predictions are requested without ``args.dictionary_path``, a hint
    is printed and the function returns without printing the output dict.

    Args:
        args: Parsed options. Attributes read here: ``use_cuda``,
            ``model_name_or_path``, ``mention``, ``show_embeddings``,
            ``show_predictions`` and ``dictionary_path``.
    """
    # load biosyn model
    biosyn = BioSyn(max_length=25, use_cuda=args.use_cuda)
    biosyn.load_model(model_name_or_path=args.model_name_or_path)

    # preprocess mention
    mention = TextPreprocess().run(args.mention)

    # embed mention (the encoders take a batch, hence the one-element list)
    mention_sparse_embeds = biosyn.embed_sparse(names=[mention])
    mention_dense_embeds = biosyn.embed_dense(names=[mention])

    output = {'mention': args.mention}

    if args.show_embeddings:
        # squeeze(0) drops the leading batch axis of the one-mention batch
        output['mention_sparse_embeds'] = mention_sparse_embeds.squeeze(0)
        output['mention_dense_embeds'] = mention_dense_embeds.squeeze(0)

    if args.show_predictions:
        if args.dictionary_path is None:
            print('insert the dictionary path')
            return

        # cache or load dictionary
        dictionary, dict_sparse_embeds, dict_dense_embeds = cache_or_load_dictionary(
            biosyn, args.model_name_or_path, args.dictionary_path)

        # calculate score matrices and get the top 5 hybrid candidates
        sparse_score_matrix = biosyn.get_score_matrix(
            query_embeds=mention_sparse_embeds, dict_embeds=dict_sparse_embeds)
        dense_score_matrix = biosyn.get_score_matrix(
            query_embeds=mention_dense_embeds, dict_embeds=dict_dense_embeds)
        sparse_weight = biosyn.get_sparse_weight().item()
        hybrid_score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix
        hybrid_candidate_idxs = biosyn.retrieve_candidate(
            score_matrix=hybrid_score_matrix, topk=5)

        # get predictions from dictionary; each row is indexed as (name, id)
        predictions = dictionary[hybrid_candidate_idxs].squeeze(0)
        output['predictions'] = [
            {'name': prediction[0], 'id': prediction[1]}
            for prediction in predictions
        ]

    print(output)
Esempio n. 3
0
def _strip_ontology_prefix(identifier):
    """Return *identifier* with every 'MESH:' and 'OMIM:' substring removed."""
    return identifier.replace('MESH:', '').replace('OMIM:', '')


def _load_hierarchy(hierarchy_path):
    """Read a tab-separated hierarchy file into a nested mapping.

    Every line with at least three tab-separated fields contributes two
    entries: ``tree_map[a][b] = value`` and ``tree_map[b][a] = -value``,
    where ``a`` / ``b`` are the first two fields with the 'MESH:' / 'OMIM:'
    prefixes removed and ``value`` is the third field parsed as ``int``.
    Lines with fewer fields are printed and otherwise ignored.

    Raises:
        ValueError: If the third field of a line is not an integer.
    """
    tree_map = defaultdict(dict)
    with open(hierarchy_path) as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would eat
            # the last digit of a final line that has no trailing newline.
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 2:
                value = int(fields[2])
                first = _strip_ontology_prefix(fields[0])
                second = _strip_ontology_prefix(fields[1])
                tree_map[first][second] = value
                tree_map[second][first] = -1 * value
            else:
                print(fields)
    return tree_map


def main(args):
    """Evaluate a BioSyn model and optionally analyse and save its errors.

    Runs ``evaluate`` on the dictionary and queries named in ``args`` and
    logs the ``'acc1'`` / ``'acc5'`` entries of the result. When
    ``args.hierarchy`` is set, the hierarchy file is loaded and passed to
    ``getLCAStatistics`` together with the dictionary and the errors. When
    ``args.save_predictions`` is truthy, the result is written as JSON and
    the errors as a tab-separated table into ``args.output_dir``.

    Args:
        args: Parsed options. Attributes read here: ``dictionary_path``,
            ``data_dir``, ``filter_composite``, ``filter_duplicate``,
            ``model_dir``, ``max_length``, ``use_cuda``, ``topk``,
            ``score_mode``, ``hierarchy``, ``save_predictions`` and
            ``output_dir``.
    """
    init_logging()
    print(args)

    # load dictionary and data
    eval_dictionary = load_dictionary(dictionary_path=args.dictionary_path)
    eval_queries = load_queries(data_dir=args.data_dir,
                                filter_composite=args.filter_composite,
                                filter_duplicate=args.filter_duplicate)

    biosyn = BioSyn().load_model(path=args.model_dir,
                                 max_length=args.max_length,
                                 use_cuda=args.use_cuda)

    result_evalset, errors_evalset = evaluate(biosyn=biosyn,
                                              eval_dictionary=eval_dictionary,
                                              eval_queries=eval_queries,
                                              topk=args.topk,
                                              score_mode=args.score_mode)

    # load hierarchy (tab-separated lines: id, id, integer value)
    if args.hierarchy:
        tree_map = _load_hierarchy(args.hierarchy)
        # Debug output for one fixed id. Indexing the defaultdict also
        # creates the key when it is absent.
        print(tree_map['D018256'])
        getLCAStatistics(eval_dictionary, tree_map, errors_evalset)

    LOGGER.info("acc@1={}".format(result_evalset['acc1']))
    LOGGER.info("acc@5={}".format(result_evalset['acc5']))

    if args.save_predictions:
        output_file = os.path.join(args.output_dir, "predictions_eval.json")
        with open(output_file, 'w') as f:
            json.dump(result_evalset, f, indent=2)

        df = pd.DataFrame.from_records(
            errors_evalset, columns=['true', 'true name', 'pred', 'pred name'])
        # NOTE(review): the file carries a .json suffix but holds
        # tab-separated text; the name is kept so existing consumers
        # still find it.
        df.to_csv(os.path.join(args.output_dir, "errors_eval.json"),
                  sep='\t',
                  index=False)
Esempio n. 4
0
def main(args):
    """Train a ``RerankNet`` on BioSyn candidates and save the model.

    Steps, in order:

    1. Initialise logging and seeding, and create ``args.output_dir``.
    2. Load the training dictionary and queries, keeping column 0 of each
       as the list of names.
    3. Load the BERT encoder/tokenizer, train the sparse encoder on the
       dictionary names, and initialise the sparse and sentence weights.
    4. Compute the sparse and sentence score matrices and their top-k
       candidates once, before the epoch loop.
    5. For each epoch, recompute the dense embeddings and candidates,
       hand them to the dataset, and run ``train``. A checkpoint is
       written per epoch when ``args.save_checkpoint_all`` is truthy, and
       the final model is saved to ``args.output_dir`` after the last
       epoch.

    Args:
        args: Parsed options. Attributes read here: ``seed``,
            ``output_dir``, ``train_dictionary_path``, ``train_dir``,
            ``model_dir``, ``max_length``, ``use_cuda``,
            ``initial_sparse_weight``, ``initial_sent_weight``,
            ``learning_rate``, ``weight_decay``, ``topk``,
            ``dense_ratio``, ``sparse_ratio``, ``train_batch_size``,
            ``epoch`` and ``save_checkpoint_all``. ``args`` is also
            forwarded to ``train``, which may read further attributes.
    """
    init_logging()
    init_seed(args.seed)
    print(args)

    # prepare for output; exist_ok avoids the check-then-create race
    os.makedirs(args.output_dir, exist_ok=True)

    # load dictionary and queries
    train_dictionary = load_dictionary(
        dictionary_path=args.train_dictionary_path)
    train_queries = load_queries(data_dir=args.train_dir,
                                 filter_composite=True,
                                 filter_duplicate=True)

    # filter only names (column 0 of each array)
    names_in_train_dictionary = train_dictionary[:, 0]
    names_in_train_queries = train_queries[:, 0]

    # load BERT tokenizer, dense_encoder, sparse_encoder
    biosyn = BioSyn()
    encoder, tokenizer = biosyn.load_bert(
        path=args.model_dir,
        max_length=args.max_length,
        use_cuda=args.use_cuda,
    )
    # The return value is not used in this function.
    # NOTE(review): presumably the trained sparse encoder is kept on
    # `biosyn` for the embed_sparse calls below - confirm.
    biosyn.train_sparse_encoder(corpus=names_in_train_dictionary)
    sparse_weight = biosyn.init_sparse_weight(
        initial_sparse_weight=args.initial_sparse_weight,
        use_cuda=args.use_cuda)

    # ------ sentence-encoder extension ------

    # load sentence encoder; the return value is not used in this function.
    # NOTE(review): presumably kept on `biosyn` for embed_sent - confirm.
    biosyn.init_sent_encoder()
    sent_weight = biosyn.init_sent_weight(
        initial_sent_weight=args.initial_sent_weight, use_cuda=args.use_cuda)

    # ------ end of sentence-encoder extension ------

    # load rerank model
    model = RerankNet(encoder=encoder,
                      learning_rate=args.learning_rate,
                      weight_decay=args.weight_decay,
                      sparse_weight=sparse_weight,
                      sent_weight=sent_weight,
                      use_cuda=args.use_cuda)

    # embed sparse representations for query and dictionary
    # Important! This is a one-time process because the sparse
    # representation never changes.
    LOGGER.info("Sparse embedding")
    train_query_sparse_embeds = biosyn.embed_sparse(
        names=names_in_train_queries)
    train_dict_sparse_embeds = biosyn.embed_sparse(
        names=names_in_train_dictionary)
    train_sparse_score_matrix = biosyn.get_score_matrix(
        query_embeds=train_query_sparse_embeds,
        dict_embeds=train_dict_sparse_embeds)
    train_sparse_candidate_idxs = biosyn.retrieve_candidate(
        score_matrix=train_sparse_score_matrix, topk=args.topk)

    # ------ sentence-encoder extension ------

    # sentence embedding, not for training(?); computed once, like the
    # sparse scores above
    LOGGER.info("Sentence embedding")

    train_query_sent_embeds = biosyn.embed_sent(names=names_in_train_queries)
    train_dict_sent_embeds = biosyn.embed_sent(names=names_in_train_dictionary)
    train_sent_score_matrix = biosyn.get_score_matrix(
        query_embeds=train_query_sent_embeds,
        dict_embeds=train_dict_sent_embeds)
    train_sent_candidate_idxs = biosyn.retrieve_candidate(
        score_matrix=train_sent_score_matrix, topk=args.topk)

    # ------ end of sentence-encoder extension ------

    # prepare the data loader for training
    train_set = CandidateDataset(queries=train_queries,
                                 dicts=train_dictionary,
                                 tokenizer=tokenizer,
                                 topk=args.topk,
                                 d_ratio=args.dense_ratio,
                                 s_ratio=args.sparse_ratio,
                                 s_score_matrix=train_sparse_score_matrix,
                                 s_candidate_idxs=train_sparse_candidate_idxs,
                                 sent_score_matrix=train_sent_score_matrix,
                                 sent_candidate_idxs=train_sent_candidate_idxs)
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.train_batch_size,
        shuffle=True,
    )

    start = time.time()
    for epoch in range(1, args.epoch + 1):
        # embed dense representations for query and dictionary for train
        # Important! This is an iterative process because the dense
        # representation changes as the model is trained.
        LOGGER.info("Epoch {}/{}".format(epoch, args.epoch))
        LOGGER.info(
            "train_set dense embedding for iterative candidate retrieval")
        train_query_dense_embeds = biosyn.embed_dense(
            names=names_in_train_queries, show_progress=True)
        train_dict_dense_embeds = biosyn.embed_dense(
            names=names_in_train_dictionary, show_progress=True)
        train_dense_score_matrix = biosyn.get_score_matrix(
            query_embeds=train_query_dense_embeds,
            dict_embeds=train_dict_dense_embeds)
        train_dense_candidate_idxs = biosyn.retrieve_candidate(
            score_matrix=train_dense_score_matrix, topk=args.topk)
        # replace dense candidates in the train_set
        train_set.set_dense_candidate_idxs(
            d_candidate_idxs=train_dense_candidate_idxs)

        # train
        train_loss = train(args, data_loader=train_loader, model=model)
        LOGGER.info('loss/train_per_epoch={}/{}'.format(train_loss, epoch))

        # save model every epoch
        if args.save_checkpoint_all:
            checkpoint_dir = os.path.join(args.output_dir,
                                          "checkpoint_{}".format(epoch))
            os.makedirs(checkpoint_dir, exist_ok=True)
            biosyn.save_model(checkpoint_dir)

        # save model last epoch
        if epoch == args.epoch:
            biosyn.save_model(args.output_dir)

    # Break the elapsed wall-clock time into whole hours/minutes/seconds.
    elapsed_seconds = int(time.time() - start)
    total_minutes, training_second = divmod(elapsed_seconds, 60)
    training_hour, training_minute = divmod(total_minutes, 60)
    LOGGER.info("Training Time!{} hours {} minutes {} seconds".format(
        training_hour, training_minute, training_second))
Esempio n. 5
0
def main(args):
    """Evaluate a BioSyn model, with or without cluster linking.

    Without ``args.use_cluster_linking`` the result of ``evaluate`` is
    indexed by string keys: ``acc1``, ``acc2``, ``acc4`` ... are logged
    up to ``args.topk`` (at most ``acc32``), and the result is optionally
    saved as one JSON file.

    With ``args.use_cluster_linking`` the result is iterated as a sequence
    of dicts providing ``k_candidates`` and ``accuracy``; ``n_entities``
    and ``n_mentions`` are read from the first one. One accuracy is
    logged per element. Optionally each element is saved to its own JSON
    file, plus one overview file.

    Saved file names are prefixed with the current Unix timestamp in
    whole seconds, so repeated runs do not overwrite each other.

    Args:
        args: Parsed options. Attributes read here: ``dictionary_path``,
            ``data_dir``, ``filter_composite``, ``filter_duplicate``,
            ``model_dir``, ``max_length``, ``normalize_vecs``,
            ``use_cuda``, ``topk``, ``output_dir``, ``score_mode``,
            ``type_given``, ``use_cluster_linking``, ``directed_graph``,
            ``debug_mode`` and ``save_predictions``.
    """
    # Function-scope imports replace the inline __import__() calls that
    # were embedded in the f-strings.
    import calendar
    import time

    init_logging()
    print(args)

    # load dictionary and data
    eval_dictionary = load_dictionary(dictionary_path=args.dictionary_path)
    eval_queries = load_queries(data_dir=args.data_dir,
                                filter_composite=args.filter_composite,
                                filter_duplicate=args.filter_duplicate)

    biosyn = BioSyn().load_model(path=args.model_dir,
                                 max_length=args.max_length,
                                 normalize_vecs=args.normalize_vecs,
                                 use_cuda=args.use_cuda)

    result_evalset = evaluate(biosyn=biosyn,
                              eval_dictionary=eval_dictionary,
                              eval_queries=eval_queries,
                              topk=args.topk,
                              output_dir=args.output_dir,
                              score_mode=args.score_mode,
                              type_given=args.type_given,
                              use_cluster_linking=args.use_cluster_linking,
                              directed=args.directed_graph,
                              debug_mode=args.debug_mode)

    if not args.use_cluster_linking:
        # Report accuracies at powers of two: acc@1, acc@2, ... acc@32
        # (range(6) stops at 2**5).
        for i in range(6):
            accuracy_level = 2**i

            # accuracies above args.topk wouldn't be available
            if accuracy_level > args.topk:
                break

            LOGGER.info("acc@{}={}".format(
                accuracy_level, result_evalset['acc' + str(accuracy_level)]))

        if args.save_predictions:
            timestamp = calendar.timegm(time.gmtime())
            output_file = os.path.join(
                args.output_dir, f"{timestamp}_predictions_eval.json")
            with open(output_file, 'w') as f:
                json.dump(result_evalset, f, indent=2)
            print(f"\nPredictions saved at: {output_file}")
    else:
        timestamp = calendar.timegm(time.gmtime())
        # Common stem shared by the per-k files and the overview file.
        output_file_name = os.path.join(
            args.output_dir, f"{timestamp}_predictions_eval")
        result_overview = {
            'n_entities': result_evalset[0]['n_entities'],
            'n_mentions': result_evalset[0]['n_mentions'],
            'directed': args.directed_graph
        }
        for results in result_evalset:
            k = results['k_candidates']
            result_overview[f'accuracy@k{k}'] = results['accuracy']
            LOGGER.info(f"accuracy@k{k} = {results['accuracy']}")
            if args.save_predictions:
                output_file = f'{output_file_name}-{k}.json'
                with open(output_file, 'w') as f:
                    json.dump(results, f, indent=2)
                print(f"\nPredictions @k{k} saved at: {output_file}")
        if args.save_predictions:
            with open(f'{output_file_name}.json', 'w') as f:
                json.dump(result_overview, f, indent=2)
            print(
                f"\nPredictions overview saved at: {output_file_name}.json"
            )
Esempio n. 6
0
    output = {'predictions': []}

    for prediction in predictions:
        predicted_name = prediction[0]
        predicted_id = prediction[1]
        output['predictions'].append({
            'name': predicted_name,
            'id': predicted_id
        })

    return output


# Module-level setup, executed once when this file is run or imported.
# Build the BioSyn model from args.model_dir with a fixed max_length of 25.
biosyn = BioSyn().load_model(
    path=args.model_dir,
    max_length=25,
    use_cuda=args.use_cuda,
)

# Obtain the dictionary together with its sparse and dense embeddings
# (as the helper's name suggests, either from a cache or freshly built).
(
    dictionary,
    dict_sparse_embeds,
    dict_dense_embeds,
) = cache_or_load_dictionary()


class MainHandler(tornado.web.RequestHandler):
    """Request handler whose GET method renders the index template."""

    def get(self):
        """Render ``./template/index.html`` as the response."""
        index_template = "./template/index.html"
        self.render(index_template)


class NormalizeHandler(tornado.web.RequestHandler):
    def get(self):
        string = self.get_argument('string', '')
        logging.info('get!{}'.format({
Esempio n. 7
0
    predictions = dictionary[hybrid_candidate_idxs].squeeze(0)
    output = {'predictions': []}

    for prediction in predictions:
        predicted_name = prediction[0]
        predicted_id = prediction[1]
        output['predictions'].append({
            'name': predicted_name,
            'id': predicted_id
        })

    return output


# Module-level setup, executed once when this file is run or imported.
# Create the BioSyn wrapper (fixed max_length of 25), then load its weights.
biosyn = BioSyn(
    use_cuda=args.use_cuda,
    max_length=25,
)
biosyn.load_model(
    model_name_or_path=args.model_name_or_path,
)

# Obtain the dictionary together with its sparse and dense embeddings
# (as the helper's name suggests, either from a cache or freshly built).
(
    dictionary,
    dict_sparse_embeds,
    dict_dense_embeds,
) = cache_or_load_dictionary()


class MainHandler(tornado.web.RequestHandler):
    """Request handler whose GET method renders the index template."""

    def get(self):
        """Render ``./template/index.html`` as the response."""
        index_template = "./template/index.html"
        self.render(index_template)


class NormalizeHandler(tornado.web.RequestHandler):
    def get(self):
        string = self.get_argument('string', '')