def main(args):
    """Evaluate a BioSyn model on an evaluation set and report accuracy.

    Loads the dictionary and the queries, builds a ``BioSyn`` instance,
    loads its weights, runs ``evaluate`` and logs acc@1 and acc@5.  When
    ``args.save_predictions`` is truthy the full evaluation result is
    written as JSON to ``<args.output_dir>/predictions_eval.json``.

    Args:
        args: Parsed command-line options.  The attributes read here are
            ``dictionary_path``, ``data_dir``, ``filter_composite``,
            ``filter_duplicate``, ``max_length``, ``use_cuda``,
            ``model_name_or_path``, ``topk``, ``save_predictions`` and
            ``output_dir``.
    """
    init_logging()
    print(args)

    # Evaluation inputs: the dictionary and the queries to resolve.
    dictionary = load_dictionary(dictionary_path=args.dictionary_path)
    queries = load_queries(
        data_dir=args.data_dir,
        filter_composite=args.filter_composite,
        filter_duplicate=args.filter_duplicate,
    )

    # Build the model wrapper, then load its weights.
    biosyn = BioSyn(max_length=args.max_length, use_cuda=args.use_cuda)
    biosyn.load_model(model_name_or_path=args.model_name_or_path)

    result_evalset = evaluate(
        biosyn=biosyn,
        eval_dictionary=dictionary,
        eval_queries=queries,
        topk=args.topk,
    )

    # Log the two headline metrics, acc@1 first and acc@5 second.
    for message, metric_key in (("acc@1={}", 'acc1'), ("acc@5={}", 'acc5')):
        LOGGER.info(message.format(result_evalset[metric_key]))

    if not args.save_predictions:
        return

    predictions_path = os.path.join(args.output_dir, "predictions_eval.json")
    with open(predictions_path, 'w') as handle:
        json.dump(result_evalset, handle, indent=2)
def main(args):
    """Embed one mention with a BioSyn model and print the result.

    The mention is preprocessed, embedded with both the sparse and the
    dense encoder, and a result dict is printed.  The dict always holds
    the raw mention; it additionally holds the two embeddings when
    ``args.show_embeddings`` is set, and the top-5 dictionary candidates
    when ``args.show_predictions`` is set.

    Args:
        args: Parsed command-line options.  The attributes read here are
            ``use_cuda``, ``model_name_or_path``, ``mention``,
            ``show_embeddings``, ``show_predictions`` and
            ``dictionary_path``.

    Returns:
        None.  If predictions are requested without a dictionary path, a
        hint is printed and the function returns before printing the
        result dict.
    """
    # Load the BioSyn model (max_length is fixed at 25 in this script).
    biosyn = BioSyn(max_length=25, use_cuda=args.use_cuda)
    biosyn.load_model(model_name_or_path=args.model_name_or_path)

    # Preprocess the mention, then embed it with both encoders.
    mention = TextPreprocess().run(args.mention)
    mention_sparse_embeds = biosyn.embed_sparse(names=[mention])
    mention_dense_embeds = biosyn.embed_dense(names=[mention])

    # The output reports the mention exactly as the user supplied it.
    output = {'mention': args.mention}

    if args.show_embeddings:
        output['mention_sparse_embeds'] = mention_sparse_embeds.squeeze(0)
        output['mention_dense_embeds'] = mention_dense_embeds.squeeze(0)

    if args.show_predictions:
        # Identity comparison is the correct test for a missing option.
        if args.dictionary_path is None:
            print('insert the dictionary path')
            return

        # Cache or load the dictionary together with its embeddings.
        dictionary, dict_sparse_embeds, dict_dense_embeds = cache_or_load_dictionary(
            biosyn, args.model_name_or_path, args.dictionary_path)

        # Score the mention against every dictionary entry.
        sparse_score_matrix = biosyn.get_score_matrix(
            query_embeds=mention_sparse_embeds,
            dict_embeds=dict_sparse_embeds)
        dense_score_matrix = biosyn.get_score_matrix(
            query_embeds=mention_dense_embeds,
            dict_embeds=dict_dense_embeds)

        # Hybrid score = weighted sparse score + dense score; keep the top 5.
        sparse_weight = biosyn.get_sparse_weight().item()
        hybrid_score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix
        hybrid_candidate_idxs = biosyn.retrieve_candidate(
            score_matrix=hybrid_score_matrix,
            topk=5)

        # Each selected dictionary row is read as (name, id).
        predictions = dictionary[hybrid_candidate_idxs].squeeze(0)
        output['predictions'] = [
            {'name': prediction[0], 'id': prediction[1]}
            for prediction in predictions
        ]

    print(output)
def _strip_id_prefix(identifier):
    """Return ``identifier`` with every 'MESH:' and 'OMIM:' marker removed."""
    return identifier.replace('MESH:', '').replace('OMIM:', '')


def _load_hierarchy(path):
    """Parse a tab-separated hierarchy file into a nested distance map.

    A line with more than two tab-separated fields is read as
    ``a<TAB>b<TAB>n``; it produces ``tree_map[a][b] = n`` and
    ``tree_map[b][a] = -n`` after the 'MESH:' / 'OMIM:' markers are
    removed from ``a`` and ``b``.  Lines with fewer fields are printed
    and otherwise ignored.

    Args:
        path: Location of the hierarchy file.

    Returns:
        A ``defaultdict(dict)`` mapping identifier -> identifier -> int.

    Raises:
        ValueError: If the third field of a line is not an integer.
    """
    tree_map = defaultdict(dict)
    with open(path) as f:
        for line in f:
            # Strip only the line terminator.  Slicing off the last
            # character unconditionally would truncate the distance on a
            # final line that has no trailing newline.
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 2:
                distance = int(fields[2])
                first = _strip_id_prefix(fields[0])
                second = _strip_id_prefix(fields[1])
                tree_map[first][second] = distance
                tree_map[second][first] = -1 * distance
            else:
                print(fields)
    return tree_map


def main(args):
    """Evaluate a BioSyn model and optionally report hierarchy statistics.

    Loads the dictionary and queries, loads the model, runs ``evaluate``
    (which here returns both metrics and a list of errors) and logs
    acc@1 and acc@5.  When ``args.hierarchy`` is set, the hierarchy file
    is parsed and passed to ``getLCAStatistics`` with the errors.  When
    ``args.save_predictions`` is set, the metrics are written as JSON
    and the errors as a tab-separated table.

    Args:
        args: Parsed command-line options.  The attributes read here are
            ``dictionary_path``, ``data_dir``, ``filter_composite``,
            ``filter_duplicate``, ``model_dir``, ``max_length``,
            ``use_cuda``, ``topk``, ``score_mode``, ``hierarchy``,
            ``save_predictions`` and ``output_dir``.
    """
    init_logging()
    print(args)

    # load dictionary and data
    eval_dictionary = load_dictionary(dictionary_path=args.dictionary_path)
    eval_queries = load_queries(data_dir=args.data_dir,
                                filter_composite=args.filter_composite,
                                filter_duplicate=args.filter_duplicate)

    biosyn = BioSyn().load_model(path=args.model_dir,
                                 max_length=args.max_length,
                                 use_cuda=args.use_cuda)

    result_evalset, errors_evalset = evaluate(biosyn=biosyn,
                                              eval_dictionary=eval_dictionary,
                                              eval_queries=eval_queries,
                                              topk=args.topk,
                                              score_mode=args.score_mode)

    # NOTE(review): the nesting below is reconstructed.  tree_map exists
    # only when a hierarchy is given, so the debug print and the LCA
    # statistics are assumed to belong inside this branch -- confirm.
    if args.hierarchy:
        tree_map = _load_hierarchy(args.hierarchy)
        # Debug output kept from the original; on a defaultdict this
        # lookup also creates an empty entry when the key is absent.
        print(tree_map['D018256'])
        getLCAStatistics(eval_dictionary, tree_map, errors_evalset)

    LOGGER.info("acc@1={}".format(result_evalset['acc1']))
    LOGGER.info("acc@5={}".format(result_evalset['acc5']))

    # NOTE(review): the error table is assumed to be written only when
    # predictions are saved -- confirm against the original layout.
    if args.save_predictions:
        output_file = os.path.join(args.output_dir, "predictions_eval.json")
        with open(output_file, 'w') as f:
            json.dump(result_evalset, f, indent=2)

        # The errors file keeps its ".json" name although the content is
        # tab-separated text; the name is unchanged so existing consumers
        # still find it.
        df = pd.DataFrame.from_records(
            errors_evalset,
            columns=['true', 'true name', 'pred', 'pred name'])
        df.to_csv(os.path.join(args.output_dir, "errors_eval.json"),
                  sep='\t',
                  index=False)
def main(args):
    """Train the rerank model with sparse, sentence and dense candidates.

    Sparse and sentence-level candidates are computed once before the
    loop.  Dense candidates are recomputed at the start of every epoch,
    because the dense encoder is the part being trained.  The model is
    saved after the last epoch, and after every epoch too when
    ``args.save_checkpoint_all`` is set.  The total wall-clock training
    time is logged at the end.

    Args:
        args: Parsed command-line options.  The attributes read here are
            ``seed``, ``output_dir``, ``train_dictionary_path``,
            ``train_dir``, ``model_dir``, ``max_length``, ``use_cuda``,
            ``initial_sparse_weight``, ``initial_sent_weight``,
            ``learning_rate``, ``weight_decay``, ``topk``,
            ``dense_ratio``, ``sparse_ratio``, ``train_batch_size``,
            ``epoch`` and ``save_checkpoint_all``.
    """
    init_logging()
    init_seed(args.seed)
    print(args)

    # The output directory must exist before any model is saved.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Training data: dictionary entries and queries.
    train_dictionary = load_dictionary(
        dictionary_path=args.train_dictionary_path)
    train_queries = load_queries(
        data_dir=args.train_dir,
        filter_composite=True,
        filter_duplicate=True,
    )

    # Column 0 holds the surface names in both arrays.
    dict_names = train_dictionary[:, 0]
    query_names = train_queries[:, 0]

    # BERT encoder and tokenizer, plus the sparse encoder and its weight.
    biosyn = BioSyn()
    encoder, tokenizer = biosyn.load_bert(
        path=args.model_dir,
        max_length=args.max_length,
        use_cuda=args.use_cuda,
    )
    sparse_encoder = biosyn.train_sparse_encoder(corpus=dict_names)
    sparse_weight = biosyn.init_sparse_weight(
        initial_sparse_weight=args.initial_sparse_weight,
        use_cuda=args.use_cuda,
    )

    # Sentence encoder and its weight.
    sent_encoder = biosyn.init_sent_encoder()
    sent_weight = biosyn.init_sent_weight(
        initial_sent_weight=args.initial_sent_weight,
        use_cuda=args.use_cuda,
    )

    # The rerank model combines the encoder with both learned weights.
    model = RerankNet(
        encoder=encoder,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        sparse_weight=sparse_weight,
        sent_weight=sent_weight,
        use_cuda=args.use_cuda,
    )

    # Sparse candidates are computed a single time, outside the epoch loop.
    LOGGER.info("Sparse embedding")
    query_sparse = biosyn.embed_sparse(names=query_names)
    dict_sparse = biosyn.embed_sparse(names=dict_names)
    sparse_scores = biosyn.get_score_matrix(
        query_embeds=query_sparse, dict_embeds=dict_sparse)
    sparse_candidates = biosyn.retrieve_candidate(
        score_matrix=sparse_scores, topk=args.topk)

    # Sentence-level candidates are also computed once.
    LOGGER.info("Sentence embedding")
    query_sent = biosyn.embed_sent(names=query_names)
    dict_sent = biosyn.embed_sent(names=dict_names)
    sent_scores = biosyn.get_score_matrix(
        query_embeds=query_sent, dict_embeds=dict_sent)
    sent_candidates = biosyn.retrieve_candidate(
        score_matrix=sent_scores, topk=args.topk)

    # Dataset and loader; dense candidates are injected per epoch below.
    train_set = CandidateDataset(
        queries=train_queries,
        dicts=train_dictionary,
        tokenizer=tokenizer,
        topk=args.topk,
        d_ratio=args.dense_ratio,
        s_ratio=args.sparse_ratio,
        s_score_matrix=sparse_scores,
        s_candidate_idxs=sparse_candidates,
        sent_score_matrix=sent_scores,
        sent_candidate_idxs=sent_candidates,
    )
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.train_batch_size,
        shuffle=True,
    )

    start = time.time()
    for epoch in range(1, args.epoch + 1):
        LOGGER.info("Epoch {}/{}".format(epoch, args.epoch))

        # Dense candidates are refreshed every epoch.
        LOGGER.info(
            "train_set dense embedding for iterative candidate retrieval")
        query_dense = biosyn.embed_dense(
            names=query_names, show_progress=True)
        dict_dense = biosyn.embed_dense(
            names=dict_names, show_progress=True)
        dense_scores = biosyn.get_score_matrix(
            query_embeds=query_dense, dict_embeds=dict_dense)
        dense_candidates = biosyn.retrieve_candidate(
            score_matrix=dense_scores, topk=args.topk)
        train_set.set_dense_candidate_idxs(
            d_candidate_idxs=dense_candidates)

        # One pass over the training data.
        train_loss = train(args, data_loader=train_loader, model=model)
        LOGGER.info('loss/train_per_epoch={}/{}'.format(train_loss, epoch))

        # Optional per-epoch checkpoint.
        if args.save_checkpoint_all:
            checkpoint_dir = os.path.join(
                args.output_dir, "checkpoint_{}".format(epoch))
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            biosyn.save_model(checkpoint_dir)

        # The final model always lands in the output directory.
        if epoch == args.epoch:
            biosyn.save_model(args.output_dir)

    # Report the elapsed wall-clock time as hours / minutes / seconds.
    elapsed = time.time() - start
    hours = int(elapsed / 60 / 60)
    minutes = int(elapsed / 60 % 60)
    seconds = int(elapsed % 60)
    LOGGER.info("Training Time!{} hours {} minutes {} seconds".format(
        hours, minutes, seconds))
def main(args):
    """Evaluate a BioSyn model, with or without cluster linking.

    Without cluster linking, ``evaluate`` returns a mapping of metrics;
    acc@k is logged for k = 1, 2, 4, ... up to ``args.topk`` (at most
    32), and the result is optionally saved to one JSON file.

    With cluster linking, ``evaluate`` returns a sequence of per-k result
    dicts; each one is logged and optionally saved to its own JSON file,
    followed by an overview file that collects the accuracies.

    Saved files are prefixed with the current UTC epoch time in seconds.

    Args:
        args: Parsed command-line options.  The attributes read here are
            ``dictionary_path``, ``data_dir``, ``filter_composite``,
            ``filter_duplicate``, ``model_dir``, ``max_length``,
            ``normalize_vecs``, ``use_cuda``, ``topk``, ``output_dir``,
            ``score_mode``, ``type_given``, ``use_cluster_linking``,
            ``directed_graph``, ``debug_mode`` and ``save_predictions``.
    """
    # Function-scope imports replace the inline __import__() calls that
    # used to be embedded in the f-strings.
    import calendar
    import time

    init_logging()
    print(args)

    # load dictionary and data
    eval_dictionary = load_dictionary(dictionary_path=args.dictionary_path)
    eval_queries = load_queries(data_dir=args.data_dir,
                                filter_composite=args.filter_composite,
                                filter_duplicate=args.filter_duplicate)

    biosyn = BioSyn().load_model(path=args.model_dir,
                                 max_length=args.max_length,
                                 normalize_vecs=args.normalize_vecs,
                                 use_cuda=args.use_cuda)

    result_evalset = evaluate(biosyn=biosyn,
                              eval_dictionary=eval_dictionary,
                              eval_queries=eval_queries,
                              topk=args.topk,
                              output_dir=args.output_dir,
                              score_mode=args.score_mode,
                              type_given=args.type_given,
                              use_cluster_linking=args.use_cluster_linking,
                              directed=args.directed_graph,
                              debug_mode=args.debug_mode)

    if not args.use_cluster_linking:
        # Report accuracies at powers of two: acc@1, acc@2, ... acc@32.
        # NOTE(review): the previous comment promised acc@64, but
        # range(6) stops at 2**5 == 32; the code is left as it was.
        for i in range(6):
            accuracy_level = 2**i
            # accuracies above args.topk wouldn't be available
            if accuracy_level > args.topk:
                break
            LOGGER.info("acc@{}={}".format(
                accuracy_level, result_evalset['acc' + str(accuracy_level)]))

        if args.save_predictions:
            timestamp = calendar.timegm(time.gmtime())
            output_file = os.path.join(
                args.output_dir, f"{timestamp}_predictions_eval.json")
            with open(output_file, 'w') as f:
                json.dump(result_evalset, f, indent=2)
            print(f"\nPredictions saved at: {output_file}")
    else:
        # One timestamp is shared by the per-k files and the overview.
        timestamp = calendar.timegm(time.gmtime())
        output_file_name = os.path.join(
            args.output_dir, f"{timestamp}_predictions_eval")

        result_overview = {
            'n_entities': result_evalset[0]['n_entities'],
            'n_mentions': result_evalset[0]['n_mentions'],
            'directed': args.directed_graph
        }

        for results in result_evalset:
            k = results['k_candidates']
            result_overview[f'accuracy@k{k}'] = results['accuracy']
            LOGGER.info(f"accuracy@k{k} = {results['accuracy']}")
            output_file = f'{output_file_name}-{k}.json'
            if args.save_predictions:
                with open(output_file, 'w') as f:
                    json.dump(results, f, indent=2)
                print(f"\nPredictions @k{k} saved at: {output_file}")

        if args.save_predictions:
            with open(f'{output_file_name}.json', 'w') as f:
                json.dump(result_overview, f, indent=2)
            print(
                f"\nPredictions overview saved at: {output_file_name}.json"
            )
output = {'predictions': []} for prediction in predictions: predicted_name = prediction[0] predicted_id = prediction[1] output['predictions'].append({ 'name': predicted_name, 'id': predicted_id }) return output # load biosyn model biosyn = BioSyn().load_model(path=args.model_dir, max_length=25, use_cuda=args.use_cuda) # cache or load dictionary dictionary, dict_sparse_embeds, dict_dense_embeds = cache_or_load_dictionary() class MainHandler(tornado.web.RequestHandler): def get(self): self.render("./template/index.html") class NormalizeHandler(tornado.web.RequestHandler): def get(self): string = self.get_argument('string', '') logging.info('get!{}'.format({
predictions = dictionary[hybrid_candidate_idxs].squeeze(0) output = {'predictions': []} for prediction in predictions: predicted_name = prediction[0] predicted_id = prediction[1] output['predictions'].append({ 'name': predicted_name, 'id': predicted_id }) return output # load biosyn model biosyn = BioSyn(use_cuda=args.use_cuda, max_length=25) biosyn.load_model(model_name_or_path=args.model_name_or_path) # cache or load dictionary dictionary, dict_sparse_embeds, dict_dense_embeds = cache_or_load_dictionary() class MainHandler(tornado.web.RequestHandler): def get(self): self.render("./template/index.html") class NormalizeHandler(tornado.web.RequestHandler): def get(self): string = self.get_argument('string', '')