import os

import paddle
import paddlenlp as ppnlp
from paddlenlp.utils.log import logger

# `SemanticIndexANCE` (the model definition) and `build_index` (the ANN
# index helper) are assumed to come from the project's local modules.


def generate_new_ann(args, data_loader_dict, checkpoint_path, latest_step_num):
    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0')
    model = SemanticIndexANCE(pretrained_model,
                              output_emb_size=args.output_emb_size)

    logger.info("checkpoint_path:{}".format(checkpoint_path))
    state_dict = paddle.load(checkpoint_path)
    model.set_dict(state_dict)
    logger.info("load params from:{}".format(checkpoint_path))

    logger.info("***** inference of corpus *****")
    final_index = build_index(args, data_loader_dict["corpus_data_loader"],
                              model)

    logger.info("***** inference of query *****")
    query_embedding = model.get_semantic_embedding(
        data_loader_dict["text_data_loader"])

    text_list = data_loader_dict["text_list"]
    id2corpus = data_loader_dict["id2corpus"]
    text2similar_text = data_loader_dict["text2similar_text"]

    new_ann_data_path = os.path.join(args.ann_data_dir, str(latest_step_num))
    if not os.path.exists(new_ann_data_path):
        os.mkdir(new_ann_data_path)

    with open(os.path.join(new_ann_data_path, "new_ann_data"), 'w') as f:
        for batch_index, batch_query_embedding in enumerate(query_embedding):
            recalled_idx, cosine_sims = final_index.knn_query(
                batch_query_embedding.numpy(), args.topk_training)
            batch_size = len(cosine_sims)
            for row_index in range(batch_size):
                text_index = args.batch_size * batch_index + row_index
                # The last `num_negative_sample` entries of the top-k recall
                # are the least similar candidates; use them as hard negatives.
                hard_neg_samples = recalled_idx[row_index][
                    -args.num_negative_sample:]
                hard_neg_sims = cosine_sims[row_index][
                    -args.num_negative_sample:]
                for idx, hard_neg_doc_idx in enumerate(hard_neg_samples):
                    text = text_list[text_index]["text"]
                    similar_text = text2similar_text[text]
                    hard_neg_sample = id2corpus[hard_neg_doc_idx]
                    # hnswlib returns distances, so 1 - distance recovers the
                    # cosine similarity (computed but not written out; kept
                    # for reference).
                    cosine_sim = 1.0 - hard_neg_sims[idx]
                    f.write("{}\t{}\t{}\n".format(text, similar_text,
                                                  hard_neg_sample))

    succeed_flag_file = os.path.join(new_ann_data_path, "succeed_flag_file")
    open(succeed_flag_file, 'a').close()
    logger.info("finish generate ann data step:{}".format(latest_step_num))
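# The sketch below illustrates what `build_index` is assumed to do: embed the
# corpus batch by batch and insert the vectors into an hnswlib HNSW index.
# The hyperparameter names (`hnsw_max_elements`, `hnsw_ef`, `hnsw_m`) are
# assumptions for illustration; only the hnswlib calls (`init_index`,
# `add_items`, `set_ef`, `knn_query`) are the real library API. With the
# "ip" space and L2-normalized embeddings, `knn_query` returns distances of
# the form 1 - cosine_similarity, which is why `generate_new_ann` above
# converts scores with `1.0 - hard_neg_sims[idx]`.
import hnswlib


def build_index_sketch(args, data_loader, model):
    index = hnswlib.Index(space="ip", dim=args.output_emb_size)
    index.init_index(max_elements=args.hnsw_max_elements,
                     ef_construction=args.hnsw_ef,
                     M=args.hnsw_m)
    index.set_ef(args.hnsw_ef)
    # `get_semantic_embedding` yields one embedding batch per corpus batch.
    for text_embeddings in model.get_semantic_embedding(data_loader):
        index.add_items(text_embeddings.numpy())
    return index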
# Assumes the example's local helpers (`gen_id2corpus`, `gen_text_file`,
# `create_dataloader`, `batchify_fn`, `trans_func`, `build_index`) and a
# `paddle.DataParallel`-wrapped `model` are already defined, plus
# `from paddlenlp.datasets import MapDataset`.
id2corpus = gen_id2corpus(args.corpus_file)

# convert_example function's input must be a dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

corpus_data_loader = create_dataloader(corpus_ds,
                                       mode='predict',
                                       batch_size=args.batch_size,
                                       batchify_fn=batchify_fn,
                                       trans_fn=trans_func)

# Need a better way to get the inner model out of DataParallel
inner_model = model._layers

final_index = build_index(args, corpus_data_loader, inner_model)

text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(query_ds,
                                      mode='predict',
                                      batch_size=args.batch_size,
                                      batchify_fn=batchify_fn,
                                      trans_fn=trans_func)

query_embedding = inner_model.get_semantic_embedding(query_data_loader)

if not os.path.exists(args.recall_result_dir):
    os.mkdir(args.recall_result_dir)
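# A hedged sketch of the step that typically follows: query the index for the
# top-k nearest corpus entries per text and dump them, with their cosine
# similarities, to a TSV recall file. `args.recall_result_file` and
# `args.recall_num` are assumed parameter names, not confirmed ones.
recall_result_file = os.path.join(args.recall_result_dir,
                                  args.recall_result_file)
with open(recall_result_file, 'w', encoding='utf-8') as f:
    query_idx = 0
    for batch_query_embedding in query_embedding:
        recalled_idx, cosine_sims = final_index.knn_query(
            batch_query_embedding.numpy(), args.recall_num)
        for row_index in range(len(cosine_sims)):
            text = text_list[query_idx]["text"]
            for idx, doc_idx in enumerate(recalled_idx[row_index]):
                # Convert hnswlib's distance back to cosine similarity.
                f.write("{}\t{}\t{}\n".format(
                    text, id2corpus[doc_idx],
                    1.0 - cosine_sims[row_index][idx]))
            query_idx += 1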