Example #1
import os

import paddle
import paddlenlp as ppnlp
from paddlenlp.utils.log import logger

# Project-local helpers from the PaddleNLP semantic_indexing example; the
# exact module paths are assumed here.
from ann_util import build_index
from model import SemanticIndexANCE


def generate_new_ann(args, data_loader_dict, checkpoint_path, latest_step_num):

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0')

    model = SemanticIndexANCE(pretrained_model,
                              output_emb_size=args.output_emb_size)

    logger.info("checkpoint_path:{}".format(checkpoint_path))
    state_dict = paddle.load(checkpoint_path)

    model.set_dict(state_dict)
    logger.info("load params from:{}".format(checkpoint_path))

    logger.info("***** inference of corpus *****")
    final_index = build_index(args, data_loader_dict["corpus_data_loader"],
                              model)

    logger.info("***** inference of query *****")
    query_embedding = model.get_semantic_embedding(
        data_loader_dict["text_data_loader"])

    # Lookup tables prepared by the data pipeline.
    text_list = data_loader_dict["text_list"]
    id2corpus = data_loader_dict["id2corpus"]
    text2similar_text = data_loader_dict["text2similar_text"]

    # Write the new ANN data under a step-numbered directory.
    new_ann_data_path = os.path.join(args.ann_data_dir, str(latest_step_num))
    os.makedirs(new_ann_data_path, exist_ok=True)

    with open(os.path.join(new_ann_data_path, "new_ann_data"), 'w') as f:
        for batch_index, batch_query_embedding in enumerate(query_embedding):
            # knn_query returns, per query, the indices of the recalled docs
            # and their cosine distances (despite the variable name).
            recalled_idx, cosine_sims = final_index.knn_query(
                batch_query_embedding, args.topk_training)

            # The last batch may be smaller than args.batch_size.
            batch_size = len(cosine_sims)

            for row_index in range(batch_size):
                text_index = args.batch_size * batch_index + row_index

                # Use the tail of the top-k recall (the least similar of the
                # recalled docs) as hard negative samples.
                hard_neg_samples = recalled_idx[row_index][
                    -args.num_negative_sample:]
                hard_neg_sims = cosine_sims[row_index][
                    -args.num_negative_sample:]

                for idx, hard_neg_doc_idx in enumerate(hard_neg_samples):
                    text = text_list[text_index]["text"]
                    similar_text = text2similar_text[text]
                    hard_neg_sample = id2corpus[hard_neg_doc_idx]
                    # Convert the cosine distance returned by the index back to
                    # a similarity; note it is computed but not written below.
                    cosine_sim = 1.0 - hard_neg_sims[idx]
                    f.write("{}\t{}\t{}\n".format(text, similar_text,
                                                  hard_neg_sample))

    # Touch an empty flag file so the caller knows this step's ANN data is
    # complete.
    succeed_flag_file = os.path.join(new_ann_data_path, "succeed_flag_file")
    open(succeed_flag_file, 'a').close()
    logger.info("finished generating ann data for step:{}".format(
        latest_step_num))
Example #2
    # Map corpus line ids to passage text.
    id2corpus = gen_id2corpus(args.corpus_file)

    # convert_example's input must be a dict, so wrap each (id, text) pair.
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # model is wrapped in paddle.DataParallel; unwrap it via the private
    # _layers attribute (a cleaner accessor would be preferable).
    inner_model = model._layers

    # Encode the whole corpus and build the ANN index over its embeddings.
    final_index = build_index(args, corpus_data_loader, inner_model)

    # Load (query, similar_text) pairs for evaluation.
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

    query_ds = MapDataset(text_list)

    query_data_loader = create_dataloader(query_ds,
                                          mode='predict',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    # get_semantic_embedding yields query embeddings batch by batch.
    query_embedding = inner_model.get_semantic_embedding(query_data_loader)

    os.makedirs(args.recall_result_dir, exist_ok=True)
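The snippet stops right after creating the output directory. A continuation that writes the top-k recall results, mirroring the query loop from Example #1, might look like the following; the file name recall_result.txt, the field args.recall_num, and the distance-to-similarity conversion are assumptions, not taken from the original code.

    # Hypothetical continuation: one output line per (query, recalled doc) pair.
    recall_result_file = os.path.join(args.recall_result_dir,
                                      "recall_result.txt")
    with open(recall_result_file, 'w') as f:
        for batch_index, batch_query_embedding in enumerate(query_embedding):
            recalled_idx, cosine_sims = final_index.knn_query(
                batch_query_embedding, args.recall_num)
            for row_index in range(len(cosine_sims)):
                text_index = args.batch_size * batch_index + row_index
                for idx, doc_idx in enumerate(recalled_idx[row_index]):
                    # knn_query returns cosine distances; convert to similarity.
                    f.write("{}\t{}\t{}\n".format(
                        text_list[text_index]["text"],
                        id2corpus[doc_idx],
                        1.0 - cosine_sims[row_index][idx]))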