Beispiel #1
0
def get_st_embeds(args, dataset, config, lang, base_model=None):
    """Compute pooled sentence embeddings for every example in ``dataset``.

    Examples are fed through ``base_model`` in dataset order (sequential
    sampling) with gradients disabled, and the second element of each model
    output (``outputs[1]``) is collected.

    Args:
        args: Run configuration. Reads ``model_name_or_path``, ``device``,
            ``per_gpu_eval_batch_size`` and ``n_gpu``. As a side effect,
            ``args.eval_batch_size`` is (re)assigned.
        dataset: Dataset whose batches are sequences of tensors; elements
            0, 1 and 2 are passed to the model as ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        config: Model configuration; only used when ``base_model`` is None.
        lang: Label used in the log message only.
        base_model: Optional model to embed with. When None, a model is
            loaded with ``BaseModel.from_pretrained`` and moved to
            ``args.device``. A caller-supplied model is used where it is
            (it is not moved) and is left in eval mode on return.

    Returns:
        The per-batch ``outputs[1]`` tensors concatenated along dim 0
        (presumably dataset_len x hidden_size), or None when the dataset
        yields no batches.
    """
    load_pretrained = base_model is None
    logger.info("***** Compute sentence embeddings for [%s] plain text dataset using the [%s] base_model *****", lang, "pre-trained" if load_pretrained else "domain")
    if load_pretrained:
        base_model = BaseModel.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config)
        base_model.to(args.device)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Sequential sampling keeps row i of the result aligned with dataset[i].
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    # Gather per-batch results and concatenate once at the end; growing a
    # tensor with torch.cat on every step re-copies all previous rows.
    pooled_chunks = []
    base_model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "token_type_ids": batch[2]}
            outputs = base_model(**inputs)

        pooled_chunks.append(outputs[1].detach())

    if not pooled_chunks:
        # Preserve the historical contract for an empty dataset.
        return None
    return torch.cat(pooled_chunks, dim=0)
Beispiel #2
0
def get_init_domain_embed(args, dataset, lang):
    """Embed ``dataset`` with a freshly loaded pre-trained base model.

    Loads the configuration and model named by ``args.model_name_or_path``,
    runs every example through the model in dataset order with gradients
    disabled, and collects the second element of each model output
    (``outputs[1]``).

    Args:
        args: Run configuration. Reads ``model_name_or_path``, ``device``,
            ``per_gpu_eval_batch_size`` and ``n_gpu``. As a side effect,
            ``args.eval_batch_size`` is (re)assigned.
        dataset: Dataset whose batches are sequences of tensors; elements
            0, 1 and 2 are passed to the model as ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        lang: Label used in the log message only.

    Returns:
        The per-batch ``outputs[1]`` tensors concatenated along dim 0
        (presumably dataset_len x hidden_size), or None when the dataset
        yields no batches.
    """
    config = BertConfig.from_pretrained(args.model_name_or_path)
    base_model = BaseModel.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config)
    base_model.to(args.device)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly; a sequential sampler
    # keeps row i of the result aligned with dataset[i].
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # NOTE: the log text says "logits", but what is collected below is
    # outputs[1] of the base model.
    logger.info(
        "***** Compute logits for [%s] dataset using the base_model *****",
        lang)
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    # Gather per-batch results and concatenate once at the end; growing a
    # tensor with torch.cat on every step re-copies all previous rows.
    pooled_chunks = []
    base_model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2]
            }
            outputs = base_model(**inputs)

        pooled_chunks.append(outputs[1].detach())

    if not pooled_chunks:
        # Preserve the historical contract for an empty dataset.
        return None
    return torch.cat(pooled_chunks, dim=0)