def get_st_embeds(args, dataset, config, lang, base_model=None):
    """Compute pooled sentence embeddings for every example in ``dataset``.

    Args:
        args: Run configuration. Reads ``model_name_or_path``, ``device``,
            ``per_gpu_eval_batch_size`` and ``n_gpu``. ``eval_batch_size``
            is written back onto it as a side effect.
        dataset: Dataset whose items provide, in order, the ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` tensors.
        config: Model configuration; only used when the pre-trained model
            has to be loaded here (``base_model`` is ``None``).
        lang: Language tag, used only in the log message.
        base_model: Optional model to embed with. When ``None``, a model is
            loaded from ``args.model_name_or_path``. The model is moved to
            ``args.device`` and left in eval mode.

    Returns:
        The pooled outputs of all batches concatenated along dim 0 in
        dataset order, or ``None`` when the dataset yields no batches.
    """
    logger.info(
        "***** Compute sentence embeddings for [%s] plain text dataset using the [%s] base_model *****",
        lang,
        "pre-trained" if base_model is None else "domain",
    )

    if base_model is None:
        base_model = BaseModel.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    base_model.to(args.device)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Sequential sampling keeps the output rows aligned with dataset order.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(
        dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
    )

    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    # Collect per-batch results and concatenate once at the end; calling
    # torch.cat on the running tensor every iteration re-copies all earlier
    # rows and makes the loop quadratic in the number of batches.
    pooled_chunks = []
    base_model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            outputs = base_model(**inputs)
            pooled_outputs = outputs[1]

        pooled_chunks.append(pooled_outputs.detach())

    if not pooled_chunks:
        # Empty dataset: keep the original contract of returning None.
        return None

    # dataset_len x hidden_size
    return torch.cat(pooled_chunks, dim=0)
def get_init_domain_embed(args, dataset, lang):
    """Compute pooled outputs of the pre-trained base model over ``dataset``.

    A fresh config and model are loaded from ``args.model_name_or_path`` on
    every call; the model is moved to ``args.device`` and run in eval mode.

    Args:
        args: Run configuration. Reads ``model_name_or_path``, ``device``,
            ``per_gpu_eval_batch_size`` and ``n_gpu``. ``eval_batch_size``
            is written back onto it as a side effect.
        dataset: Dataset whose items provide, in order, the ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` tensors.
        lang: Language tag, used only in the log message.

    Returns:
        The pooled outputs of all batches concatenated along dim 0 in
        dataset order, or ``None`` when the dataset yields no batches.
    """
    config = BertConfig.from_pretrained(args.model_name_or_path)
    base_model = BaseModel.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )
    base_model.to(args.device)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(
        dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
    )

    # NOTE(review): the message says "logits", but the value collected below
    # is outputs[1], which the code treats as the pooled output.
    logger.info(
        "***** Compute logits for [%s] dataset using the base_model *****", lang
    )
    logger.info(" Num examples = %d", len(dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    # Collect per-batch results and concatenate once at the end; calling
    # torch.cat on the running tensor every iteration re-copies all earlier
    # rows and makes the loop quadratic in the number of batches.
    pooled_chunks = []
    base_model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            outputs = base_model(**inputs)
            pooled_outputs = outputs[1]

        pooled_chunks.append(pooled_outputs.detach())

    if not pooled_chunks:
        # Empty dataset: keep the original contract of returning None.
        return None

    # dataset_len x hidden_size
    return torch.cat(pooled_chunks, dim=0)