def train_zeshel(work_dir: str,
                 data_dir: str,
                 batch_size: int,
                 val_check_interval: int,
                 limit_train_batches: Optional[int] = None,
                 max_epochs: int = 1,
                 base_model_type: str = BaseModelType.BERT_BASE.name):
    model = BiEncoder(base_model_type=base_model_type)
    model.train()
    model.to(DEVICE)

    tokenizer = get_tokenizer(base_model_type)
    trainset = ZeshelDataset(data_dir,
                             split='train',
                             tokenizer=tokenizer,
                             device=DEVICE,
                             base_model_type=base_model_type)
    valset = ZeshelDataset(data_dir,
                           split='val',
                           tokenizer=tokenizer,
                           device=DEVICE,
                           base_model_type=base_model_type)
    print('Training examples:', len(trainset))
    print('Validation examples:', len(valset))

    # Validate on a fixed 100-example subset to keep validation cheap.
    valset = [valset[i] for i in range(100)]

    trainloader = DataLoader(trainset,
                             batch_size=batch_size,
                             num_workers=12,
                             shuffle=True)
    # No need to shuffle the validation set.
    valloader = DataLoader(valset,
                           batch_size=batch_size,
                           num_workers=12,
                           shuffle=False)

    # Accumulate gradients so the effective batch size stays at 128
    # regardless of the per-step batch size.
    accumulate_grad_batches = max(1, 128 // batch_size)

    wandb_logger = WandbLogger(
        name=f'{base_model_type}_{datetime.now().strftime("%m_%d_%H%M_%S")}',
        project='entity-linker')
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=2,
        verbose=True,
        dirpath=os.path.join(work_dir, 'checkpoints'),
        filename='{epoch}-{val_loss:.3f}'
        + f'_{base_model_type}_{datetime.now().strftime("%m_%d_%H%M_%S")}')

    trainer = pl.Trainer(
        gpus=-1 if DEVICE != 'cpu' else 0,
        logger=[wandb_logger],
        val_check_interval=val_check_interval,
        accumulate_grad_batches=accumulate_grad_batches,
        log_every_n_steps=1,
        limit_train_batches=limit_train_batches if limit_train_batches else 1.0,
        callbacks=[checkpoint_callback],
        max_epochs=max_epochs)
    trainer.fit(model, trainloader, valloader)
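# Usage sketch (not part of the training script): a minimal CLI wrapper
# around train_zeshel. The flag names and defaults below are illustrative
# assumptions, not an interface defined elsewhere in the repo.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the Zeshel bi-encoder.')
    parser.add_argument('--work_dir', required=True)
    parser.add_argument('--data_dir', required=True)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--val_check_interval', type=int, default=500)
    parser.add_argument('--max_epochs', type=int, default=1)
    parser.add_argument('--base_model_type',
                        default=BaseModelType.BERT_BASE.name)
    args = parser.parse_args()

    train_zeshel(work_dir=args.work_dir,
                 data_dir=args.data_dir,
                 batch_size=args.batch_size,
                 val_check_interval=args.val_check_interval,
                 max_epochs=args.max_epochs,
                 base_model_type=args.base_model_type)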
def setUp(self):
    base_model_type = BaseModelType.BERT_BASE.name
    self.tokenizer = get_tokenizer(base_model_type)
    self.dataset = ZeshelDataset(os.path.join(dir_path, 'data'),
                                 split='train',
                                 tokenizer=self.tokenizer,
                                 base_model_type=base_model_type)
    self.entities_dataset = ZeshelEntitiesDataset(
        os.path.join(dir_path, 'data'),
        split='train',
        tokenizer=self.tokenizer,
        base_model_type=base_model_type)
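# Example test built on the fixtures above: a smoke check that each sample
# exposes the tokenized mention and entity inputs used by the other tests
# in this file. The test name is hypothetical; the keys come from the
# assertions elsewhere in this section.
def test_sample_structure(self):
    sample = self.dataset[0]
    self.assertIn('mention_inputs', sample)
    self.assertIn('entity_inputs', sample)
    # Both encoders should see sequences padded to the same length.
    self.assertEqual(len(sample['mention_inputs']['input_ids']),
                     len(sample['entity_inputs']['input_ids']))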
def test_special_token_declutr(self):
    base_model_type = BaseModelType.DECLUTR_BASE.name
    tokenizer = get_tokenizer(base_model_type)
    dataset = ZeshelDataset(os.path.join(dir_path, 'data'),
                            split='train',
                            tokenizer=tokenizer,
                            base_model_type=base_model_type)
    sample = dataset[0]
    mention_input_tokens = tokenizer.convert_ids_to_tokens(
        sample['mention_inputs']['input_ids'])
    entity_input_tokens = tokenizer.convert_ids_to_tokens(
        sample['entity_inputs']['input_ids'])

    # DeCLUTR uses RoBERTa-style special tokens: sequences start with <s>
    # and end with </s>, with <pad> filling any remaining positions.
    self.assertEqual(mention_input_tokens[0], '<s>')
    self.assertEqual(mention_input_tokens[-1], '<pad>')
    self.assertEqual(entity_input_tokens[0], '<s>')
    self.assertEqual(entity_input_tokens[-1], '</s>')
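# For contrast, a companion sketch of the same check for BERT_BASE, which
# uses WordPiece special tokens ([CLS]/[SEP]/[PAD]) rather than the
# RoBERTa-style <s>/</s>/<pad> above. The test name and the expectation
# that the mention sequence ends in [PAD] mirror the DeCLUTR test and are
# assumptions about the fixtures, not verified behavior.
def test_special_token_bert(self):
    base_model_type = BaseModelType.BERT_BASE.name
    tokenizer = get_tokenizer(base_model_type)
    dataset = ZeshelDataset(os.path.join(dir_path, 'data'),
                            split='train',
                            tokenizer=tokenizer,
                            base_model_type=base_model_type)
    sample = dataset[0]
    mention_input_tokens = tokenizer.convert_ids_to_tokens(
        sample['mention_inputs']['input_ids'])
    self.assertEqual(mention_input_tokens[0], '[CLS]')
    self.assertEqual(mention_input_tokens[-1], '[PAD]')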
def embedd_mentions(checkpoint_path: str, data_dir: str, batch_size: int,
                    base_model_type: str, split: str):
    model = BiEncoder.load_from_checkpoint(checkpoint_path,
                                           map_location=torch.device('cpu'),
                                           base_model_type=base_model_type)
    model.eval()
    model.to(DEVICE)

    tokenizer = get_tokenizer(base_model_type)
    mentions_dataset = ZeshelDataset(data_dir,
                                     split=split,
                                     tokenizer=tokenizer,
                                     base_model_type=base_model_type)
    logger.info(f'Num mentions: {len(mentions_dataset)}')
    mentions_loader = torch.utils.data.DataLoader(mentions_dataset,
                                                  batch_size=batch_size,
                                                  num_workers=12)

    all_embeddings = []
    entity_ids = []
    mention_ids = []
    with torch.no_grad():
        for batch in tqdm(mentions_loader):
            entity_ids += batch['entity_document_ids']
            mention_ids += batch['mention_document_ids']
            mention_inputs = {
                k: v.to(DEVICE)
                for (k, v) in batch['mention_inputs'].items()
            }
            embeddings = model.get_mention_embeddings(
                mention_inputs).cpu().numpy()
            all_embeddings.append(embeddings)

    all_embeddings = np.vstack(all_embeddings)
    logger.info(f'Embeddings shape: {all_embeddings.shape}')
    # np.save pickles the dict, so np.load needs allow_pickle=True.
    np.save(
        f'zeshel_mention_embeddings_{split}', {
            'embeddings': all_embeddings,
            'entity_ids': entity_ids,
            'mention_ids': mention_ids
        })
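# Downstream usage sketch: load the saved mention embeddings and run a
# brute-force nearest-neighbor lookup against a matrix of entity
# embeddings. 'zeshel_entity_embeddings_val.npy' and its 'entity_ids' key
# are hypothetical; only the mention file above is produced by this script.
# Cosine similarity is an illustrative choice here; the model's actual
# scoring function may be a plain dot product.
import numpy as np

mention_data = np.load('zeshel_mention_embeddings_val.npy',
                       allow_pickle=True).item()
entity_data = np.load('zeshel_entity_embeddings_val.npy',  # hypothetical
                      allow_pickle=True).item()

mentions = mention_data['embeddings']  # (num_mentions, dim)
entities = entity_data['embeddings']   # (num_entities, dim)

# Cosine similarity via L2-normalized dot products.
mentions = mentions / np.linalg.norm(mentions, axis=1, keepdims=True)
entities = entities / np.linalg.norm(entities, axis=1, keepdims=True)
scores = mentions @ entities.T         # (num_mentions, num_entities)
top1 = scores.argmax(axis=1)

recall_at_1 = np.mean([
    entity_data['entity_ids'][j] == gold
    for j, gold in zip(top1, mention_data['entity_ids'])
])
print(f'recall@1: {recall_at_1:.3f}')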