Example #1
import os
from datetime import datetime
from typing import Optional

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader

# BiEncoder, ZeshelDataset, get_tokenizer, DEVICE and BaseModelType are
# project-local names assumed importable from the surrounding package.


def train_zeshel(work_dir: str,
                 data_dir: str,
                 batch_size: int,
                 val_check_interval: int,
                 limit_train_batches: Optional[int] = None,
                 max_epochs: int = 1,
                 base_model_type: str = BaseModelType.BERT_BASE.name):
    model = BiEncoder(base_model_type=base_model_type)
    model.train()
    model.to(DEVICE)

    tokenizer = get_tokenizer(base_model_type)
    trainset = ZeshelDataset(data_dir,
                             split='train',
                             tokenizer=tokenizer,
                             device=DEVICE,
                             base_model_type=base_model_type)
    valset = ZeshelDataset(data_dir,
                           split='val',
                           tokenizer=tokenizer,
                           device=DEVICE,
                           base_model_type=base_model_type)
    print('Training examples:', len(trainset))
    print('Validation examples:', len(valset))
    # Keep only the first 100 validation examples for a quicker validation pass.
    valset = [valset[i] for i in range(100)]
    trainloader = DataLoader(trainset,
                             batch_size=batch_size,
                             num_workers=12,
                             shuffle=True)
    valloader = DataLoader(valset,
                           batch_size=batch_size,
                           num_workers=12,
                           shuffle=True)

    # Accumulate gradients so the effective batch size is roughly 128.
    accumulate_grad_batches = max(1, 128 // batch_size)
    wandb_logger = WandbLogger(
        name=f'{base_model_type}_{datetime.now().strftime("%m_%d_%H%M_%S")}',
        project='entity-linker')
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=2,
        verbose=True,
        dirpath=os.path.join(work_dir, 'checkpoints'),
        filename='{epoch}-{val_loss:.3f}' +
        f'_{base_model_type}_{datetime.now().strftime("%m_%d_%H%M_%S")}')
    trainer = pl.Trainer(gpus=-1 if DEVICE != 'cpu' else 0,
                         logger=[wandb_logger],
                         val_check_interval=val_check_interval,
                         accumulate_grad_batches=accumulate_grad_batches,
                         log_every_n_steps=1,
                         limit_train_batches=limit_train_batches
                         if limit_train_batches else 1.0,
                         callbacks=[checkpoint_callback],
                         max_epochs=max_epochs)
    trainer.fit(model, trainloader, valloader)
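
For reference, a hypothetical invocation of train_zeshel; the paths and
hyperparameters below are placeholders, not values from the original project:

if __name__ == '__main__':
    train_zeshel(work_dir='./runs',       # hypothetical output directory
                 data_dir='./zeshel',     # hypothetical Zeshel data root
                 batch_size=16,
                 val_check_interval=500,  # validate every 500 training steps
                 max_epochs=3)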
Example #2
def setUp(self):
    # dir_path presumably resolves to the test module's directory.
    base_model_type = BaseModelType.BERT_BASE.name
    self.tokenizer = get_tokenizer(base_model_type)
    self.dataset = ZeshelDataset(os.path.join(dir_path, 'data'),
                                 split='train',
                                 tokenizer=self.tokenizer,
                                 base_model_type=base_model_type)
    self.entities_dataset = ZeshelEntitiesDataset(
        os.path.join(dir_path, 'data'),
        split='train',
        tokenizer=self.tokenizer,
        base_model_type=base_model_type)
Example #3
def test_special_token_declutr(self):
    base_model_type = BaseModelType.DECLUTR_BASE.name
    tokenizer = get_tokenizer(base_model_type)
    dataset = ZeshelDataset(os.path.join(dir_path, 'data'),
                            split='train',
                            tokenizer=tokenizer,
                            base_model_type=base_model_type)
    sample = dataset[0]
    mention_input_tokens = tokenizer.convert_ids_to_tokens(
        sample['mention_inputs']['input_ids'])
    entity_input_tokens = tokenizer.convert_ids_to_tokens(
        sample['entity_inputs']['input_ids'])
    # DeCLUTR uses a RoBERTa-style tokenizer, so sequences begin with '<s>'.
    self.assertEqual(mention_input_tokens[0], '<s>')
    # The mention text is shorter than the window and gets right-padded,
    # while the entity description fills the window and ends with EOS.
    self.assertEqual(mention_input_tokens[-1], '<pad>')
    self.assertEqual(entity_input_tokens[0], '<s>')
    self.assertEqual(entity_input_tokens[-1], '</s>')
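
For context, DECLUTR_BASE is built on a RoBERTa-style tokenizer, which is why
the sequence boundaries are '<s>'/'</s>' rather than BERT's '[CLS]'/'[SEP]'.
A minimal sketch of the same check against a Hugging Face tokenizer directly;
the checkpoint name is an assumption, not taken from the project:

from transformers import AutoTokenizer

# Assumed checkpoint; the project may wrap a different DeCLUTR variant.
tok = AutoTokenizer.from_pretrained('johngiorgi/declutr-base')
ids = tok('a short mention', padding='max_length', max_length=32)['input_ids']
tokens = tok.convert_ids_to_tokens(ids)
assert tokens[0] == '<s>'      # RoBERTa-style BOS token
assert tokens[-1] == '<pad>'   # short inputs are right-padded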
Example #4
import numpy as np
import torch
from tqdm import tqdm

# BiEncoder, ZeshelDataset, get_tokenizer, DEVICE and logger are
# project-local names assumed importable from the surrounding package.


def embedd_mentions(checkpoint_path: str, data_dir: str, batch_size: int,
                    base_model_type: str, split: str):
    model = BiEncoder.load_from_checkpoint(checkpoint_path,
                                           map_location=torch.device('cpu'),
                                           base_model_type=base_model_type)
    model.eval()
    model.to(DEVICE)

    tokenizer = get_tokenizer(base_model_type)
    mentions_dataset = ZeshelDataset(data_dir,
                                     split=split,
                                     tokenizer=tokenizer,
                                     base_model_type=base_model_type)

    logger.info(f'Num mentions: {len(mentions_dataset)}')
    mentions_loader = torch.utils.data.DataLoader(mentions_dataset,
                                                  batch_size=batch_size,
                                                  num_workers=12)

    all_embeddings = []
    entity_ids = []
    mention_ids = []
    with torch.no_grad():
        for batch in tqdm(mentions_loader):
            entity_ids += batch['entity_document_ids']
            mention_ids += batch['mention_document_ids']
            mention_inputs = batch['mention_inputs']
            # Move the tokenized mention tensors to the target device.
            mention_inputs = {
                k: v.to(DEVICE)
                for (k, v) in mention_inputs.items()
            }
            embeddings = model.get_mention_embeddings(
                mention_inputs).cpu().numpy()
            all_embeddings.append(embeddings)

    all_embeddings = np.vstack(all_embeddings)
    print(all_embeddings.shape)
    # np.save pickles the dict into a 0-d object array and appends '.npy'
    # to the filename; load it back with np.load(..., allow_pickle=True).
    np.save(
        f'zeshel_mention_embeddings_{split}', {
            'embeddings': all_embeddings,
            'entity_ids': entity_ids,
            'mention_ids': mention_ids
        })
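
Because np.save is handed a Python dict rather than an array, NumPy pickles it
into a 0-d object array. A sketch of reading the file back; the filename below
assumes split='val':

import numpy as np

data = np.load('zeshel_mention_embeddings_val.npy', allow_pickle=True).item()
embeddings = data['embeddings']    # shape: (num_mentions, hidden_dim)
entity_ids = data['entity_ids']    # parallel list of entity document ids
mention_ids = data['mention_ids']  # parallel list of mention document ids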