def _test_checkpoints(self, training_loop_type: str):
    """Test whether interrupting the given training loop type can be resumed using checkpoints."""
    loop_cls = get_training_loop_cls(training_loop_type)

    def _fresh_loop(seed):
        # Build a brand-new model/optimizer/loop so every run starts from scratch.
        trans_e = TransE(
            triples_factory=self.triples_factory,
            random_seed=seed,
        )
        opt = get_optimizer_cls(None)(params=trans_e.get_grad_params())
        return loop_cls(model=trans_e, optimizer=opt, automatic_memory_optimization=False)

    # Reference run: train all epochs without interruption.
    reference_losses = _fresh_loop(self.random_seed).train(
        num_epochs=self.num_epochs,
        batch_size=self.batch_size,
        use_tqdm=False,
        use_tqdm_batch=False,
    )

    # Interrupted run: train only the first half, writing a checkpoint after every epoch.
    _fresh_loop(self.random_seed).train(
        num_epochs=int(self.num_epochs // 2),
        batch_size=self.batch_size,
        checkpoint_name=self.checkpoint_file,
        checkpoint_directory=self.temporary_directory.name,
        checkpoint_frequency=0,
    )

    # Resumed run: a different seed must not matter, since the checkpoint restores all state.
    resumed_losses = _fresh_loop(123).train(
        num_epochs=self.num_epochs,
        batch_size=self.batch_size,
        checkpoint_name=self.checkpoint_file,
        checkpoint_directory=self.temporary_directory.name,
        checkpoint_frequency=0,
    )

    # Resuming from the checkpoint must reproduce the uninterrupted loss history exactly.
    self.assertEqual(reference_losses, resumed_losses)
def test_error_on_nan(self):
    """Test if the correct error is raised for non-finite loss values."""
    model = TransE(triples_factory=self.triples_factory)
    batches_before_nan = 2

    class NaNTrainingLoop(self.cls):
        """A training loop that starts returning NaN losses after a fixed number of batches."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.patience = batches_before_nan

        def _process_batch(self, *args, **kwargs):
            loss = super()._process_batch(*args, **kwargs)
            self.patience -= 1
            if self.patience >= 0:
                return loss
            # Patience exhausted: report a non-finite loss to the caller.
            return torch.as_tensor([float("nan")], device=loss.device, dtype=torch.float32)

    nan_loop = NaNTrainingLoop(
        model=model,
        triples_factory=self.triples_factory,
        optimizer=self.optimizer_cls(model.get_grad_params()),
    )
    with self.assertRaises(NonFiniteLossError):
        nan_loop.train(
            triples_factory=self.triples_factory,
            num_epochs=batches_before_nan + 1,
            batch_size=self.batch_size,
        )
class TrainingLoopTestCase(unittest_templates.GenericTestCase[TrainingLoop]):
    """A generic test case for training loops."""

    # The model under training, created in pre_setup_hook.
    model: Model
    # Fixed: was annotated as `factory`, but the attribute actually assigned and
    # read throughout this class is `triples_factory`.
    triples_factory: TriplesFactory
    # Loss class used to instantiate `loss` for the model.
    loss_cls: ClassVar[Type[Loss]]
    loss: Loss
    # Optimizer class used for all loops created by this test case.
    optimizer_cls: ClassVar[Type[Optimizer]] = Adam
    optimizer: Optimizer
    random_seed = 0
    batch_size: int = 128
    sub_batch_size: int = 30
    num_epochs: int = 10

    def pre_setup_hook(self) -> None:
        """Prepare case-level variables before the setup() function."""
        self.triples_factory = Nations().training
        self.loss = self.loss_cls()
        self.model = TransE(triples_factory=self.triples_factory, loss=self.loss, random_seed=self.random_seed)
        self.optimizer = self.optimizer_cls(self.model.get_grad_params())

    def _with_model(self, model: Model) -> TrainingLoop:
        """Create a training loop of the tested class wrapping the given model."""
        return self.cls(
            model=model,
            triples_factory=self.triples_factory,
            automatic_memory_optimization=False,
            optimizer=self.optimizer_cls(model.get_grad_params()),
        )

    def _pre_instantiation_hook(self, kwargs: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # noqa: D102
        kwargs = super()._pre_instantiation_hook(kwargs=kwargs)
        kwargs["triples_factory"] = self.triples_factory
        kwargs["automatic_memory_optimization"] = False
        kwargs["optimizer"] = self.optimizer
        kwargs["model"] = self.model
        return kwargs

    def test_train(self):
        """Test training does not error."""
        self.instance.train(
            triples_factory=self.triples_factory,
            num_epochs=1,
        )

    def test_sub_batching(self):
        """Test if sub-batching works as expected."""
        self.instance.train(
            triples_factory=self.triples_factory,
            num_epochs=1,
            batch_size=self.batch_size,
            sub_batch_size=self.sub_batch_size,
        )

    def test_sub_batching_support(self):
        """Test that a NotImplementedError is raised for models without sub-batching support."""
        model = ConvE(triples_factory=self.triples_factory)
        training_loop = self._with_model(model)
        with self.assertRaises(NotImplementedError):
            training_loop.train(
                triples_factory=self.triples_factory,
                num_epochs=1,
                batch_size=self.batch_size,
                sub_batch_size=self.sub_batch_size,
            )

    def test_error_on_nan(self):
        """Test if the correct error is raised for non-finite loss values."""
        model = TransE(triples_factory=self.triples_factory)
        patience = 2

        class NaNTrainingLoop(self.cls):
            """A training loop that emits NaN losses once its patience is used up."""

            def __init__(self, **kwargs):
                super().__init__(**kwargs)
                self.patience = patience

            def _process_batch(self, *args, **kwargs):
                loss = super()._process_batch(*args, **kwargs)
                self.patience -= 1
                if self.patience < 0:
                    # Replace the genuine loss with NaN to trigger the error path.
                    return torch.as_tensor([float("nan")], device=loss.device, dtype=torch.float32)
                return loss

        training_loop = NaNTrainingLoop(
            model=model,
            triples_factory=self.triples_factory,
            optimizer=self.optimizer_cls(model.get_grad_params()),
        )
        with self.assertRaises(NonFiniteLossError):
            training_loop.train(
                triples_factory=self.triples_factory,
                num_epochs=patience + 1,
                batch_size=self.batch_size,
            )

    def test_checkpoints(self):
        """Test whether interrupted training can be resumed using checkpoints."""
        # Train a model in one shot
        model = TransE(
            triples_factory=self.triples_factory,
            random_seed=self.random_seed,
        )
        training_loop = self._with_model(model)
        losses = training_loop.train(
            triples_factory=self.triples_factory,
            num_epochs=self.num_epochs,
            batch_size=self.batch_size,
            use_tqdm=False,
            use_tqdm_batch=False,
        )

        with tempfile.TemporaryDirectory() as directory:
            name = "checkpoint.pt"

            # Train a model for the first half
            model = TransE(
                triples_factory=self.triples_factory,
                random_seed=self.random_seed,
            )
            training_loop = self._with_model(model)
            training_loop.train(
                triples_factory=self.triples_factory,
                # `//` already yields an int; the previous int(...) wrapper was redundant.
                num_epochs=self.num_epochs // 2,
                batch_size=self.batch_size,
                checkpoint_name=name,
                checkpoint_directory=directory,
                checkpoint_frequency=0,
            )

            # Continue training of the first part; the different seed must not matter,
            # because the checkpoint restores the full training state.
            model = TransE(
                triples_factory=self.triples_factory,
                random_seed=123,
            )
            training_loop = self._with_model(model)
            losses_2 = training_loop.train(
                triples_factory=self.triples_factory,
                num_epochs=self.num_epochs,
                batch_size=self.batch_size,
                checkpoint_name=name,
                checkpoint_directory=directory,
                checkpoint_frequency=0,
            )

        self.assertEqual(losses, losses_2)
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
from pykeen.models import TransE
from torch.optim import Adam
from pykeen.training import SLCWATrainingLoop

# Parameters selecting which pre-built "rare" dataset slice to load.
# Presumably n_tokeep / minimum control rarity filtering — TODO confirm against data prep.
n_tokeep = 300
minimum = 500
tf = TriplesFactory.from_path(f'data/rare/rare_{minimum}_{n_tokeep}.csv')
# 80/10/10 train/test/validation split.
# NOTE(review): no random seed is fixed for the split, so it is not reproducible across runs.
training, testing, validation = tf.split([.8, .1, .1])

# Pick model, optimizer, training approach
model = TransE(triples_factory=training)
optimizer = Adam(params=model.get_grad_params())
training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=training,
    optimizer=optimizer,
)

# train
training_loop.train(
    triples_factory=training,
    num_epochs=500,
    batch_size=256,
)

# evaluate
# NOTE(review): the evaluator is constructed and the test triples are extracted,
# but no evaluate() call is visible here — the script likely continues beyond this chunk.
evaluator = RankBasedEvaluator(ks=[50])
mapped_triples = testing.mapped_triples