def run(self, config, trial):
    """Run the early stopper HPO experiment."""
    results = pipeline(
        dataset=config['dataset'],
        model=config['model'],
        random_seed=trial,
        device='cpu',
        stopper='early',
        stopper_kwargs=dict(
            metric='adjusted_mean_rank',
            frequency=config['frequency'],
            patience=config['patience'],
            relative_delta=config['relative_delta'],
        ),
        training_kwargs=dict(
            num_epochs=1000,
            tqdm_kwargs=dict(leave=False),
        ),
        evaluation_kwargs=dict(use_tqdm=False),
        automatic_memory_optimization=False,  # not necessary on CPU
    )
    return (
        len(results.losses),
        results.metric_results.get_metric('both.avg.adjusted_mean_rank'),
        results.metric_results.get_metric('hits@10'),
    )
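# Hypothetical driver sketch for the run() method above: average the returned
# (trained epochs, adjusted mean rank, hits@10) tuples over a few trials.
# ``experiment`` and ``config`` are assumed objects, not library API.
import numpy as np

rows = [experiment.run(config, trial) for trial in range(5)]
mean_epochs, mean_amr, mean_hits10 = np.asarray(rows).mean(axis=0)
print(f"epochs={mean_epochs:.1f} AMR={mean_amr:.3f} hits@10={mean_hits10:.3f}")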
def test_custom_training_loop(self):
    """Test providing a custom training loop."""
    losses = []

    class ModifiedTrainingLoop(SLCWATrainingLoop):
        """A wrapper around the SLCWA training loop that remembers batch losses."""

        def _forward_pass(self, *args, **kwargs):  # noqa: D102
            loss = super()._forward_pass(*args, **kwargs)
            losses.append(loss)
            return loss

    _ = pipeline(
        training=self.training,
        testing=self.testing,
        validation=self.validation,
        training_loop=ModifiedTrainingLoop,
        model='TransE',
        training_kwargs=dict(num_epochs=1, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
        random_seed=0,
    )
    # empty lists are falsy
    self.assertTrue(losses)
def _test_pipeline_x_resumption(self, training_loop_type: str):
    """Test whether the resumed pipeline creates the same results as the one-shot pipeline."""
    # As the resumption capability currently is a function of the training loop, more thorough
    # tests can be found in the test_training.py unit tests. The tests below check the handling
    # of training loop checkpoints by the pipeline.
    result_standard = pipeline(
        model=self.model,
        dataset=self.dataset,
        training_loop=training_loop_type,
        training_kwargs=dict(num_epochs=10, use_tqdm=False, use_tqdm_batch=False),
        random_seed=self.random_seed,
    )

    # Run the first half of the split pipeline, writing checkpoints; together with the
    # resumed run below, it should replicate the results of the standard pipeline.
    _ = pipeline(
        model=self.model,
        dataset=self.dataset,
        training_loop=training_loop_type,
        training_kwargs=dict(
            num_epochs=5,
            use_tqdm=False,
            use_tqdm_batch=False,
            checkpoint_name=self.checkpoint_name,
            checkpoint_directory=self.temporary_directory.name,
            checkpoint_frequency=0,
        ),
        random_seed=self.random_seed,
    )

    # Resume the previous pipeline
    result_split = pipeline(
        model=self.model,
        dataset=self.dataset,
        training_loop=training_loop_type,
        training_kwargs=dict(
            num_epochs=10,
            use_tqdm=False,
            use_tqdm_batch=False,
            checkpoint_name=self.checkpoint_name,
            checkpoint_directory=self.temporary_directory.name,
            checkpoint_frequency=0,
        ),
    )
    self.assertEqual(result_standard.losses, result_split.losses)
def _help(self, model):
    return pipeline(
        dataset=NationsLiteral,
        model=model,
        training_kwargs=dict(num_epochs=5, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
        training_loop='lcwa',
    )
def test_pipeline(self):
    """Test the pipeline on TransE and nations."""
    pipeline_result = pipeline(
        model='TransE',
        dataset='nations',
    )
    self.assertIsInstance(pipeline_result, PipelineResult)
    self.assertIsInstance(pipeline_result.model, Model)
    self.assertIsInstance(pipeline_result.model.regularizer, NoRegularizer)
def train_embedding(self, g, model="SimplE"):
    # pykeen
    from pykeen.pipeline import pipeline
    from pykeen.triples import TriplesFactory

    # create pseudo-nodes to encode node attributes
    pleasent, not_pleasent = len(g.concepts), len(g.concepts) + 1
    sensitiv, not_sensitive = len(g.concepts) + 2, len(g.concepts) + 3

    # build triples
    triples = []
    for c in g.concepts:
        # actual connections
        triples.extend([c.index, 'semantic', j] for j in g.get_semantic_ids(c))
        # encode attributes by binning
        if c.pleasentness != 0:
            triples.append([c.index, 'pleasent', pleasent if c.pleasentness > 0 else not_pleasent])
        if c.sensitivity != 0:
            triples.append([c.index, 'sensitiv', sensitiv if c.sensitivity > 0 else not_sensitive])
    triples, n = np.asarray(triples), len(triples)
    print("Number of Triples (Train/Total): %i/%i" % (int(0.9 * n), n))

    # create mask for training/testing separation (90/10 split)
    train_mask = np.full(n, False)
    train_mask[:int(n * 0.9)] = True
    np.random.shuffle(train_mask)
    # separate into training and testing
    train_triples = triples[train_mask]
    test_triples = triples[~train_mask]
    # create triples factories
    train_factory = TriplesFactory(triples=train_triples)
    test_factory = TriplesFactory(triples=test_triples)

    # create and run pipeline
    results = pipeline(
        # data
        training_triples_factory=train_factory,
        testing_triples_factory=test_factory,
        # model
        model=model,
        model_kwargs={
            "embedding_dim": self.embedd_dim,
            "automatic_memory_optimization": True,
        },
    )

    # get embedding tensor - remove pseudo-nodes
    weight = results.model.entity_embeddings.weight[:len(g.concepts), ...].cpu()
    # update word2id
    words = [c.text for c in g.concepts]
    self.word2id = OrderedDict(zip(words, range(1, len(words) + 1)))  # 0th element is padding
    # update embeddings - add padding embedding at position 0
    self.embedding = nn.Embedding(
        num_embeddings=len(words) + 1,
        embedding_dim=self.embedd_dim,
        _weight=torch.cat((torch.zeros((1, self.embedd_dim)), weight), dim=0),
    )
    # return results
    return results
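# Usage sketch for the encoder above. Hedged: ``enc`` and ``g`` are
# hypothetical instances of the surrounding class and its graph argument.
# After enc.train_embedding(g) has run, a concept's text maps through
# enc.word2id to a row of the padded embedding table; index 0 is the
# all-zero padding vector.
import torch

ids = torch.tensor([enc.word2id.get(c.text, 0) for c in g.concepts[:4]])
vectors = enc.embedding(ids)  # shape: (4, enc.embedd_dim)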
def setUpClass(cls):
    """Set up a shared result."""
    cls.result = pipeline(
        model='TransE',
        dataset='nations',
        training_kwargs=dict(num_epochs=5),
    )
    cls.model = cls.result.model
    nations = Nations()
    cls.testing_mapped_triples = nations.testing.mapped_triples.to(cls.model.device)
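# A minimal sketch of how the shared fixture above is typically consumed,
# assuming the standard RankBasedEvaluator API from pykeen.evaluation;
# ``cls.model`` and ``cls.testing_mapped_triples`` are the fixture attributes.
from pykeen.evaluation import RankBasedEvaluator

evaluator = RankBasedEvaluator()
metric_results = evaluator.evaluate(
    model=cls.model,
    mapped_triples=cls.testing_mapped_triples,
)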
def test_unlabeled_triples(self):
    """Test running the pipeline on unlabeled triples factories."""
    _ = pipeline(
        training=self.training,
        testing=self.testing,
        validation=self.validation,
        model='TransE',
        training_kwargs=dict(num_epochs=1, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
    )
def test_specify_regularizer(self):
    """Test a pipeline that uses a regularizer."""
    pipeline_result = pipeline(
        model=TransE,
        dataset='nations',
        regularizer='powersum',
    )
    self.assertIsInstance(pipeline_result, PipelineResult)
    self.assertIsInstance(pipeline_result.model, Model)
    self.assertIsInstance(pipeline_result.model.regularizer, PowerSumRegularizer)
def _help(self, model):
    rv = pipeline(
        dataset=NationsLiteral,
        model=model,
        training_kwargs=dict(num_epochs=5, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
        training_loop='lcwa',
    )
    self.assertIsNotNone(rv)
    with tempfile.TemporaryDirectory() as d:
        rv.save_to_directory(d)
def test_eager_unlabeled_dataset(self):
    """Test running the pipeline on unlabeled triples factories in a dataset."""
    dataset = EagerDataset(
        training=self.training,
        testing=self.testing,
        validation=self.validation,
    )
    _ = pipeline(
        dataset=dataset,
        model='TransE',
        training_kwargs=dict(num_epochs=1, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
    )
def _help_test_interaction_resolver(self, model_cls):
    self.assertTrue(issubclass(model_cls, ERModel))
    self.assertIsInstance(model_cls._interaction, TransEInteraction)
    self.assertEqual(2, model_cls._interaction.p)
    _ = pipeline(
        training=self.training,
        testing=self.testing,
        validation=self.validation,
        model=model_cls,
        training_kwargs=dict(num_epochs=1, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
        random_seed=0,
    )
def setUpClass(cls):
    """Set up a shared result."""
    cls.device = resolve_device('cuda')
    cls.result = pipeline(
        model='TransE',
        dataset='nations',
        training_kwargs=dict(num_epochs=5, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
        device=cls.device,
        random_seed=42,
    )
    cls.model = cls.result.model
    nations = Nations()
    cls.testing_mapped_triples = nations.testing.mapped_triples.to(cls.model.device)
def test_pipeline(self):
    """Test the pipeline on RotatE with the self-adversarial negative sampling loss and nations."""
    loss = NSSALoss
    loss_kwargs = {"margin": 1., "adversarial_temperature": 1.}
    pipeline_results = pipeline(
        model='RotatE',
        dataset='nations',
        loss=loss,
        loss_kwargs=loss_kwargs,
    )
    self.assertIsInstance(pipeline_results, PipelineResult)
    self.assertIsInstance(pipeline_results.model.loss, loss)
    self.assertEqual(pipeline_results.model.loss.margin, 1.)
    self.assertEqual(pipeline_results.model.loss.adversarial_temperature, 1.)
def test_pipeline_evaluation_filtering_with_validation_triples(self):
    """Test if the evaluator's triple filtering with validation triples works as expected using the pipeline."""
    results = pipeline(
        model=self.model,
        dataset=self.dataset,
        training_loop_kwargs=dict(automatic_memory_optimization=False),
        training_kwargs=dict(num_epochs=0, use_tqdm=False),
        evaluator_kwargs=dict(filtered=True, automatic_memory_optimization=False),
        evaluation_kwargs=dict(use_tqdm=False),
        device=self.device,
        random_seed=42,
        filter_validation_when_testing=True,
    )
    assert results.metric_results.arithmetic_mean_rank['both']['realistic'] == 1, 'The rank should equal 1'
def test_pipeline(self):
    """Test the pipeline on RotatE with the self-adversarial negative sampling loss and nations."""
    loss = NSSALoss
    loss_kwargs = {"margin": 1.0, "adversarial_temperature": 1.0}
    pipeline_results = pipeline(
        model="RotatE",
        dataset="nations",
        loss=loss,
        loss_kwargs=loss_kwargs,
        training_kwargs=dict(use_tqdm=False),
    )
    self.assertIsInstance(pipeline_results, PipelineResult)
    self.assertIsInstance(pipeline_results.model.loss, loss)
    self.assertEqual(pipeline_results.model.loss.margin, 1.0)
    self.assertEqual(pipeline_results.model.loss.inverse_softmax_temperature, 1.0)
def train(outfolder, training, epochs=10):
    import json

    from pykeen.pipeline import pipeline

    result = pipeline(
        training=training,
        testing=training,  # evaluate on the training file itself (no held-out split)
        model='TransE',
        training_kwargs=dict(num_epochs=epochs),
    )
    result.save_to_directory(outfolder)
    with open("{}/entity_id_to_label.json".format(outfolder), 'w') as outfile:
        json.dump(result.training.entity_id_to_label, outfile, indent=2)
    with open("{}/relation_id_to_label.json".format(outfolder), 'w') as outfile:
        json.dump(result.training.relation_id_to_label, outfile, indent=2)
    return result
def test_specify_regularizer(self):
    """Test a pipeline that uses a regularizer."""
    for regularizer, cls in [
        (None, pykeen.regularizers.NoRegularizer),
        ('no', pykeen.regularizers.NoRegularizer),
        (NoRegularizer, pykeen.regularizers.NoRegularizer),
        ('powersum', pykeen.regularizers.PowerSumRegularizer),
        ('lp', pykeen.regularizers.LpRegularizer),
    ]:
        with self.subTest(regularizer=regularizer):
            pipeline_result = pipeline(
                model='TransE',
                dataset='Nations',
                regularizer=regularizer,
                training_kwargs=dict(num_epochs=1),
            )
            self.assertIsInstance(pipeline_result, PipelineResult)
            self.assertIsInstance(pipeline_result.model, Model)
            self.assertIsInstance(pipeline_result.model.regularizer, cls)
def test_interaction_instance_builder(self):
    """Test resolving an interaction model instance."""
    model = make_model(
        dimensions={"d": 3},
        interaction=TransEInteraction,
        interaction_kwargs=dict(p=2),
        triples_factory=self.training,
    )
    self.assertIsInstance(model, ERModel)
    self.assertIsInstance(model.interaction, TransEInteraction)
    self.assertEqual(2, model.interaction.p)
    _ = pipeline(
        training=self.training,
        testing=self.testing,
        validation=self.validation,
        model=model,
        training_kwargs=dict(num_epochs=1, use_tqdm=False),
        evaluation_kwargs=dict(use_tqdm=False),
        random_seed=0,
    )
def __fit(
    self,
    model_text,
    num_epochs,
    train_batch_size,
    eval_batch_size,
    model_location,
):
    """Fit the model.

    This method can be extended to optimize the model further. To keep the code
    scalable, the training_kwargs, model_kwargs, etc. could be loaded from a JSON config.
    """
    self.__result = pipeline(
        training=self.__training,
        validation=self.__valid,
        testing=self.__testing,
        model=model_text,
        training_kwargs=dict(num_epochs=num_epochs, batch_size=train_batch_size),
        evaluation_kwargs=dict(batch_size=eval_batch_size),
    )
    self.__result.save_to_directory(model_location)
    self.__model = self.__result.model
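# A minimal sketch of the JSON-config idea mentioned in the docstring above.
# Hedged: ``config.json`` and its keys are illustrative, not a fixed schema;
# ``training``, ``valid``, and ``testing`` are assumed to be triples factories.
import json

from pykeen.pipeline import pipeline

with open("config.json") as f:
    cfg = json.load(f)

result = pipeline(
    training=training,
    validation=valid,
    testing=testing,
    model=cfg["model"],
    model_kwargs=cfg.get("model_kwargs", {}),
    training_kwargs=cfg.get("training_kwargs", {}),
    evaluation_kwargs=cfg.get("evaluation_kwargs", {}),
)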
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
import torch

training_path: str = "kg/train.hrt.txt"
validation_path: str = "kg/valid.hrt.txt"
testing_path: str = "kg/test.hrt.txt"

training = TriplesFactory(path=training_path)
valid = TriplesFactory(
    path=validation_path,
    entity_to_id=training.entity_to_id,
    relation_to_id=training.relation_to_id,
)
testing = TriplesFactory(
    path=testing_path,
    entity_to_id=training.entity_to_id,
    relation_to_id=training.relation_to_id,
)

result = pipeline(
    training=training,
    validation=valid,
    testing=testing,
    model='TransE',
    training_kwargs=dict(num_epochs=2, batch_size=512),
    evaluation_kwargs=dict(batch_size=128),
)
result.save_to_directory('saved-model')

model = torch.load('saved-model/trained_model.pkl')
print(model.predict_heads('VARIANT_DISEASE_associated', 'Leigh_syndrome'))
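# The script above uses the older ``TriplesFactory(path=...)`` constructor.
# A sketch of the same loading step with the newer classmethod used elsewhere
# in this collection, ``TriplesFactory.from_path`` (assumption: the keyword
# pass-through for the shared label-to-id maps):
from pykeen.triples import TriplesFactory

training = TriplesFactory.from_path("kg/train.hrt.txt")
testing = TriplesFactory.from_path(
    "kg/test.hrt.txt",
    entity_to_id=training.entity_to_id,
    relation_to_id=training.relation_to_id,
)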
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
from pykeen.pipeline import pipeline
import json

n_tokeep = 300
minimum = 500
tf = TriplesFactory.from_path(f'data/rare/rare_{minimum}_{n_tokeep}.csv')
training, testing = tf.split([.8, .2])

result_pipeline = pipeline(
    training=training,
    testing=testing,
    model='RESCAL',
    model_kwargs=dict(embedding_dim=300),
    training_kwargs=dict(
        # sampler="schlichtkrull",
        # checkpoint_name='RGCN_checkpointt.pt',
        # checkpoint_frequency=5,
        num_epochs=200,
        # batch_size=128,
    ),
    evaluator=RankBasedEvaluator,
    evaluator_kwargs=dict(ks=[50]),
)
result_pipeline.plot_losses()
result_pipeline.plot()
def run_inverse_stability_workflow(dataset: str, model: str, training_loop: str, random_seed=0, device='cpu'):
    """Run an inverse stability experiment."""
    dataset: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(
            create_inverse_triples=True,
        ),
    )
    dataset_name = dataset.get_normalized_name()
    model_cls: Type[Model] = get_model_cls(model)
    model_name = model_cls.__name__.lower()
    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper='early',
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    test_tf = dataset.testing
    model = pipeline_result.model

    # Score with original triples
    scores_forward = model.score_hrt(test_tf.mapped_triples)
    scores_forward_np = scores_forward.detach().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = model.score_hrt_inverse(test_tf.mapped_triples)
    scores_inverse_np = scores_inverse.detach().numpy()[:, 0]

    scores_path = dataset_dir / f'{model_name}_{training_loop}_scores.tsv'
    df = pd.DataFrame(
        list(zip(
            itt.repeat(training_loop),
            itt.repeat(dataset_name),
            itt.repeat(model_name),
            scores_forward_np,
            scores_inverse_np,
        )),
        columns=['training_loop', 'dataset', 'model', 'forward', 'inverse'],
    )
    df.to_csv(scores_path, sep='\t', index=False)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df, x='forward', label='Forward', ax=ax, color='blue', stat='density')
    sns.histplot(data=df, x='inverse', label='Inverse', ax=ax, color='orange', stat='density')
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Score')
    plt.legend()
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_overlay.png', dpi=300)
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat='density')
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Forward - Inverse Score Difference')
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_residuals.png', dpi=300)
    plt.close(fig)

    return df
def run_inverse_stability_workflow(dataset: str, model: str, training_loop: str, random_seed=0, device="cpu"):
    """Run an inverse stability experiment."""
    dataset_instance: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(create_inverse_triples=True),
    )
    dataset_name = dataset_instance.get_normalized_name()
    model_cls: Type[Model] = model_resolver.lookup(model)
    model_name = model_cls.__name__.lower()
    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset_instance,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper="early",
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    test_tf = dataset_instance.testing
    model = pipeline_result.model

    # Score with original triples
    scores_forward = model.score_hrt(test_tf.mapped_triples)
    scores_forward_np = scores_forward.detach().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = model.score_hrt_inverse(test_tf.mapped_triples)
    scores_inverse_np = scores_inverse.detach().numpy()[:, 0]

    scores_path = dataset_dir / f"{model_name}_{training_loop}_scores.tsv"
    df = pd.DataFrame(
        list(zip(
            itt.repeat(training_loop),
            itt.repeat(dataset_name),
            itt.repeat(model_name),
            scores_forward_np,
            scores_inverse_np,
        )),
        columns=["training_loop", "dataset", "model", "forward", "inverse"],
    )
    df.to_csv(scores_path, sep="\t", index=False)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df, x="forward", label="Forward", ax=ax, color="blue", stat="density")
    sns.histplot(data=df, x="inverse", label="Inverse", ax=ax, color="orange", stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Score")
    plt.legend()
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_overlay.png", dpi=300)
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Forward - Inverse Score Difference")
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_residuals.png", dpi=300)
    plt.close(fig)

    return df
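# Follow-up sketch: quantify forward/inverse agreement from the TSV that the
# workflow above writes. Hedged: ``scores_path`` points at one of the
# ``*_scores.tsv`` files produced above.
import pandas as pd

df = pd.read_csv(scores_path, sep="\t")
print(df[["forward", "inverse"]].corr().loc["forward", "inverse"])  # Pearson r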
valid._num_relations = _num_relations
test = TriplesFactory(path=test_path, entity_to_id=entity_to_id, relation_to_id=relation_to_id)
test._num_entities = _num_entities
test._num_relations = _num_relations

model = 'TransE'
result = pipeline(
    model=model,
    training_triples_factory=train,
    validation_triples_factory=valid,
    testing_triples_factory=test,
    training_kwargs={'num_epochs': 300},  # 30
    model_kwargs={'embedding_dim': 300},
    stopper='early',
    stopper_kwargs={
        'frequency': 10,
        'stopped': True,
        'patience': 1,
    },
    evaluation_kwargs={'batch_size': 32},
    optimizer_kwargs={'lr': 0.1},
)

# =============================================================================
# print(result.metric_results.hits_at_k['avg'])
# print(result.metric_results.hits_at_k['pred'])
# np.save(f'{dataset}_{model}_pred.npy', result.metric_results.hits_at_k['pred'])
# =============================================================================

print(result)
result.save_to_directory(f'{dataset}_{model}')