def start_predictions_pipeline(
        model_directory: str,
        data_directory: str,
        path_to_blacklisted_triples: Optional[str] = None,
        export_predictions: bool = True,
) -> np.ndarray:
    """Perform inference based on a trained KGE model.

    If ``export_predictions`` is true, the predictions are saved as
    ``predictions.tsv`` in the provided data directory.

    :param model_directory: Directory containing the experimental artifacts read by this function:
     ``configuration.json``, ``entity_to_id.json``, ``relation_to_id.json`` and ``trained_model.pkl``.
    :param data_directory: Directory containing the candidate entities as an ``entities.tsv`` file and the
     candidate relations as ``relations.tsv``. Both files are read as tab-separated string data; they are
     expected to consist of one column containing the entities/relations, from which the candidate triples
     are built by ``make_predictions``.
    :param path_to_blacklisted_triples: Optional path that is forwarded to ``make_predictions`` as
     ``blacklist_path``.
    :param export_predictions: Whether to write the ranked triples to ``predictions.tsv`` in ``data_directory``.
    :return: The ranked triples as returned by ``make_predictions`` (presumably a numpy array, since it is
     written with ``np.savetxt`` -- TODO confirm).
    """
    # Load configuration file
    with open(os.path.join(model_directory, 'configuration.json')) as f:
        config = json.load(f)

    # Load entity to id mapping
    with open(os.path.join(model_directory, 'entity_to_id.json')) as f:
        entity_to_id = json.load(f)

    # Load relation to id mapping
    with open(os.path.join(model_directory, 'relation_to_id.json')) as f:
        relation_to_id = json.load(f)

    # Resolve the device *before* loading the weights so that the checkpoint
    # can be mapped onto it; otherwise a model saved on a GPU cannot be
    # loaded on a machine without CUDA.
    use_gpu = torch.cuda.is_available() and config[PREFERRED_DEVICE] == GPU
    device_name = 'cuda:0' if use_gpu else CPU
    device = torch.device(device_name)

    trained_kge_model: Module = get_kge_model(config=config)
    path_to_model = os.path.join(model_directory, 'trained_model.pkl')
    # NOTE: torch.load unpickles the file; only load model files from trusted sources.
    state_dict = torch.load(path_to_model, map_location=device)
    trained_kge_model.load_state_dict(state_dict)

    entities = np.loadtxt(
        fname=os.path.join(data_directory, 'entities.tsv'),
        dtype=str,
        delimiter='\t',
    )
    relations = np.loadtxt(
        fname=os.path.join(data_directory, 'relations.tsv'),
        dtype=str,
        delimiter='\t',
    )

    ranked_triples = make_predictions(
        kge_model=trained_kge_model,
        entities=entities,
        relations=relations,
        entity_to_id=entity_to_id,
        rel_to_id=relation_to_id,
        device=device,
        blacklist_path=path_to_blacklisted_triples,
    )

    if export_predictions:
        np.savetxt(os.path.join(data_directory, 'predictions.tsv'), ranked_triples, fmt='%s')

    return ranked_triples
def run(self) -> Mapping:
    """Run this pipeline.

    Depending on ``self._use_hpo(self.config)``, either a hyper-parameter
    optimization is delegated to ``RandomSearch.run`` or a single model is
    built with ``get_kge_model`` and trained with its ``fit`` method. In
    training mode the model is additionally evaluated with
    ``compute_metric_results`` when ``self.is_evaluation_required`` is set.

    :return: The value produced by ``_make_results`` for the trained model, its per-epoch losses,
     the label-to-embedding mappings, the metric results (``None`` if no evaluation took place),
     the label-to-id mappings and the parameters that were used.
    """
    metric_results = None

    if self._use_hpo(self.config):  # Hyper-parameter optimization mode
        mapped_pos_train_triples, mapped_pos_test_triples = self._get_train_and_test_triples()

        # The third and fourth returned values are discarded: the
        # label-to-embedding mappings are rebuilt from the trained model below.
        (trained_model,
         loss_per_epoch,
         _,
         _,
         metric_results,
         params) = RandomSearch.run(
            mapped_train_triples=mapped_pos_train_triples,
            mapped_test_triples=mapped_pos_test_triples,
            entity_to_id=self.entity_label_to_id,
            rel_to_id=self.relation_label_to_id,
            config=self.config,
            device=str(self.device),
            seed=self.seed,
        )
    else:  # Training Mode
        if self.is_evaluation_required:
            mapped_pos_train_triples, mapped_pos_test_triples = self._get_train_and_test_triples()
        else:
            mapped_pos_train_triples, mapped_pos_test_triples = self._get_train_triples(), None

        # Initialize KG embedding model
        self.config[pkc.PREFERRED_DEVICE] = pkc.CPU if self.device_name == pkc.CPU else pkc.GPU
        if self.seed is not None:
            torch.manual_seed(self.seed)

        kge_model: Module = get_kge_model(config=self.config)
        kge_model.entity_label_to_id = self.entity_label_to_id
        kge_model.relation_label_to_id = self.relation_label_to_id
        kge_model.num_entities = len(self.entity_label_to_id)
        kge_model.num_relations = len(self.relation_label_to_id)

        batch_size = self.config[pkc.BATCH_SIZE]
        num_epochs = self.config[pkc.NUM_EPOCHS]
        learning_rate = self.config[pkc.LEARNING_RATE]

        log.info("-------------Train KG Embeddings-------------")
        loss_per_epoch = kge_model.fit(
            pos_triples=mapped_pos_train_triples,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        trained_model = kge_model
        params = self.config

        if self.is_evaluation_required:
            log.info("-------------Start Evaluation-------------")
            metric_results = compute_metric_results(
                kg_embedding_model=kge_model,
                mapped_train_triples=mapped_pos_train_triples,
                mapped_test_triples=mapped_pos_test_triples,
                device=str(self.device),
                filter_neg_triples=self.config[pkc.FILTER_NEG_TRIPLES],
            )

    # Prepare Output: invert the label-to-id mappings so that each row of an
    # embedding matrix can be keyed by its label.
    entity_id_to_label = {
        value: key
        for key, value in self.entity_label_to_id.items()
    }
    relation_id_to_label = {
        value: key
        for key, value in self.relation_label_to_id.items()
    }

    entity_label_to_embedding = {
        entity_id_to_label[entity_id]: embedding.detach().cpu().numpy()
        for entity_id, embedding in enumerate(trained_model.entity_embeddings.weight)
    }

    # For the SE and UM models no relation embedding mapping is exported.
    if self.config[pkc.KG_EMBEDDING_MODEL_NAME] in (pkc.SE_NAME, pkc.UM_NAME):
        relation_label_to_embedding = None
    else:
        relation_label_to_embedding = {
            relation_id_to_label[relation_id]: embedding.detach().cpu().numpy()
            for relation_id, embedding in enumerate(trained_model.relation_embeddings.weight)
        }

    return _make_results(
        trained_model=trained_model,
        loss_per_epoch=loss_per_epoch,
        entity_to_embedding=entity_label_to_embedding,
        relation_to_embedding=relation_label_to_embedding,
        metric_results=metric_results,
        entity_to_id=self.entity_label_to_id,
        rel_to_id=self.relation_label_to_id,
        params=params,
    )
def optimize_hyperparams(
        self,
        mapped_train_triples,
        mapped_test_triples,
        entity_to_id,
        rel_to_id,
        config,
        device,
        seed: Optional[int] = None,
        k_evaluation: int = 10,
) -> HPOptimizerResult:
    """Run a random search over hyper-parameters and return the best trial.

    For each of ``config[pkc.NUM_OF_HPO_ITERS]`` iterations a configuration is
    sampled, a model is built with ``get_kge_model``, trained with its ``fit``
    method and evaluated with ``compute_metric_results``. The trial with the
    highest ``hits_at_k[k_evaluation]`` wins; on ties the earliest trial is kept.

    :param mapped_train_triples: Training triples passed to ``fit`` and to the evaluation.
    :param mapped_test_triples: Test triples passed to the evaluation.
    :param entity_to_id: Mapping attached to every model as ``entity_label_to_id``; its length is used
     as the number of entities.
    :param rel_to_id: Mapping attached to every model as ``relation_label_to_id``; its length is used
     as the number of relations.
    :param config: Base configuration; a shallow copy is handed to the sampling function, which is
     expected to return the configuration for a single trial.
    :param device: Device identifier; compared against ``pkc.CPU`` to choose the preferred device and
     forwarded to the evaluation.
    :param seed: If given, used to seed torch once before the search and stored in each trial's
     configuration under ``pkc.SEED``.
    :param k_evaluation: Key into ``metric_results.hits_at_k`` that selects the value used to rank trials.
    :return: A tuple of the best trained model, its per-epoch losses, ``entity_to_id``, ``rel_to_id``,
     the metric results of the best trial and the configuration of the best trial.
    :raises ValueError: If no trial was run (raised by ``np.argmax`` on an empty sequence).
    """
    if seed is not None:
        # Seed with the value that was actually passed in; the guard above is
        # on ``seed``, and ``config`` is not guaranteed to carry a seed entry.
        torch.manual_seed(seed)

    trained_kge_models: List[Module] = []
    epoch_losses: List[List[float]] = []
    hits_at_k_evaluations: List[float] = []
    models_params: List[Dict] = []
    eval_summaries: List = []

    config = config.copy()
    max_iters = config[pkc.NUM_OF_HPO_ITERS]

    # ConvE has its own sampling routine; every other model uses the generic one.
    sample_fct = (
        self._sample_conv_e_params
        if config[pkc.KG_EMBEDDING_MODEL_NAME] == pkc.CONV_E_NAME else
        self._sample_parameter_value
    )

    for _ in trange(max_iters, desc='HPO Iteration'):
        # Sample hyper-params
        kge_model_config: Dict[str, Any] = sample_fct(config)
        kge_model_config[pkc.SEED] = seed
        kge_model_config[pkc.PREFERRED_DEVICE] = pkc.CPU if device == pkc.CPU else pkc.GPU

        # Configure defined model
        kge_model: Module = get_kge_model(config=kge_model_config)

        # Load class params
        kge_model.entity_label_to_id = entity_to_id
        kge_model.relation_label_to_id = rel_to_id
        kge_model.num_entities = len(entity_to_id)
        kge_model.num_relations = len(rel_to_id)

        models_params.append(kge_model_config)

        batch_size = kge_model_config[pkc.BATCH_SIZE]
        num_epochs = kge_model_config[pkc.NUM_EPOCHS]
        learning_rate = kge_model_config[pkc.LEARNING_RATE]

        epoch_loss = kge_model.fit(
            pos_triples=mapped_train_triples,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            batch_size=batch_size,
            tqdm_kwargs=dict(leave=False),
        )

        # Evaluate trained model
        metric_results = compute_metric_results(
            kg_embedding_model=kge_model,
            mapped_train_triples=mapped_train_triples,
            mapped_test_triples=mapped_test_triples,
            device=device,
        )

        # TODO: Define HPO metric
        eval_summaries.append(metric_results)
        trained_kge_models.append(kge_model)
        epoch_losses.append(epoch_loss)
        hits_at_k_evaluations.append(metric_results.hits_at_k[k_evaluation])

    index_of_max = int(np.argmax(a=hits_at_k_evaluations))

    # The id mappings are the same objects for every trial, so they are
    # returned directly instead of being collected once per iteration.
    return (
        trained_kge_models[index_of_max],
        epoch_losses[index_of_max],
        entity_to_id,
        rel_to_id,
        eval_summaries[index_of_max],
        models_params[index_of_max],
    )