def test_logger():
    """Check level filtering, output format and singleton behaviour of Logger.

    A stream handler writing into an in-memory buffer is attached to a named
    logger, the shared level is raised to WARNING, and one message is emitted
    at every standard level. Only WARNING and above may reach the buffer.
    The test also asserts that ``Logger()`` returns the very same object and
    that a logger requested afterwards carries the level set on ``log``.
    """
    name = "TEST"
    text = "This is a log message"

    log = Logger()
    logger = log.get_logger(name)

    # Capture everything the logger emits in memory, using the project format.
    sink = io.StringIO()
    handler = logging.StreamHandler(sink)
    handler.setFormatter(logging.Formatter(Logger._FORMAT))
    logger.addHandler(handler)

    log.level = logging.WARNING
    for emit in (logger.debug, logger.info, logger.warning, logger.error,
                 logger.critical):
        emit(text)

    # Requested only after the level change on purpose.
    another_logger = log.get_logger("ANOTHER")

    output = sink.getvalue()
    sink.close()

    filtered_out = ("%s - DEBUG - %s", "%s - INFO - %s")
    passed_through = ("%s - WARNING - %s", "%s - ERROR - %s",
                      "%s - CRITICAL - %s")
    for template in filtered_out:
        assert template % (name, text) not in output
    for template in passed_through:
        assert template % (name, text) in output

    assert Logger() is log
    assert another_logger.level == log.level
class EarlyStopper:
    """Early-stopping bookkeeping used by the trainer of KGE algorithms.

    The stopper compares the monitored metric of the current evaluation with
    the one from the previous evaluation. Every worsening evaluation uses up
    one chance; once no chance is left, the next worsening evaluation makes
    :meth:`should_stop` return True. Any other outcome restores the full
    patience.

    Args:
        patience (int): Number of worsening evaluations tolerated before
            stopping. With a negative value the stop condition is never
            reached.
        monitor (Monitor): Metric to watch. Its ``value`` is used as the key
            into the metrics mapping and its ``name`` in log messages.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, patience, monitor):
        self.monitor = monitor
        self.patience = patience

        # Mutable state driven by should_stop().
        self.previous_metrics = None
        self.patience_left = patience

    def should_stop(self, curr_metrics):
        """Update the patience counter and tell whether training should stop.

        Args:
            curr_metrics: Mapping indexed by ``monitor.value`` that holds the
                metrics of the latest evaluation.

        Returns:
            bool: True only when the metric got worse and no chance is left.
        """
        stop = False
        key, label = self.monitor.value, self.monitor.name

        if self.previous_metrics is not None:
            # For rank metrics a larger number is worse; for every other
            # monitored metric a smaller number is worse.
            rank_based = (self.monitor == Monitor.MEAN_RANK
                          or self.monitor == Monitor.FILTERED_MEAN_RANK)
            if rank_based:
                got_worse = self.previous_metrics[key] < curr_metrics[key]
            else:
                got_worse = self.previous_metrics[key] > curr_metrics[key]

            if self.patience_left > 0 and got_worse:
                self.patience_left -= 1
                details = (self.patience_left, label, label,
                           self.previous_metrics[key], curr_metrics[key])
                self._logger.info(
                    '%s more chances before the trainer stops the training. (prev_%s, curr_%s): (%.4f, %.4f)'
                    % details)
            elif self.patience_left == 0 and got_worse:
                self._logger.info('Stop the training.')
                stop = True
            else:
                self._logger.info('Reset the patience count to %d' % (self.patience))
                self.patience_left = self.patience

        # Remember this evaluation for the next comparison.
        self.previous_metrics = curr_metrics
        return stop
class EarlyStopper:
    """Decide when training should stop because a metric keeps degrading.

    Args:
        patience (int): How many degrading evaluations are tolerated before
            the stop signal is raised.
        monitor (Monitor): Monitored metric; ``monitor.value`` indexes the
            metrics mapping and ``monitor.name`` labels the log output.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, patience, monitor):
        self.monitor = monitor
        self.patience = patience
        # Bookkeeping: the last metrics seen and the remaining chances.
        self.previous_metrics = None
        self.patience_left = patience

    def _is_worse(self, before, after):
        """Return whether ``after`` is a degradation compared to ``before``."""
        # Mean-rank style metrics degrade when they grow; all others degrade
        # when they shrink.
        if self.monitor == Monitor.MEAN_RANK or self.monitor == Monitor.FILTERED_MEAN_RANK:
            return before < after
        return before > after

    def should_stop(self, curr_metrics):
        """Record ``curr_metrics`` and report whether training must stop.

        Returns:
            bool: True when the monitored metric degraded again although no
            chances were left; False otherwise (including the first call).
        """
        verdict = False
        metric_key, metric_name = self.monitor.value, self.monitor.name

        if self.previous_metrics is not None:
            is_worse = self._is_worse(self.previous_metrics[metric_key],
                                      curr_metrics[metric_key])

            if self.patience_left > 0 and is_worse:
                self.patience_left -= 1
                self._logger.info(
                    '%s more chances before the trainer stops the training. (prev_%s, curr_%s): (%.4f, %.4f)' % (
                        self.patience_left,
                        metric_name,
                        metric_name,
                        self.previous_metrics[metric_key],
                        curr_metrics[metric_key],
                    ))
            elif self.patience_left == 0 and is_worse:
                self._logger.info('Stop the training.')
                verdict = True
            else:
                # Improvement (or negative patience): start counting afresh.
                self._logger.info('Reset the patience count to %d' % (self.patience))
                self.patience_left = self.patience

        self.previous_metrics = curr_metrics
        return verdict
class Config:
    """Basic configuration object shared by the pykg2vec modules.

    The constructor copies every attribute of ``args`` onto the instance,
    applies a handful of fixed evaluation/plotting settings, builds the
    :class:`KnowledgeGraph` for the requested dataset, copies the graph's
    metadata attributes onto the instance and creates the output folders
    below the dataset folder.

    Args:
        args: Parsed argument object. ``dataset_name``, ``dataset_path`` and
            ``exp`` are always read; ``model_name`` is read when ``exp`` is
            True.

    Attributes:
        hits (list): Cut-offs used for the hits metric (``[1, 3, 5, 10]``).
        disp_result (bool): Set to False here.
        patience (int): Early-stopping patience, currently fixed to 3.
        disp_triple_num (int): Set to 20 here.
        plot_training_result (bool): Set to True here.
        plot_testing_result (bool): Set to True here.
        knowledge_graph (KnowledgeGraph): Knowledge graph of the dataset.
        path_tmp (Path): ``<dataset path>/intermediate``.
        path_result (Path): ``<dataset path>/results``.
        path_figures (Path): ``<dataset path>/figures``.
        path_embeddings (Path): ``<dataset path>/embeddings``.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, args):
        for arg_name in vars(args):
            self.__dict__[arg_name] = getattr(args, arg_name)

        # Training and evaluating related variables.
        # NOTE: these are assigned after the copy above, so they override
        # same-named attributes coming from ``args``.
        self.hits = [1, 3, 5, 10]
        self.disp_result = False
        self.patience = 3  # should make this configurable as well.

        # Visualization related,
        # p.s. the visualizer is disable for most of the KGE methods for now.
        self.disp_triple_num = 20
        self.plot_training_result = True
        self.plot_testing_result = True

        # Knowledge Graph Information
        self.knowledge_graph = KnowledgeGraph(
            dataset=args.dataset_name,
            custom_dataset_path=args.dataset_path)
        for key, value in self.knowledge_graph.kg_meta.__dict__.items():
            self.__dict__[key] = value

        # The results of training will be stored in the following folders
        # which are relative to the parent folder (the path of the dataset).
        dataset_path = self.knowledge_graph.dataset.dataset_path
        self.path_tmp = dataset_path / 'intermediate'
        self.path_result = dataset_path / 'results'
        self.path_figures = dataset_path / 'figures'
        self.path_embeddings = dataset_path / 'embeddings'
        for folder in (self.path_tmp, self.path_result, self.path_figures,
                       self.path_embeddings):
            folder.mkdir(parents=True, exist_ok=True)

        if args.exp is True:
            # The loader's constructor requires ``args``; calling it without
            # arguments raised TypeError.
            paper_params = HyperparamterLoader(args).load_hyperparameter(
                args.dataset_name, args.model_name)
            # Copy all the settings from the paper.
            for key, value in paper_params.items():
                self.__dict__[key] = value

    def summary(self):
        """Log every configuration attribute as a right-aligned table."""
        # Width of the longest attribute name plus 20 spaces of padding.
        # (max() on the names themselves would pick the lexicographically
        # largest name, not the longest one.)
        width = max((len(key) for key in self.__dict__), default=0) + 20

        lines = ["", "------------------Global Setting--------------------"]
        for key, val in self.__dict__.items():
            # The graph and its metadata are too large to be worth printing.
            if isinstance(val, (KGMetaData, KnowledgeGraph)):
                continue
            lines.append("%s : %s" % (key.rjust(width), val))
        lines.append("---------------------------------------------------")
        lines.append("")
        self._logger.info("\n".join(lines))
class Importer:
    """Dynamic importer for the pykg2vec model classes.

    Importer maps user-facing algorithm names to ``<module>.<class>`` paths
    and loads the configuration class together with the requested model
    class.

    Attributes:
        model_path (str): Package where the models are defined.
        config_path (str): Module where the configuration class is defined.
        modelMap (dict): Maps a lowercase model name to
            ``"<submodule>.<ClassName>"`` below ``model_path``.

    Examples:
        >>> config_def, model_def = Importer().import_model_config('transe')
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self):
        self.model_path = "pykg2vec.models"
        self.config_path = "pykg2vec.config"

        self.modelMap = {
            "analogy": "pointwise.ANALOGY",
            "complex": "pointwise.Complex",
            "complexn3": "pointwise.ComplexN3",
            "conve": "projection.ConvE",
            "convkb": "pointwise.ConvKB",
            "cp": "pointwise.CP",
            "hole": "pairwise.HoLE",
            "distmult": "pointwise.DistMult",
            "kg2e": "pairwise.KG2E",
            "ntn": "pairwise.NTN",
            "proje_pointwise": "projection.ProjE_pointwise",
            "rescal": "pairwise.Rescal",
            "rotate": "pairwise.RotatE",
            "simple": "pointwise.SimplE",
            "simple_ignr": "pointwise.SimplE_ignr",
            "slm": "pairwise.SLM",
            "sme": "pairwise.SME",
            "sme_bl": "pairwise.SME_BL",
            "transd": "pairwise.TransD",
            "transe": "pairwise.TransE",
            "transh": "pairwise.TransH",
            "transm": "pairwise.TransM",
            "transr": "pairwise.TransR",
            "tucker": "projection.TuckER",
        }

    def _not_implemented_message(self, name):
        """Build the error text listing every available model class name."""
        available = ' '.join(
            str(path).split(".")[1] for path in self.modelMap.values())
        return ("%s model has not been implemented. please select from: %s"
                % (name, available))

    def import_model_config(self, name):
        """Import the configuration class and the model class for ``name``.

        Args:
            name (str): Name of the model. It is lower-cased before the
                lookup so user input such as ``"TransE"`` matches too.

        Returns:
            tuple: ``(config_obj, model_obj)`` where ``config_obj`` is the
            ``Config`` class found in ``config_path`` and ``model_obj`` is
            the class of the requested algorithm.

        Raises:
            ValueError: If ``name`` is not a known model, or if the module
                that should contain the model cannot be imported.
        """
        config_obj = getattr(importlib.import_module(self.config_path), "Config")

        key = name.lower() if isinstance(name, str) else name
        if key not in self.modelMap:
            raise ValueError(self._not_implemented_message(name))

        module_name, class_name = self.modelMap[key].split('.')
        try:
            module = importlib.import_module(
                "%s.%s" % (self.model_path, module_name))
        except ModuleNotFoundError as err:
            # The previous handler evaluated ``str.split(".")[1]`` and died
            # with IndexError before it could report the actual problem.
            message = self._not_implemented_message(name)
            self._logger.error(message)
            raise ValueError(message) from err

        model_obj = getattr(module, class_name)
        return config_obj, model_obj
class HyperparameterLoader:
    """Load hyperparameters and tuning search spaces from YAML files.

    On construction every ``*.yaml`` file below the ``searchspaces`` and
    ``hyperparams`` folders next to this module is loaded. Files given via
    ``args.hp_abs_file`` / ``args.ss_abs_file`` are loaded afterwards, so
    their entries take precedence over the packaged ones.

    Args:
        args: Object that may carry ``hp_abs_file`` and ``ss_abs_file``
            attributes; missing or None attributes are ignored.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, args):
        self.hyperparams = {}
        self.search_space = {}

        # Packaged defaults first.
        packaged_ss = Path(__file__).resolve().parent / "searchspaces"
        for yaml_path in packaged_ss.glob('**/*.yaml'):
            self.search_space = self._load_ss_yaml(yaml_path, self.search_space)

        packaged_hp = Path(__file__).resolve().parent / "hyperparams"
        for yaml_path in packaged_hp.glob('**/*.yaml'):
            self.hyperparams = self._load_hp_yaml(yaml_path, self.hyperparams)

        # Then the optional user supplied files.
        user_hp = getattr(args, "hp_abs_file", None)
        if user_hp is not None:
            self.hyperparams = self._load_hp_yaml(user_hp, self.hyperparams)

        user_ss = getattr(args, "ss_abs_file", None)
        if user_ss is not None:
            self.search_space = self._load_ss_yaml(user_ss, self.search_space)

    def load_hyperparameter(self, dataset_name, algorithm):
        """Return the stored parameters for a (dataset, algorithm) pair.

        Both names are lower-cased before the lookup.

        Raises:
            Exception: If no parameters are stored for the pair.
        """
        per_dataset = self.hyperparams.get(dataset_name.lower())
        algo_key = algorithm.lower()
        if per_dataset is None or algo_key not in per_dataset:
            raise Exception("This experimental setting for (%s, %s) has not been configured" % (dataset_name, algorithm))
        return per_dataset[algo_key]

    def load_search_space(self, algorithm):
        """Return the search space stored for ``algorithm``.

        Raises:
            ValueError: If no search space is stored for ``algorithm``.
        """
        if algorithm not in self.search_space:
            raise ValueError("Hyperparameter search space is not configured for %s" % algorithm)
        return self.search_space[algorithm]

    @staticmethod
    def _load_hp_yaml(config_file, hyperparams):
        """Merge the hyperparameters of one YAML file into ``hyperparams``.

        The file provides ``model_name`` and a ``datasets`` list whose items
        carry ``dataset`` and ``parameters``.

        Returns:
            dict: Mapping ``dataset -> algorithm -> parameters``.

        Raises:
            FileNotFoundError: If ``config_file`` is not an existing file.
            ValueError: If the file name does not end in yaml/yml.
            yaml.YAMLError: Re-raised after logging when parsing fails.
        """
        if not os.path.isfile(config_file):
            raise FileNotFoundError("Cannot find configuration file %s" % config_file)
        if not str(config_file).endswith(("yaml", "yml")):
            raise ValueError("Configuration file must have .yaml or .yml extension: %s" % config_file)

        with open(os.path.abspath(config_file), "r") as file:
            try:
                config = yaml.safe_load(file)
                algorithm = config["model_name"].lower()
                for entry in config["datasets"]:
                    if entry["dataset"] in hyperparams:
                        hyperparams[entry["dataset"]][algorithm] = entry["parameters"]
                    else:
                        hyperparams = {**hyperparams, entry["dataset"]: {algorithm: entry["parameters"]}}
            except yaml.YAMLError:
                HyperparameterLoader._logger.error("Cannot load configuration: %s" % config_file)
                raise
        return hyperparams

    @staticmethod
    def _load_ss_yaml(config_file, search_space):
        """Merge the search space of one YAML file into ``search_space``.

        The file provides ``model_name`` and ``search_space``.

        Returns:
            dict: Mapping ``algorithm -> search space``.

        Raises:
            FileNotFoundError: If ``config_file`` is not an existing file.
            ValueError: If the file name does not end in yaml/yml.
            yaml.YAMLError: Re-raised after logging when parsing fails.
        """
        if not os.path.isfile(config_file):
            raise FileNotFoundError("Cannot find configuration file %s" % config_file)
        if not str(config_file).endswith(("yaml", "yml")):
            raise ValueError("Configuration file must have .yaml or .yml extension: %s" % config_file)

        with open(os.path.abspath(config_file), "r") as file:
            try:
                config = yaml.safe_load(file)
                algorithm = config["model_name"].lower()
                space = HyperparameterLoader._config_tuning_space(config["search_space"])
                search_space = {**search_space, algorithm: space}
            except yaml.YAMLError:
                HyperparameterLoader._logger.error("Cannot load configuration: %s" % config_file)
                raise
        return search_space

    @staticmethod
    def _config_tuning_space(tuning_space_raw):
        """Translate a raw search-space mapping into hyperopt expressions.

        Only the known keys present in ``tuning_space_raw`` are translated;
        the hyperopt label of each expression equals its key.

        Returns:
            dict or None: None when ``tuning_space_raw`` is None, otherwise
            a mapping from parameter name to hyperopt expression.
        """
        if tuning_space_raw is None:
            return None

        def log_float(key):
            bounds = tuning_space_raw[key]
            return hp.loguniform(key, np.log(bounds['min']), np.log(bounds['max']))

        def log_int(key):
            bounds = tuning_space_raw[key]
            return scope.int(hp.qloguniform(key, np.log(bounds['min']), np.log(bounds['max']), 1))

        def linear(key):
            bounds = tuning_space_raw[key]
            return hp.uniform(key, bounds["min"], bounds["max"])

        def choice(key):
            return hp.choice(key, tuning_space_raw[key])

        # Order matters only for the key order of the resulting dict.
        recipes = (
            ("learning_rate", log_float),
            ("hidden_size", log_int),
            ("ent_hidden_size", log_int),
            ("rel_hidden_size", log_int),
            ("batch_size", log_int),
            ("margin", linear),
            ("lmbda", log_float),
            ("distance_measure", choice),
            ("cmax", log_float),
            ("cmin", log_float),
            ("optimizer", choice),
            ("bilinear", choice),
            ("epochs", choice),
        )

        hyper_obj = {}
        for key, build in recipes:
            if key in tuning_space_raw:
                hyper_obj[key] = build(key)
        return hyper_obj
class KnownDataset:
    """Handle one of the well-known knowledge graph datasets.

    On construction the dataset archive is downloaded and extracted into
    ``../dataset/<name>`` unless that folder already exists. The paths of
    the raw text files and of all cache files are then derived.

    Args:
        name (str): Name of the dataset.
        url (str): Full url of the dataset archive (``.tar.gz``, ``.tgz``
            or ``.zip``).
        prefix (str): Prefix of the train/test/valid file names.

    Attributes:
        dataset_home_path (Path): Folder that holds all downloaded datasets.
        root_path (Path): Folder of this specific dataset.
        dataset_path (Path): Folder that contains the text and cache files.
        data_paths (dict): Paths of the train/test/valid text files.
        cache_triplet_paths (dict): Paths of the pickled triplet lists.

    Examples:
        >>> name = "dL50a"
        >>> url = "https://github.com/louisccc/KGppler/raw/master/datasets/dL50a.tgz"
        >>> prefix = 'deeplearning_dataset_50arch-'
        >>> kgdata = KnownDataset(name, url, prefix)
        >>> kgdata.dump()
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, name, url, prefix):
        self.name = name
        self.url = url
        self.prefix = prefix

        self.dataset_home_path = Path('..') / 'dataset'
        self.dataset_home_path.mkdir(parents=True, exist_ok=True)
        self.dataset_home_path = self.dataset_home_path.resolve()
        self.root_path = self.dataset_home_path / self.name
        self.tar = self.root_path / ('%s.tgz' % self.name)
        self.zip = self.root_path / ('%s.zip' % self.name)

        # An existing folder is taken as "already downloaded".
        if not self.root_path.exists():
            self.download()
            self.extract()

        # Datasets whose files sit directly in the root folder.
        path_eq_root = [
            'YAGO3_10', 'WN18RR', 'FB15K_237', 'Kinship', 'Nations', 'UMLS',
            'NELL_995'
        ]
        if self.name == 'WN18':
            self.dataset_path = self.root_path / 'wordnet-mlj12'
        elif self.name in path_eq_root:
            self.dataset_path = self.root_path
        else:
            self.dataset_path = self.root_path / self.name

        self.data_paths = {
            'train': self.dataset_path / ('%strain.txt' % self.prefix),
            'test': self.dataset_path / ('%stest.txt' % self.prefix),
            'valid': self.dataset_path / ('%svalid.txt' % self.prefix)
        }

        self.cache_triplet_paths = {
            'train': self.dataset_path / 'triplets_train.pkl',
            'test': self.dataset_path / 'triplets_test.pkl',
            'valid': self.dataset_path / 'triplets_valid.pkl'
        }
        self.cache_metadata_path = self.dataset_path / 'metadata.pkl'
        self.cache_hr_t_path = self.dataset_path / 'hr_t.pkl'
        self.cache_tr_h_path = self.dataset_path / 'tr_h.pkl'
        self.cache_hr_t_train_path = self.dataset_path / 'hr_t_train.pkl'
        self.cache_tr_h_train_path = self.dataset_path / 'tr_h_train.pkl'
        self.cache_idx2entity_path = self.dataset_path / 'idx2entity.pkl'
        self.cache_idx2relation_path = self.dataset_path / 'idx2relation.pkl'
        self.cache_entity2idx_path = self.dataset_path / 'entity2idx.pkl'
        self.cache_relation2idx_path = self.dataset_path / 'relation2idx.pkl'
        self.cache_relationproperty_path = self.dataset_path / 'relationproperty.pkl'

    def download(self):
        """Download the dataset archive from ``url`` into ``root_path``.

        Raises:
            NotImplementedError: If the url does not end in a supported
                archive suffix. Nothing is created on disk in that case.
            FileExistsError: If ``root_path`` already exists.
        """
        # Validate first, so an unsupported url leaves no empty folder that
        # would make later runs skip the download.
        if self.url.endswith(('.tar.gz', '.tgz')):
            target = self.tar
        elif self.url.endswith('.zip'):
            target = self.zip
        else:
            raise NotImplementedError("Unknown compression format")

        self._logger.info("Downloading the dataset %s" % self.name)
        self.root_path.mkdir()
        try:
            with urllib.request.urlopen(self.url) as response, open(
                    str(target), 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
        except BaseException:
            # A partial download must not survive: the constructor treats an
            # existing root folder as a finished download.
            shutil.rmtree(str(self.root_path), ignore_errors=True)
            raise

    def extract(self):
        """Extract the downloaded archive into ``root_path``.

        Extraction is best-effort: failures are logged, not raised.
        """
        try:
            if os.path.exists(self.tar):
                self._logger.info(
                    "Extracting the downloaded dataset from %s to %s" %
                    (self.tar, self.root_path))
                extract_tar(str(self.tar), str(self.root_path))
                return

            if os.path.exists(self.zip):
                self._logger.info(
                    "Extracting the downloaded dataset from %s to %s" %
                    (self.zip, self.root_path))
                extract_zip(str(self.zip), str(self.root_path))
                return
        except Exception as e:
            self._logger.error("Could not extract the target file!")
            self._logger.error("%s %s" % (type(e), e.args))

    def read_metadata(self):
        """Load and return the pickled metadata of the knowledge graph."""
        with open(str(self.cache_metadata_path), 'rb') as f:
            meta = pickle.load(f)
            return meta

    def is_meta_cache_exists(self):
        """Return whether the metadata cache file exists."""
        return self.cache_metadata_path.exists()

    def dump(self):
        """Log every attribute of this dataset object."""
        for key, value in self.__dict__.items():
            self._logger.info("%s %s" % (key, value))
class UserDefinedDataset(object):
    """Handle a dataset supplied by the user.

    The dataset folder must contain ``<name>-train.txt``,
    ``<name>-test.txt`` and ``<name>-valid.txt``. Cache files are written
    into the same folder.

    Args:
        name (str): Name of the dataset; also the prefix of the text files.
        custom_dataset_path (str): Folder that contains the dataset files.

    Attributes:
        dataset_path (Path): Resolved dataset folder.
        root_path (Path): Same folder as ``dataset_path``.
        data_paths (dict): Paths of the train/test/valid text files.
        cache_triplet_paths (dict): Paths of the pickled triplet lists.

    Raises:
        NotImplementedError: If the folder or one of the three text files
            does not exist.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, name, custom_dataset_path):
        self.name = name

        self.dataset_path = Path(custom_dataset_path).resolve()
        self.root_path = self.dataset_path

        if not self.root_path.exists():
            raise NotImplementedError("%s user defined dataset not found!" % self.root_path)

        train_file = self.root_path / (name + '-train.txt')
        test_file = self.root_path / (name + '-test.txt')
        valid_file = self.root_path / (name + '-valid.txt')

        if not train_file.exists():
            raise NotImplementedError("%s training file not found!" % train_file)
        if not test_file.exists():
            raise NotImplementedError("%s test file not found!" % test_file)
        # This check used to test ``test_file`` a second time, so a missing
        # validation file was never detected here.
        if not valid_file.exists():
            raise NotImplementedError("%s validation file not found!" % valid_file)

        self.data_paths = {
            'train': train_file,
            'test': test_file,
            'valid': valid_file
        }

        self.cache_triplet_paths = {
            'train': self.root_path / 'triplets_train.pkl',
            'test': self.root_path / 'triplets_test.pkl',
            'valid': self.root_path / 'triplets_valid.pkl'
        }

        self.cache_metadata_path = self.root_path / 'metadata.pkl'
        self.cache_hr_t_path = self.root_path / 'hr_t.pkl'
        self.cache_tr_h_path = self.root_path / 'tr_h.pkl'
        self.cache_hr_t_train_path = self.root_path / 'hr_t_train.pkl'
        self.cache_tr_h_train_path = self.root_path / 'tr_h_train.pkl'
        self.cache_idx2entity_path = self.root_path / 'idx2entity.pkl'
        self.cache_idx2relation_path = self.root_path / 'idx2relation.pkl'
        self.cache_entity2idx_path = self.root_path / 'entity2idx.pkl'
        self.cache_relation2idx_path = self.root_path / 'relation2idx.pkl'
        self.cache_relationproperty_path = self.root_path / 'relationproperty.pkl'

    def is_meta_cache_exists(self):
        """Return whether the metadata cache file exists."""
        return self.cache_metadata_path.exists()

    def read_metadata(self):
        """Load and return the pickled metadata of the dataset."""
        with open(str(self.cache_metadata_path), 'rb') as f:
            meta = pickle.load(f)
            return meta

    def dump(self):
        """Log every attribute of this dataset object."""
        for key, value in self.__dict__.items():
            self._logger.info("%s %s" % (key, value))
class KnowledgeGraph(object):
    """Main module that handles a knowledge graph dataset.

    KnowledgeGraph selects the dataset object for the given name, and, when
    no metadata cache exists yet, parses the train/test/valid files, builds
    the index mappings and lookup tables and caches the result on disk.

    Args:
        dataset (str): Name of the dataset. Names that match no known
            dataset are treated as user-defined datasets.
        custom_dataset_path (str): Folder of a user-defined dataset.

    Attributes:
        dataset_name (str): The name of the dataset.
        custom_dataset_path (str): The path given at construction.
        dataset (object): The dataset object instance.
        triplets (dict): Lists of training, testing and validation triples.
        triple_store (dict): Alias of ``triplets``.
        relations (list): Sorted collection of all relations.
        entities (list): Sorted collection of all entities.
        entity2idx (dict): Maps an entity to its numerical id.
        idx2entity (dict): Maps a numerical id to its entity.
        relation2idx (dict): Maps a relation to its numerical id.
        idx2relation (dict): Maps a numerical id to its relation.
        hr_t (dict): Maps ``(head, relation)`` to the set of tails, over
            all splits.
        tr_h (dict): Maps ``(tail, relation)`` to the set of heads, over
            all splits.
        hr_t_train (dict): Like ``hr_t`` for the training split only.
        tr_h_train (dict): Like ``tr_h`` for the training split only.
        hr_t_valid (dict): Like ``hr_t`` for the validation split only.
        tr_h_valid (dict): Like ``tr_h`` for the validation split only.
        relation_property: Empty list until ``read_relation_property``
            replaces it with a dict holding one ratio per relation.
        kg_meta (object): Statistics metadata of the dataset.

    Examples:
        >>> knowledge_graph = KnowledgeGraph(dataset='Freebase15k')
        >>> knowledge_graph.prepare_data()
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, dataset='Freebase15k', custom_dataset_path=None):
        self.dataset_name = dataset
        # Kept so that force_prepare_data() can rebuild a user-defined
        # dataset with the same path.
        self.custom_dataset_path = custom_dataset_path

        key = dataset.lower()
        if key in ('freebase15k', 'fb15k'):
            self.dataset = FreebaseFB15k()
        elif key in ('deeplearning50a', 'dl50a'):
            self.dataset = DeepLearning50a()
        elif key in ('wordnet18', 'wn18'):
            self.dataset = WordNet18()
        elif key in ('wordnet18_rr', 'wn18_rr'):
            self.dataset = WordNet18_RR()
        elif key in ('yago3_10', 'yago'):
            self.dataset = YAGO3_10()
        elif key in ('freebase15k_237', 'fb15k_237'):
            self.dataset = FreebaseFB15k_237()
        elif key in ('kinship', 'ks'):
            self.dataset = Kinship()
        elif key == 'nations':
            self.dataset = Nations()
        elif key == 'umls':
            self.dataset = UMLS()
        elif key == 'nell_995':
            self.dataset = NELL_995()
        else:
            # if the dataset does not match with existing one, check if it exists in user's local space.
            # if it still can't find corresponding folder, raise exception in UserDefinedDataset.__init__()
            self.dataset = UserDefinedDataset(dataset, custom_dataset_path)

        # KG data structure stored in triplet format
        self.triplets = {'train': [], 'test': [], 'valid': []}
        self.triple_store = self.triplets  # TODO: should also have graph-based data structure for a KG.

        self.relations = []
        self.entities = []

        self.entity2idx = {}
        self.idx2entity = {}
        self.relation2idx = {}
        self.idx2relation = {}

        self.hr_t = defaultdict(set)
        self.tr_h = defaultdict(set)
        self.hr_t_train = defaultdict(set)
        self.tr_h_train = defaultdict(set)
        self.hr_t_valid = defaultdict(set)
        self.tr_h_valid = defaultdict(set)

        self.relation_property = []

        if self.dataset.is_meta_cache_exists():
            self.kg_meta = self.dataset.read_metadata()
        else:
            self.kg_meta = KGMetaData()
            self.prepare_data()

    def force_prepare_data(self):
        """Delete the dataset folder and rebuild this object from scratch.

        NOTE(review): the whole ``dataset.root_path`` is removed. For a
        user-defined dataset that folder also holds the source text files,
        so the rebuild will then fail - confirm whether only the cache
        files should be removed in that case.
        """
        shutil.rmtree(str(self.dataset.root_path), ignore_errors=True)

        time.sleep(1)

        self.__init__(dataset=self.dataset_name,
                      custom_dataset_path=self.custom_dataset_path)

    def prepare_data(self):
        """Parse the dataset, fill ``kg_meta`` and cache everything.

        Does nothing when the metadata cache already exists.
        """
        if self.dataset.is_meta_cache_exists():
            return

        self.read_entities()
        self.read_relations()
        self.read_mappings()
        self.read_triple_ids('train')
        self.read_triple_ids('test')
        self.read_triple_ids('valid')
        self.read_hr_t()
        self.read_tr_h()
        self.read_hr_t_train()
        self.read_tr_h_train()
        self.read_hr_t_valid()
        self.read_tr_h_valid()
        self.read_relation_property()

        self.kg_meta.tot_relation = len(self.relations)
        self.kg_meta.tot_entity = len(self.entities)
        self.kg_meta.tot_valid_triples = len(self.triplets['valid'])
        self.kg_meta.tot_test_triples = len(self.triplets['test'])
        self.kg_meta.tot_train_triples = len(self.triplets['train'])
        self.kg_meta.tot_triple = self.kg_meta.tot_valid_triples + \
            self.kg_meta.tot_test_triples + \
            self.kg_meta.tot_train_triples

        self.cache_data()

    def cache_data(self):
        """Pickle the prepared data structures to the dataset's cache files."""
        dataset = self.dataset
        payloads = [
            (dataset.cache_triplet_paths['train'], self.triplets['train']),
            (dataset.cache_triplet_paths['test'], self.triplets['test']),
            (dataset.cache_triplet_paths['valid'], self.triplets['valid']),
            (dataset.cache_hr_t_path, self.hr_t),
            (dataset.cache_tr_h_path, self.tr_h),
            (dataset.cache_hr_t_train_path, self.hr_t_train),
            (dataset.cache_tr_h_train_path, self.tr_h_train),
            (dataset.cache_idx2entity_path, self.idx2entity),
            (dataset.cache_idx2relation_path, self.idx2relation),
            (dataset.cache_relation2idx_path, self.relation2idx),
            (dataset.cache_entity2idx_path, self.entity2idx),
            (dataset.cache_relationproperty_path, self.relation_property),
            # The metadata file marks the cache as present (see
            # is_cache_exists), so it is written last: an interrupted run
            # then never leaves a cache that looks complete but is not.
            (dataset.cache_metadata_path, self.kg_meta),
        ]
        for path, payload in payloads:
            with open(str(path), 'wb') as f:
                pickle.dump(payload, f)

    def read_cache_data(self, key):
        """Load one cached data structure from disk.

        Args:
            key (str): One of ``triplets_train``, ``triplets_test``,
                ``triplets_valid``, ``hr_t``, ``tr_h``, ``hr_t_train``,
                ``tr_h_train``, ``idx2entity``, ``idx2relation``,
                ``entity2idx``, ``relation2idx`` or ``relationproperty``.

        Returns:
            The unpickled object, or None for an unknown ``key``.
        """
        dataset = self.dataset
        cache_paths = {
            'triplets_train': dataset.cache_triplet_paths['train'],
            'triplets_test': dataset.cache_triplet_paths['test'],
            'triplets_valid': dataset.cache_triplet_paths['valid'],
            'hr_t': dataset.cache_hr_t_path,
            'tr_h': dataset.cache_tr_h_path,
            'hr_t_train': dataset.cache_hr_t_train_path,
            'tr_h_train': dataset.cache_tr_h_train_path,
            'idx2entity': dataset.cache_idx2entity_path,
            'idx2relation': dataset.cache_idx2relation_path,
            'entity2idx': dataset.cache_entity2idx_path,
            'relation2idx': dataset.cache_relation2idx_path,
            'relationproperty': dataset.cache_relationproperty_path,
        }
        path = cache_paths.get(key)
        if path is None:
            return None

        # NOTE: pickle must only be used on cache files this library wrote.
        with open(str(path), 'rb') as f:
            return pickle.load(f)

    def is_cache_exists(self):
        """Return whether the dataset's metadata cache file exists."""
        return self.dataset.is_meta_cache_exists()

    def read_triplets(self, set_type):
        """Read the triplets (in string format) of one split from its text file.

        The file is only parsed while the in-memory list is still empty.
        Each non-blank line must hold three tab-separated fields.

        Args:
            set_type (str): ``'train'``, ``'test'`` or ``'valid'``.

        Returns:
            list: The list of ``Triple`` objects of that split.
        """
        triplets = self.triplets[set_type]

        if len(triplets) == 0:
            with open(str(self.dataset.data_paths[set_type]), 'r', encoding='utf-8') as file:
                for line in file:
                    # Tolerate blank lines, e.g. a trailing newline at EOF.
                    if not line.strip():
                        continue
                    s, p, o = line.split('\t')
                    triplets.append(Triple(s.strip(), p.strip(), o.strip()))

        return triplets

    def _read_all_triplets(self):
        """Return the triplets of all three splits as one list."""
        return self.read_triplets('train') + \
            self.read_triplets('valid') + \
            self.read_triplets('test')

    def read_entities(self):
        """Collect and return the sorted entities of all splits."""
        # len() instead of truthiness: ``entities`` becomes a numpy array.
        if len(self.entities) == 0:
            entities = set()
            for triplet in self._read_all_triplets():
                entities.add(triplet.h)
                entities.add(triplet.t)
            self.entities = np.sort(list(entities))

        return self.entities

    def read_relations(self):
        """Collect and return the sorted relations of all splits."""
        if len(self.relations) == 0:
            relations = {triplet.r for triplet in self._read_all_triplets()}
            self.relations = np.sort(list(relations))

        return self.relations

    def read_mappings(self):
        """Build the name-to-id and id-to-name mappings."""
        self.entity2idx = {v: k for k, v in enumerate(self.read_entities())}
        self.idx2entity = {v: k for k, v in self.entity2idx.items()}
        self.relation2idx = {v: k for k, v in enumerate(self.read_relations())}
        self.idx2relation = {v: k for k, v in self.relation2idx.items()}

    def read_triple_ids(self, set_type):
        """Assign the numerical ids to every triple of one split.

        Requires ``entity2idx`` and ``relation2idx`` to be filled.

        Args:
            set_type (str): Type of data, either train, test or valid.

        Returns:
            list: The triples of that split.
        """
        triplets = self.triplets[set_type]

        entity2idx = self.entity2idx
        relation2idx = self.relation2idx

        for t in triplets:
            t.set_ids(entity2idx[t.h], relation2idx[t.r], entity2idx[t.t])

        return triplets

    def read_hr_t(self):
        """Fill and return ``hr_t`` from the triples of all splits."""
        for triplets in self.triplets.values():
            for t in triplets:
                self.hr_t[(t.h, t.r)].add(t.t)

        return self.hr_t

    def read_tr_h(self):
        """Fill and return ``tr_h`` from the triples of all splits."""
        for triplets in self.triplets.values():
            for t in triplets:
                self.tr_h[(t.t, t.r)].add(t.h)

        return self.tr_h

    def read_hr_t_train(self):
        """Fill and return ``hr_t_train`` from the training triples."""
        for t in self.triplets['train']:
            self.hr_t_train[(t.h, t.r)].add(t.t)

        return self.hr_t_train

    def read_tr_h_train(self):
        """Fill and return ``tr_h_train`` from the training triples."""
        for t in self.triplets['train']:
            self.tr_h_train[(t.t, t.r)].add(t.h)

        return self.tr_h_train

    def read_hr_t_valid(self):
        """Fill and return ``hr_t_valid`` from the validation triples."""
        for t in self.triplets['valid']:
            self.hr_t_valid[(t.h, t.r)].add(t.t)

        return self.hr_t_valid

    def read_tr_h_valid(self):
        """Fill and return ``tr_h_valid`` from the validation triples."""
        for t in self.triplets['valid']:
            self.tr_h_valid[(t.t, t.r)].add(t.h)

        return self.tr_h_valid

    def read_relation_property(self):
        """Compute one ratio per relation from the training triples.

        For every relation index the value is
        ``#distinct tails / (#distinct heads + #distinct tails)``, or 0
        when the relation has no training triple.

        Returns:
            dict: Maps a relation index to its ratio.
        """
        relation_property_head = {x: [] for x in range(len(self.relations))}
        relation_property_tail = {x: [] for x in range(len(self.relations))}

        # Indexing by ``t.r`` presumably relies on read_triple_ids() having
        # replaced the relation by its integer id - TODO confirm in Triple.
        for t in self.triplets['train']:
            relation_property_head[t.r].append(t.h)
            relation_property_tail[t.r].append(t.t)

        self.relation_property = {}
        for x in relation_property_head.keys():
            value_up = len(set(relation_property_tail[x]))
            value_bot = len(set(relation_property_head[x])) + value_up

            if value_bot == 0:
                value = 0
            else:
                value = value_up / value_bot

            self.relation_property[x] = value

        return self.relation_property

    # reserved for debugging
    def dump(self):
        """Log the key statistics of the dataset."""
        dump = []
        dump.append("")
        dump.append("----------Metadata Info for Dataset:%s----------------" % self.dataset_name)
        dump.append("Total Training Triples   :%s" % self.kg_meta.tot_train_triples)
        dump.append("Total Testing Triples    :%s" % self.kg_meta.tot_test_triples)
        dump.append("Total validation Triples :%s" % self.kg_meta.tot_valid_triples)
        dump.append("Total Entities           :%s" % self.kg_meta.tot_entity)
        dump.append("Total Relations          :%s" % self.kg_meta.tot_relation)
        dump.append("---------------------------------------------")
        dump.append("")
        self._logger.info(("\n".join(dump)))
class HyperparamterLoader:
    """Load per-dataset hyperparameters and per-algorithm tuning search spaces.

    Configuration comes from YAML files: the ``hyperparams`` directory next to
    this module is always read, and an optional user directory
    (``args.hp_abs_dir``) is read afterwards so its entries override the
    defaults. Each YAML file is expected to provide the keys ``dataset``,
    ``parameters`` and ``search_space``; the file's base name (lower-cased) is
    used as the algorithm name.
    """

    _logger = Logger().get_logger(__name__)

    # Supported search-space dimensions and how each one is sampled. The order
    # is the insertion order of the resulting search-space dict.
    _TUNING_SPACE_KINDS = (
        ("learning_rate", "loguniform"),
        ("hidden_size", "qloguniform_int"),
        ("ent_hidden_size", "qloguniform_int"),
        ("rel_hidden_size", "qloguniform_int"),
        ("batch_size", "qloguniform_int"),
        ("margin", "uniform"),
        ("lmbda", "loguniform"),
        ("distance_measure", "choice"),
        ("cmax", "loguniform"),
        ("cmin", "loguniform"),
        ("optimizer", "choice"),
        ("bilinear", "choice"),
        ("epochs", "choice"),
    )

    def __init__(self, args):
        """Read the default configuration and, if given, ``args.hp_abs_dir``.

        Args:
            args (object): Parsed arguments; the optional attribute
                ``hp_abs_dir`` names an extra configuration directory.
        """
        config_abs_dir = getattr(args, "hp_abs_dir", None)
        self.hyperparams, self.search_space = self._load_parameter_config(
            config_abs_dir)

    def load_hyperparameter(self, dataset_name, algorithm):
        """Return the configured parameters for a (dataset, algorithm) pair.

        Both names are matched case-insensitively (they are lower-cased before
        the lookup).

        Raises:
            Exception: If the pair has not been configured.
        """
        d_name = dataset_name.lower()
        a_name = algorithm.lower()

        try:
            return self.hyperparams[d_name][a_name]
        except KeyError:
            raise Exception(
                "This experimental setting for (%s, %s) has not been configured"
                % (dataset_name, algorithm)) from None

    def load_search_space(self, algorithm):
        """Return the tuning search space configured for ``algorithm``.

        The name is matched case-insensitively: search spaces are stored under
        the lower-cased YAML file name, so the lookup key is lower-cased too
        (consistent with ``load_hyperparameter``).

        Raises:
            ValueError: If no search space is configured for ``algorithm``.
        """
        a_name = algorithm.lower()
        if a_name in self.search_space:
            return self.search_space[a_name]
        raise ValueError(
            "Hyperparameter search space is not configured for %s" % algorithm)

    @staticmethod
    def _load_parameter_config(config_abs_dir):
        """Load the bundled defaults, then overlay ``config_abs_dir`` if not None.

        Returns:
            tuple: ``(hyperparams, search_space)`` dictionaries.
        """
        default_config_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "hyperparams")
        hyperparams, search_space = HyperparamterLoader._load_yaml_config(
            default_config_dir, {}, {})
        if config_abs_dir is not None:
            hyperparams, search_space = HyperparamterLoader._load_yaml_config(
                config_abs_dir, hyperparams, search_space)
        return hyperparams, search_space

    @staticmethod
    def _load_yaml_config(config_dir, hyperparams, search_space):
        """Merge every YAML file found in ``config_dir`` into the two mappings.

        Args:
            config_dir (str): Directory to scan (non-recursive).
            hyperparams (dict): ``{dataset: {algorithm: parameters}}`` so far.
            search_space (dict): ``{algorithm: search space}`` so far.

        Returns:
            tuple: The updated ``(hyperparams, search_space)``.

        Raises:
            yaml.YAMLError: Re-raised after logging when a file cannot be parsed.
        """
        for config_file in os.listdir(config_dir):
            if not config_file.endswith(("yaml", "yml")):
                HyperparamterLoader._logger.warning(
                    "Skipped non YAML file: %s" % config_file)
                continue

            config_path = os.path.abspath(os.path.join(config_dir, config_file))
            with open(config_path, "r") as file:
                try:
                    config = yaml.safe_load(file)
                except yaml.YAMLError:
                    HyperparamterLoader._logger.error(
                        "Cannot load configuration: %s" % config_file)
                    raise

            algorithm = os.path.splitext(config_file)[0].lower()
            dataset = config["dataset"]
            if dataset in hyperparams:
                hyperparams[dataset][algorithm] = config["parameters"]
            else:
                hyperparams = {
                    **hyperparams,
                    dataset: {algorithm: config["parameters"]},
                }
            search_space = {
                **search_space,
                algorithm: HyperparamterLoader._config_tuning_space(
                    config["search_space"]),
            }

        return hyperparams, search_space

    @staticmethod
    def _config_tuning_space(tuning_space_raw):
        """Translate a raw YAML search space into hyperopt expressions.

        Args:
            tuning_space_raw (dict or None): Mapping from dimension name to
                either a ``{"min": ..., "max": ...}`` range or a list of
                choices, depending on the dimension.

        Returns:
            dict or None: ``None`` when the input is ``None``; otherwise a dict
            holding one hyperopt expression per supported dimension present in
            the input. Unsupported keys are ignored.
        """
        if tuning_space_raw is None:
            return None

        hyper_obj = {}
        for name, kind in HyperparamterLoader._TUNING_SPACE_KINDS:
            if name in tuning_space_raw:
                hyper_obj[name] = HyperparamterLoader._build_dimension(
                    name, kind, tuning_space_raw[name])
        return hyper_obj

    @staticmethod
    def _build_dimension(name, kind, spec):
        """Build the hyperopt expression for one search-space dimension."""
        if kind == "choice":
            return hp.choice(name, spec)

        low, high = spec["min"], spec["max"]
        if kind == "uniform":
            return hp.uniform(name, low, high)
        if kind == "loguniform":
            return hp.loguniform(name, np.log(low), np.log(high))
        # "qloguniform_int": log-uniform, quantised to step 1 and cast to int.
        return scope.int(hp.qloguniform(name, np.log(low), np.log(high), 1))
class BaysOptimizer:
    """Bayesian optimizer class for tuning hyperparameter.

      This class implements the Bayesian Optimizer for tuning the
      hyper-parameter.

      Args:
        args (object): The Argument Parser object providing arguments. The
            attributes read here are ``model_name``, ``dataset_name``,
            ``dataset_path``, ``max_number_trials``, ``debug`` and ``device``.

      Examples:
        >>> from pykg2vec.common import KGEArgParser
        >>> from pykg2vec.utils.bayesian_optimizer import BaysOptimizer
        >>> args = KGEArgParser().get_args(sys.argv[1:])
        >>> bays_opt = BaysOptimizer(args=args)
        >>> bays_opt.optimize()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, args):
        """Store the dataset/model configuration and load the search space.

        Raises:
            Exception: If ``args.model_name`` is one of the models for which
                tuning is not supported.
        """
        if args.model_name.lower() in [
                "conve", "convkb", "proje_pointwise", "interacte", "hyper",
                "acre"
        ]:
            # The name lives in ``args.model_name``; formatting ``args.model``
            # raised AttributeError and hid this message.
            raise Exception(
                "Model %s has not been supported in tuning hyperparameters!" %
                args.model_name)

        self.model_name = args.model_name
        self.knowledge_graph = KnowledgeGraph(
            dataset=args.dataset_name, custom_dataset_path=args.dataset_path)
        self.kge_args = args
        # In debug mode only a handful of trials are run.
        self.max_evals = args.max_number_trials if not args.debug else 3

        self.config_obj, self.model_obj = Importer().import_model_config(
            self.model_name.lower())
        self.config_local = self.config_obj(self.kge_args)
        self.search_space = HyperparameterLoader(args).load_search_space(
            self.model_name.lower())
        self._best_result = None
        self.trainer = None

    def optimize(self):
        """Run the optimization and write every trial to ``trials.csv``.

        The per-trial table (iteration, sampled hyperparameters, loss) is saved
        under ``<config.path_result>/<model_name>/trials.csv`` and logged
        together with the best setting found.
        """
        trials = Trials()

        self._best_result = fmin(fn=self._get_loss,
                                 space=self.search_space,
                                 trials=trials,
                                 algo=tpe.suggest,
                                 max_evals=self.max_evals)

        columns = list(self.search_space.keys())
        results = pd.DataFrame(columns=['iteration'] + columns + ['loss'])

        for idx, trial in enumerate(trials.trials):
            # Trials store raw sampled values (e.g. choice indices);
            # space_eval maps them back to actual hyperparameter values.
            sampled = {k: v[0] for k, v in trial['misc']['vals'].items()}
            translated_eval = space_eval(self.search_space, sampled)
            row = [idx]
            row.extend(translated_eval[k] for k in columns)
            row.append(trial['result']['loss'])
            results.loc[idx] = row

        path = self.config_local.path_result / self.model_name
        path.mkdir(parents=True, exist_ok=True)
        results.to_csv(str(path / "trials.csv"), index=False)

        self._logger.info(results)
        self._logger.info('Found golden setting:')
        self._logger.info(space_eval(self.search_space, self._best_result))

    def return_best(self):
        """Return the best hyper-parameters found by ``optimize()``."""
        assert self._best_result is not None, 'Cannot find golden setting. Has optimize() been called?'
        return space_eval(self.search_space, self._best_result)

    def _get_loss(self, params):
        """Objective for ``fmin``: train once with ``params`` and report the loss.

        Args:
            params (dict): Hyperparameter values sampled for this trial.

        Returns:
            dict: ``{'loss': <loss from Trainer.tune_model()>, 'status': STATUS_OK}``.
        """
        # copy the hyperparameters to trainer config and hyperparameter set.
        for key, value in params.items():
            self.config_local.__dict__[key] = value
        self.config_local.__dict__['device'] = self.kge_args.device
        model = self.model_obj(**self.config_local.__dict__)

        self.trainer = Trainer(model, self.config_local)

        # configure common setting for a tuning training.
        self.config_local.disp_result = False
        self.config_local.disp_summary = False
        self.config_local.save_model = False

        # do not overwrite test numbers if set
        if self.config_local.test_num is None:
            self.config_local.test_num = 1000

        if self.kge_args.debug:
            self.config_local.epochs = 1

        # start the trial.
        self.trainer.build_model()
        loss = self.trainer.tune_model()

        return {'loss': loss, 'status': STATUS_OK}
class Trainer(TrainerMeta):
    """Class for handling the training of the algorithms.

        Args:
            model (object): Model object. Its ``config`` attribute supplies
                every training setting used by this class.

        Examples:
            >>> from pykg2vec.utils.trainer import Trainer
            >>> from pykg2vec.core.TransE import TransE
            >>> trainer = Trainer(TransE())
            >>> trainer.build_model()
            >>> trainer.train_model()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model):
        self.model = model
        self.config = model.config

        # One [epoch index, accumulated loss] entry per trained epoch.
        self.training_results = []

        self.evaluator = None
        self.generator = None

    def build_model(self):
        """Create the optimizer, define model parameters and the early stopper.

        Raises:
            NotImplementedError: If ``config.optimizer`` is not one of
                'sgd', 'rms', 'adam', 'adagrad' or 'adadelta'.
        """
        if self.config.optimizer == 'sgd':
            self.optimizer = tf.keras.optimizers.SGD(learning_rate=self.config.learning_rate)
        elif self.config.optimizer == 'rms':
            self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=self.config.learning_rate)
        elif self.config.optimizer == 'adam':
            self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.config.learning_rate)
        elif self.config.optimizer == 'adagrad':
            self.optimizer = tf.keras.optimizers.Adagrad(learning_rate=self.config.learning_rate,
                                                         initial_accumulator_value=0.0,
                                                         epsilon=1e-08)
        elif self.config.optimizer == 'adadelta':
            self.optimizer = tf.keras.optimizers.Adadelta(learning_rate=self.config.learning_rate)
        else:
            raise NotImplementedError("No support for %s optimizer" % self.config.optimizer)

        # For optimizer that has not supported gpu computation in TF2, place parameters in cpu.
        if self.config.optimizer in ['rms', 'adagrad', 'adadelta']:
            with tf.device('cpu:0'):
                self.model.def_parameters()
        else:
            self.model.def_parameters()

        self.config.summary()
        self.config.summary_hyperparameter(self.model.model_name)

        self.early_stopper = EarlyStopper(self.config.patience, Monitor.FILTERED_MEAN_RANK)

    # Training related functions:

    @tf.function
    def train_step_pairwise(self, pos_h, pos_r, pos_t, neg_h, neg_r, neg_t):
        """Run one optimization step on a batch of positive/negative triples.

        Returns:
            The batch loss (adversarial negative-sampling loss when
            ``config.sampling == 'adversarial_negative_sampling'``, otherwise
            the summed margin-based hinge loss), plus ``model.get_reg()`` when
            the model defines it.
        """
        with tf.GradientTape() as tape:
            pos_preds = self.model.forward(pos_h, pos_r, pos_t)
            neg_preds = self.model.forward(neg_h, neg_r, neg_t)

            if self.config.sampling == 'adversarial_negative_sampling':
                # RotatE: Adversarial Negative Sampling and alpha is the temperature.
                pos_preds = -pos_preds
                neg_preds = -neg_preds
                pos_preds = tf.math.log_sigmoid(pos_preds)
                neg_preds = tf.reshape(neg_preds, [-1, self.config.neg_rate])
                # The softmax weights are treated as constants (no gradient).
                softmax = tf.stop_gradient(tf.nn.softmax(neg_preds*self.config.alpha, axis=1))
                neg_preds = tf.reduce_sum(softmax * (tf.math.log_sigmoid(-neg_preds)), axis=-1)
                loss = -tf.reduce_mean(neg_preds) - tf.reduce_mean(pos_preds)
            else:
                # others that use margin-based & pairwise loss function. (unif or bern)
                loss = tf.reduce_sum(tf.maximum(pos_preds + self.config.margin - neg_preds, 0))

            if hasattr(self.model, 'get_reg'):
                # now only NTN uses regularizer,
                # other pairwise based KGE methods use normalization to regularize parameters.
                loss += self.model.get_reg()

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return loss

    @tf.function
    def train_step_projection(self, h, r, t, hr_t, tr_h):
        """Run one optimization step for projection-based models.

        ``hr_t`` and ``tr_h`` arrive as sparse tensors and are densified to
        float32 before use. For ConvE/TuckER the loss is binary cross-entropy
        against (optionally label-smoothed) targets; other models compute
        their own loss inside ``model.forward``.

        Returns:
            The sum of the tail and head losses, plus ``model.get_reg()`` when
            the model defines it.
        """
        with tf.GradientTape() as tape:
            hr_t = tf.cast(tf.sparse.to_dense(tf.sparse.reorder(hr_t)), dtype=tf.float32)
            tr_h = tf.cast(tf.sparse.to_dense(tf.sparse.reorder(tr_h)), dtype=tf.float32)

            if self.model.model_name.lower() == "conve" or self.model.model_name.lower() == "tucker":
                if hasattr(self.config, 'label_smoothing'):
                    hr_t = hr_t * (1.0 - self.config.label_smoothing) + 1.0 / self.config.kg_meta.tot_entity
                    tr_h = tr_h * (1.0 - self.config.label_smoothing) + 1.0 / self.config.kg_meta.tot_entity

                pred_tails = self.model.forward(h, r, direction="tail")  # (h, r) -> hr_t forward
                pred_heads = self.model.forward(t, r, direction="head")  # (t, r) -> tr_h backward

                loss_tails = tf.reduce_mean(tf.keras.backend.binary_crossentropy(hr_t, pred_tails))
                loss_heads = tf.reduce_mean(tf.keras.backend.binary_crossentropy(tr_h, pred_heads))

                loss = loss_tails + loss_heads
            else:
                loss_tails = self.model.forward(h, r, hr_t, direction="tail")  # (h, r) -> hr_t forward
                loss_heads = self.model.forward(t, r, tr_h, direction="head")  # (t, r) -> tr_h backward

                loss = loss_tails + loss_heads

            if hasattr(self.model, 'get_reg'):
                # now only complex distmult uses regularizer in algorithms,
                loss += self.model.get_reg()

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return loss

    @tf.function
    def train_step_pointwise(self, h, r, t, y):
        """Run one optimization step for pointwise models.

        Returns:
            ``mean(softplus(y * preds))``, plus ``model.get_reg(h, r, t)`` when
            the model defines it.
        """
        with tf.GradientTape() as tape:
            preds = self.model.forward(h, r, t)
            loss = tf.reduce_mean(tf.nn.softplus(y*preds))

            if hasattr(self.model, 'get_reg'):  # for complex & complex-N3 & DistMult & CP & ANALOGY
                loss += self.model.get_reg(h, r, t)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return loss

    def train_model(self, monitor=Monitor.FILTERED_MEAN_RANK):
        """Train the model, evaluate it and persist the results.

        Args:
            monitor (Monitor): Currently unused; the early stopper created in
                ``build_model`` decides which metric is monitored.

        Returns:
            int: Index of the last epoch that was run.
        """
        self.generator = Generator(self.model)
        self.evaluator = Evaluator(self.model)

        if self.config.loadFromData:
            self.load_model()

        for cur_epoch_idx in range(self.config.epochs):
            self._logger.info("Epoch[%d/%d]" % (cur_epoch_idx, self.config.epochs))
            self.train_model_epoch(cur_epoch_idx)

            if cur_epoch_idx % self.config.test_step == 0:
                metrics = self.evaluator.mini_test(cur_epoch_idx)

                if self.early_stopper.should_stop(metrics):
                    ### Early Stop Mechanism
                    ### start to check if the metric is still improving after each mini-test.
                    ### Example, if test_step == 5, the trainer will check metrics every 5 epoch.
                    break

        self.evaluator.full_test(cur_epoch_idx)
        self.evaluator.metric_calculator.save_test_summary(self.model.model_name)

        self.generator.stop()
        self.save_training_result()

        if self.config.save_model:
            self.save_model()

        if self.config.disp_result:
            self.display()

        if self.config.disp_summary:
            self.config.summary()
            self.config.summary_hyperparameter(self.model.model_name)

        self.export_embeddings()

        return cur_epoch_idx  # the runned epoches.

    def tune_model(self):
        """Train for ``config.epochs`` epochs in tuning mode.

        Returns:
            The accumulated loss of the last epoch (``inf`` initially).
        """
        current_loss = float("inf")

        self.generator = Generator(self.model)
        self.evaluator = Evaluator(self.model, tuning=True)

        for cur_epoch_idx in range(self.config.epochs):
            current_loss = self.train_model_epoch(cur_epoch_idx, tuning=True)

        self.evaluator.full_test(cur_epoch_idx)

        self.generator.stop()

        return current_loss

    def train_model_epoch(self, epoch_idx, tuning=False):
        """Train the model for one epoch.

        Args:
            epoch_idx (int): Index recorded next to the loss in
                ``self.training_results``.
            tuning (bool): When True the progress bar is not updated.

        Returns:
            The loss accumulated over all batches, as a NumPy value.
        """
        acc_loss = 0

        # In debug mode only 10 batches are run per epoch.
        num_batch = self.model.config.kg_meta.tot_train_triples // self.config.batch_size if not self.config.debug else 10

        metrics_names = ['acc_loss', 'loss']
        progress_bar = tf.keras.utils.Progbar(num_batch, stateful_metrics=metrics_names)

        self.generator.start_one_epoch(num_batch)

        for _ in range(num_batch):
            data = list(next(self.generator))

            if self.model.training_strategy == TrainingStrategy.PROJECTION_BASED:
                h = tf.convert_to_tensor(data[0], dtype=tf.int32)
                r = tf.convert_to_tensor(data[1], dtype=tf.int32)
                t = tf.convert_to_tensor(data[2], dtype=tf.int32)
                hr_t = data[3]
                rt_h = data[4]
                loss = self.train_step_projection(h, r, t, hr_t, rt_h)
            elif self.model.training_strategy == TrainingStrategy.POINTWISE_BASED:
                h = tf.convert_to_tensor(data[0], dtype=tf.int32)
                r = tf.convert_to_tensor(data[1], dtype=tf.int32)
                t = tf.convert_to_tensor(data[2], dtype=tf.int32)
                y = tf.convert_to_tensor(data[3], dtype=tf.float32)
                loss = self.train_step_pointwise(h, r, t, y)
            else:
                ph = tf.convert_to_tensor(data[0], dtype=tf.int32)
                pr = tf.convert_to_tensor(data[1], dtype=tf.int32)
                pt = tf.convert_to_tensor(data[2], dtype=tf.int32)
                nh = tf.convert_to_tensor(data[3], dtype=tf.int32)
                nr = tf.convert_to_tensor(data[4], dtype=tf.int32)
                nt = tf.convert_to_tensor(data[5], dtype=tf.int32)
                loss = self.train_step_pairwise(ph, pr, pt, nh, nr, nt)

            acc_loss += loss

            if not tuning:
                progress_bar.add(1, values=[('acc_loss', acc_loss), ('loss', loss)])

        self.training_results.append([epoch_idx, acc_loss.numpy()])

        return acc_loss.numpy()

    def enter_interactive_mode(self):
        """Build and load the model, then log three example inferences."""
        self.build_model()
        self.load_model()

        self.evaluator = Evaluator(self.model)
        self._logger.info("""The training/loading of the model has finished! Now enter interactive mode :) ----- Example 1: trainer.infer_tails(1,10,topk=5)""")
        self.infer_tails(1,10,topk=5)

        self._logger.info("""----- Example 2: trainer.infer_heads(10,20,topk=5)""")
        self.infer_heads(10,20,topk=5)

        self._logger.info("""----- Example 3: trainer.infer_rels(1,20,topk=5)""")
        self.infer_rels(1,20,topk=5)

    def exit_interactive_mode(self):
        """Log the goodbye message of the interactive script."""
        self._logger.info("Thank you for trying out inference interactive script :)")

    def infer_tails(self,h,r,topk=5):
        """Log and return the top-k predicted tails for ``(h, r)``.

        Returns:
            dict: Predicted tail id -> entity label.
        """
        tails = self.evaluator.test_tail_rank(h,r,topk).numpy()
        logs = []
        logs.append("")
        logs.append("(head, relation)->({},{}) :: Inferred tails->({})".format(h,r,",".join([str(i) for i in tails])))
        logs.append("")
        idx2ent = self.model.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.model.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("head: %s" % idx2ent[h])
        logs.append("relation: %s" % idx2rel[r])

        for idx, tail in enumerate(tails):
            logs.append("%dth predicted tail: %s" % (idx, idx2ent[tail]))

        self._logger.info("\n".join(logs))
        return {tail: idx2ent[tail] for tail in tails}

    def infer_heads(self,r,t,topk=5):
        """Log and return the top-k predicted heads for ``(r, t)``.

        Returns:
            dict: Predicted head id -> entity label.
        """
        heads = self.evaluator.test_head_rank(r,t,topk).numpy()
        logs = []
        logs.append("")
        # Arguments follow the label order (relation, tail); they used to be swapped.
        logs.append("(relation,tail)->({},{}) :: Inferred heads->({})".format(r,t,",".join([str(i) for i in heads])))
        logs.append("")
        idx2ent = self.model.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.model.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("tail: %s" % idx2ent[t])
        logs.append("relation: %s" % idx2rel[r])

        for idx, head in enumerate(heads):
            logs.append("%dth predicted head: %s" % (idx, idx2ent[head]))

        self._logger.info("\n".join(logs))
        return {head: idx2ent[head] for head in heads}

    def infer_rels(self, h, t, topk=5):
        """Log and return the top-k predicted relations for ``(h, t)``.

        Returns:
            dict or None: Predicted relation id -> relation label, or ``None``
            for models that do not support relation inference.
        """
        if self.model.model_name.lower() in ["proje_pointwise", "conve", "tucker"]:
            # The model name was missing from this message, so a literal "%s" was logged.
            self._logger.info("%s model doesn't support relation inference in nature." % self.model.model_name)
            return

        rels = self.evaluator.test_rel_rank(h,t,topk).numpy()
        logs = []
        logs.append("")
        logs.append("(head,tail)->({},{}) :: Inferred rels->({})".format(h, t, ",".join([str(i) for i in rels])))
        logs.append("")
        idx2ent = self.model.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.model.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("head: %s" % idx2ent[h])
        logs.append("tail: %s" % idx2ent[t])

        for idx, rel in enumerate(rels):
            logs.append("%dth predicted rel: %s" % (idx, idx2rel[rel]))

        self._logger.info("\n".join(logs))
        return {rel: idx2rel[rel] for rel in rels}

    # Procedural functions:

    def save_model(self):
        """Save the model weights under ``<config.path_tmp>/<model_name>/model.vec``."""
        saved_path = self.config.path_tmp / self.model.model_name
        saved_path.mkdir(parents=True, exist_ok=True)
        self.model.save_weights(str(saved_path / 'model.vec'))

    def load_model(self):
        """Load the model weights if the model's save directory exists."""
        saved_path = self.config.path_tmp / self.model.model_name
        if saved_path.exists():
            self.model.load_weights(str(saved_path / 'model.vec'))

    def display(self):
        """Plot embeddings and training/testing results as enabled in the config."""
        options = {"ent_only_plot": True,
                   "rel_only_plot": not self.config.plot_entity_only,
                   "ent_and_rel_plot": not self.config.plot_entity_only}

        if self.config.plot_embedding:
            viz = Visualization(model=self.model, vis_opts = options)
            viz.plot_embedding(resultpath=self.config.figures, algos=self.model.model_name, show_label=False)

        if self.config.plot_training_result:
            viz = Visualization(model=self.model)
            viz.plot_train_result()

        if self.config.plot_testing_result:
            viz = Visualization(model=self.model)
            viz.plot_test_result()

    def export_embeddings(self):
        """
            Export embeddings in tsv and pandas pickled format.
            With tsvs (both label, vector files), you can:
            1) Use those pretained embeddings for your applications.
            2) Visualize the embeddings in this website to gain insights. (https://projector.tensorflow.org/)

            Pandas dataframes can be read with pd.read_pickle('desired_file.pickle')
        """
        save_path = self.config.path_embeddings / self.model.model_name
        save_path.mkdir(parents=True, exist_ok=True)

        idx2ent = self.model.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.model.config.knowledge_graph.read_cache_data('idx2relation')

        series_ent = pd.Series(idx2ent)
        series_rel = pd.Series(idx2rel)
        series_ent.to_pickle(save_path / "ent_labels.pickle")
        series_rel.to_pickle(save_path / "rel_labels.pickle")

        with open(str(save_path / "ent_labels.tsv"), 'w') as l_export_file:
            for label in idx2ent.values():
                l_export_file.write(label + "\n")

        with open(str(save_path / "rel_labels.tsv"), 'w') as l_export_file:
            for label in idx2rel.values():
                l_export_file.write(label + "\n")

        for parameter in self.model.parameter_list:
            all_ids = list(range(0, int(parameter.shape[0])))
            stored_name = parameter.name.split(':')[0]

            # Only 2-D parameters are exported, one tab-separated row per line.
            if len(parameter.shape) == 2:
                all_embs = parameter.numpy()
                with open(str(save_path / ("%s.tsv" % stored_name)), 'w') as v_export_file:
                    for idx in all_ids:
                        v_export_file.write("\t".join([str(x) for x in all_embs[idx]]) + "\n")

                df = pd.DataFrame(all_embs)
                df.to_pickle(save_path / ("%s.pickle" % stored_name))

    def save_training_result(self):
        """Write the per-epoch losses to a new numbered CSV in ``path_result``.

        The file index is the number of existing files in the result directory
        whose name contains both the model name and 'Training'.
        """
        files = os.listdir(str(self.model.config.path_result))
        file_count = len([f for f in files if self.model.model_name in f if 'Training' in f])
        df = pd.DataFrame(self.training_results, columns=['Epochs', 'Loss'])
        with open(str(self.model.config.path_result / (self.model.model_name + '_Training_results_' + str(file_count) + '.csv')),
                  'w') as fh:
            df.to_csv(fh)
class Visualization:
    """Class to aid in visualizing the results and embddings.

        Args:
            model (object): Model object. When it is None no triples are
                sampled and only the result plots can be used.
            config (object): Configuration object; supplies the knowledge
                graph cache, device, result/figure paths, dataset name and hits.
            vis_opts (dict): Optional flags ``ent_only_plot``,
                ``rel_only_plot`` and ``ent_and_rel_plot``. All three default
                to False when omitted.

        Examples:
            >>> from pykg2vec.utils.visualization import Visualization
            >>> viz = Visualization(model=model, config=config)
            >>> viz.plot_train_result()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config, vis_opts=None):
        if vis_opts:
            self.ent_only_plot = vis_opts["ent_only_plot"]
            self.rel_only_plot = vis_opts["rel_only_plot"]
            self.ent_and_rel_plot = vis_opts["ent_and_rel_plot"]
        else:
            self.ent_only_plot = False
            self.rel_only_plot = False
            self.ent_and_rel_plot = False

        self.model = model
        self.config = config

        # Algorithm names whose result files are looked for when plotting.
        self.algo_list = [
            'ANALOGY', 'Complex', 'ComplexN3', 'ConvE', 'CP', 'DistMult',
            'DistMult2', 'HoLE', 'KG2E', 'NTN', 'ProjE_pointwise', 'Rescal',
            'RotatE', 'SimplE_avg', 'SimplE_ignr', 'SLM', 'SME_Bilinear',
            'SME_Linear', 'TransD', 'TransE', 'TransH', 'TransM', 'TransR',
            'TuckER'
        ]

        self.h_name = []
        self.r_name = []
        self.t_name = []

        self.h_emb = []
        self.r_emb = []
        self.t_emb = []

        self.h_proj_emb = []
        self.r_proj_emb = []
        self.t_proj_emb = []

        if self.model is not None:
            self.validation_triples_ids = self.config.knowledge_graph.read_cache_data(
                'triplets_valid')
            self.idx2entity = self.config.knowledge_graph.read_cache_data(
                'idx2entity')
            self.idx2relation = self.config.knowledge_graph.read_cache_data(
                'idx2relation')

            self.get_idx_n_emb()

    def get_idx_n_emb(self):
        """Sample validation triples and collect their names and embeddings.

        ``config.disp_triple_num`` indices are drawn with
        ``np.random.choice`` (i.e. with replacement).
        """
        idx = np.random.choice(len(self.validation_triples_ids),
                               self.config.disp_triple_num)
        triples = [self.validation_triples_ids[i] for i in idx]

        for t in triples:
            self.h_name.append(self.idx2entity[t.h])
            self.r_name.append(self.idx2relation[t.r])
            self.t_name.append(self.idx2entity[t.t])

            emb_h, emb_r, emb_t = self.model.embed(
                torch.LongTensor([t.h]).to(self.config.device),
                torch.LongTensor([t.r]).to(self.config.device),
                torch.LongTensor([t.t]).to(self.config.device))

            self.h_emb.append(emb_h)
            self.r_emb.append(emb_r)
            self.t_emb.append(emb_t)

            if self.ent_and_rel_plot:
                # Best effort: a failure here is logged and the triple is
                # simply left out of the combined plot.
                try:
                    emb_h, emb_r, emb_t = self.model.embed(
                        torch.LongTensor([t.h]).to(self.config.device),
                        torch.LongTensor([t.r]).to(self.config.device),
                        torch.LongTensor([t.t]).to(self.config.device))
                    self.h_proj_emb.append(emb_h)
                    self.r_proj_emb.append(emb_r)
                    self.t_proj_emb.append(emb_t)
                except Exception as e:
                    self._logger.exception(e)

    def plot_embedding(self,
                       resultpath=None,
                       algos=None,
                       show_label=False,
                       disp_num_r_n_e=20):
        """Function to plot the embedding.

            Args:
                resultpath (str): Path where the result will be saved.
                show_label (bool): If True, will display the labels.
                algos (str): Name of the algorithms that generated the embedding.
                disp_num_r_n_e (int): Total number of entities to display for head, tail and relation.
        """
        assert self.model is not None, 'Please provide a model!'

        if self.ent_only_plot:
            x = torch.cat(self.h_emb + self.t_emb, dim=0)
            ent_names = np.concatenate((self.h_name, self.t_name), axis=0)

            self._logger.info("\t Reducing dimension using TSNE to 2!")
            x = TSNE(n_components=2).fit_transform(x.detach().cpu())
            x = np.asarray(x)
            ent_names = np.asarray(ent_names)

            self.draw_embedding(x, ent_names, resultpath,
                                algos + '_entity_plot', show_label)

        if self.rel_only_plot:
            x = torch.cat(self.r_emb, dim=0)

            self._logger.info("\t Reducing dimension using TSNE to 2!")
            x = TSNE(n_components=2).fit_transform(x.detach().cpu())

            self.draw_embedding(x, self.r_name, resultpath,
                                algos + '_rel_plot', show_label)

        if self.ent_and_rel_plot:
            length = len(self.h_proj_emb)
            x = torch.cat(self.h_proj_emb + self.r_proj_emb + self.t_proj_emb,
                          dim=0)

            self._logger.info("\t Reducing dimension using TSNE to 2!")
            x = TSNE(n_components=2).fit_transform(x.detach().cpu())

            # Rows are ordered heads, relations, tails (the concatenation order).
            h_embs = x[:length, :]
            r_embs = x[length:2 * length, :]
            t_embs = x[2 * length:3 * length, :]

            self.draw_embedding_rel_space(
                h_embs[:disp_num_r_n_e], r_embs[:disp_num_r_n_e],
                t_embs[:disp_num_r_n_e], self.h_name[:disp_num_r_n_e],
                self.r_name[:disp_num_r_n_e], self.t_name[:disp_num_r_n_e],
                resultpath, algos + '_ent_n_rel_plot', show_label)

    def plot_train_result(self):
        """Plot the training loss curves of all algorithms with saved results.

        For each algorithm the most recent ``*_Training_results_<n>.csv`` in
        ``config.path_result`` is read. The figure is saved as a numbered PDF
        in ``config.path_figures``. Nothing is plotted (a warning is logged)
        when no result file is found.
        """
        algo = self.algo_list
        path = self.config.path_result
        result = self.config.path_figures
        data = [self.config.dataset_name]

        files = os.listdir(str(path))
        files_lwcase = [f.lower() for f in files]

        for d in data:
            frames = []
            for a in algo:
                file_no = len([
                    c for c in files_lwcase
                    if a.lower() in c if 'training' in c
                ])
                if file_no < 1:
                    continue

                file_path = str(path / (a.lower() + '_Training_results_' +
                                        str(file_no - 1) + '.csv'))
                if not os.path.exists(file_path):
                    continue

                with open(file_path, 'r') as fh:
                    df_2 = pd.read_csv(fh)

                frame = pd.DataFrame()
                frame['Epochs'] = df_2['Epochs']
                frame['Loss'] = df_2['Loss']
                frame['Algorithm'] = [a] * len(df_2)
                frames.append(frame)

            if not frames:
                # Without data seaborn cannot resolve the plotted columns.
                self._logger.warning(
                    "No training result found in %s, skip plotting." % path)
                continue
            df = pd.concat(frames)

            plt.figure()
            seaborn.lineplot(x="Epochs",
                             y="Loss",
                             hue="Algorithm",
                             markers=True,
                             dashes=False,
                             data=df)

            figure_files = [f.lower() for f in os.listdir(str(result))]
            file_no = len(
                [c for c in figure_files if d.lower() in c if 'training' in c])

            plt.savefig(str(
                result / (d + '_training_loss_plot_' + str(file_no) + '.pdf')),
                        bbox_inches='tight',
                        dpi=300)
            # plt.show()

    def plot_test_result(self):
        """Export and plot the final-epoch test metrics of all algorithms.

        For each algorithm the last matching ``*testing*`` file in
        ``config.path_result`` is read. The rows of the maximum epoch are
        written as a LaTeX table and a CSV, then plotted as two bar charts
        (rank metrics and hits metrics) in ``config.path_figures``. Nothing is
        produced (a warning is logged) when no result file is found.
        """
        algo = self.algo_list
        path = self.config.path_result
        result = self.config.path_figures
        data = [self.config.dataset_name]
        hits = self.config.hits
        assert path is not None and algo is not None and data is not None, 'Please provide valid path, algorithm and dataset!'

        files = os.listdir(str(path))

        for d in data:
            frames = []
            for a in algo:
                file_algo = [
                    c for c in files
                    if a.lower() in c.lower() if 'testing' in c.lower()
                ]
                if not file_algo:
                    continue

                with open(str(path / file_algo[-1]), 'r') as fh:
                    df_2 = pd.read_csv(fh)

                frames.append(self._test_result_frame(a, df_2, hits))

            if not frames:
                self._logger.warning(
                    "No testing result found in %s, skip plotting." % path)
                continue
            df = pd.concat(frames)

            figure_files = os.listdir(str(result))

            # Keep only the rows of the last evaluated epoch.
            df_4 = df.loc[df['Epochs'] == max(df['Epochs'])]
            df_4 = df_4.loc[:, df_4.columns != 'Epochs']

            file_no = len([
                c for c in figure_files
                if d.lower() in c.lower() if 'testing' in c.lower()
                if 'latex' in c.lower()
            ])
            with open(
                    str(result / (d + '_testing_latex_table_' +
                                  str(file_no + 1) + '.txt')), 'w') as fh:
                fh.write(df_4.to_latex(index=False))

            file_no = len([
                c for c in figure_files
                if d.lower() in c.lower() if 'testing' in c.lower()
                if 'table' in c.lower() if 'csv' in c.lower()
            ])
            with open(
                    str(result / (d + '_testing_table_' + str(file_no + 1) +
                                  '.csv')), 'w') as fh:
                df_4.to_csv(fh, index=False)

            # Reshape to long form: one (metric, algorithm, score) row each.
            metrics = [f for f in df_4.columns if f != 'Algorithm']
            long_frames = []
            for i in range(len(df_4)):
                df_t = pd.DataFrame()
                df_t['Metrics'] = metrics
                df_t['Algorithm'] = [df_4.iloc[i]['Algorithm']] * len(metrics)
                df_t['Score'] = df_4.iloc[i][metrics].values
                long_frames.append(df_t)
            if long_frames:
                df_5 = pd.concat(long_frames)
            else:
                df_5 = pd.DataFrame(columns=['Metrics', 'Algorithm', 'Score'])

            figure_files_lwcase = [f.lower() for f in figure_files]

            df_6 = df_5[~df_5['Metrics'].str.contains('Hits')]
            plt.figure()
            flatui = [
                "#d46a7e", "#d5b60a", "#9b59b6", "#3498db", "#95a5a6",
                "#34495e", "#2ecc71", "#e74c3c"
            ]
            g = seaborn.barplot(x="Metrics",
                                y='Score',
                                hue="Algorithm",
                                palette=flatui,
                                data=df_6)
            g.legend(loc='upper center', bbox_to_anchor=(0.5, 1.14), ncol=6)
            g.tick_params(labelsize=6)

            file_no = len([
                c for c in figure_files_lwcase
                if d.lower() in c if 'testing' in c if 'rank_plot' in c
            ])
            plt.savefig(
                str(result / (d + '_testing_rank_plot_' + str(file_no + 1) +
                              '.pdf')),
                bbox_inches='tight',
                dpi=300)
            # plt.show()

            df_6 = df_5[df_5['Metrics'].str.contains('Hits')]
            plt.figure()
            flatui = [
                "#3498db", "#95a5a6", "#34495e", "#2ecc71", "#e74c3c",
                "#d46a7e", "#d5b60a", "#9b59b6"
            ]
            g = seaborn.barplot(x="Metrics",
                                y='Score',
                                hue="Algorithm",
                                palette=flatui,
                                data=df_6)
            g.legend(loc='upper center', bbox_to_anchor=(0.5, 1.14), ncol=6)
            g.tick_params(labelsize=6)

            file_no = len([
                c for c in figure_files_lwcase
                if d.lower() in c if 'testing' in c if 'hits_plot' in c
            ])
            plt.savefig(
                str(result / (d + '_testing_hits_plot_' + str(file_no + 1) +
                              '.pdf')),
                bbox_inches='tight',
                dpi=300)
            # plt.show()

    @staticmethod
    def _test_result_frame(algorithm, raw, hits):
        """Rename the columns of one algorithm's test CSV to the plot schema."""
        frame = pd.DataFrame()
        frame['Algorithm'] = [algorithm] * len(raw)
        frame['Epochs'] = raw['Epoch']
        frame['Mean Rank'] = raw['Mean Rank']
        frame['Filt Mean Rank'] = raw['Filtered Mean Rank']
        for hit in hits:
            frame['Hits' + str(hit)] = raw['Hit-%d Ratio' % hit]
            frame['Filt Hits' + str(hit)] = raw['Filtered Hit-%d Ratio' % hit]
        return frame

    @staticmethod
    def _assign_colors(labels):
        """Map each label to a matplotlib colour name, cycling when exhausted."""
        colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())
        tot_col = len(colors)
        return {label: colors[i % tot_col] for i, label in enumerate(labels)}

    @staticmethod
    def draw_embedding(embs, names, resultpath, algos, show_label):
        """Function to draw the embedding.

            Args:
                embs (matrix): Two dimesnional embeddings.
                names (list):List of string name.
                resultpath (str):Path where the result will be save.
                algos (str): Name of the algorithms which generated the algorithm.
                show_label (bool): If True, prints the string names of the entities and relations.
        """
        pos = {}
        node_color_mp = Visualization._assign_colors(set(names))

        G = nx.Graph()
        hm_ent = {}
        for i, ent in enumerate(names):
            hm_ent[i] = ent
            G.add_node(i)
            pos[i] = embs[i]

        colors = [node_color_mp[hm_ent[n]] for n in G.nodes]

        plt.figure()
        nodes_draw = nx.draw_networkx_nodes(G,
                                            pos,
                                            node_color=colors,
                                            node_size=50)
        nodes_draw.set_edgecolor('k')

        if show_label:
            nx.draw_networkx_labels(G, pos, font_size=8)

        # makedirs also creates missing parent directories.
        os.makedirs(resultpath, exist_ok=True)

        files = os.listdir(resultpath)
        file_no = len([c for c in files if algos + '_embedding_plot' in c])
        filename = algos + '_embedding_plot_' + str(file_no) + '.png'
        plt.savefig(str(resultpath / filename), bbox_inches='tight', dpi=300)
        # plt.show()

    @staticmethod
    def draw_embedding_rel_space(h_emb, r_emb, t_emb, h_name, r_name, t_name,
                                 resultpath, algos, show_label):
        """Function to draw the embedding in relation space.

            Args:
                h_emb (matrix): Two dimesnional embeddings of head.
                r_emb (matrix): Two dimesnional embeddings of relation.
                t_emb (matrix): Two dimesnional embeddings of tail.
                h_name (list):List of string name of the head.
                r_name (list):List of string name of the relation.
                t_name (list):List of string name of the tail.
                resultpath (str):Path where the result will be save.
                algos (str): Name of the algorithms which generated the algorithm.
                show_label (bool): If True, prints the string names of the entities and relations.
        """
        pos = {}
        node_color_mp_ent = Visualization._assign_colors(
            set(h_name) | set(t_name))
        node_color_mp_rel = Visualization._assign_colors(set(r_name))

        G = nx.DiGraph()
        idx = 0
        head_colors = []
        rel_colors = []
        tail_colors = []
        head_nodes = []
        tail_nodes = []
        rel_nodes = []

        # Each triple becomes a chain of three nodes: head -> relation -> tail.
        for i, _ in enumerate(h_name):
            G.add_edge(idx, idx + 1)
            G.add_edge(idx + 1, idx + 2)

            head_nodes.append(idx)
            rel_nodes.append(idx + 1)
            tail_nodes.append(idx + 2)

            head_colors.append(node_color_mp_ent[h_name[i]])
            rel_colors.append(node_color_mp_rel[r_name[i]])
            tail_colors.append(node_color_mp_ent[t_name[i]])

            pos[idx] = h_emb[i]
            pos[idx + 1] = r_emb[i]
            pos[idx + 2] = t_emb[i]
            idx += 3

        plt.figure()
        nodes_draw = nx.draw_networkx_nodes(G,
                                            pos,
                                            nodelist=head_nodes,
                                            node_color=head_colors,
                                            node_shape='o',
                                            node_size=50)
        nodes_draw.set_edgecolor('k')

        # 'with_labels' is not a draw_networkx_nodes parameter (it raises
        # TypeError on current networkx); labels are drawn separately below.
        nodes_draw = nx.draw_networkx_nodes(G,
                                            pos,
                                            nodelist=rel_nodes,
                                            node_color=rel_colors,
                                            node_size=50,
                                            node_shape='D')
        nodes_draw.set_edgecolor('k')

        nodes_draw = nx.draw_networkx_nodes(G,
                                            pos,
                                            nodelist=tail_nodes,
                                            node_color=tail_colors,
                                            node_shape='*',
                                            node_size=50)
        nodes_draw.set_edgecolor('k')

        if show_label:
            nx.draw_networkx_labels(G, pos, font_size=8)

        nx.draw_networkx_edges(G, pos, arrows=True, width=0.5, alpha=0.5)

        # makedirs also creates missing parent directories.
        os.makedirs(resultpath, exist_ok=True)

        files = os.listdir(resultpath)
        file_no = len([c for c in files if algos + '_embedding_plot' in c])
        plt.savefig(str(resultpath / (algos + '_embedding_plot_' +
                                      str(file_no) + '.png')),
                    bbox_inches='tight',
                    dpi=300)
class Trainer:
    """Class for handling the training of the algorithms.

    Args:
        model (object): KGE model object.
        config (object): Configuration object. The trainer reads the
            optimizer settings, device, epochs, output paths and the
            knowledge-graph handle from it.

    Examples:
        >>> from pykg2vec.utils.trainer import Trainer
        >>> trainer = Trainer(model, config)
        >>> trainer.build_model()
        >>> trainer.train_model()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config):
        self.model = model
        self.config = config

        # One [epoch_idx, accumulated_loss] entry per trained epoch.
        self.training_results = []

        self.evaluator = None
        self.generator = None
        # Both are created by build_model().
        self.optimizer = None
        self.early_stopper = None

    def build_model(self):
        """Move the model to the configured device and create the optimizer.

        Also prints the configuration summary and creates the early stopper
        (monitoring the filtered mean rank).

        Raises:
            NotImplementedError: If ``config.optimizer`` is not one of
                "adam", "sgd", "adagrad" or "rms".
        """
        self.model.to(self.config.device)

        optimizers = {
            "adam": optim.Adam,
            "sgd": optim.SGD,
            "adagrad": optim.Adagrad,
            "rms": optim.RMSprop,
        }
        if self.config.optimizer not in optimizers:
            raise NotImplementedError("No support for %s optimizer" % self.config.optimizer)

        self.optimizer = optimizers[self.config.optimizer](
            self.model.parameters(),
            lr=self.config.learning_rate,
        )

        self.config.summary()

        self.early_stopper = EarlyStopper(self.config.patience, Monitor.FILTERED_MEAN_RANK)

    # Training related functions:

    def train_step_pairwise(self, pos_h, pos_r, pos_t, neg_h, neg_r, neg_t):
        """Compute the loss of one batch for pairwise-trained models.

        Scores the positive and the negative triples with the model and
        combines them either with the adversarial negative sampling loss or
        with a margin-based hinge loss, depending on ``config.sampling``.

        Returns:
            The batch loss (including ``model.get_reg()`` when the model
            defines it).
        """
        pos_preds = self.model(pos_h, pos_r, pos_t)
        neg_preds = self.model(neg_h, neg_r, neg_t)

        if self.config.sampling == 'adversarial_negative_sampling':
            # RotatE: Adversarial Negative Sampling and alpha is the temperature.
            pos_preds = F.logsigmoid(-pos_preds)
            neg_preds = (-neg_preds).view((-1, self.config.neg_rate))
            # The softmax weights are detached so that no gradient flows through them.
            weights = nn.Softmax(dim=1)(neg_preds * self.config.alpha).detach()
            neg_preds = torch.sum(weights * F.logsigmoid(-neg_preds), dim=-1)
            loss = -neg_preds.mean() - pos_preds.mean()
        else:
            # others that use margin-based & pairwise loss function. (uniform or bern)
            loss = pos_preds + self.config.margin - neg_preds
            loss = torch.max(loss, torch.zeros_like(loss)).sum()

        if hasattr(self.model, 'get_reg'):
            # now only NTN uses regularizer,
            # other pairwise based KGE methods use normalization to regularize parameters.
            loss += self.model.get_reg()

        return loss

    def train_step_projection(self, h, r, t, hr_t, tr_h):
        """Compute the loss of one batch for projection-based models.

        For "conve" and "tucker" the model returns predictions that are
        compared to the (optionally label-smoothed) targets with binary
        cross-entropy; every other model returns the loss directly.

        Returns:
            The sum of the tail-direction and head-direction losses
            (including ``model.get_reg()`` when the model defines it).
        """
        if self.model.model_name.lower() in ("conve", "tucker"):
            if hasattr(self.config, 'label_smoothing'):
                smoothing = self.config.label_smoothing
                hr_t = hr_t * (1.0 - smoothing) + 1.0 / self.config.tot_entity
                tr_h = tr_h * (1.0 - smoothing) + 1.0 / self.config.tot_entity

            pred_tails = self.model(h, r, direction="tail")  # (h, r) -> hr_t forward
            pred_heads = self.model(t, r, direction="head")  # (t, r) -> tr_h backward

            loss_tails = torch.mean(F.binary_cross_entropy(pred_tails, hr_t))
            loss_heads = torch.mean(F.binary_cross_entropy(pred_heads, tr_h))
        else:
            loss_tails = self.model(h, r, hr_t, direction="tail")  # (h, r) -> hr_t forward
            loss_heads = self.model(t, r, tr_h, direction="head")  # (t, r) -> tr_h backward

        loss = loss_tails + loss_heads

        if hasattr(self.model, 'get_reg'):
            # now only complex distmult uses regularizer in algorithms,
            loss += self.model.get_reg()

        return loss

    def train_step_pointwise(self, h, r, t, y):
        """Compute the softplus loss of one batch for pointwise-trained models.

        Returns:
            Mean of ``softplus(y * preds)`` (plus ``model.get_reg(h, r, t)``
            when the model defines it).
        """
        preds = self.model(h, r, t)
        loss = F.softplus(y * preds).mean()

        if hasattr(self.model, 'get_reg'):  # for complex & complex-N3 & DistMult & CP & ANALOGY
            loss += self.model.get_reg(h, r, t)

        return loss

    def train_model(self, monitor=Monitor.FILTERED_MEAN_RANK):
        """Function to train the model.

        Args:
            monitor (Monitor): Metric watched by the early stopper.

        Returns:
            int: Index of the last epoch that was run.
        """
        self.generator = Generator(self.model, self.config)
        self.evaluator = Evaluator(self.model, self.config)

        # build_model() always creates the early stopper with the default
        # metric; honour an explicitly requested one instead of ignoring it.
        if self.early_stopper is None or self.early_stopper.monitor != monitor:
            self.early_stopper = EarlyStopper(self.config.patience, monitor)

        if self.config.load_from_data:
            self.load_model()

        for cur_epoch_idx in range(self.config.epochs):
            self._logger.info("Epoch[%d/%d]" % (cur_epoch_idx, self.config.epochs))

            self.train_model_epoch(cur_epoch_idx)

            if cur_epoch_idx % self.config.test_step == 0:
                self.model.eval()
                metrics = self.evaluator.mini_test(cur_epoch_idx)

                # Early Stop Mechanism:
                # check whether the metric is still improving after each mini-test.
                # Example, if test_step == 5, the trainer will check metrics every 5 epoch.
                if self.early_stopper.should_stop(metrics):
                    break

        self.evaluator.full_test(cur_epoch_idx)
        self.evaluator.metric_calculator.save_test_summary(self.model.model_name)

        self.generator.stop()
        self.save_training_result()

        if self.config.save_model:
            self.save_model()

        if self.config.disp_result:
            self.display()

        self.export_embeddings()

        return cur_epoch_idx  # the runned epoches.

    def tune_model(self):
        """Function to tune the model.

        Returns:
            The accumulated loss of the last trained epoch
            (``inf`` when no epoch was run).
        """
        current_loss = float("inf")

        self.generator = Generator(self.model, self.config)
        self.evaluator = Evaluator(self.model, self.config, tuning=True)

        for cur_epoch_idx in range(self.config.epochs):
            current_loss = self.train_model_epoch(cur_epoch_idx, tuning=True)

        self.evaluator.full_test(cur_epoch_idx)

        self.generator.stop()

        return current_loss

    def train_model_epoch(self, epoch_idx, tuning=False):
        """Function to train the model for one epoch.

        Args:
            epoch_idx (int): Index of the epoch, stored with the loss.
            tuning (bool): When True the progress-bar description is not
                updated.

        Returns:
            The loss accumulated over all batches of the epoch.

        Raises:
            NotImplementedError: If the model's training strategy is unknown.
        """
        device = self.config.device

        def to_long(values):
            # Batch columns holding ids are converted to LongTensor on the target device.
            return torch.LongTensor(values).to(device)

        acc_loss = 0

        # Debug mode trains on a fixed, small number of batches.
        num_batch = self.config.tot_train_triples // self.config.batch_size if not self.config.debug else 10

        self.generator.start_one_epoch(num_batch)

        progress_bar = tqdm(range(num_batch))

        for _ in progress_bar:
            data = list(next(self.generator))
            self.model.train()
            self.optimizer.zero_grad()

            strategy = self.model.training_strategy
            if strategy == TrainingStrategy.PROJECTION_BASED:
                h, r, t = to_long(data[0]), to_long(data[1]), to_long(data[2])
                hr_t = data[3].to(device)
                tr_h = data[4].to(device)
                loss = self.train_step_projection(h, r, t, hr_t, tr_h)
            elif strategy == TrainingStrategy.POINTWISE_BASED:
                h, r, t, y = (to_long(column) for column in data[:4])
                loss = self.train_step_pointwise(h, r, t, y)
            elif strategy == TrainingStrategy.PAIRWISE_BASED:
                pos_h, pos_r, pos_t, neg_h, neg_r, neg_t = (to_long(column) for column in data[:6])
                loss = self.train_step_pairwise(pos_h, pos_r, pos_t, neg_h, neg_r, neg_t)
            else:
                raise NotImplementedError("Unknown training strategy: %s" % self.model.training_strategy)

            loss.backward()
            self.optimizer.step()
            cur_loss = loss.item()
            acc_loss += cur_loss

            if not tuning:
                progress_bar.set_description('acc_loss: %f, cur_loss: %f' % (acc_loss, cur_loss))

        self.training_results.append([epoch_idx, acc_loss])

        return acc_loss

    def enter_interactive_mode(self):
        """Build and load the model, then run one example of each inference."""
        self.build_model()
        self.load_model()

        self.evaluator = Evaluator(self.model, self.config)
        self._logger.info("The training/loading of the model has finished! Now enter interactive mode :) ----- Example 1: trainer.infer_tails(1,10,topk=5)")
        self.infer_tails(1, 10, topk=5)

        self._logger.info("----- Example 2: trainer.infer_heads(10,20,topk=5)")
        self.infer_heads(10, 20, topk=5)

        self._logger.info("----- Example 3: trainer.infer_rels(1,20,topk=5)")
        self.infer_rels(1, 20, topk=5)

    def exit_interactive_mode(self):
        """Log the goodbye message of the interactive mode."""
        self._logger.info("Thank you for trying out inference interactive script :)")

    def infer_tails(self, h, r, topk=5):
        """Log and return the top-k tails predicted for (head, relation).

        Returns:
            dict: Maps each predicted tail id to its entity name.
        """
        tails = self.evaluator.test_tail_rank(h, r, topk).cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        logs = [
            "",
            "(head, relation)->({},{}) :: Inferred tails->({})".format(h, r, ",".join([str(i) for i in tails])),
            "",
            "head: %s" % idx2ent[h],
            "relation: %s" % idx2rel[r],
        ]
        for idx, tail in enumerate(tails):
            logs.append("%dth predicted tail: %s" % (idx, idx2ent[tail]))

        self._logger.info("\n".join(logs))
        return {tail: idx2ent[tail] for tail in tails}

    def infer_heads(self, r, t, topk=5):
        """Log and return the top-k heads predicted for (relation, tail).

        Returns:
            dict: Maps each predicted head id to its entity name.
        """
        heads = self.evaluator.test_head_rank(r, t, topk).cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        logs = [
            "",
            # Arguments follow the "(relation,tail)" order announced by the label.
            "(relation,tail)->({},{}) :: Inferred heads->({})".format(r, t, ",".join([str(i) for i in heads])),
            "",
            "tail: %s" % idx2ent[t],
            "relation: %s" % idx2rel[r],
        ]
        for idx, head in enumerate(heads):
            logs.append("%dth predicted head: %s" % (idx, idx2ent[head]))

        self._logger.info("\n".join(logs))
        return {head: idx2ent[head] for head in heads}

    def infer_rels(self, h, t, topk=5):
        """Log and return the top-k relations predicted for (head, tail).

        Returns:
            dict: Maps each predicted relation id to its name, or None when
            the model does not support relation inference.
        """
        if self.model.model_name.lower() in ["proje_pointwise", "conve", "tucker"]:
            self._logger.info("%s model doesn't support relation inference in nature." % self.model.model_name)
            return None

        rels = self.evaluator.test_rel_rank(h, t, topk).cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        logs = [
            "",
            "(head,tail)->({},{}) :: Inferred rels->({})".format(h, t, ",".join([str(i) for i in rels])),
            "",
            "head: %s" % idx2ent[h],
            "tail: %s" % idx2ent[t],
        ]
        for idx, rel in enumerate(rels):
            logs.append("%dth predicted rel: %s" % (idx, idx2rel[rel]))

        self._logger.info("\n".join(logs))
        return {rel: idx2rel[rel] for rel in rels}

    # Procedural functions:

    def save_model(self):
        """Function to save the model."""
        saved_path = self.config.path_tmp / self.model.model_name
        saved_path.mkdir(parents=True, exist_ok=True)
        torch.save(self.model.state_dict(), str(saved_path / 'model.vec.pt'))

    def load_model(self):
        """Function to load the model.

        Nothing happens when the model's directory under ``config.path_tmp``
        does not exist.
        """
        saved_path = self.config.path_tmp / self.model.model_name
        if saved_path.exists():
            self.model.load_state_dict(torch.load(str(saved_path / 'model.vec.pt')))
            self.model.eval()

    def display(self):
        """Function to display embedding."""
        options = {
            "ent_only_plot": True,
            "rel_only_plot": not self.config.plot_entity_only,
            "ent_and_rel_plot": not self.config.plot_entity_only,
        }

        if self.config.plot_embedding:
            viz = Visualization(self.model, self.config, vis_opts=options)
            viz.plot_embedding(resultpath=self.config.path_figures, algos=self.model.model_name, show_label=False)

        if self.config.plot_training_result:
            viz = Visualization(self.model, self.config)
            viz.plot_train_result()

        if self.config.plot_testing_result:
            viz = Visualization(self.model, self.config)
            viz.plot_test_result()

    def export_embeddings(self):
        """Export the entity/relation labels and the embeddings as tsv files.

        With tsvs (both label, vector files), you can:
        1) Use those pretained embeddings for your applications.
        2) Visualize the embeddings in this website to gain insights.
           (https://projector.tensorflow.org/)

        One ``<name>.tsv`` file is written per entry of
        ``model.parameter_list`` whose weight is two dimensional.
        """
        save_path = self.config.path_embeddings / self.model.model_name
        save_path.mkdir(parents=True, exist_ok=True)

        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        with open(str(save_path / "ent_labels.tsv"), 'w') as l_export_file:
            for label in idx2ent.values():
                l_export_file.write(label + "\n")

        with open(str(save_path / "rel_labels.tsv"), 'w') as l_export_file:
            for label in idx2rel.values():
                l_export_file.write(label + "\n")

        for named_embedding in self.model.parameter_list:
            # Only matrices can be written row by row; skipping other shapes
            # avoids exporting the rows of a previously processed embedding.
            if len(named_embedding.weight.shape) != 2:
                continue

            all_embs = named_embedding.weight.detach().cpu().numpy()

            with open(str(save_path / ("%s.tsv" % named_embedding.name)), 'w') as v_export_file:
                for row in all_embs:
                    v_export_file.write("\t".join([str(x) for x in row]) + "\n")

    def save_training_result(self):
        """Function that saves training result"""
        files = os.listdir(str(self.config.path_result))
        # Number of existing training-result files of this model; used as the
        # suffix so that earlier results are not overwritten.
        file_no = len([f for f in files if self.model.model_name in f if 'Training' in f])
        df = pd.DataFrame(self.training_results, columns=['Epochs', 'Loss'])
        result_file = self.config.path_result / (self.model.model_name + '_Training_results_' + str(file_no) + '.csv')
        with open(str(result_file), 'w') as fh:
            df.to_csv(fh)
class MetricCalculator:
    """Accumulate ranking results and derive the evaluation metrics.

    MetricCalculator aims to
    1) address all the statistic tasks.
    2) provide interfaces for querying results.

    MetricCalculator is expected to be used by "evaluation_process".

    Args:
        config (object): Configuration object; provides the knowledge-graph
            cache (``hr_t``, ``tr_h``), the hit thresholds and result paths.
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, config):
        self.config = config

        self.hr_t = config.knowledge_graph.read_cache_data('hr_t')
        self.tr_h = config.knowledge_graph.read_cache_data('tr_h')

        # (f)mr  : (filtered) mean rank
        # (f)mrr : (filtered) mean reciprocal rank
        # (f)hit : (filtered) hit-k ratio
        # mr/mrr dictionaries are keyed by epoch, hit dictionaries by (epoch, k).
        self.mr = {}
        self.fmr = {}
        self.mrr = {}
        self.fmrr = {}
        self.hit = {}
        self.fhit = {}

        self.epoch = None

        self.reset()

    def reset(self):
        """Clear the per-test buffers and restart the timer."""
        # temporarily used buffers and indexes.
        self.rank_head = []
        self.rank_tail = []
        self.f_rank_head = []
        self.f_rank_tail = []

        self.epoch = None
        self.start_time = timeit.default_timer()

    def append_result(self, result):
        """Record the ranks of one tested triple.

        Args:
            result (sequence): ``(predict_tail, predict_head, h, r, t, epoch)``.
        """
        predict_tail, predict_head = result[0], result[1]
        h, r, t = result[2], result[3], result[4]

        self.epoch = result[5]

        t_rank, f_t_rank = self.get_tail_rank(predict_tail, h, r, t)
        h_rank, f_h_rank = self.get_head_rank(predict_head, h, r, t)

        self.rank_head.append(h_rank)
        self.rank_tail.append(t_rank)
        self.f_rank_head.append(f_h_rank)
        self.f_rank_tail.append(f_t_rank)

    @staticmethod
    def _scan_rank(candidates, target, known_map, key):
        """Count the candidates met before ``target`` when scanning from the end.

        Returns:
            tuple: ``(rank, filtered_rank)``; the filtered rank does not count
            candidates contained in ``known_map[key]``.
        """
        rank = 0
        filtered_rank = 0
        for offset in range(1, len(candidates) + 1):
            val = candidates[-offset]
            if val == target:
                break
            rank += 1
            # The lookup stays inside the loop so that it is only performed
            # when at least one candidate precedes the target.
            if val not in known_map[key]:
                filtered_rank += 1
        return rank, filtered_rank

    def get_tail_rank(self, tail_candidate, h, r, t):
        """Function to evaluate the tail rank.

        Args:
            tail_candidate (list): Predicted tails for the given head,
                relation pair; scanned from the last element backwards.
            h (int): head id
            r (int): relation id
            t (int): tail id

        Returns:
            tuple: Zero-based tail rank and filtered tail rank. The filtered
            rank ignores candidates found in ``self.hr_t[(h, r)]``.
        """
        return self._scan_rank(tail_candidate, t, self.hr_t, (h, r))

    def get_head_rank(self, head_candidate, h, r, t):
        """Function to evaluate the head rank.

        Args:
            head_candidate (list): Predicted heads for the given tail,
                relation pair; scanned from the last element backwards.
            h (int): head id
            r (int): relation id
            t (int): tail id

        Returns:
            tuple: Zero-based head rank and filtered head rank. The filtered
            rank ignores candidates found in ``self.tr_h[(t, r)]``.
        """
        return self._scan_rank(head_candidate, h, self.tr_h, (t, r))

    def settle(self):
        """Turn the buffered ranks into the metrics of the current epoch."""
        # Buffered ranks are zero-based; metrics are computed on one-based ranks.
        head_ranks = np.asarray(self.rank_head, dtype=np.float32) + 1
        tail_ranks = np.asarray(self.rank_tail, dtype=np.float32) + 1
        head_franks = np.asarray(self.f_rank_head, dtype=np.float32) + 1
        tail_franks = np.asarray(self.f_rank_tail, dtype=np.float32) + 1

        ranks = np.concatenate((head_ranks, tail_ranks))
        franks = np.concatenate((head_franks, tail_franks))

        self.mr[self.epoch] = np.mean(ranks)
        self.mrr[self.epoch] = np.mean(np.reciprocal(ranks))
        self.fmr[self.epoch] = np.mean(franks)
        self.fmrr[self.epoch] = np.mean(np.reciprocal(franks))

        for hit in self.config.hits:
            self.hit[(self.epoch, hit)] = np.mean(ranks <= hit, dtype=np.float32)
            self.fhit[(self.epoch, hit)] = np.mean(franks <= hit, dtype=np.float32)

    def get_curr_scores(self):
        """Return the mr/fmr/mrr/fmrr values of the current epoch as a dict."""
        return {
            'mr': self.mr[self.epoch],
            'fmr': self.fmr[self.epoch],
            'mrr': self.mrr[self.epoch],
            'fmrr': self.fmrr[self.epoch],
        }

    def save_test_summary(self, model_name):
        """Function to save the test of the summary.

        Writes a text summary of the configuration and dataset sizes, and a
        csv file with the metrics of every settled epoch.

        Args:
            model_name (str): specify the name of the model.
        """
        files = os.listdir(str(self.config.path_result))
        # Number of existing testing-result files of this model; used as suffix.
        file_no = len([f for f in files if model_name in f if 'Testing' in f])

        summary_file = self.config.path_result / (model_name + '_summary_' + str(file_no) + '.txt')
        with open(str(summary_file), 'w') as fh:
            fh.write('----------------SUMMARY----------------\n')
            for key, val in self.config.__dict__.items():
                if 'gpu' in key:
                    continue
                if 'knowledge_graph' in key:
                    continue
                if isinstance(val, list):
                    val = '[' + ','.join(str(v) for v in val) + ']'
                elif not isinstance(val, str):
                    val = str(val)
                fh.write(key + ':' + val + '\n')
            fh.write('-----------------------------------------\n')
            # The header ends with a newline so the first statistic starts on its own line.
            fh.write("\n----------Metadata Info for Dataset:%s----------------\n" % self.config.knowledge_graph.dataset_name)
            fh.write("Total Training Triples :%d\n" % self.config.tot_train_triples)
            fh.write("Total Testing Triples :%d\n" % self.config.tot_test_triples)
            fh.write("Total validation Triples :%d\n" % self.config.tot_valid_triples)
            fh.write("Total Entities :%d\n" % self.config.tot_entity)
            fh.write("Total Relations :%d\n" % self.config.tot_relation)
            fh.write("---------------------------------------------")

        columns = ['Epoch', 'Mean Rank', 'Filtered Mean Rank', 'Mean Reciprocal Rank', 'Filtered Mean Reciprocal Rank']
        for hit in self.config.hits:
            columns += ['Hit-%d Ratio' % hit, 'Filtered Hit-%d Ratio' % hit]

        results = []
        for epoch in self.mr:
            res_tmp = [epoch, self.mr[epoch], self.fmr[epoch], self.mrr[epoch], self.fmrr[epoch]]
            for hit in self.config.hits:
                res_tmp.append(self.hit[(epoch, hit)])
                res_tmp.append(self.fhit[(epoch, hit)])
            results.append(res_tmp)

        df = pd.DataFrame(results, columns=columns)

        results_file = self.config.path_result / (model_name + '_Testing_results_' + str(file_no) + '.csv')
        with open(str(results_file), 'a') as fh:
            df.to_csv(fh)

    def display_summary(self):
        """Function to print the test summary."""
        stop_time = timeit.default_timer()

        test_results = [
            '',
            "------Test Results for %s: Epoch: %d --- time: %.2f------------" % (self.config.dataset_name, self.epoch, stop_time - self.start_time),
            '--# of entities, # of relations: %d, %d' % (self.config.tot_entity, self.config.tot_relation),
            '--mr, filtered mr : %.4f, %.4f' % (self.mr[self.epoch], self.fmr[self.epoch]),
            '--mrr, filtered mrr : %.4f, %.4f' % (self.mrr[self.epoch], self.fmrr[self.epoch]),
        ]
        for hit in self.config.hits:
            test_results.append('--hits%d : %.4f ' % (hit, (self.hit[(self.epoch, hit)])))
            test_results.append('--filtered hits%d : %.4f ' % (hit, (self.fhit[(self.epoch, hit)])))
        test_results.append("---------------------------------------------------------")
        test_results.append('')

        self._logger.info("\n".join(test_results))
class Trainer:
    """Class for handling the training of the algorithms.

    Args:
        model (object): KGE model object.
        config (object): Configuration object. The trainer reads the
            optimizer settings, device, epochs, output paths and the
            knowledge-graph handle from it.

    Examples:
        >>> from pykg2vec.utils.trainer import Trainer
        >>> trainer = Trainer(model, config)
        >>> trainer.build_model()
        >>> trainer.train_model()
    """
    TRAINED_MODEL_FILE_NAME = "model.vec.pt"
    TRAINED_MODEL_CONFIG_NAME = "config.npy"
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config):
        self.model = model
        self.config = config
        # Metrics of the best checkpoint saved so far and the metric used to compare them.
        self.best_metric = None
        self.monitor = None

        # One [epoch_idx, accumulated_loss] entry per trained epoch.
        self.training_results = []

        self.evaluator = None
        self.generator = None
        self.optimizer = None
        self.early_stopper = None

    def build_model(self, monitor=Monitor.FILTERED_MEAN_RANK):
        """Load (optionally), place and equip the model for training.

        Loads a stored model when ``config.load_from_data`` is set, creates
        the evaluator, moves the model to the configured device and creates
        the optimizer and the early stopper.

        Args:
            monitor (Monitor): Metric used for early stopping and for picking
                the best checkpoint.

        Raises:
            NotImplementedError: If ``config.optimizer`` is not supported.
        """
        if self.config.load_from_data is not None:
            self.load_model(self.config.load_from_data)

        self.evaluator = Evaluator(self.model, self.config)

        self.model.to(self.config.device)

        optimizers = {
            "adam": optim.Adam,
            "sgd": optim.SGD,
            "adagrad": optim.Adagrad,
            "rms": optim.RMSprop,
        }
        optimizer_name = self.config.optimizer
        if optimizer_name in optimizers:
            self.optimizer = optimizers[optimizer_name](
                self.model.parameters(),
                lr=self.config.learning_rate,
            )
        elif optimizer_name == "riemannian":
            param_names = [name for name, _ in self.model.named_parameters()]
            self.optimizer = RiemannianOptimizer(
                self.model.parameters(),
                lr=self.config.learning_rate,
                param_names=param_names,
            )
        else:
            raise NotImplementedError("No support for %s optimizer" % self.config.optimizer)

        self.config.summary()

        # Remember the metric so that train_model() ranks checkpoints with the
        # same metric the early stopper watches.
        self.monitor = monitor
        self.early_stopper = EarlyStopper(self.config.patience, monitor)

    # Training related functions:

    def train_step_pairwise(self, pos_h, pos_r, pos_t, neg_h, neg_r, neg_t):
        """Compute the loss of one batch for pairwise-trained models.

        Returns:
            ``model.loss`` of the positive and negative scores plus
            ``model.get_reg(None, None, None)``.
        """
        pos_preds = self.model(pos_h, pos_r, pos_t)
        neg_preds = self.model(neg_h, neg_r, neg_t)

        if self.model.model_name.lower() == "rotate":
            loss = self.model.loss(pos_preds, neg_preds, self.config.neg_rate, self.config.alpha)
        else:
            loss = self.model.loss(pos_preds, neg_preds, self.config.margin)

        loss += self.model.get_reg(None, None, None)

        return loss

    def train_step_projection(self, h, r, t, hr_t, tr_h):
        """Compute the loss of one batch for projection-based models.

        Returns:
            ``model.loss`` of the head/tail predictions plus
            ``model.get_reg(h, r, t)``.
        """
        if self.model.model_name.lower() in ["conve", "tucker", "interacte", "hyper", "acre"]:
            pred_tails = self.model(h, r, direction="tail")  # (h, r) -> hr_t forward
            pred_heads = self.model(t, r, direction="head")  # (t, r) -> tr_h backward

            if hasattr(self.config, 'label_smoothing'):
                loss = self.model.loss(pred_heads, pred_tails, tr_h, hr_t, self.config.label_smoothing, self.config.tot_entity)
            else:
                loss = self.model.loss(pred_heads, pred_tails, tr_h, hr_t, None, None)
        else:
            pred_tails = self.model(h, r, hr_t, direction="tail")  # (h, r) -> hr_t forward
            pred_heads = self.model(t, r, tr_h, direction="head")  # (t, r) -> tr_h backward

            loss = self.model.loss(pred_heads, pred_tails)

        loss += self.model.get_reg(h, r, t)

        return loss

    def train_step_pointwise(self, h, r, t, target):
        """Compute the loss of one batch for pointwise-trained models.

        Returns:
            ``model.loss`` of the predictions against ``target`` (cast to the
            prediction tensor type) plus ``model.get_reg(h, r, t)``.
        """
        preds = self.model(h, r, t)
        loss = self.model.loss(preds, target.type(preds.type()))
        loss += self.model.get_reg(h, r, t)
        return loss

    def train_model(self):
        """Function to train the model.

        Runs a mini-test every ``config.test_step`` epochs, stops early when
        the early stopper says so and, when ``config.save_model`` is set,
        saves the model whenever the monitored metric improves.

        Returns:
            int: Index of the last epoch that was run.
        """
        self.generator = Generator(self.model, self.config)
        # Keep the metric selected in build_model(); fall back to the default
        # when none was selected.
        if self.monitor is None:
            self.monitor = Monitor.FILTERED_MEAN_RANK
        lower_is_better = self.monitor in (Monitor.MEAN_RANK, Monitor.FILTERED_MEAN_RANK)

        for cur_epoch_idx in range(self.config.epochs):
            self._logger.info("Epoch[%d/%d]" % (cur_epoch_idx, self.config.epochs))

            self.train_model_epoch(cur_epoch_idx)

            if cur_epoch_idx % self.config.test_step == 0:
                self.model.eval()
                with torch.no_grad():
                    metrics = self.evaluator.mini_test(cur_epoch_idx)

                    # Early Stop Mechanism:
                    # check whether the metric is still improving after each mini-test.
                    # Example, if test_step == 5, the trainer will check metrics every 5 epoch.
                    if self.early_stopper.should_stop(metrics):
                        break

                    # store the best model weights.
                    if self.config.save_model:
                        if self.best_metric is None:
                            self.best_metric = metrics
                            self.save_model()
                        else:
                            previous = self.best_metric[self.monitor.value]
                            current = metrics[self.monitor.value]
                            is_better = previous > current if lower_is_better else previous < current
                            if is_better:
                                self.save_model()
                                self.best_metric = metrics

        self.model.eval()
        with torch.no_grad():
            self.evaluator.full_test(cur_epoch_idx)
            self.evaluator.metric_calculator.save_test_summary(self.model.model_name)

        self.generator.stop()
        self.save_training_result()

        if self.config.disp_result:
            self.display()

        self.export_embeddings()

        return cur_epoch_idx  # the runned epoches.

    def tune_model(self):
        """Function to tune the model.

        Returns:
            The accumulated loss of the last trained epoch
            (``inf`` when no epoch was run).
        """
        current_loss = float("inf")

        self.generator = Generator(self.model, self.config)
        self.evaluator = Evaluator(self.model, self.config, tuning=True)

        for cur_epoch_idx in range(self.config.epochs):
            current_loss = self.train_model_epoch(cur_epoch_idx, tuning=True)

        self.model.eval()
        with torch.no_grad():
            self.evaluator.full_test(cur_epoch_idx)

        self.generator.stop()

        return current_loss

    def train_model_epoch(self, epoch_idx, tuning=False):
        """Function to train the model for one epoch.

        Args:
            epoch_idx (int): Index of the epoch, stored with the loss.
            tuning (bool): When True the progress-bar description is not
                updated.

        Returns:
            The loss accumulated over all batches of the epoch.

        Raises:
            NotImplementedError: If the model's training strategy is unknown.
        """
        device = self.config.device

        def to_long(values):
            # Batch columns holding ids are converted to LongTensor on the target device.
            return torch.LongTensor(values).to(device)

        acc_loss = 0

        # Debug mode trains on a fixed, small number of batches.
        num_batch = self.config.tot_train_triples // self.config.batch_size if not self.config.debug else 10

        self.generator.start_one_epoch(num_batch)

        progress_bar = tqdm(range(num_batch))

        for _ in progress_bar:
            data = list(next(self.generator))
            self.model.train()
            self.optimizer.zero_grad()

            strategy = self.model.training_strategy
            if strategy == TrainingStrategy.PROJECTION_BASED:
                h, r, t = to_long(data[0]), to_long(data[1]), to_long(data[2])
                hr_t = data[3].to(device)
                tr_h = data[4].to(device)
                loss = self.train_step_projection(h, r, t, hr_t, tr_h)
            elif strategy == TrainingStrategy.POINTWISE_BASED:
                h, r, t, y = (to_long(column) for column in data[:4])
                loss = self.train_step_pointwise(h, r, t, y)
            elif strategy == TrainingStrategy.PAIRWISE_BASED:
                pos_h, pos_r, pos_t, neg_h, neg_r, neg_t = (to_long(column) for column in data[:6])
                loss = self.train_step_pairwise(pos_h, pos_r, pos_t, neg_h, neg_r, neg_t)
            else:
                raise NotImplementedError("Unknown training strategy: %s" % self.model.training_strategy)

            loss.backward()
            self.optimizer.step()
            cur_loss = loss.item()
            acc_loss += cur_loss

            if not tuning:
                progress_bar.set_description('acc_loss: %f, cur_loss: %f' % (acc_loss, cur_loss))

        self.training_results.append([epoch_idx, acc_loss])

        return acc_loss

    def enter_interactive_mode(self):
        """Build and load the model, then run one example of each inference."""
        self.build_model()
        self.load_model()

        # load_model() replaces self.model and self.config, so the evaluator
        # created by build_model() would keep scoring the old, unloaded model.
        self.model.to(self.config.device)
        self.evaluator = Evaluator(self.model, self.config)

        self._logger.info("The training/loading of the model has finished! Now enter interactive mode :) ----- Example 1: trainer.infer_tails(1,10,topk=5)")
        self.infer_tails(1, 10, topk=5)

        self._logger.info("----- Example 2: trainer.infer_heads(10,20,topk=5)")
        self.infer_heads(10, 20, topk=5)

        self._logger.info("----- Example 3: trainer.infer_rels(1,20,topk=5)")
        self.infer_rels(1, 20, topk=5)

    def exit_interactive_mode(self):
        """Log the goodbye message of the interactive mode."""
        self._logger.info("Thank you for trying out inference interactive script :)")

    def infer_tails(self, h, r, topk=5):
        """Log and return the top-k tails predicted for (head, relation).

        Returns:
            dict: Maps each predicted tail id to its entity name.
        """
        tails = self.evaluator.test_tail_rank(h, r, topk).detach().cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        logs = [
            "",
            "(head, relation)->({},{}) :: Inferred tails->({})".format(h, r, ",".join([str(i) for i in tails])),
            "",
            "head: %s" % idx2ent[h],
            "relation: %s" % idx2rel[r],
        ]
        for idx, tail in enumerate(tails):
            logs.append("%dth predicted tail: %s" % (idx, idx2ent[tail]))

        self._logger.info("\n".join(logs))
        return {tail: idx2ent[tail] for tail in tails}

    def infer_heads(self, r, t, topk=5):
        """Log and return the top-k heads predicted for (relation, tail).

        Returns:
            dict: Maps each predicted head id to its entity name.
        """
        heads = self.evaluator.test_head_rank(r, t, topk).detach().cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        logs = [
            "",
            # Arguments follow the "(relation,tail)" order announced by the label.
            "(relation,tail)->({},{}) :: Inferred heads->({})".format(r, t, ",".join([str(i) for i in heads])),
            "",
            "tail: %s" % idx2ent[t],
            "relation: %s" % idx2rel[r],
        ]
        for idx, head in enumerate(heads):
            logs.append("%dth predicted head: %s" % (idx, idx2ent[head]))

        self._logger.info("\n".join(logs))
        return {head: idx2ent[head] for head in heads}

    def infer_rels(self, h, t, topk=5):
        """Log and return the top-k relations predicted for (head, tail).

        Returns:
            dict: Maps each predicted relation id to its name; empty when the
            model does not support relation inference.
        """
        if self.model.model_name.lower() in ["proje_pointwise", "conve", "tucker"]:
            self._logger.info("%s model doesn't support relation inference in nature." % self.model.model_name)
            return {}

        rels = self.evaluator.test_rel_rank(h, t, topk).detach().cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        logs = [
            "",
            "(head,tail)->({},{}) :: Inferred rels->({})".format(h, t, ",".join([str(i) for i in rels])),
            "",
            "head: %s" % idx2ent[h],
            "tail: %s" % idx2ent[t],
        ]
        for idx, rel in enumerate(rels):
            logs.append("%dth predicted rel: %s" % (idx, idx2rel[rel]))

        self._logger.info("\n".join(logs))
        return {rel: idx2rel[rel] for rel in rels}

    # Procedural functions:

    def save_model(self):
        """Function to save the model.

        Stores the model's state dict and, next to it, the configuration
        object (as a pickled numpy file).
        """
        saved_path = self.config.path_tmp / self.model.model_name
        saved_path.mkdir(parents=True, exist_ok=True)
        torch.save(self.model.state_dict(), str(saved_path / self.TRAINED_MODEL_FILE_NAME))

        # Save the hyper-parameters (the whole config object) with the model.
        save_path_config = saved_path / self.TRAINED_MODEL_CONFIG_NAME
        np.save(save_path_config, self.config)

    def load_model(self, model_path=None):
        """Function to load the model.

        Replaces ``self.config`` with the stored configuration (keeping the
        current ``load_from_data`` value) and ``self.model`` with a newly
        created model carrying the stored weights.

        Args:
            model_path (str or Path): Directory holding the stored model and
                config. Defaults to ``config.path_tmp / model_name``.

        Raises:
            ValueError: If the model file or the config file is missing.
        """
        if model_path is None:
            model_dir = self.config.path_tmp / self.model.model_name
        else:
            model_dir = Path(model_path)
        model_path_file = model_dir / self.TRAINED_MODEL_FILE_NAME
        model_path_config = model_dir / self.TRAINED_MODEL_CONFIG_NAME

        if not (model_path_file.exists() and model_path_config.exists()):
            raise ValueError("Cannot load model from %s" % model_path_file)

        # NOTE(security): allow_pickle=True (and torch.load) unpickle the file,
        # which can execute arbitrary code; only load checkpoints you trust.
        config_temp = np.load(model_path_config, allow_pickle=True).item()
        config_temp.__dict__['load_from_data'] = self.config.__dict__['load_from_data']
        self.config = config_temp

        _, model_def = Importer().import_model_config(self.config.model_name.lower())
        self.model = model_def(**self.config.__dict__)
        self.model.load_state_dict(torch.load(str(model_path_file)))
        self.model.eval()

    def display(self):
        """Function to display embedding."""
        options = {
            "ent_only_plot": True,
            "rel_only_plot": not self.config.plot_entity_only,
            "ent_and_rel_plot": not self.config.plot_entity_only,
        }

        if self.config.plot_embedding:
            viz = Visualization(self.model, self.config, vis_opts=options)
            viz.plot_embedding(resultpath=self.config.path_figures, algos=self.model.model_name, show_label=False)

        if self.config.plot_training_result:
            viz = Visualization(self.model, self.config)
            viz.plot_train_result()

        if self.config.plot_testing_result:
            viz = Visualization(self.model, self.config)
            viz.plot_test_result()

    def export_embeddings(self):
        """Export the entity/relation labels and the embeddings as tsv files.

        With tsvs (both label, vector files), you can:
        1) Use those pretained embeddings for your applications.
        2) Visualize the embeddings in this website to gain insights.
           (https://projector.tensorflow.org/)

        One ``<name>.tsv`` file is written per entry of
        ``model.parameter_list`` whose weight is two dimensional.
        """
        save_path = self.config.path_embeddings / self.model.model_name
        save_path.mkdir(parents=True, exist_ok=True)

        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        with open(str(save_path / "ent_labels.tsv"), 'w') as l_export_file:
            for label in idx2ent.values():
                l_export_file.write(label + "\n")

        with open(str(save_path / "rel_labels.tsv"), 'w') as l_export_file:
            for label in idx2rel.values():
                l_export_file.write(label + "\n")

        for named_embedding in self.model.parameter_list:
            # Only matrices can be written row by row; skipping other shapes
            # avoids exporting the rows of a previously processed embedding.
            if len(named_embedding.weight.shape) != 2:
                continue

            all_embs = named_embedding.weight.detach().cpu().numpy()

            with open(str(save_path / ("%s.tsv" % named_embedding.name)), 'w') as v_export_file:
                for row in all_embs:
                    v_export_file.write("\t".join([str(x) for x in row]) + "\n")

    def save_training_result(self):
        """Function that saves training result"""
        files = os.listdir(str(self.config.path_result))
        # Number of existing training-result files of this model; used as the
        # suffix so that earlier results are not overwritten.
        file_no = len([f for f in files if self.model.model_name in f if 'Training' in f])
        df = pd.DataFrame(self.training_results, columns=['Epochs', 'Loss'])
        result_file = self.config.path_result / (self.model.model_name + '_Training_results_' + str(file_no) + '.csv')
        with open(str(result_file), 'w') as fh:
            df.to_csv(fh)
class Evaluator:
    """Class to perform evaluation of the model.

    For every evaluated triple the head and the tail are ranked against all
    entities, and the rankings are handed to a ``MetricCalculator``.

    Args:
        model (object): Model object
        config (object): Configuration object. This class reads its
            ``knowledge_graph``, ``device``, ``tot_entity``, ``tot_relation``,
            ``test_num``, ``debug`` and ``epochs`` attributes.
        tuning (bool): Flag to denoting tuning if True

    Examples:
        >>> from pykg2vec.utils.evaluator import Evaluator
        >>> evaluator = Evaluator(model=model, config=config, tuning=True)
        >>> scores = evaluator.mini_test(epoch=0)
    """
    _logger = Logger().get_logger(__name__)

    # Upper bound on the number of triples evaluated when ``config.debug`` is set.
    _DEBUG_TEST_SIZE = 10

    def __init__(self, model, config, tuning=False):
        self.model = model
        self.config = config
        self.tuning = tuning
        self.test_data = self.config.knowledge_graph.read_cache_data('triplets_test')
        self.eval_data = self.config.knowledge_graph.read_cache_data('triplets_valid')
        self.metric_calculator = MetricCalculator(self.config)

    def _single_index(self, index):
        """Wrap ``index`` in a one-element LongTensor on the configured device."""
        return torch.LongTensor([index]).to(self.config.device)

    def _repeated_index(self, index, count):
        """Return a LongTensor holding ``index`` repeated ``count`` times."""
        return torch.LongTensor([index]).repeat([count]).to(self.config.device)

    def _all_indices(self, count):
        """Return the LongTensor ``[0, 1, ..., count - 1]`` on the configured device."""
        return torch.arange(count, dtype=torch.long).to(self.config.device)

    def _tensor_or_index(self, index):
        """Move a tensor to the device as-is; wrap anything else like ``_single_index``."""
        if isinstance(index, torch.Tensor):
            return index.to(self.config.device)
        return self._single_index(index)

    @staticmethod
    def _top_indices(preds, topk):
        """Return the indices chosen by ``torch.topk`` over the last dimension.

        A negative ``topk`` (the public default of ``-1``) selects every
        candidate, since ``torch.topk`` itself rejects a negative ``k``.
        """
        k = preds.shape[-1] if topk < 0 else topk
        # NOTE(review): torch.topk puts the largest values first; confirm this
        # matches the score convention of ``model.forward``.
        _, rank = torch.topk(preds, k=k)
        return rank

    def test_tail_rank(self, h, r, topk=-1):
        """Rank all entities as tail candidates for the pair ``(h, r)``.

        Uses ``model.predict_tail_rank`` when the model provides it; otherwise
        scores every entity with ``model.forward``.

        Args:
            h: Head entity id (a number or a one-element tensor).
            r: Relation id (a number or a one-element tensor).
            topk (int): Number of candidates to return; negative means all
                in the ``model.forward`` fallback.

        Returns:
            Tensor of ranked entity indices.
        """
        if hasattr(self.model, 'predict_tail_rank'):
            rank = self.model.predict_tail_rank(self._single_index(h), self._single_index(r), topk=topk)
            return rank.squeeze(0)

        total = self.config.tot_entity
        h_batch = self._repeated_index(h, total)
        r_batch = self._repeated_index(r, total)
        entity_array = self._all_indices(total)

        preds = self.model.forward(h_batch, r_batch, entity_array)
        return self._top_indices(preds, topk)

    def test_head_rank(self, r, t, topk=-1):
        """Rank all entities as head candidates for the pair ``(r, t)``.

        Uses ``model.predict_head_rank`` when the model provides it; otherwise
        scores every entity with ``model.forward``.

        Args:
            r: Relation id (a number or a one-element tensor).
            t: Tail entity id (a number or a one-element tensor).
            topk (int): Number of candidates to return; negative means all
                in the ``model.forward`` fallback.

        Returns:
            Tensor of ranked entity indices.
        """
        if hasattr(self.model, 'predict_head_rank'):
            # The model hook takes the tail first, then the relation.
            rank = self.model.predict_head_rank(self._single_index(t), self._single_index(r), topk=topk)
            return rank.squeeze(0)

        total = self.config.tot_entity
        entity_array = self._all_indices(total)
        r_batch = self._repeated_index(r, total)
        t_batch = self._repeated_index(t, total)

        preds = self.model.forward(entity_array, r_batch, t_batch)
        return self._top_indices(preds, topk)

    def test_rel_rank(self, h, t, topk=-1):
        """Rank all relations as candidates linking ``h`` and ``t``.

        Uses ``model.predict_rel_rank`` when the model provides it; otherwise
        scores every relation with ``model.forward``.

        Args:
            h: Head entity id. Tensors are forwarded to the model hook
                unchanged; plain numbers are wrapped in a one-element tensor.
            t: Tail entity id, handled like ``h``.
            topk (int): Number of candidates to return; negative means all
                in the ``model.forward`` fallback.

        Returns:
            Tensor of ranked relation indices.
        """
        if hasattr(self.model, 'predict_rel_rank'):
            # TODO: This is not implemented for conve, convkb, proje_pointwise, tucker, interacte and hyper
            rank = self.model.predict_rel_rank(self._tensor_or_index(h), self._tensor_or_index(t), topk=topk)
            return rank.squeeze(0)

        total = self.config.tot_relation
        h_batch = self._repeated_index(h, total)
        rel_array = self._all_indices(total)
        t_batch = self._repeated_index(t, total)

        preds = self.model.forward(h_batch, rel_array, t_batch)
        return self._top_indices(preds, topk)

    def mini_test(self, epoch=None):
        """Evaluate on the validation set, limited by ``config.test_num``.

        ``config.test_num == 0`` means the whole validation set. In debug mode
        at most ``_DEBUG_TEST_SIZE`` triples are evaluated.

        Args:
            epoch: Epoch number forwarded to ``test``.

        Returns:
            Whatever ``test`` returns (the current metric scores).
        """
        available = len(self.eval_data)
        if self.config.test_num == 0:
            tot_valid_to_test = available
        else:
            tot_valid_to_test = min(self.config.test_num, available)
        if self.config.debug:
            # Never ask for more triples than exist, or test() would index out of range.
            tot_valid_to_test = min(self._DEBUG_TEST_SIZE, available)

        self._logger.info("Mini-Testing on [%d/%d] Triples in the valid set." % (tot_valid_to_test, available))
        return self.test(self.eval_data, tot_valid_to_test, epoch=epoch)

    def full_test(self, epoch=None):
        """Evaluate on the whole test set (at most ``_DEBUG_TEST_SIZE`` triples in debug mode).

        Args:
            epoch: Epoch number forwarded to ``test``.

        Returns:
            Whatever ``test`` returns (the current metric scores).
        """
        available = len(self.test_data)
        tot_valid_to_test = available
        if self.config.debug:
            # Never ask for more triples than exist, or test() would index out of range.
            tot_valid_to_test = min(self._DEBUG_TEST_SIZE, available)

        self._logger.info("Full-Testing on [%d/%d] Triples in the test set." % (tot_valid_to_test, available))
        return self.test(self.test_data, tot_valid_to_test, epoch=epoch)

    def test(self, data, num_of_test, epoch=None):
        """Rank heads and tails for the first ``num_of_test`` triples of ``data``.

        Args:
            data: Indexable collection of triples exposing ``h``, ``r`` and ``t``.
            num_of_test (int): Number of leading triples to evaluate.
            epoch: Epoch number stored alongside each result.

        Returns:
            The value of ``metric_calculator.get_curr_scores()``.
        """
        self.metric_calculator.reset()

        progress_bar = tqdm(range(num_of_test))
        for i in progress_bar:
            h, r, t = data[i].h, data[i].r, data[i].t

            # generate head batch and predict heads.
            h_tensor = torch.LongTensor([h])
            r_tensor = torch.LongTensor([r])
            t_tensor = torch.LongTensor([t])

            hrank = self.test_head_rank(r_tensor, t_tensor, self.config.tot_entity)
            trank = self.test_tail_rank(h_tensor, r_tensor, self.config.tot_entity)

            result_data = [trank.detach().cpu().numpy(), hrank.detach().cpu().numpy(), h, r, t, epoch]
            self.metric_calculator.append_result(result_data)

        self.metric_calculator.settle()
        self.metric_calculator.display_summary()

        # NOTE(review): presumably metric_calculator.epoch mirrors the ``epoch``
        # passed in above; confirm it is comparable to an int when epoch is None.
        if self.metric_calculator.epoch >= self.config.epochs - 1:
            self.metric_calculator.save_test_summary(self.model.model_name)

        return self.metric_calculator.get_curr_scores()
class BaysOptimizer(object):
    """Bayesian optimizer class for tuning hyperparameter.

    This class implements the Bayesian Optimizer for tuning the
    hyper-parameter.

    Args:
        args (object): The Argument Parser object providing arguments. It is
            required despite the ``None`` default; this class reads its
            ``model``, ``dataset_name``, ``dataset_path``, ``debug`` and
            ``max_number_trials`` attributes.

    Examples:
        >>> from pykg2vec.config.hyperparams import KGETuneArgParser
        >>> from pykg2vec.utils.bayesian_optimizer import BaysOptimizer
        >>> args = KGETuneArgParser().get_args(sys.argv[1:])
        >>> bays_opt = BaysOptimizer(args=args)
        >>> bays_opt.optimize()
        >>> best = bays_opt.return_best()
    """
    _logger = Logger().get_logger(__name__)

    # Models that are rejected up front because tuning does not support them.
    _UNSUPPORTED_MODELS = ("tucker", "tucker_v2", "conve", "convkb", "proje_pointwise")

    def __init__(self, args=None):
        """Resolve the model, its config and its search space, and build the trainer.

        Raises:
            Exception: If ``args.model`` is one of the unsupported models.
            ModuleNotFoundError: If a module for the model cannot be imported.
            KeyError: If the model name is missing from the lookup maps.
        """
        model_name = args.model.lower()
        if model_name in self._UNSUPPORTED_MODELS:
            raise Exception("Model %s has not been supported in tuning hyperparameters!" % args.model)

        self.args = args
        self.knowledge_graph = KnowledgeGraph(dataset=args.dataset_name, custom_dataset_path=args.dataset_path)

        try:
            self.model_obj = getattr(importlib.import_module(model_path + ".%s" % moduleMap[model_name]),
                                     modelMap[model_name])
            self.config_obj = getattr(importlib.import_module(config_path), configMap[model_name])
            hyper_params = getattr(importlib.import_module(hyper_param_path), hypMap[model_name])()
        except (ModuleNotFoundError, KeyError):
            self._logger.error("%s not implemented! Select from: %s" % \
                               (model_name, ' '.join(map(str, modelMap.values()))))
            # Without the model/config/search space the rest of __init__ cannot
            # succeed, so surface the real cause instead of failing later on an
            # unset attribute.
            raise

        from pykg2vec.config.config import KGEArgParser
        kge_args = KGEArgParser().get_args([])
        kge_args.dataset_name = args.dataset_name
        kge_args.debug = self.args.debug
        config = self.config_obj(kge_args)
        model = self.model_obj(config)

        self.trainer = Trainer(model)

        self.search_space = hyper_params.search_space
        # A single trial is enough in debug mode.
        self.max_evals = self.args.max_number_trials if not self.args.debug else 1

    def optimize(self):
        """Function that performs bayesian optimization.

        Runs ``fmin`` over the search space, stores the outcome in
        ``self.best_result``, and writes one row per trial to ``trials.csv``
        under ``<path_result>/<model_name>``.
        """
        trials = Trials()

        self.best_result = fmin(fn=self.get_loss, space=self.search_space, trials=trials,
                                algo=tpe.suggest, max_evals=self.max_evals)

        columns = list(self.search_space.keys())
        results = pd.DataFrame(columns=['iteration'] + columns + ['loss'])

        for idx, trial in enumerate(trials.trials):
            # Each recorded value is a one-element list; unwrap it before
            # translating it back through the search space.
            sampled = {k: v[0] for k, v in trial['misc']['vals'].items()}
            translated_eval = space_eval(self.search_space, sampled)
            results.loc[idx] = [idx] + [translated_eval[k] for k in columns] + [trial['result']['loss']]

        path = self.trainer.config.path_result / self.trainer.model.model_name
        path.mkdir(parents=True, exist_ok=True)
        results.to_csv(str(path / "trials.csv"), index=False)

        self._logger.info(results)
        self._logger.info('Found Golden Setting:')
        self._logger.info(space_eval(self.search_space, self.best_result))

    def return_best(self):
        """Function to return the best hyper-parameters.

        Reads ``self.best_result``, which is only set by ``optimize``.
        """
        return space_eval(self.search_space, self.best_result)

    def get_loss(self, params):
        """Function that defines and acquires the loss.

        Args:
            params (dict): Hyper-parameter values for this trial, keyed by name.

        Returns:
            dict: ``{'loss': <value from trainer.tune_model()>, 'status': STATUS_OK}``.
        """
        trainer_config = self.trainer.config

        # copy the hyperparameters to trainer config and hyperparameter set.
        for key, value in params.items():
            trainer_config.__dict__[key] = value
            trainer_config.hyperparameters[key] = value

        # configure common setting for a tuning training.
        trainer_config.disp_result = False
        trainer_config.disp_summary = False
        trainer_config.save_model = False

        # do not overwrite test numbers if set
        if trainer_config.test_num is None:
            trainer_config.test_num = 1000

        if self.args.debug:
            trainer_config.epochs = 1
            trainer_config.hyperparameters['epochs'] = 1

        # start the trial.
        self.trainer.build_model()
        loss = self.trainer.tune_model()

        return {'loss': loss, 'status': STATUS_OK}