def __init__(
    self,
    dataset_dir: str,
    num_workers: int,
    train_batch_size: int,
    val_batch_size: int,
    negative_sample_size: int,
    *args,
    **kwargs,
):
    """Set up a Knowledge Graph dataset from the files in a directory.

    Stores the given settings on the instance, builds the entity and
    relation ID dictionaries, and reads the train/validation/test triples.

    Args:
        dataset_dir: str
            path of the dataset directory to use; "entities.dict",
            "relations.dict", "train.txt", "valid.txt" and "test.txt"
            are read from it
        num_workers: int
            number of workers to use
        train_batch_size: int
            batch size to use for training
        val_batch_size: int
            batch size to use for validation and test
        negative_sample_size: int
            size of the negative samples
        *args, **kwargs:
            forwarded unchanged to the parent class initializer
    """
    super().__init__(*args, **kwargs)

    # Keep the configuration on the instance
    self.dataset_dir = dataset_dir
    self.num_workers = num_workers
    self.train_batch_size = train_batch_size
    self.val_batch_size = val_batch_size
    self.negative_sample_size = negative_sample_size

    def _dataset_file(filename):
        # Resolve a file name relative to the dataset directory
        return os.path.join(self.dataset_dir, filename)

    # Name -> ID lookup tables for entities and relations
    entity_ids = load_entities(_dataset_file("entities.dict"))
    self.entity2id = entity_ids
    relation_ids = load_relations(_dataset_file("relations.dict"))
    self.relation2id = relation_ids

    # Triples of each split, read with the lookup tables built above
    self.train_triples = read_triple(
        _dataset_file("train.txt"), entity_ids, relation_ids
    )
    self.val_triples = read_triple(
        _dataset_file("valid.txt"), entity_ids, relation_ids
    )
    self.test_triples = read_triple(
        _dataset_file("test.txt"), entity_ids, relation_ids
    )
def run_rgcn(
    dataset_path: str,
    ckpt_file: str,
    cfg: dict,
):
    """Runs experiment with the R-GCN model.

    Loads the dataset, trains a model if no checkpoint exists at
    ``ckpt_file`` (saving the weights with the best validation MRR), then
    loads the checkpoint into the model and prints the test metrics.

    Args:
        dataset_path: str
            path to the dataset; "entities.dict", "relations.dict",
            "train.txt", "valid.txt" and "test.txt" are read from it
        ckpt_file: str
            checkpoint file to the pretrained model; created by training
            when it does not exist yet
        cfg: dict
            configuration dictionary to use; reads ``cfg["rgcn"]["model"]``
            ("n_bases", "dropout", "reg_ratio"), ``cfg["rgcn"]["data"]``
            (passed to ``train_epoch``) and ``cfg["rgcn"]["training"]``
            ("learning_rate", "n_epochs", "grad_norm")
    """
    cfg_model = cfg["rgcn"]["model"]
    cfg_data = cfg["rgcn"]["data"]
    cfg_training = cfg["rgcn"]["training"]

    ## Load the dataset
    # Build dictionaries to translate entities/relations to their ID
    entity2id = load_entities(os.path.join(dataset_path, "entities.dict"))
    relation2id = load_relations(os.path.join(dataset_path, "relations.dict"))

    # Load training, validation and test triples
    train_triples = read_triple(
        os.path.join(dataset_path, "train.txt"), entity2id, relation2id
    )
    val_triples = read_triple(
        os.path.join(dataset_path, "valid.txt"), entity2id, relation2id
    )
    test_triples = read_triple(
        os.path.join(dataset_path, "test.txt"), entity2id, relation2id
    )

    # Build the data objects used by the model
    all_triples = torch.LongTensor(train_triples + val_triples + test_triples)
    train_triples = np.array(train_triples, dtype=int)
    val_triples = torch.LongTensor(val_triples)
    test_triples = torch.LongTensor(test_triples)

    n_entities = len(entity2id)
    n_relations = len(relation2id)

    # Build the test graph (from the training triples only)
    test_graph = build_test_graph(n_entities, n_relations, train_triples)

    # Create a model instance
    model = RGCN(
        n_entities,
        n_relations,
        cfg_model["n_bases"],
        cfg_model["dropout"],
        cfg_model["reg_ratio"],
    )

    # Train the model only when no pretrained checkpoint is available
    if not os.path.exists(ckpt_file):
        optimizer = torch.optim.Adam(
            model.parameters(), lr=cfg_training["learning_rate"]
        )
        best_mrr = 0.0
        for _ in tqdm(range(cfg_training["n_epochs"]), desc="Training epochs"):
            model.train()
            optimizer.zero_grad()
            loss = train_epoch(
                train_triples,
                model,
                cfg_data,
                cfg_model["reg_ratio"],
                n_entities,
                n_relations,
            )
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), cfg_training["grad_norm"])
            optimizer.step()

            # Evaluate the model
            val_metrics = validate(val_triples, test_graph, model, all_triples)

            # Save model checkpoint if best
            if val_metrics["mrr"] > best_mrr:
                best_mrr = val_metrics["mrr"]
                torch.save(model.state_dict(), ckpt_file)

        # No checkpoint is written above when the validation MRR never rises
        # over 0.0 or when n_epochs is 0; save the final weights so the load
        # below cannot fail on a missing file.
        if not os.path.exists(ckpt_file):
            torch.save(model.state_dict(), ckpt_file)

    # Test the pretrained model. load_state_dict updates `model` in place;
    # its return value only reports missing/unexpected keys, so it is not kept.
    model.load_state_dict(torch.load(ckpt_file))
    # Make sure dropout is disabled for testing, whether or not test()
    # switches the mode itself.
    model.eval()
    metrics = test(test_triples, model, test_graph, all_triples)
    print("Test metrics: ", metrics)