@classmethod
def load_from_npz(cls, path: str, dataset: Dataset = None) -> 'Embedding':
    """Load an embedding from a .npz archive created by `save_to_npz`."""
    with np.load(path) as data:
        if dataset is not None:
            embs = cls(dataset, None)
        else:
            embs = cls(Dataset(), None)
        embs.entity_embeddings = data['entity_embeddings']
        # Relation embeddings are optional in the archive.
        try:
            embs.relationship_embeddings = data['relationship_embeddings']
        except KeyError:
            pass
    return embs
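# Hedged usage sketch (not from the library source): a round trip through
# the .npz format, assuming save_to_npz() writes the same archive keys
# ('entity_embeddings', 'relationship_embeddings') that load_from_npz()
# reads back. Paths and the TransE model choice are illustrative.
from pyke.dataset import Dataset
from pyke.embedding import Embedding
from pyke.models import TransE

dataset = Dataset("./benchmarks/fb15k.nt")
embs = Embedding(dataset, TransE, epochs=10)
embs.train(prefix="./TransE")
embs.save_to_npz("transe_embs.npz")
restored = Embedding.load_from_npz("transe_embs.npz", dataset=dataset)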
def test_init(self):
    ds = Dataset(self.nt_filename, temp_dir="tests/tmp")
    tempdir = os.path.join(os.curdir, "tests/tmp",
                           "da8c86a9fd1a62dc6bb6979203498a31")
    self.assertTrue(os.path.exists(os.path.join(tempdir, "entity2id.txt")))
    self.assertTrue(os.path.exists(os.path.join(tempdir, "relation2id.txt")))
    self.assertTrue(os.path.exists(os.path.join(tempdir, "train2id.txt")))
    self.assertEqual(ds.size, 8)
    self.assertEqual(len(ds), 8)
    self.assertEqual(ds.ent_count, 11)
    self.assertEqual(ds.rel_count, 6)
    self.assertEqual(ds.shape, (11, 6))
def main():
    # load dataset
    dataset = Dataset("../benchmarks/fb15k.nt")

    # load embeddings
    embeddings = Embedding(dataset, TransE)
    embeddings.restore(prefix='../checkpoints/TransE/TransE')

    # alternatively load embeddings from numpy matrix
    embs = Embedding(dataset, TransE)
    embs.load_embeddings_from_npy('../embeddings/TransE/TransE.npy')

    # query embeddings
    print(embeddings['/m/02f75t'])
    print(embs['/m/02f75t'])
    print()
    print(embeddings['foobar'])
    print(embs['foobar'])
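# Hedged helper sketch: the 'foobar' lookups above fail for labels that
# are not in the dataset. This assumes dict-style lookups raise KeyError
# for unknown labels; the helper itself is not part of the library.
def lookup(embedding, label):
    """Return the vector for `label`, or None if the label is unknown."""
    try:
        return embedding[label]
    except KeyError:
        return None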
def build_dataset(generate_validation_test, validation_file, test_file, out, file_in):
    """Create npz dataset file"""
    file_in = Path(file_in)
    if not file_in.exists():
        click.echo(f'The file {file_in} does not exist')
        sys.exit(1)
    if out is None:
        out = f'./{file_in.with_suffix(".npz")}'

    if validation_file is not None and test_file is not None:
        dataset = Dataset(train_file=str(file_in), valid_file=validation_file,
                          test_file=test_file, generate_valid_test=True)
    elif validation_file is None and test_file is None and generate_validation_test:
        dataset = Dataset(train_file=str(file_in), generate_valid_test=True)
    else:
        dataset = Dataset(train_file=str(file_in))

    dataset.to_npz(out_path=out)
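# Minimal sketch of what build_dataset does for a plain training file
# without validation/test splits (mirrors the else-branch above; the
# input path is illustrative).
from pathlib import Path
from pyke.dataset import Dataset

file_in = Path("./benchmarks/fb15k.nt")
dataset = Dataset(train_file=str(file_in))
dataset.to_npz(out_path=f'./{file_in.with_suffix(".npz")}')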
def __init__(self, dataset: Dataset = None, model_class: type = None, out_path=None, **kwargs):
    self.dataset = dataset or Dataset()
    self.model_class = model_class
    # self.__model = None
    self.__config = None
    self.__library = Library.get_library()
    # Training args
    self.neg_ent = 1
    self.neg_rel = 0
    self.bern = True
    self.workers = 1
    self.folds = 20
    self.epochs = 50
    self.optimizer = "SGD"
    self.per_process_gpu_memory_fraction = 0.5
    self.learning_rate = 0.01
    # Model-specific parameters
    self.dimension = 50  # ComplEx, DistMult, HolE, RESCAL, TransD, TransE, TransH
    self.ent_dim = 50  # TransR
    self.rel_dim = 10  # TransR
    self.margin = 1.0  # HolE, RESCAL, TransD, TransE, TransH, TransR
    self.weight = 0.0001  # ComplEx, DistMult
    # Used to provide easier access to embeddings
    self.entity_embeddings = None
    self.relationship_embeddings = None
    self.rankings: pd.DataFrame = None
    self.out_path = out_path

    # Apply keyword-argument overrides for the attributes above
    for key, value in kwargs.items():
        if hasattr(self, key):
            setattr(self, key, value)

    if model_class is not None:
        self.__init_config()
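# Hedged sketch of the **kwargs mechanism above: any attribute defined in
# __init__ can be overridden by name at construction time. The TransR
# import path and the dimension values are illustrative assumptions.
from pyke.dataset import Dataset
from pyke.models import TransR

dataset = Dataset("./benchmarks/fb15k.nt")
emb = Embedding(dataset, TransR, ent_dim=100, rel_dim=20, margin=2.0)
assert emb.ent_dim == 100  # the default of 50 was overridden via setattr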
def compute(model, n_batches, epochs, neg_ent, neg_rel, bern, workers, optimizer,
            dims, margin, eval, out, json, valid_file, test_file, file_path):
    """Train an embedding model on a knowledge base and optionally evaluate it."""
    file_path = Path(file_path)
    if file_path.suffix == '.npz':
        dataset = Dataset.from_npz(file_path)
    elif valid_file is not None and test_file is not None:
        dataset = Dataset(train_file=str(file_path), valid_file=valid_file,
                          test_file=test_file, generate_valid_test=True)
    elif valid_file is None and test_file is None and eval:
        dataset = Dataset(train_file=str(file_path), generate_valid_test=True)
    else:
        dataset = Dataset(train_file=str(file_path))

    click.echo("Start training using the following parameters: ")
    click.echo("-----------------------------------------------")
    click.echo(f"Knowledge Base: {file_path}")
    click.echo(f"Batches: {n_batches} => batch size {int(dataset.size / n_batches)}")
    click.echo(f"Epochs: {epochs}")
    click.echo(f"Neg_Ent: {neg_ent}")
    click.echo(f"Neg_Rel: {neg_rel}")
    click.echo(f"bern: {bern}")
    click.echo(f"Workers: {workers}")
    click.echo(f"Optimizer: {optimizer}")
    click.echo(f"Dimensionality: {dims}")
    click.echo(f"Margin: {margin}")
    click.echo(f"Output directory: {out}")
    click.echo("-----------------------------------------------")

    embedding = Embedding(
        dataset,
        get_model(model),
        folds=n_batches,
        epochs=epochs,
        neg_ent=neg_ent,
        neg_rel=neg_rel,
        bern=bern,
        workers=workers,
        optimizer=optimizer,
        dimension=dims,  # TransE-specific
        margin=margin,  # TransE-specific
        out_path=out,
    )

    checkpoint_path = Path(f'./checkpoints/{model}')
    out_path = Path(f'{out}/{model}/{dataset.name}')
    if not out_path.exists():
        click.echo(f'Creating output path: {out_path}')
        out_path.mkdir(parents=True)
    if not checkpoint_path.exists():
        click.echo(f'Creating checkpoint directory: {checkpoint_path}')
        checkpoint_path.mkdir(parents=True)

    # if dataset is not written out, do so
    # if not (out_path / f'{dataset.name}_dataset.npz').exists():
    #     dataset.to_npz(out_path / f'{dataset.name}_dataset.npz')

    # Train the model. It is saved in the process.
    embedding.train(prefix=str(checkpoint_path / dataset.name))

    # Save the embedding to a JSON file
    if json:
        embedding.save_to_json(
            f'{out_path}/{dataset.name}_{model.lower()}_{optimizer.lower()}_{dims}_embs.json')

    # Save the embedding as a numpy (.npz) archive
    archive_name = f'{out_path}/{dataset.name}_{model.lower()}_{optimizer.lower()}_{dims}_embs.npz'
    embedding.save_to_npz(archive_name)

    if eval:
        rank_predictions = embedding.get_predictions()
        # rank_predictions.to_csv(f'{out_path}/{dataset.name}_rank_predictions.csv')
        results = calc_metrics(rank_predictions=rank_predictions)
        if (out_path / f'{dataset.name}_metrics.csv').exists():
            # Append to the existing metrics file, keeping a running epoch count.
            df = pd.read_csv(str(out_path / f'{dataset.name}_metrics.csv'), index_col=0)
            prev_epochs = df.iloc[-1]['epochs']
            results['epochs'] = int(prev_epochs + epochs)
            df = df.append(results, ignore_index=True)
            df.to_csv(str(out_path / f'{dataset.name}_metrics.csv'))
            rank_predictions.to_csv(
                f'{out_path}/{dataset.name}_rank_predictions_{int(prev_epochs + epochs)}.csv')
            print(df)
        else:
            results['epochs'] = epochs
            results.to_csv(str(out_path / f'{dataset.name}_metrics.csv'))
            rank_predictions.to_csv(f'{out_path}/{dataset.name}_rank_predictions_{epochs}.csv')
            print(results)
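# Hedged sketch: resuming work from a dataset archive via the same
# Dataset.from_npz path taken above (file name, model choice, and
# checkpoint prefix are illustrative assumptions).
from pathlib import Path
from pyke.dataset import Dataset
from pyke.embedding import Embedding
from pyke.models import TransE

dataset = Dataset.from_npz(Path("./fb15k.npz"))
embedding = Embedding(dataset, TransE, folds=100, epochs=50)
embedding.train(prefix=f'./checkpoints/TransE/{dataset.name}')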
from pyke.dataset import Dataset
from pyke.embedding import Embedding
from pyke.models import DistMult

# Read the dataset
dataset = Dataset("./benchmarks/fb15k.nt")

embedding = Embedding(
    dataset,
    DistMult,
    folds=100,
    epochs=20,
    neg_ent=1,
    neg_rel=0,
    bern=False,
    workers=4,
    dimension=50,  # DistMult-specific
    weight=0.0001,  # DistMult-specific
    learning_rate=0.1,
    optimizer="Adagrad",
)

# Train the model. It is saved in the process.
embedding.train(prefix="./DistMult")

# Save the embedding to a JSON file
embedding.save_to_json("DistMult.json")
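# The trained embedding can also be written as a NumPy archive via
# save_to_npz, as the CLI command above does; the file name here is
# illustrative.
embedding.save_to_npz("DistMult.npz")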
def setUp(self):
    self.nt_filename = "tests/resources/test.nt"
    self.dataset = Dataset(self.nt_filename, temp_dir="tests/tmp")
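# Hedged companion sketch (not from the test suite): a tearDown that
# clears the temp_dir between runs. Assumes `import shutil` at module
# level and that nothing else relies on tests/tmp persisting.
def tearDown(self):
    shutil.rmtree("tests/tmp", ignore_errors=True)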