Example 1
    @classmethod
    def load_from_npz(cls, path: str, dataset: Dataset = None) -> 'Embedding':
        """Load entity (and, if present, relationship) embeddings from a .npz archive."""
        with np.load(path) as data:
            if dataset is not None:
                embs = cls(dataset, None)
            else:
                embs = cls(Dataset(), None)
            embs.entity_embeddings = data['entity_embeddings']
            try:
                embs.relationship_embeddings = data['relationship_embeddings']
            except KeyError:
                # archives without relationship embeddings are accepted as-is
                pass
            return embs
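A usage sketch, assuming the method is bound to Embedding as a classmethod (its cls parameter suggests so) and that a hypothetical TransE_embs.npz archive exists, e.g. one written by save_to_npz as in Example 6:

from pyke.embedding import Embedding

embs = Embedding.load_from_npz('TransE_embs.npz')
print(embs.entity_embeddings.shape)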
Example 2
    def test_init(self):
        ds = Dataset(self.nt_filename, temp_dir="tests/tmp")
        tempdir = os.path.join(os.curdir, "tests/tmp",
                               "da8c86a9fd1a62dc6bb6979203498a31")

        self.assertTrue(os.path.exists(os.path.join(tempdir, "entity2id.txt")))
        self.assertTrue(
            os.path.exists(os.path.join(tempdir, "relation2id.txt")))
        self.assertTrue(os.path.exists(os.path.join(tempdir, "train2id.txt")))
        self.assertEqual(ds.size, 8)
        self.assertEqual(len(ds), 8)
        self.assertEqual(ds.ent_count, 11)
        self.assertEqual(ds.rel_count, 6)
        self.assertEqual(ds.shape, (11, 6))
Example 3
from pyke.dataset import Dataset
from pyke.embedding import Embedding
from pyke.models import TransE


def main():
    # load dataset
    dataset = Dataset("../benchmarks/fb15k.nt")

    # load embeddings
    embeddings = Embedding(dataset, TransE)
    embeddings.restore(prefix='../checkpoints/TransE/TransE')

    # alternatively load embeddings from numpy matrix
    embs = Embedding(dataset, TransE)
    embs.load_embeddings_from_npy('../embeddings/TransE/TransE.npy')

    # query embeddings
    print(embeddings['/m/02f75t'])
    print(embs['/m/02f75t'])
    print()
    print(embeddings['foobar'])
    print(embs['foobar'])
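The last two lookups query a label ('foobar') that is unlikely to exist in fb15k. How __getitem__ reacts to unknown labels is not shown here; assuming it raises a KeyError, the lookup can be guarded:

try:
    vec = embs['foobar']
except KeyError:
    vec = None  # label not in the dataset's entity vocabulary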
Example 4
import sys
from pathlib import Path

import click

from pyke.dataset import Dataset


def build_dataset(generate_validation_test,
                  validation_file,
                  test_file,
                  out,
                  file_in):
    """Create npz dataset file"""
    file_in = Path(file_in)
    if not file_in.exists():
        click.echo(f'The file {file_in} does not exist')
        sys.exit(1)

    if out is None:
        out = f'./{file_in.with_suffix(".npz")}'

    if validation_file is not None and test_file is not None:
        dataset = Dataset(train_file=str(file_in),
                          valid_file=validation_file,
                          test_file=test_file,
                          generate_valid_test=True)
    elif validation_file is None and test_file is None and generate_validation_test:
        dataset = Dataset(train_file=str(file_in), generate_valid_test=True)
    else:
        dataset = Dataset(train_file=str(file_in))
    dataset.to_npz(out_path=out)
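The archive written by to_npz can be read back with Dataset.from_npz, which Example 6 uses; a minimal round trip, assuming a hypothetical train.nt input file:

from pyke.dataset import Dataset

dataset = Dataset(train_file='train.nt')
dataset.to_npz(out_path='train.npz')
restored = Dataset.from_npz('train.npz')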
Example 5
    def __init__(self,
                 dataset: Dataset = None,
                 model_class: type = None,
                 out_path=None,
                 **kwargs):
        self.dataset = dataset or Dataset()
        self.model_class = model_class
        # self.__model = None
        self.__config = None
        self.__library = Library.get_library()
        # Training args
        self.neg_ent = 1
        self.neg_rel = 0
        self.bern = True
        self.workers = 1
        self.folds = 20
        self.epochs = 50
        self.optimizer = "SGD"
        self.per_process_gpu_memory_fraction = 0.5
        self.learning_rate = 0.01
        # Model specific parameters
        self.dimension = 50  # ComplEx, DistMult, HolE, RESCAL, TransD, TransE, TransH
        self.ent_dim = 50  # TransR
        self.rel_dim = 10  # TransR
        self.margin = 1.0  # HolE, RESCAL, TransD, TransE, TransH, TransR
        self.weight = 0.0001  # ComplEx, DistMult
        # used to provide easier access to embeddings.
        self.entity_embeddings = None
        self.relationship_embeddings = None
        self.rankings: pd.DataFrame = None
        self.out_path = out_path

        # Apply kwargs; keys that do not match an existing attribute are silently ignored
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)
        if model_class is not None:
            self.__init_config()
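Because the constructor only copies keyword arguments that match an existing attribute, a misspelled option is dropped silently rather than raising an error. A minimal sketch of that behaviour (it assumes, as Example 1 does, that Embedding and Dataset can be constructed without arguments):

from pyke.embedding import Embedding

emb = Embedding(epochs=100, learning_rte=0.1)  # note the typo
print(emb.epochs)                    # 100 -- matched an existing attribute
print(hasattr(emb, 'learning_rte'))  # False -- the typo was silently ignored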
Example 6
def compute(model,
            n_batches,
            epochs,
            neg_ent,
            neg_rel,
            bern,
            workers,
            optimizer,
            dims,
            margin,
            eval,
            out,
            json,
            valid_file,
            test_file,
            file_path):
    """Initializes the repository."""
    file_path = Path(file_path)

    if file_path.suffix == '.npz':
        dataset = Dataset.from_npz(file_path)
    elif valid_file is not None and test_file is not None:
        dataset = Dataset(train_file=str(file_path),
                          valid_file=valid_file,
                          test_file=test_file,
                          generate_valid_test=True)
    elif valid_file is None and test_file is None and eval:
        dataset = Dataset(train_file=str(file_path), generate_valid_test=True)
    else:
        dataset = Dataset(train_file=str(file_path))

    click.echo("Start training using the following parameters: ")
    click.echo("-----------------------------------------------")
    click.echo(f"Knowledge Base: {file_path}")
    click.echo(f"Batch number: {n_batches} => {int(dataset.size / n_batches)} total batch size")
    click.echo(f"Epochs: {epochs}")
    click.echo(f"Neg_Ent: {neg_ent}")
    click.echo(f"Neg_Rel: {neg_rel}")
    click.echo(f"bern: {bern}")
    click.echo(f"Workers: {workers}")
    click.echo(f"Optimizer: {optimizer}")
    click.echo(f"Dimensionality: {dims}")
    click.echo(f"Margin: {margin}")
    click.echo(f"Output directory: {out}")
    click.echo("-----------------------------------------------")

    embedding = Embedding(
        dataset,
        get_model(model),
        folds=n_batches,
        epochs=epochs,
        neg_ent=neg_ent,
        neg_rel=neg_rel,
        bern=bern,
        workers=workers,
        optimizer=optimizer,
        dimension=dims,  # used by most models (see the Embedding defaults in Example 5)
        margin=margin,  # used by margin-based models such as TransE
        out_path=out
    )

    checkpoint_path = Path(f'./checkpoints/{model}')
    out_path = Path(f'{out}/{model}/{dataset.name}')

    if not out_path.exists():
        click.echo(f'Creating output path: {out_path}')
        out_path.mkdir(parents=True)

    # Train the model. It is saved in the process.
    if not checkpoint_path.exists():
        click.echo(f'Creating checkpoint directory: {checkpoint_path}')
        checkpoint_path.mkdir(parents=True)

    # if dataset is not written out, do so
    # if not (out_path / f'{dataset.name}_dataset.npz').exists():
    #    dataset.to_npz(out_path / f'{dataset.name}_dataset.npz')

    embedding.train(prefix=str(checkpoint_path / dataset.name))

    # Save the embedding to a JSON file
    if json:
        embedding.save_to_json(f'{out_path}/{dataset.name}_{model.lower()}_{optimizer.lower()}_{dims}_embs.json')
    # Save the embedding as numpy (.npz) file
    archive_name = f'{out_path}/{dataset.name}_{model.lower()}_{optimizer.lower()}_{dims}_embs.npz'
    embedding.save_to_npz(archive_name)

    if eval:
        rank_predictions = embedding.get_predictions()
        # rank_predictions.to_csv(f'{out_path}/{dataset.name}_rank_predictions.csv')

        results = calc_metrics(rank_predictions=rank_predictions)
        if (out_path / f'{dataset.name}_metrics.csv').exists():
            df = pd.read_csv(str(out_path / f'{dataset.name}_metrics.csv'), index_col=0)
            prev_epochs = df.iloc[-1]['epochs']
            results['epochs'] = int(prev_epochs + epochs)
            # DataFrame.append was removed in pandas 2.0; concat is the replacement
            df = pd.concat([df, results], ignore_index=True)
            df.to_csv(str(out_path / f'{dataset.name}_metrics.csv'))
            rank_predictions.to_csv(f'{out_path}/{dataset.name}_rank_predictions_{int(prev_epochs + epochs)}.csv')
            print(df)
        else:
            results['epochs'] = epochs
            results.to_csv(str(out_path / f'{dataset.name}_metrics.csv'))
            rank_predictions.to_csv(f'{out_path}/{dataset.name}_rank_predictions_{epochs}.csv')
            print(results)
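The metrics bookkeeping above carries the cumulative epoch count forward before appending a new row. The same pattern in isolation, with plain pandas and hypothetical column names:

import pandas as pd

history = pd.DataFrame([{'epochs': 50, 'mean_rank': 120.0}])
new_row = pd.DataFrame([{'epochs': 0, 'mean_rank': 95.0}])
# accumulate epochs across runs, as compute() does
new_row['epochs'] = int(history.iloc[-1]['epochs']) + 50
history = pd.concat([history, new_row], ignore_index=True)
print(history)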
Example 7
from pyke.dataset import Dataset
from pyke.embedding import Embedding
from pyke.models import DistMult

# Read the dataset
dataset = Dataset("./benchmarks/fb15k.nt")
embedding = Embedding(
    dataset,
    DistMult,
    folds=100,
    epochs=20,
    neg_ent=1,
    neg_rel=0,
    bern=False,
    workers=4,
    dimension=50,  # DistMult-specific
    weight=0.0001,  # DistMult-specific
    learning_rate=0.1,
    optimizer="Adagrad",
)

# Train the model. It is saved in the process.
embedding.train(prefix="./DistMult")

# Save the embedding to a JSON file
embedding.save_to_json("DistMult.json")
Example 8
    def setUp(self):
        self.nt_filename = "tests/resources/test.nt"
        self.dataset = Dataset(self.nt_filename, temp_dir="tests/tmp")
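A matching tearDown for the sketch above (it assumes everything under tests/tmp is disposable and that shutil is imported at module level):

    def tearDown(self):
        # remove the cache directory that Dataset created in setUp
        shutil.rmtree("tests/tmp", ignore_errors=True)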