Code Example #1
File: leakage.py  Project: vishalbelsare/pykeen
def _main():
    """Test unleaking FB15K.

    Run with ``python -m pykeen.triples.leakage``.
    """
    from pykeen.datasets import get_dataset

    logging.basicConfig(format="pykeen: %(message)s", level=logging.INFO)

    fb15k = get_dataset(dataset="fb15k")
    click.echo(fb15k.summary_str())

    n = 401  # magic 401 from the paper
    train, test, validate = unleak(fb15k.training,
                                   fb15k.testing,
                                   fb15k.validation,
                                   n=n)
    click.echo("")
    click.echo(
        EagerDataset(train, test,
                     validate).summary_str(title="FB15k (cleaned)"))

    fb15k237 = get_dataset(dataset="fb15k237")
    click.echo("\nSummary FB15K-237")
    click.echo(fb15k237.summary_str())
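Every snippet on this page revolves around pykeen's get_dataset helper. As a minimal, self-contained sketch of the shared pattern (assuming a pykeen release where Dataset exposes get_normalized_name() and per-split triples factories with num_triples):

from pykeen.datasets import get_dataset

# Load a small built-in benchmark by name and inspect its splits.
dataset = get_dataset(dataset="nations")
print(dataset.get_normalized_name())   # normalized dataset name, e.g. "nations"
print(dataset.training.num_triples)    # size of the training split
print(dataset.testing.num_triples)     # size of the testing split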
Code Example #2
def iter_datasets(test: bool = False) -> Iterable[Dataset]:
    it = tqdm(datasets[:5] if test else datasets, desc='Datasets')
    for dataset in it:
        dataset_instance = get_dataset(dataset=dataset)
        it.write(f'loaded {dataset_instance.get_normalized_name()}')
        it.set_postfix(dataset=dataset_instance.get_normalized_name())
        yield dataset_instance
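The generator above depends on a module-level datasets collection and on tqdm, neither of which is shown. A hedged usage sketch, assuming datasets is simply a list of names accepted by get_dataset:

# Hypothetical: iterate over the first few datasets and report their sizes.
for dataset_instance in iter_datasets(test=True):
    print(dataset_instance.get_normalized_name(), dataset_instance.training.num_triples)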
Code Example #3
def _pre_instantiation_hook(
    self, kwargs: MutableMapping[str, Any]
) -> MutableMapping[str, Any]:  # noqa: D102
    kwargs = super()._pre_instantiation_hook(kwargs=kwargs)
    kwargs["labels"] = sorted(
        get_dataset(dataset="nations").entity_to_id.keys())
    return kwargs
Code Example #4
def _main(trials: int = 15):
    from pykeen.datasets import get_dataset
    import numpy as np
    import itertools as itt
    from tqdm import tqdm

    n_comb = trials * (trials - 1) // 2
    print(f'Number of combinations: {trials} n Choose 2 = {n_comb}')

    for dataset_name in [
            'nations',
            'umls',
            'kinships',
            'codexsmall',
            'wn18',
    ]:
        reference_dataset = get_dataset(dataset=dataset_name)
        remixed_datasets = [
            reference_dataset.remix(random_state=random_state)
            for random_state in range(trials)
        ]
        similarities = [
            a.similarity(b) for a, b in tqdm(
                itt.combinations(remixed_datasets, r=2),
                total=n_comb,
                desc=dataset_name,
            )
        ]
        print(
            f'[{dataset_name}] Similarities Mean: {np.mean(similarities):.3f}')
        print(
            f'[{dataset_name}] Similarities Std.: {np.std(similarities):.3f}')
        print(
            f'[{dataset_name}] Relative Std.: {np.std(similarities) / np.mean(similarities):.3%}'
        )
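For a single pair rather than all pairwise combinations, the remix/similarity calls used above reduce to the following; a minimal sketch assuming Dataset.remix() and Dataset.similarity() behave as in the example:

from pykeen.datasets import get_dataset

reference = get_dataset(dataset="nations")
remixed = reference.remix(random_state=0)   # redistribute triples across the train/test/validation splits
print(reference.similarity(remixed))        # similarity between the reference and the remixed dataset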
Code Example #5
File: leakage.py  Project: wuxiaoxue/pykeen
def _main():
    """Test unleaking FB15K.

    Run with ``python -m pykeen.triples.leakage``.
    """
    from pykeen.datasets import get_dataset
    logging.basicConfig(format='pykeen: %(message)s', level=logging.INFO)

    fb15k = get_dataset(dataset='fb15k')
    fb15k.summarize()

    n = 401  # magic 401 from the paper
    train, test, validate = unleak(fb15k.training, fb15k.testing, fb15k.validation, n=n)
    print()
    EagerDataset(train, test, validate).summarize(title='FB15k (cleaned)')

    fb15k237 = get_dataset(dataset='fb15k237')
    print('\nSummary FB15K-237')
    fb15k237.summarize()
Code Example #6
def _main():
    """Test unleaking FB15K.

    Run with ``python -m pykeen.triples.leakage``.
    """
    from pykeen.datasets import get_dataset
    logging.basicConfig(format='pykeen: %(message)s', level=logging.INFO)

    print('Summary FB15K')
    train, test, validate = get_dataset(dataset='fb15k')
    summarize(train, test, validate)

    print('\nSummary FB15K (cleaned)')
    train, test, validate = unleak(train, test, validate,
                                   n=401)  # magic 401 from the paper
    summarize(train, test, validate)

    print('\nSummary FB15K-237')
    train, test, validate = get_dataset(dataset='fb15k237')
    summarize(train, test, validate)
Code Example #7
File: utils.py  Project: shunsunsun/benchmarking
def get_model_size(  # noqa: C901
    *,
    dataset: Union[None, str, Type[Dataset]] = None,
    dataset_kwargs: Optional[Mapping[str, Any]] = None,
    training: Optional[TriplesFactory] = None,
    testing: Optional[TriplesFactory] = None,
    validation: Optional[TriplesFactory] = None,
    model: Union[str, Type[Model]],
    model_kwargs: Optional[Mapping[str, Any]] = None,
    loss: Union[None, str, Type[Loss]] = None,
    loss_kwargs: Optional[Mapping[str, Any]] = None,
    regularizer: Union[None, str, Type[Regularizer]] = None,
    regularizer_kwargs: Optional[Mapping[str, Any]] = None,
    **_kwargs,
) -> int:
    """Make a model instance, similarly to how the pipelin is started, then return the model size."""
    device = resolve_device('cpu')
    dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dataset_kwargs,
        training=training,
        testing=testing,
        validation=validation,
    )

    if model_kwargs is None:
        model_kwargs = {}

    if regularizer is not None:
        regularizer_cls = get_regularizer_cls(regularizer)
        model_kwargs['regularizer'] = regularizer_cls(
            device=device,
            **(regularizer_kwargs or {}),
        )

    if loss is not None:
        loss_cls = get_loss_cls(loss)
        model_kwargs['loss'] = loss_cls(**(loss_kwargs or {}))

    model = get_model_cls(model)
    model_instance: Model = model(
        random_seed=0,
        preferred_device=device,
        triples_factory=dataset.training,
        **model_kwargs,
    )
    return model_instance.num_parameter_bytes
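A hypothetical call to the helper above, estimating a model's parameter footprint without training it; the dataset and model names are illustrative only, and all arguments are keyword-only:

size_in_bytes = get_model_size(dataset="nations", model="TransE")
print(f"TransE on Nations: {size_in_bytes} bytes")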
Code Example #8
File: deteriorate.py  Project: vishalbelsare/pykeen
def _main(trials: int):
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from tabulate import tabulate

    from pykeen.constants import PYKEEN_EXPERIMENTS
    from pykeen.datasets import get_dataset

    n_comb = trials * (trials - 1) // 2
    logger.info(f"Number of combinations: {trials} n Choose 2 = {n_comb}")

    ns = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4]
    rows = []
    for dataset_name in [
        # 'kinships',
        "nations",
        # 'umls',
        # 'codexsmall',
        # 'wn18',
    ]:
        dataset_rows = []
        reference_dataset = get_dataset(dataset=dataset_name)
        for n in ns:
            similarities = []
            for trial in range(trials):
                deteriorated_dataset = reference_dataset.deteriorate(n=n, random_state=trial)
                sim = reference_dataset.similarity(deteriorated_dataset)
                similarities.append(sim)
                rows.append((dataset_name, n, trial, sim))
            dataset_rows.append((n, np.mean(similarities), np.std(similarities)))
        click.echo(tabulate(dataset_rows, headers=[f"{dataset_name} N", "Mean", "Std"]))

    df = pd.DataFrame(rows, columns=["name", "n", "trial", "sim"])
    tsv_path = PYKEEN_EXPERIMENTS / "deteriorating.tsv"
    png_path = PYKEEN_EXPERIMENTS / "deteriorating.png"
    click.echo(f"writing to {tsv_path}")
    df.to_csv(tsv_path, sep="\t", index=False)
    sns.lineplot(data=df, x="n", y="sim", hue="name")
    plt.savefig(png_path, dpi=300)
Code Example #9
def _main(trials):
    import itertools as itt

    import numpy as np
    from tqdm import tqdm

    from pykeen.datasets import get_dataset

    n_comb = trials * (trials - 1) // 2
    click.echo(f"Number of combinations: {trials} n Choose 2 = {n_comb}")

    for dataset_name in [
            "nations",
            "umls",
            "kinships",
            "codexsmall",
            "wn18",
    ]:
        reference_dataset = get_dataset(dataset=dataset_name)
        remixed_datasets = [
            reference_dataset.remix(random_state=random_state)
            for random_state in range(trials)
        ]
        similarities = [
            a.similarity(b) for a, b in tqdm(
                itt.combinations(remixed_datasets, r=2),
                total=n_comb,
                desc=dataset_name,
            )
        ]
        click.echo(
            f"[{dataset_name}] Similarities Mean: {np.mean(similarities):.3f}")
        click.echo(
            f"[{dataset_name}] Similarities Std.: {np.std(similarities):.3f}")
        click.echo(
            f"[{dataset_name}] Relative Std.: {np.std(similarities) / np.mean(similarities):.3%}"
        )
Code Example #10
def verify(dataset: str):
    """Verify dataset integrity."""
    data = []
    keys = None
    for name, dataset in _iter_datasets(regex_name_filter=dataset):
        dataset_instance = get_dataset(dataset=dataset)
        data.append(
            list(
                itt.chain(
                    [name],
                    itt.chain.from_iterable(
                        (triples_factory.num_entities,
                         triples_factory.num_relations)
                        for _, triples_factory in sorted(
                            dataset_instance.factory_dict.items())),
                )))
        keys = keys or sorted(dataset_instance.factory_dict.keys())
    if not keys:
        return
    df = pandas.DataFrame(
        data=data,
        columns=["name"] + [
            f"num_{part}_{a}" for part in keys
            for a in ("entities", "relations")
        ],
    )
    valid = None
    for part, a in itt.product(("validation", "testing"),
                               ("entities", "relations")):
        this_valid = df[f"num_training_{a}"] == df[f"num_{part}_{a}"]
        if valid is None:
            valid = this_valid
        else:
            valid = valid & this_valid
    df["valid"] = valid
    print(df.to_markdown())
Code Example #11
def main(replicates: int, force: bool):
    import pykeen.triples.splitting
    pykeen.triples.splitting.logger.setLevel(logging.ERROR)
    import pykeen.triples.triples_factory
    pykeen.triples.triples_factory.logger.setLevel(logging.ERROR)
    import pykeen.utils
    pykeen.utils.logger.setLevel(logging.ERROR)

    git_hash = get_git_hash()
    methods = ['cleanup', 'coverage']
    ratios = [0.8]

    click.echo(f'output directory: {SPLITTING_DIRECTORY.as_posix()}')
    rows = []
    outer_it = tqdm(sorted(datasets), desc='Dataset')
    for dataset in outer_it:
        dataset_path = RESULTS_DIRECTORY / f'{dataset}.tsv'
        if dataset_path.exists() and not force:
            _log(f'loading pre-calculated {dataset} from {dataset_path}')
            df = pd.read_csv(dataset_path, sep='\t')
            rows.extend(df.values)
            continue

        _log(f'loading {dataset}')
        t = time.time()
        dataset = get_dataset(dataset=dataset)
        dataset_name = dataset.__class__.__name__
        ccl = [
            dataset.training.mapped_triples,
            dataset.testing.mapped_triples,
            dataset.validation.mapped_triples,
        ]
        load_time = time.time() - t
        _log(f'done loading {dataset_name} after {load_time:.3f} seconds')
        _log(f'concatenating {dataset_name}')
        t = time.time()
        mapped_triples: torch.LongTensor = torch.cat(ccl, dim=0)
        concat_time = time.time() - t
        _log(
            f'done concatenating {dataset_name} after {concat_time:.3f} seconds'
        )
        _log(f'deleting {dataset_name}')
        del dataset
        _log(f'done deleting {dataset_name}')

        dataset_rows = []
        inner_it = itt.product(methods, ratios, range(1, 1 + replicates))
        inner_it = tqdm(
            inner_it,
            total=len(methods) * len(ratios) * replicates,
            desc=f'{dataset_name} ({intword(mapped_triples.shape[0])})',
        )
        for method, ratio, replicate in inner_it:
            t = time.time()
            results = split(
                mapped_triples,
                ratios=[ratio, (1 - ratio) / 2],
                method=method,
                random_state=replicate,
            )
            split_time = time.time() - t
            dataset_rows.append((
                git_hash,
                dataset_name,
                mapped_triples.shape[0],
                load_time,
                concat_time,
                method,
                ratio,
                replicate,
                split_time,
                results[0].shape[0],
                results[1].shape[0],
                results[2].shape[0],
            ))
            del results

        _log(f'writing to {dataset_path}')
        pd.DataFrame(dataset_rows, columns=columns).to_csv(dataset_path,
                                                           sep='\t',
                                                           index=False)
        rows.extend(dataset_rows)

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(tsv_path, sep='\t', index=False)
    _make_1(df, git_hash)
    _make_2(df, git_hash)
Code Example #12
def run_inverse_stability_workflow(dataset: str,
                                   model: str,
                                   training_loop: str,
                                   random_seed=0,
                                   device="cpu"):
    """Run an inverse stability experiment."""
    dataset_instance: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(create_inverse_triples=True, ),
    )
    dataset_name = dataset_instance.get_normalized_name()
    model_cls: Type[Model] = model_resolver.lookup(model)
    model_name = model_cls.__name__.lower()

    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset_instance,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper="early",
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    test_tf = dataset_instance.testing
    model = pipeline_result.model
    # Score with original triples
    scores_forward = model.score_hrt(test_tf.mapped_triples)
    scores_forward_np = scores_forward.detach().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = model.score_hrt_inverse(test_tf.mapped_triples)
    scores_inverse_np = scores_inverse.detach().numpy()[:, 0]

    scores_path = dataset_dir / f"{model_name}_{training_loop}_scores.tsv"
    df = pd.DataFrame(
        list(
            zip(
                itt.repeat(training_loop),
                itt.repeat(dataset_name),
                itt.repeat(model_name),
                scores_forward_np,
                scores_inverse_np,
            )),
        columns=["training_loop", "dataset", "model", "forward", "inverse"],
    )
    df.to_csv(scores_path, sep="\t", index=False)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df,
                 x="forward",
                 label="Forward",
                 ax=ax,
                 color="blue",
                 stat="density")
    sns.histplot(data=df,
                 x="inverse",
                 label="Inverse",
                 ax=ax,
                 color="orange",
                 stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Score")
    plt.legend()
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_overlay.png",
                dpi=300)
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Forward - Inverse Score Difference")
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_residuals.png",
                dpi=300)
    plt.close(fig)

    return df
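A hedged invocation of the workflow above; "slcwa" is one of pykeen's standard training loop names, and the INVERSE_STABILITY output directory comes from the surrounding script:

df = run_inverse_stability_workflow(
    dataset="nations",
    model="TransE",
    training_loop="slcwa",
)
print(df.head())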
Code Example #13
def _analyze(dataset, force, countplots, directory: Union[None, str,
                                                          pathlib.Path]):
    from pykeen.datasets import get_dataset
    from pykeen.constants import PYKEEN_DATASETS
    from . import analysis
    from tqdm import tqdm
    import pandas as pd
    import docdata
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        raise ImportError(
            dedent("""\
            Please install plotting dependencies by

                pip install pykeen[plotting]

            or directly by

                pip install matplotlib seaborn
        """))

    # Raise matplotlib level
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    if directory is None:
        directory = PYKEEN_DATASETS
    else:
        directory = pathlib.Path(directory)
        directory.mkdir(exist_ok=True, parents=True)

    dataset_instance = get_dataset(dataset=dataset)
    d = directory.joinpath(dataset_instance.__class__.__name__.lower(),
                           'analysis')
    d.mkdir(parents=True, exist_ok=True)

    dfs = {}
    it = tqdm(analysis.__dict__.items(), leave=False, desc='Stats')
    for name, func in it:
        if not name.startswith('get') or not name.endswith('df'):
            continue
        it.set_postfix(func=name)
        key = name[len('get_'):-len('_df')]
        path = d.joinpath(key).with_suffix('.tsv')
        if path.exists() and not force:
            df = pd.read_csv(path, sep='\t')
        else:
            df = func(dataset=dataset_instance)
            df.to_csv(d.joinpath(key).with_suffix('.tsv'),
                      sep='\t',
                      index=False)
        dfs[key] = df

    fig, ax = plt.subplots(1, 1)
    sns.scatterplot(
        data=dfs['relation_injectivity'],
        x='head',
        y='tail',
        size='support',
        hue='support',
        ax=ax,
    )
    ax.set_title(
        f'{docdata.get_docdata(dataset_instance.__class__)["name"]} Relation Injectivity'
    )
    fig.tight_layout()
    fig.savefig(d.joinpath('relation_injectivity.svg'))
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.scatterplot(
        data=dfs['relation_functionality'],
        x='functionality',
        y='inverse_functionality',
        ax=ax,
    )
    ax.set_title(
        f'{docdata.get_docdata(dataset_instance.__class__)["name"]} Relation Functionality'
    )
    fig.tight_layout()
    fig.savefig(d.joinpath('relation_functionality.svg'))
    plt.close(fig)

    if countplots:
        entity_count_df = (dfs['entity_count'].groupby(
            'entity_label').sum().reset_index().sort_values('count',
                                                            ascending=False))
        fig, ax = plt.subplots(1, 1)
        sns.barplot(data=entity_count_df, y='entity_label', x='count', ax=ax)
        ax.set_ylabel('')
        ax.set_xscale('log')
        fig.tight_layout()
        fig.savefig(d.joinpath('entity_counts.svg'))
        plt.close(fig)

        relation_count_df = (dfs['relation_count'].groupby(
            'relation_label').sum().reset_index().sort_values('count',
                                                              ascending=False))
        fig, ax = plt.subplots(1, 1)
        sns.barplot(data=relation_count_df,
                    y='relation_label',
                    x='count',
                    ax=ax)
        ax.set_ylabel('')
        ax.set_xscale('log')
        fig.tight_layout()
        fig.savefig(d.joinpath('relation_counts.svg'))
        plt.close(fig)
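A hypothetical invocation of the analysis helper above, writing the per-statistic TSVs and SVG plots under pykeen's default datasets directory (directory=None falls back to PYKEEN_DATASETS):

_analyze(dataset="nations", force=False, countplots=True, directory=None)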
Code Example #14
File: inverse_stability.py  Project: sunny1401/pykeen
def run_inverse_stability_workflow(dataset: str, model: str, training_loop: str, random_seed=0, device='cpu'):
    """Run an inverse stability experiment."""
    dataset: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(
            create_inverse_triples=True,
        ),
    )
    dataset_name = dataset.get_normalized_name()
    model_cls: Type[Model] = get_model_cls(model)
    model_name = model_cls.__name__.lower()

    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper='early',
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    test_tf = dataset.testing
    model = pipeline_result.model
    # Score with original triples
    scores_forward = model.score_hrt(test_tf.mapped_triples)
    scores_forward_np = scores_forward.detach().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = model.score_hrt_inverse(test_tf.mapped_triples)
    scores_inverse_np = scores_inverse.detach().numpy()[:, 0]

    scores_path = dataset_dir / f'{model_name}_{training_loop}_scores.tsv'
    df = pd.DataFrame(
        list(zip(
            itt.repeat(training_loop),
            itt.repeat(dataset_name),
            itt.repeat(model_name),
            scores_forward_np,
            scores_inverse_np,
        )),
        columns=['training_loop', 'dataset', 'model', 'forward', 'inverse'],
    )
    df.to_csv(scores_path, sep='\t', index=False)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df, x='forward', label='Forward', ax=ax, color='blue', stat="density")
    sns.histplot(data=df, x='inverse', label='Inverse', ax=ax, color='orange', stat="density")
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Score')
    plt.legend()
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_overlay.png', dpi=300)
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat="density")
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Forward - Inverse Score Difference')
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_residuals.png', dpi=300)
    plt.close(fig)

    return df