def _main(): """Test unleaking FB15K. Run with ``python -m pykeen.triples.leakage``. """ from pykeen.datasets import get_dataset logging.basicConfig(format="pykeen: %(message)s", level=logging.INFO) fb15k = get_dataset(dataset="fb15k") click.echo(fb15k.summary_str()) n = 401 # magic 401 from the paper train, test, validate = unleak(fb15k.training, fb15k.testing, fb15k.validation, n=n) click.echo("") click.echo( EagerDataset(train, test, validate).summary_str(title="FB15k (cleaned)")) fb15k237 = get_dataset(dataset="fb15k237") click.echo("\nSummary FB15K-237") click.echo(fb15k237.summary_str())
def iter_datasets(test: bool = False) -> Iterable[Dataset]:
    """Iterate over instantiated datasets, limited to the first five when testing."""
    it = tqdm(datasets[:5] if test else datasets, desc='Datasets')
    for dataset in it:
        dataset_instance = get_dataset(dataset=dataset)
        it.write(f'loaded {dataset_instance.get_normalized_name()}')
        it.set_postfix(dataset=dataset_instance.get_normalized_name())
        yield dataset_instance
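def _example_iter_datasets():
    """Hypothetical usage sketch for :func:`iter_datasets` (not part of the original source).

    Assumes the module-level ``datasets`` list and the imports above are in
    scope; instantiation is lazy, so each dataset is only loaded when the
    loop reaches it.
    """
    for dataset_instance in iter_datasets(test=True):
        print(dataset_instance.get_normalized_name())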
def _pre_instantiation_hook(self, kwargs: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # noqa: D102
    kwargs = super()._pre_instantiation_hook(kwargs=kwargs)
    kwargs["labels"] = sorted(get_dataset(dataset="nations").entity_to_id.keys())
    return kwargs
def _main(trials: int = 15):
    from pykeen.datasets import get_dataset
    import numpy as np
    import itertools as itt
    from tqdm import tqdm

    n_comb = trials * (trials - 1) // 2
    print(f'Number of combinations: {trials} choose 2 = {n_comb}')
    for dataset_name in [
        'nations',
        'umls',
        'kinships',
        'codexsmall',
        'wn18',
    ]:
        reference_dataset = get_dataset(dataset=dataset_name)
        remixed_datasets = [
            reference_dataset.remix(random_state=random_state)
            for random_state in range(trials)
        ]
        similarities = [
            a.similarity(b)
            for a, b in tqdm(
                itt.combinations(remixed_datasets, r=2),
                total=n_comb,
                desc=dataset_name,
            )
        ]
        print(f'[{dataset_name}] Similarities Mean: {np.mean(similarities):.3f}')
        print(f'[{dataset_name}] Similarities Std.: {np.std(similarities):.3f}')
        print(f'[{dataset_name}] Relative Std.: {np.std(similarities) / np.mean(similarities):.3%}')
def _main(): """Test unleaking FB15K. Run with ``python -m pykeen.triples.leakage``. """ from pykeen.datasets import get_dataset logging.basicConfig(format='pykeen: %(message)s', level=logging.INFO) fb15k = get_dataset(dataset='fb15k') fb15k.summarize() n = 401 # magic 401 from the paper train, test, validate = unleak(fb15k.training, fb15k.testing, fb15k.validation, n=n) print() EagerDataset(train, test, validate).summarize(title='FB15k (cleaned)') fb15k237 = get_dataset(dataset='fb15k237') print('\nSummary FB15K-237') fb15k237.summarize()
def _main(): """Test unleaking FB15K. Run with ``python -m pykeen.triples.leakage``. """ from pykeen.datasets import get_dataset logging.basicConfig(format='pykeen: %(message)s', level=logging.INFO) print('Summary FB15K') train, test, validate = get_dataset(dataset='fb15k') summarize(train, test, validate) print('\nSummary FB15K (cleaned)') train, test, validate = unleak(train, test, validate, n=401) # magic 401 from the paper summarize(train, test, validate) print('\nSummary FB15K-237') train, test, validate = get_dataset(dataset='fb15k237') summarize(train, test, validate)
def get_model_size(  # noqa: C901
    *,
    dataset: Union[None, str, Type[Dataset]] = None,
    dataset_kwargs: Optional[Mapping[str, Any]] = None,
    training: Optional[TriplesFactory] = None,
    testing: Optional[TriplesFactory] = None,
    validation: Optional[TriplesFactory] = None,
    model: Union[str, Type[Model]],
    model_kwargs: Optional[Mapping[str, Any]] = None,
    loss: Union[None, str, Type[Loss]] = None,
    loss_kwargs: Optional[Mapping[str, Any]] = None,
    regularizer: Union[None, str, Type[Regularizer]] = None,
    regularizer_kwargs: Optional[Mapping[str, Any]] = None,
    **_kwargs,
) -> int:
    """Make a model instance, similarly to how the pipeline is started, then return the model size."""
    device = resolve_device('cpu')
    dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dataset_kwargs,
        training=training,
        testing=testing,
        validation=validation,
    )
    if model_kwargs is None:
        model_kwargs = {}
    if regularizer is not None:
        regularizer_cls = get_regularizer_cls(regularizer)
        model_kwargs['regularizer'] = regularizer_cls(
            device=device,
            **(regularizer_kwargs or {}),
        )
    if loss is not None:
        loss_cls = get_loss_cls(loss)
        model_kwargs['loss'] = loss_cls(**(loss_kwargs or {}))
    model = get_model_cls(model)
    model_instance: Model = model(
        random_seed=0,
        preferred_device=device,
        triples_factory=dataset.training,
        **model_kwargs,
    )
    return model_instance.num_parameter_bytes
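def _example_get_model_size():
    """Hypothetical usage sketch for :func:`get_model_size` (not part of the original source).

    Estimates the parameter footprint of a small TransE model on Nations;
    the exact byte count depends on the installed PyKEEN version.
    """
    num_bytes = get_model_size(
        dataset='nations',
        model='TransE',
        model_kwargs=dict(embedding_dim=50),
    )
    print(f'TransE (dim=50) on Nations: {num_bytes} parameter bytes')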
def _main(trials: int):
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from tabulate import tabulate

    from pykeen.constants import PYKEEN_EXPERIMENTS
    from pykeen.datasets import get_dataset

    n_comb = trials * (trials - 1) // 2
    logger.info(f"Number of combinations: {trials} choose 2 = {n_comb}")
    ns = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4]
    rows = []
    for dataset_name in [
        # 'kinships',
        "nations",
        # 'umls',
        # 'codexsmall',
        # 'wn18',
    ]:
        dataset_rows = []
        reference_dataset = get_dataset(dataset=dataset_name)
        for n in ns:
            similarities = []
            for trial in range(trials):
                deteriorated_dataset = reference_dataset.deteriorate(n=n, random_state=trial)
                sim = reference_dataset.similarity(deteriorated_dataset)
                similarities.append(sim)
                rows.append((dataset_name, n, trial, sim))
            dataset_rows.append((n, np.mean(similarities), np.std(similarities)))
        click.echo(tabulate(dataset_rows, headers=[f"{dataset_name} N", "Mean", "Std"]))

    df = pd.DataFrame(rows, columns=["name", "n", "trial", "sim"])
    tsv_path = PYKEEN_EXPERIMENTS / "deteriorating.tsv"
    png_path = PYKEEN_EXPERIMENTS / "deteriorating.png"
    click.echo(f"writing to {tsv_path}")
    df.to_csv(tsv_path, sep="\t", index=False)
    sns.lineplot(data=df, x="n", y="sim", hue="name")
    plt.savefig(png_path, dpi=300)
def _main(trials):
    import itertools as itt

    import numpy as np
    from tqdm import tqdm

    from pykeen.datasets import get_dataset

    n_comb = trials * (trials - 1) // 2
    click.echo(f"Number of combinations: {trials} choose 2 = {n_comb}")
    for dataset_name in [
        "nations",
        "umls",
        "kinships",
        "codexsmall",
        "wn18",
    ]:
        reference_dataset = get_dataset(dataset=dataset_name)
        remixed_datasets = [
            reference_dataset.remix(random_state=random_state)
            for random_state in range(trials)
        ]
        similarities = [
            a.similarity(b)
            for a, b in tqdm(
                itt.combinations(remixed_datasets, r=2),
                total=n_comb,
                desc=dataset_name,
            )
        ]
        click.echo(f"[{dataset_name}] Similarities Mean: {np.mean(similarities):.3f}")
        click.echo(f"[{dataset_name}] Similarities Std.: {np.std(similarities):.3f}")
        click.echo(f"[{dataset_name}] Relative Std.: {np.std(similarities) / np.mean(similarities):.3%}")
def verify(dataset: str):
    """Verify dataset integrity."""
    data = []
    keys = None
    for name, dataset in _iter_datasets(regex_name_filter=dataset):
        dataset_instance = get_dataset(dataset=dataset)
        data.append(
            list(
                itt.chain(
                    [name],
                    itt.chain.from_iterable(
                        (triples_factory.num_entities, triples_factory.num_relations)
                        for _, triples_factory in sorted(dataset_instance.factory_dict.items())
                    ),
                )
            )
        )
        keys = keys or sorted(dataset_instance.factory_dict.keys())
    if not keys:
        return
    df = pandas.DataFrame(
        data=data,
        columns=["name"] + [f"num_{part}_{a}" for part in keys for a in ("entities", "relations")],
    )
    valid = None
    for part, a in itt.product(("validation", "testing"), ("entities", "relations")):
        this_valid = df[f"num_training_{a}"] == df[f"num_{part}_{a}"]
        if valid is None:
            valid = this_valid
        else:
            valid = valid & this_valid
    df["valid"] = valid
    print(df.to_markdown())
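def _example_verify():
    """Hypothetical usage sketch for :func:`verify` (not part of the original source).

    The ``dataset`` argument is passed through as a regex name filter, so this
    prints a markdown table for every dataset whose name matches "nations",
    flagging splits whose entity/relation counts diverge from training.
    """
    verify("nations")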
def main(replicates: int, force: bool):
    import pykeen.triples.splitting
    pykeen.triples.splitting.logger.setLevel(logging.ERROR)
    import pykeen.triples.triples_factory
    pykeen.triples.triples_factory.logger.setLevel(logging.ERROR)
    import pykeen.utils
    pykeen.utils.logger.setLevel(logging.ERROR)

    git_hash = get_git_hash()
    methods = ['cleanup', 'coverage']
    ratios = [0.8]
    click.echo(f'output directory: {SPLITTING_DIRECTORY.as_posix()}')
    rows = []
    outer_it = tqdm(sorted(datasets), desc='Dataset')
    for dataset in outer_it:
        dataset_path = RESULTS_DIRECTORY / f'{dataset}.tsv'
        if dataset_path.exists() and not force:
            _log(f'loading pre-calculated {dataset} from {dataset_path}')
            df = pd.read_csv(dataset_path, sep='\t')
            rows.extend(df.values)
            continue

        _log(f'loading {dataset}')
        t = time.time()
        dataset = get_dataset(dataset=dataset)
        dataset_name = dataset.__class__.__name__
        ccl = [
            dataset.training.mapped_triples,
            dataset.testing.mapped_triples,
            dataset.validation.mapped_triples,
        ]
        load_time = time.time() - t
        _log(f'done loading {dataset_name} after {load_time:.3f} seconds')

        _log(f'concatenating {dataset_name}')
        t = time.time()
        mapped_triples: torch.LongTensor = torch.cat(ccl, dim=0)
        concat_time = time.time() - t
        _log(f'done concatenating {dataset_name} after {concat_time:.3f} seconds')

        _log(f'deleting {dataset_name}')
        del dataset
        _log(f'done deleting {dataset_name}')

        dataset_rows = []
        inner_it = itt.product(methods, ratios, range(1, 1 + replicates))
        inner_it = tqdm(
            inner_it,
            total=len(methods) * len(ratios) * replicates,
            desc=f'{dataset_name} ({intword(mapped_triples.shape[0])})',
        )
        for method, ratio, replicate in inner_it:
            t = time.time()
            results = split(
                mapped_triples,
                ratios=[ratio, (1 - ratio) / 2],
                method=method,
                random_state=replicate,
            )
            split_time = time.time() - t
            dataset_rows.append((
                git_hash,
                dataset_name,
                mapped_triples.shape[0],
                load_time,
                concat_time,
                method,
                ratio,
                replicate,
                split_time,
                results[0].shape[0],
                results[1].shape[0],
                results[2].shape[0],
            ))
            del results

        _log(f'writing to {dataset_path}')
        pd.DataFrame(dataset_rows, columns=columns).to_csv(dataset_path, sep='\t', index=False)
        rows.extend(dataset_rows)

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(tsv_path, sep='\t', index=False)
    _make_1(df, git_hash)
    _make_2(df, git_hash)
def run_inverse_stability_workflow(dataset: str, model: str, training_loop: str, random_seed=0, device="cpu"):
    """Run an inverse stability experiment."""
    dataset_instance: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(create_inverse_triples=True),
    )
    dataset_name = dataset_instance.get_normalized_name()
    model_cls: Type[Model] = model_resolver.lookup(model)
    model_name = model_cls.__name__.lower()
    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset_instance,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper="early",
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    test_tf = dataset_instance.testing
    model = pipeline_result.model

    # Score with original triples
    scores_forward = model.score_hrt(test_tf.mapped_triples)
    scores_forward_np = scores_forward.detach().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = model.score_hrt_inverse(test_tf.mapped_triples)
    scores_inverse_np = scores_inverse.detach().numpy()[:, 0]

    scores_path = dataset_dir / f"{model_name}_{training_loop}_scores.tsv"
    df = pd.DataFrame(
        list(zip(
            itt.repeat(training_loop),
            itt.repeat(dataset_name),
            itt.repeat(model_name),
            scores_forward_np,
            scores_inverse_np,
        )),
        columns=["training_loop", "dataset", "model", "forward", "inverse"],
    )
    df.to_csv(scores_path, sep="\t", index=False)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df, x="forward", label="Forward", ax=ax, color="blue", stat="density")
    sns.histplot(data=df, x="inverse", label="Inverse", ax=ax, color="orange", stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Score")
    plt.legend()
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_overlay.png", dpi=300)
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Forward - Inverse Score Difference")
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_residuals.png", dpi=300)
    plt.close(fig)
    return df
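def _example_inverse_stability():
    """Hypothetical usage sketch for :func:`run_inverse_stability_workflow` (not part of the original source).

    Trains a small model on Nations under the sLCWA training loop and returns
    the per-triple forward/inverse score frame; plots and TSVs are written
    under ``INVERSE_STABILITY``. Runtime depends on the early stopper.
    """
    df = run_inverse_stability_workflow(
        dataset="nations",
        model="TransE",
        training_loop="slcwa",
    )
    print(df.head())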
def _analyze(dataset, force, countplots, directory: Union[None, str, pathlib.Path]):
    import docdata
    import pandas as pd
    from tqdm import tqdm

    from pykeen.constants import PYKEEN_DATASETS
    from pykeen.datasets import get_dataset

    from . import analysis

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        raise ImportError(dedent("""\
            Please install plotting dependencies by

                pip install pykeen[plotting]

            or directly by

                pip install matplotlib seaborn
        """))

    # Raise matplotlib's logging level to suppress its verbose output
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    if directory is None:
        directory = PYKEEN_DATASETS
    else:
        directory = pathlib.Path(directory)
        directory.mkdir(exist_ok=True, parents=True)

    dataset_instance = get_dataset(dataset=dataset)
    d = directory.joinpath(dataset_instance.__class__.__name__.lower(), 'analysis')
    d.mkdir(parents=True, exist_ok=True)

    dfs = {}
    it = tqdm(analysis.__dict__.items(), leave=False, desc='Stats')
    for name, func in it:
        if not name.startswith('get') or not name.endswith('df'):
            continue
        it.set_postfix(func=name)
        key = name[len('get_'):-len('_df')]
        path = d.joinpath(key).with_suffix('.tsv')
        if path.exists() and not force:
            df = pd.read_csv(path, sep='\t')
        else:
            df = func(dataset=dataset_instance)
            df.to_csv(path, sep='\t', index=False)
        dfs[key] = df

    fig, ax = plt.subplots(1, 1)
    sns.scatterplot(
        data=dfs['relation_injectivity'],
        x='head',
        y='tail',
        size='support',
        hue='support',
        ax=ax,
    )
    ax.set_title(f'{docdata.get_docdata(dataset_instance.__class__)["name"]} Relation Injectivity')
    fig.tight_layout()
    fig.savefig(d.joinpath('relation_injectivity.svg'))
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.scatterplot(
        data=dfs['relation_functionality'],
        x='functionality',
        y='inverse_functionality',
        ax=ax,
    )
    ax.set_title(f'{docdata.get_docdata(dataset_instance.__class__)["name"]} Relation Functionality')
    fig.tight_layout()
    fig.savefig(d.joinpath('relation_functionality.svg'))
    plt.close(fig)

    if countplots:
        entity_count_df = (
            dfs['entity_count']
            .groupby('entity_label')
            .sum()
            .reset_index()
            .sort_values('count', ascending=False)
        )
        fig, ax = plt.subplots(1, 1)
        sns.barplot(data=entity_count_df, y='entity_label', x='count', ax=ax)
        ax.set_ylabel('')
        ax.set_xscale('log')
        fig.tight_layout()
        fig.savefig(d.joinpath('entity_counts.svg'))
        plt.close(fig)

        relation_count_df = (
            dfs['relation_count']
            .groupby('relation_label')
            .sum()
            .reset_index()
            .sort_values('count', ascending=False)
        )
        fig, ax = plt.subplots(1, 1)
        sns.barplot(data=relation_count_df, y='relation_label', x='count', ax=ax)
        ax.set_ylabel('')
        ax.set_xscale('log')
        fig.tight_layout()
        fig.savefig(d.joinpath('relation_counts.svg'))
        plt.close(fig)
def run_inverse_stability_workflow(dataset: str, model: str, training_loop: str, random_seed=0, device='cpu'):
    """Run an inverse stability experiment."""
    dataset: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(
            create_inverse_triples=True,
        ),
    )
    dataset_name = dataset.get_normalized_name()
    model_cls: Type[Model] = get_model_cls(model)
    model_name = model_cls.__name__.lower()
    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper='early',
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    test_tf = dataset.testing
    model = pipeline_result.model

    # Score with original triples
    scores_forward = model.score_hrt(test_tf.mapped_triples)
    scores_forward_np = scores_forward.detach().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = model.score_hrt_inverse(test_tf.mapped_triples)
    scores_inverse_np = scores_inverse.detach().numpy()[:, 0]

    scores_path = dataset_dir / f'{model_name}_{training_loop}_scores.tsv'
    df = pd.DataFrame(
        list(zip(
            itt.repeat(training_loop),
            itt.repeat(dataset_name),
            itt.repeat(model_name),
            scores_forward_np,
            scores_inverse_np,
        )),
        columns=['training_loop', 'dataset', 'model', 'forward', 'inverse'],
    )
    df.to_csv(scores_path, sep='\t', index=False)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df, x='forward', label='Forward', ax=ax, color='blue', stat='density')
    sns.histplot(data=df, x='inverse', label='Inverse', ax=ax, color='orange', stat='density')
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Score')
    plt.legend()
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_overlay.png', dpi=300)
    plt.close(fig)

    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat='density')
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Forward - Inverse Score Difference')
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_residuals.png', dpi=300)
    plt.close(fig)
    return df