def __init__(
    self,
    training_url: str,
    testing_url: str,
    validation_url: str,
    cache_root: Optional[str] = None,
    force: bool = False,
    eager: bool = False,
    create_inverse_triples: bool = False,
    load_triples_kwargs: Optional[Mapping[str, Any]] = None,
    download_kwargs: Optional[Mapping[str, Any]] = None,
):
    """Download the three split files (if needed) and initialize the dataset.

    :param training_url: The URL of the training file
    :param testing_url: The URL of the testing file
    :param validation_url: The URL of the validation file
    :param cache_root:
        An optional directory to store the downloaded files. If none is given, the default PyKEEN
        directory is used. This is defined either by the environment variable ``PYKEEN_HOME`` or
        defaults to ``~/.data/pykeen``.
    :param force: If true, redownload any cached files
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
        and ultimately through to :func:`pykeen.triples.utils.load_triples`.
    :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`. A copy is
        made, and ``backend="urllib"`` is filled in when no backend is given.
    """
    self.cache_root = self._help_cache(cache_root)
    self.training_url = training_url
    self.testing_url = testing_url
    self.validation_url = validation_url

    # Each file is cached under the name given by the last part of its URL.
    urls = [self.training_url, self.testing_url, self.validation_url]
    paths = [self.cache_root.joinpath(name_from_url(url)) for url in urls]
    training_path, testing_path, validation_path = paths

    # Work on a copy so the caller's mapping is never modified.
    options = dict(download_kwargs) if download_kwargs is not None else {}
    if "backend" not in options:
        options["backend"] = "urllib"

    for url, path in zip(urls, paths):
        if not force and path.is_file():
            # already cached and no re-download requested
            continue
        download(url, path, **options)

    super().__init__(
        training_path=training_path,
        testing_path=testing_path,
        validation_path=validation_path,
        eager=eager,
        create_inverse_triples=create_inverse_triples,
        load_triples_kwargs=load_triples_kwargs,
    )
def __init__(
    self,
    training_url: str,
    testing_url: str,
    validation_url: str,
    cache_root: Optional[str] = None,
    stream: bool = True,
    force: bool = False,
    eager: bool = False,
    create_inverse_triples: bool = False,
    load_triples_kwargs: Optional[Mapping[str, Any]] = None,
):
    """Download the three split files (if needed) and initialize the dataset.

    :param training_url: The URL of the training file
    :param testing_url: The URL of the testing file
    :param validation_url: The URL of the validation file
    :param cache_root:
        An optional directory to store the downloaded files. If none is given, the default PyKEEN
        directory is used. This is defined either by the environment variable ``PYKEEN_HOME`` or
        defaults to ``~/.pykeen``.
    :param stream: If true, use :mod:`requests` for the download, otherwise use :mod:`urllib`
    :param force: If true, redownload any cached files
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
        and ultimately through to :func:`pykeen.triples.utils.load_triples`.
    """
    self.cache_root = self._help_cache(cache_root)
    self.training_url = training_url
    self.testing_url = testing_url
    self.validation_url = validation_url

    # Each file is cached under the name given by the last part of its URL.
    urls = (self.training_url, self.testing_url, self.validation_url)
    training_path, testing_path, validation_path = [
        os.path.join(self.cache_root, name_from_url(url))
        for url in urls
    ]

    targets = zip(urls, (training_path, testing_path, validation_path))
    for source_url, destination in targets:
        # Fetch when the file is missing, or unconditionally when forced.
        if not os.path.exists(destination) or force:
            _urlretrieve(source_url, destination, stream=stream)

    super().__init__(
        training_path=training_path,
        testing_path=testing_path,
        validation_path=validation_path,
        eager=eager,
        create_inverse_triples=create_inverse_triples,
        load_triples_kwargs=load_triples_kwargs,
    )
def __init__(
    self,
    training_url: str,
    testing_url: str,
    validation_url: str,
    cache_root: Optional[str] = None,
    eager: bool = False,
    create_inverse_triples: bool = False,
    stream: bool = True,
    force: bool = False,
):
    """Download the three split files (if needed) and initialize the dataset.

    :param training_url: The URL of the training file
    :param testing_url: The URL of the testing file
    :param validation_url: The URL of the validation file
    :param cache_root:
        An optional directory to store the downloaded files. If none is given, the default PyKEEN
        directory is used. This is defined either by the environment variable ``PYKEEN_HOME`` or
        defaults to ``~/.pykeen``.
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param stream: Passed through as the ``stream`` argument of ``_urlretrieve`` for each download.
    :param force: If true, download each file even if it already exists in the cache directory.
    """
    self.cache_root = self._help_cache(cache_root)
    self.training_url = training_url
    self.testing_url = testing_url
    self.validation_url = validation_url

    # Each file is cached under the name given by the last part of its URL.
    urls = (self.training_url, self.testing_url, self.validation_url)
    training_path, testing_path, validation_path = [
        os.path.join(self.cache_root, name_from_url(url))
        for url in urls
    ]

    targets = zip(urls, (training_path, testing_path, validation_path))
    for source_url, destination in targets:
        # Fetch when the file is missing, or unconditionally when forced.
        if not os.path.exists(destination) or force:
            _urlretrieve(source_url, destination, stream=stream)

    super().__init__(
        training_path=training_path,
        testing_path=testing_path,
        validation_path=validation_path,
        eager=eager,
        create_inverse_triples=create_inverse_triples,
    )
def ensure_path(
    prefix: str,
    *parts: str,
    url: str,
    version: VersionHint = None,
    name: Optional[str] = None,
    force: bool = False,
    stream: bool = False,
    urlretrieve_kwargs: Optional[Mapping[str, Any]] = None,
    error_on_missing: bool = False,
) -> str:
    """Download a file if it doesn't exist.

    :param prefix: First argument to ``prefix_directory_join``, which builds the local path.
    :param parts: Additional path parts forwarded to ``prefix_directory_join``.
    :param url: The URL to download the file from.
    :param version: Forwarded to ``prefix_directory_join`` as ``version``.
    :param name: The local file name. If not given, it is derived from ``url``
        via ``name_from_url``.
    :param force: If true, download even if the file already exists locally.
    :param stream: Forwarded to ``_urlretrieve`` as ``stream``.
    :param urlretrieve_kwargs: Extra keyword arguments forwarded to ``_urlretrieve``.
    :param error_on_missing: If true, raise instead of downloading when the file
        does not exist locally.
    :returns: The POSIX-style string form of the local path.
    :raises FileNotFoundError: If the file is missing and ``error_on_missing`` is true.
    """
    if name is None:
        name = name_from_url(url)
    path = prefix_directory_join(prefix, *parts, name=name, version=version)

    # Check the file system once and reuse the answer for both decisions below.
    already_exists = path.exists()
    if not already_exists and error_on_missing:
        # Include the path so the failure can be diagnosed from the message alone.
        raise FileNotFoundError(f"file does not exist and error_on_missing is set: {path}")
    if not already_exists or force:
        _urlretrieve(url=url, path=path, stream=stream, **(urlretrieve_kwargs or {}))
    return path.as_posix()
def test_name_from_url(self):
    """Check that :func:`name_from_url` yields the expected file name for each URL."""
    # (url, expected file name) pairs, including a nested path and a double extension
    cases = (
        ("https://example.com/test.tsv", "test.tsv"),
        ("https://example.com/deeper/test.tsv", "test.tsv"),
        ("https://example.com/deeper/test.tsv.gz", "test.tsv.gz"),
    )
    for url, name in cases:
        with self.subTest(name=name, url=url):
            actual = name_from_url(url)
            self.assertEqual(name, actual)
def ensure_path(
    prefix: str,
    url: str,
    *,
    path: Optional[str] = None,
    use_requests: bool = False,
    force: bool = False,
    bucket: Optional[str] = None,
    s3_client: Optional[BaseClient] = None,
) -> str:
    """Download a file if it doesn't exist.

    When a bucket is given, S3 acts as a secondary cache: a file that is missing locally
    is fetched from S3 if the key exists there, and a file freshly downloaded from the
    source is uploaded to S3 afterwards.

    :param prefix: First argument to ``prefix_directory_join``, which builds the local path.
    :param url: The URL of the original source of the file.
    :param path: The file name passed to ``prefix_directory_join``. If not given, it is
        derived from ``url`` via ``name_from_url``.
    :param use_requests: If true, download with :func:`requests.get`; otherwise use
        :func:`urllib.request.urlretrieve`.
    :param force: If set to true, will re-download from source and re-upload to S3
    :param bucket: Name of the S3 bucket to use as a cache. If none is given, S3 is not used.
    :param s3_client: Optional S3 client, passed through ``_ensure_s3_client`` before use.
    :returns: The local path of the file.
    """
    if path is None:
        path = name_from_url(url)
    path = prefix_directory_join(prefix, path)

    if os.path.exists(path) and not force:
        # Already cached locally and no refresh requested: nothing to do.
        return path

    use_s3 = bucket is not None
    if use_s3:
        s3_client = _ensure_s3_client(s3_client)
        s3_key = _get_s3_key(prefix, path)
        # Only try S3 when the key is actually there. The previous condition was
        # negated and so attempted the download exactly when the key was absent.
        if not force and _has_file(s3_client, bucket=bucket, key=s3_key):
            logger.info('downloading from AWS (bucket=%s): %s to %s', bucket, s3_key, path)
            s3_client.download_file(bucket, s3_key, path)
            return path

    logger.info('downloading from source %s to %s', url, path)
    if use_requests:
        res = requests.get(url)
        # Fail loudly on HTTP errors instead of caching an error page as the file;
        # urlretrieve in the other branch already raises on HTTP errors.
        res.raise_for_status()
        with open(path, 'wb') as file:
            file.write(res.content)
    else:
        urlretrieve(url, path)  # noqa:S310

    if use_s3:
        if _has_file(s3_client, bucket=bucket, key=s3_key) and not force:
            logger.debug('already available on S3. Not uploading again.')
            return path
        logger.info('uploading to AWS (bucket=%s): %s to %s', bucket, path, s3_key)
        s3_client.upload_file(path, bucket, s3_key)

    return path
def __init__(
    self,
    relative_training_path: Union[str, pathlib.PurePath],
    relative_testing_path: Union[str, pathlib.PurePath],
    relative_validation_path: Union[str, pathlib.PurePath],
    url: Optional[str] = None,
    name: Optional[str] = None,
    cache_root: Optional[str] = None,
    eager: bool = False,
    create_inverse_triples: bool = False,
):
    """Initialize dataset.

    :param relative_training_path: The path inside the zip file for the training data
    :param relative_testing_path: The path inside the zip file for the testing data
    :param relative_validation_path: The path inside the zip file for the validation data
    :param url: The url where to download the dataset from
    :param name: The name of the file. If not given, tries to get the name from the end of the URL
    :param cache_root: An optional directory to store the extracted files. If none is given, the
        default PyKEEN directory is used. This is defined either by the environment variable
        ``PYKEEN_HOME`` or defaults to ``~/.pykeen``.
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.

    :raises ValueError: if neither a name nor a URL is given, or if there's no URL specified
        and there is no data already at the calculated path
    """
    self.cache_root = self._help_cache(cache_root)

    # Without a name or a URL no file name can be derived. Fail with a clear error
    # here instead of handing ``None`` to ``name_from_url``.
    if not name and not url:
        raise ValueError("must specify at least one of name or url")

    self.name = name or name_from_url(url)
    self.path = self.cache_root.joinpath(self.name)
    logger.debug("file path at %s", self.path)

    self.url = url
    if not self.path.is_file() and not self.url:
        raise ValueError(
            f"must specify url to download from since path does not exist: {self.path}"
        )

    # Stored as pure paths: they are only recorded here, not resolved on the local file system.
    self.relative_training_path = pathlib.PurePath(relative_training_path)
    self.relative_testing_path = pathlib.PurePath(relative_testing_path)
    self.relative_validation_path = pathlib.PurePath(relative_validation_path)
    self.create_inverse_triples = create_inverse_triples

    if eager:
        self._load()
        self._load_validation()
def __init__(
    self,
    url: str,
    name: Optional[str] = None,
    cache_root: Optional[str] = None,
    eager: bool = False,
    create_inverse_triples: bool = False,
    random_state: TorchRandomHint = None,
    read_csv_kwargs: Optional[Dict[str, Any]] = None,
):
    """Initialize dataset.

    :param url: The url where to download the dataset from
    :param name: The name of the file. If not given, tries to get the name from the end of the URL
    :param cache_root: An optional directory to store the extracted files. If none is given, the
        default PyKEEN directory is used. This is defined either by the environment variable
        ``PYKEEN_HOME`` or defaults to ``~/.pykeen``.
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param random_state: An optional random state to make the training/testing/validation split
        reproducible.
    :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`.
        The given dictionary is copied, and ``sep`` defaults to a tab character if not set.

    :raises ValueError: if there's no URL specified and there is no data already at the calculated path
    """
    super().__init__(
        cache_root=cache_root,
        create_inverse_triples=create_inverse_triples,
        random_state=random_state,
        eager=False,  # because it gets hooked below
    )

    self.name = name or name_from_url(url)

    # Copy before applying defaults so the caller's dictionary is never mutated.
    self.read_csv_kwargs = dict(read_csv_kwargs or {})
    self.read_csv_kwargs.setdefault("sep", "\t")

    self.url = url
    if not self._get_path().is_file() and not self.url:
        raise ValueError(
            f"must specify url to download from since path does not exist: {self._get_path()}"
        )

    if eager:
        self._load()
def __init__(
    self,
    url: str,
    name: Optional[str] = None,
    cache_root: Optional[str] = None,
    eager: bool = False,
    create_inverse_triples: bool = False,
    random_state: TorchRandomHint = None,
    read_csv_kwargs: Optional[Dict[str, Any]] = None,
):
    """Initialize dataset.

    :param url: The url where to download the dataset from
    :param name: The name of the file. If not given, tries to get the name from the end of the URL
    :param cache_root: An optional directory to store the extracted files. If none is given, the
        default PyKEEN directory is used. This is defined either by the environment variable
        ``PYKEEN_HOME`` or defaults to ``~/.pykeen``.
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param random_state: Passed through to the parent initializer as ``random_state``.
    :param read_csv_kwargs: Keyword arguments stored on the instance as ``read_csv_kwargs``.
        The given dictionary is copied, and ``sep`` defaults to a tab character if not set.

    :raises ValueError: if there's no URL specified and there is no data already at the calculated path
    """
    super().__init__(
        cache_root=cache_root,
        create_inverse_triples=create_inverse_triples,
        random_state=random_state,
        eager=False,  # because it gets hooked below
    )

    self.name = name or name_from_url(url)

    # Copy before applying defaults so the caller's dictionary is never mutated.
    self.read_csv_kwargs = dict(read_csv_kwargs or {})
    self.read_csv_kwargs.setdefault('sep', '\t')

    self.url = url
    if not os.path.exists(self._get_path()) and not self.url:
        raise ValueError(
            f'must specify url to download from since path does not exist: {self._get_path()}'
        )

    if eager:
        self._load()
def __init__(
    self,
    relative_training_path: str,
    relative_testing_path: str,
    relative_validation_path: str,
    url: Optional[str] = None,
    name: Optional[str] = None,
    cache_root: Optional[str] = None,
    eager: bool = False,
    create_inverse_triples: bool = False,
):
    """Initialize dataset.

    :param relative_training_path: Relative path of the training data; stored on the instance
        unchanged for use when loading.
    :param relative_testing_path: Relative path of the testing data; stored on the instance
        unchanged for use when loading.
    :param relative_validation_path: Relative path of the validation data; stored on the
        instance unchanged for use when loading.
    :param url: The url where to download the dataset from
    :param name: The name of the file. If not given, tries to get the name from the end of the URL
    :param cache_root: An optional directory to store the extracted files. If none is given, the
        default PyKEEN directory is used. This is defined either by the environment variable
        ``PYKEEN_HOME`` or defaults to ``~/.pykeen``.
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.

    :raises ValueError: if neither a name nor a URL is given, or if there's no URL specified
        and there is no data already at the calculated path
    """
    self.cache_root = self._help_cache(cache_root)

    # Without a name or a URL no file name can be derived. Fail with a clear error
    # here instead of handing ``None`` to ``name_from_url``.
    if not name and not url:
        raise ValueError('must specify at least one of name or url')

    self.name = name or name_from_url(url)
    self.path = os.path.join(self.cache_root, self.name)
    logger.debug('file path at %s', self.path)

    self.url = url
    if not os.path.exists(self.path) and not self.url:
        raise ValueError(
            f'must specify url to download from since path does not exist: {self.path}'
        )

    self.relative_training_path = relative_training_path
    self.relative_testing_path = relative_testing_path
    self.relative_validation_path = relative_validation_path
    self.create_inverse_triples = create_inverse_triples

    if eager:
        self._load()
        self._load_validation()
def __init__(
    self,
    url: str,
    relative_path: str,
    name: Optional[str] = None,
    cache_root: Optional[str] = None,
    eager: bool = False,
    create_inverse_triples: bool = False,
    delimiter: Optional[str] = None,
    random_state: TorchRandomHint = None,
    randomize_cleanup: bool = False,
):
    """Set up the dataset configuration and optionally load it right away.

    :param url: The url where to download the dataset from
    :param relative_path: The path inside the archive to the contained dataset.
    :param name: The name of the file. If not given, tries to get the name from the end of the URL
    :param cache_root: An optional directory to store the extracted files. If none is given, the
        default PyKEEN directory is used. This is defined either by the environment variable
        ``PYKEEN_HOME`` or defaults to ``~/.pykeen``.
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param delimiter: The delimiter for the contained dataset. Falls back to a tab character
        when not given (or empty).
    :param random_state: An optional random state to make the training/testing/validation split
        reproducible.
    :param randomize_cleanup: Stored on the instance as ``randomize_cleanup``; this initializer
        does not use it further.
    """
    self.cache_root = self._help_cache(cache_root)

    # Prefer the explicit name; otherwise derive it from the URL.
    self.name = name if name else name_from_url(url)

    self.random_state = random_state
    self.delimiter = delimiter if delimiter else '\t'
    self.randomize_cleanup = randomize_cleanup
    self.url = url
    self.create_inverse_triples = create_inverse_triples
    self._relative_path = relative_path

    if not eager:
        return
    self._load()