def _load_helper(
    self,
    relative_path: pathlib.PurePath,
    entity_to_id: Optional[Mapping[str, Any]] = None,
    relation_to_id: Optional[Mapping[str, Any]] = None,
) -> TriplesFactory:
    """Load one triples file from inside the dataset's zip archive.

    :param relative_path: Location of the triples file inside the archive.
    :param entity_to_id: An optional pre-computed entity label-to-ID mapping.
    :param relation_to_id: An optional pre-computed relation label-to-ID mapping.
    :return: A triples factory built from the labeled triples in the file.
    :raises ValueError: If the archive is missing and no URL is configured.
    """
    # Fetch the archive first when it is not already cached locally.
    if not self.path.is_file():
        if self.url is None:
            raise ValueError("url should be set")
        logger.info("downloading data from %s to %s", self.url, self.path)
        download(url=self.url, path=self.path)

    with zipfile.ZipFile(file=self.path) as zf:
        # Entries inside a zip archive always use POSIX separators, even on Windows.
        with zf.open(relative_path.as_posix()) as file:
            logger.debug("loading %s", relative_path)
            df = pd.read_csv(
                file,
                usecols=[self.head_column, self.relation_column, self.tail_column],
                header=self.header,
                sep=self.sep,
            )

    return TriplesFactory.from_labeled_triples(
        triples=df.values,
        create_inverse_triples=self.create_inverse_triples,
        metadata={"path": relative_path},
        entity_to_id=entity_to_id,
        relation_to_id=relation_to_id,
    )
def _load(self) -> None:
    """Download (if necessary), read, and split the OpenEA relation triples."""
    archive_path = self.cache_root.joinpath("OpenEA_dataset_v2.0.zip")
    # Download unless a cached archive is present or a re-download is forced,
    # verifying the result against the known SHA-512 digest.
    if self.force or not archive_path.is_file():
        logger.info(f"Downloading file from Dropbox (Link: {self.__class__.DROPBOX_LINK})")
        download(url=self.__class__.DROPBOX_LINK, path=archive_path, hexdigests={"sha512": self.SHA512})

    df = read_zipfile_csv(
        path=archive_path,
        inner_path=str(self._relative_path_relations),
        header=None,
        names=[LABEL_HEAD, LABEL_RELATION, LABEL_TAIL],
        sep="\t",
        encoding="utf8",
        dtype=str,
    )

    # Build a single factory from all triples, then split it deterministically.
    tf = TriplesFactory.from_labeled_triples(
        triples=df.values,
        create_inverse_triples=self.create_inverse_triples,
        metadata={"path": archive_path},
    )
    self._training, self._testing, self._validation = cast(
        Tuple[TriplesFactory, TriplesFactory, TriplesFactory],
        tf.split(ratios=self.ratios, random_state=self.random_state),
    )
    logger.info("[%s] done splitting data from %s", self.__class__.__name__, archive_path)
def _load(self) -> None:
    """Download the archive if needed, extract the triples file, and split it.

    Populates ``self._training``, ``self._testing``, and ``self._validation``.
    """
    archive_path = self._get_path()
    if not archive_path.is_file():
        download(self.url, archive_path)  # noqa:S310
    _actual_path = self.cache_root.joinpath(self._relative_path)
    if not _actual_path.is_file():
        # Fix: this is progress information, not a failure — it was previously
        # logged at ERROR level, which was misleading; sibling loaders use INFO.
        logger.info(
            '[%s] untaring from %s (%s) to %s',
            self.__class__.__name__,
            archive_path,
            self._relative_path,
            _actual_path,
        )
        with tarfile.open(archive_path) as tar_file:
            # tarfile does not like pathlib
            # NOTE(review): extract() does not sanitize member paths; acceptable here
            # because the member name is fixed by this class, not user input.
            tar_file.extract(str(self._relative_path), self.cache_root)
    df = pd.read_csv(_actual_path, sep=self.delimiter)
    tf = TriplesFactory.from_labeled_triples(
        triples=df.values,
        create_inverse_triples=self.create_inverse_triples,
        metadata={'path': archive_path},
    )
    self._training, self._testing, self._validation = cast(
        Tuple[TriplesFactory, TriplesFactory, TriplesFactory],
        tf.split(ratios=self.ratios, random_state=self.random_state),
    )
    logger.info('[%s] done splitting data from %s', self.__class__.__name__, archive_path)
def __init__(
    self,
    training_url: str,
    testing_url: str,
    validation_url: str,
    cache_root: Optional[str] = None,
    force: bool = False,
    eager: bool = False,
    create_inverse_triples: bool = False,
    load_triples_kwargs: Optional[Mapping[str, Any]] = None,
    download_kwargs: Optional[Mapping[str, Any]] = None,
):
    """Initialize the dataset from three remote files.

    :param training_url: The URL of the training file
    :param testing_url: The URL of the testing file
    :param validation_url: The URL of the validation file
    :param cache_root: An optional directory in which the downloaded files are stored.
        If none is given, the default PyKEEN directory is used, defined either by the
        environment variable ``PYKEEN_HOME`` or defaulting to ``~/.data/pykeen``.
    :param force: If true, redownload any cached files
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
        and ultimately through to :func:`pykeen.triples.utils.load_triples`.
    :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`
    """
    self.cache_root = self._help_cache(cache_root)
    self.training_url = training_url
    self.testing_url = testing_url
    self.validation_url = validation_url

    # Each remote file is cached locally under a name derived from its URL.
    url_path_pairs = [
        (url, self.cache_root.joinpath(name_from_url(url)))
        for url in (self.training_url, self.testing_url, self.validation_url)
    ]

    # Default to the urllib backend unless the caller overrides it.
    effective_download_kwargs = {} if download_kwargs is None else dict(download_kwargs)
    effective_download_kwargs.setdefault("backend", "urllib")

    for url, path in url_path_pairs:
        # Download only when forced or no cached copy exists.
        if force or not path.is_file():
            download(url, path, **effective_download_kwargs)

    training_path, testing_path, validation_path = (path for _, path in url_path_pairs)
    super().__init__(
        training_path=training_path,
        testing_path=testing_path,
        validation_path=validation_path,
        eager=eager,
        create_inverse_triples=create_inverse_triples,
        load_triples_kwargs=load_triples_kwargs,
    )
def _get_df(self) -> pd.DataFrame:
    """Return the triples table read from inside the cached zip archive."""
    archive = self._get_path()
    if not archive.is_file():
        download(self.url, archive)  # noqa:S310
    # Zip archive members are addressed with POSIX separators, even on Windows.
    with zipfile.ZipFile(archive) as zip_file:
        with zip_file.open(self._relative_path.as_posix()) as file:
            return pd.read_csv(file, sep=self.delimiter)
def __init__(
    self,
    training_url: str,
    testing_url: str,
    validation_url: str,
    cache_root: Optional[str] = None,
    stream: bool = True,
    force: bool = False,
    eager: bool = False,
    create_inverse_triples: bool = False,
    load_triples_kwargs: Optional[Mapping[str, Any]] = None,
):
    """Initialize the dataset from three remote files.

    :param training_url: The URL of the training file
    :param testing_url: The URL of the testing file
    :param validation_url: The URL of the validation file
    :param cache_root: An optional directory in which the downloaded files are stored.
        If none is given, the default PyKEEN directory is used, defined either by the
        environment variable ``PYKEEN_HOME`` or defaulting to ``~/.pykeen``.
    :param stream: Whether :mod:`requests` should download the files in streaming mode
    :param force: If true, redownload any cached files
    :param eager: Should the data be loaded eagerly? Defaults to false.
    :param create_inverse_triples: Should inverse triples be created? Defaults to false.
    :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
        and ultimately through to :func:`pykeen.triples.utils.load_triples`.
    """
    self.cache_root = self._help_cache(cache_root)
    self.training_url = training_url
    self.testing_url = testing_url
    self.validation_url = validation_url

    # Each remote file is cached locally under a name derived from its URL.
    training_path = os.path.join(self.cache_root, name_from_url(self.training_url))
    testing_path = os.path.join(self.cache_root, name_from_url(self.testing_url))
    validation_path = os.path.join(self.cache_root, name_from_url(self.validation_url))

    for url, path in (
        (self.training_url, training_path),
        (self.testing_url, testing_path),
        (self.validation_url, validation_path),
    ):
        # Skip the download when a cached copy exists, unless forced.
        if force or not os.path.exists(path):
            download(url, path, stream=stream, backend='requests')

    super().__init__(
        training_path=training_path,
        testing_path=testing_path,
        validation_path=validation_path,
        eager=eager,
        create_inverse_triples=create_inverse_triples,
        load_triples_kwargs=load_triples_kwargs,
    )
def test_hash_success(self):
    """Test that a download whose MD5 matches the expected digest succeeds."""
    # Precondition: nothing has been downloaded yet.
    self.assertFalse(self.path.exists())
    expected_digests = {"md5": self.expected_md5}
    download(url=TEST_TXT.as_uri(), path=self.path, hexdigests=expected_digests)
def test_hash_error(self):
    """Test that a download whose MD5 does not match the expected digest raises."""
    # Precondition: nothing has been downloaded yet.
    self.assertFalse(self.path.exists())
    wrong_digests = {"md5": self.mismatching_md5_hexdigest}
    with self.assertRaises(HexDigestError):
        download(url=TEST_TXT.as_uri(), path=self.path, hexdigests=wrong_digests)
def test_hash_remote_error(self):
    """Test that strict checking against a mismatching remote digest raises."""
    # Precondition: nothing has been downloaded yet.
    self.assertFalse(self.path.exists())
    remote_digests = {"md5": TEST_TXT_WRONG_MD5.as_uri()}
    with self.assertRaises(HexDigestError):
        download(
            url=TEST_TXT.as_uri(),
            path=self.path,
            hexdigests_remote=remote_digests,
            hexdigests_strict=True,
        )
def test_hash_remote_verbose_success(self):
    """Test that non-strict checking tolerates a verbose-format remote digest file."""
    # Precondition: nothing has been downloaded yet.
    self.assertFalse(self.path.exists())
    remote_digests = {"md5": TEST_TXT_VERBOSE_MD5.as_uri()}
    download(
        url=TEST_TXT.as_uri(),
        path=self.path,
        hexdigests_remote=remote_digests,
        hexdigests_strict=False,
    )
    # The file must have been written despite the verbose digest format.
    self.assertTrue(self.path.exists())
def test_hash_remote_verbose_failure(self):
    """Test that strict checking rejects a verbose-format remote digest file."""
    # Precondition: nothing has been downloaded yet.
    self.assertFalse(self.path.exists())
    remote_digests = {"md5": TEST_TXT_VERBOSE_MD5.as_uri()}
    with self.assertRaises(HexDigestError):
        download(
            url=TEST_TXT.as_uri(),
            path=self.path,
            hexdigests_remote=remote_digests,
            hexdigests_strict=True,
        )
def _get_df(self) -> pd.DataFrame:
    """Return the dataset's table, downloading the source file if it is missing."""
    path = self._get_path()
    if not path.is_file():
        logger.info("downloading data from %s to %s", self.url, path)
        download(url=self.url, path=path)  # noqa:S310
    df = pd.read_csv(path, **self.read_csv_kwargs)
    # ``read_csv`` keeps the file's column order regardless of the order given in
    # ``usecols``, so restore the requested order explicitly.
    usecols = self.read_csv_kwargs.get("usecols")
    if usecols is None:
        return df
    logger.info("reordering columns: %s", usecols)
    return df[usecols]
def _get_df(self) -> pd.DataFrame:
    """Return the dataset's table, downloading the source file if it is missing."""
    if not os.path.exists(self._get_path()):
        logger.info('downloading data from %s to %s', self.url, self._get_path())
        download(url=self.url, path=self._get_path())  # noqa:S310
    df = pd.read_csv(self._get_path(), **self.read_csv_kwargs)
    # ``read_csv`` keeps the file's column order regardless of the order given in
    # ``usecols``, so restore the requested order explicitly.
    usecols = self.read_csv_kwargs.get('usecols')
    if usecols is None:
        return df
    logger.info('reordering columns: %s', usecols)
    return df[usecols]
def test_force(self):
    """Test that ``force=True`` re-downloads over an existing wrong file."""
    # Seed the target path with content that would fail the hash check;
    # with force=True the downloader should not bother checking it first.
    self.path.write_text("test file content")
    self.assertTrue(self.path.exists())
    expected_digests = {"md5": self.expected_md5}
    download(
        url=TEST_TXT.as_uri(),
        path=self.path,
        hexdigests=expected_digests,
        force=True,
    )
def test_override_hash_error(self):
    """Test that, without ``force``, a pre-existing file with a wrong hash raises."""
    # Seed the target path with content that does not match the expected digest.
    self.path.write_text("test file content")
    self.assertTrue(self.path.exists())
    expected_digests = {"md5": self.expected_md5}
    with self.assertRaises(HexDigestError):
        download(
            url=TEST_TXT.as_uri(),
            path=self.path,
            hexdigests=expected_digests,
            force=False,
        )
def test_override_hash_remote_error(self):
    """Test that, without ``force``, a pre-existing file failing a strict remote hash check raises."""
    # Seed the target path with content that does not match the remote digest.
    self.path.write_text("test file content")
    self.assertTrue(self.path.exists())
    remote_digests = {"md5": TEST_TXT_MD5.as_uri()}
    with self.assertRaises(HexDigestError):
        download(
            url=TEST_TXT.as_uri(),
            path=self.path,
            hexdigests_remote=remote_digests,
            hexdigests_strict=True,
            force=False,
        )
def _get_df(self) -> pd.DataFrame:
    """Return the triples table, downloading and un-taring the archive if needed."""
    archive_path = self._get_path()
    if not archive_path.is_file():
        download(self.url, archive_path)  # noqa:S310
    _actual_path = self.cache_root.joinpath(self._relative_path)
    if not _actual_path.is_file():
        # Fix: this is progress information, not a failure — it was previously
        # logged at ERROR level, which was misleading; sibling loaders use INFO.
        logger.info(
            "[%s] untaring from %s (%s) to %s",
            self.__class__.__name__,
            archive_path,
            self._relative_path,
            _actual_path,
        )
        with tarfile.open(archive_path) as tar_file:
            # tarfile does not like pathlib
            # NOTE(review): extract() does not sanitize member paths; acceptable here
            # because the member name is fixed by this class, not user input.
            tar_file.extract(str(self._relative_path), self.cache_root)
    return pd.read_csv(_actual_path, sep=self.delimiter)
def _ensure_obo_path(prefix: str, url: Optional[str] = None, force: bool = False) -> str:
    """Get the path to the OBO file and download if missing.

    :param prefix: The ontology prefix to resolve.
    :param url: An explicit source URL (deprecated; curate in the bioregistry instead).
    :param force: If true, re-download even when a cached file exists.
    :return: The path to the local OBO file, as a POSIX-style string.
    :raises NoOboFoundry: If the prefix is not listed by the OBO Foundry.
    :raises MissingOboBuild: If the OBO Foundry entry has no build or no source URL.
    """
    if url is not None:
        # Fix: stacklevel=2 points the DeprecationWarning at the caller that
        # passed ``url``, rather than at this helper itself.
        warnings.warn('Should make curations in the bioregistry instead', DeprecationWarning, stacklevel=2)
        path = get_prefix_obo_path(prefix).as_posix()
        download(url=url, path=path, force=force)
        return path

    # Prefer a manually-curated URL when one is registered for this prefix.
    curated_url = get_curated_urls().get(prefix)
    if curated_url:
        logger.debug('[%s] checking for OBO at curated URL: %s', prefix, curated_url)
        return ensure_path(prefix, url=curated_url, force=force)

    path = get_prefix_obo_path(prefix)
    # Fix: use the pathlib API consistently instead of mixing in os.path.exists.
    if path.exists():
        logger.debug('[%s] OBO already exists at %s', prefix, path)
        return path.as_posix()

    # Fall back to the OBO Foundry metadata to find a build URL.
    obofoundry = get_obofoundry(mappify=True)
    entry = obofoundry.get(prefix)
    if entry is None:
        raise NoOboFoundry(f'OBO Foundry is missing the prefix: {prefix}')

    build = entry.get('build')
    if build is None:
        raise MissingOboBuild(f'OBO Foundry is missing a build for: {prefix}')

    url = build.get('source_url')
    if url is None:
        raise MissingOboBuild(f'OBO Foundry build is missing a URL for: {prefix}, {build}')

    return ensure_path(prefix, url=url, force=force)
def _load_helper(self, relative_path: str) -> TriplesFactory:
    """Load one triples file from inside the dataset's zip archive.

    :param relative_path: Location of the triples file inside the archive.
    :return: A triples factory built from the labeled triples in the file.
    :raises ValueError: If the archive is missing and no URL is configured.
    """
    # Fetch the archive first when it is not already cached locally.
    if not os.path.exists(self.path):
        if self.url is None:
            raise ValueError('url should be set')
        logger.info('downloading data from %s to %s', self.url, self.path)
        download(url=self.url, path=self.path)

    with zipfile.ZipFile(file=self.path) as zf:
        with zf.open(relative_path) as file:
            logger.debug('loading %s', relative_path)
            df = pd.read_csv(
                file,
                usecols=[self.head_column, self.relation_column, self.tail_column],
                header=self.header,
                sep=self.sep,
            )

    return TriplesFactory.from_labeled_triples(
        triples=df.values,
        create_inverse_triples=self.create_inverse_triples,
        metadata={'path': relative_path},
    )