def test_download(self, mock_url_retrieve, mock_fname):
    """Check ``utils.download`` for several file extensions and ``unzip`` settings.

    Each scenario configures the two injected mocks (the file name reported
    for the URL and the payload returned by the retrieval call), runs
    ``utils.download`` into a ``client/`` sub-folder of a temporary
    directory, and asserts that the expected file exists there afterwards.
    A final call checks that ``ValueError`` is raised when the mocked file
    name has a ``csv`` extension and ``True`` is passed as the third
    positional argument (presumably ``unzip`` -- confirm against
    ``utils.download``).

    Args:
        mock_url_retrieve: mock whose ``return_value`` is set to the bytes of
            the fixture file built for the scenario.
        mock_fname: mock whose ``return_value`` is set to the result of
            ``self._mock_fname(<extension>)``.
    """
    # (mocked extension, fixture builder, extra download kwargs, expected file)
    # An empty kwargs dict means ``unzip`` is left at the function's default.
    scenarios = (
        ("tar.gz", self._make_tarfile, {}, "test_tar.csv"),
        ("zip", self._make_zipfile, {}, "test_zip.csv"),
        ("gz", self._make_gzipfile, {}, "test_gz.csv"),
        ("csv", self._make_csv, {"unzip": False}, "test_csv.csv"),
        ("gz", self._make_gzipfile, {"unzip": False}, "test_gz.gz"),
    )

    with tempfile.TemporaryDirectory() as destination:
        full_path = os.path.join(destination, "client/")

        for extension, build_fixture, extra_kwargs, expected_name in scenarios:
            mock_fname.return_value = self._mock_fname(extension)
            mock_url_retrieve.return_value = build_fixture(destination).read()

            utils.download(
                url="url",
                default_extension="csv",
                destination=full_path,
                **extra_kwargs,
            )

            self.assertTrue(Path(full_path, expected_name).is_file())

        # The retrieval mock still returns the payload of the last scenario;
        # only the reported file name changes before the failing call.
        mock_fname.return_value = self._mock_fname("csv")
        with self.assertRaises(ValueError):
            utils.download("url", "csv", True, full_path)
def _download(url: str, default_extension: str, unzip: bool, destination: str):
    """Print a progress message, then forward all arguments to ``download``.

    Args:
        url: forwarded unchanged as ``download``'s ``url``.
        default_extension: forwarded unchanged as ``default_extension``.
        unzip: forwarded unchanged as ``unzip``.
        destination: forwarded unchanged as ``destination``; also shown in
            the printed message.
    """
    message = f"Download {cfg.DATASET} dataset in {destination}"
    click.echo(message)

    download_kwargs = {
        "url": url,
        "default_extension": default_extension,
        "unzip": unzip,
        "destination": destination,
    }
    download(**download_kwargs)
def load_dataset(
    url: Optional[str] = None,
    path: Optional[str] = None,
    usecols: Optional[List[str]] = None,
    pipeline_cols: Optional[List[str]] = None,
    destination: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Load Pyro Risks training datasets.

    Download (when the file at ``path`` is missing) and load the training
    dataset, then split it into features and target.

    Args:
        url: Training dataset URL. Defaults to ``cfg.ERA5T_VIIRS_PIPELINE``.
        path: Dataset full path. Defaults to ``cfg.DATASET`` inside
            ``cfg.DATA_REGISTRY``.
        usecols: Subset of the dataset columns read from the CSV. Defaults to
            the date, zone and target columns plus ``cfg.PIPELINE_ERA5T_VARS``.
        pipeline_cols: Subset of the dataset used for training. Defaults to
            the date and zone columns plus ``cfg.PIPELINE_ERA5T_VARS``.
        destination: Folder where the dataset should be saved when it has to
            be downloaded. Defaults to ``cfg.DATA_REGISTRY``.

    Returns:
        Tuple[pd.DataFrame, pd.Series]: ``(X, y)`` where ``X`` holds the
        ``pipeline_cols`` columns and ``y`` is the ``cfg.TARGET`` column.
    """
    # Resolve every omitted argument from the project configuration.
    if url is None:
        url = cfg.ERA5T_VIIRS_PIPELINE
    if path is None:
        path = os.path.join(cfg.DATA_REGISTRY, cfg.DATASET)
    if usecols is None:
        usecols = [cfg.DATE_VAR, cfg.ZONE_VAR, cfg.TARGET] + cfg.PIPELINE_ERA5T_VARS
    if pipeline_cols is None:
        pipeline_cols = [cfg.DATE_VAR, cfg.ZONE_VAR] + cfg.PIPELINE_ERA5T_VARS
    if destination is None:
        destination = cfg.DATA_REGISTRY

    # Only hit the network when the dataset is not already on disk.
    # NOTE(review): the file is downloaded into ``destination`` but read from
    # ``path``; presumably callers keep the two consistent -- confirm.
    if not os.path.isfile(path):
        download(url=url, default_extension="csv", unzip=False, destination=destination)

    df = pd.read_csv(path, usecols=usecols)

    # Parse the "day" column from "%Y-%m-%d" strings, leaving missing values
    # untouched so they do not make ``strptime`` fail.
    df["day"] = df["day"].apply(
        lambda x: datetime.strptime(str(x), "%Y-%m-%d") if not pd.isnull(x) else x
    )

    X = df[pipeline_cols]
    y = df[cfg.TARGET]
    return X, y