def _verify_tag_path_exist(adls_file_system_client: core.AzureDLFileSystem, path: str):
    """
    Verify that the tag path exists; if it does not,
    ``adls_file_system_client.info`` raises ``FileNotFoundError``.

    Parameters
    ----------
    adls_file_system_client: core.AzureDLFileSystem
        the AzureDLFileSystem client to use
    path: str
        Path of the tag to check for existence.
    """
    adls_file_system_client.info(path)
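
# A minimal, hedged usage sketch of _verify_tag_path_exist: fail fast before
# starting any downloads. The path below is a hypothetical placeholder, not a
# value from this codebase.
def _example_tag_path_exists(adls_file_system_client: core.AzureDLFileSystem) -> bool:
    try:
        _verify_tag_path_exist(adls_file_system_client, "raw/asset-a/tag-1/")
    except FileNotFoundError:
        return False
    return True
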
def read_tag_files(
    adls_file_system_client: core.AzureDLFileSystem, tag: SensorTag, years: range
) -> pd.Series:
    """
    Download tag files for the given years into dataframes,
    and return as one dataframe.

    Parameters
    ----------
    adls_file_system_client: core.AzureDLFileSystem
        the AzureDLFileSystem client to use
    tag: SensorTag
        the tag to download data for
    years: range
        range object providing years to include

    Returns
    -------
    pd.Series:
        Series with all years for one tag.
    """
    tag_base_path = NcsReader.base_path_from_asset(tag.asset)

    if not tag_base_path:
        raise ValueError(f"Unable to find base path from tag {tag}")

    all_years = []
    for year in years:
        file_path = tag_base_path + f"/{tag.name}/{tag.name}_{year}.csv"
        logger.info(f"Parsing file {file_path}")

        info = adls_file_system_client.info(file_path)
        file_size = info.get("length") / (1024 ** 2)
        logger.info(f"File size: {file_size:.2f}MB")

        with adls_file_system_client.open(file_path, "rb") as f:
            df = pd.read_csv(
                f,
                sep=";",
                header=None,
                names=["Sensor", tag.name, "Timestamp", "Status"],
                usecols=[tag.name, "Timestamp"],
                dtype={tag.name: np.float32},
                parse_dates=["Timestamp"],
                date_parser=lambda col: pd.to_datetime(col, utc=True),
                index_col="Timestamp",
            )
            all_years.append(df)
            logger.info(f"Done parsing file {file_path}")

    combined = pd.concat(all_years)

    # Duplicated timestamps are common; keep the last occurrence.
    if combined.index.duplicated().any():
        combined = combined[~combined.index.duplicated(keep="last")]

    return combined[tag.name]
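
# Hedged usage sketch for read_tag_files. SensorTag is assumed to be the
# name/asset pair consumed by NcsReader.base_path_from_asset; "TAG-1",
# "asset-a", and the resample frequency are hypothetical values.
def _example_read_tag(adls_file_system_client: core.AzureDLFileSystem) -> pd.Series:
    tag = SensorTag(name="TAG-1", asset="asset-a")
    # range(2017, 2020) covers 2017-2019; the stop value is excluded.
    series = read_tag_files(adls_file_system_client, tag, years=range(2017, 2020))
    # Consumers typically resample the raw, irregular samples onto a fixed grid.
    return series.resample("10T").mean()
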
def read_tag_files(
    self,
    adls_file_system_client: core.AzureDLFileSystem,
    tag: SensorTag,
    years: range,
    dry_run: Optional[bool] = False,
    remove_status_codes: Optional[list] = [0],
    dl_base_path: Optional[str] = None,
) -> pd.Series:
    """
    Download tag files for the given years into dataframes,
    and return as one dataframe.

    Parameters
    ----------
    adls_file_system_client: core.AzureDLFileSystem
        the AzureDLFileSystem client to use
    tag: SensorTag
        the tag to download data for
    years: range
        range object providing years to include
    dry_run: Optional[bool]
        if True, don't download data; just check info, log, and return
    remove_status_codes: Optional[list]
        Removes data with Status code(s) in the list. By default it removes
        data with Status code 0.
    dl_base_path: Optional[str]
        Base path used to override the asset-to-path dictionary.
        Useful for demos and other non-production settings.

    Returns
    -------
    pd.Series:
        Series with all years for one tag.
    """
    tag_base_path = (
        dl_base_path if dl_base_path else NcsReader.base_path_from_asset(tag.asset)
    )

    if not tag_base_path:
        raise ValueError(f"Unable to find base path from tag {tag}")

    all_years = []
    logger.info(f"Downloading tag: {tag} for years: {years}")
    tag_name_encoded = quote(tag.name, safe=" ")

    NcsReader._verify_tag_path_exist(
        adls_file_system_client, f"{tag_base_path}/{tag_name_encoded}/"
    )
    dir_path = f"{tag_base_path}/{tag_name_encoded}"

    for year in years:
        file_path = None
        file_lookup = None
        # Try each configured lookup until one finds a file for this year.
        for v in self.file_lookups:
            file_path = v.lookup(
                adls_file_system_client, dir_path, tag_name_encoded, year
            )
            if file_path is not None:
                file_lookup = v
                break
        if file_lookup is None:
            continue
        file_type = file_lookup.file_type

        logger.info(f"Parsing file {file_path}")
        try:
            info = adls_file_system_client.info(file_path)
            file_size = info.get("length") / (1024 ** 2)
            logger.debug(f"File size for file {file_path}: {file_size:.2f}MB")

            if dry_run:
                logger.info("Dry run only, returning empty frame early")
                return pd.Series()

            before_downloading = timeit.default_timer()
            with adls_file_system_client.open(file_path, "rb") as f:
                df = file_type.read_df(f)
                df = df.rename(columns={"Value": tag.name})
                df = df[~df["Status"].isin(remove_status_codes)]
                df.sort_index(inplace=True)
                all_years.append(df)
                logger.info(
                    f"Done in {(timeit.default_timer() - before_downloading):.2f} sec {file_path}"
                )
        except FileNotFoundError as e:
            logger.debug(f"{file_path} not found, skipping it: {e}")

    try:
        combined = pd.concat(all_years)
    except Exception as e:
        logger.debug(f"Not able to concatenate all years: {e}.")
        return pd.Series(name=tag.name, data=None)

    # Duplicated timestamps are common; keep the last occurrence.
    if combined.index.duplicated().any():
        combined = combined[~combined.index.duplicated(keep="last")]

    return combined[tag.name]
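
# Hedged sketch of the duck-typed interface that self.file_lookups relies on:
# each lookup needs a .lookup(...) returning a full file path (or None) and a
# .file_type exposing read_df(). The class below is hypothetical; only the
# method names and call signatures are taken from read_tag_files above, and
# the CSV layout mirrors the plain-CSV reader elsewhere in this module.
class _ExampleCsvLookup:
    class _ExampleCsvFileType:
        def read_df(self, f) -> pd.DataFrame:
            # Same ";"-separated layout; "Value" is renamed to the tag name
            # by the caller, and "Status" is used for quality filtering.
            return pd.read_csv(
                f,
                sep=";",
                header=None,
                names=["Sensor", "Value", "Timestamp", "Status"],
                parse_dates=["Timestamp"],
                date_parser=lambda col: pd.to_datetime(col, utc=True),
                index_col="Timestamp",
            )

    file_type = _ExampleCsvFileType()

    def lookup(self, client, dir_path, tag_name, year):
        # Return the path only when a file for this year actually exists.
        path = f"{dir_path}/{tag_name}_{year}.csv"
        return path if is_file(client, path) else None
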
def is_file(client: core.AzureDLFileSystem, path: str) -> bool:
    """Return True if ``path`` exists in the data lake and is a file."""
    try:
        info = client.info(path)
    except FileNotFoundError:
        return False
    return info["type"] == "FILE"
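
# Hedged usage sketch: azure-datalake-store's info() reports "type" as either
# "FILE" or "DIRECTORY", so is_file filters out directory entries that a
# plain existence check would let through. The paths are hypothetical.
def _example_only_files(client: core.AzureDLFileSystem, paths: list) -> list:
    return [p for p in paths if is_file(client, p)]
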
def read_tag_files(
    adls_file_system_client: core.AzureDLFileSystem,
    tag: SensorTag,
    years: range,
    dry_run: Optional[bool] = False,
    remove_status_codes: Optional[list] = [0],
    dl_base_path: Optional[str] = None,
) -> pd.Series:
    """
    Download tag files for the given years into dataframes,
    and return as one dataframe.

    Parameters
    ----------
    adls_file_system_client: core.AzureDLFileSystem
        the AzureDLFileSystem client to use
    tag: SensorTag
        the tag to download data for
    years: range
        range object providing years to include
    dry_run: Optional[bool]
        if True, don't download data; just check info, log, and return
    remove_status_codes: Optional[list]
        Removes data with Status code(s) in the list. By default it removes
        data with Status code 0.
    dl_base_path: Optional[str]
        Base path used to override the asset-to-path dictionary.
        Useful for demos and other non-production settings.

    Returns
    -------
    pd.Series:
        Series with all years for one tag.
    """
    tag_base_path = (
        dl_base_path if dl_base_path else NcsReader.base_path_from_asset(tag.asset)
    )

    if not tag_base_path:
        raise ValueError(f"Unable to find base path from tag {tag}")

    all_years = []
    logger.info(f"Downloading tag: {tag} for years: {years}")
    tag_name_encoded = quote(tag.name, safe=" ")

    NcsReader._verify_tag_path_exist(
        adls_file_system_client, f"{tag_base_path}/{tag_name_encoded}/"
    )

    for year in years:
        file_path = (
            f"{tag_base_path}/{tag_name_encoded}/{tag_name_encoded}_{year}.csv"
        )
        logger.info(f"Parsing file {file_path}")

        try:
            info = adls_file_system_client.info(file_path)
            file_size = info.get("length") / (1024 ** 2)
            logger.debug(f"File size for file {file_path}: {file_size:.2f}MB")

            if dry_run:
                logger.info("Dry run only, returning empty frame early")
                return pd.Series()

            with adls_file_system_client.open(file_path, "rb") as f:
                df = pd.read_csv(
                    f,
                    sep=";",
                    header=None,
                    names=["Sensor", tag.name, "Timestamp", "Status"],
                    usecols=[tag.name, "Timestamp", "Status"],
                    dtype={tag.name: np.float32},
                    parse_dates=["Timestamp"],
                    date_parser=lambda col: pd.to_datetime(col, utc=True),
                    index_col="Timestamp",
                )
                df = df[~df["Status"].isin(remove_status_codes)]
                all_years.append(df)
                logger.info(f"Done parsing file {file_path}")
        except FileNotFoundError as e:
            logger.debug(f"{file_path} not found, skipping it: {e}")

    try:
        combined = pd.concat(all_years)
    except Exception as e:
        logger.debug(f"Not able to concatenate all years: {e}.")
        return pd.Series(name=tag.name, data=None)

    # Duplicated timestamps are common; keep the last occurrence.
    if combined.index.duplicated().any():
        combined = combined[~combined.index.duplicated(keep="last")]

    return combined[tag.name]
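
# Hedged sketch of the dry_run and remove_status_codes parameters; the tag
# name, asset, and status codes below are hypothetical examples.
def _example_dry_run_then_fetch(client: core.AzureDLFileSystem) -> pd.Series:
    tag = SensorTag(name="TAG-1", asset="asset-a")
    years = range(2018, 2020)
    # dry_run logs the size of the first file found, downloads nothing,
    # and returns an empty series early.
    read_tag_files(client, tag, years, dry_run=True)
    # Real fetch, dropping rows whose Status is 0 or 1.
    return read_tag_files(client, tag, years, remove_status_codes=[0, 1])
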