def main() -> None:
    """Pad variable time series data using environment and CLI settings."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    log_config.configure(log_level)
    log = get_logger()
    log.debug(f'data_dir: {data_path} out_dir: {out_path}')
    parser = argparse.ArgumentParser()
    # All options are plain string indexes converted to int below.
    for option in ('--yearindex', '--monthindex', '--dayindex',
                   '--locindex', '--subdirindex'):
        parser.add_argument(option)
    args = parser.parse_args()
    config = Config(data_path=data_path,
                    out_path=out_path,
                    year_index=int(args.yearindex),
                    month_index=int(args.monthindex),
                    day_index=int(args.dayindex),
                    location_index=int(args.locindex),
                    data_type_index=int(args.subdirindex),
                    relative_path_index=0,
                    window_size=0)
    VariablePad(config).pad()
def main() -> None:
    """Add the location group name from the location file into the path."""
    env = environs.Env()
    source_path: Path = env.path('SOURCE_PATH')
    group: str = env.str('GROUP')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Path-element indexes; the env var is the upper-cased kwarg name.
    indices = {name: env.int(name.upper())
               for name in ('source_type_index', 'year_index', 'month_index',
                            'day_index', 'location_index', 'data_type_index')}
    log_config.configure(log_level)
    log.debug(f'source_path: {source_path} group: {group} out_path: {out_path}')
    config = Config(source_path=source_path,
                    out_path=out_path,
                    group=group,
                    **indices)
    LocationGroupPath(config).add_groups_to_paths()
def setUp(self):
    """Populate a fake filesystem with real parquet data files for the tests.

    The four identical copy-pasted add_real_file stanzas are folded into a
    single data-driven loop; targets and sources are unchanged.
    """
    log_config.configure('DEBUG')
    self.setUpPyfakefs()
    self.in_path = Path('/dir/in')
    self.out_path = Path('/out')
    self.metadata_path = Path('prt/2019/10')
    # (day/source-id/data subdirectory, file name) for each real file to mount.
    data_files = (
        ('02/6974/data', 'GRSM_prt_6974_2019-10-02.parquet'),
        ('02/6848/data', 'UNDE_prt_6848_2019-10-02.parquet'),
        ('02/6848/data', 'WREF_prt_6848_2019-10-02.parquet'),
        ('03/6848/data', 'CPER_prt_6848_2019-10-03.parquet'),
    )
    for sub_dir, file_name in data_files:
        target_path = Path(self.in_path, self.metadata_path, sub_dir, file_name)
        # The real file lives next to this test module.
        real_file_path = Path(os.path.dirname(__file__), file_name)
        self.fs.add_real_file(real_file_path, target_path=target_path)
def main() -> None:
    """Parse array data files against the schema.

    Fix: the original called log.debug() BEFORE log_config.configure(),
    so the debug message was emitted with unconfigured logging settings;
    configuration now happens first, matching the sibling entry points.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    schema_path: Path = env.path('SCHEMA_PATH')
    out_path: Path = env.path('OUT_PATH')
    parse_calibration = env.bool('PARSE_CALIBRATION')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    source_type_index: int = env.int('SOURCE_TYPE_INDEX')
    year_index: int = env.int('YEAR_INDEX')
    month_index: int = env.int('MONTH_INDEX')
    day_index: int = env.int('DAY_INDEX')
    source_id_index: int = env.int('SOURCE_ID_INDEX')
    data_type_index: int = env.int('DATA_TYPE_INDEX')
    test_mode: bool = env.bool("TEST_MODE")
    # Configure logging before the first log call.
    log_config.configure(log_level)
    log.debug(
        f'data_path: {data_path} schema_path: {schema_path} out_path: {out_path}'
    )
    config = Config(data_path=data_path,
                    schema_path=schema_path,
                    out_path=out_path,
                    parse_calibration=parse_calibration,
                    source_type_index=source_type_index,
                    year_index=year_index,
                    month_index=month_index,
                    day_index=day_index,
                    source_id_index=source_id_index,
                    data_type_index=data_type_index,
                    test_mode=test_mode)
    array_parser.parse(config)
def main() -> None:
    """Group calibrated data files with their location files."""
    env = environs.Env()
    calibrated_path: Path = env.path('CALIBRATED_PATH')
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Path-element indexes; the env var is the upper-cased kwarg name.
    indices = {name: env.int(name.upper())
               for name in ('source_type_index', 'year_index', 'month_index',
                            'day_index', 'source_id_index', 'data_type_index')}
    log_config.configure(log_level)
    log.debug(
        f'calibrated_path: {calibrated_path} location_path: {location_path} out_path: {out_path}'
    )
    config = Config(calibrated_path=calibrated_path,
                    location_path=location_path,
                    out_path=out_path,
                    **indices)
    CalibratedLocationFileGrouper(config).group_files()
def main() -> None:
    """Process data files together with their calibration files."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH', None)
    calibration_path: Path = env.path('CALIBRATION_PATH', None)
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Path-element indexes; the env var is the upper-cased kwarg name.
    indices = {name: env.int(name.upper())
               for name in ('data_source_id_index', 'data_source_type_index',
                            'data_year_index', 'data_month_index',
                            'data_day_index', 'relative_path_index')}
    log_config.configure(log_level)
    logger = structlog.get_logger()
    logger.debug(f'data_path: {data_path} calibration_path: {calibration_path} out_path: {out_path}')
    config = Config(data_path=data_path,
                    calibration_path=calibration_path,
                    out_path=out_path,
                    **indices)
    process_files(config)
def main() -> None:
    """Group event data files with their location files."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    # Path-element indexes; the env var is the upper-cased kwarg name.
    indices = {name: env.int(name.upper())
               for name in ('source_type_index', 'year_index', 'month_index',
                            'day_index', 'source_id_index')}
    log_config.configure(log_level)
    log.debug(
        f'data_dir: {data_path} location_dir: {location_path} out_dir: {out_path}'
    )
    config = Config(data_path=data_path,
                    location_path=location_path,
                    out_path=out_path,
                    **indices)
    EventLocationGrouper(config).group_files()
def main() -> None:
    """Group data by related location groups."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    # Path-element indexes; the env var is the upper-cased kwarg name.
    indices = {name: env.int(name.upper())
               for name in ('source_type_index', 'year_index', 'month_index',
                            'day_index', 'group_index', 'location_index',
                            'data_type_index')}
    log_config.configure(log_level)
    log.debug(f'data_path: {data_path} out_path: {out_path}')
    config = Config(data_path=data_path, out_path=out_path, **indices)
    RelatedLocationGrouper(config).group_files()
def main() -> None:
    """Configure logging and package files per environment settings."""
    env = environs.Env()
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Packaging parameters; the env var is the upper-cased kwarg name.
    settings = {name: env.int(name.upper())
                for name in ('prefix_index', 'prefix_length', 'sort_index')}
    log_config.configure(log_level)
    package(**settings)
def main():
    """Link and merge files, de-duplicating above the configured threshold."""
    env = environs.Env()
    in_path = env.path('IN_PATH')
    out_path = env.path('OUT_PATH')
    log_level = env.str('LOG_LEVEL', 'INFO')
    # De-duplication defaults to a 30 percent duplication threshold.
    dedup_threshold = env.float('DEDUP_THRESHOLD', 0.3)
    log_config.configure(log_level)
    linkmerge(in_path, out_path, dedup_threshold)
def main() -> None:
    """Link location files into the output path."""
    env = environs.Env()
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    schema_index: int = env.int('SCHEMA_INDEX')
    log_config.configure(env.log_level('LOG_LEVEL', 'INFO'))
    link_location_files(location_path=location_path,
                        out_path=out_path,
                        schema_index=schema_index)
def main() -> None:
    """Order paths using the configured path indices."""
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    indices: list = env.list('PATH_INDICES')
    log_config.configure(log_level)
    logger = structlog.get_logger()
    logger.debug(f'in_path: {in_path} out_path: {out_path}')
    order_paths(in_path, out_path, indices)
def main() -> None:
    """Filter the input directory down to the configured subdirectories."""
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    filter_dirs: list = env.list('FILTER_DIR')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(log_level)
    logger = structlog.get_logger()
    logger.debug(f'in_path: {in_path} filter_dirs: {filter_dirs} out_dir: {out_path}')
    filter_directory(in_path, out_path, filter_dirs, relative_path_index)
def main() -> None:
    """Join files according to the filter configuration."""
    env = environs.Env()
    config: str = env.str('CONFIG')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(log_level)
    FilterJoiner(config=config,
                 out_path=out_path,
                 relative_path_index=relative_path_index).join()
def main() -> None:
    """Link gap-filled data files into the output path."""
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    location_index: int = env.int('LOCATION_INDEX')
    empty_file_suffix: str = env.str('EMPTY_FILE_SUFFIX')
    log_config.configure(log_level)
    DataGapFillerLinker(in_path, out_path, relative_path_index,
                        location_index, empty_file_suffix).link_files()
def main() -> None:
    """Analyze padded time series data."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(log_level)
    logger = get_logger()
    logger.debug(f'data_path: {data_path} out_path: {out_path}')
    PaddedTimeSeriesAnalyzer(data_path, out_path, relative_path_index).analyze()
def setUp(self):
    """Set required files in mock filesystem.

    Fix: the original built data_path as Path(input_root, self.input_dir, ...)
    even though self.input_dir is already absolute and contains input_root;
    pathlib discards components preceding an absolute path, so the extra
    root was silently ignored. The redundant argument is dropped — the
    resulting path is identical.
    """
    self.setUpPyfakefs()
    self.log = get_logger()
    log_config.configure('DEBUG')
    self.out_dir = Path('/tmp/out')
    input_root = Path('/tmp/in')
    site_month = Path('CPER/2019/01')
    self.input_dir = Path(input_root, site_month)
    # Data file
    self.source_file_name = 'NEON.D10.CPER.DP1.00041.001.001.501.001.ST_1_minute.2019-01.basic.20210720T001022Z.csv'
    self.target_file_name = 'NEON.DOM.SITE.DP1.00041.001/CPER/20190101T000000--20190201T000000/basic/NEON.D10.CPER.DP1.00041.001.001.501.001.ST_1_minute.2019-01.basic.20210720T001022Z.csv'
    data_path = Path(self.input_dir, self.source_file_name)
    self.fs.create_file(data_path)
def main() -> None:
    """Load named locations of the configured type from the database."""
    env = environs.Env()
    location_type: str = env.str('LOCATION_TYPE')
    db_url: str = env.str('DATABASE_URL')
    out_path: Path = env.path('OUT_PATH')
    log_config.configure(env.log_level('LOG_LEVEL', 'INFO'))
    # The connection is closed automatically when the block exits.
    with closing(connect(db_url)) as connection:
        get_locations = partial(get_named_locations,
                                connection=connection,
                                location_type=location_type)
        load_locations(out_path=out_path, get_locations=get_locations)
def main() -> None:
    """Link event asset files from the source path into the output path."""
    env = environs.Env()
    source_path: Path = env.path('SOURCE_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    source_type_index: int = env.int('SOURCE_TYPE_INDEX')
    source_id_index: int = env.int('SOURCE_ID_INDEX')
    log_config.configure(log_level)
    log.debug(f'source_path: {source_path} out_path: {out_path}')
    loader = EventAssetLoader(source_path=source_path,
                              out_path=out_path,
                              source_type_index=source_type_index,
                              source_id_index=source_id_index)
    loader.link_event_files()
def main() -> None:
    """Write asset and asset-location files from the database."""
    env = environs.Env()
    out_path: Path = env.path('OUT_PATH')
    db_url: str = env.str('DATABASE_URL')
    log_config.configure(env.log_level('LOG_LEVEL', 'INFO'))
    log.debug(f'out_path: {out_path}')
    # The connection is closed automatically when the block exits.
    with closing(connect(db_url)) as connection:
        location_asset_loader.write_files(
            get_assets=partial(get_assets, connection),
            get_asset_locations=partial(get_asset_locations, connection),
            out_path=out_path)
def main() -> None:
    """
    Link files in the data path into the output path.

    A specification file with multiple inputs will use the same
    'DATA_PATH' name to group the inputs.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(log_level)
    log.debug(f'data_path: {data_path} out_path: {out_path}')
    group_files(path=data_path,
                out_path=out_path,
                relative_path_index=relative_path_index)
def main() -> None:
    """Read files from the list of related paths and link them into the output path."""
    env = environs.Env()
    related_paths: list = env.list('RELATED_PATHS')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(log_level)
    log.debug(f'related_paths: {related_paths} out_path: {out_path}')
    # Each entry names another environment variable holding an input path.
    paths = [Path(os.environ[name]) for name in related_paths]
    join_files(related_paths=paths,
               out_path=out_path,
               relative_path_index=relative_path_index)
def main() -> None:
    """Consume messages from the configured topic into the output path."""
    env = environs.Env()
    out_path: Path = env.path('OUT_PATH')
    bootstrap_server: str = env.str('BOOTSTRAP_SERVER')
    topic: str = env.str('TOPIC')
    group_id: str = env.str('GROUP_ID')
    auto_offset_reset: str = env.str('AUTO_OFFSET_RESET')
    enable_auto_commit: bool = env.bool('ENABLE_AUTO_COMMIT')
    log_config.configure(env.log_level('LOG_LEVEL', 'INFO'))
    config = Config(out_path=out_path,
                    bootstrap_server=bootstrap_server,
                    topic=topic,
                    group_id=group_id,
                    auto_offset_reset=auto_offset_reset,
                    enable_auto_commit=enable_auto_commit,
                    is_test=False)
    run(config, open_pipe, partial(read_messages, config))
def main() -> None:
    """Upload data files for egress using environment and CLI settings."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    log_config.configure(log_level)
    logger = get_logger()
    logger.debug(f'data_dir: {data_path}')
    logger.debug(f'out_dir: {out_path}')
    parser = argparse.ArgumentParser()
    for option in ('--outputname', '--dateindex', '--locindex'):
        parser.add_argument(option)
    args = parser.parse_args()
    Egress(data_path, out_path, args.outputname,
           int(args.dateindex), int(args.locindex)).upload()
def main() -> None:
    """Filter files by the configured context."""
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    context: str = env.str('CONTEXT')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    trim_index: int = env.int('TRIM_INDEX')
    source_id_index: int = env.int('SOURCE_ID_INDEX')
    data_type_index: int = env.int('DATA_TYPE_INDEX')
    log_config.configure(log_level)
    logger = structlog.get_logger()
    logger.debug(f'in_path: {in_path} out_path: {out_path} context: {context}')
    config = Config(in_path=in_path,
                    out_path=out_path,
                    context=context,
                    trim_index=trim_index,
                    source_id_index=source_id_index,
                    data_type_index=data_type_index)
    ContextFilter(config).filter_files()
def main() -> None:
    """
    Link input paths into the output path.
    """
    env = environs.Env()
    related_paths: list = env.list('RELATED_PATHS')
    data_path: Path = env.path('DATA_PATH')
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    relative_path_index: int = env.int('RELATIVE_PATH_INDEX')
    year_index: int = env.int('YEAR_INDEX')
    loc_index: int = env.int('LOC_INDEX')
    grouploc_key: str = env.str('GROUPLOC_KEY')
    log_config.configure(log_level)
    log.debug(f'related_paths: {related_paths} out_path: {out_path}')
    # Each entry names another environment variable holding an input path.
    paths = [Path(os.environ[name]) for name in related_paths]
    location_group(related_paths=paths,
                   data_path=data_path,
                   location_path=location_path,
                   out_path=out_path,
                   relative_path_index=relative_path_index,
                   year_index=year_index,
                   loc_index=loc_index,
                   grouploc_key=grouploc_key)
def main() -> None:
    """Merge parquet files, de-duplicating above the configured threshold."""
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Defaults to a 30 percent duplication threshold.
    duplication_threshold: float = env.float('DEDUPLICATION_THRESHOLD', 0.3)
    # Path-element indexes; the env var is the upper-cased kwarg name.
    indices = {name: env.int(name.upper())
               for name in ('source_type_index', 'year_index', 'month_index',
                            'day_index', 'source_id_index')}
    log_config.configure(log_level)
    config = Config(in_path=in_path,
                    out_path=out_path,
                    duplication_threshold=duplication_threshold,
                    **indices)
    ParquetFileMerger(config).merge()
def main() -> None:
    """Pad time series data using a constant window size."""
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Integer settings; the env var is the upper-cased kwarg name.
    settings = {name: env.int(name.upper())
                for name in ('window_size', 'year_index', 'month_index',
                             'day_index', 'location_index', 'data_type_index',
                             'relative_path_index')}
    log_config.configure(log_level)
    config = Config(data_path=data_path, out_path=out_path, **settings)
    ConstantPad(config).pad()
def main() -> None:
    """Configure logging and run the transform for the configured year index."""
    env = environs.Env()
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    year_index: int = env.int('YEAR_INDEX')
    log_config.configure(log_level)
    transform(year_index=year_index)