def test_attributes_to_csv():
    """Run the Yandex.Direct campaigns -> CSV flow end to end for a single day."""
    ya_direct_campaigns_to_csv_config.export.credentials = yandex_direct_credentials
    flow = ETLOperator(ETLFlowConfig(**ya_direct_campaigns_to_csv_config.dict()))
    worktime = dt.datetime(2021, 2, 1)
    # Drain the generator so the whole pipeline actually executes.
    list(flow(worktime, worktime, max_pages=2))
def test_reports_to_clickhouse():
    """Run the Yandex.Direct report -> ClickHouse flow end to end for a single day."""
    ya_direct_report_to_clickhouse_config.export.credentials = yandex_direct_credentials
    ya_direct_report_to_clickhouse_config.load.credentials = clickhouse_credentials
    flow = ETLOperator(ETLFlowConfig(**ya_direct_report_to_clickhouse_config.dict()))
    worktime = dt.datetime(2021, 2, 1)
    # Drain the generator so the whole pipeline actually executes.
    list(flow(worktime, worktime, max_pages=2))
def test_flow_sqlite_to_csv(sqlite_to_csv_config): etl_flow = ETLOperator(sqlite_to_csv_config) list(etl_flow(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1))) with etl_flow.Load.open_file(mode="r") as loadfile: data = loadfile.read() # fmt: off assert data == '''id\tkey
def order_task(*args, **kwargs):
    """Yield one ETL flow generator per day for four consecutive days (2021-01-01..04)."""
    for day in range(1, 5):
        # File name is constant across iterations, but is (re)assigned each
        # pass to mirror the config mutation the flow expects.
        CONFIG.load.file_name = (
            f"{test_thread_executor_yandex_metrika_logs.__name__}.tsv"
        )
        period = dt.datetime(2021, 1, day)
        yield ETLOperator(CONFIG)(start_period=period, end_period=period)
def order_task(*args, **kwargs):
    """Yield one async-mode ETL flow generator per day for four consecutive days."""
    for day in range(1, 5):
        yml_visits_to_csv_config.load.file_name = f"{test_executor.__name__}.tsv"
        period = dt.datetime(2021, 1, day)
        flow = ETLOperator(yml_visits_to_csv_config)
        yield flow(start_period=period, end_period=period, async_mode=True)
def test_jinja_template():
    """Check that the load-policy fields render their Jinja placeholders."""
    template = "{{name}} {{provider}} {{storage}} {{ datetime.date() }}.tsv"
    rendered = "flow yandex_metrika_logs csv 2021-01-01.tsv"
    yml_visits_to_csv_config.name = "flow"
    yml_visits_to_csv_config.load.file_name = template
    yml_visits_to_csv_config.load.add_data_before = template
    yml_visits_to_csv_config.load.add_data_after = template
    flow = ETLOperator(yml_visits_to_csv_config)
    # All three fields use the same template, so they must render identically.
    assert flow.Load.file_name == rendered
    assert flow.Load.add_data_before == rendered
    assert flow.Load.add_data_after == rendered
def test_flow_csv_to_csv_with_columns(config_csv_to_csv_with_columns):
    """Run the CSV -> CSV flow and verify the written rows, header included."""
    flow = ETLOperator(config_csv_to_csv_with_columns)
    day = dt.datetime(2021, 2, 1)
    list(flow(day, day))
    with flow.Load.open_file(mode="r") as fh:
        lines = fh.readlines()
    expected = ["col1\tcol2\n", '"1"\tnull\n'] + ['"1"\t"2"\n'] * 4
    assert lines == expected
def test_flow():
    """Run the flow with a mocked export yielding one row per period bound."""

    def fake_export(start_period, end_period) -> Iterator[tuple[dict, list, list]]:
        # Emit one single-cell row for each end of the requested period.
        for value in (start_period, end_period):
            yield ExportContext(
                columns=["col1"], data=[[value]], data_orient=DataOrient.values
            )

    YandexMetrikaLogsExport.__call__ = Mock(side_effect=fake_export)
    yml_visits_to_csv_config.load.file_name = f"{test_flow.__name__}.tsv"
    yml_visits_to_csv_config.load.with_columns = True
    flow = ETLOperator(yml_visits_to_csv_config)
    list(flow(start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 2)))
def test_codex_telegram():
    """Run the flow with success notifications routed to Codex Telegram."""

    def fake_export(start_period, end_period) -> Iterator[tuple[dict, list, list]]:
        yield ({}, ["date"], [[start_period]])

    notifications_policy = ETLFlowConfig.WorkPolicy.NotificationsPolicy
    yml_visits_to_csv_config.work.notifications = notifications_policy(
        codex_telegram=notifications_policy.CodexTelegramPolicy(
            links=[credentials["codex_telegram"]],
            on_success=True,
        )
    )
    config = ETLFlowConfig(**dict(yml_visits_to_csv_config))
    YandexMetrikaLogsExport.__call__ = Mock(side_effect=fake_export)
    flow = ETLOperator(config)
    day = dt.datetime(2021, 1, 1)
    list(flow(start_period=day, end_period=day))
def order_etl_flow(
    *, logger: Logger, async_mode: bool = False, dry_run: bool = False
) -> Iterator:
    """Prepare flow function to be sent to the queue and executed"""
    # Imported lazily to avoid import cycles at module load time.
    from flowmaster.operators.etl.service import ETLOperator
    from flowmaster.operators.etl.policy import ETLFlowConfig

    for file_name, raw_config in YamlHelper.iter_parse_file_from_dir(
        FLOW_CONFIGS_DIR, match=".etl.flow"
    ):
        # In dry-run mode only synthetic "fakedata" providers are scheduled.
        if dry_run and raw_config.get("provider") != "fakedata":
            continue

        try:
            flow_config = ETLFlowConfig(name=file_name, **raw_config)
        except pydantic.ValidationError as exc:
            logger.error("ValidationError: '%s': %s", file_name, exc)
            continue
        except Exception as exc:
            logger.error("Error: '%s': %s", file_name, exc)
            continue

        work = ETLWork(flow_config)
        for start_period, end_period in work.iter_period_for_execute():
            operator = ETLOperator(flow_config)
            flow_iterator = operator(
                start_period, end_period, async_mode=async_mode, dry_run=dry_run
            )
            # The status is changed so that there is no repeated ordering of tasks.
            FlowItem.change_status(
                operator.name,
                new_status=FlowStatus.run,
                from_time=start_period,
                to_time=end_period,
            )
            logger.info(
                "Order ETL flow [%s]: %s %s", operator.name, start_period, end_period
            )
            yield flow_iterator
def test_flow_fakedata():
    """Run the fakedata -> CSV flow end to end for a single day."""
    flow = ETLOperator(ETLFlowConfig(**dict(fakedata_to_csv_config)))
    day = dt.datetime(2021, 2, 1)
    # Drain the generator so the whole pipeline actually executes.
    list(flow(day, day))