def ordering_etl_flow_tasks(*, dry_run: bool = False) -> Iterator[ExecutorIterationTask]:
    """Prepare flow functions to be sent to the queue and executed.

    Iterates over all active notebook files, validates each one, and yields
    an :class:`ExecutorIterationTask` for every work period that is due.

    :param dry_run: when True, only notebooks whose provider is "fakedata"
        are ordered (used to exercise the scheduling pipeline in tests).
    """
    # TODO: get rid of this function; rework it so that a single function
    #  is responsible for ordering.
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.policy import ETLNotebook

    for name in iter_active_notebook_filenames():
        # Only the validation flag, the parsed notebook and the error are
        # needed here; the raw text and dict representations are discarded.
        validate, _, _, notebook, error = get_notebook(name)
        notebook: ETLNotebook

        # In dry-run mode, skip everything except the fake-data provider.
        if dry_run and notebook.provider != "fakedata":
            continue

        if not validate:
            logger.error("ValidationError: '{}': {}", name, error)
            continue

        work = Work(notebook)
        for start_period, end_period in work.iter_period_for_execute():
            flow = ETLOperator(notebook)
            etl_flow_task = flow.task(start_period, end_period, dry_run=dry_run)
            with prepare_items_for_order(flow, start_period, end_period):
                logger.info(
                    "Order ETL flow [{}]: {} {}", notebook.name, start_period, end_period
                )
                yield etl_flow_task
def test_flow(ya_metrika_logs_to_csv_notebook):
    """Smoke-test a full ETL flow run using a mocked Yandex Metrika export."""
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.dataschema import ExportContext
    from flowmaster.operators.etl.enums import DataOrient
    from flowmaster.operators.etl.providers import Providers

    def fake_export(start_period, end_period, **kwargs) -> Iterator[tuple[dict, list, list]]:
        # Emit one single-value row per period boundary so the load step
        # has data to write out.
        for period in (start_period, end_period):
            yield ExportContext(
                columns=["col1"], data=[[period]], data_orient=DataOrient.values
            )

    Providers.YandexMetrikaLogsProvider.export_class.__call__ = Mock(
        side_effect=fake_export
    )

    ya_metrika_logs_to_csv_notebook.load.file_name = f"{test_flow.__name__}.tsv"
    ya_metrika_logs_to_csv_notebook.load.with_columns = True

    operator = ETLOperator(ya_metrika_logs_to_csv_notebook)
    operator.dry_run(
        start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 2)
    )
    # Drain the task generator so the whole export/transform/load chain runs.
    list(
        operator.task(
            start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 2)
        )
    )
def test_flow_flowmaster_items(flowmasterdata_items_to_csv_notebook):
    """Run the flowmasterdata-items flow once and check exactly one matching row lands in the file."""
    operator = ETLOperator(flowmasterdata_items_to_csv_notebook)
    worktime = dt.datetime(2021, 2, 5)
    # Drain the generator so the task actually executes.
    list(operator.task(worktime, worktime))

    with operator.Load.open_file(mode="r") as loadfile:
        lines = loadfile.readlines()

    matches = sum(
        1 for line in lines if flowmasterdata_items_to_csv_notebook.name in line
    )
    assert matches == 1
def order_task(*args, **kwargs) -> Iterator[ExecutorIterationTask]:
    """Yield one ETL task per synthetic worktime, all bound to the "two" pool."""
    pools.append_pools({"two": pool_size})
    for day in range(1, count_task + 1):
        worktime = dt.datetime(2021, 1, day)
        ya_metrika_logs_to_csv_notebook.load.file_name = f"{test_pools.__name__}.tsv"
        ya_metrika_logs_to_csv_notebook.export.pools = ["two"]
        flow = ETLOperator(ya_metrika_logs_to_csv_notebook)
        yield flow.task(start_period=worktime, end_period=worktime)
def order_task(*args, **kwargs) -> Iterator[ExecutorIterationTask]:
    """Yield one ETL task per synthetic worktime with per-stage concurrency limits applied."""
    notebook = ya_metrika_logs_to_csv_notebook
    for day in range(1, count_task + 1):
        worktime = dt.datetime(2021, 1, day)
        notebook.load.file_name = f"{test_concurrency.__name__}.tsv"
        # Overall work concurrency comes from the test parameter; each
        # stage gets a fixed limit of 4.
        notebook.work.concurrency = concurrency
        notebook.export.concurrency = 4
        notebook.transform.concurrency = 4
        notebook.load.concurrency = 4
        task = ETLOperator(notebook).task(start_period=worktime, end_period=worktime)
        yield task