def test_attributes_to_clickhouse(ya_direct_campaigns_to_clickhouse_notebook):
    from flowmaster.operators.etl.core import ETLOperator

    etl_flow = ETLOperator(ya_direct_campaigns_to_clickhouse_notebook)
    etl_flow.dry_run(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1), max_pages=2)
def ordering_etl_flow_tasks(*, dry_run: bool = False) -> Iterator[ExecutorIterationTask]:
    """Prepare flow function to be sent to the queue and executed"""
    # TODO: get rid of this function; rework so that a single function handles ordering.
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.policy import ETLNotebook

    for name in iter_active_notebook_filenames():
        validate, text, notebook_dict, notebook, error = get_notebook(name)
        notebook: ETLNotebook

        if dry_run:
            if notebook.provider != "fakedata":
                continue

        if not validate:
            logger.error("ValidationError: '{}': {}", name, error)
            continue

        work = Work(notebook)

        for start_period, end_period in work.iter_period_for_execute():
            flow = ETLOperator(notebook)
            etl_flow_task = flow.task(start_period, end_period, dry_run=dry_run)

            with prepare_items_for_order(flow, start_period, end_period):
                logger.info(
                    "Order ETL flow [{}]: {} {}", notebook.name, start_period, end_period
                )
                yield etl_flow_task
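# A minimal consumption sketch (an assumption, not part of the original suite):
# the ExecutorIterationTask objects yielded above are treated as iterables of
# flow steps, matching how the tests below drain tasks with list(task).
# The helper name drain_ordered_tasks is hypothetical.
def drain_ordered_tasks(dry_run: bool = True) -> int:
    executed = 0
    for etl_flow_task in ordering_etl_flow_tasks(dry_run=dry_run):
        list(etl_flow_task)  # run every step of the flow task
        executed += 1
    return executed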
def test_reports_to_csv(ya_direct_report_to_csv_notebook):
    from flowmaster.operators.etl.core import ETLOperator

    etl_flow = ETLOperator(ya_direct_report_to_csv_notebook)
    etl_flow.dry_run(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1), max_pages=2)
def test_flow(ya_metrika_logs_to_csv_notebook):
    from flowmaster.operators.etl.dataschema import ExportContext
    from flowmaster.operators.etl.providers import Providers
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.enums import DataOrient

    def export_func(start_period, end_period, **kwargs) -> Iterator[ExportContext]:
        yield ExportContext(
            columns=["col1"], data=[[start_period]], data_orient=DataOrient.values
        )
        yield ExportContext(
            columns=["col1"], data=[[end_period]], data_orient=DataOrient.values
        )

    # Stub the provider's export step so the flow runs without network access.
    Providers.YandexMetrikaLogsProvider.export_class.__call__ = Mock(
        side_effect=export_func
    )

    ya_metrika_logs_to_csv_notebook.load.file_name = f"{test_flow.__name__}.tsv"
    ya_metrika_logs_to_csv_notebook.load.with_columns = True

    flow = ETLOperator(ya_metrika_logs_to_csv_notebook)
    flow.dry_run(start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 2))
    list(
        flow.task(start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 2))
    )
def test_flow_postgres_to_csv(postgres_to_csv_notebook):
    etl_flow = ETLOperator(postgres_to_csv_notebook)
    etl_flow.dry_run(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1))

    with etl_flow.Load.open_file(mode="r") as loadfile:
        data = loadfile.read()

    # fmt: off
    assert data == '''id\tkey
def test_flow_flowmaster_items(flowmasterdata_items_to_csv_notebook):
    etl_flow = ETLOperator(flowmasterdata_items_to_csv_notebook)
    task = etl_flow.task(dt.datetime(2021, 2, 5), dt.datetime(2021, 2, 5))
    list(task)

    with etl_flow.Load.open_file(mode="r") as loadfile:
        data = loadfile.readlines()

    count_items = len(
        [row for row in data if flowmasterdata_items_to_csv_notebook.name in row]
    )

    assert count_items == 1
def order_task(*args, **kwargs) -> Iterator[ExecutorIterationTask]:
    worktimes = [dt.datetime(2021, 1, i + 1) for i in range(count_task)]
    pools.append_pools({"two": pool_size})

    for worktime in worktimes:
        ya_metrika_logs_to_csv_notebook.load.file_name = f"{test_pools.__name__}.tsv"
        ya_metrika_logs_to_csv_notebook.export.pools = ["two"]
        flow = ETLOperator(ya_metrika_logs_to_csv_notebook)
        task = flow.task(start_period=worktime, end_period=worktime)
        yield task
def order_task(*args, **kwargs) -> Iterator[ExecutorIterationTask]:
    worktimes = [dt.datetime(2021, 1, i + 1) for i in range(count_task)]

    for worktime in worktimes:
        ya_metrika_logs_to_csv_notebook.load.file_name = (
            f"{test_concurrency.__name__}.tsv"
        )
        ya_metrika_logs_to_csv_notebook.work.concurrency = concurrency
        ya_metrika_logs_to_csv_notebook.export.concurrency = 4
        ya_metrika_logs_to_csv_notebook.transform.concurrency = 4
        ya_metrika_logs_to_csv_notebook.load.concurrency = 4
        flow = ETLOperator(ya_metrika_logs_to_csv_notebook)
        yield flow.task(start_period=worktime, end_period=worktime)
def test_flow_csv_to_csv_with_columns(csv_to_csv_with_columns_notebook):
    etl_flow = ETLOperator(csv_to_csv_with_columns_notebook)
    etl_flow.dry_run(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1))

    with etl_flow.Load.open_file(mode="r") as loadfile:
        data = loadfile.readlines()

    assert data == [
        "col1\tcol2\n",
        '"1"\tnull\n',
        '"1"\t"2"\n',
        '"1"\t"2"\n',
        '"1"\t"2"\n',
        '"1"\t"2"\n',
    ]
def test(google_sheets_to_csv_notebook):
    flow = ETLOperator(google_sheets_to_csv_notebook)
    list(flow(dt.datetime(2021, 7, 27), dt.datetime(2021, 7, 27)))

    with flow.Load.open_file() as file:
        text = file.read()

    assert text == (
        'col2\tdate\tcol1\n'
        '"2"\t"2021-01-01"\t"1"\n'
        '"2"\t"2021-01-01"\tnull\n'
        '"2"\tnull\tnull\n'
    )
def order_task(*args, **kwargs):
    count_flows = 4
    worktimes = [pendulum.datetime(2021, 1, i + 1) for i in range(count_flows)]

    for worktime in worktimes:
        NOTEBOOK.load.file_name = (
            f"{test_thread_executor_yandex_metrika_logs.__name__}.tsv"
        )
        flow = ETLOperator(NOTEBOOK)
        generator = flow(start_period=worktime, end_period=worktime)
        yield ExecutorIterationTask(generator)
def test_flow_flowmasterdata_pools(
    flowmasterdata_items_to_csv_notebook, flowmasterdata_pools_export_policy
):
    flowmasterdata_items_to_csv_notebook.export = flowmasterdata_pools_export_policy
    etl_flow = ETLOperator(flowmasterdata_items_to_csv_notebook)
    etl_flow.dry_run(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1))

    with etl_flow.Load.open_file(mode="r") as loadfile:
        data = loadfile.readlines()

    assert [
        row
        for row in data
        if "____test_flowmasterdata_items_to_csv___export_concurrency__" in row
    ]
    assert [
        row
        for row in data
        if "____test_flowmasterdata_items_to_csv___transform_concurrency__" in row
    ]
    assert [
        row
        for row in data
        if "____test_flowmasterdata_items_to_csv___load_concurrency__" in row
    ]
    assert [row for row in data if "name\tsize\tlimit\tdatetime" in row]
def test_jinja_template(ya_metrika_logs_to_csv_notebook):
    ya_metrika_logs_to_csv_notebook.name = "flow"
    ya_metrika_logs_to_csv_notebook.load.file_name = (
        "{{name}} {{provider}} {{storage}} {{ datetime.date() }}.tsv"
    )
    ya_metrika_logs_to_csv_notebook.load.add_data_before = (
        "{{name}} {{provider}} {{storage}} {{ datetime.date() }}.tsv"
    )
    ya_metrika_logs_to_csv_notebook.load.add_data_after = (
        "{{name}} {{provider}} {{storage}} {{ datetime.date() }}.tsv"
    )

    flow = ETLOperator(ya_metrika_logs_to_csv_notebook)

    assert flow.Load.file_name == "flow yandex_metrika_logs csv 2021-01-01.tsv"
    assert flow.Load.add_data_before == "flow yandex_metrika_logs csv 2021-01-01.tsv"
    assert flow.Load.add_data_after == "flow yandex_metrika_logs csv 2021-01-01.tsv"
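# A minimal sketch (an assumption, not the library's actual renderer) of the
# substitution test_jinja_template relies on, using plain jinja2. The context
# values are illustrative and render_load_field is a hypothetical helper.
import datetime as dt
from jinja2 import Template

def render_load_field(template: str) -> str:
    return Template(template).render(
        name="flow",
        provider="yandex_metrika_logs",
        storage="csv",
        datetime=dt.datetime(2021, 1, 1),
    )

assert render_load_field(
    "{{name}} {{provider}} {{storage}} {{ datetime.date() }}.tsv"
) == "flow yandex_metrika_logs csv 2021-01-01.tsv"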
def test_codex_telegram():
    def export_func(start_period, end_period) -> Iterator[tuple[dict, list, list]]:
        yield ({}, ["date"], [[start_period]])

    ya_metrika_logs_to_csv_notebook.work.notifications = (
        ETLNotebook.WorkPolicy.NotificationsPolicy(
            codex_telegram=ETLNotebook.WorkPolicy.NotificationsPolicy.CodexTelegramPolicy(
                links=[credentials["codex_telegram"]],
                on_success=True,
            )
        )
    )
    notebook = ETLNotebook(**dict(ya_metrika_logs_to_csv_notebook))

    # Stub the provider's export step so no real API call is made.
    Providers.YandexMetrikaLogsProvider.export_class.__call__ = Mock(
        side_effect=export_func
    )

    etl_flow = ETLOperator(notebook)
    list(
        etl_flow(start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 1))
    )
def test(criteo_to_csv_notebook):
    flow = ETLOperator(criteo_to_csv_notebook)
    flow.dry_run(dt.datetime(2021, 7, 27), dt.datetime(2021, 7, 27))

    with flow.Load.open_file() as file:
        assert file.read() == 'Day\tClicks\n"2021-07-27"\t"1927"\n'
def test_flow_fakedata(fakedata_to_csv_notebook):
    from flowmaster.operators.etl.core import ETLOperator

    etl_flow = ETLOperator(fakedata_to_csv_notebook)
    etl_flow.dry_run(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1))