# Example 1
def test_attributes_to_csv():
    # Wire real Yandex.Direct credentials into the shared export config,
    # then run the campaigns->CSV flow for a single day, capped at two pages.
    ya_direct_campaigns_to_csv_config.export.credentials = yandex_direct_credentials
    config = ETLFlowConfig(**ya_direct_campaigns_to_csv_config.dict())
    operator = ETLOperator(config)
    worktime = dt.datetime(2021, 2, 1)
    list(operator(worktime, worktime, max_pages=2))
# Example 2
def test_reports_to_clickhouse():
    # Attach live credentials on both sides of the flow — Yandex.Direct for
    # export, ClickHouse for load — then execute it for one day (max 2 pages).
    ya_direct_report_to_clickhouse_config.export.credentials = yandex_direct_credentials
    ya_direct_report_to_clickhouse_config.load.credentials = clickhouse_credentials
    config = ETLFlowConfig(**ya_direct_report_to_clickhouse_config.dict())
    operator = ETLOperator(config)
    day = dt.datetime(2021, 2, 1)
    list(operator(day, day, max_pages=2))
def test_flow_sqlite_to_csv(sqlite_to_csv_config):
    # Run the SQLite -> CSV flow for a single day, then read back the file
    # produced by the Load stage and compare it to the expected TSV payload.
    etl_flow = ETLOperator(sqlite_to_csv_config)
    list(etl_flow(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1)))

    with etl_flow.Load.open_file(mode="r") as loadfile:
        data = loadfile.read()

    # fmt: off
    # NOTE(review): the expected-data literal below is truncated in this
    # paste — the triple-quoted string never closes. Restore the full
    # fixture text before running this test.
    assert data == '''id\tkey
    def order_task(*args, **kwargs):
        # Yield one (not yet consumed) flow generator per day for the first
        # four days of January 2021.
        num_days = 4
        for day_offset in range(num_days):
            worktime = dt.datetime(2021, 1, day_offset + 1)
            CONFIG.load.file_name = (
                f"{test_thread_executor_yandex_metrika_logs.__name__}.tsv")
            flow = ETLOperator(CONFIG)
            yield flow(start_period=worktime, end_period=worktime)
    def order_task(*args, **kwargs):
        # Yield one async-mode flow generator per day, 2021-01-01..2021-01-04.
        total = 4
        periods = [dt.datetime(2021, 1, day + 1) for day in range(total)]
        for period in periods:
            yml_visits_to_csv_config.load.file_name = f"{test_executor.__name__}.tsv"
            operator = ETLOperator(yml_visits_to_csv_config)
            yield operator(start_period=period,
                           end_period=period,
                           async_mode=True)
# Example 6
def test_jinja_template():
    # Each templated Load attribute must be rendered from the flow's jinja
    # context: flow name, provider, storage, and the flow datetime.
    template = "{{name}} {{provider}} {{storage}} {{ datetime.date() }}.tsv"
    rendered = "flow yandex_metrika_logs csv 2021-01-01.tsv"

    yml_visits_to_csv_config.name = "flow"
    yml_visits_to_csv_config.load.file_name = template
    yml_visits_to_csv_config.load.add_data_before = template
    yml_visits_to_csv_config.load.add_data_after = template

    flow = ETLOperator(yml_visits_to_csv_config)

    assert flow.Load.file_name == rendered
    assert flow.Load.add_data_before == rendered
    assert flow.Load.add_data_after == rendered
# Example 7
def test_flow_csv_to_csv_with_columns(config_csv_to_csv_with_columns):
    # CSV -> CSV flow with explicit columns: run one day, then check the
    # written file line by line — a header row plus five data rows, where
    # only the first data row carries a null second column.
    operator = ETLOperator(config_csv_to_csv_with_columns)
    day = dt.datetime(2021, 2, 1)
    list(operator(day, day))

    with operator.Load.open_file(mode="r") as fh:
        lines = fh.readlines()

    expected = ["col1\tcol2\n", '"1"\tnull\n'] + ['"1"\t"2"\n'] * 4
    assert lines == expected
def test_flow():
    # Stub the export step with a generator that yields two single-row
    # contexts — one for the period start and one for the period end.
    def fake_export(start_period, end_period) -> Iterator[tuple[dict, list, list]]:
        for moment in (start_period, end_period):
            yield ExportContext(
                columns=["col1"], data=[[moment]], data_orient=DataOrient.values
            )

    YandexMetrikaLogsExport.__call__ = Mock(side_effect=fake_export)

    yml_visits_to_csv_config.load.file_name = f"{test_flow.__name__}.tsv"
    yml_visits_to_csv_config.load.with_columns = True

    flow = ETLOperator(yml_visits_to_csv_config)

    list(flow(start_period=dt.datetime(2021, 1, 1), end_period=dt.datetime(2021, 1, 2)))
def test_codex_telegram():
    """A flow with on-success codex-telegram notifications runs without error.

    The export step is stubbed; the point of the test is that a config
    carrying a notifications policy can be rebuilt and iterated cleanly.
    """

    def export_func(start_period,
                    end_period) -> Iterator[tuple[dict, list, list]]:
        # Single fake page: one "date" column holding the period start.
        yield ({}, ["date"], [[start_period]])

    yml_visits_to_csv_config.work.notifications = ETLFlowConfig.WorkPolicy.NotificationsPolicy(
        codex_telegram=ETLFlowConfig.WorkPolicy.NotificationsPolicy.
        CodexTelegramPolicy(
            links=[credentials["codex_telegram"]],
            on_success=True,
        ))
    # Use pydantic's recursive .dict() rather than shallow dict(model) so the
    # nested notifications policy is serialized the same way as in the other
    # tests that rebuild a config (see test_attributes_to_csv).
    config = ETLFlowConfig(**yml_visits_to_csv_config.dict())

    YandexMetrikaLogsExport.__call__ = Mock(side_effect=export_func)
    etl_flow = ETLOperator(config)

    list(
        etl_flow(start_period=dt.datetime(2021, 1, 1),
                 end_period=dt.datetime(2021, 1, 1)))
# Example 10
def order_etl_flow(
    *, logger: Logger, async_mode: bool = False, dry_run: bool = False
) -> Iterator:
    """Yield a ready-to-run flow iterator for every due period.

    Walks the flow-config directory, validates each ``.etl.flow`` config,
    and for each period its work policy reports as due: builds an
    ``ETLOperator``, marks the matching ``FlowItem`` as running (so the same
    period is not ordered twice), logs the order, and yields the flow's
    iterator for the executor to consume.
    """
    from flowmaster.operators.etl.service import ETLOperator
    from flowmaster.operators.etl.policy import ETLFlowConfig

    for file_name, config in YamlHelper.iter_parse_file_from_dir(
        FLOW_CONFIGS_DIR, match=".etl.flow"
    ):
        # In dry-run mode only synthetic "fakedata" flows are ordered.
        if dry_run and config.get("provider") != "fakedata":
            continue

        try:
            flow_config = ETLFlowConfig(name=file_name, **config)
        except pydantic.ValidationError as exc:
            # Invalid config: report and move on to the next file.
            logger.error("ValidationError: '%s': %s", file_name, exc)
            continue
        except Exception as exc:
            logger.error("Error: '%s': %s", file_name, exc)
            continue

        work = ETLWork(flow_config)

        for start_period, end_period in work.iter_period_for_execute():
            operator = ETLOperator(flow_config)
            flow_iterator = operator(
                start_period, end_period, async_mode=async_mode, dry_run=dry_run
            )

            # The status is changed so that there is no repeated ordering of tasks.
            FlowItem.change_status(
                operator.name,
                new_status=FlowStatus.run,
                from_time=start_period,
                to_time=end_period,
            )
            logger.info(
                "Order ETL flow [%s]: %s %s", operator.name, start_period, end_period
            )

            yield flow_iterator
def test_flow_fakedata():
    """The fakedata -> CSV flow builds and runs for a single day without error."""
    # Use pydantic's recursive .dict() rather than shallow dict(model), for
    # consistency with the other tests that rebuild a config from an
    # existing one (e.g. test_attributes_to_csv).
    config = ETLFlowConfig(**fakedata_to_csv_config.dict())
    etl_flow = ETLOperator(config)
    list(etl_flow(dt.datetime(2021, 2, 1), dt.datetime(2021, 2, 1)))