Example #1
    def ewah_execute(self, context):
        time_range = {
            "since": self.data_from.strftime("%Y-%m-%d"),
            "until": self.data_until.strftime("%Y-%m-%d"),
        }

        FacebookAdsApi.init(**self.credentials)
        params = {
            "time_range": time_range,
            "time_increment": self.time_increment,
            "level": self.level,
            "limit": self.pagination_limit,
        }
        if self.breakdowns:
            params.update({"breakdowns": ",".join(self.breakdowns)})

        for account_id in self.account_ids:
            if self.execution_waittime_seconds:
                self.log.info("Delaying execution by {0} seconds...".format(
                    str(self.execution_waittime_seconds), ))
                now = datetime_utcnow_with_tz()
                while datetime_utcnow_with_tz() < (now + timedelta(
                        seconds=self.execution_waittime_seconds)):
                    time.sleep(1)

            account_object = AdAccount("act_{0}".format(str(account_id)))
            self.log.info(
                "Requesting data for account_id={0} between {1} and {2}.".format(
                    str(account_id),
                    time_range["since"],
                    time_range["until"],
                ))

            async_job = account_object.get_insights_async(
                fields=self.insight_fields,
                params=params,
            )
            job_remote_read = async_job.api_get()
            done_status = [
                "Job Completed",
                "Job Failed",
                "Job Skipped",
            ]
            while job_remote_read.get("async_status") not in done_status:
                self.log.info(
                    "Asnyc job completion: {0}% (status: {1})".format(
                        str(job_remote_read.get("async_percent_completion")),
                        str(job_remote_read.get("async_status")),
                    ))
                time.sleep(self.async_job_read_frequency_seconds)
                job_remote_read = async_job.api_get()

            time.sleep(1)
            assert job_remote_read.get("async_status") == "Job Completed"
            data = self._clean_response_data(
                async_job.get_result(params={"limit": self.pagination_limit})
            )
            self.upload_data(data)
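
The heart of this example is the Facebook Marketing API's asynchronous reporting flow: start the insights job, poll api_get() until async_status reaches a terminal value, then page through get_result(). Below is a minimal standalone sketch of that flow; the access token, account id, fields and dates are placeholders, not values taken from EWAH.

import time

from facebook_business.api import FacebookAdsApi
from facebook_business.adobjects.adaccount import AdAccount

# Placeholder credentials and account id
FacebookAdsApi.init(access_token="<access_token>")
account = AdAccount("act_<account_id>")

# Start the async insights job (placeholder fields and time range)
async_job = account.get_insights_async(
    fields=["impressions", "spend"],
    params={"time_range": {"since": "2023-01-01", "until": "2023-01-31"}},
)

# Poll until the job reaches a terminal status, then fetch the paginated result
done_status = ["Job Completed", "Job Failed", "Job Skipped"]
job_status = async_job.api_get()
while job_status.get("async_status") not in done_status:
    time.sleep(5)
    job_status = async_job.api_get()

assert job_status.get("async_status") == "Job Completed"
rows = list(async_job.get_result(params={"limit": 1000}))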
Example #2
    def ewah_execute(self, context):
        data_from = self.data_from or context["dag"].start_date
        data_until = self.data_until or datetime_utcnow_with_tz()

        format_str = "%Y-%m-%d"
        currency_str = "{0}{1}=X".format(*self.currency_pair)
        data = YahooFinancials([currency_str]).get_historical_price_data(
            data_from.strftime(format_str),
            data_until.strftime(format_str),
            self.frequency,
        )
        self.upload_data(data[currency_str]["prices"])
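
For context, the same call outside of the operator looks as follows; the ticker and date range are placeholders, and the response layout noted in the comments is an assumption based on the yahoofinancials package rather than on EWAH itself.

from yahoofinancials import YahooFinancials

currency_str = "EURUSD=X"  # i.e. "{0}{1}=X".format(*("EUR", "USD"))
data = YahooFinancials([currency_str]).get_historical_price_data(
    "2023-01-01",  # start date, "%Y-%m-%d"
    "2023-01-31",  # end date, "%Y-%m-%d"
    "daily",       # frequency
)

# data[currency_str]["prices"] is expected to be a list of dicts (one per
# trading day); this is the list the operator hands to self.upload_data().
prices = data[currency_str]["prices"]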
Example #3
    def execute(self, context):
        """Why this method is defined here:
        When executing a task, airflow calls this method. Generally, this
        method contains the "business logic" of the individual operator.
        However, EWAH may want to do some actions for all operators. Thus,
        the child operators shall have an ewah_execute() function which is
        called by this general execute() method.
        """

        # required for metadata in data upload
        self._execution_time = datetime_utcnow_with_tz()
        self._context = context

        self.uploader = self.uploader(
            EWAHBaseHook.get_connection(self.dwh_conn_id))

        if self.source_conn_id:
            # resolve conn id here & delete the object to avoid usage elsewhere
            self.source_conn = EWAHBaseHook.get_connection(self.source_conn_id)
            self.source_hook = self.source_conn.get_hook()
        del self.source_conn_id

        if self._CONN_TYPE:
            _msg = "Error - connection type must be {0}!".format(
                self._CONN_TYPE)
            assert self._CONN_TYPE == self.source_conn.conn_type, _msg

        temp_schema_name = self.target_schema_name + self.target_schema_suffix
        # Create a new copy of the target table.
        # This way, data is loaded into a new table and, if loading fails,
        # the original data is not corrupted. On a retry or re-run, the
        # original table is simply copied anew.
        if self.extract_strategy != EC.ES_FULL_REFRESH:
            # Full refresh always drops and replaces the tables completely
            self.uploader.copy_table(
                old_schema=self.target_schema_name,
                old_table=self.target_table_name,
                new_schema=temp_schema_name,
                new_table=self.target_table_name,
                database_name=self.target_database_name,
            )

        # set load_data_from and load_data_until as required
        data_from = ada(self.load_data_from)
        data_until = ada(self.load_data_until)
        if self.extract_strategy == EC.ES_INCREMENTAL:
            _tdz = timedelta(days=0)  # aka timedelta zero
            _ed = context["execution_date"]
            _ned = context["next_execution_date"]

            # normal incremental load
            _ed -= self.load_data_from_relative or _tdz
            data_from = max(_ed, data_from or _ed)
            if not self.test_if_target_table_exists():
                # Load data from scratch!
                data_from = ada(self.reload_data_from) or data_from

            _ned += self.load_data_until_relative or _tdz
            data_until = min(_ned, data_until or _ned)

        elif self.extract_strategy == EC.ES_FULL_REFRESH:
            # Values may still be set as static values
            data_from = ada(self.reload_data_from) or data_from

        else:
            _msg = "Must define load_data_from etc. behavior for load strategy!"
            raise Exception(_msg)

        self.data_from = data_from
        self.data_until = data_until
        # del variables to make sure they are not used later on
        del self.load_data_from
        del self.reload_data_from
        del self.load_data_until
        del self.load_data_from_relative
        del self.load_data_until_relative

        # Have an option to wait until a short period (e.g. 2 minutes) past
        # the end of the incremental loading timeframe to ensure that all data
        # is available, useful e.g. if APIs lag or if server timestamps are
        # not perfectly accurate.
        # When a DAG is executed as soon as possible, some data sources
        # may not immediately serve up-to-date data from their API.
        # E.g. querying all data until 12.30pm only returns all relevant data
        # after 12.32pm due to some internal delays. In those cases, make
        # sure the (incremental loading) DAGs don't execute too early.
        if self.wait_for_seconds and self.extract_strategy == EC.ES_INCREMENTAL:
            wait_until = context.get("next_execution_date")
            if wait_until:
                wait_until += timedelta(seconds=self.wait_for_seconds)
                self.log.info("Awaiting execution until {0}...".format(
                    str(wait_until), ))
            while wait_until and datetime_utcnow_with_tz() < wait_until:
                # Only sleep a maximum of 5s at a time
                wait_for_timedelta = wait_until - datetime_utcnow_with_tz()
                time.sleep(max(0, min(wait_for_timedelta.total_seconds(), 5)))

        # execute operator
        if self.load_data_chunking_timedelta and data_from and data_until:
            # Chunking to avoid OOM
            assert data_until > data_from
            assert self.load_data_chunking_timedelta > timedelta(days=0)
            while self.data_from < data_until:
                self.data_until = self.data_from
                self.data_until += self.load_data_chunking_timedelta
                self.data_until = min(self.data_until, data_until)
                self.ewah_execute(context)
                self.data_from += self.load_data_chunking_timedelta
        else:
            self.ewah_execute(context)

        # if PostgreSQL and arg given: create indices
        for column in self.index_columns:
            assert self.dwh_engine == EC.DWH_ENGINE_POSTGRES
            # Use hashlib to create a unique 63 character string as index
            # name to avoid breaching index name length limits & accidental
            # duplicates / missing indices due to name truncation leading to
            # identical index names.
            self.uploader.dwh_hook.execute(
                self._INDEX_QUERY.format(
                    "__ewah_" + hashlib.blake2b(
                        (temp_schema_name + "." + self.target_table_name +
                         "." + column).encode(),
                        digest_size=28,
                    ).hexdigest(),
                    self.target_schema_name + self.target_schema_suffix,
                    self.target_table_name,
                    column,
                ))

        # Commit only at the end, so that no data is committed if an error
        # occurs earlier.
        self.log.info("Now committing changes!")
        self.uploader.commit()
        self.uploader.close()
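
This execute() method is the template: it resolves connections, copies the target table to a temporary schema, computes data_from/data_until, optionally waits and chunks, and finally commits, delegating the actual extraction to ewah_execute(). Below is a minimal sketch of a child operator built on that contract; the operator name and the hook method are hypothetical, only the ewah_execute()/upload_data() interplay is taken from the examples above.

class MyApiOperator(EWAHBaseOperator):
    """Hypothetical child operator relying on the base execute() above."""

    def ewah_execute(self, context):
        # self.data_from and self.data_until are set by execute(), already
        # narrowed to the current chunk if load_data_chunking_timedelta is set.
        rows = self.source_hook.get_records_between(  # hypothetical hook method
            self.data_from,
            self.data_until,
        )
        self.upload_data(rows)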
Example #4
def dag_factory_drop_and_replace(
    dag_name: str,
    dwh_engine: str,
    dwh_conn_id: str,
    start_date: datetime,
    el_operator: Type[EWAHBaseOperator],
    operator_config: dict,
    target_schema_name: str,
    target_schema_suffix: str = "_next",
    target_database_name: Optional[str] = None,
    default_args: Optional[dict] = None,
    schedule_interval: timedelta = timedelta(days=1),
    end_date: Optional[datetime] = None,
    read_right_users: Optional[Union[List[str], str]] = None,
    additional_dag_args: Optional[dict] = None,
    additional_task_args: Optional[dict] = None,
    logging_func: Optional[Callable] = None,
    **kwargs
) -> Tuple[DAG]:
    def raise_exception(msg: str) -> None:
        """Add information to error message before raising."""
        raise Exception("DAG: {0} - Error: {1}".format(dag_name, msg))

    logging_func = logging_func or print

    if kwargs:
        logging_func("unused config: {0}".format(str(kwargs)))

    additional_dag_args = additional_dag_args or {}
    additional_task_args = additional_task_args or {}

    if read_right_users is not None:
        if isinstance(read_right_users, str):
            read_right_users = [u.strip() for u in read_right_users.split(",")]
        if not isinstance(read_right_users, Iterable):
            raise_exception("read_right_users must be an iterable or string!")

    # fake catchup = True: between start_date and end_date there is only one schedule_interval
    # --> run the full refreshes every schedule_interval at the same time instead of
    # having a drift in execution time!
    if end_date:
        end_date = min(end_date, datetime_utcnow_with_tz())
    else:
        end_date = datetime_utcnow_with_tz()
    start_date += int((end_date - start_date) / schedule_interval) * schedule_interval
    if start_date == end_date:
        # if the division result is a precise integer, that implies a definite end_date
        # --> adjust to get exactly one schedule_interval delta between start_date and
        # end_date to have one last run available (that should have run before end_date)
        start_date -= schedule_interval
    else:
        # Airflow executes at the END of the execution_date - start_date has to be
        # between exactly 1 and below 2 time schedule_interval before end_date!
        # end_date - 2*schedule_interval < start_date <= end_date - schedule_interval
        start_date -= schedule_interval
        # Make sure only one execution ever runs scheduled, but manual triggers still work!
        end_date = start_date + 2 * schedule_interval - timedelta(seconds=1)

    dag = DAG(
        dag_name,
        catchup=True,  # See above
        default_args=default_args,
        max_active_runs=1,
        schedule_interval=schedule_interval,
        start_date=start_date,
        end_date=end_date,
        **additional_dag_args,
    )

    kickoff, final = etl_schema_tasks(
        dag=dag,
        dwh_engine=dwh_engine,
        dwh_conn_id=dwh_conn_id,
        target_schema_name=target_schema_name,
        target_schema_suffix=target_schema_suffix,
        target_database_name=target_database_name,
        read_right_users=read_right_users,
        **additional_task_args,
    )

    base_config = deepcopy(additional_task_args)
    base_config.update(operator_config.get("general_config", {}))
    with dag:
        for table in operator_config["tables"].keys():
            table_config = deepcopy(base_config)
            table_config.update(operator_config["tables"][table] or {})
            table_config.update(
                {
                    "task_id": "extract_load_" + re.sub(r"[^a-zA-Z0-9_]", "", table),
                    "dwh_engine": dwh_engine,
                    "dwh_conn_id": dwh_conn_id,
                    "extract_strategy": EC.ES_FULL_REFRESH,
                    "target_table_name": operator_config["tables"][table].get(
                        "target_table_name", table
                    ),
                    "target_schema_name": target_schema_name,
                    "target_schema_suffix": target_schema_suffix,
                    "target_database_name": target_database_name,
                }
            )
            table_task = el_operator(**table_config)
            kickoff >> table_task >> final

    return (dag,)
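
A hedged usage sketch of this factory; the connection ids, the table names, and the operator class (reusing the hypothetical MyApiOperator sketched after Example #3) are placeholders, only the call signature and the operator_config layout follow the code above.

from datetime import datetime, timedelta, timezone

dag, = dag_factory_drop_and_replace(
    dag_name="EL_MyApi",
    dwh_engine=EC.DWH_ENGINE_POSTGRES,
    dwh_conn_id="dwh_postgres",  # placeholder connection id
    start_date=datetime(2023, 1, 1, tzinfo=timezone.utc),
    el_operator=MyApiOperator,  # placeholder operator class
    target_schema_name="my_api",
    schedule_interval=timedelta(days=1),
    operator_config={
        "general_config": {"source_conn_id": "my_api_conn"},  # shared kwargs
        "tables": {
            "orders": {},  # defaults only
            "customers": {"target_table_name": "clients"},  # rename target table
        },
    },
)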
Example #5
def dag_factory_fullcremental(
        dag_name: str,
        dwh_engine: str,
        dwh_conn_id: str,
        airflow_conn_id: str,
        start_date: datetime,
        el_operator: Type[EWAHBaseOperator],
        operator_config: dict,
        target_schema_name: str,
        target_schema_suffix: str = "_next",
        target_database_name: Optional[str] = None,
        default_args: Optional[dict] = None,
        schedule_interval_full_refresh: timedelta = timedelta(days=1),
        schedule_interval_incremental: timedelta = timedelta(hours=1),
        end_date: Optional[datetime] = None,
        read_right_users: Optional[Union[List[str], str]] = None,
        additional_dag_args: Optional[dict] = None,
        additional_task_args: Optional[dict] = None,
        logging_func: Optional[Callable] = None,
        **kwargs) -> Tuple[DAG, DAG]:
    def raise_exception(msg: str) -> None:
        """Add information to error message before raising."""
        raise Exception("DAG: {0} - Error: {1}".format(dag_name, msg))

    logging_func = logging_func or print

    if kwargs:
        logging_func("unused config: {0}".format(str(kwargs)))

    additional_dag_args = additional_dag_args or {}
    additional_task_args = additional_task_args or {}

    if read_right_users is not None:
        if isinstance(read_right_users, str):
            read_right_users = [u.strip() for u in read_right_users.split(",")]
        if not isinstance(read_right_users, Iterable):
            raise_exception("read_right_users must be an iterable or string!")
    if not isinstance(schedule_interval_full_refresh, timedelta):
        raise_exception("schedule_interval_full_refresh must be timedelta!")
    if not isinstance(schedule_interval_incremental, timedelta):
        raise_exception("schedule_interval_incremental must be timedelta!")
    if schedule_interval_incremental >= schedule_interval_full_refresh:
        _msg = "schedule_interval_incremental must be shorter than "
        _msg += "schedule_interval_full_refresh!"
        raise_exception(_msg)
    """Calculate the datetimes and timedeltas for the two DAGs.

    Full Refresh: The start_date should be chosen such that, at any given point
    in time, there is only one DAG run left to execute.

    The Incremental DAG starts at the start date + schedule interval of the
    Full Refresh DAG, so that the Incremental executions only happen after
    the Full Refresh execution.
    """
    if not start_date.tzinfo:
        # naive datetimes are not allowed
        raise_exception("start_date must be timezone aware!")
    time_now = datetime_utcnow_with_tz() + schedule_interval_incremental / 2
    if start_date > time_now:
        # Start date for both is in the future
        start_date_fr = start_date
        start_date_inc = start_date
    else:
        _td = int((time_now - start_date) / schedule_interval_full_refresh) - 1
        start_date_fr = start_date + _td * schedule_interval_full_refresh
        start_date_inc = start_date_fr + schedule_interval_full_refresh

    dag_name_fr = dag_name + "_Periodic_Full_Refresh"
    dag_name_inc = dag_name + "_Intraperiod_Incremental"
    dags = (
        DAG(dag_name_fr,
            start_date=start_date_fr,
            end_date=end_date,
            schedule_interval=schedule_interval_full_refresh,
            catchup=True,
            max_active_runs=1,
            default_args=default_args,
            **additional_dag_args),
        DAG(dag_name_inc,
            start_date=start_date_inc,
            end_date=end_date,
            schedule_interval=schedule_interval_incremental,
            catchup=True,
            max_active_runs=1,
            default_args=default_args,
            **additional_dag_args),
    )

    kickoff_fr, final_fr = etl_schema_tasks(
        dag=dags[0],
        dwh_engine=dwh_engine,
        dwh_conn_id=dwh_conn_id,
        target_schema_name=target_schema_name,
        target_schema_suffix=target_schema_suffix,
        target_database_name=target_database_name,
        read_right_users=read_right_users,
        **additional_task_args)

    kickoff_inc, final_inc = etl_schema_tasks(
        dag=dags[1],
        dwh_engine=dwh_engine,
        dwh_conn_id=dwh_conn_id,
        target_schema_name=target_schema_name,
        target_schema_suffix=target_schema_suffix,
        target_database_name=target_database_name,
        read_right_users=read_right_users,
        **additional_task_args)

    sql_fr = """
        SELECT
             -- only run if there are no active DAGs that have to finish first
            CASE WHEN COUNT(*) = 0 THEN 1 ELSE 0 END
        FROM public.dag_run
        WHERE state = 'running'
          AND (
                (dag_id = '{0}' AND execution_date < '{1}')
            OR  (dag_id = '{2}' AND execution_date < '{3}')
          )
    """.format(
        dags[0]._dag_id,  # fr
        "{{ execution_date }}",  # no previous full refresh, please!
        dags[1]._dag_id,  # inc
        "{{ next_execution_date }}",  # no old incremental running, please!
    )

    # Sense whether a previous instance is running OR whether any incremental
    # loads are running, except the incremental load of the same period, which
    # is expected and simply waits
    fr_snsr = EWAHSqlSensor(
        task_id="sense_run_validity",
        conn_id=airflow_conn_id,
        sql=sql_fr,
        dag=dags[0],
        poke_interval=5 * 60,
        mode="reschedule",  # don't block a worker and pool slot
        **additional_task_args)

    # Sense whether the previous instance is complete, except if it's the
    # first run; in that case, check for a full refresh of the same period
    inc_ets = ExtendedETS(
        task_id="sense_run_validity",
        allowed_states=["success"],
        external_dag_id=dags[1]._dag_id,
        external_task_id=final_inc.task_id,
        execution_delta=schedule_interval_incremental,
        backfill_dag_id=dags[0]._dag_id,
        backfill_external_task_id=final_fr.task_id,
        backfill_execution_delta=schedule_interval_full_refresh,
        dag=dags[1],
        poke_interval=5 * 60,
        mode="reschedule",  # don't block a worker and pool slot
        **additional_task_args)

    fr_snsr >> kickoff_fr
    inc_ets >> kickoff_inc

    for table in operator_config["tables"].keys():
        arg_dict_inc = deepcopy(additional_task_args)
        arg_dict_inc.update(operator_config.get("general_config", {}))
        op_conf = operator_config["tables"][table] or {}
        arg_dict_inc.update(op_conf)
        arg_dict_inc.update({
            "extract_strategy": EC.ES_INCREMENTAL,
            "task_id": "extract_load_" + re.sub(r"[^a-zA-Z0-9_]", "", table),
            "dwh_engine": dwh_engine,
            "dwh_conn_id": dwh_conn_id,
            "target_table_name": op_conf.get("target_table_name", table),
            "target_schema_name": target_schema_name,
            "target_schema_suffix": target_schema_suffix,
            "target_database_name": target_database_name,
        })
        arg_dict_fr = deepcopy(arg_dict_inc)
        arg_dict_fr["extract_strategy"] = EC.ES_FULL_REFRESH

        task_fr = el_operator(dag=dags[0], **arg_dict_fr)
        task_inc = el_operator(dag=dags[1], **arg_dict_inc)

        kickoff_fr >> task_fr >> final_fr
        kickoff_inc >> task_inc >> final_inc

    return dags
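
And a matching usage sketch for the "fullcremental" factory; again, the connection ids and the operator class are placeholders. The factory returns two DAGs: a periodic full refresh and an intra-period incremental load that, per the sensors above, only runs once the corresponding full refresh has finished.

from datetime import datetime, timedelta, timezone

dag_fr, dag_inc = dag_factory_fullcremental(
    dag_name="EL_MyApi",
    dwh_engine=EC.DWH_ENGINE_POSTGRES,
    dwh_conn_id="dwh_postgres",  # placeholder connection id
    airflow_conn_id="airflow_metadata_db",  # placeholder: conn to Airflow's own metadata DB
    start_date=datetime(2023, 1, 1, tzinfo=timezone.utc),  # must be timezone aware
    el_operator=MyApiOperator,  # placeholder operator class
    target_schema_name="my_api",
    schedule_interval_full_refresh=timedelta(days=1),
    schedule_interval_incremental=timedelta(hours=1),
    operator_config={"tables": {"orders": {}}},
)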