Example #1
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
        dag_id,
        default_args=airflow_utils.get_default_args({
            "owner": "Gary Qi",
            "depends_on_past": False,
            "email": ["*****@*****.**"],
            "email_on_failure": False,
            "email_on_retry": False,
            "retries": 0,
            "on_failure_callback": task_failure_slack_alert,
            "start_date": days_ago(1),
        }),
        description="Take new earlyon.json from opendata.toronto.ca and put into datastore",
        schedule_interval="0 18 * * *",
        catchup=False,
        tags=["earlyon", "datasets"],
) as dag:
Example #2
    return "\n".join(message_lines)


def return_branch(**kwargs):
    msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")

    if msg is None:
        return "no_need_for_notification"

    return "send_notification"


default_args = airflow_utils.get_default_args({
    "on_failure_callback": send_failure_msg,
    "start_date": job_settings["start_date"],
})

with DAG(
        job_name,
        default_args=default_args,
        description=job_settings["description"],
        schedule_interval=job_settings["schedule"],
        tags=["sustainment"],
        catchup=False,
) as dag:

    load_files = PythonOperator(
        task_id="load_file_list",
        python_callable=load_remote_files,
Example #3
def create_dag(d):
    def send_success_msg(**kwargs):
        msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")
        airflow_utils.message_slack(
            name=PACKAGE_ID,
            message_type="success",
            msg=msg,
            prod_webhook=active_env == "prod",
            active_env=active_env,
        )

    def send_failure_msg(context):  # Airflow passes the task context to failure callbacks
        airflow_utils.message_slack(
            name=PACKAGE_ID,
            message_type="error",
            msg="Job not finished",
            prod_webhook=active_env == "prod",
            active_env=active_env,
        )

    def is_resource_new(**kwargs):
        package = kwargs.pop("ti").xcom_pull(task_ids="get_package")
        resource_name = kwargs.pop("resource_name")

        resource = [r for r in package["resources"] if r["name"] == resource_name]

        assert (
            len(resource) <= 1
        ), f"Found {len(resource)} resources named {resource_name}; expected 1 or 0."

        if len(resource) == 1:
            return "do_not_create_new_resource"

        return "create_new_resource"

    def get_resource(**kwargs):
        package = ckan_utils.get_package(ckan=ckan, package_id=PACKAGE_ID)
        resource_name = kwargs.pop("resource_name")

        resource = [r for r in package["resources"] if r["name"] == resource_name][0]

        return resource

    def create_new_resource(**kwargs):
        ti = kwargs.pop("ti")
        package = ti.xcom_pull(task_ids="get_package")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        resource_name = kwargs.pop("resource_name")

        save_path = tmp_dir / f"{resource_name}.zip"

        logging.info(
            "New resource. Creating empty placeholder Zip file to upload with resource"
        )

        # an empty archive is enough to create the resource; data is added later
        with zipfile.ZipFile(save_path, "w"):
            pass

        with open(save_path, "rb") as f:
            res = ckan.action.resource_create(
                package_id=package["name"],
                name=resource_name,
                is_preview=False,
                format="ZIP",
                extract_job=f"Airflow: {kwargs['dag'].dag_id}",
                upload=f,
            )

        logging.info(res)

        return save_path

    def download_data(**kwargs):
        ti = kwargs.pop("ti")
        resource = ti.xcom_pull(task_ids="get_resource")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))

        r = requests.get(resource["url"], stream=True)

        save_path = tmp_dir / f'src{Path(resource["url"]).suffix}'

        with open(save_path, "wb") as fd:
            for chunk in r.iter_content(
                chunk_size=128
            ):  # to-do: read up on chunk size here
                fd.write(chunk)

        return save_path

    def unzip_data(**kwargs):
        ti = kwargs.pop("ti")
        fp = ti.xcom_pull(task_ids="download_data")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))

        target_dir = tmp_dir / "src"

        with zipfile.ZipFile(fp, "r") as f:
            f.extractall(target_dir)

        # an empty placeholder zip extracts nothing, so make sure the dir exists
        if not target_dir.exists():
            target_dir.mkdir()

        return target_dir

    def get_filename_date_format(**kwargs):
        period_range = kwargs["period_range"]

        if period_range == "weekly":
            filename_date_format = "%Y%m%d"
        elif period_range == "monthly":
            filename_date_format = "%Y%m"
        elif period_range == "yearly":
            filename_date_format = "%Y"

        return filename_date_format

    def determine_latest_period_loaded(**kwargs):
        ti = kwargs.pop("ti")
        data_fp = Path(ti.xcom_pull(task_ids="unzip_data"))
        filename_date_format = ti.xcom_pull(task_ids="get_filename_date_format")

        dates_loaded = [
            datetime.strptime(p.name, filename_date_format)
            for p in data_fp.iterdir()
            if not p.is_file()
        ]

        if not dates_loaded:
            return datetime(2018, 12, 30)

        return max(dates_loaded)

    def calculate_periods_to_load(**kwargs):
        ti = kwargs.pop("ti")
        latest_loaded = ti.xcom_pull(task_ids="determine_latest_period_loaded")
        period_range = kwargs["period_range"]

        def weeks(latest_loaded):
            logging.info("Calculating weeks to load")
            periods_to_load = []

            begin = latest_loaded + timedelta(days=1)
            end = begin + timedelta(days=6)

            while end < datetime.now():
                periods_to_load.append(
                    {
                        "begin": datetime.strftime(begin, "%Y/%m/%d/0"),
                        "end": datetime.strftime(end, "%Y/%m/%d/23"),
                    }
                )

                begin = end + timedelta(days=1)
                end = begin + timedelta(days=6)

            return periods_to_load

        def months(latest_loaded):
            logging.info("Calculating months to load")
            periods_to_load = []

            begin = latest_loaded + timedelta(days=32)
            month_end_day = calendar.monthrange(begin.year, begin.month)[1]
            end = datetime(begin.year, begin.month, month_end_day)

            while end < datetime.now():
                periods_to_load.append(
                    {
                        "begin": datetime.strftime(begin, "%Y/%m/1/0"),
                        "end": datetime.strftime(end, "%Y/%m/%d/23"),
                    }
                )

                begin = begin + timedelta(days=32)
                month_end_day = calendar.monthrange(begin.year, begin.month)[1]
                end = datetime(begin.year, begin.month, month_end_day)

            return periods_to_load

        def years(latest_loaded):
            logging.info("Calculating years to load")
            periods_to_load = []

            begin = datetime(latest_loaded.year + 1, 1, 1)
            end = datetime(begin.year, 12, 31)

            while end < datetime.now():
                periods_to_load.append(
                    {
                        "begin": datetime.strftime(begin, "%Y/1/1/0"),
                        "end": datetime.strftime(end, "%Y/12/31/23"),
                    }
                )

                begin = datetime(begin.year + 1, 1, 1)
                end = datetime(begin.year, 12, 31)

            return periods_to_load

        if period_range == "weekly":
            return weeks(latest_loaded)
        elif period_range == "monthly":
            return months(latest_loaded)
        elif period_range == "yearly":
            return years(latest_loaded)

    def make_new_extract_folders(**kwargs):
        logging.info("Creating directories for storing extracts")

        ti = kwargs.pop("ti")
        filename_date_format = ti.xcom_pull(task_ids="get_filename_date_format")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")
        dest_path = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        dirs = []

        for period in periods_to_load:
            period_path_name = datetime.strptime(period["end"], "%Y/%m/%d/%H").strftime(
                filename_date_format
            )
            period_path = dest_path / period_path_name

            period_path.mkdir(exist_ok=True)

            dirs.append(period_path)

            logging.info(period_path)

        return dirs

    def extract_new_report(**kwargs):
        ti = kwargs.pop("ti")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")
        filename_date_format = ti.xcom_pull(task_ids="get_filename_date_format")
        dest_path = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        account_id = Variable.get("oracle_infinity_account_id")
        user = Variable.get("oracle_infinity_user")
        password = Variable.get("oracle_infinity_password")

        report_name = kwargs["report_name"]
        report_id = reports[report_name]

        logging.info(f"Getting reports. Parameters: {args}")

        file_paths = []

        for period in periods_to_load:
            period_path_name = datetime.strptime(period["end"], "%Y/%m/%d/%H").strftime(
                filename_date_format
            )
            period_path = dest_path / period_path_name
            fpath = period_path / (report_name + ".csv")

            file_paths.append(fpath)

            response = generate_report(
                report_name=report_name,
                report_id=report_id,
                begin=period["begin"],
                end=period["end"],
                account_id=account_id,
                user=user,
                password=password,
            )

            with open(fpath, "wb") as f:
                f.write(response.content)

        return file_paths

    def are_there_new_periods(**kwargs):
        ti = kwargs.pop("ti")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")

        if len(periods_to_load) > 0:
            return "new_periods_to_load"

        return "no_new_periods_to_load"

    def make_staging_folder(**kwargs):
        ti = kwargs.pop("ti")
        tmp_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        resource_name = kwargs["resource_name"]

        staging = tmp_dir / resource_name

        staging.mkdir(parents=True, exist_ok=True)

        return staging

    def zip_files(**kwargs):
        ti = kwargs.pop("ti")
        resource_name = kwargs["resource_name"]
        dest_dir = Path(ti.xcom_pull(task_ids="create_tmp_data_dir"))
        staging_dir = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        return shutil.make_archive(
            base_name=dest_dir / resource_name, format="zip", root_dir=staging_dir
        )

    def copy_previous_to_staging(**kwargs):
        ti = kwargs.pop("ti")
        from_dir = Path(ti.xcom_pull(task_ids="unzip_data"))
        dest_dir = Path(ti.xcom_pull(task_ids="make_staging_folder"))

        copy_tree(str(from_dir.absolute()), str(dest_dir.absolute()))

        return dest_dir

    def upload_zip(**kwargs):
        ti = kwargs.pop("ti")
        path = Path(ti.xcom_pull(task_ids="zip_files"))
        resource = ti.xcom_pull(task_ids="get_resource")

        with open(path, "rb") as f:
            res = ckan.action.resource_patch(id=resource["id"], upload=f)

        return res

    def build_message(**kwargs):
        ti = kwargs.pop("ti")
        periods_to_load = ti.xcom_pull(task_ids="calculate_periods_to_load")

        msg = [f"Loaded {d['period_range']} data:", ""]

        for p in periods_to_load:
            begin = "-".join(p["begin"].split("/")[:-1])
            end = "-".join(p["end"].split("/")[:-1])

            msg.append(f"- {begin} to {end}")

        return "\n".join(msg)

    dag = DAG(
        d["dag_id"],
        default_args=airflow_utils.get_default_args(
            {
                "on_failure_callback": task_failure_slack_alert,
                "start_date": d["start_date"],
                "retries": 5,
                "retry_delay": timedelta(minutes=15),
            }
        ),
        description=d["description"],
        schedule_interval=d["schedule"],
        tags=["dataset"],
        catchup=False,
    )

    with dag:

        package = PythonOperator(
            task_id="get_package",
            op_kwargs={"ckan": ckan, "package_id": PACKAGE_ID},
            python_callable=ckan_utils.get_package,
        )

        create_tmp_dir = PythonOperator(
            task_id="create_tmp_data_dir",
            python_callable=airflow_utils.create_dir_with_dag_name,
            op_kwargs={"dag_id": d["dag_id"], "dir_variable_name": "tmp_dir"},
        )

        is_resource_new_branch = BranchPythonOperator(
            task_id="is_resource_new",
            python_callable=is_resource_new,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        create_resource = PythonOperator(
            task_id="create_new_resource",
            python_callable=create_new_resource,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        no_new_resource = DummyOperator(task_id="do_not_create_new_resource")

        resource = PythonOperator(
            task_id="get_resource",
            python_callable=get_resource,
            trigger_rule="none_failed",
            op_kwargs={"resource_name": d["resource_name"]},
        )

        get_data = PythonOperator(
            task_id="download_data", python_callable=download_data
        )

        unzip_files = PythonOperator(task_id="unzip_data", python_callable=unzip_data)

        filename_date_format = PythonOperator(
            task_id="get_filename_date_format",
            python_callable=get_filename_date_format,
            op_kwargs={"period_range": d["period_range"]},
        )

        latest_loaded = PythonOperator(
            task_id="determine_latest_period_loaded",
            python_callable=determine_latest_period_loaded,
        )

        periods_to_load = PythonOperator(
            task_id="calculate_periods_to_load",
            python_callable=calculate_periods_to_load,
            op_kwargs={"period_range": d["period_range"]},
        )

        no_new_periods_to_load = DummyOperator(task_id="no_new_periods_to_load")

        new_periods_to_load = DummyOperator(task_id="new_periods_to_load")

        new_data_to_load = BranchPythonOperator(
            task_id="are_there_new_periods",
            python_callable=are_there_new_periods,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        staging_folder = PythonOperator(
            task_id="make_staging_folder",
            python_callable=make_staging_folder,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        extract_complete = DummyOperator(task_id="extract_complete")

        extract_new = PythonOperator(
            task_id="extract_new", python_callable=make_new_extract_folders
        )

        key_metrics = PythonOperator(
            task_id="key_metrics",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Key Metrics"},
        )

        new_v_return_visitors = PythonOperator(
            task_id="new_v_return_visitors",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "New vs. Return Visitors"},
        )

        hits_by_hour = PythonOperator(
            task_id="hits_by_hour",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Hits by Hour of Day"},
        )

        visits_by_day = PythonOperator(
            task_id="visits_by_day",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Visits by Day of Week"},
        )

        operating_system = PythonOperator(
            task_id="operating_system",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Operating System Platform"},
        )

        browser = PythonOperator(
            task_id="browser",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Browser"},
        )

        screen_resolution = PythonOperator(
            task_id="screen_resolution",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Screen Resolution"},
        )

        mobile_devices = PythonOperator(
            task_id="mobile_devices",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Mobile Devices"},
        )

        mobile_browser = PythonOperator(
            task_id="mobile_browser",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Mobile Browser"},
        )

        referring_site = PythonOperator(
            task_id="referring_site",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Referring Site"},
        )

        search_engines = PythonOperator(
            task_id="search_engines",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Search Engines"},
        )

        countries = PythonOperator(
            task_id="countries",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Countries"},
        )

        cities = PythonOperator(
            task_id="cities",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Cities"},
        )

        top_pages = PythonOperator(
            task_id="top_pages",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Top Pages"},
        )

        entry_pages = PythonOperator(
            task_id="entry_pages",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Entry Pages"},
        )

        exit_pages = PythonOperator(
            task_id="exit_pages",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Exit Pages"},
        )

        file_downloads = PythonOperator(
            task_id="file_downloads",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "File Downloads"},
        )

        email_address = PythonOperator(
            task_id="email_address",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Email Address"},
        )

        offsite_links = PythonOperator(
            task_id="offsite_links",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Offsite Links"},
        )

        anchor_tags = PythonOperator(
            task_id="anchor_tags",
            python_callable=extract_new_report,
            op_kwargs={"report_name": "Anchor Tags"},
        )

        copy_previous = PythonOperator(
            task_id="copy_previous", python_callable=copy_previous_to_staging
        )

        zip_resource_files = PythonOperator(
            task_id="zip_files",
            python_callable=zip_files,
            op_kwargs={"resource_name": d["resource_name"]},
        )

        upload_data = PythonOperator(task_id="upload_zip", python_callable=upload_zip)

        msg = PythonOperator(task_id="build_message", python_callable=build_message)

        send_notification = PythonOperator(
            task_id="send_success_msg", python_callable=send_success_msg
        )

        delete_tmp_dir = PythonOperator(
            task_id="delete_tmp_dir",
            python_callable=airflow_utils.delete_tmp_data_dir,
            op_kwargs={"dag_id": d["dag_id"], "recursively": True},
            trigger_rule="none_failed",
        )

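        # wire up the task graph: branch on resource existence and on new
        # periods, fan out the report extracts, then zip, upload, and notify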
        package >> is_resource_new_branch

        is_resource_new_branch >> create_resource

        is_resource_new_branch >> no_new_resource

        [create_resource, no_new_resource] >> resource >> get_data >> unzip_files

        [unzip_files, filename_date_format] >> latest_loaded

        latest_loaded >> periods_to_load >> new_data_to_load

        [copy_previous, extract_complete] >> zip_resource_files >> upload_data >> msg

        create_tmp_dir >> get_data

        new_data_to_load >> no_new_periods_to_load

        new_data_to_load >> new_periods_to_load >> staging_folder >> [
            extract_new,
            copy_previous,
        ]

        extract_new >> [
            key_metrics,
            new_v_return_visitors,
            hits_by_hour,
            visits_by_day,
            operating_system,
            browser,
            screen_resolution,
            mobile_devices,
            mobile_browser,
            referring_site,
            search_engines,
            countries,
            cities,
            top_pages,
            entry_pages,
            exit_pages,
            file_downloads,
            email_address,
            offsite_links,
            anchor_tags,
        ] >> extract_complete

        msg >> send_notification

        [send_notification, no_new_periods_to_load] >> delete_tmp_dir

    return dag
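
Note: Example #3 only defines the create_dag factory; the snippet ends at return dag. In the companion examples (#4 and #6), the returned DAG is registered at module level so the Airflow scheduler can discover it. A minimal sketch of that registration, assuming a hypothetical CONFIGS list whose dicts carry the keys the factory reads (dag_id, start_date, description, schedule, resource_name, period_range):

# hypothetical registration loop mirroring Examples #4 and #6;
# CONFIGS and every value in it are illustrative assumptions
CONFIGS = [
    {
        "dag_id": "oracle_infinity_weekly",
        "start_date": datetime(2020, 11, 1),
        "description": "Extract weekly web analytics reports",
        "schedule": "0 6 * * *",
        "resource_name": "weekly-report",
        "period_range": "weekly",
    },
]

for d in CONFIGS:
    # module-level assignment is what lets the scheduler pick up each DAG
    globals()[d["dag_id"]] = create_dag(d)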
Example #4
         ] >> job_failed >> restore_backup >> message_slack_recover

    return dag


# build a DAG for each dataset
for dataset in datasets:
    dag_id = dataset['package_id'] + '-' + dataset['tps_table_code']

    schedule = '@once'
    default_args = airflow_utils.get_default_args({
        "owner": "Gary",
        "depends_on_past": False,
        "email": ["*****@*****.**"],
        "email_on_failure": False,
        "email_on_retry": False,
        "retries": 0,
        "on_failure_callback": task_failure_slack_alert,
        "start_date": common_job_settings["start_date"],
    })
    globals()[dag_id] = create_dag(dag_id, dataset, schedule, default_args)
Example #5
    airflow_utils.message_slack(
        name=PACKAGE_NAME,
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
        PACKAGE_NAME,
        default_args=airflow_utils.get_default_args({
            "on_failure_callback": task_failure_slack_alert,
            "start_date": datetime(2020, 11, 24, 13, 35, 0),
            "retries": 0,
            # "retry_delay": timedelta(minutes=3),
        }),
        description="Take data from opendata.toronto.ca (CSV) and put into datastore",
        schedule_interval="0 17 * * *",
        catchup=False,
        tags=["dataset"],
) as dag:

    def is_resource_new(**kwargs):
        package = kwargs["ti"].xcom_pull(task_ids="get_package")
        logging.info(
            f"resources found: {[r['name'] for r in package['resources']]}")
Example #6
        get_resource_id >> new_or_existing >> [new_resource, existing_resource]
        new_resource >> join_or >> join_and
        existing_resource >> backup_resource >> delete_resource >> join_or >> join_and
        join_and >> insert_records >> modify_metadata >> job_success >> delete_tmp_dir >> message_slack
        [get_agol_data, get_resource_id] >> job_failed
        [insert_records] >> job_failed >> restore_backup

    return dag


for dataset in DATASETS:
    dag_id = dataset['package_id']

    schedule = '@once'
    default_args = airflow_utils.get_default_args(
        {
            "owner": "Mackenzie",
            "depends_on_past": False,
            "email": ["*****@*****.**"],
            "email_on_failure": False,
            "email_on_retry": False,
            "retries": 0,
            "on_failure_callback": task_failure_slack_alert,
            "start_date": COMMON_JOB_SETTINGS["start_date"],
        }
    )
    globals()[dag_id] = create_dag(dag_id, dataset, schedule, default_args)
Example #7
from datetime import datetime

from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash import BashOperator
from utils import airflow_utils

with DAG(
    "pull_latest_code",
    default_args=airflow_utils.get_default_args(
        {"retries": 0, "start_date": datetime(2020, 11, 10, 0, 30, 0)}
    ),
    description="Pulls repo code. Updated dags must be deleted and restarted.",
    schedule_interval="*/5 * * * *",
    tags=["sustainment"],
    catchup=False,
) as dag:

    # `echo $?` prints git's exit status but itself exits 0, so the task
    # succeeds (and just logs the status) even when `git pull` fails
    pull_repo = BashOperator(
        task_id="pull_repo",
        bash_command=f"git -C {Variable.get('repo_dir')} pull; echo $?",
    )

    pull_repo
Example #8
from ckan_operators.package_operator import GetAllPackagesOperator, AssertIdenticalPackagesOperator

from utils import airflow_utils

CONTRIB_ADDRESS = "https://ckanadmin0.intra.prod-toronto.ca/"
DELIVERY_ADDRESS = "https://ckan0.cf.opendata.inter.prod-toronto.ca/"

DEFAULT_ARGS = airflow_utils.get_default_args({
    "owner": "Mackenzie",
    "depends_on_past": False,
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 0,
    # "on_failure_callback": task_failure_slack_alert,
    "start_date": datetime(2021, 9, 1, 0, 0, 0),
})
DESCRIPTION = "Compares CKAN Contrib to Delivery and returns slack message if they aren't identical"
SCHEDULE = "1 0,12 * * 1-5"  # minute 1 at noon and midnight on weekdays
TAGS = ["sustainment"]

with DAG("check_contrib_delivery_sync",
         description=DESCRIPTION,
         default_args=DEFAULT_ARGS,
Example #9
def send_failure_msg():
    airflow_utils.message_slack(
        name=job_name,
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
        job_name,
        default_args=airflow_utils.get_default_args({
            "on_failure_callback": send_failure_msg,
            "start_date": datetime(2020, 11, 9, 0, 30, 0),
        }),
        description="Identifies empty datastore resources and sends them to Slack",
        schedule_interval="5 15,18,21,0,3 * * *",
        tags=["sustainment"],
        catchup=False,
) as dag:

    ckan_creds = Variable.get("ckan_credentials_secret", deserialize_json=True)
    active_env = Variable.get("active_env")
    ckan_address = ckan_creds[active_env]["address"]
    ckan_apikey = ckan_creds[active_env]["apikey"]

    def send_success_msg(**kwargs):
        msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")
Example #10
    resources_to_load = kwargs["ti"].xcom_pull(
        task_ids="identify_resources_to_load")

    if len(resources_to_load) == 0:
        return "no_files_are_not_new"

    return "yes_continue_with_refresh"


def get_package():
    return CKAN.action.package_show(id=PACKAGE_ID)


default_args = airflow_utils.get_default_args({
    "on_failure_callback": task_failure_slack_alert,
    "start_date": job_settings["start_date"],
})

with DAG(
        PACKAGE_ID,
        default_args=default_args,
        description=job_settings["description"],
        schedule_interval=job_settings["schedule"],
        catchup=False,
        tags=["dataset"],
) as dag:

    create_tmp_dir = PythonOperator(
        task_id="create_tmp_dir",
        python_callable=airflow_utils.create_dir_with_dag_name,
Example #11
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
    PACKAGE_NAME,
    default_args=airflow_utils.get_default_args(
        {
            "owner": "Gary",
            "depends_on_past": False,
            "email": ["*****@*****.**"],
            "email_on_failure": False,
            "email_on_retry": False,
            "retries": 0,
            "retry_delay": timedelta(seconds=600),
            "on_failure_callback": task_failure_slack_alert,
            "start_date": days_ago(1),
        }
    ),
    description="Take tpp json and narratives from progress portal",
    schedule_interval="0 22 * * 1-5",
    catchup=False,
    tags=["dataset"],
) as dag:

    def is_resource_new(**kwargs):
        package = kwargs["ti"].xcom_pull(task_ids="get_package")
        logging.info(f"resources found: {[r['name'] for r in package['resources']]}")
Example #12
def send_failure_message():
    airflow_utils.message_slack(
        name=PACKAGE_ID,
        message_type="error",
        msg="Job not finished",
        active_env=ACTIVE_ENV,
        prod_webhook=ACTIVE_ENV == "prod",
    )


with DAG(
        PACKAGE_ID,
        default_args=airflow_utils.get_default_args({
            "on_failure_callback": task_failure_slack_alert,
            "start_date": datetime(2020, 11, 10, 13, 35, 0),
        }),
        description="Get rain gauge data from the last time it was loaded to now",
        schedule_interval="30 14 * * *",
        catchup=False,
        tags=["dataset"],
) as dag:

    CKAN_CREDS = Variable.get("ckan_credentials_secret", deserialize_json=True)
    CKAN = ckanapi.RemoteCKAN(**CKAN_CREDS[ACTIVE_ENV])

    def send_success_msg(**kwargs):
        msg = kwargs.pop("ti").xcom_pull(task_ids="build_message")
        airflow_utils.message_slack(
Example #13
def send_failure_message():
    airflow_utils.message_slack(
        name=PACKAGE_NAME,
        message_type="error",
        msg="Job not finished",
        active_env=Variable.get("active_env"),
        prod_webhook=Variable.get("active_env") == "prod",
    )


with DAG(
        PACKAGE_NAME,
        default_args=airflow_utils.get_default_args({
            "on_failure_callback": task_failure_slack_alert,
            "start_date": days_ago(1),
            "retries": 0,
            # "retry_delay": timedelta(minutes=3),
        }),
        description="",
        schedule_interval="0 17 * * *",
        catchup=False,
        tags=["dataset"],
) as dag:

    def is_resource_new(**kwargs):
        pkg = kwargs["ti"].xcom_pull(task_ids="get_package")
        resource_name = kwargs["resource_name"]

        logging.info(
            f"looking for: {resource_name} | resources found: {[r['name'] for r in pkg['resources']]}"
        )