def create_subdag_operators(parent_dag, symbol_list):
    """
    :meta private:

    Private function to create subdag operators based on
    given parent dag and list of symbols.
    """
    granularity = "1m"
    bucket = "data.binance.vision"
    prefix = "data/spot/{frequency}/klines/{symbol}/{granularity}/"
    schedule = "@once"
    subdags = [
        create_subdag_operator(
            parent_dag,
            schedule,
            row.loc["symbol_name"],
            granularity,
            bucket,
            prefix,
            row.loc["symbol_id"],
            default_args,
        ) for index, row in symbol_list.iterrows()
    ]
    # chain subdag-operators together
    chain(*subdags)
    return subdags
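
For reference, chain(*subdags) links the generated subdag operators into one linear sequence. Below is a minimal, self-contained sketch of that behaviour, using DummyOperator stand-ins and Airflow 1.10-style imports (matching the helpers.chain usage elsewhere on this page); none of the names in the sketch come from the example itself.

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.helpers import chain

dag = DAG(dag_id="chain_sketch",
          start_date=datetime(2021, 1, 1),
          schedule_interval="@once")

# stand-ins for the operators returned by create_subdag_operator()
tasks = [DummyOperator(task_id=f"subdag_{i}", dag=dag) for i in range(3)]

# equivalent to tasks[0] >> tasks[1] >> tasks[2]
chain(*tasks)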
Example #2
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that trains a new model using the training dataset.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    dag = airflow_utils.initialize_airflow_dag(
        dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)

    create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)

    helpers.chain(create_model_task, update_airflow_variable_task)
    return dag
Example #3
def delete_s3_key_files_subdag(parent_dag_name, child_dag_name, start_date,
                               s3_bucket, s3_key, aws_credentials):
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        description='Delete all S3 files in the provided key.',
        start_date=start_date,
        schedule_interval=None,
        catchup=False,
    )

    list_s3_processed_s3_files = S3ListOperator(
        task_id='list_s3_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        prefix=s3_key,
        aws_conn_id=aws_credentials,
    )

    delete_processed_s3_files = S3DeleteFromContextOperator(
        task_id='delete_processed_s3_files',
        dag=dag,
        bucket=s3_bucket,
        context_task_id='list_s3_processed_s3_files',
        aws_conn_id=aws_credentials,
    )

    chain(list_s3_processed_s3_files, delete_processed_s3_files)

    return dag
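
A factory like this is normally wrapped in a SubDagOperator inside the parent DAG, the same way Example #17 below attaches its data-quality subdag. A hedged sketch of that wiring follows; the parent DAG id, bucket, key and connection id are placeholders, not values from the source, and the sketch reuses delete_s3_key_files_subdag() defined above.

from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator  # Airflow 1.10-style import

START_DATE = datetime(2021, 1, 1)

parent_dag = DAG('s3_cleanup_parent',   # placeholder DAG id
                 start_date=START_DATE,
                 schedule_interval=None,
                 catchup=False)

# task_id must match child_dag_name so the subdag id resolves to the
# '<parent>.<child>' name built inside delete_s3_key_files_subdag().
delete_s3_key_files = SubDagOperator(
    task_id='delete_s3_key_files',
    dag=parent_dag,
    subdag=delete_s3_key_files_subdag(
        parent_dag_name='s3_cleanup_parent',
        child_dag_name='delete_s3_key_files',
        start_date=START_DATE,
        s3_bucket='my-bucket',              # placeholder
        s3_key='processed/',                # placeholder
        aws_credentials='aws_credentials',  # placeholder connection id
    ),
)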
Example #4
    def test_chain_different_length_iterable(self):
        dag = DAG(dag_id='test_chain', start_date=datetime.now())
        [t1, t2, t3, t4, t5] = [
            DummyOperator(task_id='t{i}'.format(i=i), dag=dag)
            for i in range(1, 6)
        ]
        with self.assertRaises(AirflowException):
            helpers.chain([t1, t2], [t3, t4, t5])
Example #5
    def test_chain_not_support_type(self):
        dag = DAG(dag_id='test_chain', start_date=datetime.now())
        [t1, t2] = [
            DummyOperator(task_id='t{i}'.format(i=i), dag=dag)
            for i in range(1, 3)
        ]
        with self.assertRaises(TypeError):
            helpers.chain([t1, t2], 1)
Example #6
    def test_chain(self):
        dag = DAG(dag_id='test_chain', start_date=datetime.now())
        [t1, t2, t3, t4, t5, t6] = [DummyOperator(task_id='t{i}'.format(i=i), dag=dag) for i in range(1, 7)]
        helpers.chain(t1, [t2, t3], [t4, t5], t6)

        self.assertCountEqual([t2, t3], t1.get_direct_relatives(upstream=False))
        self.assertEqual([t4], t2.get_direct_relatives(upstream=False))
        self.assertEqual([t5], t3.get_direct_relatives(upstream=False))
        self.assertCountEqual([t4, t5], t6.get_direct_relatives(upstream=True))
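
The assertions spell out chain's list semantics: a single task fans out to (or in from) an adjacent list, while two adjacent lists must have equal length (otherwise AirflowException, as Example #4 shows) and are zipped element-wise. The call under test is therefore equivalent to the explicit >> wiring below, restated as a self-contained sketch.

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

dag = DAG(dag_id='chain_equivalence', start_date=datetime(2021, 1, 1))
t1, t2, t3, t4, t5, t6 = [
    DummyOperator(task_id=f't{i}', dag=dag) for i in range(1, 7)
]

# same dependencies as helpers.chain(t1, [t2, t3], [t4, t5], t6)
t1 >> [t2, t3]   # a single task fans out to every element of the next list
t2 >> t4         # equal-length lists are zipped element-wise
t3 >> t5
[t4, t5] >> t6   # the final list fans in to the single task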
Example #7
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        cleanup_cluster = installer.get_cleanup_task()
        with TaskGroup("benchmarks", prefix_group_id=False,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)
            benchmark_tasks[-1] >> cleanup_cluster

        install_cluster >> benchmarks
Example #8
def data_quality_check_subdag(parent_dag_name, child_dag_name, start_date,
                              redshift_conn, staging_table):
    dag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        description='Check whether the dimension tables meet the data quality principles.',
        start_date=start_date,
        schedule_interval=None,
        catchup=False,
    )

    dimensions_data_quality_check = RedshiftDataQualityOperator(
        task_id='dimensions_data_quality_check',
        dag=dag,
        redshift_conn_id=redshift_conn,
        rules=[
            {
                'query': generic_queries['table_size'].format('dm_company'),
                'op': lambda x: x > 0
            },
            {
                'query': generic_queries['table_size'].format('dm_region'),
                'op': lambda x: x > 0
            },
            {
                'query':
                generic_queries['table_size'].format('dm_consumer_profile'),
                'op':
                lambda x: x > 0
            },
            {
                'query': generic_queries['table_size'].format('dm_date'),
                'op': lambda x: x > 0
            },
        ],
    )

    fact_data_quality_check = RedshiftCompareResultsOperator(
        task_id='fact_data_quality_check',
        dag=dag,
        redshift_conn_id=redshift_conn,
        query=generic_queries['table_size'].format('ft_complaints'),
        comparison_query=generic_queries['table_size'].format(
            f'staging.{staging_table}'),
        operator=lambda x, y: x >= y,
    )

    chain(
        dimensions_data_quality_check,
        fact_data_quality_check,
    )

    return dag
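
RedshiftDataQualityOperator is a custom operator whose internals are not shown here; the rules format above (a SQL query plus an op callable applied to its result) suggests an evaluation loop roughly like the following. This is an assumption for illustration only, not the operator's actual implementation.

from airflow.exceptions import AirflowException
from airflow.hooks.postgres_hook import PostgresHook  # Airflow 1.10-style import


def run_quality_checks(redshift_conn_id, rules):
    """Hypothetical evaluation loop for rules shaped like the list above."""
    hook = PostgresHook(postgres_conn_id=redshift_conn_id)
    for rule in rules:
        records = hook.get_records(rule['query'])
        value = records[0][0]  # assume each query returns a single scalar
        if not rule['op'](value):
            raise AirflowException(
                'Data quality check failed for query: {}'.format(rule['query']))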
Example #9
    def test_chain(self):
        dag = DAG(dag_id='test_chain', start_date=datetime.now())
        [op1, op2, op3, op4, op5, op6] = [
            DummyOperator(task_id='t{i}'.format(i=i), dag=dag)
            for i in range(1, 7)
        ]
        helpers.chain(op1, [op2, op3], [op4, op5], op6)

        self.assertCountEqual([op2, op3],
                              op1.get_direct_relatives(upstream=False))
        self.assertEqual([op4], op2.get_direct_relatives(upstream=False))
        self.assertEqual([op5], op3.get_direct_relatives(upstream=False))
        self.assertCountEqual([op4, op5],
                              op6.get_direct_relatives(upstream=True))
Example #10
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        cleanup_cluster = installer.get_cleanup_task()
        with TaskGroup("benchmarks", prefix_group_id=False,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)

        with TaskGroup("Index Results", prefix_group_id=False,
                       dag=self.dag) as post_steps:
            index_status_task = self._get_status_indexer().get_index_task()

        install_cluster >> benchmarks >> [post_steps, cleanup_cluster]
Example #11
def create_dag(
    args: Mapping[Text, Any],
    parent_dag_name: Optional[Text] = None,
) -> models.DAG:
    """Generates a DAG that loads data into an AutoML Dataset.

  Args:
    args: Arguments to provide to the AutoML operators as defaults.
    parent_dag_name: If this value is provided, the newly created dag object is
      made a subdag of the parent dag.

  Returns:
    The DAG object.
  """
    # Load params from Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    # Create dag.
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,  # schedule
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        local_macros={
            'get_column_spec': _get_column_spec,
            'target': 'predictionLabel',
            'extract_object_id':
            automl_hook.AutoMLTablesHook.extract_object_id,
        },
        **args)
    dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
    dataset_id = (
        "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
    )
    import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                             storage_vars)
    list_table_specs_task = _add_list_table_specs_task(dag, dataset_id,
                                                       bb_vars)
    list_column_specs_task = _add_list_column_specs_task(
        dag, dataset_id, bb_vars)
    update_dataset_task = _add_update_dataset_task(dag, bb_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)
    helpers.chain(dataset_creation_task, import_data_task,
                  list_table_specs_task, list_column_specs_task,
                  update_dataset_task, update_airflow_variable_task)
    return dag
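
The dataset_id value above is a Jinja template that pulls, at render time, an XCom the create-dataset task is expected to push. A minimal, self-contained sketch of that push/pull pattern follows; only the task id 'create_dataset_task' and the key 'dataset_id' come from the snippet, everything else is hypothetical.

from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator      # Airflow 1.10-style imports
from airflow.operators.python_operator import PythonOperator


def _create_dataset(**context):
    # stand-in for the real dataset creation; pushes the id for later tasks
    context['task_instance'].xcom_push(key='dataset_id', value='TBL123')


dag = DAG(dag_id='xcom_template_sketch', start_date=datetime(2021, 1, 1))

create_dataset_task = PythonOperator(
    task_id='create_dataset_task',
    python_callable=_create_dataset,
    provide_context=True,
    dag=dag,
)

echo_dataset_id = BashOperator(
    task_id='echo_dataset_id',
    bash_command=(
        "echo {{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
    ),
    dag=dag,
)

create_dataset_task >> echo_dataset_id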
Example #12
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        cleanup_cluster = installer.get_cleanup_task()
        with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
            utils_tasks = self._get_scale_ci_diagnosis().get_utils()
            chain(*utils_tasks)
            utils_tasks[-1] >> cleanup_cluster
        with TaskGroup("benchmarks", prefix_group_id=False,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)
            benchmark_tasks[-1] >> utils

        install_cluster >> benchmarks
Example #13
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that preprocesses data.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_options = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)

    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

    mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
        dag, output_type, prediction_vars, preprocess_vars, storage_vars,
        training_vars)
    mlwp_generate_features_pipeline_task = _add_mlwp_generate_features_pipeline_task(
        dag, output_type, feature_options, storage_vars)
    prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
        dag, output_type, prediction_vars, storage_vars)
    helpers.chain(mlwp_sliding_window_pipeline_task,
                  mlwp_generate_features_pipeline_task,
                  prepare_automl_data_in_bq_task)

    return dag
Example #14
def create_ETLDag(etl_name, cron_time, etl_task_list):
    dag_id = '{action_type}_{etl_name}'.format(action_type="ETL",
                                               etl_name=etl_name)

    dag = DAG(dag_id=dag_id,
              default_args=default_args,
              schedule_interval=cron_time,
              max_active_runs=1,
              concurrency=8,
              catchup=AIRFLOW_BACKFILL)
    deploy_task_list = [
        create_python_task(task, dag) for task in etl_task_list
    ]

    chain(*deploy_task_list)

    return (dag_id, dag)
Example #15
def create_subdag_operators(parent_dag, symbol_df, **kwargs):
    """
    Function breaks down intervals for which data is missing into
    smaller time periods. This makes it easier to fetch data.

    Args:
        parent_dag (DAG instance): parent to which subdags will
            be linked to. Should be an Airflow DAG instance.
        symbol_df (dataframe): Dataframe containing symbols,
            time_since last refresh.

    Returns:
        None
    """

    subdags = []
    if not symbol_df.empty:

        # for each symbol
        for index, row in symbol_df.iterrows():
            schedule = "@once"
            symbol_id = row["symbol_id"]
            symbol_name = row["symbol_name"]
            last_known_ts = row["last_close_time"]
            current_ts = datetime.utcnow()

            # make list of subdag operators
            subdags.extend([
                create_subdag_operator(
                    parent_dag=parent_dag,
                    schedule=schedule,
                    symbol=symbol_name,
                    granularity=kwargs.get("granularity"),
                    symbol_id=symbol_id,
                    start_ts=last_known_ts,
                    end_ts=current_ts,
                    default_args=default_args,
                )
            ])

        # chain subdag-operators together once they have all been created
        chain(*subdags)

    return subdags
Example #16
def create_train_model_dag() -> models.DAG:
    """Creates the main dag for train model main dag.

  Returns:
    Parent training DAG.
  """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }

    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)
    load_data_subdag = subdag_operator.SubDagOperator(
        task_id=_LOAD_DATA_TASK_NAME,
        subdag=load_data_dag.create_dag(args, _DAG_ID),
        dag=main_dag)
    train_model_subdag = subdag_operator.SubDagOperator(
        task_id=_TRAIN_MODEL_TASK_NAME,
        subdag=train_model_dag.create_dag(args, _DAG_ID),
        dag=main_dag)

    helpers.chain(load_data_subdag, train_model_subdag)
    return main_dag
Example #17
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=[
        fact_queries['create_ft_complaints'],
        procon_queries['insert_ft_complaints']
    ])

subdag_task_id = 'data_quality_check'
data_quality_check = SubDagOperator(task_id=subdag_task_id,
                                    dag=dag,
                                    subdag=data_quality_check_subdag(
                                        DAG_NAME, subdag_task_id, start_date,
                                        REDSHIFT_CONN, STAGING_TABLE))

drop_procon_stage_table = PostgresOperator(
    task_id='drop_procon_stage_table',
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN,
    sql=procon_queries['drop_stage_table'])

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(start_operator, has_file_to_process, create_procon_stage_table,
      load_procon_stage_data, [
          load_dm_date_data, load_dm_region_data, load_dm_consumer_data,
          load_dm_company_data
      ], load_ft_complaints_data, data_quality_check, drop_procon_stage_table,
      end_operator)

delete_s3_key_files.set_upstream(load_procon_stage_data)
Example #18
    dag=dag)

truncate_weekly_staging_tables = PostgresOperator(
    task_id='truncate_weekly_staging_tables',
    sql='trnctTbls_wkly.sql',
    dag=dag)

load_mo_unemployment_claims_staging = PythonOperator(
    task_id='load_mo_unemployment_claims_staging',
    python_callable=load_file,
    op_kwargs={
        'filename': 'mo_unemployment_claims.csv',
        'table_name': 'stg_mo_unemployment_clms',
        'sep': ',',
        'nullstr': ''
    },
    dag=dag)

wkly_unemployment_claims_staging_to_core = PostgresOperator(
    task_id='wkly_unemployment_claims_staging_to_core',
    sql='dtaMgrtn_unemplClms_wkly.sql',
    dag=dag)

update_weekly_timestamp = PostgresOperator(task_id='update_weekly_timestamp',
                                           sql='setLstSccssflRnDt_wklyAll.sql',
                                           dag=dag)

chain(scrape_mo_unemployment_claims, truncate_weekly_staging_tables,
      load_mo_unemployment_claims_staging,
      wkly_unemployment_claims_staging_to_core, update_weekly_timestamp)
Example #19
    dag=dag)

hive = SSHExecuteOperator(
    task_id="comment_import",
    bash_command=
    '(bash {path}/xianyu_itemcomment_import.sh {lastday} {last_update_date})'.
    format(path=path,
           lastday=get_lastday(),
           last_update_date=get_last_update_date()),
    ssh_hook=sshHook,
    dag=dag)

email_update = EmailOperator(task_id='xianyu_itemcomment_update_email',
                             to=['*****@*****.**'],
                             subject='xianyu itemcomment workflow',
                             html_content='[ xianyu data updated!!! ]',
                             dag=dag)
email_update_not = EmailOperator(task_id='xianyu_itemcomment_update_not_email',
                                 to=['*****@*****.**'],
                                 subject='xianyu itemcomment workflow',
                                 html_content='[ xianyu data updating!!! ]',
                                 dag=dag)
branching = BranchPythonOperator(task_id='check_attach',
                                 python_callable=lambda: check_attach(),
                                 dag=dag)

passover = DummyOperator(task_id='pass', dag=dag)
update = DummyOperator(task_id='update', dag=dag)

chain(branching, passover, email_update_not)
chain(branching, update, spark, hive, email_update)
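
The two chain calls define the two branches downstream of check_attach: BranchPythonOperator runs only the path whose task_id its callable returns ('update' or 'pass' here) and skips the other. The real check_attach is not shown in the snippet; a hypothetical version could look like this.

import os


def check_attach():
    """Hypothetical branching callable; the real one is not shown above."""
    # BranchPythonOperator skips every downstream path except the task_id
    # (or list of task_ids) returned here.
    if os.path.exists('/data/xianyu_itemcomment/_SUCCESS'):  # placeholder condition
        return 'update'   # continue with spark -> hive -> email_update
    return 'pass'         # only send the "still updating" email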
Example #20
def test_complex_dag(snapshot):
    dag = DAG(dag_id="complex_dag",
              default_args=default_args,
              schedule_interval=None)

    # Create
    create_entry_group = DummyOperator(
        task_id="create_entry_group",
        dag=dag,
    )
    create_entry_group_result = DummyOperator(
        task_id="create_entry_group_result",
        dag=dag,
    )
    create_entry_group_result2 = DummyOperator(
        task_id="create_entry_group_result2",
        dag=dag,
    )
    create_entry_gcs = DummyOperator(
        task_id="create_entry_gcs",
        dag=dag,
    )
    create_entry_gcs_result = DummyOperator(
        task_id="create_entry_gcs_result",
        dag=dag,
    )
    create_entry_gcs_result2 = DummyOperator(
        task_id="create_entry_gcs_result2",
        dag=dag,
    )
    create_tag = DummyOperator(
        task_id="create_tag",
        dag=dag,
    )
    create_tag_result = DummyOperator(
        task_id="create_tag_result",
        dag=dag,
    )
    create_tag_result2 = DummyOperator(
        task_id="create_tag_result2",
        dag=dag,
    )
    create_tag_template = DummyOperator(
        task_id="create_tag_template",
        dag=dag,
    )
    create_tag_template_result = DummyOperator(
        task_id="create_tag_template_result",
        dag=dag,
    )
    create_tag_template_result2 = DummyOperator(
        task_id="create_tag_template_result2",
        dag=dag,
    )
    create_tag_template_field = DummyOperator(
        task_id="create_tag_template_field",
        dag=dag,
    )
    create_tag_template_field_result = DummyOperator(
        task_id="create_tag_template_field_result",
        dag=dag,
    )
    create_tag_template_field_result2 = DummyOperator(
        task_id="create_tag_template_field_result2",
        dag=dag,
    )

    # Delete
    delete_entry = DummyOperator(
        task_id="delete_entry",
        dag=dag,
    )
    create_entry_gcs >> delete_entry
    delete_entry_group = DummyOperator(
        task_id="delete_entry_group",
        dag=dag,
    )
    create_entry_group >> delete_entry_group
    delete_tag = DummyOperator(
        task_id="delete_tag",
        dag=dag,
    )
    create_tag >> delete_tag
    delete_tag_template_field = DummyOperator(
        task_id="delete_tag_template_field",
        dag=dag,
    )
    delete_tag_template = DummyOperator(
        task_id="delete_tag_template",
        dag=dag,
    )

    # Get
    get_entry_group = DummyOperator(
        task_id="get_entry_group",
        dag=dag,
    )
    get_entry_group_result = DummyOperator(
        task_id="get_entry_group_result",
        dag=dag,
    )
    get_entry = DummyOperator(
        task_id="get_entry",
        dag=dag,
    )
    get_entry_result = DummyOperator(
        task_id="get_entry_result",
        dag=dag,
    )
    get_tag_template = DummyOperator(
        task_id="get_tag_template",
        dag=dag,
    )
    get_tag_template_result = DummyOperator(
        task_id="get_tag_template_result",
        dag=dag,
    )

    # List
    list_tags = DummyOperator(
        task_id="list_tags",
        dag=dag,
    )
    list_tags_result = DummyOperator(
        task_id="list_tags_result",
        dag=dag,
    )

    # Lookup
    lookup_entry = DummyOperator(
        task_id="lookup_entry",
        dag=dag,
    )
    lookup_entry_result = DummyOperator(
        task_id="lookup_entry_result",
        dag=dag,
    )

    # Rename
    rename_tag_template_field = DummyOperator(
        task_id="rename_tag_template_field",
        dag=dag,
    )

    # Search
    search_catalog = DummyOperator(
        task_id="search_catalog",
        dag=dag,
    )
    search_catalog_result = DummyOperator(
        task_id="search_catalog_result",
        dag=dag,
    )

    # Update
    update_entry = DummyOperator(
        task_id="update_entry",
        dag=dag,
    )
    update_tag = DummyOperator(
        task_id="update_tag",
        dag=dag,
    )
    update_tag_template = DummyOperator(
        task_id="update_tag_template",
        dag=dag,
    )
    update_tag_template_field = DummyOperator(
        task_id="update_tag_template_field",
        dag=dag,
    )

    # Create
    create_tasks = [
        create_entry_group,
        create_entry_gcs,
        create_tag_template,
        create_tag_template_field,
        create_tag,
    ]
    chain(*create_tasks)

    create_entry_group >> delete_entry_group
    create_entry_group >> create_entry_group_result
    create_entry_group >> create_entry_group_result2

    create_entry_gcs >> delete_entry
    create_entry_gcs >> create_entry_gcs_result
    create_entry_gcs >> create_entry_gcs_result2

    create_tag_template >> delete_tag_template_field
    create_tag_template >> create_tag_template_result
    create_tag_template >> create_tag_template_result2

    create_tag_template_field >> delete_tag_template_field
    create_tag_template_field >> create_tag_template_field_result
    create_tag_template_field >> create_tag_template_field_result2

    create_tag >> delete_tag
    create_tag >> create_tag_result
    create_tag >> create_tag_result2

    # Delete
    delete_tasks = [
        delete_tag,
        delete_tag_template_field,
        delete_tag_template,
        delete_entry_group,
        delete_entry,
    ]
    chain(*delete_tasks)

    # Get
    create_tag_template >> get_tag_template >> delete_tag_template
    get_tag_template >> get_tag_template_result

    create_entry_gcs >> get_entry >> delete_entry
    get_entry >> get_entry_result

    create_entry_group >> get_entry_group >> delete_entry_group
    get_entry_group >> get_entry_group_result

    # List
    create_tag >> list_tags >> delete_tag
    list_tags >> list_tags_result

    # Lookup
    create_entry_gcs >> lookup_entry >> delete_entry
    lookup_entry >> lookup_entry_result

    # Rename
    create_tag_template_field >> rename_tag_template_field >> delete_tag_template_field

    # Search
    chain(create_tasks, search_catalog, delete_tasks)
    search_catalog >> search_catalog_result

    # Update
    create_entry_gcs >> update_entry >> delete_entry
    create_tag >> update_tag >> delete_tag
    create_tag_template >> update_tag_template >> delete_tag_template
    create_tag_template_field >> update_tag_template_field >> rename_tag_template_field

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(
                    dag)).dep_structure_snapshot))
Example #21
    update_legal_entity = PostgresOperator(task_id="update_legal_entity",
                                           postgres_conn_id="redb_postgres",
                                           sql="updates/legal_entity.sql",
                                           database=DATABASE_NAME)
    update_neighborhood = PostgresOperator(task_id="update_neighborhood",
                                           postgres_conn_id="redb_postgres",
                                           sql="updates/neighborhood.sql",
                                           database=DATABASE_NAME)
    update_parcel = PostgresOperator(task_id="update_parcel",
                                     postgres_conn_id="redb_postgres",
                                     sql="updates/parcel.sql",
                                     database=DATABASE_NAME)
    update_unit = PostgresOperator(task_id="update_unit",
                                   postgres_conn_id="redb_postgres",
                                   sql="updates/unit.sql",
                                   database=DATABASE_NAME)

    # staging_1 > staging_2 (weekly) (sql/functions)
    staging_1_to_staging_2 = PostgresOperator(
        task_id="staging_1_to_staging_2",
        postgres_conn_id="redb_postgres",
        sql="functions/staging_1_to_staging_2.sql",
        database=DATABASE_NAME)

# The "chain" helper does not execute tasks; it wires up the dependencies in order, which keeps the structure legible.
chain(SourcesToS3, MDBtoREDB, insert_neighborhood, insert_address,
      insert_county_id_mapping_table, insert_legal_entity, insert_parcel,
      insert_building, insert_unit, update_neighborhood, update_address,
      update_county_id_mapping_table, update_legal_entity, update_parcel,
      update_building, update_unit, staging_1_to_staging_2)
Example #22
        last_avg_size
    ]) + "<br/>" + "&emsp;".join(
        [update_day, rows, total_files, total_size, avg_size])
    return res


def get_lines(file):
    lines = []
    with open(file, "r") as f:
        lines = f.readlines()
    return lines[-8::]


def formatOutput(arr):
    return "<br/>--------------------------------------------------------------------------------------------------------------------------<br/>".join(
        arr)


data = map(process, get_lines(file))
#print data
email = EmailOperator(task_id='show',
                      to=[
                          '*****@*****.**',
                          '*****@*****.**',
                          '*****@*****.**', '*****@*****.**'
                      ],
                      subject='DATA UPDATED OVERVIEW!',
                      html_content=formatOutput(data),
                      dag=dag)
chain(stat, email)
Example #23
    ssh_hook=sshHook,
    dag=dag)

hive_sale = SSHExecuteOperator(
    task_id="itemsold_sale_import",
    bash_command='(bash {path}/ec_itemsold_sale_import.sh {lastday})'
        .format(path=path, lastday=get_date()[0]),
    ssh_hook=sshHook,
    dag=dag)

spark_daysale = SSHExecuteOperator(
    task_id="itemsold_daysale_parse",
    bash_command='(bash {path}/ec_itemsold_daysale_parse.sh {last_2_days} {lastday})'
        .format(path=path, last_2_days=get_date()[1], lastday=get_date()[0]),
    ssh_hook=sshHook,
    dag=dag)

hive_daysale = SSHExecuteOperator(
    task_id="itemsold_daysale_import",
    bash_command='(bash {path}/ec_itemsold_daysale_import.sh {last_2_days})'
        .format(path=path, last_2_days=get_date()[1]),
    ssh_hook=sshHook,
    dag=dag)

email = EmailOperator(task_id='itemsold_sale_daysale_email',
                      to=['*****@*****.**', '*****@*****.**'],
                      subject='itemsold sale&daysale workflow',
                      html_content='[ itemsold sale&daysale data updated!!! ]',
                      dag=dag)
chain(spark_sale, hive_sale, spark_daysale, hive_daysale, email)
Example #24
        update_mask={"paths": ["display_name"]},
        location=LOCATION,
        tag_template=TEMPLATE_ID,
        tag_template_field_id=FIELD_NAME_1,
    )
    # [END howto_operator_gcp_datacatalog_update_tag_template_field]

    # Create
    create_tasks = [
        create_entry_group,
        create_entry_gcs,
        create_tag_template,
        create_tag_template_field,
        create_tag,
    ]
    chain(*create_tasks)

    create_entry_group >> delete_entry_group
    create_entry_group >> create_entry_group_result
    create_entry_group >> create_entry_group_result2

    create_entry_gcs >> delete_entry
    create_entry_gcs >> create_entry_gcs_result
    create_entry_gcs >> create_entry_gcs_result2

    create_tag_template >> delete_tag_template_field
    create_tag_template >> create_tag_template_result
    create_tag_template >> create_tag_template_result2

    create_tag_template_field >> delete_tag_template_field
    create_tag_template_field >> create_tag_template_field_result
Example #25
    )


START_TIME = PythonOperator(
    task_id="starttime",
    python_callable=print_time,
)
END_TIME = PythonOperator(
    task_id="endtime",
    python_callable=print_time,
)

for i in range(1, 50):
    CPU_TEST = KubernetesPodOperator(
        dag=DRONE_LOG_DAG,
        image=f"{environ['DOCKER_REGISTRY']}/pipeline/dronelogs:cputest",
        namespace="load-testing",
        image_pull_policy="Always",
        name="cpu",
        do_xcom_push=False,
        in_cluster=True,
        config_file=f"{environ['AIRFLOW_HOME']}/.kube/config",
        is_delete_operator_pod=True,
        hostnetwork=False,
        task_id=f"{PIPILE_NAME}-task-{i}",
        retries=4,
        retry_delay=datetime.timedelta(seconds=30),
    )

    chain(START_TIME, CPU_TEST, END_TIME)
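
Because chain(START_TIME, CPU_TEST, END_TIME) is called once per iteration with the same boundary tasks, all 49 pod tasks end up in parallel between the shared starttime and endtime tasks (a fan-out/fan-in shape). A compact, self-contained sketch of that shape with DummyOperator stand-ins follows; only the boundary task ids come from the snippet.

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

dag = DAG(dag_id='cpu_load_test_shape', start_date=datetime(2021, 1, 1))

start = DummyOperator(task_id='starttime', dag=dag)
end = DummyOperator(task_id='endtime', dag=dag)
pods = [DummyOperator(task_id=f'cpu-task-{i}', dag=dag) for i in range(1, 50)]

# same dependencies as calling chain(start, pod, end) once per pod
start >> pods
pods >> end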
Example #26
)

for i in range(1, WORKLOAD + 1):
    ARGUMENTS = json.dumps(
        {
            "index_file": INDEX_FILE,
            "index_prefix": INDEX_PREFIX,
            "batch_number": i,
            "worklaod": WORKLOAD,
        }
    )

    DECRYPT_FILES2 = KubernetesPodOperator(
        dag=DRONE_LOG_DAG,
        image=f"{environ['DOCKER_REGISTRY']}/pipeline/{PIPILE_NAME}:decrypt",
        namespace="airflow",
        image_pull_policy="Always",
        name="decrypt",
        do_xcom_push=False,
        arguments=[ARGUMENTS],
        secrets=[SECRET_ENV],
        configmaps=["airflow-config"],
        in_cluster=True,
        config_file=f"{environ['AIRFLOW_HOME']}/.kube/config",
        is_delete_operator_pod=True,
        hostnetwork=False,
        task_id=f"{PIPILE_NAME}-task-1-{i}",
    )

    chain(INDEX2, DECRYPT_FILES2)
Example #27
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_vars = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/training')
    else:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/prediction')

    clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id=_CLEAN_TEMP_DIR_TASK,
        bucket=bucket_name,
        directory=bucket_path,
        dag=dag)

    user_session_pipeline_task = add_user_session_task(
        dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
        preprocess_vars, storage_vars, training_vars)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        data_visualization_pipeline_task = add_data_visualization_task(
            dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
        generate_categorical_stats_task = add_categorical_stats_task(
            dag, feature_vars, storage_vars)
        generate_numeric_stats_task = add_numeric_stats_task(
            dag, feature_vars, storage_vars)
        helpers.chain(
            clean_temp_dir_task, user_session_pipeline_task,
            data_visualization_pipeline_task,
            [generate_categorical_stats_task, generate_numeric_stats_task])
    else:
        helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
    return dag
Example #28
    postgres_conn_id='redshift_conn',
    sql=[
        brzipcode_queries['drop_staging_brzipcode'],
        brzipcode_queries['create_brzipcode_table'],
    ]
)

load_brzipcode_table_data = S3ToRedshiftCustomOperator(
    task_id='load_brzipcode_table_data',
    dag=dag,
    redshift_conn_id='redshift_conn',
    aws_conn_id='aws_credentials',
    s3_bucket=s3_bucket,
    s3_key=s3_key,
    schema='staging',
    table='brzipcode',
    copy_options=[
        "delimiter '|'",
        "EMPTYASNULL",
    ]
)

end_operator = DummyOperator(task_id='finish_execution', dag=dag)

chain(
    start_operator,
    create_brzipcode_table,
    load_brzipcode_table_data,
    end_operator
)
Example #29
    update_queue = CloudTasksQueueUpdateOperator(
        task_queue=Queue(stackdriver_logging_config=dict(sampling_ratio=1)),
        location=LOCATION,
        queue_name=QUEUE_ID,
        update_mask={"paths": ["stackdriver_logging_config.sampling_ratio"]},
        task_id="update_queue",
    )

    list_queue = CloudTasksQueuesListOperator(location=LOCATION,
                                              task_id="list_queue")

    chain(
        create_queue,
        update_queue,
        pause_queue,
        resume_queue,
        purge_queue,
        get_queue,
        list_queue,
        delete_queue,
    )

    # Tasks operations
    create_task = CloudTasksTaskCreateOperator(
        location=LOCATION,
        queue_name=QUEUE_ID,
        task=TASK,
        task_name=TASK_NAME,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id="create_task_to_run",
    )
Example #30
    def __init__(
            self,
            *,
            emr_cluster: EmrCluster
    ):
        super().__init__(PARTNER)

        dag = PMIDAG.get_dag()
        env_config = dag.env_config

        campaign_id_mappings_bucket = env_config['itermediate_bucket_name']
        campaign_id_mappings_prefix = 'campaigns/'

        # erase existing campaign id mappings data
        erase_campaign_id_mappings_task = ClearDestinationBucketOperator(
            task_id=f'{self.partner}_erase_campaign_id_mappings',
            bucket_name=campaign_id_mappings_bucket,
            bucket_data_prefix=campaign_id_mappings_prefix,
            partner=self.partner
        )

        # perform campaign id mappings extraction
        campaign_id_mappings_extraction_task = emr_cluster.create_emr_job_task(
            task_id=f'{self.partner}_campaign_id_mappings_extraction',
            job_name=f'Campaign ID Mappings Extraction: {self.partner}',
            script='campaign_id_mappings.py',
            script_args=[
                '--partner', self.partner,
                '--data-date', '{{ ds }}',
                '--data-date-tz-offset', '{{ ds | data_date_tz_offset }}',
                '--input-database-name', env_config['raw_events_glue_db_name'],
                '--output-bucket-name', campaign_id_mappings_bucket,
                '--output-bucket-data-prefix', campaign_id_mappings_prefix
            ]
        )

        # save extracted mappings in firewall DB
        save_campaign_id_mappings_task = PythonOperator(
            task_id=f'{self.partner}_save_campaign_id_mappings',
            python_callable=_save_campaign_id_mappings,
            provide_context=True,
            op_kwargs={
                'aws_conn_id': dag.aws_conn_id,
                'mysql_conn_id': env_config['airflow_firewall_db_conn_id'],
                'bucket_name': campaign_id_mappings_bucket,
                'bucket_data_prefix': campaign_id_mappings_prefix,
                'partner': self.partner,
                'notifications_topic_arn': ':'.join([
                    "arn:aws:sns",
                    dag.aws_env['region'],
                    dag.aws_env['accountId'],
                    env_config['notifications_topic_name']
                ])
            }
        )

        # connect tasks
        chain(
            erase_campaign_id_mappings_task,
            campaign_id_mappings_extraction_task,
            save_campaign_id_mappings_task
        )

        # set pipeline entry and exit
        self._entry_task = erase_campaign_id_mappings_task
        self._exit_task = save_campaign_id_mappings_task