Code Example #1
    def build(self, dag):
        print_configuration = PythonOperator(
            task_id='print_configuration',
            python_callable=AirflowDbCleanupDagBuilder.print_configuration_function,
            provide_context=True,
            dag=dag)

        for db_object in self.database_objects:
            cleanup = PythonOperator(
                task_id='cleanup_' +
                str(db_object["airflow_db_model"].__name__),
                python_callable=AirflowDbCleanupDagBuilder.cleanup_function,
                params=db_object,
                provide_context=True,
                dag=dag)

            print_configuration.set_downstream(cleanup)

        return dag
Code Example #2
def check_previous_runs(**kwargs):
    context = kwargs
    current_run_id = context['dag_run'].run_id
    current_dag_id = context['dag_run'].dag_id
    # Connect to mysql and check for any errors for this DAG
    airflow_conn = MySqlHook(mysql_conn_id='deliverbi_mysql_airflow')
    l_error_count = 0
    cmd_sql = f"select count(1) from airflow.dag_run where dag_id = '{current_dag_id}' "
    cmd_sql += f"and run_id <> '{current_run_id}' and state = 'failed'"
    print(cmd_sql)
    airflow_data = airflow_conn.get_records(sql=cmd_sql)
    for row in airflow_data:
        l_error_count = int((str(row[0])))

    print("Found Previous Errors:" + str(l_error_count))
    if l_error_count != 0:
        raise AirflowException(
            "Previous Run in Error so Failing the Current Run")


# Tasks
check_previous_run_status = PythonOperator(task_id='check_previous_run_status',
                                           provide_context=True,
                                           python_callable=check_previous_runs,
                                           dag=dag)

task1 = DummyOperator(task_id='task1', retries=2, dag=dag)

check_previous_run_status.set_downstream(task1)
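
Note that the query above interpolates dag_id and run_id directly into the SQL string. A minimal alternative sketch, reusing the hook and variables from this example, passes them as bind parameters instead (the %s placeholder style of the underlying MySQL driver):

cmd_sql = ("select count(1) from airflow.dag_run "
           "where dag_id = %s and run_id <> %s and state = 'failed'")
airflow_data = airflow_conn.get_records(sql=cmd_sql,
                                        parameters=(current_dag_id, current_run_id))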
Code Example #3
File: airflow_dag.py Project: ePlusPS/emr-workflow
readmission_classifier_train_predict_operator = PythonOperator(
    task_id='readmission_classifier_train_predict',
    python_callable=readmission_classifier_train_and_predict.train_and_predict,
    dag=dag)

readmission_prob_to_likert_operator = PythonOperator(
    task_id='convert_to_likert',
    python_callable=readmission_tf_prob_to_likert.convert_to_likert,
    dag=dag)

summary_report_operator = PythonOperator(
    task_id='make_summary_report',
    python_callable=create_report_summary.create_report,
    dag=dag)

df_from_api_operator.set_downstream(structured_features_operator)
structured_features_operator.set_downstream([
    all_word2vec_clean_notes_operator,
    readmission_word2vec_clean_notes_operator, ner_clean_operator,
    readmission_classifier_prep_operator
])

readmission_word2vec_clean_notes_operator.set_downstream(
    readmission_word2vec_tokenize_notes_operator)
readmission_word2vec_tokenize_notes_operator.set_downstream(
    readmission_word2vec_operator)
readmission_word2vec_operator.set_downstream(readmission_one_hot_operator)
all_word2vec_clean_notes_operator.set_downstream(
    all_word2vec_tokenize_notes_operator)
all_word2vec_tokenize_notes_operator.set_downstream(all_word2vec_operator)
all_word2vec_operator.set_downstream(infected_one_hot_operator)
Code Example #4
# But you can if you want to
one_task = PythonOperator(
    task_id="one_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={"KubernetesExecutor": {
        "image": "airflow:latest"
    }})

# Use the zip binary, which is only found in this special docker image
two_task = PythonOperator(
    task_id="two_task",
    python_callable=use_zip_binary,
    dag=dag,
    executor_config={"KubernetesExecutor": {
        "image": "airflow:latest"
    }})

# Limit resources on this operator/task
three_task = PythonOperator(task_id="three_task",
                            python_callable=print_stuff,
                            dag=dag,
                            executor_config={
                                "KubernetesExecutor": {
                                    "request_memory": "128Mi",
                                    "limit_memory": "128Mi"
                                }
                            })

start_task.set_downstream([one_task, two_task, three_task])
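
The executor_config mappings above use the Airflow 1.10-era KubernetesExecutor keys. As a rough sketch (not part of the original example), Airflow 2.x expresses the same per-task memory limit with a pod_override built from Kubernetes client models:

from kubernetes.client import models as k8s

three_task = PythonOperator(
    task_id="three_task",
    python_callable=print_stuff,
    dag=dag,
    executor_config={
        "pod_override": k8s.V1Pod(
            spec=k8s.V1PodSpec(containers=[
                k8s.V1Container(
                    name="base",  # the worker container created by the executor
                    resources=k8s.V1ResourceRequirements(
                        requests={"memory": "128Mi"},
                        limits={"memory": "128Mi"},
                    ),
                )
            ])
        )
    })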
Code Example #5
# endregion
# endregion


with DAG(
        dag_id='website_statistics',
        default_args=default_args,
        schedule_interval=timedelta(minutes=1)) as dag:

    # Task for download from external source
    # opr_download_json = PythonOperator(task_id='download_json', python_callable=download_json, provide_context=True)

    opr_extract_json = PythonOperator(
        task_id='load_json',
        provide_context=True,
        python_callable=load_json,
    )

    opr_transform_clickhouse = PythonOperator(
        task_id='move_to_merge_tree',
        provide_context=True,
        python_callable=move_to_merge_tree,
    )

    # opr_extract_json.set_downstream(opr_download_json)
    opr_extract_json.set_downstream(opr_transform_clickhouse)

    # opr_transform_clickhouse.set_downstream(opr_extract_json)

Code Example #6
            'type': 'STRING'
        }, {
            'name': 'predicted_monetary',
            'type': 'FLOAT'
        }, {
            'name': 'predictions',
            'type': 'FLOAT'
        }],
        source_format="NEWLINE_DELIMITED_JSON",
        skip_leading_rows=1,
        destination_project_dataset_table="{}.{}.{}".format(
            PROJECT, dataset, 'predictions'),
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        dag=dag).execute(kwargs)


t3 = PythonOperator(task_id='list_predictions_files',
                    dag=dag,
                    python_callable=do_list_predictions_files)

t4 = PythonOperator(task_id='load_to_bq',
                    dag=dag,
                    python_callable=do_load_to_bq)

# How to link them
t0_predict_cond.set_downstream([t1a, t1b])
t2.set_upstream([t1a, t1b])
t3.set_upstream([t1a, t1b])
t3.set_downstream(t4)
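
For reference, the same dependencies can be written with Airflow's bitshift operators, which accept lists on either side (a sketch using the task names from the snippet above):

t0_predict_cond >> [t1a, t1b]
[t1a, t1b] >> t2
[t1a, t1b] >> t3
t3 >> t4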
Code Example #7

# returns the week day (monday, tuesday, etc.)
def get_day(**kwargs):
    print(kwargs['ti'])
    kwargs['ti'].xcom_push(key='day', value=datetime.now().weekday())


# returns the name id of the task to launch (task_for_monday, task_for_tuesday, etc.)
def branch(**kwargs):
    print(kwargs)
    return 'task_for_' + tabDays[kwargs['ti'].xcom_pull(task_ids='weekday',
                                                        key='day')]


# PythonOperator will retrieve and store into "weekday" variable the week day
get_weekday = PythonOperator(task_id='weekday',
                             python_callable=get_day,
                             provide_context=True,
                             dag=dag)
# BranchPythonOperator will use "weekday" variable, and decide which task to launch next
fork = BranchPythonOperator(task_id='branching',
                            python_callable=branch,
                            provide_context=True,
                            dag=dag)
# task 1, get the week day
get_weekday.set_downstream(fork)
# One dummy operator for each week day, all branched to the fork
for day in range(0, 7):  # weekday() returns 0-6, so cover all seven days
    fork.set_downstream(
        DummyOperator(task_id='task_for_' + tabDays[day], dag=dag))
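
tabDays is defined outside this snippet; since the branch callable returns ids such as 'task_for_monday', it is presumably a list of lowercase day names indexed by datetime.weekday(), along the lines of:

# Assumed definition (not shown in the original snippet)
tabDays = ['monday', 'tuesday', 'wednesday', 'thursday',
           'friday', 'saturday', 'sunday']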
Code Example #8
DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(
    start_date=DEFAULT_DATE,
    owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()
    return


# DAG tests that tasks ignore all dependencies

dag1 = DAG(dag_id='test_run_ignores_all_dependencies', default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(
    task_id='test_run_dependency_task',
    python_callable=fail,
    dag=dag1,)
dag1_task2 = PythonOperator(
    task_id='test_run_dependent_task',
    python_callable=success,
    provide_context=True,
    dag=dag1,)
dag1_task1.set_downstream(dag1_task2)
Code Example #9
            python_callable=get_endpoint,
            op_args=[e, SAVE_PATH, BASE_URL, API_KEYS],
        )

        t_branch = BranchPythonOperator(task_id=branch_task_id,
                                        python_callable=row_count_branch,
                                        op_args=[
                                            get_enpdpoints_task_id,
                                            file_to_gcs_task_id,
                                            zero_branch_task_id
                                        ],
                                        trigger_rule="all_done")

        t_gcs = FileToGoogleCloudStorageOperator(
            task_id=file_to_gcs_task_id,
            google_cloud_storage_conn_id='gcs_silo',
            bucket="deanslist",
            src="{{ task_instance.xcom_pull(task_ids='" +
            get_enpdpoints_task_id + "', key='dl_file_path' )}}",
            #dst = "TEST/" + endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}",
            dst=endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" +
            get_enpdpoints_task_id + "', key='dl_file_name') }}",
            dag=dag)

        t_zero_row = DummyOperator(task_id=zero_branch_task_id)

        t2.set_upstream(t1)
        t2.set_downstream(t_branch)
        t_branch.set_downstream(t_gcs)
        t_branch.set_downstream(t_zero_row)
Code Example #10
    execution_timeout=timedelta(hours=1),
    op_kwargs={'valuation_date': get_valuation_date()},
    dag=dag)

export_spot_scenarios_operator = PythonOperator(
    task_id='export_spot_scenarios_task',
    python_callable=export_spot_scenarios_run,
    execution_timeout=timedelta(hours=1),
    op_kwargs={'valuation_date': get_valuation_date()},
    dag=dag)

# -----------------------------------------------------------------------------------
# Operator Dependency Relationship
# default close tasks
basic_risks_default_close_operator.set_upstream(basic_position_operator)
basic_instrument_contract_type_operator.set_downstream(basic_position_operator)

# basic_otc_company_type_operator.set_downstream(eod_classic_scenarios_operator)
basic_otc_company_type_operator.set_downstream(
    eod_spot_scenarios_by_market_default_close_operator)
basic_otc_company_type_operator.set_downstream(
    eod_counter_party_market_risk_default_close_operator)
basic_otc_company_type_operator.set_downstream(
    eod_counter_party_market_risk_by_underlyer_default_close_operator)

# eod_classic_scenarios_operator.set_upstream(basic_position_operator)
eod_spot_scenarios_by_market_default_close_operator.set_upstream(
    basic_position_operator)

eod_position_default_close_operator.set_upstream(
    basic_risks_default_close_operator)
Code Example #11
File: assessor.py Project: xyx0826/etl-airflow
        ))

    opr_pause = BashOperator(task_id='pause',
                             bash_command="echo 'Paused for extraction.'")

    for t, s in sources_to_extract.items():

        s['name'] = t
        s['dag'] = dag.dag_id

        opr_extract = PythonOperator(task_id=f"extract_{t}",
                                     python_callable=sources.extract_source,
                                     provide_context=True,
                                     op_kwargs=s)

        opr_extract.set_downstream(opr_pause)

    # Loop through the open datasets
    open_datasets = [
        f for f in os.listdir(
            f"{os.environ['AIRFLOW_HOME']}/processes/{dag.dag_id}")
        if not f.startswith('_') and f.endswith('.yml')
    ]

    for od in open_datasets:
        od_name = od.split('.')[0]
        od_config = yaml.load(
            open(f"{os.environ['AIRFLOW_HOME']}/processes/{dag.dag_id}/{od}"),
            Loader=yaml.FullLoader)  # explicit Loader; yaml.load without one is deprecated

        # loop through the views
        for d, v in od_config['views'].items():
Code Example #12
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
dag8 = DAG(dag_id='test_dagrun_states_root_fail_unfinished', default_args=default_args)
dag8_task1 = DummyOperator(
    task_id='test_dagrun_unfinished',  # The test will unset the task instance state after
                                       # running this test
    dag=dag8,
)
dag8_task2 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag8,
    python_callable=fail,
)
Code Example #13

# You don't have to use any special KubernetesExecutor configuration if you don't want to
start_task = PythonOperator(
    task_id="start_task", python_callable=print_stuff, dag=dag
)

# But you can if you want to
one_task = PythonOperator(
    task_id="one_task", python_callable=print_stuff, dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci:latest"}}
)

# Use the zip binary, which is only found in this special docker image
two_task = PythonOperator(
    task_id="two_task", python_callable=use_zip_binary, dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}}
)

# Limit resources on this operator/task with node affinity & tolerations
three_task = PythonOperator(
    task_id="three_task", python_callable=print_stuff, dag=dag,
    executor_config={
        "KubernetesExecutor": {"request_memory": "128Mi",
                               "limit_memory": "128Mi",
                               "tolerations": tolerations,
                               "affinity": affinity}}
)

start_task.set_downstream([one_task, two_task, three_task])
Code Example #14
branch_b = PythonOperator(
    task_id='branch_b',
    python_callable=print_branchb,
    dag=dag) # 指定归属的dag

def print_branchc():
    return 'Hello branchc!'
 
branch_c = PythonOperator(
    task_id='branch_c',
    python_callable=print_branchc,
    dag=dag) # 指定归属的dag
#-------------------------------------------------------------------------------
def decide_which_path():
    if 1 > 1:
        return "branch_a"
    else:
        return "branch_b"


branch_task = BranchPythonOperator(
    task_id='run_this_first',
    python_callable=decide_which_path,
    trigger_rule="all_done",
    dag=dag)
#-------------------------------------------------------------------------------
# dependencies
branch_task.set_downstream(branch_a)  # the adaptation, intermediate, and application layers all depend on branch_a
branch_task.set_downstream(branch_b)
branch_a.set_downstream(branch_c)
Code Example #15
File: dag_ml_pipeline.py Project: jjayp4rk/test-sm
        'region': region,
        'bucket': bucket
    })

# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
    task_id='batch_predicting',
    dag=dag,
    config=transform_config,
    aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30)

# Cleanup task, deletes ALL SageMaker endpoints and model artifacts
# Uncomment below clean_up_task to clean up sagemaker endpoint resources and model artifacts

# clean_up_task = PythonOperator(
#    task_id='clean_up',
#    dag=dag,
#    python_callable=clean_up.clean_up,
#    op_kwargs={'region': region, "bucket": bucket}
# )

init.set_downstream(sm_proc_job_task)
sm_proc_job_task.set_downstream(train_model_task)
train_model_task.set_downstream(inference_pipeline_task)
inference_pipeline_task.set_downstream(batch_transform_task)
# Uncomment line below to disable clean up task

# batch_transform_task.set_downstream(clean_up_task)
Code Example #16
                 str(airflow_db_model.__name__) + "(s):")
    for entry in entries_to_delete:
        logging.info("\tEntry: " + str(entry) + ", Date: " +
                     str(entry.__dict__[str(age_check_column).split(".")[1]]))

    logging.info("Process will be Deleting " + str(len(entries_to_delete)) +
                 " " + str(airflow_db_model.__name__) + "(s)")

    if ENABLE_DELETE:
        logging.info("Performing Delete...")
        # using bulk delete
        query.delete(synchronize_session=False)
        session.commit()
        logging.info("Finished Performing Delete")
    else:
        logging.warning("You've opted to skip deleting the db entries!!!")

    logging.info("Finished Running Cleanup Process")


for db_object in DATABASE_OBJECTS:

    cleanup_op = PythonOperator(task_id='cleanup_' +
                                str(db_object["airflow_db_model"].__name__),
                                python_callable=cleanup_function,
                                params=db_object,
                                provide_context=True,
                                dag=dag)

    print_configuration.set_downstream(cleanup_op)
Code Example #17
    os.system(' '.join(['gsutil rm', BUCKET_LOC + filename]))
    os.system(' '.join(['gsutil cp', full_filename, BUCKET_LOC]))


def process_local():
    # recreate the draft-kings.csv file
    return 0


dag = DAG(dag_id='dk_data',
          description='Download and Process DraftKings Data',
          default_args=default_args,
          schedule_interval='0 14 * * 3')

source_to_local = PythonOperator(task_id='source_to_local',
                                 python_callable=source_to_local,
                                 dag=dag)

local_to_gs = PythonOperator(task_id='local_to_gs',
                             python_callable=local_to_gs,
                             dag=dag)

# process_local = PythonOperator(
#     task_id='process_local',
#     python_callable=process_local,
#     dag=dag)

# setting dependencies
# source_to_local.set_downstream([process_local, local_to_gs])
source_to_local.set_downstream(local_to_gs)
Code Example #18
    'start_date': datetime(2020, 3, 3),
    'retries': 3,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG('CTK_JNJ_MASTER',
          default_args=default_args,
          dagrun_timeout=timedelta(days=1),
          description="jnj mastering starting with tokenizing",
          schedule_interval='0 1 * * *')

# 0. dummy
op_starting = DummyOperator(task_id='execute', dag=dag)
# 1. tokenize
op_tokenize = PythonOperator(task_id='tkn_daily',
                             python_callable=tkn_daily,
                             dag=dag)
# 2. brand_classification
op_brand_classification = PythonOperator(
    task_id='jnj_brand_classification',
    python_callable=jnj_brand_classification,
    dag=dag)
# 3. mastering s_sp_item
op_s_sp_item_result = PythonOperator(task_id='s_sp_item_result',
                                     python_callable=s_sp_item_result,
                                     dag=dag)

op_starting.set_downstream(op_tokenize)
op_tokenize.set_downstream(op_brand_classification)
op_brand_classification.set_downstream(op_s_sp_item_result)
Code Example #19
# create dag and schedule a load interval every day at midnight (7am UTC)
dag = DAG('extract_and_load',
          catchup=False,
          default_args=default_args,
          schedule_interval=timedelta(days=1),
          max_active_runs=1)

# task to create table if it does not exist
task_create_table = PostgresOperator(
    task_id='task_create_table',
    sql='./extract_load_pipeline/sql/create_postgres_table.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

# extracts bq to a gcs bucket as csv
task_bq_to_gcs = PythonOperator(
    task_id='task_bq_to_gcs',
    python_callable=bq_to_gcs,
    provide_context=True,
    op_kwargs={'start_date': default_args['start_date']},
    dag=dag)

# loads postgres table from csv
task_gcs_to_postgres = PythonOperator(task_id='task_gcs_to_postgres',
                                      python_callable=load_table,
                                      provide_context=True,
                                      dag=dag)

task_create_table.set_downstream(task_bq_to_gcs)
task_bq_to_gcs.set_downstream(task_gcs_to_postgres)
Code Example #20
                                 'Content-Type': 'application/json'
                             }).json()['log']
        for line in lines:
            logging.info(line)
        time.sleep(10)
    if statement_status == 'success':
        #curl -X DELETE localhost:8998/batches/53
        requests.delete(statement_url,
                        headers={'Content-Type': 'application/json'})
        final_statement_status = 'success'
        return
    if final_statement_status == 'dead':
        logging.info('Statement exception: ' + lines.json()['log'])
        for trace in statement_response.json()['output']['traceback']:
            logging.info(trace)
        raise ValueError('Final Statement Status: ' + final_statement_status)
    logging.info('Final Statement Status: ' + final_statement_status)


generatePerfilInput = PythonOperator(task_id='generatePerfilInput',
                                     python_callable=generate_perfil_input,
                                     dag=dag)

launchPerfilTraining = PythonOperator(task_id='launchPerfilTraining',
                                      python_callable=launch_perfil_training,
                                      dag=dag)

generatePerfilInput.set_downstream(launchPerfilTraining)

#generatePerfilInput >> launchPerfilTraining
Code Example #21
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 0,  # Defaults to not retrying.. fails if first attempt fails.
    'retry_delay': timedelta(
        minutes=5
    )  # Won't be used if retries left at 0 but retries can be overridden
}

# default pip airflow install is 1.7
# with the Airflow 1.8 context manager feature,
# it would be `with DAG() as dag:` and all the operators in that scope would have dag=dag by default

dag = DAG('pysparkexec', default_args=default_args)

adapt_model = PythonOperator(task_id='adapt_model',
                             dag=dag,
                             python_callable=sophia_air.adapt_model,
                             provide_context=False)
create_env = BashOperator(task_id='create_env',
                          dag=dag,
                          bash_command='sleep 5 && echo "slept"')
run_it = PythonOperator(task_id='run_it',
                        dag=dag,
                        python_callable=sophia_air.run_it)

# in 1.8 it will be
# get_git >> run_it << get_cluster
adapt_model.set_downstream(run_it)
create_env.set_downstream(run_it)
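
As the comments note, on Airflow 1.8+ the same DAG can be written with the context manager and bitshift syntax. A rough equivalent sketch of the block above, using the same task names and callables:

with DAG('pysparkexec', default_args=default_args) as dag:
    adapt_model = PythonOperator(task_id='adapt_model',
                                 python_callable=sophia_air.adapt_model,
                                 provide_context=False)
    create_env = BashOperator(task_id='create_env',
                              bash_command='sleep 5 && echo "slept"')
    run_it = PythonOperator(task_id='run_it',
                            python_callable=sophia_air.run_it)

    # both upstream tasks must finish before run_it
    adapt_model >> run_it << create_env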
Code Example #22
File: bert_reviews.py Project: miadp/workshop
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

train_task = PythonOperator(
    task_id='train',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

model_task = PythonOperator(
    task_id='model',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

deploy_task = PythonOperator(
    task_id='deploy',
    dag=dag,
    provide_context=False,
    python_callable=preprocess.preprocess,
    op_kwargs=config["preprocess_data"])

# set the dependencies between tasks

init.set_downstream(process_task)
process_task.set_downstream(train_task)
train_task.set_downstream(model_task)
model_task.set_downstream(deploy_task)
Code Example #23
start_task = PythonOperator(
    task_id="start_task", python_callable=print_stuff, dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "annotations": {"test": "annotation"}
        }
    }
)

# You can mount volume or secret to the worker pod
second_task = PythonOperator(
    task_id="four_task", python_callable=test_volume_mount, dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "volumes": [
                {
                    "name": "test-volume",
                    "hostPath": {"path": "/tmp/"},
                },
            ],
            "volume_mounts": [
                {
                    "mountPath": "/foo/",
                    "name": "test-volume",
                },
            ]
        }
    }
)

start_task.set_downstream(second_task)
Code Example #24
    ["/Users/ravimuthyala/AirflowSparkTestCode/receipts.csv"],
    'driver_memory': '1g',
    'executor_cores': 1,
    'num_executors': 1,
    'executor_memory': '1g'
}

spark_submit_operator = SparkSubmitOperator(task_id='Spark_Scala_Submit_Job',
                                            dag=dag,
                                            **spark_config)

emailNotify = EmailOperator(task_id='email_notification',
                            to='*****@*****.**',
                            subject='Spark Submit Job Alert',
                            html_content='Airflow Spark Submit Job Done',
                            dag=dag)

t1Failed = EmailOperator(dag=dag,
                         trigger_rule=TriggerRule.ONE_FAILED,
                         task_id="SparkJobFailed",
                         to=["*****@*****.**"],
                         subject="Spark job Failed",
                         html_content='<h3>Spark job has failed</h3>')

python_operator.set_downstream(spark_submit_operator)
spark_submit_operator.set_downstream(emailNotify)
t1Failed.set_upstream([spark_submit_operator])

if __name__ == '__main__':
    dag.cli()
Code Example #25
# [START howto_operator_python]
def print_context(ds, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


def dag_run(context, dag_run_obj):
    print("[dag_run] %s" % dag_run_obj)
    return dag_run_obj


run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

trigger_hdfs = TriggerDagRunOperator2(
    task_id="trigger_the_dag",
    trigger_dag_id="example_python_operator",
    python_callable=dag_run,
    execution_date="{{ execution_date }}",
    dag=dag,
)

run_this.set_downstream(trigger_hdfs)
Code Example #26
File: generate_dags.py Project: mferon/depc
def create_subdag(dag_parent, label, team):
    dag_id_child = "%s.%s" % (dag_parent.dag_id, label)
    schema = team["schema"][label]

    dag = DAG(
        dag_id=dag_id_child,
        default_args=dag_parent.default_args,
        schedule_interval=dag_parent.schedule_interval,
    )

    # Find the corresponding operator and its parameters
    fn, operator_params = find_label_operator(schema["qos"])

    # Label is declared but there is no node in Neo4j
    count = team["labels"][label]
    if not count:
        DummyOperator(task_id="{}.notask".format(label), dag=dag)
        return dag, operator_params.get("dependencies")

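    # Split the label's nodes into at most ~100 chunks: "length" becomes the
    # chunk size and each chunk task processes "length" nodes from offset "skip".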
    if count < 100:
        length = count
    else:
        frac, length = math.modf(count / 100)
        if frac:
            length += 1

    chunks = {
        "{}.chunk.{}".format(label, i): i
        for i in range(0, count, int(length))
    }

    tasks = []
    for name, skip in chunks.items():

        # All custom operators share these parameters
        params = {
            "app": app,
            "team": team,
            "label": label,
            "skip": skip,
            "length": length,
            **operator_params,
        }

        tasks.append(fn(task_id=name, dag=dag, params=params))

    with dag:
        delete_redis_avg_op = PythonOperator(
            task_id="{}.del_redis_average".format(label),
            provide_context=True,
            python_callable=delete_redis_avg,
            params={
                "app": app,
                "team": team,
                "label": label
            },
        )

        before_subdag_task = BeforeSubdagOperator(
            task_id="{}.before_subdag".format(label),
            params={
                "app": app,
                "team": team,
                "label": label,
                "count": count
            },
        )

        after_subdag_task = AfterSubdagOperator(
            task_id="{}.after_subdag".format(label),
            params={
                "app": app,
                "team": team,
                "label": label
            },
        )

        after_chunks_task = DummyOperator(task_id="{}.dummy".format(label))

        average_op = AverageOperator(
            task_id="{}.average".format(label),
            params={
                "app": app,
                "team": team,
                "label": label
            },
        )

        daily_worst_op = DailyWorstOperator(
            task_id="{}.daily_worst".format(label),
            params={
                "app": app,
                "team": team,
                "label": label
            },
        )

    before_subdag_task.set_downstream(delete_redis_avg_op)
    delete_redis_avg_op.set_downstream(tasks)
    after_chunks_task.set_upstream(tasks)
    after_chunks_task.set_downstream([average_op, daily_worst_op])
    after_subdag_task.set_upstream([average_op, daily_worst_op])

    return dag, operator_params.get("dependencies")
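
A hypothetical caller sketch (the surrounding generate_dags code is not shown here): the returned child DAG would typically be attached to its parent through a SubDagOperator whose task_id matches the label, since the child dag_id is "<parent>.<label>":

subdag, dependencies = create_subdag(dag_parent, label, team)
attach_subdag = SubDagOperator(task_id=label, subdag=subdag, dag=dag_parent)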
Code Example #27
    execution_dates = list((datetime_range(start=start_date, end=end_date)))

    for i, ex_date in enumerate(execution_dates):

        ed = ex_date.strftime('%Y-%m-%d')

        ep_template = {'sdt': ed}

        get_enpdpoints_task_id = "get_{0}_dl_endpoint_{1}".format(
            endpoint_name, ed)
        file_to_gcs_task_id = "{0}_{1}_to_gcs".format(endpoint_name, ed)

        t2 = PythonOperator(task_id=get_enpdpoints_task_id,
                            python_callable=get_endpoint_with_dates,
                            op_args=[SAVE_PATH, BASE_URL, API_KEYS],
                            templates_dict=ep_template)

        t3 = FileToGoogleCloudStorageOperator(
            task_id=file_to_gcs_task_id,
            google_cloud_storage_conn_id='gcs_silo',
            bucket="deanslist",
            src="{{ task_instance.xcom_pull(task_ids='" +
            get_enpdpoints_task_id + "', key='dl_file_path' )}}",
            dst="TEST/" + endpoint_name +
            "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id +
            "', key='dl_file_name') }}",
            dag=dag)

        t2.set_upstream(t1)
        t2.set_downstream(t3)
Code Example #28
    task_id="print_task",
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)


def sleep(seconds):
    time.sleep(seconds)


def make_sleep_task(task_name, dag):
    seconds = random.randint(1, 3)
    task = PythonOperator(
        task_id=task_name,
        python_callable=sleep,
        op_kwargs={"seconds": float(seconds) / 10},
        dag=dag,
    )
    return task


# print_task > sleep_task
first_sleep = make_sleep_task("first_sleep", dag)
last_sleep = make_sleep_task("last_sleep", dag)

print_task.set_downstream(last_sleep)

# sleep_task > print_task
first_sleep >> print_task
Code Example #29
)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(task_id='test_subdag_fail',
                               dag=subdag7,
                               python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(task_id='test_subdag_dummy_2', dag=subdag7)
dag7_subdag1 = SubDagOperator(task_id='subdag', dag=dag7, subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
dag8 = DAG(dag_id='test_dagrun_states_root_fail_unfinished',
           default_args=default_args)
dag8_task1 = DummyOperator(
    task_id='test_dagrun_unfinished',  # The test will unset the task instance state after
    # running this test
    dag=dag8,
)
dag8_task2 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag8,
    python_callable=fail,
)
Code Example #30
File: twitter.py Project: fartashh/ETL-boilerplate
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('twitter', default_args=default_args, schedule_interval=timedelta(minutes=5))

twitter_transformer = TwitterTransformer()
twitter_loader = TwitterLoader()





analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    provide_context=True,
    python_callable=twitter_transformer.process,
    dag=dag)

transfer_to_elastic = PythonOperator(
    task_id='transfer_to_elastic',
    provide_context=True,
    python_callable=twitter_loader.load_into_elastic,
    dag=dag)

analyze_tweets.set_downstream(transfer_to_elastic)
Code Example #31
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('bollinger',
          default_args=default_args,
          schedule_interval="*/30 6-14 * * 1-5")

t1 = PythonOperator(
    task_id='bb',
    python_callable=calc_bb.get_spy_data,
    #schedule_interval="0 13 * * *",
    provide_context=True,
    dag=dag,
)

t2 = PythonOperator(
    task_id='plot_bb',
    python_callable=calc_bb.plot,
    dag=dag,
)

t1.set_downstream(t2)
Code Example #32
                <tr><td><b> Task ID: </b></td><td>{{ task_instance.task_id }}</td></tr>
                <tr><td><b> Execution Date: </b></td><td>{{ task_instance.execution_date }}</td></tr>
                <tr><td><b> Start Date: </b></td><td>{{ task_instance.start_date }}</td></tr>
                <tr><td><b> End Date: </b></td><td>{{ task_instance.end_date }}</td></tr>
                <tr><td><b> Host Name: </b></td><td>{{ task_instance.hostname }}</td></tr>
                <tr><td><b> Unix Name: </b></td><td>{{ task_instance.unixname }}</td></tr>
                <tr><td><b> Job ID: </b></td><td>{{ task_instance.job_id }}</td></tr>
                <tr><td><b> Queued Date Time: </b></td><td>{{ task_instance.queued_dttm }}</td></tr>
                <tr><td><b> Log URL: </b></td><td><a href="{{ task_instance.log_url }}">{{ task_instance.log_url }}</a></td></tr>
            </table>

            <h2>Processes Killed</h2>
            <ul>
            {% for process_killed in task_instance.xcom_pull(task_ids='kill_halted_tasks', key='kill_halted_tasks.processes_to_kill') %}
                <li>Process {{loop.index}}</li>
                <ul>
                {% for key, value in process_killed.items() %}
                    <li>{{ key }}: {{ value }}</li>
                {% endfor %}
                </ul>
            {% endfor %}
            </ul>
        </body>
    </html>
    """,
    dag=dag)


kill_halted_tasks.set_downstream(email_or_not_branch)
email_or_not_branch.set_downstream(send_processes_killed_email)
Code Example #33

o = PythonOperator(
    task_id='United_Kingdom',
    provide_context=True,
    python_callable=UK,
    dag=dag,
)


p = PythonOperator(
    task_id='Generate_Heat_Map',
    provide_context=True,
    python_callable=map,
    dag=dag,
)


#---------------------------#
#        Dependencies       #
#---------------------------#

# a = root
a.set_downstream(b)


# b = bottleneck to three threads
b.set_downstream(c)
c.set_downstream([d, e, f, g, h, i, j, k, l, m, n, o])
p.set_upstream([d, e, f, g, h, i, j, k, l, m, n, o])
Code Example #34
    dag=cell_image_analysis_2channels_dag,
)


def prepare_cellprofiler_csv(ds, **kwargs):
    """Prepare the cellprofiler csv based on the args"""
    df = get_cell_images_df(**kwargs)
    kwargs['ti'].xcom_push(key='cell_images_df', value=df)
    return


prepare_cellprofiler_csv_op = PythonOperator(
    task_id='prepare_cellprofiler_csv',
    provide_context=True,
    python_callable=prepare_cellprofiler_csv,
    dag=cell_image_analysis_2channels_dag)

prepare_cellprofiler_csv_op.set_downstream(image_conversion_dag)
cellprofiler_tasks = cell_image_analysis_generate_cellprofiler_task(
    cell_image_analysis_2channels_dag)
cellprofiler_branch_tasks = cell_image_analysis_generate_decide_run_cellprofiler(
    cell_image_analysis_2channels_dag)
image_conversion_dag.set_downstream(cellprofiler_branch_tasks)

cell_image_analysis_no_images_to_run_op.set_upstream(cellprofiler_branch_tasks)

for idx, cellprofiler_branch_task in enumerate(cellprofiler_branch_tasks):
    cellprofiler_branch_task.set_downstream(cellprofiler_tasks[idx])

cell_image_analysis_combine_cellprofiler_csvs.set_upstream(cellprofiler_tasks)
Code Example #35
            'service_name': service_name,
            'machine_service_name': machine_service_name
        },
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    service_tasks.append(get_task)

    #: join_council_districts must run before get_task
    get_task.set_upstream(create_prod_files)

    if i == 'pothole':
        #: get_task must run before sonar potholes
        get_task.set_downstream(create_potholes_sonar)

filename = conf['prod_data_dir'] + "/get_it_done_*.csv"
files = [os.path.basename(x) for x in glob.glob(filename)]

for index, file_ in enumerate(files):
    file_name = file_.split('.')[0]
    name_parts = file_name.split('_')
    task_name = '_'.join(name_parts[3:-2])
    md_name = '-'.join(name_parts[3:-2])

    #: Upload prod gid file to S3
    upload_task = S3FileTransferOperator(
        task_id='upload_' + task_name,
        source_base_path=conf['prod_data_dir'],
        source_key='get_it_done_{}_requests_datasd.csv'.format(