Example #1
from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import BranchPythonOperator


def create_dag(dag_id, value):
    def run_print_var():
        return "go_fail"

    default_args = {
        'owner': 'kwas',
        'start_date': datetime(2018, 9, 6),
        'var': 'default'
    }

    dag = DAG(dag_id, default_args=default_args)

    print_date = BashOperator(task_id='print_date',
                              bash_command='date',
                              dag=dag)

    branch = BranchPythonOperator(task_id='branch',
                                  python_callable=run_print_var,
                                  dag=dag)
    branch.set_upstream(print_date)

    fail = BashOperator(
        task_id='go_fail',
        bash_command='if [ ! -f /tmp/kwas-fail ]; then exit 1; fi',
        dag=dag)
    fail.set_upstream(branch)

    finish = BashOperator(task_id='final_task',
                          bash_command='echo finish',
                          trigger_rule='all_success',
                          dag=dag)
    finish.set_upstream(fail)

    return dag
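
create_dag is only a factory: Airflow's scheduler discovers DAG objects bound to module-level names, so a snippet like this is normally followed by a registration loop. A minimal sketch, assuming two made-up (dag_id, value) pairs:

# Register the generated DAGs at module level so the scheduler can discover them.
# The (dag_id, value) pairs below are placeholders, not part of the original snippet.
for example_dag_id, example_value in [('branch_demo_a', 'a'), ('branch_demo_b', 'b')]:
    globals()[example_dag_id] = create_dag(example_dag_id, example_value)
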
def deploy_tasks(model, parent_dag_name, child_dag_name, default_args,
                 PROJECT_ID, MODEL_NAME, MODEL_VERSION, MODEL_LOCATION):
    # Create inner dag
    dag = DAG("{0}.{1}".format(parent_dag_name, child_dag_name),
              default_args=default_args,
              schedule_interval=None)

    # Constants
    OTHER_VERSION_NAME = "v_{0}".format(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S")[0:12])

    # Create model on ML-Engine
    bash_ml_engine_models_list_op = BashOperator(
        task_id="bash_ml_engine_models_list_{}_task".format(
            model.replace(".", "_")),
        xcom_push=True,
        bash_command="gcloud ml-engine models list --filter='name:{0}'".format(
            MODEL_NAME + model.replace(".", "_")),
        dag=dag)

    def check_if_model_already_exists(templates_dict, **kwargs):
        cur_model = templates_dict["model"].replace(".", "_")
        ml_engine_models_list = kwargs["ti"].xcom_pull(
            task_ids="bash_ml_engine_models_list_{}_task".format(cur_model))
        logging.info(
            "check_if_model_already_exists: {}: ml_engine_models_list = \n{}".
            format(cur_model, ml_engine_models_list))
        create_model_task = "ml_engine_create_model_{}_task".format(cur_model)
        dont_create_model_task = "dont_create_model_dummy_branch_{}_task".format(
            cur_model)
        if len(ml_engine_models_list) == 0 or ml_engine_models_list == "Listed 0 items.":
            return create_model_task
        return dont_create_model_task

    check_if_model_already_exists_op = BranchPythonOperator(
        task_id="check_if_model_already_exists_{}_task".format(
            model.replace(".", "_")),
        templates_dict={"model": model.replace(".", "_")},
        python_callable=check_if_model_already_exists,
        provide_context=True,
        dag=dag)

    ml_engine_create_model_op = MLEngineModelOperator(
        task_id="ml_engine_create_model_{}_task".format(model.replace(
            ".", "_")),
        project_id=PROJECT_ID,
        model={"name": MODEL_NAME + model.replace(".", "_")},
        operation="create",
        dag=dag)

    create_model_dummy_op = DummyOperator(
        task_id="create_model_dummy_{}_task".format(model.replace(".", "_")),
        trigger_rule="all_done",
        dag=dag)

    dont_create_model_dummy_branch_op = DummyOperator(
        task_id="dont_create_model_dummy_branch_{}_task".format(
            model.replace(".", "_")),
        dag=dag)

    dont_create_model_dummy_op = DummyOperator(
        task_id="dont_create_model_dummy_{}_task".format(
            model.replace(".", "_")),
        trigger_rule="all_done",
        dag=dag)

    # Create version of model on ML-Engine
    bash_ml_engine_versions_list_op = BashOperator(
        task_id="bash_ml_engine_versions_list_{}_task".format(
            model.replace(".", "_")),
        xcom_push=True,
        bash_command=
        "gcloud ml-engine versions list --model {0} --filter='name:{1}'".
        format(MODEL_NAME + model.replace(".", "_"), MODEL_VERSION),
        dag=dag)

    def check_if_model_version_already_exists(templates_dict, **kwargs):
        cur_model = templates_dict["model"].replace(".", "_")
        ml_engine_versions_list = kwargs["ti"].xcom_pull(
            task_ids="bash_ml_engine_versions_list_{}_task".format(cur_model))
        logging.info(
            "check_if_model_version_already_exists: {}: ml_engine_versions_list = \n{}"
            .format(cur_model, ml_engine_versions_list))
        create_version_task = "ml_engine_create_version_{}_task".format(
            cur_model)
        create_other_version_task = "ml_engine_create_other_version_{}_task".format(
            cur_model)
        if len(ml_engine_versions_list) == 0 or ml_engine_versions_list == "Listed 0 items.":
            return create_version_task
        return create_other_version_task

    check_if_model_version_already_exists_op = BranchPythonOperator(
        task_id="check_if_model_version_already_exists_{}_task".format(
            model.replace(".", "_")),
        templates_dict={"model": model.replace(".", "_")},
        python_callable=check_if_model_version_already_exists,
        provide_context=True,
        dag=dag)

    ml_engine_create_version_op = MLEngineVersionOperator(
        task_id="ml_engine_create_version_{}_task".format(
            model.replace(".", "_")),
        project_id=PROJECT_ID,
        model_name=MODEL_NAME + model.replace(".", "_"),
        version_name=MODEL_VERSION,
        version={
            "name": MODEL_VERSION,
            "deploymentUri": MODEL_LOCATION + model.replace(".", "_"),
            "runtimeVersion": "1.13",
            "framework": "TENSORFLOW",
            "pythonVersion": "3.5",
        },
        operation="create",
        dag=dag)

    ml_engine_create_other_version_op = MLEngineVersionOperator(
        task_id="ml_engine_create_other_version_{}_task".format(
            model.replace(".", "_")),
        project_id=PROJECT_ID,
        model_name=MODEL_NAME + model.replace(".", "_"),
        version_name=OTHER_VERSION_NAME,
        version={
            "name": OTHER_VERSION_NAME,
            "deploymentUri": MODEL_LOCATION + model.replace(".", "_"),
            "runtimeVersion": "1.13",
            "framework": "TENSORFLOW",
            "pythonVersion": "3.5",
        },
        operation="create",
        dag=dag)

    ml_engine_set_default_version_op = MLEngineVersionOperator(
        task_id="ml_engine_set_default_version_{}_task".format(
            model.replace(".", "_")),
        project_id=PROJECT_ID,
        model_name=MODEL_NAME + model.replace(".", "_"),
        version_name=MODEL_VERSION,
        version={"name": MODEL_VERSION},
        operation="set_default",
        dag=dag)

    ml_engine_set_default_other_version_op = MLEngineVersionOperator(
        task_id="ml_engine_set_default_other_version_{}_task".format(
            model.replace(".", "_")),
        project_id=PROJECT_ID,
        model_name=MODEL_NAME + model.replace(".", "_"),
        version_name=OTHER_VERSION_NAME,
        version={"name": OTHER_VERSION_NAME},
        operation="set_default",
        dag=dag)

    # Build dependency graph, set_upstream dependencies for all tasks
    check_if_model_already_exists_op.set_upstream(
        bash_ml_engine_models_list_op)

    ml_engine_create_model_op.set_upstream(check_if_model_already_exists_op)
    create_model_dummy_op.set_upstream(ml_engine_create_model_op)
    dont_create_model_dummy_branch_op.set_upstream(
        check_if_model_already_exists_op)
    dont_create_model_dummy_op.set_upstream(dont_create_model_dummy_branch_op)

    bash_ml_engine_versions_list_op.set_upstream(
        [dont_create_model_dummy_op, create_model_dummy_op])
    check_if_model_version_already_exists_op.set_upstream(
        bash_ml_engine_versions_list_op)

    ml_engine_create_version_op.set_upstream(
        check_if_model_version_already_exists_op)
    ml_engine_create_other_version_op.set_upstream(
        check_if_model_version_already_exists_op)

    ml_engine_set_default_version_op.set_upstream(ml_engine_create_version_op)
    ml_engine_set_default_other_version_op.set_upstream(
        ml_engine_create_other_version_op)

    return dag
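
deploy_tasks only builds and returns the inner DAG; the parent pipeline still has to mount it, and because the inner dag_id is "{parent}.{child}", the SubDagOperator's task_id has to equal child_dag_name. A sketch of that wiring, in which parent_dag, PARENT_DAG_NAME and the model string are assumptions:

from airflow.operators.subdag_operator import SubDagOperator

model = "v1.0"  # illustrative model identifier
child_name = "deploy_{}_task".format(model.replace(".", "_"))

deploy_op = SubDagOperator(
    task_id=child_name,  # must equal child_dag_name so the "{parent}.{child}" ids line up
    subdag=deploy_tasks(model, PARENT_DAG_NAME, child_name, default_args,
                        PROJECT_ID, MODEL_NAME, MODEL_VERSION, MODEL_LOCATION),
    dag=parent_dag)
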
t3 = PythonOperator(  # opening line reconstructed; the snippet is cut off above this point
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/'
                'jars/hadoop-0.20.1-dev-streaming.jar '
                '-mapper wc '
                '-numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/'
                'data/3.tsv -output '
                's3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)

dag = DAG(
    dag_id='example_branch_operator',
    default_args=args,
    schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
    dummy_follow.set_downstream(join)
Example #5
    f"{BASE_PACKAGE}.transactional-tables",
    "OutletsByDate",
    dag,
    RETAIL_ID,
    schema_name,
    ENV_TYPE,
)
items_by_date_task = bash_operator_for_spark_submit(
    f"{BASE_PACKAGE}.transactional-tables",
    "ItemsByDate",
    dag,
    RETAIL_ID,
    schema_name,
    ENV_TYPE,
)

push_instruments.set_downstream(push_server_details)
branch_task.set_upstream(push_server_details)
branch_task.set_downstream(master_tables_load)

branch_task.set_downstream(history_load_done)
master_tables_load.set_downstream(create_table_structure)
history_load_done.set_downstream(create_table_structure)
create_table_structure.set_downstream(unix_chmod_task)
unix_chmod_task.set_downstream(market_baskets_task)
market_baskets_task.set_downstream(
    [transaction_line_item_task, outlets_by_date_task, items_by_date_task])
data_load_done.set_upstream(
    [transaction_line_item_task, outlets_by_date_task, items_by_date_task])
create_constraint_task.set_upstream(data_load_done)
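
branch_task sends each run down either master_tables_load or history_load_done, so only one parent of create_table_structure actually runs; with the default all_success trigger rule the skipped parent would cascade and skip the join as well. That task is defined above the visible part of the snippet, but it would typically look something like this sketch (the operator type and bash command are placeholders):

# Hypothetical re-creation of the join task defined above the visible snippet. The
# important part is the trigger_rule: 'one_success' (or 'none_failed') lets the task
# run even though the branch skips one of its upstream parents.
create_table_structure = BashOperator(
    task_id='create_table_structure',
    bash_command='echo "create table structures"',  # placeholder command
    trigger_rule='one_success',
    dag=dag)
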
    bq_check_eval_data_op.set_upstream(bq_eval_data_op)

    bash_remove_old_data_op.set_upstream(
        [bq_check_train_data_op, bq_check_eval_data_op])

    bq_export_gcs_train_csv_op.set_upstream([bash_remove_old_data_op])
    bq_export_gcs_eval_csv_op.set_upstream([bash_remove_old_data_op])

    ml_engine_training_op.set_upstream(
        [bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op])

    bash_remove_old_saved_model_op.set_upstream(ml_engine_training_op)
    bash_copy_new_saved_model_op.set_upstream(bash_remove_old_saved_model_op)

    bash_ml_engine_models_list_op.set_upstream(ml_engine_training_op)
    check_if_model_already_exists_op.set_upstream(
        bash_ml_engine_models_list_op)

    ml_engine_create_model_op.set_upstream(check_if_model_already_exists_op)
    create_model_dummy_op.set_upstream(ml_engine_create_model_op)
    dont_create_model_dummy_branch_op.set_upstream(
        check_if_model_already_exists_op)
    dont_create_model_dummy_op.set_upstream(dont_create_model_dummy_branch_op)

    bash_ml_engine_versions_list_op.set_upstream(
        [dont_create_model_dummy_op, create_model_dummy_op])
    check_if_model_version_already_exists_op.set_upstream(
        bash_ml_engine_versions_list_op)

    ml_engine_create_version_op.set_upstream([
        bash_copy_new_saved_model_op, check_if_model_version_already_exists_op
    ])
Example #7
class MLTaskSubDag(LoggingMixin):
    """ Class for Epi Tasks subDAGs """

    def __init__(self,
                 args: Dict,
                 parent_dag_id: str,
                 child_dag_id: str,
                 repository_class: Type[TaskRepositoryMixin],
                 engine: Engine = None):
        """ Defines subDAG tasks """

        self._parent_dag_id = parent_dag_id
        self._child_dag_id = child_dag_id
        self._repository_class = repository_class
        self._engine = engine

        self._subdag = DAG(
            dag_id=f'{self._parent_dag_id}.{self._child_dag_id}',
            default_args=args,
            schedule_interval=None)

        self._initialize_task_operator = PythonOperator(
            task_id=f'initialize_{self._child_dag_id}',
            provide_context=True,
            python_callable=self._initialize_task,
            dag=self._subdag)

        self._conditional_operator = BranchPythonOperator(
            task_id=f'conditional_{self._child_dag_id}',
            provide_context=True,
            python_callable=self._execute_or_skip_task,
            dag=self._subdag)

        self._dummy_operator = DummyOperator(
            task_id=f'skip_{self._child_dag_id}',
            dag=self._subdag)

        self._start_task_in_db_operator = PythonOperator(
            task_id=f'start_task_in_db_{self._child_dag_id}',
            provide_context=True,
            python_callable=self._start_task,
            dag=self._subdag)

        self._parametrized_bash_operator = ParametrizedBashOperator(
            task_id=f'bash_{self._child_dag_id}',
            parameters_provider=self._parameters_provider,
            bash_command='echo',
            dag=self._subdag)

        self._finish_task_in_db_operator = PythonOperator(
            task_id=f'finish_task_in_db_{self._child_dag_id}',
            provide_context=True,
            python_callable=self._finish_task,
            dag=self._subdag)

        self._join_operator = DummyOperator(
            task_id=f'join_{self._child_dag_id}',
            trigger_rule='one_success',
            dag=self._subdag)

    def _initialize_task(self,
                         **kwargs) -> None:
        """ Inserts task with ml_dag_id into DB, if it doesn't already exists in DB

        Args:
            **kwargs: Airflow context

        """
        self.log.debug(f'kwargs: {kwargs}')

        ml_dag_id = dag_utils.get_ml_dag_id(parent_dag_id=self._parent_dag_id, **kwargs)

        try:
            self._repository_class(engine=self._engine).insert_task_with_ml_dag_id(ml_dag_id=ml_dag_id)
        except DBException:
            pass

    def _execute_or_skip_task(self,
                              **kwargs) -> str:
        """ Conditional that chooses task that should be executed after branching based on presence of datetime_finished
        in repository for task (based on repository_class).

        Args:
            **kwargs: Airflow context

        Returns: Name of the task that should be executed after branching

        """
        self.log.debug(f'kwargs: {kwargs}')

        ml_dag_id = dag_utils.get_ml_dag_id(parent_dag_id=self._parent_dag_id, **kwargs)

        if self._repository_class(engine=self._engine).is_task_finished(ml_dag_id=ml_dag_id):
            return 'skip_{}'.format(self._child_dag_id)
        else:
            return 'start_task_in_db_{}'.format(self._child_dag_id)

    def _start_task(self,
                    **kwargs) -> None:
        """ Writes datetime_started to task table (based on repository_class) for ml_dag_id

        Args:
            **kwargs: Airflow context

        """
        self.log.debug(f'kwargs: {kwargs}')

        ml_dag_id = dag_utils.get_ml_dag_id(parent_dag_id=self._parent_dag_id, **kwargs)

        self._repository_class(engine=self._engine).start_task(ml_dag_id=ml_dag_id)

    def _finish_task(self,
                     **kwargs) -> None:
        """ Writes datetime_finished to task table (based on repository_class) for ml_dag_id

        Args:
            **kwargs: Airflow context

        """
        self.log.debug(f'kwargs: {kwargs}')

        ml_dag_id = dag_utils.get_ml_dag_id(parent_dag_id=self._parent_dag_id, **kwargs)

        self._repository_class(engine=self._engine).finish_task(ml_dag_id=ml_dag_id)

    @abc.abstractmethod
    def _parameters_provider(self,
                             **kwargs) -> str:
        """ Abstract Callable that provides additional parameters for Bash calls.

        Returns: An empty string, unless overridden by a subclass

        """
        return ''

    def build(self) -> DAG:
        """ Constructs and returns initialized subDAG """
        # DAG edges definitions
        self._conditional_operator.set_upstream(self._initialize_task_operator)
        self._start_task_in_db_operator.set_upstream(self._conditional_operator)
        self._parametrized_bash_operator.set_upstream(self._start_task_in_db_operator)
        self._finish_task_in_db_operator.set_upstream(self._parametrized_bash_operator)

        self._dummy_operator.set_upstream(self._conditional_operator)

        self._join_operator.set_upstream([self._dummy_operator, self._finish_task_in_db_operator])

        return self._subdag
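
MLTaskSubDag only builds the subDAG; a parent DAG mounts the result with a SubDagOperator, and a concrete subclass would normally override _parameters_provider. A minimal sketch of the wiring, in which the parent DAG, args, engine, the FeatureTaskRepository class and the child id are all stand-ins:

from airflow.operators.subdag_operator import SubDagOperator

PARENT_DAG_ID = 'ml_pipeline'  # hypothetical parent dag_id

parent_dag = DAG(dag_id=PARENT_DAG_ID, default_args=args, schedule_interval=None)

# Build the child DAG; __init__ already names it '<parent>.<child>', which is what
# SubDagOperator expects as long as task_id matches the child id.
feature_subdag = MLTaskSubDag(args=args,
                              parent_dag_id=PARENT_DAG_ID,
                              child_dag_id='feature_extraction',       # hypothetical child id
                              repository_class=FeatureTaskRepository,  # hypothetical repository
                              engine=engine).build()

mount_feature_extraction = SubDagOperator(subdag=feature_subdag,
                                          task_id='feature_extraction',  # must match child_dag_id
                                          dag=parent_dag)
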
dag = DAG("test_branch",
          default_args=default_args,
          schedule_interval=timedelta(minutes=5),
          catchup=False)

t1 = BashOperator(
    task_id="init",
    bash_command="echo lol",
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

options = ["wowww", "wowww2"]
t2 = BranchPythonOperator(task_id='branching',
                          python_callable=lambda: random.choice(options),
                          dag=dag)

t3 = BashOperator(
    task_id="wowww",
    bash_command="echo wowwww",
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)
t4 = DummyOperator(task_id='wowww2', trigger_rule='one_success', dag=dag)

t2.set_upstream(t1)

t3.set_upstream(t2)
t4.set_upstream(t2)
    channel=slack_channel,
    username='******',
    text='Cluster has been *restarted!*\n'
         'It\'s all fine move forward with your ETLs and Crawlers!\n'
         'Message datetime: {{params.curr_date}}',
    params={'curr_date': str(datetime.now(pytz.timezone('America/Sao_Paulo')))},
    dag=dag
)

run_etl_crawler_cluster_up = SubDagOperator(
  subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_up', dag.schedule_interval),
  task_id='crawler_dag_cluster_up',
  dag=dag,
)

run_etl_crawler_cluster_restarted = SubDagOperator(
  subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_restarted', dag.schedule_interval),
  task_id='crawler_dag_cluster_restarted',
  dag=dag,
)
    
branch1.set_upstream(check_cluster)                                       
send_slack_cluster_ok.set_upstream(branch1)     
send_slack_cluster_start.set_upstream(branch1)
start_cluster.set_upstream(send_slack_cluster_start)
branch2.set_upstream(start_cluster)
send_slack_cluster_down.set_upstream(branch2)
send_slack_cluster_restarted_ok.set_upstream(branch2)
run_etl_crawler_cluster_up.set_upstream(send_slack_cluster_ok)
run_etl_crawler_cluster_restarted.set_upstream(send_slack_cluster_restarted_ok)
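
sub_dag here is a helper defined above the visible part of the snippet; by convention it returns a DAG whose dag_id is "{parent}.{child}", just as deploy_tasks does earlier on this page, and then populates it with the crawler tasks. A sketch of the usual shape, assuming shared default_args:

def sub_dag(parent_dag_name, child_dag_name, schedule_interval):
    # SubDagOperator requires the child dag_id to be '<parent_dag_id>.<subdag task_id>'.
    child = DAG(dag_id='{}.{}'.format(parent_dag_name, child_dag_name),
                default_args=default_args,  # assumed shared defaults
                schedule_interval=schedule_interval)
    # ...the real helper would add the crawler/ETL tasks to `child` here...
    return child
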
Example #10
                                             hdfs_path='/data/mydata/{{ ds }}'),
    schema='my_hive_db',
    provide_context=True,
    dag=dag
)
hdfs_to_hive_trasfer.set_upstream(create_hive_db)


count_data_rows = BranchPythonOperator(
    task_id='count_data_rows',
    python_callable=tasks.count_data_rows,
    templates_dict={'schema': 'my_hive_db'},
    provide_context=True,
    dag=dag
)
count_data_rows.set_upstream(hdfs_to_hive_trasfer)


stop_flow = DummyOperator(
    task_id='stop_flow',
    dag=dag
)

create_source_id = PythonOperator(
    task_id='create_source_id',
    python_callable=tasks.create_source_id,
    templates_dict={'source': 'mydata'},
    provide_context=True,
    dag=dag
)
create_source_id.set_upstream(source_data_sensor)
    files=["{}/latest_links.txt".format(RAW_TWEET_DIR)],
    dag=dag,
)

sub = SubDagOperator(subdag=subdag,
                     task_id="insert_and_id_pop",
                     trigger_rule="one_success",
                     dag=dag)

clear_latest = BashOperator(
    bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR),
    task_id="clear_latest",
    dag=dag,
)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r"\W+", "", term)
    simple_search = PythonOperator(
        task_id="search_{}_twitter".format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={"query": term},
    )
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
Example #12
    task_id='UpdateWarehouse',
    python_callable=update_data_warehouse,
    requirements=[
        'sendgrid==6.4.8', 'apache-airflow', 'psycopg2-binary',
        'google-cloud-bigquery', 'google-cloud-bigquery-storage', 'pandas',
        'pyarrow', 'datetime', 'pandas_gbq', 'tqdm', 'google-cloud-storage',
        'fsspec', 'sklearn', 'gcsfs', 'cloudstorage'
    ],
    python_version='3',
    trigger_rule='all_done',
    dag=dag)

Join = DummyOperator(task_id='Join', dag=dag, trigger_rule='all_done')
Skip1 = DummyOperator(task_id='Skip1', dag=dag, trigger_rule='all_done')
Skip2 = DummyOperator(task_id='Skip2', dag=dag, trigger_rule='all_done')
Skip3 = DummyOperator(task_id='Skip3', dag=dag, trigger_rule='all_done')
TrainModel.set_upstream(CheckTrainApi)
Skip1.set_upstream(CheckTrainApi)
CheckPredictProfile.set_upstream(Skip1)
CheckPredictProfile.set_upstream(TrainModel)
PredictProfile.set_upstream(CheckPredictProfile)
Skip2.set_upstream(CheckPredictProfile)
CheckUpdateWarehouse.set_upstream(Skip2)
CheckUpdateWarehouse.set_upstream(PredictProfile)
UpdateWarehouse.set_upstream(CheckUpdateWarehouse)
Skip3.set_upstream(CheckUpdateWarehouse)
Join.set_upstream(Skip3)
Join.set_upstream(UpdateWarehouse)
#PredictProfile.set_upstream(TrainModel)
#UpdateWarehouse.set_upstream(PredictProfile)
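
Each Check* task above is a BranchPythonOperator whose callable sits above the visible snippet; structurally it only has to return either the real step's task_id or the matching Skip* dummy, and the visible downstream tasks all use trigger_rule='all_done' so the skipped path never blocks the chain. A purely illustrative stand-in for one of these callables (the XCom key and the condition are invented):

def check_predict_profile(**context):
    # Illustrative only: branch to PredictProfile when training reported success,
    # otherwise fall through to the Skip2 dummy. The XCom key is not from the original DAG.
    status = context['ti'].xcom_pull(task_ids='TrainModel', key='train_status')
    return 'PredictProfile' if status == 'ok' else 'Skip2'
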
Example #13
curl = BashOperator(
    bash_command=
    r"""curl -H "Content-Type: application/json" -d '{"status":"passing", "time":"{{ ts }}"}' mock-server.default.svc.cluster.local""",
    task_id="curl-task",
    dag=dag,
)

branch = BranchPythonOperator(
    task_id='branch',
    python_callable=return_branch,
    dag=dag,
)

python_print = PythonOperator(
    task_id='python-print',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

python_fail = PythonOperator(
    task_id='python-fail',
    python_callable=exit_failure,
    dag=dag,
)

curl.set_upstream(start)
branch.set_upstream(start)
python_print.set_upstream(branch)
python_fail.set_upstream(branch)
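
return_branch, print_context, exit_failure and the start task are defined above the visible part of this snippet; the only structural requirement on return_branch is that it returns one of the downstream task_ids. A purely illustrative stand-in:

import random

def return_branch():
    # Illustrative only: pick one of the two tasks wired downstream of `branch`.
    return random.choice(['python-print', 'python-fail'])
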
t3 = PythonOperator(  # opening line reconstructed; the snippet is cut off above this point
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)
    provide_context=True,
    python_callable=clear_export_folder,
    dag=dag
)

export_athena_scifi_table = AWSAthenaOperator(
    task_id="export_athena_scifi_table",
    # query=export_athena_scifi_table_query,
    query=export_athena_scifi_table_query2,
    workgroup="devday-demo",
    database=athena_db,
    sleep_time=60,
    output_location='s3://' + s3_dlake + "/" + athena_output + 'export_athena_scifi_table'
)


export_scifi_tofile = PythonOperator(
    task_id='export_scifi_tofile',
    provide_context=True,
    python_callable=export_scifi_tofile,
    dag=dag
)

check_athena_export_table.set_upstream(disp_variables)
drop_athena_export_table.set_upstream(check_athena_export_table)
check_athena_export_table_done.set_upstream(check_athena_export_table)
check_athena_export_table_pass.set_upstream(drop_athena_export_table)
check_athena_export_table_pass.set_upstream(check_athena_export_table_done)
export_athena_scifi_table.set_upstream(clear_export_folder)
clear_export_folder.set_upstream(check_athena_export_table_pass)
export_scifi_tofile.set_upstream(export_athena_scifi_table)
Example #16
import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(12)}

dag = DAG(dag_id='example_branch_operator_further_back',
          default_args=args,
          schedule_interval="@daily")

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']


def return_current_day(**context):
    return options[context["execution_date"].weekday()]


branching = BranchPythonOperator(task_id='branching',
                                 python_callable=return_current_day,
                                 provide_context=True,
                                 dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    t.set_downstream(join)
    task_id='CalculateProbability',
    python_callable=calculate_probability,
    requirements=[
        'sendgrid==6.4.8', 'apache-airflow', 'psycopg2-binary',
        'google-cloud-bigquery', 'google-cloud-bigquery-storage', 'pandas',
        'pyarrow', 'datetime', 'pandas_gbq', 'tqdm', 'google-cloud-storage',
        'fsspec', 'gcsfs'
    ],
    python_version='3',
    trigger_rule='all_done',
    dag=dag)

Join = DummyOperator(task_id='Join', dag=dag, trigger_rule='all_done')
Skip1 = DummyOperator(task_id='Skip1', dag=dag, trigger_rule='all_done')
Skip2 = DummyOperator(task_id='Skip2', dag=dag, trigger_rule='all_done')
Skip3 = DummyOperator(task_id='Skip3', dag=dag, trigger_rule='all_done')
CallDividendApi.set_upstream(CheckCallApi)
Skip1.set_upstream(CheckCallApi)
CheckCsvLoad.set_upstream(Skip1)
CheckCsvLoad.set_upstream(CallDividendApi)
CsvLoad.set_upstream(CheckCsvLoad)
Skip2.set_upstream(CheckCsvLoad)
CheckCalculateProbability.set_upstream(Skip2)
CheckCalculateProbability.set_upstream(CsvLoad)
CalculateProbability.set_upstream(CheckCalculateProbability)
Skip3.set_upstream(CheckCalculateProbability)
Join.set_upstream(Skip3)
Join.set_upstream(CalculateProbability)
#CsvLoad.set_upstream(CallDividendApi)
#CalculateProbability.set_upstream(CsvLoad)