    dag=cell_image_analysis_2channels_dag,
)


def prepare_cellprofiler_csv(ds, **kwargs):
    """Prepare the cellprofiler csv based on the args"""
    df = get_cell_images_df(**kwargs)
    kwargs['ti'].xcom_push(key='cell_images_df', value=df)
    return


prepare_cellprofiler_csv_op = PythonOperator(
    task_id='prepare_cellprofiler_csv',
    provide_context=True,
    python_callable=prepare_cellprofiler_csv,
    dag=cell_image_analysis_2channels_dag)

prepare_cellprofiler_csv_op.set_downstream(image_conversion_dag)

cellprofiler_tasks = cell_image_analysis_generate_cellprofiler_task(
    cell_image_analysis_2channels_dag)
cellprofiler_branch_tasks = cell_image_analysis_generate_decide_run_cellprofiler(
    cell_image_analysis_2channels_dag)

image_conversion_dag.set_downstream(cellprofiler_branch_tasks)
cell_image_analysis_no_images_to_run_op.set_upstream(cellprofiler_branch_tasks)

for idx, cellprofiler_branch_task in enumerate(cellprofiler_branch_tasks):
    cellprofiler_branch_task.set_downstream(cellprofiler_tasks[idx])

cell_image_analysis_combine_cellprofiler_csvs.set_upstream(cellprofiler_tasks)
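# A minimal sketch of how a downstream task could retrieve the dataframe
# pushed above; 'read_cell_images_df' is a hypothetical consumer, not part
# of the original pipeline.
def read_cell_images_df(**kwargs):
    df = kwargs['ti'].xcom_pull(task_ids='prepare_cellprofiler_csv',
                                key='cell_images_df')
    return df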
section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)

end = DummyOperator(
    task_id='end',
    default_args=args,
    dag=dag,
)

start.set_downstream(section_1)
section_1.set_downstream(some_other_task)
some_other_task.set_downstream(section_2)
section_2.set_downstream(end)
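# A minimal sketch of the subdag() factory assumed above, modeled on the
# classic Airflow example; the real factory may differ. Airflow requires the
# sub-DAG's dag_id to be '<parent_dag>.<task_id>'.
def subdag(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval='@daily',
    )
    for i in range(3):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag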
dwh_script = BashOperator(
    task_id='dwh_script',
    bash_command=conf['bash_path']['load_to_dwh_location'] + ' ',
    on_failure_callback=send_email_failure,
    dag=dag)

send_email = EmailOperator(
    to='*****@*****.**',
    task_id='send_email_after_all_success',
    subject='DAG run for {{ ds }} was successful.',
    html_content='All the tasks for {{ yesterday_ds }} were successful. '
                 'For more details visit '
                 'http://172.20.23.251:8080/admin/airflow/tree?num_runs=25&dag_id=Jobs',
    dag=dag)

brightedge_keyword_rank_data.set_downstream(integration_script)
brightedge_share_of_voice.set_downstream(integration_script)
doubleclick_search_campaign.set_downstream(integration_script)
doubleclick_search_productadvertised.set_downstream(integration_script)
doubleclick_search_keyword.set_downstream(integration_script)
doubleclick_dcm_standard_display_uk.set_downstream(integration_script)
partners_display_performance.set_downstream(integration_script)

time_wait_till_6_00am.set_downstream(partners_display_performance)
time_wait_till_6_00am.set_downstream(doubleclick_search_campaign)
time_wait_till_6_00am.set_downstream(doubleclick_dcm_standard_display_uk)
time_wait_till_6_00am.set_downstream(doubleclick_search_productadvertised)
time_wait_till_6_00am.set_downstream(sensor_to_check_ga_table_preset)
time_wait_till_6_00am.set_downstream(doubleclick_search_keyword)

sensor_to_check_ga_table_preset.set_downstream(unique_visitors)
sensor_to_check_ga_table_preset.set_downstream(gapremiuim_dailyaggregatesfull)
gapremiuim_dailyaggregatesfull.set_downstream(integration_script)
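# Hypothetical sketch of the send_email_failure callback referenced above;
# Airflow invokes on_failure_callback with the task's context dict. The
# address is a placeholder.
from airflow.utils.email import send_email


def send_email_failure(context):
    send_email(
        to='alerts@example.com',  # placeholder address
        subject='Task {} failed for {}'.format(
            context['task_instance'].task_id, context['ds']),
        html_content='See the Airflow UI for logs.',
    )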
    '--fix-flag', '0',
]

t0 = mlengine_operator.MLEngineTrainingOperator(
    # gcp_conn_id='project_connection',
    task_id='sequential_startPoint',
    project_id=PROJECT_ID,
    job_id=job_id0,
    package_uris=[PACKAGE_URI],
    training_python_module='trainer.task',
    training_args=training_args_0,
    region=REGION,
    runtime_version='1.9',
    # scale_tier=ScaleTier,
    dag=dag)

child_dag_names = ['sequential_training_12', 'sequential_training_24']

subdag = SubDagOperator(
    subdag=sub_dag(parent_dag_name, child_dag_names[0], default_args,
                   dag.schedule_interval),
    task_id=child_dag_names[0],
    default_args=default_args,
    dag=dag)

subdag1 = SubDagOperator(
    subdag=sub_dag(parent_dag_name, child_dag_names[1], default_args,
                   dag.schedule_interval),
    task_id=child_dag_names[1],
    default_args=default_args,
    dag=dag)

t0.set_downstream(subdag)
subdag.set_downstream(subdag1)
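# Hypothetical sketch of the sub_dag() factory used above; the returned
# sub-DAG's dag_id must be '<parent_dag_name>.<task_id>' and its
# schedule_interval should match the parent's. The body is an assumption.
def sub_dag(parent_dag_name, child_dag_name, default_args, schedule_interval):
    return DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )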
start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

onboard = SubDagOperator(
    task_id='onboard_dag',
    subdag=subdag(DAG_NAME, 'onboard_dag', args),
    default_args=args,
    dag=dag,
)

ongoing = SubDagOperator(
    task_id='ongoing_dag',
    subdag=subdag(DAG_NAME, 'ongoing_dag', args),
    default_args=args,
    dag=dag,
)

end = DummyOperator(
    task_id='end',
    default_args=args,
    dag=dag,
)

start.set_downstream(onboard)
onboard.set_downstream(ongoing)
ongoing.set_downstream(end)
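# The same linear dependencies can be declared with Airflow's bitshift
# syntax, which reads left to right:
start >> onboard >> ongoing >> end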
    end.set_upstream(ssh_tasks)
    # if no hive scripts are generated, a short-circuit step at the
    # beginning of the main dag skips this sub-DAG
    return ssh_dag


dag = DAG(
    's3_convert_json_to_parquet_emr_ssh',
    default_args=defautlt_args,
    dagrun_timeout=timedelta(hours=1),
    schedule_interval='0 3 * * *'
)

step_entities_partitions = ShortCircuitOperator(
    task_id='step_entities_partitions',
    python_callable=gen_hive_scripts,
    provide_context=True,
    dag=dag)

step_ssh_subdag = SubDagOperator(
    task_id='step_jobs_submit',
    subdag=get_sub_ssh_cmds_dag(dag, 'step_jobs_submit', defautlt_args),
    default_args=defautlt_args,
    dag=dag)

step_end = DummyOperator(
    task_id='ssh_end',
    dag=dag)

step_entities_partitions.set_downstream(step_ssh_subdag)
step_ssh_subdag.set_downstream(step_end)
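# Hypothetical sketch of gen_hive_scripts; ShortCircuitOperator marks every
# downstream task as skipped when its callable returns a falsy value.
# build_hive_scripts is an assumed helper.
def gen_hive_scripts(**context):
    scripts = build_hive_scripts(context['ds'])
    return bool(scripts)  # no scripts -> skip the SSH sub-DAG and end task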
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_info_all'
items_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_info',
                  data_worker.worker.collect_some_daily_info_all),
    # passing CeleryExecutor() lets the sub-DAG's tasks run on Celery workers
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_indicator_info_all'
items_indicator_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

items_task.set_upstream(
    [itemcodes_task, item_naver_sectors_task, item_themes_task])
items_task.set_downstream(items_all_task)
items_all_task.set_downstream(items_indicator_task)
items_indicator_task.set_downstream(items_indicator_all_task)
    task_id='remove_updated_dim_grade',
    python_callable=remove_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

insert_updated_dim_grade = PythonOperator(
    task_id='insert_updated_dim_grade',
    python_callable=insert_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

#########################################################################
########################## DAG flow #####################################

download_files.set_upstream(begin_task)
download_files.set_downstream(end_task)
end_task.set_downstream(initialize_transaction_table_dim_business)
initialize_transaction_table_dim_business.set_downstream(
    prepare_and_load_transaction_table_dim_business)
prepare_and_load_transaction_table_dim_business.set_downstream(
    update_transaction_table_dim_business_boro)
update_transaction_table_dim_business_boro.set_downstream(update_dim_business)
update_dim_business.set_downstream(remove_updated_dim_business)
remove_updated_dim_business.set_downstream(insert_updated_dim_business)
initialize_transaction_table_dim_grade.set_upstream(
    remove_updated_dim_business)
initialize_transaction_table_dim_grade.set_downstream(
    prepare_and_load_transaction_table_dim_grade)
prepare_and_load_transaction_table_dim_business.set_downstream(
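# Hypothetical sketch of one of the callables wired above; each key in
# op_kwargs={'db': db} arrives as a keyword argument on the callable. The
# body is a stub for an assumed cleanup step.
def remove_updated_dim_grade(db):
    # delete grade rows that the next task re-inserts from the
    # transaction table (assumed behaviour)
    ...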
f"{BASE_PACKAGE}.transactional-tables", "OutletsByDate", dag, RETAIL_ID, schema_name, ENV_TYPE, ) items_by_date_task = bash_operator_for_spark_submit( f"{BASE_PACKAGE}.transactional-tables", "ItemsByDate", dag, RETAIL_ID, schema_name, ENV_TYPE, ) push_instruments.set_downstream(push_server_details) branch_task.set_upstream(push_server_details) branch_task.set_downstream(master_tables_load) branch_task.set_downstream(history_load_done) master_tables_load.set_downstream(create_table_structure) history_load_done.set_downstream(create_table_structure) create_table_structure.set_downstream(unix_chmod_task) unix_chmod_task.set_downstream(market_baskets_task) market_baskets_task.set_downstream( [transaction_line_item_task, outlets_by_date_task, items_by_date_task]) data_load_done.set_upstream( [transaction_line_item_task, outlets_by_date_task, items_by_date_task]) create_constraint_task.set_upstream(data_load_done)
    dag=dag,
)

sub = SubDagOperator(
    subdag=subdag,
    task_id="insert_and_id_pop",
    trigger_rule="one_success",
    dag=dag)

clear_latest = BashOperator(
    bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR),
    task_id="clear_latest",
    dag=dag,
)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r"\W+", "", term)
    simple_search = PythonOperator(
        task_id="search_{}_twitter".format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={"query": term},
    )
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
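# Hypothetical sketch of the search_twitter callable; with
# provide_context=True the per-task params dict is available in the context
# passed to the callable. The API call is an assumed placeholder.
def search_twitter(**context):
    query = context["params"]["query"]
    # fetch tweets for `query` and write them under RAW_TWEET_DIR (assumed)
    ...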
    dag=dag,
)

task_concate_train_files.set_downstream(task_feature_engineering_for_train)

input_parquet_raw_file_unit = DataInputFileUnit(
    project_path + 'datasets/temp/X_train_final.parquet',
    pandas_read_function_name='read_parquet')

task_model_learning = DataOperator(
    operation_function=fit_write_model,
    params={'columns_selection': features_selection,
            'column_target': feature_target,
            'write_path': project_path + 'models/ensemble.model'},
    input_unit=input_parquet_raw_file_unit,
    dag=dag,
    task_id='Model_learning')

task_feature_engineering_for_train.set_downstream(task_model_learning)

input_csv_files_unit = DataInputMultiFileUnit(
    [project_path + 'datasets/input/X_tree_egc_t2.csv',
     project_path + 'datasets/input/X_geoloc_egc_t2.csv',
     project_path + 'datasets/input/Y_tree_egc_t2.csv'],
    sep=';')

output_parquet_unit = DataOutputFileUnit(
    project_path + 'datasets/temp/X_test_raw.parquet',
    pandas_write_function_name='to_parquet')

task_concate_test_files = DataOperator(
    operation_function=join_dataframes,
    input_unit=input_csv_files_unit,
    output_unit=output_parquet_unit,
    dag=dag,
    task_id='Join_test_data_source_files')

task_feature_engineering_for_test = SubDagOperator(
    subdag=feature_engineering_sub_dag(
        dag.dag_id,
        'Feature_engineering_for_test',
        model_path=project_path + 'models/',
        input_file=project_path + 'datasets/temp/X_test_raw.parquet',
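# Hypothetical sketch of fit_write_model, assuming the framework calls
# operation_function(dataframe, **params) with the dataframe loaded by the
# input unit; the model class is an assumption.
import joblib
from sklearn.ensemble import RandomForestRegressor


def fit_write_model(df, columns_selection, column_target, write_path):
    model = RandomForestRegressor()
    model.fit(df[columns_selection], df[column_target])
    joblib.dump(model, write_path)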