Example no. 1
    dag=cell_image_analysis_2channels_dag,
)


def prepare_cellprofiler_csv(ds, **kwargs):
    """Prepare the cellprofiler csv based on the args"""
    df = get_cell_images_df(**kwargs)
    kwargs['ti'].xcom_push(key='cell_images_df', value=df)
    return


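# Wrap the CSV-preparation callable in a PythonOperator; provide_context=True passes the
# task instance in kwargs so the resulting DataFrame can be pushed to XCom.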
prepare_cellprofiler_csv_op = PythonOperator(
    task_id='prepare_cellprofiler_csv',
    provide_context=True,
    python_callable=prepare_cellprofiler_csv,
    dag=cell_image_analysis_2channels_dag)

prepare_cellprofiler_csv_op.set_downstream(image_conversion_dag)
cellprofiler_tasks = cell_image_analysis_generate_cellprofiler_task(
    cell_image_analysis_2channels_dag)
cellprofiler_branch_tasks = cell_image_analysis_generate_decide_run_cellprofiler(
    cell_image_analysis_2channels_dag)
image_conversion_dag.set_downstream(cellprofiler_branch_tasks)

cell_image_analysis_no_images_to_run_op.set_upstream(cellprofiler_branch_tasks)

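# Pair each branch task with its CellProfiler task by position in the two lists.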
for idx, cellprofiler_branch_task in enumerate(cellprofiler_branch_tasks):
    cellprofiler_branch_task.set_downstream(cellprofiler_tasks[idx])

cell_image_analysis_combine_cellprofiler_csvs.set_upstream(cellprofiler_tasks)
Example no. 2
section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)

end = DummyOperator(
    task_id='end',
    default_args=args,
    dag=dag,
)

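# Linear flow: start -> section-1 -> some-other-task -> section-2 -> end.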
start.set_downstream(section_1)
section_1.set_downstream(some_other_task)
some_other_task.set_downstream(section_2)
section_2.set_downstream(end)
Example no. 3
dwh_script = BashOperator(
    task_id='dwh_script',
    bash_command=conf['bash_path']['load_to_dwh_location'] + ' ',
    on_failure_callback=send_email_failure,
    dag=dag)

send_email = EmailOperator(
    to='*****@*****.**',
    task_id='send_email_after_all_success',
    subject='DAG run for {{ ds }} was successful.',
    html_content='All the tasks for {{ yesterday_ds }} were successful. For more details visit '
                 'http://172.20.23.251:8080/admin/airflow/tree?num_runs=25&dag_id=Jobs',
    dag=dag)

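# Fan-in: every extract task feeds integration_script, while time_wait_till_6_00am gates
# most extracts and the GA table sensor.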
brightedge_keyword_rank_data.set_downstream(integration_script)
brightedge_share_of_voice.set_downstream(integration_script)
doubleclick_search_campaign.set_downstream(integration_script)
doubleclick_search_productadvertised.set_downstream(integration_script)
doubleclick_search_keyword.set_downstream(integration_script)
doubleclick_dcm_standard_display_uk.set_downstream(integration_script)
partners_display_performance.set_downstream(integration_script)
time_wait_till_6_00am.set_downstream(partners_display_performance)
time_wait_till_6_00am.set_downstream(doubleclick_search_campaign)
time_wait_till_6_00am.set_downstream(doubleclick_dcm_standard_display_uk)
time_wait_till_6_00am.set_downstream(doubleclick_search_productadvertised)
time_wait_till_6_00am.set_downstream(sensor_to_check_ga_table_preset)
time_wait_till_6_00am.set_downstream(doubleclick_search_keyword)
sensor_to_check_ga_table_preset.set_downstream(unique_visitors)
sensor_to_check_ga_table_preset.set_downstream(gapremiuim_dailyaggregatesfull)
gapremiuim_dailyaggregatesfull.set_downstream(integration_script)
Example no. 4
    '--fix-flag',
    '0',
]

t0 = mlengine_operator.MLEngineTrainingOperator(
    # gcp_conn_id='project_connection',
    task_id='sequential_startPoint',
    project_id=PROJECT_ID,
    job_id=job_id0,
    package_uris=[PACKAGE_URI],
    training_python_module='trainer.task',
    training_args=training_args_0,
    region=REGION,
    runtime_version='1.9',
    # scale_tier=ScaleTier,
    dag=dag)
child_dag_names = ['sequential_training_12', 'sequential_training_24']
subdag = SubDagOperator(subdag=sub_dag(parent_dag_name, child_dag_names[0],
                                       default_args, dag.schedule_interval),
                        task_id=child_dag_names[0],
                        default_args=default_args,
                        dag=dag)
subdag1 = SubDagOperator(subdag=sub_dag(parent_dag_name, child_dag_names[1],
                                        default_args, dag.schedule_interval),
                         task_id=child_dag_names[1],
                         default_args=default_args,
                         dag=dag)

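# Run the initial training job first, then the two training sub-DAGs one after the other.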
t0.set_downstream(subdag)
subdag.set_downstream(subdag1)
Example no. 6
start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

onboard = SubDagOperator(
    task_id='onboard_dag',
    subdag=subdag(DAG_NAME, 'onboard_dag', args),
    default_args=args,
    dag=dag,
)

ongoing = SubDagOperator(
    task_id='ongoing_dag',
    subdag=subdag(DAG_NAME, 'ongoing_dag', args),
    default_args=args,
    dag=dag,
)

end = DummyOperator(
    task_id='end',
    default_args=args,
    dag=dag,
)

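# start -> onboard_dag -> ongoing_dag -> end.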
start.set_downstream(onboard)
onboard.set_downstream(ongoing)
ongoing.set_downstream(end)
Example no. 7
    end.set_upstream(ssh_tasks)
    # if no hive scripts were generated, the short-circuit step at the beginning of the main dag skips downstream tasks
    return ssh_dag


dag = DAG(
    's3_convert_json_to_parquet_emr_ssh',
    default_args=defautlt_args,
    dagrun_timeout=timedelta(hours=1),
    schedule_interval='0 3 * * *'
)

step_entities_partitions = ShortCircuitOperator(
    task_id='step_entities_partitions',
    python_callable=gen_hive_scripts,
    provide_context=True,
    dag=dag)

step_ssh_subdag = SubDagOperator(
    task_id='step_jobs_submit',
    subdag=get_sub_ssh_cmds_dag(dag, 'step_jobs_submit', defautlt_args),
    default_args=defautlt_args,
    dag=dag)

step_end = DummyOperator(
    task_id='ssh_end',
    dag=dag)

step_entities_partitions.set_downstream(step_ssh_subdag)
step_ssh_subdag.set_downstream(step_end)
Example no. 8
File: dag.py Project: gsroot/storm
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_info_all'
items_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

task_id = 'collect_daily_items_indicator_info_all'
items_indicator_all_task = SubDagOperator(
    task_id=task_id,
    subdag=subdag(task_id, 'daily_items_indicator_info',
                  data_worker.worker.collect_some_daily_info_all),
    executor=CeleryExecutor(),
    dag=dag,
)

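# items_task waits for the code, sector and theme collectors; the remaining collection
# sub-DAGs then run in sequence.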
items_task.set_upstream(
    [itemcodes_task, item_naver_sectors_task, item_themes_task])
items_task.set_downstream(items_all_task)
items_all_task.set_downstream(items_indicator_task)
items_indicator_task.set_downstream(items_indicator_all_task)
Example no. 9
    task_id='remove_updated_dim_grade',
    python_callable=remove_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

insert_updated_dim_grade = PythonOperator(
    task_id='insert_updated_dim_grade',
    python_callable=insert_updated_dim_grade,
    op_kwargs={'db': db},
    dag=dag)

#########################################################################
########################## DAG flow #####################################

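# download_files runs between the begin/end markers; the dim_business chain follows, and
# the dim_grade chain starts once the updated dim_business rows have been removed.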
download_files.set_upstream(begin_task)
download_files.set_downstream(end_task)
end_task.set_downstream(initialize_transaction_table_dim_business)

initialize_transaction_table_dim_business.set_downstream(
    prepare_and_load_transaction_table_dim_business)
prepare_and_load_transaction_table_dim_business.set_downstream(
    update_transaction_table_dim_business_boro)
update_transaction_table_dim_business_boro.set_downstream(update_dim_business)
update_dim_business.set_downstream(remove_updated_dim_business)
remove_updated_dim_business.set_downstream(insert_updated_dim_business)

initialize_transaction_table_dim_grade.set_upstream(
    remove_updated_dim_business)
initialize_transaction_table_dim_grade.set_downstream(
    prepare_and_load_transaction_table_dim_grade)
prepare_and_load_transaction_table_dim_business.set_downstream(
Example no. 10
    f"{BASE_PACKAGE}.transactional-tables",
    "OutletsByDate",
    dag,
    RETAIL_ID,
    schema_name,
    ENV_TYPE,
)
items_by_date_task = bash_operator_for_spark_submit(
    f"{BASE_PACKAGE}.transactional-tables",
    "ItemsByDate",
    dag,
    RETAIL_ID,
    schema_name,
    ENV_TYPE,
)

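# branch_task chooses between a full master-tables load and skipping straight to
# history_load_done; both paths converge on create_table_structure before the
# per-date transactional loads.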
push_instruments.set_downstream(push_server_details)
branch_task.set_upstream(push_server_details)
branch_task.set_downstream(master_tables_load)

branch_task.set_downstream(history_load_done)
master_tables_load.set_downstream(create_table_structure)
history_load_done.set_downstream(create_table_structure)
create_table_structure.set_downstream(unix_chmod_task)
unix_chmod_task.set_downstream(market_baskets_task)
market_baskets_task.set_downstream(
    [transaction_line_item_task, outlets_by_date_task, items_by_date_task])
data_load_done.set_upstream(
    [transaction_line_item_task, outlets_by_date_task, items_by_date_task])
create_constraint_task.set_upstream(data_load_done)
Example no. 11
    dag=dag,
)

sub = SubDagOperator(subdag=subdag,
                     task_id="insert_and_id_pop",
                     trigger_rule="one_success",
                     dag=dag)

clear_latest = BashOperator(
    bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR),
    task_id="clear_latest",
    dag=dag,
)

gen_search_terms.set_upstream(fill_search_terms)

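# Create one Twitter search task per term; each depends on gen_search_terms and feeds
# the insert_and_id_pop sub-DAG.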
for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r"\W+", "", term)
    simple_search = PythonOperator(
        task_id="search_{}_twitter".format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={"query": term},
    )
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
Example no. 12
    dag=dag,
)

task_concate_train_files.set_downstream(task_feature_engineering_for_train)

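# Fit the model on the engineered training parquet and persist it under models/ensemble.model.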
input_parquet_raw_file_unit = DataInputFileUnit(project_path + 'datasets/temp/X_train_final.parquet',
                                                pandas_read_function_name='read_parquet')
task_model_learning = DataOperator(operation_function=fit_write_model,
                                   params={'columns_selection': features_selection,
                                           'column_target': feature_target,
                                           'write_path': project_path + 'models/ensemble.model'
                                           },
                                   input_unit=input_parquet_raw_file_unit,
                                   dag=dag, task_id='Model_learning')

task_feature_engineering_for_train.set_downstream(task_model_learning)

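# Repeat the join and feature-engineering steps on the test source files.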
input_csv_files_unit = DataInputMultiFileUnit([project_path + 'datasets/input/X_tree_egc_t2.csv',
                                               project_path + 'datasets/input/X_geoloc_egc_t2.csv',
                                               project_path + 'datasets/input/Y_tree_egc_t2.csv'], sep=';')
output_parquet_unit = DataOutputFileUnit(project_path + 'datasets/temp/X_test_raw.parquet',
                                         pandas_write_function_name='to_parquet')
task_concate_test_files = DataOperator(operation_function=join_dataframes,
                                       input_unit=input_csv_files_unit,
                                       output_unit=output_parquet_unit,
                                       dag=dag, task_id='Join_test_data_source_files')

task_feature_engineering_for_test = SubDagOperator(
    subdag=feature_engineering_sub_dag(dag.dag_id, 'Feature_engineering_for_test',
                                       model_path=project_path + 'models/',
                                       input_file=project_path + 'datasets/temp/X_test_raw.parquet',