def get_predict(dag, cfg_name, use_all_nodes=True, force_exec=False):
    """Build the 'predict' task for the given DAG and config.

    use_all_nodes: True, False. When True, nodes_of_interest is set to an
    empty list and the predict function will run on all nodes. When False,
    it gets the list from the ini file.
    """
    task_id = 'predict'

    if use_all_nodes:
        nodes_of_interest = []
    else:
        nodes_of_interest = parser.getnodelist(
            cfg_name, get_nodes_of_interest(cfg_name))

    # Keyword arguments forwarded to the predict callable; the second tuple
    # element controls whether the value participates in the task hash.
    callable_kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
            (parser.gettimedelta(cfg_name, 'test_interval_width'), HASH_IT),
        'svm_training_technique':
            (parser.get(cfg_name, 'svm_training_technique'), HASH_IT),
        'nodes_of_interest': (nodes_of_interest, HASH_IT),
        'reference_nodes':
            (parser.getnodelist(cfg_name,
                                parser.get(cfg_name, 'reference_nodes')),
             HASH_IT),
        'reference_victim_node':
            (parser.get(cfg_name, 'reference_victim_node'), HASH_IT),
        # Airflow variable names are resolved at runtime, so they are
        # excluded from the hash.
        'airflow_vars': ({
            'training_intervals_count': cfg_name + 'training_intervals_count'
        }, NO_HASH),
    }

    # Inputs: the inference dataset plus the upstream embeddings and model.
    task_inputs = dict(get_inference_dataset_output_files(cfg_name))
    task_inputs['node_embeddings'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='create_graph_model_node_embeddings',
        origin_resource_id='node_embeddings')
    task_inputs['trained_model'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='train_graph_model',
        origin_resource_id='trained_model')

    task_outputs = {
        'prediction_df': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename='df_prediction.h5'),
        'df_metrics': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename='df_metrics.h5'),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=predict,
        ppo_kwargs=callable_kwargs,
        input_files=task_inputs,
        output_files=task_outputs,
        dag=dag,
        cfg_name=cfg_name)
def get_create_interval_metrics_tabular_xgboost(dag, cfg_name,
                                                force_exec=False):
    """Build the interval-metrics task for the tabular XGBoost pipeline.

    Exact same than get_create_interval_metrics, except the input files
    comes from another task... handle file in-out from higher level?
    NOTE(review): this variant reads 'interval_width' while
    get_create_interval_metrics reads 'test_interval_width' — confirm the
    difference is intentional.
    """
    task_id = 'create_interval_metrics_tabular_xgboost'

    callable_kwargs = {
        'interval_width': (parser.getint(cfg_name, 'interval_width'),
                           HASH_IT),
        'title': (cfg_name, HASH_IT),
    }

    # Predictions produced by the XGBoost fit/predict task.
    task_inputs = {
        'prediction_df': ResourcePathById(
            cfg_name=cfg_name,
            origin_task_id='fit_predict_xgboost',
            origin_resource_id='df_interval_predictions'),
    }

    task_outputs = {
        'grid_png': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"grid_{cfg_name}.png"),
        'metrics_summary_file': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=create_interval_metrics,
        ppo_kwargs=callable_kwargs,
        input_files=task_inputs,
        output_files=task_outputs,
        dag=dag,
        cfg_name=cfg_name)
def get_create_interval_metrics(dag, cfg_name, force_exec=False):
    """Build the 'create_interval_metrics' task: turns the predictions of
    the 'predict' task into a metrics grid image and a text summary."""
    task_id = 'create_interval_metrics'

    callable_kwargs = {
        'interval_width': (parser.getint(cfg_name, 'test_interval_width'),
                           HASH_IT),
        'title': (cfg_name, HASH_IT),
    }

    task_inputs = {
        'prediction_df': ResourcePathById(
            cfg_name=cfg_name,
            origin_task_id='predict',
            origin_resource_id='prediction_df'),
    }

    task_outputs = {
        'grid_png': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"grid_{cfg_name}.png"),
        'metrics_summary_file': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=create_interval_metrics,
        ppo_kwargs=callable_kwargs,
        input_files=task_inputs,
        output_files=task_outputs,
        dag=dag,
        cfg_name=cfg_name)
def test_resource_task_indexed_path(setup_output_path):
    """End-to-end check of dynamic resource paths: one task writes a file
    through a ResourcePathDynamic output, a second task reads the same file
    back through a ResourcePathById reference to the producer's resource id.

    Fixes: removed the unreachable `return 'failed'` statements that
    followed the `return 'succeeded'` inside each `with` block, and the
    commented-out duplicate callable signatures.
    """
    dag = DAG('test_dyn',
              default_args=default_args,
              schedule_interval=timedelta(days=1))

    def callable1_create_file(log, in_files, out_files, **op_kwargs):
        # Writes the sentinel content that callable2 asserts on.
        with open(out_files['test_file_dyn_location'].path, 'w') as file:
            file.write("testing dynamic paths")
            return 'succeeded'

    def callable2_read_file(log, in_files, out_files, **op_kwargs):
        with open(in_files['test_file_dyn_location'].path, 'r') as file:
            assert file.readline(
            ) == 'testing dynamic paths', 'Invalid file content!'
            return 'succeeded'

    create_file = PythonPersistentOperator(
        task_id='create_file',
        force_execution=True,
        python_callable=callable1_create_file,
        output_files={
            # Path pieces: 'var' entries resolve via Airflow variables,
            # 'const' entries are literal path segments.
            'test_file_dyn_location':
            ResourcePathDynamic(path=[('var', cfg_name + 'out_dir'),
                                      ('var', cfg_name + 'create_file_hash'),
                                      ('const', 'training'),
                                      ('const', 'test_data.txt')])
        },
        dag=dag,
        cfg_name=cfg_name)

    read_file = PythonPersistentOperator(
        task_id='read_file',
        force_execution=True,
        python_callable=callable2_read_file,
        input_files={
            'test_file_dyn_location':
            ResourcePathById(cfg_name=cfg_name,
                             origin_task_id='create_file',
                             origin_resource_id='test_file_dyn_location')
        },
        dag=dag,
        cfg_name=cfg_name)

    ti1 = TaskInstance(task=create_file, execution_date=datetime.now())
    ti2 = TaskInstance(task=read_file, execution_date=datetime.now())
    result1 = create_file.execute(ti1.get_template_context())
    result2 = read_file.execute(ti2.get_template_context())
    assert result1 == 'succeeded'
    assert result2 == 'succeeded'
def get_node_analysis(dag, cfg_name, force_exec=False):
    """Build the 'node_analysis' task: runs nodes_analysis over the metrics
    produced by the 'predict' task and emits a summary file plus detailed
    and ROC classifier dataframes.

    Fixes: 'reference_nodes' previously used
    parser.get(cfg_name, parser.get(cfg_name, 'reference_nodes')) — the
    inner lookup returns the node-list string itself, not an option name,
    so the outer parser.get was wrong. Now parses the list with
    parser.getnodelist, consistent with get_predict.
    """
    task_id = 'node_analysis'
    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=nodes_analysis,
        ppo_kwargs={
            'experiment_name': (cfg_name, HASH_IT),
            'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
            'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
            'nodes_of_interest':
                (parser.getnodelist(cfg_name,
                                    get_nodes_of_interest(cfg_name)),
                 HASH_IT),
            # Parse the configured reference node list, same as get_predict.
            'reference_nodes':
                (parser.getnodelist(cfg_name,
                                    parser.get(cfg_name, 'reference_nodes')),
                 HASH_IT),
            'reference_victim_node':
                (parser.get(cfg_name, 'reference_victim_node'), HASH_IT),
        },
        input_files={
            'df_metrics': ResourcePathById(cfg_name=cfg_name,
                                           origin_task_id='predict',
                                           origin_resource_id='df_metrics')
        },
        output_files={
            'metrics_summary_file': ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
            'df_detailed_classifier_data': ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename='df_detailed_classifier_data.h5'),
            'df_roc_classifier_data': ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename='df_roc_classifier_data.h5')
        },
        dag=dag,
        cfg_name=cfg_name)
def get_create_graph_model_node_embeddings(dag, cfg_name,
                                           use_all_nodes=True,
                                           force_exec=False):
    """Build the 'create_graph_model_node_embeddings' task
    (infer_graph_model).

    use_all_nodes: True, False. When True, nodes_of_interest is set to an
    empty list and the infer_graph_model function will run on all nodes.
    When False, it gets the list from the ini file.

    NOTE(review): the 'node_embeddings' output filename comes from
    get_trained_model_filename — possibly copied from the training task;
    confirm it is intended.
    """
    task_id = 'create_graph_model_node_embeddings'

    if use_all_nodes:
        nodes_of_interest = []
    else:
        nodes_of_interest = parser.getnodelist(
            cfg_name, get_nodes_of_interest(cfg_name))

    callable_kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
            (parser.gettimedelta(cfg_name, 'test_interval_width'), HASH_IT),
        # Runtime-only settings are excluded from the task hash.
        'predicator_name':
            (parser.get(cfg_name, 'model_trainer_type'), NO_HASH),
        'hidden_dim': (parser.getint(cfg_name, 'hidden_dim'), HASH_IT),
        'nodes_of_interest': (nodes_of_interest, HASH_IT),
        'tensorboard_writer': (get_writer(cfg_name), NO_HASH),
    }

    # Inference dataset files plus the trained model to embed with.
    task_inputs = dict(get_inference_dataset_output_files(cfg_name))
    task_inputs['trained_model'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='train_graph_model',
        origin_resource_id='trained_model')

    task_outputs = {
        'node_embeddings': ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=get_trained_model_filename(cfg_name)),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=infer_graph_model,
        ppo_kwargs=callable_kwargs,
        input_files=task_inputs,
        output_files=task_outputs,
        dag=dag,
        cfg_name=cfg_name)