Example #1
0
def get_predict(dag, cfg_name, use_all_nodes=True, force_exec=False):
    """Build the 'predict' task for the given configuration.

    use_all_nodes: when True, nodes_of_interest is left empty and the
    predict callable runs on all nodes; when False, the node list is
    read from the ini file.
    """
    task_id = 'predict'
    if use_all_nodes:
        nodes_of_interest = []
    else:
        nodes_of_interest = parser.getnodelist(
            cfg_name, get_nodes_of_interest(cfg_name))

    kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
        (parser.gettimedelta(cfg_name, 'test_interval_width'), HASH_IT),
        'svm_training_technique':
        (parser.get(cfg_name, 'svm_training_technique'), HASH_IT),
        'nodes_of_interest': (nodes_of_interest, HASH_IT),
        'reference_nodes':
        (parser.getnodelist(cfg_name,
                            parser.get(cfg_name, 'reference_nodes')), HASH_IT),
        'reference_victim_node':
        (parser.get(cfg_name, 'reference_victim_node'), HASH_IT),
        # Airflow variable name is namespaced by the config name.
        'airflow_vars': ({
            'training_intervals_count': cfg_name + 'training_intervals_count'
        }, NO_HASH),
    }

    # Inputs: the inference dataset files plus the embeddings and the
    # trained model produced by upstream tasks.
    inputs = dict(get_inference_dataset_output_files(cfg_name))
    inputs['node_embeddings'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='create_graph_model_node_embeddings',
        origin_resource_id='node_embeddings')
    inputs['trained_model'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='train_graph_model',
        origin_resource_id='trained_model')

    outputs = {
        'prediction_df':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_prediction.h5'),
        'df_metrics':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_metrics.h5'),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=predict,
                                    ppo_kwargs=kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Example #2
0
def get_create_interval_metrics_tabular_xgboost(dag,
                                                cfg_name,
                                                force_exec=False):
    """Build the interval-metrics task for the tabular XGBoost pipeline.

    Same logic as get_create_interval_metrics, except the prediction
    dataframe comes from the 'fit_predict_xgboost' task instead of
    'predict' — handling file in/out from a higher level could merge
    the two builders.
    """
    task_id = 'create_interval_metrics_tabular_xgboost'

    # NOTE(review): reads 'interval_width' while get_create_interval_metrics
    # reads 'test_interval_width' — confirm the difference is intentional.
    kwargs = {
        'interval_width': (parser.getint(cfg_name, 'interval_width'), HASH_IT),
        'title': (cfg_name, HASH_IT),
    }

    inputs = {
        'prediction_df':
        ResourcePathById(cfg_name=cfg_name,
                         origin_task_id='fit_predict_xgboost',
                         origin_resource_id='df_interval_predictions'),
    }

    outputs = {
        'grid_png':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename=f"grid_{cfg_name}.png"),
        'metrics_summary_file':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=create_interval_metrics,
                                    ppo_kwargs=kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Example #3
0
def get_create_interval_metrics(dag, cfg_name, force_exec=False):
    """Build the 'create_interval_metrics' task, which summarizes the
    prediction dataframe produced by the 'predict' task into a metrics
    grid image and a text summary file.
    """
    task_id = 'create_interval_metrics'

    kwargs = {
        'interval_width': (parser.getint(cfg_name,
                                         'test_interval_width'), HASH_IT),
        'title': (cfg_name, HASH_IT),
    }

    inputs = {
        'prediction_df':
        ResourcePathById(cfg_name=cfg_name,
                         origin_task_id='predict',
                         origin_resource_id='prediction_df'),
    }

    outputs = {
        'grid_png':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename=f"grid_{cfg_name}.png"),
        'metrics_summary_file':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=create_interval_metrics,
                                    ppo_kwargs=kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Example #4
0
def test_resource_task_indexed_path(setup_output_path):
    """End-to-end check of dynamic resource paths.

    A first operator writes a marker file at a path built from Airflow
    variables and constants (ResourcePathDynamic); a second operator
    resolves the same file by (origin task id, resource id) via
    ResourcePathById and verifies its content. Both operators must
    report 'succeeded'.
    """
    dag = DAG('test_dyn',
              default_args=default_args,
              schedule_interval=timedelta(days=1))

    def callable1_create_file(log, in_files, out_files, **op_kwargs):
        # Write the marker content to the dynamically resolved output path.
        # Any I/O failure propagates as an exception, so there is no
        # 'failed' return path (the old trailing return was unreachable).
        with open(out_files['test_file_dyn_location'].path, 'w') as file:
            file.write("testing dynamic paths")
            return 'succeeded'

    def callable2_read_file(log, in_files, out_files, **op_kwargs):
        # Read the file back through the input resource resolved by id.
        with open(in_files['test_file_dyn_location'].path, 'r') as file:
            assert file.readline(
            ) == 'testing dynamic paths', 'Invalid file content!'
            return 'succeeded'

    create_file = PythonPersistentOperator(
        task_id='create_file',
        force_execution=True,
        python_callable=callable1_create_file,
        output_files={
            # Path components: two Airflow variables (output dir and the
            # producing task's hash) followed by two constant segments.
            'test_file_dyn_location':
            ResourcePathDynamic(
                path=[('var', cfg_name +
                       'out_dir'), ('var', cfg_name +
                                    'create_file_hash'), (
                                        'const',
                                        'training'), ('const',
                                                      'test_data.txt')])
        },
        dag=dag,
        cfg_name=cfg_name)

    read_file = PythonPersistentOperator(
        task_id='read_file',
        force_execution=True,
        python_callable=callable2_read_file,
        input_files={
            'test_file_dyn_location':
            ResourcePathById(cfg_name=cfg_name,
                             origin_task_id='create_file',
                             origin_resource_id='test_file_dyn_location')
        },
        dag=dag,
        cfg_name=cfg_name)

    ti1 = TaskInstance(task=create_file, execution_date=datetime.now())
    ti2 = TaskInstance(task=read_file, execution_date=datetime.now())

    result1 = create_file.execute(ti1.get_template_context())
    result2 = read_file.execute(ti2.get_template_context())

    assert result1 == 'succeeded'
    assert result2 == 'succeeded'
Example #5
0
def get_node_analysis(dag, cfg_name, force_exec=False):
    """Build the 'node_analysis' task, which runs nodes_analysis on the
    per-node metrics dataframe produced by the 'predict' task and emits
    a text summary plus detailed/ROC classifier dataframes.
    """
    task_id = 'node_analysis'

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=nodes_analysis,
        ppo_kwargs={
            'experiment_name': (cfg_name, HASH_IT),
            'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
            'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
            'nodes_of_interest':
            (parser.getnodelist(cfg_name,
                                get_nodes_of_interest(cfg_name)), HASH_IT),
            # BUGFIX: was parser.get(cfg_name, parser.get(cfg_name,
            # 'reference_nodes')), which re-used the config value as a
            # second lookup key instead of expanding it into a node list;
            # the sibling builder get_predict uses parser.getnodelist here.
            'reference_nodes':
            (parser.getnodelist(cfg_name,
                                parser.get(cfg_name,
                                           'reference_nodes')), HASH_IT),
            'reference_victim_node':
            (parser.get(cfg_name, 'reference_victim_node'), HASH_IT),
        },
        input_files={
            'df_metrics':
            ResourcePathById(cfg_name=cfg_name,
                             origin_task_id='predict',
                             origin_resource_id='df_metrics')
        },
        output_files={
            'metrics_summary_file':
            ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
            'df_detailed_classifier_data':
            ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename='df_detailed_classifier_data.h5'),
            'df_roc_classifier_data':
            ResourcePathOutput(cfg_name=cfg_name,
                               task_id=task_id,
                               resource_filename='df_roc_classifier_data.h5')
        },
        dag=dag,
        cfg_name=cfg_name)
Example #6
0
def get_create_graph_model_node_embeddings(dag,
                                           cfg_name,
                                           use_all_nodes=True,
                                           force_exec=False):
    """Build the node-embedding inference task (infer_graph_model).

    use_all_nodes: when True, nodes_of_interest stays empty and
    inference covers every node; when False, the node list is read
    from the ini file.
    """
    task_id = 'create_graph_model_node_embeddings'
    if use_all_nodes:
        nodes_of_interest = []
    else:
        nodes_of_interest = parser.getnodelist(
            cfg_name, get_nodes_of_interest(cfg_name))

    kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
        (parser.gettimedelta(cfg_name, 'test_interval_width'), HASH_IT),
        'predicator_name': (parser.get(cfg_name,
                                       'model_trainer_type'), NO_HASH),
        'hidden_dim': (parser.getint(cfg_name, 'hidden_dim'), HASH_IT),
        'nodes_of_interest': (nodes_of_interest, HASH_IT),
        'tensorboard_writer': (get_writer(cfg_name), NO_HASH),
    }

    # Inputs: the inference dataset files plus the trained graph model.
    inputs = dict(get_inference_dataset_output_files(cfg_name))
    inputs['trained_model'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='train_graph_model',
        origin_resource_id='trained_model')

    # NOTE(review): the embeddings output reuses get_trained_model_filename
    # for its filename — confirm this overlap with the trained-model file
    # name is intentional.
    outputs = {
        'node_embeddings':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=get_trained_model_filename(cfg_name)),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=infer_graph_model,
                                    ppo_kwargs=kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)