Esempio n. 1
0
def get_create_interval_metrics_tabular_xgboost(dag,
                                                cfg_name,
                                                force_exec=False):
    """
        Build the 'create_interval_metrics_tabular_xgboost' operator.

        Same callable as get_create_interval_metrics (create_interval_metrics),
        except that the 'prediction_df' input is the 'df_interval_predictions'
        resource produced by the 'fit_predict_xgboost' task and the width is
        read from the 'interval_width' option.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also passed as 'title',
        used in the output file names and forwarded to the operator.
        force_exec: forwarded as force_execution.
    """
    # TODO: handle file in-out from a higher level so this does not have to
    # duplicate get_create_interval_metrics?
    task_id = 'create_interval_metrics_tabular_xgboost'

    hashed_kwargs = {
        'interval_width':
        (parser.getint(cfg_name, 'interval_width'), HASH_IT),
        'title': (cfg_name, HASH_IT),
    }

    inputs = {
        'prediction_df':
        ResourcePathById(cfg_name=cfg_name,
                         origin_task_id='fit_predict_xgboost',
                         origin_resource_id='df_interval_predictions'),
    }

    outputs = {
        'grid_png':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename=f"grid_{cfg_name}.png"),
        'metrics_summary_file':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=create_interval_metrics,
                                    ppo_kwargs=hashed_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Esempio n. 2
0
def get_create_interval_metrics(dag, cfg_name, force_exec=False):
    """
        Build the 'create_interval_metrics' operator.

        Wraps create_interval_metrics in a PythonPersistentOperator whose
        'prediction_df' input is the 'prediction_df' resource of the 'predict'
        task; the width is read from the 'test_interval_width' option.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also passed as 'title',
        used in the output file names and forwarded to the operator.
        force_exec: forwarded as force_execution.
    """
    task_id = 'create_interval_metrics'

    hashed_kwargs = {
        'interval_width':
        (parser.getint(cfg_name, 'test_interval_width'), HASH_IT),
        'title': (cfg_name, HASH_IT),
    }

    inputs = {
        'prediction_df':
        ResourcePathById(cfg_name=cfg_name,
                         origin_task_id='predict',
                         origin_resource_id='prediction_df'),
    }

    outputs = {
        'grid_png':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename=f"grid_{cfg_name}.png"),
        'metrics_summary_file':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f"metrics_summary_file_{cfg_name}.txt"),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=create_interval_metrics,
                                    ppo_kwargs=hashed_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Esempio n. 3
0
def get_fit_predict_random_forest_also(dag, cfg_name, force_exec=False):
    """
        Build the 'fit_predict_random_forest_ALSO' operator.

        Wraps fit_predict_random_forest_also in a PythonPersistentOperator.
        Every keyword argument is read from the cfg_name section of the parser
        and tagged with HASH_IT; the two input files are static paths taken
        from the 'raw_file' and 'features_file' options.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also used in the
        metrics file name and forwarded to the operator.
        force_exec: forwarded as force_execution.
    """
    task_id = 'fit_predict_random_forest_ALSO'

    def hashed_int(option):
        # (integer option value, HASH_IT) pair for ppo_kwargs.
        return parser.getint(cfg_name, option), HASH_IT

    def hashed_float(option):
        # (float option value, HASH_IT) pair for ppo_kwargs.
        return parser.getfloat(cfg_name, option), HASH_IT

    def output(filename):
        # Output resource of this task stored under the given file name.
        return ResourcePathOutput(cfg_name=cfg_name,
                                  task_id=task_id,
                                  resource_filename=filename)

    # One variable name per line in the ini option.
    cont_variables = parser.get(cfg_name, 'cont_variables').splitlines()

    hashed_kwargs = {
        'train_start': hashed_int('train_start'),
        'train_end': hashed_int('train_end'),
        'test_start': hashed_int('test_start'),
        'test_end': hashed_int('test_end'),
        'interval_width': hashed_int('interval_width'),
        'cont_variables': (cont_variables, HASH_IT),
        'mean_scaling_threshold': hashed_float('mean_scaling_threshold'),
        'random_forest_max_depth': hashed_int('random_forest_max_depth'),
        'random_forest_random_state':
        hashed_int('random_forest_random_state'),
        'random_forest_n_estimators':
        hashed_int('random_forest_n_estimators'),
        'folds': hashed_int('folds'),
        'samples_training_ratio': hashed_float('samples_training_ratio'),
        'model_type': (parser.get(cfg_name, 'model_type'), HASH_IT),
    }

    inputs = {
        'raw_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
        'features_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'features_file')),
    }

    outputs = {
        'df_also_predicted_scores': output('df_also_predicted_scores.h5'),
        'df_all_predictions': output('df_all_predictions.h5'),
        'df_interval_predictions': output('df_interval_predictions.h5'),
        'row_based_metrics':
        output(f'row_based_predictions_metrics_{cfg_name}.txt'),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=fit_predict_random_forest_also,
        ppo_kwargs=hashed_kwargs,
        input_files=inputs,
        output_files=outputs,
        dag=dag,
        cfg_name=cfg_name)
Esempio n. 4
0
def get_fit_predict_local_outlier_factor(dag,
                                         cfg_name,
                                         force_exec=False,
                                         use_smote=False):
    """
        Build the 'fit_predict_local_outlier_factor' operator.

        Wraps fit_predict_local_outlier_factor in a PythonPersistentOperator.
        Every keyword argument is tagged with HASH_IT; the two input files are
        static paths taken from the 'raw_file' and 'features_file' options.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also used in the
        metrics file name and forwarded to the operator.
        force_exec: forwarded as force_execution.
        use_smote: passed to the callable as 'use_smote'. Only when it is
        exactly True is the 'smote_random_state' option read; otherwise
        'smote_random_state' is None.
    """
    task_id = 'fit_predict_local_outlier_factor'

    # One variable name per line in the ini option.
    cont_variables = parser.get(cfg_name, 'cont_variables').splitlines()
    # Categorical variables come from their own 'cat_variables' option, not
    # from 'cont_variables'. A section without that option yields an empty
    # list, as get_fit_predict_random_forest_classifier uses.
    # NOTE(review): fallback= assumes parser is a configparser.ConfigParser
    # (or subclass) - confirm.
    cat_variables = parser.get(cfg_name, 'cat_variables',
                               fallback='').splitlines()

    # The SMOTE seed is only required in the config when SMOTE is enabled.
    smote_random_state = parser.getint(
        cfg_name, 'smote_random_state') if use_smote is True else None

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=fit_predict_local_outlier_factor,
        ppo_kwargs={
            'train_start': (parser.getint(cfg_name, 'train_start'), HASH_IT),
            'train_end': (parser.getint(cfg_name, 'train_end'), HASH_IT),
            'test_start': (parser.getint(cfg_name, 'test_start'), HASH_IT),
            'test_end': (parser.getint(cfg_name, 'test_end'), HASH_IT),
            'interval_width': (parser.getint(cfg_name,
                                             'interval_width'), HASH_IT),
            'cont_variables': (cont_variables, HASH_IT),
            'cat_variables': (cat_variables, HASH_IT),
            'n_neighbors': (parser.getint(cfg_name, 'n_neighbors'), HASH_IT),
            'contamination': (parser.getfloat(cfg_name,
                                              'contamination'), HASH_IT),
            'use_smote': (use_smote, HASH_IT),
            'smote_random_state': (smote_random_state, HASH_IT),
        },
        input_files={
            'raw_file':
            ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
            'features_file':
            ResourcePathStatic(path=parser.get(cfg_name, 'features_file'))
        },
        output_files={
            'df_all_predictions':
            ResourcePathOutput(cfg_name=cfg_name,
                               task_id=task_id,
                               resource_filename='df_all_predictions.h5'),
            'df_interval_predictions':
            ResourcePathOutput(cfg_name=cfg_name,
                               task_id=task_id,
                               resource_filename='df_interval_predictions.h5'),
            'row_based_metrics':
            ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename=
                f'row_based_predictions_metrics_{cfg_name}.txt'),
        },
        dag=dag,
        cfg_name=cfg_name)
Esempio n. 5
0
def get_predict(dag, cfg_name, use_all_nodes=True, force_exec=False):
    """
        Build the 'predict' operator, which wraps the predict callable.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; forwarded to the
        operator as well.
        use_all_nodes: True, False. When True, nodes_of_interest is set to an
        empty list and the predict function will run on all nodes. When False,
        the list is read from the ini file, using the option name returned by
        get_nodes_of_interest(cfg_name).
        force_exec: forwarded as force_execution.
    """
    task_id = 'predict'

    if use_all_nodes:
        nodes_of_interest = []
    else:
        nodes_of_interest = parser.getnodelist(
            cfg_name, get_nodes_of_interest(cfg_name))

    predict_kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
        (parser.gettimedelta(cfg_name, 'test_interval_width'), HASH_IT),
        'svm_training_technique':
        (parser.get(cfg_name, 'svm_training_technique'), HASH_IT),
        'nodes_of_interest': (nodes_of_interest, HASH_IT),
        # The 'reference_nodes' option holds the name of the option that
        # contains the actual node list.
        'reference_nodes':
        (parser.getnodelist(cfg_name,
                            parser.get(cfg_name, 'reference_nodes')), HASH_IT),
        'reference_victim_node':
        (parser.get(cfg_name, 'reference_victim_node'), HASH_IT),
        'airflow_vars': ({
            'training_intervals_count':
            cfg_name + 'training_intervals_count'
        }, NO_HASH),
    }

    # Start from the inference dataset files, then add the two resources
    # produced by upstream tasks.
    inputs = {**get_inference_dataset_output_files(cfg_name)}
    inputs['node_embeddings'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='create_graph_model_node_embeddings',
        origin_resource_id='node_embeddings')
    inputs['trained_model'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='train_graph_model',
        origin_resource_id='trained_model')

    outputs = {
        'prediction_df':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_prediction.h5'),
        'df_metrics':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_metrics.h5'),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=predict,
                                    ppo_kwargs=predict_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Esempio n. 6
0
def get_fit_predict_random_forest_classifier(dag, cfg_name, force_exec=False):
    """
        Build the 'fit_predict_random_forest_classifier' operator.

        Wraps fit_predict_random_forest_classifier in a
        PythonPersistentOperator. Every keyword argument is tagged with
        HASH_IT; 'cat_variables' is always an empty list. The two input files
        are static paths taken from the 'raw_file' and 'features_file' options.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also used in the
        metrics file name and forwarded to the operator.
        force_exec: forwarded as force_execution.
    """
    task_id = 'fit_predict_random_forest_classifier'

    def hashed_int(option):
        # (integer option value, HASH_IT) pair for ppo_kwargs.
        return parser.getint(cfg_name, option), HASH_IT

    def output(filename):
        # Output resource of this task stored under the given file name.
        return ResourcePathOutput(cfg_name=cfg_name,
                                  task_id=task_id,
                                  resource_filename=filename)

    # One variable name per line in the ini option.
    cont_variables = parser.get(cfg_name, 'cont_variables').splitlines()
    cat_variables = []

    hashed_kwargs = {
        'train_start': hashed_int('train_start'),
        'train_end': hashed_int('train_end'),
        'test_start': hashed_int('test_start'),
        'test_end': hashed_int('test_end'),
        'interval_width': hashed_int('interval_width'),
        'cont_variables': (cont_variables, HASH_IT),
        'cat_variables': (cat_variables, HASH_IT),
        'n_estimators': hashed_int('n_estimators'),
        'max_depth': hashed_int('max_depth'),
        'random_state': hashed_int('random_state'),
    }

    inputs = {
        'raw_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
        'features_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'features_file')),
    }

    outputs = {
        'df_all_predictions': output('df_all_predictions.h5'),
        'df_interval_predictions': output('df_interval_predictions.h5'),
        'row_based_metrics':
        output(f'row_based_predictions_metrics_{cfg_name}.txt'),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=fit_predict_random_forest_classifier,
        ppo_kwargs=hashed_kwargs,
        input_files=inputs,
        output_files=outputs,
        dag=dag,
        cfg_name=cfg_name)
Esempio n. 7
0
def get_fit_predict_xgboost(dag, cfg_name, force_exec=False):
    """
        Build the 'fit_predict_xgboost' operator.

        Wraps fit_predict_xgboost in a PythonPersistentOperator. Every keyword
        argument is read from the cfg_name section of the parser and tagged
        with HASH_IT; the two input files are static paths taken from the
        'raw_file' and 'features_file' options.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also used in the
        metrics file name and forwarded to the operator.
        force_exec: forwarded as force_execution.
    """
    task_id = 'fit_predict_xgboost'

    def hashed_int(option):
        # (integer option value, HASH_IT) pair for ppo_kwargs.
        return parser.getint(cfg_name, option), HASH_IT

    def output(filename):
        # Output resource of this task stored under the given file name.
        return ResourcePathOutput(cfg_name=cfg_name,
                                  task_id=task_id,
                                  resource_filename=filename)

    hashed_kwargs = {
        'train_start': hashed_int('train_start'),
        'train_end': hashed_int('train_end'),
        'test_start': hashed_int('test_start'),
        'test_end': hashed_int('test_end'),
        'interval_width': hashed_int('interval_width'),
        'importance_type': (parser.get(cfg_name, 'importance_type'), HASH_IT),
    }

    inputs = {
        'raw_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
        'features_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'features_file')),
    }

    outputs = {
        'df_all_predictions': output('df_all_predictions.h5'),
        'df_interval_predictions': output('df_interval_predictions.h5'),
        'features_importance': output('features_importance.txt'),
        'row_based_metrics':
        output(f'row_based_predictions_metrics_{cfg_name}.txt'),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=fit_predict_xgboost,
                                    ppo_kwargs=hashed_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Esempio n. 8
0
def get_node_analysis(dag, cfg_name, force_exec=False):
    """
        Build the 'node_analysis' operator, which wraps nodes_analysis.

        The only input is the 'df_metrics' resource of the 'predict' task.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; also passed as
        'experiment_name', used in the summary file name and forwarded to the
        operator.
        force_exec: forwarded as force_execution.
    """
    task_id = 'node_analysis'

    def output(filename):
        # Output resource of this task stored under the given file name.
        return ResourcePathOutput(cfg_name=cfg_name,
                                  task_id=task_id,
                                  resource_filename=filename)

    analysis_kwargs = {
        'experiment_name': (cfg_name, HASH_IT),
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'nodes_of_interest':
        (parser.getnodelist(cfg_name,
                            get_nodes_of_interest(cfg_name)), HASH_IT),
        # The 'reference_nodes' option holds the name of the option to read.
        # NOTE(review): this is read with parser.get (raw value) while
        # 'nodes_of_interest' above goes through parser.getnodelist - confirm
        # that nodes_analysis expects the raw value here.
        'reference_nodes':
        (parser.get(cfg_name, parser.get(cfg_name,
                                         'reference_nodes')), HASH_IT),
        'reference_victim_node':
        (parser.get(cfg_name, 'reference_victim_node'), HASH_IT),
    }

    inputs = {
        'df_metrics':
        ResourcePathById(cfg_name=cfg_name,
                         origin_task_id='predict',
                         origin_resource_id='df_metrics'),
    }

    outputs = {
        'metrics_summary_file':
        output(f"metrics_summary_file_{cfg_name}.txt"),
        'df_detailed_classifier_data':
        output('df_detailed_classifier_data.h5'),
        'df_roc_classifier_data': output('df_roc_classifier_data.h5'),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=nodes_analysis,
                                    ppo_kwargs=analysis_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Esempio n. 9
0
def get_create_graph_model_node_embeddings(dag,
                                           cfg_name,
                                           use_all_nodes=True,
                                           force_exec=False):
    """
        Build the 'create_graph_model_node_embeddings' operator, which wraps
        infer_graph_model.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; forwarded to the
        operator as well.
        use_all_nodes: True, False. When True, nodes_of_interest is set to an
        empty list and the infer_graph_model function will run on all nodes.
        When False, the list is read from the ini file, using the option name
        returned by get_nodes_of_interest(cfg_name).
        force_exec: forwarded as force_execution.
    """
    task_id = 'create_graph_model_node_embeddings'

    if use_all_nodes:
        nodes_of_interest = []
    else:
        nodes_of_interest = parser.getnodelist(
            cfg_name, get_nodes_of_interest(cfg_name))

    inference_kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
        (parser.gettimedelta(cfg_name, 'test_interval_width'), HASH_IT),
        'predicator_name':
        (parser.get(cfg_name, 'model_trainer_type'), NO_HASH),
        'hidden_dim': (parser.getint(cfg_name, 'hidden_dim'), HASH_IT),
        'nodes_of_interest': (nodes_of_interest, HASH_IT),
        'tensorboard_writer': (get_writer(cfg_name), NO_HASH),
    }

    # Start from the inference dataset files, then add the trained model
    # produced by the 'train_graph_model' task.
    inputs = {**get_inference_dataset_output_files(cfg_name)}
    inputs['trained_model'] = ResourcePathById(
        cfg_name=cfg_name,
        origin_task_id='train_graph_model',
        origin_resource_id='trained_model')

    outputs = {
        # The embeddings file name comes from get_trained_model_filename.
        'node_embeddings':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=get_trained_model_filename(cfg_name)),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=infer_graph_model,
                                    ppo_kwargs=inference_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Esempio n. 10
0
def get_train_graph_model(dag, cfg_name, force_exec=False):
    """
        Build the 'train_graph_model' operator, which wraps train_graph_model.

        The input files are whatever get_training_dataset_output_files returns
        for cfg_name; the single output is 'trained_model'.

        dag: forwarded to PythonPersistentOperator.
        cfg_name: parser section to read options from; forwarded to the
        operator as well.
        force_exec: forwarded as force_execution.
    """
    task_id = 'train_graph_model'

    training_kwargs = {
        'start': (parser.gettimestamp(cfg_name, 'train_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'train_end'), HASH_IT),
        'interval_width':
        (parser.gettimedelta(cfg_name, 'train_interval_width'), HASH_IT),
        'hidden_dim': (parser.getint(cfg_name, 'hidden_dim'), HASH_IT),
        'feature_extractor':
        (parser.get(cfg_name, 'feature_extractor'), HASH_IT),
        'training_epochs':
        (parser.getint(cfg_name, 'training_epochs'), NO_HASH),
        'predicator_name':
        (parser.get(cfg_name, 'model_trainer_type'), NO_HASH),
        'tensorboard_writer': (get_writer(cfg_name), NO_HASH),
        'patience_epochs':
        (parser.getint(cfg_name, 'patience_epochs'), HASH_IT),
        'learning_rate': (parser.getfloat(cfg_name, 'learning_rate'), HASH_IT),
    }

    inputs = get_training_dataset_output_files(cfg_name)

    outputs = {
        'trained_model':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=get_trained_model_filename(cfg_name)),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=train_graph_model,
                                    ppo_kwargs=training_kwargs,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)