def generate_intervals(start: pd.Timestamp,
                       end: pd.Timestamp,
                       interval_width: pd.Timedelta,
                       static_path_prefix: Optional[str] = None,
                       dynamic_path_prefix: Optional[List[Tuple[str,
                                                                str]]] = None):
    """Map consecutive time intervals to resource paths.

    Splits ``[start, end)`` into intervals of ``interval_width`` and derives
    one file name per interval (``'<interval_start>_<interval_end>'``).  Each
    name is wrapped in a ``ResourcePathStatic`` rooted at
    ``static_path_prefix``, or in a ``ResourcePathDynamic`` extending
    ``dynamic_path_prefix`` with a ``('const', <name>)`` entry.

    Exactly one of ``static_path_prefix`` / ``dynamic_path_prefix`` must be
    provided.

    Returns:
        Dict mapping ``'data_files_<i>'`` to the ResourcePath of interval i.
    """
    # Exactly one prefix flavor must be supplied (xor).
    assert (static_path_prefix is not None and dynamic_path_prefix is None) or\
           (static_path_prefix is None and dynamic_path_prefix is not None), "Ill defined prefix"
    # At least one full interval must fit between start and end.
    assert start < end and (end - interval_width) >= start, \
        f"Invalid start/end/interval_width config: {start}/{end}/{interval_width}"

    # Start timestamps of every interval; the last interval ends at `end`.
    span = pd.date_range(start=start,
                         end=end - interval_width,
                         freq=interval_width)
    data_files = [f'{i}_{i + interval_width}' for i in span]

    filenames: Dict[str, ResourcePath] = {}
    for idx, fname in enumerate(data_files):
        if static_path_prefix is not None:
            filenames[f'data_files_{idx}'] = ResourcePathStatic(
                path=osp.join(static_path_prefix, fname))
        elif dynamic_path_prefix is not None:
            # Copy so appending the per-file entry never mutates the
            # caller-supplied prefix list across iterations.
            dynamic_path = dynamic_path_prefix.copy()
            dynamic_path.append(('const', fname))
            filenames[f'data_files_{idx}'] = ResourcePathDynamic(
                path=dynamic_path)
        else:
            # Unreachable while the xor assert above holds; kept as a safe
            # fallback if asserts are stripped with -O.
            filenames[f'data_files_{idx}'] = ResourcePathStatic(path=fname)

    return filenames
Example #2
0
def get_fit_predict_random_forest_also(dag, cfg_name, force_exec=False):
    """Build the persistent operator that runs the ALSO random-forest step.

    Every hyper-parameter is read from the ``cfg_name`` section of the
    config parser and paired with HASH_IT so the operator can detect
    parameter changes between runs.
    """
    task_id = 'fit_predict_random_forest_ALSO'

    cont_variables = parser.get(cfg_name, 'cont_variables').splitlines()

    # Local shortcuts for typed config lookups in this section.
    def _cint(key):
        return parser.getint(cfg_name, key)

    def _cfloat(key):
        return parser.getfloat(cfg_name, key)

    hyper_params = {
        'train_start': (_cint('train_start'), HASH_IT),
        'train_end': (_cint('train_end'), HASH_IT),
        'test_start': (_cint('test_start'), HASH_IT),
        'test_end': (_cint('test_end'), HASH_IT),
        'interval_width': (_cint('interval_width'), HASH_IT),
        'cont_variables': (cont_variables, HASH_IT),
        'mean_scaling_threshold': (_cfloat('mean_scaling_threshold'), HASH_IT),
        'random_forest_max_depth': (_cint('random_forest_max_depth'), HASH_IT),
        'random_forest_random_state':
        (_cint('random_forest_random_state'), HASH_IT),
        'random_forest_n_estimators':
        (_cint('random_forest_n_estimators'), HASH_IT),
        'folds': (_cint('folds'), HASH_IT),
        'samples_training_ratio': (_cfloat('samples_training_ratio'), HASH_IT),
        'model_type': (parser.get(cfg_name, 'model_type'), HASH_IT),
    }

    inputs = {
        'raw_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
        'features_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'features_file')),
    }

    outputs = {
        'df_also_predicted_scores':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_also_predicted_scores.h5'),
        'df_all_predictions':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_all_predictions.h5'),
        'df_interval_predictions':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_interval_predictions.h5'),
        'row_based_metrics':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f'row_based_predictions_metrics_{cfg_name}.txt'),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=fit_predict_random_forest_also,
                                    ppo_kwargs=hyper_params,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)
Example #3
0
def get_fit_predict_local_outlier_factor(dag,
                                         cfg_name,
                                         force_exec=False,
                                         use_smote=False):
    """Build the persistent operator that runs the local-outlier-factor step.

    Config values come from the ``cfg_name`` section and are paired with
    HASH_IT for run-to-run change detection.  When ``use_smote`` is True,
    the SMOTE random state is also read from the config.
    """
    task_id = 'fit_predict_local_outlier_factor'

    cont_variables = parser.get(cfg_name, 'cont_variables').splitlines()
    # BUG FIX: this previously re-read 'cont_variables' (copy-paste), so the
    # categorical list duplicated the continuous one.  NOTE(review): assumes
    # the config section defines 'cat_variables' — verify against configs.
    cat_variables = parser.get(cfg_name, 'cat_variables').splitlines()

    # SMOTE is only configured when explicitly requested.
    smote_random_state = parser.getint(
        cfg_name, 'smote_random_state') if use_smote is True else None

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=fit_predict_local_outlier_factor,
        ppo_kwargs={
            'train_start': (parser.getint(cfg_name, 'train_start'), HASH_IT),
            'train_end': (parser.getint(cfg_name, 'train_end'), HASH_IT),
            'test_start': (parser.getint(cfg_name, 'test_start'), HASH_IT),
            'test_end': (parser.getint(cfg_name, 'test_end'), HASH_IT),
            'interval_width': (parser.getint(cfg_name,
                                             'interval_width'), HASH_IT),
            'cont_variables': (cont_variables, HASH_IT),
            'cat_variables': (cat_variables, HASH_IT),
            'n_neighbors': (parser.getint(cfg_name, 'n_neighbors'), HASH_IT),
            'contamination': (parser.getfloat(cfg_name,
                                              'contamination'), HASH_IT),
            'use_smote': (use_smote, HASH_IT),
            'smote_random_state': (smote_random_state, HASH_IT),
        },
        input_files={
            'raw_file':
            ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
            'features_file':
            ResourcePathStatic(path=parser.get(cfg_name, 'features_file'))
        },
        output_files={
            'df_all_predictions':
            ResourcePathOutput(cfg_name=cfg_name,
                               task_id=task_id,
                               resource_filename='df_all_predictions.h5'),
            'df_interval_predictions':
            ResourcePathOutput(cfg_name=cfg_name,
                               task_id=task_id,
                               resource_filename='df_interval_predictions.h5'),
            'row_based_metrics':
            ResourcePathOutput(
                cfg_name=cfg_name,
                task_id=task_id,
                resource_filename=
                f'row_based_predictions_metrics_{cfg_name}.txt'),
        },
        dag=dag,
        cfg_name=cfg_name)
Example #4
0
def test_resource_static_path(setup_output_path):
    """Verify that a ResourcePathStatic input file is readable by a task.

    Writes a small text file, wires it into a PythonPersistentOperator as a
    static-path input, executes the task, and asserts the callable saw the
    expected content.
    """
    out_dir = get_out_dir(cfg_name=cfg_name)
    dag = DAG('test_stat',
              default_args=default_args,
              schedule_interval=timedelta(days=1))

    def callable1_dummy_text_read(tp, in_files, out_files, *op_args,
                                  **op_kwargs):
        # Either returns 'succeeded' or fails via the assert; the original
        # trailing `return 'failed'` was unreachable and has been removed.
        with open(in_files['test_file'].path, 'r') as file:
            assert file.readline(
            ) == 'test_data file content', 'Invalid file content!'
            return 'succeeded'

    # Creating an input file
    with open(osp.join(out_dir, 'test_data.txt'), 'w') as file:
        file.write("test_data file content")

    input_files = {
        'test_file':
        ResourcePathStatic(path=osp.join(out_dir, 'test_data.txt'))
    }

    read_input_file = PythonPersistentOperator(
        task_id='read_input_file',
        force_execution=True,
        python_callable=callable1_dummy_text_read,
        input_files=input_files,
        dag=dag,
        cfg_name=cfg_name)

    ti = TaskInstance(task=read_input_file, execution_date=datetime.now())

    result = read_input_file.execute(ti.get_template_context())
    assert result == 'succeeded'
Example #5
0
def get_create_inference_dataset(dag, cfg_name, force_exec=False):
    """Build the persistent operator that materializes the inference dataset.

    All parameters are read from the ``cfg_name`` config section and paired
    with HASH_IT so the operator can skip re-runs when nothing changed.
    """
    task_id = 'create_inference_dataset'

    dataset_params = {
        'start': (parser.getint(cfg_name, 'test_start'), HASH_IT),
        'end': (parser.getint(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
        (parser.getint(cfg_name, 'test_interval_width'), HASH_IT),
        'interval_overlap':
        (parser.getint(cfg_name, 'test_interval_overlap'), HASH_IT),
        'graph_representation':
        (parser.get(cfg_name, 'graph_representation'), HASH_IT),
        'feature_extractor':
        (parser.get(cfg_name, 'feature_extractor'), HASH_IT),
    }

    inputs = {
        'raw_file': ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=create_dataset,
        ppo_kwargs=dataset_params,
        input_files=inputs,
        output_files=get_inference_dataset_output_files(cfg_name),
        dag=dag,
        cfg_name=cfg_name)
Example #6
0
def get_fit_predict_random_forest_classifier(dag, cfg_name, force_exec=False):
    """Build the persistent operator that runs the random-forest classifier.

    Hyper-parameters come from the ``cfg_name`` config section, paired with
    HASH_IT for change detection between runs.
    """
    task_id = 'fit_predict_random_forest_classifier'

    cont_variables = parser.get(cfg_name, 'cont_variables').splitlines()
    # No categorical features are configured for this model.
    cat_variables = []

    def _cint(key):
        return parser.getint(cfg_name, key)

    hyper_params = {
        'train_start': (_cint('train_start'), HASH_IT),
        'train_end': (_cint('train_end'), HASH_IT),
        'test_start': (_cint('test_start'), HASH_IT),
        'test_end': (_cint('test_end'), HASH_IT),
        'interval_width': (_cint('interval_width'), HASH_IT),
        'cont_variables': (cont_variables, HASH_IT),
        'cat_variables': (cat_variables, HASH_IT),
        'n_estimators': (_cint('n_estimators'), HASH_IT),
        'max_depth': (_cint('max_depth'), HASH_IT),
        'random_state': (_cint('random_state'), HASH_IT),
    }

    inputs = {
        'raw_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
        'features_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'features_file')),
    }

    outputs = {
        'df_all_predictions':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_all_predictions.h5'),
        'df_interval_predictions':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_interval_predictions.h5'),
        'row_based_metrics':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f'row_based_predictions_metrics_{cfg_name}.txt'),
    }

    return PythonPersistentOperator(
        task_id=task_id,
        force_execution=force_exec,
        python_callable=fit_predict_random_forest_classifier,
        ppo_kwargs=hyper_params,
        input_files=inputs,
        output_files=outputs,
        dag=dag,
        cfg_name=cfg_name)
Example #7
0
def get_fit_predict_xgboost(dag, cfg_name, force_exec=False):
    """Build the persistent operator that fits and evaluates XGBoost.

    Parameters are read from the ``cfg_name`` config section and paired
    with HASH_IT for run-to-run change detection.
    """
    task_id = 'fit_predict_xgboost'

    hyper_params = {
        'train_start': (parser.getint(cfg_name, 'train_start'), HASH_IT),
        'train_end': (parser.getint(cfg_name, 'train_end'), HASH_IT),
        'test_start': (parser.getint(cfg_name, 'test_start'), HASH_IT),
        'test_end': (parser.getint(cfg_name, 'test_end'), HASH_IT),
        'interval_width':
        (parser.getint(cfg_name, 'interval_width'), HASH_IT),
        'importance_type':
        (parser.get(cfg_name, 'importance_type'), HASH_IT),
    }

    inputs = {
        'raw_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'raw_file')),
        'features_file':
        ResourcePathStatic(path=parser.get(cfg_name, 'features_file')),
    }

    outputs = {
        'df_all_predictions':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_all_predictions.h5'),
        'df_interval_predictions':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='df_interval_predictions.h5'),
        'features_importance':
        ResourcePathOutput(cfg_name=cfg_name,
                           task_id=task_id,
                           resource_filename='features_importance.txt'),
        'row_based_metrics':
        ResourcePathOutput(
            cfg_name=cfg_name,
            task_id=task_id,
            resource_filename=f'row_based_predictions_metrics_{cfg_name}.txt'),
    }

    return PythonPersistentOperator(task_id=task_id,
                                    force_execution=force_exec,
                                    python_callable=fit_predict_xgboost,
                                    ppo_kwargs=hyper_params,
                                    input_files=inputs,
                                    output_files=outputs,
                                    dag=dag,
                                    cfg_name=cfg_name)