Example #1
    def test_building_two_operators_with_execution(self):
        # given
        plasma_connector = PlasmaConnector(socket_name)

        dag = DAG(dag_id='test_dag_plasma', start_date=datetime.now())

        input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
        output_plasma_unit = DataOutputPlasmaUnit(plasma_connector, object_id)
        task_1 = DataOperator(
            operation_function=drop_na_dataframe,
            params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
            input_unit=input_csv_unit,
            output_unit=output_plasma_unit,
            dag=dag,
            task_id='data_operator_csv_to_plasma')

        input_plasma_unit = DataInputPlasmaUnit(plasma_connector, object_id)
        output_csv_unit = DataOutputFileUnit('data/X_parsed_22.csv',
                                             index=False)
        task_2 = DataOperator(
            operation_function=drop_na_dataframe,
            params={'columns': ['ANNEETRAVAUXPRECONISESDIAG']},
            input_unit=input_plasma_unit,
            output_unit=output_csv_unit,
            dag=dag,
            task_id='data_operator_plasma_to_csv')

        task_2.set_upstream(task_1)

        # when
        execute_dag(dag, verbose=True)

        # then
        df = pd.read_csv('data/X_parsed_22.csv')
        self.assertEqual((7241, 27), df.shape)
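The execute_dag helper used in the "when" step is not part of the snippet. A minimal sketch of what such a test helper could look like, assuming it simply walks the DAG in dependency order (Airflow 1.x exposes this as DAG.topological_sort()):

def execute_dag(dag, verbose=False):
    # Assumed test helper, not the library's actual implementation:
    # run each task once, respecting upstream/downstream ordering.
    for task in dag.topological_sort():
        if verbose:
            print('Executing task: %s' % task.task_id)
        task.execute(context=None)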
Example #2
    def test_execute_data_operator_csv_read_and_parquet_write_using_dask_api_backend(
            self):
        # given
        dag = DAG(dag_id='test', start_date=datetime.now())
        input_csv_unit = DataGlobalInputUnit('data/X.csv',
                                             api_module='dask.dataframe',
                                             sep=';')
        output_parquet_unit = DataGlobalOutputFileUnit(
            'data/X_parsed',
            write_function_name='to_parquet',
            write_index=False)

        task = DataOperator(operation_function=drop_na_dataframe,
                            params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                            input_unit=input_csv_unit,
                            output_unit=output_parquet_unit,
                            dag=dag,
                            task_id='data_operator_csv_to_parquet')

        # when
        task.execute(None)

        # then
        df_transformed = pd.read_parquet('data/X_parsed', engine='pyarrow')
        self.assertEqual((10245, 27), df_transformed.shape)
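Note that with the Dask backend, to_parquet produces a directory of part files rather than a single file, which is why the assertion reads the 'data/X_parsed' directory back through pandas with the pyarrow engine.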
Example #3
    def test_execute_data_operator_csv_read_and_write(self):
        # given
        dag = DAG(dag_id='test', start_date=datetime.now())
        input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
        output_csv_unit = DataOutputFileUnit('data/X_parsed.csv', index=False)

        task = DataOperator(operation_function=drop_na_dataframe,
                            params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                            input_unit=input_csv_unit,
                            output_unit=output_csv_unit,
                            dag=dag,
                            task_id='data_operator_csv')

        # when
        task.execute(None)

        # then
        df_transformed = pd.read_csv('data/X_parsed.csv')
        self.assertEqual((10245, 27), df_transformed.shape)
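None of these tests show drop_na_dataframe itself. Judging from how it is called (the input unit's DataFrame, plus the params dict unpacked as keyword arguments), a minimal sketch could be:

def drop_na_dataframe(dataframe, columns):
    # Assumed operation function: drop rows with NaN in the listed columns;
    # the DataOperator persists the returned frame through its output unit.
    return dataframe.dropna(subset=columns)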
Example #4
    def test_execute_data_operator_csv_read_and_plasma_write(self):
        # given
        plasma_connector = PlasmaConnector(socket_name)

        dag = DAG(dag_id='test', start_date=datetime.now())
        input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
        output_plasma_unit = DataOutputPlasmaUnit(plasma_connector, object_id)

        task = DataOperator(operation_function=drop_na_dataframe,
                            params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                            input_unit=input_csv_unit,
                            output_unit=output_plasma_unit,
                            dag=dag,
                            task_id='data_operator_csv_to_plasma')

        task_instance = TaskInstance(task=task, execution_date=datetime.now())

        # when
        task.execute(task_instance.get_template_context())

        # then
        other_plasma_connector = PlasmaConnector(socket_name)
        df_transformed = other_plasma_connector.get_dataframe(object_id)
        self.assertEqual((10245, 27), df_transformed.shape)
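The "then" step deliberately opens a second PlasmaConnector on the same socket: if the shape matches, the DataFrame really landed in the shared-memory Plasma store under object_id, rather than living only in the writing connector's state.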
Example #5
def feature_engineering_sub_dag(parent_dag_name,
                                child_dag_name,
                                model_path,
                                input_file,
                                output_file,
                                temp_files,
                                start_date,
                                schedule_interval,
                                mode='train'):

    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              schedule_interval=schedule_interval,
              start_date=start_date)

    task_fillna = DataOperator(
        operation_function=fillna_columns,
        input_unit=DataInputFileUnit(input_file,
                                     pandas_read_function_name='read_parquet'),
        output_unit=DataOutputFileUnit(
            temp_files[0], pandas_write_function_name='to_parquet'),
        dag=dag,
        task_id='Fill_NA_values',
        params={
            'simple_features': [
                'NOTEDIAGNOSTIC', 'PRIORITEDERENOUVELLEMENT',
                'FREQUENTATIONCIBLE', 'RAISONDEPLANTATION', 'SOUS_CATEGORIE',
                'STADEDEDEVELOPPEMENT', 'STADEDEVELOPPEMENTDIAG',
                'TRAITEMENTCHENILLES', 'TRAVAUXPRECONISESDIAG', 'TROTTOIR',
                'VARIETE', 'VIGUEUR', 'CODE_PARENT'
            ],
            'model_path': model_path,
            'mode': mode
        })

    task_cat_to_num = DataOperator(
        operation_function=category_to_numerical_features,
        input_unit=DataInputFileUnit(temp_files[0],
                                     pandas_read_function_name='read_parquet'),
        output_unit=DataOutputFileUnit(
            output_file, pandas_write_function_name='to_parquet'),
        dag=dag,
        task_id='Categorical_features_to_numeric',
        params={
            'features': [
                'GENRE_BOTA', 'ESPECE', 'FREQUENTATIONCIBLE',
                'RAISONDEPLANTATION', 'SOUS_CATEGORIE', 'STADEDEDEVELOPPEMENT',
                'STADEDEVELOPPEMENTDIAG', 'TRAITEMENTCHENILLES',
                'TRAVAUXPRECONISESDIAG', 'TROTTOIR', 'VARIETE', 'VIGUEUR',
                'CODE_PARENT'
            ],
            'model_path': model_path,
            'mode': mode
        })

    task_fillna.set_downstream(task_cat_to_num)

    return dag
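The '%s.%s' dag_id follows Airflow's SubDagOperator convention: a sub-DAG must be named '<parent_dag_id>.<task_id of the SubDagOperator>', which is why Example #7 passes dag.dag_id and the matching task_id into this factory.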
Example #6
"""
Concat train and test data (easier for feature engineering when facing a time-series problem).
"""
input_parquet_files_unit = DataInputMultiFileUnit(
    [
        project_path + 'datasets/input/train.parquet',
        project_path + 'datasets/input/test.parquet'
    ],
    pandas_read_function_name='read_parquet')

output_parquet_concat_unit = DataOutputFileUnit(
    project_path + 'datasets/temp/X_raw.parquet',
    pandas_write_function_name='to_parquet')

task_concat_train_files = DataOperator(
    operation_function=concat_train_test,
    input_unit=input_parquet_files_unit,
    output_unit=output_parquet_concat_unit,
    dag=dag,
    task_id='Concat_train_test_data_source_files')
"""
Resampling time data
"""
input_raw_data_unit = DataInputFileUnit(
    output_parquet_concat_unit.output_path,
    pandas_read_function_name='read_parquet')

output_cleaned_data_unit = DataOutputFileUnit(
    project_path + 'datasets/temp/X_clean.parquet',
    pandas_write_function_name='to_parquet')

task_fill_missing_values = DataOperator(operation_function=resample_fillna,
                                        input_unit=input_raw_data_unit,
                                        output_unit=output_cleaned_data_unit,
                                        dag=dag,
                                        # the source snippet is truncated here;
                                        # remaining arguments and this task_id
                                        # are reconstructed from context
                                        task_id='Resample_time_data')
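concat_train_test is likewise not shown. Since its input unit is a DataInputMultiFileUnit, it presumably receives the files as a list of DataFrames; a minimal sketch under that assumption:

import pandas as pd

def concat_train_test(dataframes):
    # Assumed operation function: stack train and test rows into one frame
    # so feature engineering sees the full time range.
    return pd.concat(dataframes)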
Example #7
filename_generator = FilenameGenerator(path=project_path + 'datasets/temp/')
temp_files = []
for i in range(0, 100):
    temp_files.append(filename_generator.generate_filename() + '.parquet')

dag = DAG(dag_id='Tree_Disease_Prediction_with_pg', description='Tree Disease Prediction Example (with PG)',
          schedule_interval='0 12 * * *', start_date=datetime(2017, 3, 20), catchup=False)

input_csv_files_unit = DataInputMultiFileUnit([project_path + 'datasets/input/X_tree_egc_t1.csv',
                                               project_path + 'datasets/input/X_geoloc_egc_t1.csv',
                                               project_path + 'datasets/input/Y_tree_egc_t1.csv'], sep=';')
output_parquet_unit = DataOutputFileUnit(project_path + 'datasets/temp/X_train_raw.parquet',
                                         pandas_write_function_name='to_parquet')
task_concat_train_files = DataOperator(operation_function=join_dataframes,
                                       input_unit=input_csv_files_unit,
                                       output_unit=output_parquet_unit,
                                       dag=dag, task_id='Join_train_data_source_files')

task_feature_engineering_for_train = SubDagOperator(
    subdag=feature_engineering_sub_dag(dag.dag_id, 'Feature_engineering_for_train',
                                       model_path=project_path + 'models/',
                                       input_file=project_path + 'datasets/temp/X_train_raw.parquet',
                                       output_file=project_path + 'datasets/temp/X_train_final.parquet',
                                       temp_files=temp_files[0:10],
                                       start_date=dag.start_date,
                                       schedule_interval=dag.schedule_interval),
    task_id='Feature_engineering_for_train',
    dag=dag,
)

task_concat_train_files.set_downstream(task_feature_engineering_for_train)
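The temp_files slice handed to the sub-DAG (temp_files[0:10]) keeps the intermediate parquet files of different pipeline stages from colliding, since the FilenameGenerator loop pre-generates 100 names under datasets/temp/ up front.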