Example #1
def run_sql_submit(request):
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = RunSqlForm(request.POST)
        if form.is_valid():
            env = DataforjEnv(get_flow().name, 'local')
            step_name = form.cleaned_data['step_name']
            sql_text = form.cleaned_data['sql']
            get_flow()._steps[step_name].sql_from_editor = sql_text
            error_message = None
            header_columns = None
            rows = None
            try:
                spark_df = get_flow().dataframe_for_step(env, step_name)
                header_columns = spark_df.schema.names
                rows = spark_df.rdd.map(lambda row: row.asDict(True)).take(10)
            except Exception as e:
                error_message = str(e)

            template = loader.get_template('run/run_sql.html')
            context = {
                'step_name': step_name,
                'sql_text': sql_text,
                'error_message': error_message,
                'header_columns': header_columns,
                'rows': rows
            }
            return HttpResponse(template.render(context, request))

    # if a GET (or any other method) we'll create a blank form
    else:
        form = RunSqlForm()

    return render(request, 'run/run_sql.html', {'form': form})
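
The form class bound above isn't shown in this example. Its field names can be inferred from the cleaned_data keys used in the view (step_name and sql); a minimal sketch, assuming a plain Django Form with a Textarea widget for the SQL text, might look like this:

from django import forms


class RunSqlForm(forms.Form):
    # Field names match the cleaned_data keys read in run_sql_submit above.
    # The max_length and Textarea widget are assumptions for illustration.
    step_name = forms.CharField(max_length=200)
    sql = forms.CharField(widget=forms.Textarea)
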
Example #2
    def debug_step(self, env: DataforjEnv, step: str):
        exec('from pyspark.sql import SparkSession')
        exec(env.spark_session_build())
        for step_name in self._graph:
            code = self._steps[step_name].compile()
            print(f'Running step [{step_name}]')
            exec(code)
            if step_name == step:
                # The df will be accessible through the local symbol table
                exec(f'{step_name}_df.show()')
                break
Example #3
def debug_step(dir: str, env_name: str, step: str):
    project_file_name = f'{dir}/dataforj.yaml'
    if not os.path.exists(project_file_name):
        raise Exception(f'There is no Dataforj project in the '
                        f'directory [{dir}]')
    else:
        with open(project_file_name, 'r+') as project_file:
            project_yaml = '\n'.join(project_file.readlines())
            env = DataforjEnv('flow.name', env_name)
            yaml_plus_vars = project_yaml \
                .format_map(env.env_config['dataflow-config'])
            flow = dataflow.from_yaml(yaml_plus_vars)
            flow.debug_step(env, step)
Example #4
def open_flow(dir: str, env_name: str):
    '''
    Open a Dataforj project in the directory provided.
    '''
    file_name = f'{dir}/dataforj.yaml'
    if not os.path.exists(file_name):
        raise Exception('There is no Dataforj project in this directory')
    else:
        with open(file_name, 'r+') as f:
            yaml = '\n'.join(f.readlines())
            env = DataforjEnv('flow.name', env_name)
            yaml_plus_vars = yaml \
                .format_map(env.env_config['dataflow-config'])
            flow = dataflow.from_yaml(yaml_plus_vars)
            return flow
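
For context, a hypothetical way to combine open_flow with the run method shown in the other examples (the directory '.' and the 'local' environment name are placeholders):

# Hypothetical usage: open the project in the current directory and run the
# whole flow against the 'local' environment.
flow = open_flow('.', 'local')
env = DataforjEnv(flow.name, 'local')
flow.run(env)
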
Example #5
    def dataframe_for_step(self, env: DataforjEnv, step: str):
        exec('from pyspark.sql import SparkSession')
        exec(env.spark_session_build())
        for step_name in self._graph:
            code = self._steps[step_name].compile()
            print(f'Running step [{step_name}]')
            exec(code)
            # Validate the schema of the resulting dataframe
            if self._steps[step_name].schema_location != '':
                step_schema_location = self._steps[step_name].schema_location
                print(f'Validating step [{step_name}] against the schema '
                      f'located in [{step_schema_location}].')
                # The df will be accessible through the local symbol table
                check_schema(step_name,
                             locals()[f'{step_name}_df'], step_schema_location)
            if step_name == step:
                return locals()[f'{step_name}_df']
Example #6
    def test_run(self):
        flow = Dataflow.from_python_objects(
            'example data',
            'description',
            [
                SourceStep(name='customers',
                           uri='example/data/customers.csv',
                           format_type='csv',
                           options={'header': 'true'}),
                SourceStep(name='products',
                           uri='example/data/products.csv',
                           format_type='csv',
                           options={'header': 'true'}),
                SourceStep(name='transactions',
                           uri='example/data/transactions.csv',
                           format_type='csv',
                           options={'header': 'true'}),
                SQLStep(name='customers_latest',
                        sql_file_path='example/sql/customers_latest.sql',
                        depends_on=['customers']),
                SQLStep(
                    name='transactions_with_products',
                    depends_on=['products', 'transactions'],
                    sql_file_path='example/sql/transactions_with_products.sql'
                ),  # noqa: E501
                SQLStep(name='result',
                        sql_file_path='example/sql/result.sql',
                        depends_on=[
                            'transactions_with_products', 'customers_latest'
                        ]),
                PySparkStep(
                    name='filter',
                    pyspark_file_path='example/pyspark/filter.py',
                    depends_on=['result'],
                    schema_location='example/schemas/filter_schema.yaml'),
                SinkStep(name='sink',
                         uri='example/data/result.csv',
                         format_type='csv',
                         options={'header': 'true'},
                         mode='overwrite',
                         depends_on=['filter'])
            ])
        env = DataforjEnv(flow.name, 'local')
        flow.run(env)
Example #7
def run_basic_step(step_name: str, request):
    env = DataforjEnv(get_flow().name, 'local')
    error_message = None
    header_columns = None
    rows = None
    try:
        spark_df = get_flow().dataframe_for_step(env, step_name)
        header_columns = spark_df.schema.names
        rows = spark_df.rdd.map(lambda row: row.asDict(True)).take(10)
    except Exception as e:
        error_message = str(e)

    template = loader.get_template('run/run_step.html')
    context = {
        'error_message': error_message,
        'header_columns': header_columns,
        'rows': rows
    }
    return HttpResponse(template.render(context, request))
Example #8
    def run(self, env: DataforjEnv):
        exec("from pyspark.sql import SparkSession")
        exec(env.spark_session_build())
        for step_name in self._graph:
            code = self._steps[step_name].compile()
            print(f'Running step [{step_name}]')
            exec(code)
            # Validate the schema of the resulting dataframe
            if self._steps[step_name].schema_location != '':
                step_schema_location = self._steps[step_name].schema_location
                print(f'Validating step [{step_name}] against the schema '
                      f'located in [{step_schema_location}].')
                # The df will be accessible through the local symbol table
                check_schema(step_name,
                             locals()[f'{step_name}_df'], step_schema_location)
            # Run any data quality tests defined for this step
            for test_path in self._steps[step_name].data_quality_tests:
                test_code = dq_to_pyspark(step_name, test_path)
                # print(test_code)
                exec(test_code)
Example #9
    def unit_test(self, env: DataforjEnv, step: str = '__ALL__'):
        exec("from pyspark.sql import SparkSession")
        exec(env.spark_session_build())
        success_count = 0
        fail_count = 0
        for step_name in self._graph:
            code = self._steps[step_name].compile()
            print(f'Running step [{step_name}]')
            exec(code)
            if (step == '__ALL__' or step_name == step) and not \
              (isinstance(self._steps[step_name], SourceStep) or
               isinstance(self._steps[step_name], SinkStep)):
                if len(self._steps[step_name].unit_tests) == 0:
                    print(f'Step [{step_name}] has no unit tests')
                else:
                    success_count_step = 0
                    fail_count_step = 0
                    for test_path in self._steps[step_name].unit_tests:
                        test_code = ut_to_pyspark(step_name, test_path)
                        try:
                            # print(test_code)
                            exec(test_code)
                            success_count_step = success_count_step + 1
                        except Exception as e:
                            print(f'Step [{step_name}] test [{test_path}] '
                                  f'failed with error [{e}]')
                            fail_count_step = fail_count_step + 1
                    print(f'For step [{step_name}] [{success_count_step}] '
                          f'tests passed and [{fail_count_step}] tests '
                          f'failed.')
                    success_count = success_count + success_count_step
                    fail_count = fail_count + fail_count_step
                if (step_name == step):
                    break

        print(f'Overall result: [{success_count}] '
              f'tests passed and [{fail_count}] tests '
              f'failed.')