def run_sql_submit(request):
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = RunSqlForm(request.POST)
        if form.is_valid():
            env = DataforjEnv(get_flow().name, 'local')
            step_name = form.cleaned_data['step_name']
            sql_text = form.cleaned_data['sql']
            get_flow()._steps[step_name].sql_from_editor = sql_text
            error_message = None
            header_columns = None
            rows = None
            try:
                spark_df = get_flow().dataframe_for_step(env, step_name)
                header_columns = spark_df.schema.names
                rows = spark_df.rdd.map(lambda row: row.asDict(True)).take(10)
            except Exception as e:
                error_message = str(e)
            template = loader.get_template('run/run_sql.html')
            context = {
                'step_name': step_name,
                'sql_text': sql_text,
                'error_message': error_message,
                'header_columns': header_columns,
                'rows': rows
            }
            return HttpResponse(template.render(context, request))
    # if a GET (or any other method) we'll create a blank form
    else:
        form = RunSqlForm()

    return render(request, 'run/run_sql.html', {'form': form})
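# A minimal sketch of what RunSqlForm could look like, inferred only from the
# two cleaned_data keys the view above reads ('step_name' and 'sql'). The real
# form in the project may add validation, choices for step_name, or widgets.
from django import forms


class RunSqlForm(forms.Form):
    step_name = forms.CharField(max_length=100)
    sql = forms.CharField(widget=forms.Textarea)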
def debug_step(self, env: DataforjEnv, step: str):
    exec('from pyspark.sql import SparkSession')
    exec(env.spark_session_build())
    for step_name in self._graph:
        code = self._steps[step_name].compile()
        print(f'Running step [{step_name}]')
        exec(code)
        if step_name == step:
            exec(f'{step_name}_df.show()')
            break
def debug_step(dir: str, env_name: str, step: str):
    project_file_name = f'{dir}/dataforj.yaml'
    if not os.path.exists(project_file_name):
        raise Exception(
            f'There is no Dataforj project in the directory [{dir}]')
    else:
        with open(project_file_name, 'r+') as project_file:
            project_yaml = '\n'.join(project_file.readlines())
            env = DataforjEnv('flow.name', env_name)
            yaml_plus_vars = project_yaml \
                .format_map(env.env_config['dataflow-config'])
            flow = dataflow.from_yaml(yaml_plus_vars)
            flow.debug_step(env, step)
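# Hedged usage sketch: debugging a single step from a script or REPL. Assumes
# a dataforj.yaml in the current directory and a 'local' environment;
# 'customers_latest' is a step name borrowed from the example flow below.
debug_step('.', 'local', 'customers_latest')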
def open_flow(dir: str, env_name: str):
    ''' open a project in the directory provided '''
    file_name = f'{dir}/dataforj.yaml'
    if not os.path.exists(file_name):
        raise Exception('There is no Dataforj project in this directory')
    else:
        with open(file_name, 'r+') as f:
            yaml = '\n'.join(f.readlines())
            env = DataforjEnv('flow.name', env_name)
            yaml_plus_vars = yaml \
                .format_map(env.env_config['dataflow-config'])
            flow = dataflow.from_yaml(yaml_plus_vars)
            return flow
def dataframe_for_step(self, env: DataforjEnv, step: str):
    exec('from pyspark.sql import SparkSession')
    exec(env.spark_session_build())
    for step_name in self._graph:
        code = self._steps[step_name].compile()
        print(f'Running step [{step_name}]')
        exec(code)
        # Validate the schema of the resulting dataframe
        if self._steps[step_name].schema_location != '':
            step_schema_location = self._steps[step_name].schema_location
            print(f'Validating step [{step_name}] against the schema '
                  f'located in [{step_schema_location}].')
            # The df will be accessible through the local symbol table
            check_schema(step_name, locals()[f'{step_name}_df'],
                         step_schema_location)
        if step_name == step:
            return locals()[f'{step_name}_df']
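# Hedged usage sketch: previewing the first rows of one step outside the web
# UI, combining open_flow() and dataframe_for_step() as defined above. Assumes
# a dataforj.yaml in the current directory, a 'local' environment, and the
# 'customers_latest' step from the example flow.
flow = open_flow('.', 'local')
env = DataforjEnv(flow.name, 'local')
df = flow.dataframe_for_step(env, 'customers_latest')
print(df.schema.names)
print(df.rdd.map(lambda row: row.asDict(True)).take(10))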
def test_run(self):
    flow = Dataflow.from_python_objects(
        'example data',
        'description',
        [
            SourceStep(name='customers',
                       uri='example/data/customers.csv',
                       format_type='csv',
                       options={'header': 'true'}),
            SourceStep(name='products',
                       uri='example/data/products.csv',
                       format_type='csv',
                       options={'header': 'true'}),
            SourceStep(name='transactions',
                       uri='example/data/transactions.csv',
                       format_type='csv',
                       options={'header': 'true'}),
            SQLStep(name='customers_latest',
                    sql_file_path='example/sql/customers_latest.sql',
                    depends_on=['customers']),
            SQLStep(name='transactions_with_products',
                    depends_on=['products', 'transactions'],
                    sql_file_path='example/sql/transactions_with_products.sql'),  # noqa: E501
            SQLStep(name='result',
                    sql_file_path='example/sql/result.sql',
                    depends_on=['transactions_with_products',
                                'customers_latest']),
            PySparkStep(name='filter',
                        pyspark_file_path='example/pyspark/filter.py',
                        depends_on=['result'],
                        schema_location='example/schemas/filter_schema.yaml'),
            SinkStep(name='sink',
                     uri='example/data/result.csv',
                     format_type='csv',
                     options={'header': 'true'},
                     mode='overwrite',
                     depends_on=['filter'])
        ])
    env = DataforjEnv(flow.name, 'local')
    flow.run(env)
def run_basic_step(step_name: str, request):
    env = DataforjEnv(get_flow().name, 'local')
    error_message = None
    header_columns = None
    rows = None
    try:
        spark_df = get_flow().dataframe_for_step(env, step_name)
        header_columns = spark_df.schema.names
        rows = spark_df.rdd.map(lambda row: row.asDict(True)).take(10)
    except Exception as e:
        error_message = str(e)
    template = loader.get_template('run/run_step.html')
    context = {
        'error_message': error_message,
        'header_columns': header_columns,
        'rows': rows
    }
    return HttpResponse(template.render(context, request))
def run(self, env: DataforjEnv):
    exec("from pyspark.sql import SparkSession")
    exec(env.spark_session_build())
    for step_name in self._graph:
        code = self._steps[step_name].compile()
        print(f'Running step [{step_name}]')
        exec(code)
        # Validate the schema of the resulting dataframe
        if self._steps[step_name].schema_location != '':
            step_schema_location = self._steps[step_name].schema_location
            print(f'Validating step [{step_name}] against the schema '
                  f'located in [{step_schema_location}].')
            # The df will be accessible through the local symbol table
            check_schema(step_name, locals()[f'{step_name}_df'],
                         step_schema_location)
        for test_path in self._steps[step_name].data_quality_tests:
            test_code = dq_to_pyspark(step_name, test_path)
            # print(test_code)
            exec(test_code)
def unit_test(self, env: DataforjEnv, step: str = '__ALL__'):
    exec("from pyspark.sql import SparkSession")
    exec(env.spark_session_build())
    success_count = 0
    fail_count = 0
    for step_name in self._graph:
        code = self._steps[step_name].compile()
        print(f'Running step [{step_name}]')
        exec(code)
        if (step == '__ALL__' or step_name == step) and not \
                (isinstance(self._steps[step_name], SourceStep) or
                 isinstance(self._steps[step_name], SinkStep)):
            if len(self._steps[step_name].unit_tests) == 0:
                print(f'Step [{step_name}] has no unit tests')
            else:
                success_count_step = 0
                fail_count_step = 0
                for test_path in self._steps[step_name].unit_tests:
                    test_code = ut_to_pyspark(step_name, test_path)
                    try:
                        # print(test_code)
                        exec(test_code)
                        success_count_step = success_count_step + 1
                    except Exception as e:
                        print(f'Step [{step_name}] test [{test_path}] '
                              f'failed with error [{e}]')
                        fail_count_step = fail_count_step + 1
                print(f'For step [{step_name}] [{success_count_step}] '
                      f'tests passed and [{fail_count_step}] tests '
                      f'failed.')
                success_count = success_count + success_count_step
                fail_count = fail_count + fail_count_step
        if step_name == step:
            break
    print(f'Overall result: [{success_count}] '
          f'tests passed and [{fail_count}] tests '
          f'failed.')
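# Hedged usage sketch: running the unit tests for every step versus a single
# step. '__ALL__' is the default shown in the signature above; 'filter' is a
# step name taken from the example flow and may differ in your project.
flow = open_flow('.', 'local')
env = DataforjEnv(flow.name, 'local')
flow.unit_test(env)                  # test all non-source, non-sink steps
flow.unit_test(env, step='filter')   # test only the 'filter' step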