def test_to_dict():
    """A pipeline serialized with .dict() should round-trip through the constructor."""
    pipeline = Pipeline(
        steps=[
            DomainStep(name='domain', domain='foobar'),
            RollupStep(
                name='rollup',
                hierarchy=['a', 'b'],
                aggregations=[
                    Aggregation(newcolumns=['a'], aggfunction='sum', columns=['a'])
                ],
            ),
        ]
    )
    serialized = pipeline.dict()
    # Aliased fields (newcolumns/aggfunction) must serialize under their
    # snake_case names.
    assert serialized == {
        'steps': [
            {'name': 'domain', 'domain': 'foobar'},
            {
                'name': 'rollup',
                'hierarchy': ['a', 'b'],
                'aggregations': [
                    {'new_columns': ['a'], 'agg_function': 'sum', 'columns': ['a']}
                ],
            },
        ]
    }
    # Feeding the serialized form back must rebuild an equivalent pipeline.
    assert pipeline == Pipeline(**pipeline.dict())
def test_errors(pipeline_executor):
    """
    It should provide helpful information when the pipeline execution fails,
    such as:
    - the step that encountered an error (nth and type)
    - the original exception message
    """
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'delete', 'columns': ['columnThatDoesNotExist', 'whatever']},
    ]
    with pytest.raises(PipelineExecutionFailure) as excinfo:
        pipeline_executor.execute_pipeline(Pipeline(steps=steps))

    msg = excinfo.value.message
    # The message must point at the failing step and echo the bad columns.
    for fragment in ('Step #2', 'delete', 'columnThatDoesNotExist', 'whatever'):
        assert fragment in msg
    # Structured details carry the 0-based step index and the same message.
    assert excinfo.value.details['index'] == 1
    assert excinfo.value.details['message'] == msg
def test_errors(pipeline_translator, mocker):
    """
    It should provide helpful information when the pipeline translation fails,
    such as:
    - the step that encountered an error (nth and type)
    - the original exception message
    """
    # Force the filter translation to blow up with a known message.
    mocker.patch(
        'weaverbird.backends.sql_translator.steps.filter.apply_condition',
        side_effect=Exception('comparison not implemented'),
    )
    steps = [
        {
            'name': 'filter',
            'condition': {'column': 'title', 'operator': 'eq'},
        },
    ]
    with pytest.raises(SQLPipelineTranslationFailure) as trslinfo:
        pipeline_translator(Pipeline(steps=steps))

    msg = trslinfo.value.message
    # The message must point at the failing step and relay the original error.
    for fragment in ('Step #1', 'filter', 'comparison'):
        assert fragment in msg
    assert trslinfo.value.details['index'] == 0
    assert trslinfo.value.details['message'] == msg
def test_preview_pipeline(mocker: MockFixture, pipeline_executor):
    """Previewing a pipeline should return paginated JSON rows plus a schema."""
    to_json_spy = mocker.spy(pd.DataFrame, 'to_json')
    preview = pipeline_executor.preview_pipeline(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])
    )
    result = json.loads(preview)

    assert 'data' in result
    assert len(result['data']) == 3  # rows
    assert len(result['data'][0]) == 3  # columns
    assert result['schema']['fields'] == [
        {'name': 'colA', 'type': 'string'},
        {'name': 'colB', 'type': 'integer'},
        {'name': 'colC', 'type': 'integer'},
    ]
    # Default pagination values.
    assert result['offset'] == 0
    assert result['limit'] == 50
    assert result['total'] == 3
    # DataFrames must be exported with pandas' method to ensure NaN and dates
    # are correctly converted
    to_json_spy.assert_called_once()
def test_filter(pipeline_executor):
    """A filter step should keep only the rows matching its condition."""
    filter_step = {
        'name': 'filter',
        'condition': {'column': 'colA', 'operator': 'eq', 'value': 'tutu'},
    }
    df, _ = pipeline_executor.execute_pipeline(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}, filter_step])
    )
    expected = pd.DataFrame({'colA': ['tutu'], 'colB': [2], 'colC': [50]})
    assert_dataframes_equals(df, expected)
def test_extract_domain(pipeline_executor: PipelineExecutor):
    """A lone domain step should yield the raw domain data untouched."""
    domain_step = {'name': 'domain', 'domain': 'domain_a'}
    df, _ = pipeline_executor.execute_pipeline(Pipeline(steps=[domain_step]))
    assert_dataframes_equals(df, pd.DataFrame(df_domain_a))
def test_report(pipeline_translator):
    """The translation report should contain one entry per pipeline step."""
    steps = [{'name': 'domain', 'domain': 'domain_a'}]
    _, report = pipeline_translator(Pipeline(steps=steps))
    # there should be one step_report per step in the pipeline
    assert len(report.sql_steps_translation_reports) == len(steps)
def execute_pipeline(pipeline_steps, **kwargs) -> str:
    """Build an executor over the registered DOMAINS and preview the pipeline."""
    executor = PipelineExecutor(lambda domain: DOMAINS[domain])
    # Url parameters are only strings, these two must be understood as numbers
    for numeric_param in ('limit', 'offset'):
        if numeric_param in kwargs:
            kwargs[numeric_param] = int(kwargs[numeric_param])
    return executor.preview_pipeline(pipeline=Pipeline(steps=pipeline_steps), **kwargs)
def resolve_pipeline_for_combination(
    pipeline: PipelineOrDomainName,
    domain_retriever: DomainRetriever,
    pipeline_executor: PipelineExecutor,
) -> DataFrame:
    """
    Combined pipelines can be either single domains (str), or complete pipeline (list of steps)
    """
    # Imported lazily to avoid a circular import with weaverbird.pipeline.
    from weaverbird.pipeline import Pipeline

    # A bare string designates a domain name: fetch its data directly.
    if isinstance(pipeline, str):
        return domain_retriever(pipeline)
    # Otherwise it is a list of steps: execute it as a full pipeline.
    return pipeline_executor(Pipeline(steps=pipeline))
def test_report(pipeline_executor):
    """The execution report should contain one entry per pipeline step."""
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'rename', 'toRename': [['colA', 'col_a'], ['colB', 'col_b']]},
    ]
    _, report = pipeline_executor.execute_pipeline(Pipeline(steps=steps))
    # there should be one step_report per step in the pipeline
    assert len(report.steps_reports) == len(steps)
def execute_pipeline(pipeline_steps, **kwargs) -> str:
    """Validate the steps, coerce numeric url params, and preview the pipeline."""
    # Validation
    pipeline = Pipeline(steps=pipeline_steps)
    # Url parameters are only strings, these two must be understood as numbers
    for numeric_param in ('limit', 'offset'):
        if numeric_param in kwargs:
            kwargs[numeric_param] = int(kwargs[numeric_param])
    return pandas_preview_pipeline(
        pipeline=pipeline,
        domain_retriever=lambda domain: DOMAINS[domain],
        **kwargs,
    )
def test_preview_pipeline_limit(pipeline_executor):
    """Previewing with limit=1 should return only the first row."""
    preview = pipeline_executor.preview_pipeline(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}]),
        limit=1,
    )
    # first row of the data frame
    assert json.loads(preview)['data'] == [{'colA': 'toto', 'colB': 1, 'colC': 100}]
def test_translation_pipeline(pipeline_translator):
    """A domain + filter pipeline should translate into one CTE-chained query.

    Fix: the ``mocker`` fixture parameter was requested but never used in the
    body; it has been removed so pytest no longer sets up an unneeded fixture.
    """
    query_string, _ = pipeline_translator(
        Pipeline(
            steps=[
                {'name': 'domain', 'domain': 'domain_a'},
                {
                    'name': 'filter',
                    'condition': {'column': 'title', 'operator': 'isnull'},
                },
            ]
        )
    )
    # Each step becomes a CTE; the final SELECT reads from the last one.
    assert (
        query_string
        == 'WITH SELECT_STEP_0 AS (SELECT title FROM books), FILTER_STEP_1 AS (SELECT * FROM SELECT_STEP_0 WHERE title IS NULL) SELECT title FROM FILTER_STEP_1'
    )
def test_preview_pipeline_limit_offset(pipeline_executor):
    """Previewing with limit and offset should slice the rows accordingly."""
    preview = pipeline_executor.preview_pipeline(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}]),
        limit=3,
        offset=2,
    )
    # third row of the data frame; no other row after that one
    assert json.loads(preview)['data'] == [{'colA': 'tata', 'colB': 3, 'colC': 25}]
def test_rename(pipeline_executor):
    """A rename step should rename the listed columns and keep the others."""
    rename_step = {'name': 'rename', 'toRename': [['colA', 'col_a'], ['colB', 'col_b']]}
    df, _ = pipeline_executor.execute_pipeline(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}, rename_step])
    )
    expected = pd.DataFrame(
        {
            'col_a': ['toto', 'tutu', 'tata'],
            'col_b': [1, 2, 3],
            'colC': [100, 50, 25],
        }
    )
    assert_dataframes_equals(df, expected)
def test_pandas_execute_pipeline(case_id, case_spec_file_path):
    """Run the fixture-described step against its input and compare results.

    Fix: the spec file was opened without a context manager, leaking the file
    handle if JSON parsing raised; it now uses ``with`` + ``json.load``.
    """
    # Ensure the handle is closed even if parsing fails.
    with open(case_spec_file_path, 'r') as spec_file:
        spec = json.load(spec_file)

    df_in = pd.read_json(json.dumps(spec['input']), orient='table')
    df_out = pd.read_json(json.dumps(spec['expected']), orient='table')
    # Extra domains some steps (e.g. joins) need, keyed by domain name.
    dfs_in_others = {
        k: pd.read_json(json.dumps(v), orient='table')
        for (k, v) in spec.get('other_inputs', {}).items()
    }

    pipeline = Pipeline(steps=[{'name': 'domain', 'domain': 'in'}, spec['step']])
    domains = {'in': df_in, **dfs_in_others}
    result = execute_pipeline(pipeline, domain_retriever=lambda x: domains[x])[0]
    assert_dataframes_equals(df_out, result)
def test_extract_query(pipeline_translator):
    """A single domain step should translate to a trivial CTE-based select."""
    query, _ = pipeline_translator(
        Pipeline(steps=[{'name': 'domain', 'domain': 'domain_a'}])
    )
    assert query == 'WITH SELECT_STEP_0 AS (SELECT title FROM books) SELECT title FROM SELECT_STEP_0'
context: Dict expected_result: List def get_render_variables_test_cases(): test_cases = [] globs = glob.glob('./tests/fixtures/fixtures_templating/*.json') for file in globs: with open(file) as json_file: file_content = json.load(json_file) for test in file_content: case = Case(filename=file, data=test[0], context=test[1], expected_result=test[2]) test_cases.append(case) return test_cases cases = get_render_variables_test_cases() ids = map(lambda x: x.filename, cases) @pytest.mark.parametrize('case', cases, ids=ids) def test_step_with_variables(case: Case): pipeline_with_variables = PipelineWithVariables(**case.data) pipeline = pipeline_with_variables.render( case.context, renderer=nosql_apply_parameters_to_query ) expected_result = Pipeline(steps=case.expected_result) assert pipeline == expected_result