def test_get_pipeline_spec_from_config(self, mock_notebook_item):
    """Check the task spec produced by ``bq._get_pipeline_spec_from_config``.

    Each scenario feeds a pipeline config made of some combination of
    'input', 'transformation' and 'output' sections and compares the
    returned dict with the expected 'tasks' mapping.

    Args:
      mock_notebook_item: mock supplied by the caller (presumably a patch
        decorator outside this block -- confirm). Its return value is set
        to a Query wrapping 'foo_query_sql_string', which is the SQL the
        expected execute tasks carry.
    """
    mock_notebook_item.return_value = google.datalab.bigquery.Query(
        'foo_query_sql_string')

    # empty pipeline_spec
    with self.assertRaisesRegexp(Exception, 'Pipeline has no tasks to execute.'):
        bq._get_pipeline_spec_from_config({})

    # empty input, transformation, output as path
    pipeline_config = {
        'transformation': {
            'query': 'foo_query'
        },
        'output': {
            'path': 'foo_table'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_execute_task': {
                'sql': u'foo_query_sql_string',
                'type': 'pydatalab.bq.execute'
            },
            'bq_pipeline_extract_task': {
                'path': 'foo_table',
                'type': 'pydatalab.bq.extract',
                'up_stream': ['bq_pipeline_execute_task']
            }
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input as path, transformation, output as path
    pipeline_config = {
        'input': {
            'path': 'foo_path',
            'data_source': 'foo_data_source',
        },
        'transformation': {
            'query': 'foo_query'
        },
        'output': {
            'path': 'foo_table'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_execute_task': {
                'sql': u'foo_query_sql_string',
                'data_source': 'foo_data_source',
                'path': 'foo_path',
                'type': 'pydatalab.bq.execute'
            },
            'bq_pipeline_extract_task': {
                'path': 'foo_table',
                'type': 'pydatalab.bq.extract',
                'up_stream': ['bq_pipeline_execute_task']
            }
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input as path->table, transformation, output as path
    pipeline_config = {
        'input': {
            'path': 'foo_path',
            'table': 'foo_table_1'
        },
        'transformation': {
            'query': 'foo_query'
        },
        'output': {
            'path': 'foo_path_2'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_load_task': {
                'type': 'pydatalab.bq.load',
                'path': 'foo_path',
                'table': 'foo_table_1',
            },
            'bq_pipeline_execute_task': {
                'sql': u'foo_query_sql_string',
                'type': 'pydatalab.bq.execute',
                'up_stream': ['bq_pipeline_load_task'],
            },
            'bq_pipeline_extract_task': {
                'path': 'foo_path_2',
                'type': 'pydatalab.bq.extract',
                'up_stream': ['bq_pipeline_execute_task']
            }
        }
    }
    # This scenario previously built its config and expectation but never
    # compared them, so the load->execute->extract chain went unchecked.
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input as table, transformation, output as path
    pipeline_config = {
        'input': {
            'table': 'foo_table_1'
        },
        'transformation': {
            'query': 'foo_query'
        },
        'output': {
            'path': 'foo_path_2'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_execute_task': {
                'sql': u'foo_query_sql_string',
                'type': 'pydatalab.bq.execute',
            },
            'bq_pipeline_extract_task': {
                'path': 'foo_path_2',
                'type': 'pydatalab.bq.extract',
                'up_stream': ['bq_pipeline_execute_task']
            }
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input as table, transformation, output as table
    pipeline_config = {
        'input': {
            'table': 'foo_table_1'
        },
        'transformation': {
            'query': 'foo_query'
        },
        'output': {
            'table': 'foo_table_1'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_execute_task': {
                'sql': u'foo_query_sql_string',
                'type': 'pydatalab.bq.execute',
                'table': 'foo_table_1'
            },
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input as table, no transformation, output as path
    pipeline_config = {
        'input': {
            'table': 'foo_table'
        },
        'output': {
            'path': 'foo_path'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_extract_task': {
                'type': 'pydatalab.bq.extract',
                'path': 'foo_path',
                'table': 'foo_table'
            },
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # output only; this should be identical to the above
    pipeline_config = {
        'output': {
            'table': 'foo_table',
            'path': 'foo_path'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_extract_task': {
                'type': 'pydatalab.bq.extract',
                'path': 'foo_path',
                'table': 'foo_table'
            },
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input as path, no transformation, output as table
    pipeline_config = {
        'input': {
            'path': 'foo_path'
        },
        'output': {
            'table': 'foo_table'
        }
    }
    expected = {
        'tasks': {
            'bq_pipeline_load_task': {
                'type': 'pydatalab.bq.load',
                'path': 'foo_path',
                'table': 'foo_table'
            },
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # input only; this should be identical to the above, so `expected`
    # is deliberately reused
    pipeline_config = {
        'input': {
            'path': 'foo_path',
            'table': 'foo_table'
        },
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)

    # only transformation
    pipeline_config = {
        'transformation': {
            'query': 'foo_query'
        },
    }
    expected = {
        'tasks': {
            'bq_pipeline_execute_task': {
                'sql': u'foo_query_sql_string',
                'type': 'pydatalab.bq.execute',
            },
        }
    }
    self.assertDictEqual(
        bq._get_pipeline_spec_from_config(pipeline_config), expected)
def test_get_pipeline_spec_from_config(self, mock_notebook_item):
    """Exercise ``bq._get_pipeline_spec_from_config`` across config shapes.

    Every scenario passes a config built from 'input'/'load',
    'transformation', 'output'/'extract' and optionally 'parameters', then
    hands the result to ``self.assertPipelineConfigEquals`` together with
    the expected spec and the user parameters (``None`` when there are
    none).

    Args:
      mock_notebook_item: mock supplied by the caller; its return value is
        set here to a Query wrapping 'foo_query_sql_string'.
    """
    mock_notebook_item.return_value = google.datalab.bigquery.Query(
        'foo_query_sql_string')

    plain_sql = u'foo_query_sql_string'
    # SQL expected whenever the input section names the table foo_table_1.
    wrapped_sql = (u'WITH input AS (\n SELECT * FROM `foo_table_1`\n)\n\n'
                   u'foo_query_sql_string')
    xcom_table = """{{ ti.xcom_pull(task_ids='bq_pipeline_execute_task_id').get('table') }}"""

    def transformation():
        # Fresh dict per scenario, as in a literal written inline.
        return {'query': 'foo_query'}

    def execute_task(sql, **extra):
        task = {'sql': sql, 'type': 'pydatalab.bq.execute'}
        task.update(extra)
        return task

    def extract_after_execute(path):
        return {
            'table': xcom_table,
            'path': path,
            'type': 'pydatalab.bq.extract',
            'up_stream': ['bq_pipeline_execute_task'],
        }

    def load_task(path, table):
        return {'type': 'pydatalab.bq.load', 'path': path, 'table': table}

    def verify(config, spec, parameters=None):
        built = bq._get_pipeline_spec_from_config(config)
        self.assertPipelineConfigEquals(built, spec, parameters)

    # empty pipeline_spec
    with self.assertRaisesRegexp(Exception, 'Pipeline has no tasks to execute.'):
        bq._get_pipeline_spec_from_config({})

    # empty input, transformation, output as path
    verify(
        {
            'transformation': transformation(),
            'output': {'path': 'foo_table'},
        },
        {
            'tasks': {
                'bq_pipeline_execute_task': execute_task(plain_sql),
                'bq_pipeline_extract_task': extract_after_execute('foo_table'),
            }
        })

    # input as path, transformation, output as path
    verify(
        {
            'input': {'path': 'foo_path', 'data_source': 'foo_data_source'},
            'transformation': transformation(),
            'output': {'path': 'foo_table'},
        },
        {
            'tasks': {
                'bq_pipeline_execute_task': execute_task(
                    plain_sql, data_source='foo_data_source', path='foo_path'),
                'bq_pipeline_extract_task': extract_after_execute('foo_table'),
            }
        })

    # input as path->table, transformation, output as path
    verify(
        {
            'input': {'path': 'foo_path', 'table': 'foo_table_1'},
            'transformation': transformation(),
            'output': {'path': 'foo_path_2'},
        },
        {
            'tasks': {
                'bq_pipeline_load_task': load_task('foo_path', 'foo_table_1'),
                'bq_pipeline_execute_task': execute_task(
                    wrapped_sql, up_stream=['bq_pipeline_load_task']),
                'bq_pipeline_extract_task': extract_after_execute('foo_path_2'),
            }
        })

    # input as table, transformation, output as path
    verify(
        {
            'input': {'table': 'foo_table_1'},
            'transformation': transformation(),
            'output': {'path': 'foo_path_2'},
        },
        {
            'tasks': {
                'bq_pipeline_execute_task': execute_task(wrapped_sql),
                'bq_pipeline_extract_task': extract_after_execute('foo_path_2'),
            }
        })

    # input as table, transformation, output as table
    verify(
        {
            'input': {'table': 'foo_table_1'},
            'transformation': transformation(),
            'output': {'table': 'foo_table_1'},
        },
        {
            'tasks': {
                'bq_pipeline_execute_task': execute_task(
                    wrapped_sql, table='foo_table_1'),
            }
        })

    # input as table, no transformation, output as path
    extract_only_spec = {
        'tasks': {
            'bq_pipeline_extract_task': {
                'type': 'pydatalab.bq.extract',
                'path': 'foo_path',
                'table': 'foo_table',
            },
        }
    }
    verify(
        {'input': {'table': 'foo_table'}, 'output': {'path': 'foo_path'}},
        extract_only_spec)

    # output only; this should be identical to the above
    verify(
        {'output': {'table': 'foo_table', 'path': 'foo_path'}},
        extract_only_spec)

    # output can also be called extract, and it should be identical to the
    # above
    verify(
        {'extract': {'table': 'foo_table', 'path': 'foo_path'}},
        extract_only_spec)

    # input as path, no transformation, output as table
    load_only_spec = {
        'tasks': {
            'bq_pipeline_load_task': load_task('foo_path', 'foo_table'),
        }
    }
    verify(
        {'input': {'path': 'foo_path'}, 'output': {'table': 'foo_table'}},
        load_only_spec)

    # input only; this should be identical to the above
    verify(
        {'input': {'path': 'foo_path', 'table': 'foo_table'}},
        load_only_spec)

    # input can also be called load, and it should be identical to the above
    verify(
        {'load': {'path': 'foo_path', 'table': 'foo_table'}},
        load_only_spec)

    # only transformation
    verify(
        {'transformation': transformation()},
        {'tasks': {'bq_pipeline_execute_task': execute_task(plain_sql)}})

    # only transformation with parameters
    user_parameters = [
        {'name': 'foo1', 'value': 'foo1', 'type': 'STRING'},
        {'name': 'foo2', 'value': 'foo2', 'type': 'INTEGER'},
    ]
    verify(
        {'transformation': transformation(), 'parameters': user_parameters},
        {'tasks': {'bq_pipeline_execute_task': execute_task(plain_sql)}},
        user_parameters)
def test_get_pipeline_spec_from_config(self, mock_notebook_item):
    """Run ``bq._get_pipeline_spec_from_config`` over a table of scenarios.

    After checking that an empty config raises, each ``(config, expected,
    parameters)`` row is evaluated in order: the config is converted to a
    spec and passed to ``self.assertPipelineConfigEquals`` with the expected
    spec and the parameters (``None`` for all rows but the last).

    Args:
      mock_notebook_item: mock supplied by the caller; its return value is
        set here to a Query wrapping 'foo_query_sql_string'.
    """
    mock_notebook_item.return_value = google.datalab.bigquery.Query('foo_query_sql_string')

    # empty pipeline_spec
    with self.assertRaisesRegexp(Exception, 'Pipeline has no tasks to execute.'):
        bq._get_pipeline_spec_from_config({})

    sql_plain = u'foo_query_sql_string'
    sql_with_input = u'WITH input AS (\n SELECT * FROM `foo_table_1`\n)\n\nfoo_query_sql_string'
    table_from_execute = (
        """{{ ti.xcom_pull(task_ids='bq_pipeline_execute_task_id').get('table') }}""")

    # Expected specs that several consecutive scenarios share.
    spec_extract_only = {
        'tasks': {
            'bq_pipeline_extract_task': {
                'type': 'pydatalab.bq.extract',
                'path': 'foo_path',
                'table': 'foo_table',
            },
        },
    }
    spec_load_only = {
        'tasks': {
            'bq_pipeline_load_task': {
                'type': 'pydatalab.bq.load',
                'path': 'foo_path',
                'table': 'foo_table',
            },
        },
    }

    user_parameters = [
        {'name': 'foo1', 'value': 'foo1', 'type': 'STRING'},
        {'name': 'foo2', 'value': 'foo2', 'type': 'INTEGER'},
    ]

    scenarios = [
        # empty input, transformation, output as path
        (
            {
                'transformation': {'query': 'foo_query'},
                'output': {'path': 'foo_table'},
            },
            {
                'tasks': {
                    'bq_pipeline_execute_task': {
                        'sql': sql_plain,
                        'type': 'pydatalab.bq.execute',
                    },
                    'bq_pipeline_extract_task': {
                        'table': table_from_execute,
                        'path': 'foo_table',
                        'type': 'pydatalab.bq.extract',
                        'up_stream': ['bq_pipeline_execute_task'],
                    },
                },
            },
            None,
        ),
        # input as path, transformation, output as path
        (
            {
                'input': {
                    'path': 'foo_path',
                    'data_source': 'foo_data_source',
                },
                'transformation': {'query': 'foo_query'},
                'output': {'path': 'foo_table'},
            },
            {
                'tasks': {
                    'bq_pipeline_execute_task': {
                        'sql': sql_plain,
                        'data_source': 'foo_data_source',
                        'path': 'foo_path',
                        'type': 'pydatalab.bq.execute',
                    },
                    'bq_pipeline_extract_task': {
                        'table': table_from_execute,
                        'path': 'foo_table',
                        'type': 'pydatalab.bq.extract',
                        'up_stream': ['bq_pipeline_execute_task'],
                    },
                },
            },
            None,
        ),
        # input as path->table, transformation, output as path
        (
            {
                'input': {'path': 'foo_path', 'table': 'foo_table_1'},
                'transformation': {'query': 'foo_query'},
                'output': {'path': 'foo_path_2'},
            },
            {
                'tasks': {
                    'bq_pipeline_load_task': {
                        'type': 'pydatalab.bq.load',
                        'path': 'foo_path',
                        'table': 'foo_table_1',
                    },
                    'bq_pipeline_execute_task': {
                        'sql': sql_with_input,
                        'type': 'pydatalab.bq.execute',
                        'up_stream': ['bq_pipeline_load_task'],
                    },
                    'bq_pipeline_extract_task': {
                        'table': table_from_execute,
                        'path': 'foo_path_2',
                        'type': 'pydatalab.bq.extract',
                        'up_stream': ['bq_pipeline_execute_task'],
                    },
                },
            },
            None,
        ),
        # input as table, transformation, output as path
        (
            {
                'input': {'table': 'foo_table_1'},
                'transformation': {'query': 'foo_query'},
                'output': {'path': 'foo_path_2'},
            },
            {
                'tasks': {
                    'bq_pipeline_execute_task': {
                        'sql': sql_with_input,
                        'type': 'pydatalab.bq.execute',
                    },
                    'bq_pipeline_extract_task': {
                        'table': table_from_execute,
                        'path': 'foo_path_2',
                        'type': 'pydatalab.bq.extract',
                        'up_stream': ['bq_pipeline_execute_task'],
                    },
                },
            },
            None,
        ),
        # input as table, transformation, output as table
        (
            {
                'input': {'table': 'foo_table_1'},
                'transformation': {'query': 'foo_query'},
                'output': {'table': 'foo_table_1'},
            },
            {
                'tasks': {
                    'bq_pipeline_execute_task': {
                        'sql': sql_with_input,
                        'type': 'pydatalab.bq.execute',
                        'table': 'foo_table_1',
                    },
                },
            },
            None,
        ),
        # input as table, no transformation, output as path
        (
            {'input': {'table': 'foo_table'}, 'output': {'path': 'foo_path'}},
            spec_extract_only,
            None,
        ),
        # output only; this should be identical to the above
        (
            {'output': {'table': 'foo_table', 'path': 'foo_path'}},
            spec_extract_only,
            None,
        ),
        # output can also be called extract, and it should be identical to
        # the above
        (
            {'extract': {'table': 'foo_table', 'path': 'foo_path'}},
            spec_extract_only,
            None,
        ),
        # input as path, no transformation, output as table
        (
            {'input': {'path': 'foo_path'}, 'output': {'table': 'foo_table'}},
            spec_load_only,
            None,
        ),
        # input only; this should be identical to the above
        (
            {'input': {'path': 'foo_path', 'table': 'foo_table'}},
            spec_load_only,
            None,
        ),
        # input can also be called load, and it should be identical to the
        # above
        (
            {'load': {'path': 'foo_path', 'table': 'foo_table'}},
            spec_load_only,
            None,
        ),
        # only transformation
        (
            {'transformation': {'query': 'foo_query'}},
            {
                'tasks': {
                    'bq_pipeline_execute_task': {
                        'sql': sql_plain,
                        'type': 'pydatalab.bq.execute',
                    },
                },
            },
            None,
        ),
        # only transformation with parameters
        (
            {
                'transformation': {'query': 'foo_query'},
                'parameters': user_parameters,
            },
            {
                'tasks': {
                    'bq_pipeline_execute_task': {
                        'sql': sql_plain,
                        'type': 'pydatalab.bq.execute',
                    },
                },
            },
            user_parameters,
        ),
    ]

    for pipeline_config, expected, parameters in scenarios:
        actual = bq._get_pipeline_spec_from_config(pipeline_config)
        self.assertPipelineConfigEquals(actual, expected, parameters)