def setUp(self): logging.basicConfig(level=logging.DEBUG) self.bq_client = BigQueryExecutor() self.p_ex = pl.PipelineExecutor("tests/yaml/test_performance.yaml") self.p_ex.dataset_prefix = "1008_" add_dataset_prefix(self.p_ex.yaml, self.p_ex.dataset_prefix) self.bq_client.create_dataset(dataset_id=str(self.p_ex.dataset_prefix) + "test")
class TestPipelineDryRunWithArgs(unittest.TestCase):
    def setUp(self):
        self.bq_client = BigQueryExecutor()
        self.bq_client.create_dataset(
            dataset_id='4001_my_dataset_dry_run_with_args'
        )
        self.bq_client.create_dataset(
            dataset_id='4001_test'
        )
        self.p_ex = pl.PipelineExecutor(
            yaml_file="tests/yaml/test_dry_run_with_args.yaml",
            my_string_arg='one',
            my_dataset_arg='my_dataset_dry_run_with_args'
        )
        self.p_ex.dataset_prefix = "4001_"
        add_dataset_prefix(
            obj=self.p_ex.yaml,
            dataset_prefix=self.p_ex.dataset_prefix,
            kwargs={
                'my_string_arg': 'one',
                'my_dataset_arg': 'my_dataset_dry_run_with_args'
            }
        )

    def test_dry_run(self):
        self.p_ex.run()
        self.assertTrue(
            self.bq_client.table_exists(
                dataset_id='4001_my_dataset_dry_run_with_args',
                table_id='table1'
            )
        )

    def tearDown(self):
        self.p_ex.dry_run_clean()
def setUp(self): self.bq_client = BigQueryExecutor() self.p_ex = pl.PipelineExecutor("tests/yaml/unit_tests_fail.yaml") self.p_ex.dataset_prefix = "1001_" add_dataset_prefix(self.p_ex.yaml, self.p_ex.dataset_prefix) if self.bq_client.table_exists(dataset_id='reporting', table_id='out_product'): self.bq_client.delete_table(dataset_id='reporting', table_id='out_product') if self.bq_client.table_exists(dataset_id='reporting', table_id='out_saleorder'): self.bq_client.delete_table(dataset_id='reporting', table_id='out_saleorder')
class TestRunReleases(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.bq_client = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/test_run.yaml")

    def test_run_releases(self):
        self.p_ex.run_releases(release_date="2020-01-01")
        self.assertTrue(
            self.bq_client.table_exists(
                table_id="table1",
                dataset_id="test"
            ),
            "Table was not created"
        )

    def test_run_releases_with_dataset_prefix(self):
        self.p_ex.dataset_prefix = "2001_"
        self.p_ex.run_releases(release_date="2020-01-01")
        self.assertTrue(
            self.bq_client.table_exists(
                table_id="table1",
                dataset_id="2001_test"
            ),
            "Table was not created"
        )

    def tearDown(self):
        if self.bq_client.table_exists(table_id='table1', dataset_id='test'):
            self.bq_client.delete_table(table_id='table1', dataset_id='test')
        if self.bq_client.dataset_exists("2001_test"):
            self.bq_client.delete_dataset("2001_test", delete_contents=True)
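# For context, run_releases() (defined on PipelineExecutor below) compares release_date with
# each entry's 'date' in the YAML's 'releases' list and runs that entry's 'python_files'.
# A hypothetical parsed equivalent of such a YAML block (the actual content of
# tests/yaml/test_run.yaml is not shown in this section) might look like:
example_releases_yaml = {
    "releases": [
        {
            "date": "2020-01-01",
            "description": "create table1 in the test dataset",
            "python_files": ["tests/python/release_create_table1.py"],  # hypothetical path
        }
    ]
}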
class TestPipelinePerformance(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.bq_client = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/test_performance.yaml")
        self.p_ex.dataset_prefix = "1008_"
        add_dataset_prefix(self.p_ex.yaml, self.p_ex.dataset_prefix)
        self.bq_client.create_dataset(dataset_id=str(self.p_ex.dataset_prefix) + "test")

    def test_dry_pipeline_performance(self):
        start_time = time.time()
        self.p_ex.run()
        end_time = time.time()
        self.assertLess(end_time - start_time, 10, "pipeline performance is bad")

    def tearDown(self):
        self.p_ex.dry_run_clean()
class TestPipelineDryRun(unittest.TestCase):
    def setUp(self):
        self.bq_client = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/test_dry_run.yaml")
        self.p_ex.dataset_prefix = "1001_"
        add_dataset_prefix(self.p_ex.yaml, self.p_ex.dataset_prefix)
        self.bq_client.create_dataset(dataset_id=str(self.p_ex.dataset_prefix) + "test")

    def test_dry_run(self):
        self.p_ex.run()
        self.assertTrue(
            self.bq_client.table_exists(
                dataset_id='1001_test',
                table_id='table1'
            )
        )

    def tearDown(self):
        self.p_ex.dry_run_clean()
class TestPipelineCopyTableStructure(unittest.TestCase):
    def setUp(self):
        self.bq_client = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/unit_tests_fail.yaml")
        self.p_ex.dataset_prefix = "1001_"
        add_dataset_prefix(self.p_ex.yaml, self.p_ex.dataset_prefix)
        if self.bq_client.table_exists(dataset_id='reporting', table_id='out_product'):
            self.bq_client.delete_table(dataset_id='reporting', table_id='out_product')
        if self.bq_client.table_exists(dataset_id='reporting', table_id='out_saleorder'):
            self.bq_client.delete_table(dataset_id='reporting', table_id='out_saleorder')

    def test_copy_prod_structure(self):
        self.p_ex.copy_prod_structure(
            ['copper-actor-127213.reporting.out_product', 'reporting.out_saleorder']
        )
        self.assertTrue(
            self.bq_client.table_exists(dataset_id='1001_reporting', table_id='out_product')
            and self.bq_client.table_exists(dataset_id='1001_reporting', table_id='out_saleorder'),
            "not all table structures were copied"
        )
class TestPipelineUnitTestsErrorRaised(unittest.TestCase):
    def setUp(self):
        self.db = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/unit_tests_fail.yaml")

    def test_run_unit_tests_error(self):
        with self.assertRaises(AssertionError):
            self.p_ex.run_unit_tests()

    def tearDown(self):
        self.db.delete_table(
            table_id='table2',
            dataset_id='test'
        )
        self.db.delete_table(
            table_id='table3',
            dataset_id='test'
        )
        self.db.delete_table(
            table_id='table4',
            dataset_id='test'
        )
def setUp(self): self.bq_client = BigQueryExecutor() self.p_ex = pl.PipelineExecutor("tests/yaml/test_dry_run.yaml") self.p_ex.dataset_prefix = "1001_" add_dataset_prefix(self.p_ex.yaml, self.p_ex.dataset_prefix) self.bq_client.create_dataset(dataset_id=str(self.p_ex.dataset_prefix) + "test")
class PipelineExecutor:
    def __init__(self, yaml_file, dry_run=False, *args, **kwargs):
        self.kwargs = kwargs
        self.yaml = read_yaml_file(yaml_file)
        self.dataset_prefix = None
        if dry_run:
            self.dataset_prefix = f'{randint(1, 99999999):08}_'
            add_dataset_prefix(obj=self.yaml, dataset_prefix=self.dataset_prefix, kwargs=self.kwargs)
        self.bq = BigQueryExecutor()
        self.prod_project_id = bq_default_prod_project()

    def remove_dataset(self, dataset_id):
        if self.bq.dataset_exists(dataset_id):
            self.bq.delete_dataset(dataset_id, delete_contents=True)

    def dry_run_clean(self, table_list=''):
        if self.dataset_prefix is not None:
            if bq_default_project() != self.prod_project_id:
                args_dataset = []
                if table_list == '':
                    table_list = self.yaml.get('table_list', '')
                for table in table_list:
                    if table.count('.') == 1:
                        dataset_id = table.split(".")[0]
                    else:
                        dataset_id = table.split(".")[1]
                    dict_ = {
                        "dataset_id": dataset_id
                    }
                    apply_kwargs(dict_, self.kwargs)
                    args_dataset.append(dict_)
                for dataset in args_dataset:
                    value = dataset.get('dataset_id', '')
                    dataset['dataset_id'] = self.dataset_prefix + value
                args_dataset = [dict(t) for t in {tuple(d.items()) for d in args_dataset}]
                if args_dataset != []:
                    execute_parallel(
                        self.remove_dataset,
                        args_dataset,
                        message='delete dataset: ',
                        log='dataset_id'
                    )

    def create_tables(self, batch):
        args = []
        batch_content = batch.get('tables', '')
        args = extract_args(content=batch_content, to_extract='create_table', kwargs=self.kwargs)
        for a in args:
            apply_kwargs(a, self.kwargs)
            a.update({"dataset_prefix": self.dataset_prefix})
        if args != []:
            execute_parallel(
                self.bq.create_table,
                args,
                message='Creating table:',
                log='table_id'
            )

    def create_gs_tables(self, batch):
        args = []
        batch_content = batch.get('sheets', '')
        args = extract_args(content=batch_content, to_extract='create_gs_table', kwargs=self.kwargs)
        if args == []:
            raise Exception("create_gs_table in YAML file is not well defined")
        execute_parallel(
            self.bq.create_gs_table,
            args,
            message='Creating live Google Sheet connection table in BigQuery:',
            log='table_id'
        )

    def create_partition_tables(self, batch):
        args = []
        batch_content = batch.get('tables', '')
        args = extract_args(
            content=batch_content,
            to_extract='create_partition_table',
            kwargs=self.kwargs
        )
        for a in args:
            apply_kwargs(a, self.kwargs)
            a.update({"dataset_prefix": self.dataset_prefix})
        if args != []:
            execute_parallel(
                self.bq.create_partition_table,
                args,
                message='Creating partition table:',
                log='table_id'
            )

    def load_google_sheets(self, batch):
        args = []
        batch_content = batch.get('sheets', '')
        args = extract_args(batch_content, 'load_google_sheet')
        if args == []:
            raise Exception("load_google_sheet in yaml is not well defined")
        execute_parallel(
            self.bq.load_google_sheet,
            args,
            message='Loading table:',
            log='table_id'
        )

    def run_checks(self, batch):
        args, args_pk = [], []
        batch_content = batch.get('tables', '')
        args = extract_args(batch_content, 'create_table')
        args_pk = [x.get('pk', []) for x in batch_content]
        for a, b in zip(args, args_pk):
            a.update({
                "dataset_prefix": self.dataset_prefix,
                "primary_key": b
            })
        execute_parallel(
            self.bq.assert_unique,
            args,
            message='Run pk_check on:',
            log='table_id'
        )

    def run_batch(self, batch):
        """ Executes a batch. """
        if 'tables' in batch:
            if extract_args(batch['tables'], 'create_table'):
                self.create_tables(batch)
                self.run_checks(batch)
            if extract_args(batch['tables'], 'create_partition_table'):
                self.create_partition_tables(batch)
        if 'sheets' in batch:
            if extract_args(batch['sheets'], 'load_google_sheet'):
                self.load_google_sheets(batch)
            if extract_args(batch['sheets'], 'create_gs_table'):
                self.create_gs_tables(batch)

    def run_batches(self):
        batch_list = self.yaml.get('batches', '')
        for batch in batch_list:
            apply_kwargs(batch, self.kwargs)
            self.run_batch(batch)

    def run_python_file(self, python_file):
        # _dataset_prefix is unused in run_python_file() itself, but it makes
        # PipelineExecutor's dataset_prefix available to the release script, using:
        # from pygyver.etl.lib import get_dataset_prefix
        _dataset_prefix = self.dataset_prefix
        logging.info(f"Running {python_file}")
        module_name = PurePath(python_file).stem
        module_full_path = PurePath(os.getenv("PROJECT_ROOT")) / python_file
        spec = spec_from_file_location(module_name, module_full_path)
        module = module_from_spec(spec)
        spec.loader.exec_module(module)

    def run_releases(self, release_date=date.today().strftime("%Y-%m-%d")):
        release_list = self.yaml.get('releases', [])
        for release in release_list:
            if str(release.get('date', '')) == release_date:
                logging.info(f"Release {release_date}: {release.get('description', '')}")
                for python_file in release.get('python_files', []):
                    self.run_python_file(python_file)

    def run(self):
        self.run_releases()
        self.run_batches()

    def run_unit_tests(self, batch_list=None):
        batch_list = batch_list or self.yaml.get('batches', '')
        list_unit_test = extract_unit_tests(batch_list, self.kwargs)
        args = extract_unit_test_value(list_unit_test)
        if args != []:
            execute_parallel(
                self.bq.assert_acceptance,
                args,
                message='Asserting sql',
                log='file'
            )

    def copy_prod_structure(self, table_list=''):
        args, args_dataset, datasets = [], [], []
        if table_list == '':
            table_list = self.yaml.get('table_list', '')
        for table in table_list:
            if table.count('.') == 1:
                _dict = {
                    "source_project_id": self.prod_project_id,
                    "source_dataset_id": table.split(".")[0],
                    "source_table_id": table.split(".")[1],
                    "dest_dataset_id": self.dataset_prefix + table.split(".")[0],
                    "dest_table_id": table.split(".")[1]
                }
            else:
                _dict = {
                    "source_project_id": table.split(".")[0],
                    "source_dataset_id": table.split(".")[1],
                    "source_table_id": table.split(".")[2],
                    "dest_dataset_id": self.dataset_prefix + table.split(".")[1],
                    "dest_table_id": table.split(".")[2]
                }
            apply_kwargs(_dict, self.kwargs)
            args.append(_dict)
        # extract datasets from table_list
        for table in table_list:
            if table.count('.') == 1:
                datasets.append(self.dataset_prefix + table.split(".")[0])
            else:
                datasets.append(self.dataset_prefix + table.split(".")[1])
        for dataset in np.unique(datasets):
            _dict = {"dataset_id": dataset}
            apply_kwargs(_dict, self.kwargs)
            args_dataset.append(_dict)
        if args_dataset != []:
            execute_parallel(
                self.bq.create_dataset,
                args_dataset,
                message='create dataset for: ',
                log='dataset_id'
            )
        if args != []:
            execute_parallel(
                self.bq.copy_table_structure,
                args,
                message='copy table structure for: ',
                log='source_table_id'
            )

    def run_test(self):
        self.run_unit_tests()
def setUp(self): logging.basicConfig(level=logging.DEBUG) self.bq_client = BigQueryExecutor() self.p_ex = pl.PipelineExecutor("tests/yaml/test_run.yaml")
def setUp(self): self.bq_client = BigQueryExecutor() self.p_ex = pl.PipelineExecutor("tests/yaml/test_run.yaml") self.bq_client.create_dataset(dataset_id='test')
class TestPipelineExecutorRun(unittest.TestCase):
    def setUp(self):
        self.bq_client = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/test_run.yaml")
        self.bq_client.create_dataset(dataset_id='test')

    def test_run_completed_no_error(self):
        self.p_ex.run()
        self.assertTrue(
            self.bq_client.table_exists(
                table_id='ref_sheet1',
                dataset_id="test"
            ),
            "test.ref_sheet1 exists"
        )
        self.assertTrue(
            self.bq_client.table_exists(
                table_id='ref_sheet2',
                dataset_id="test"
            ),
            "test.ref_sheet2 exists"
        )
        self.assertTrue(
            self.bq_client.table_exists(
                table_id='gs_test_table1',
                dataset_id="test"
            ),
            "test.gs_test_table1 exists"
        )
        self.assertTrue(
            self.bq_client.table_exists(
                table_id='table1',
                dataset_id="test"
            ),
            "test.table1 exists"
        )
        self.assertTrue(
            self.bq_client.table_exists(
                table_id='table2',
                dataset_id="test"
            ),
            "test.table2 exists"
        )

    def tearDown(self):
        if self.bq_client.table_exists(table_id='table1', dataset_id='test'):
            self.bq_client.delete_table(table_id='table1', dataset_id='test')
        if self.bq_client.table_exists(table_id='table2', dataset_id='test'):
            self.bq_client.delete_table(table_id='table2', dataset_id='test')
        if self.bq_client.table_exists(table_id='ref_sheet1', dataset_id='test'):
            self.bq_client.delete_table(table_id='ref_sheet1', dataset_id='test')
        if self.bq_client.table_exists(table_id='ref_sheet2', dataset_id='test'):
            self.bq_client.delete_table(table_id='ref_sheet2', dataset_id='test')
        if self.bq_client.table_exists(table_id='gs_test_table1', dataset_id='test'):
            self.bq_client.delete_table(table_id='gs_test_table1', dataset_id='test')
class TestPipelineExecutorRunBatch(unittest.TestCase):
    def setUp(self):
        self.bq_exec = BigQueryExecutor()
        self.p_ex = pl.PipelineExecutor("tests/yaml/test_dummy.yaml")
        self.bq_client = bigquery.Client()

    def test_run_batch_create_tables(self):
        batch = {
            "desc": "create table1 & table2 in staging",
            "tables": [
                {
                    "table_desc": "table1",
                    "create_table": {
                        "table_id": "table1",
                        "dataset_id": "test",
                        "description": "some descriptive text here",
                        "file": "tests/sql/table1.sql"
                    },
                    "pk": ["col1", "col2"],
                    "mock_data": "sql/table1_mocked.sql"
                },
                {
                    "table_desc": "table2",
                    "create_table": {
                        "table_id": "table2",
                        "dataset_id": "test",
                        "file": "tests/sql/table2.sql"
                    },
                    "pk": ["col1", "col2"],
                    "mock_data": "sql/table1_mocked.sql"
                }
            ]
        }
        self.p_ex.run_batch(batch)
        self.assertTrue(
            self.bq_exec.table_exists(table_id='table1', dataset_id="test"),
            "Tables are created")
        table_ref = self.bq_exec.get_table_ref(dataset_id='test', table_id='table1', project_id=bq_default_project())
        table = self.bq_client.get_table(table_ref)  # API request
        self.assertTrue(
            table.description == "some descriptive text here",
            "The 'description' is not the same"
        )
        self.assertTrue(
            self.bq_exec.table_exists(table_id='table2', dataset_id="test"),
            "Tables are created")

    def test_run_batch_create_gs_tables(self):
        batch = {
            "desc": "load test spreadsheet into bigquery",
            "sheets": [
                {
                    "table_desc": "ref gs_test_table1",
                    "create_gs_table": {
                        "table_id": "gs_test_table1",
                        "dataset_id": "test",
                        "sheet_name": "input",
                        "googlesheet_uri": "https://docs.google.com/spreadsheets/d/19Jmapr9G1nrMcW2QTpY7sOvKYaFXnw5krK6dD0GwEqU/edit#gid=0"
                    }
                }
            ]
        }
        self.p_ex.create_gs_tables(batch)
        self.assertTrue(
            self.bq_exec.table_exists(table_id='gs_test_table1', dataset_id="test"),
            "gs_test_table1 does NOT exist")

    def test_run_batch_load_google_sheets(self):
        batch = {
            "desc": "load test spreadsheet into bigquery",
            "sheets": [
                {
                    "table_desc": "ref sheet1",
                    "load_google_sheet": {
                        "table_id": "ref_sheet1",
                        "dataset_id": "test",
                        "sheet_name": "input",
                        "googlesheet_uri": "https://docs.google.com/spreadsheets/d/19Jmapr9G1nrMcW2QTpY7sOvKYaFXnw5krK6dD0GwEqU/edit#gid=0"
                    }
                },
                {
                    "table_desc": "ref sheet2",
                    "load_google_sheet": {
                        "table_id": "ref_sheet2",
                        "dataset_id": "test",
                        "sheet_name": "input",
                        "description": "foo bar",
                        "googlesheet_uri": "https://docs.google.com/spreadsheets/d/19Jmapr9G1nrMcW2QTpY7sOvKYaFXnw5krK6dD0GwEqU/edit#gid=0"
                    }
                }
            ]
        }
        self.p_ex.load_google_sheets(batch)
        self.assertTrue(
            self.bq_exec.table_exists(table_id='ref_sheet1', dataset_id="test"),
            "ref_sheet1 does NOT exist")
        self.assertTrue(
            self.bq_exec.table_exists(table_id='ref_sheet2', dataset_id="test"),
            "ref_sheet2 does NOT exist")
        table_ref = self.bq_exec.get_table_ref(dataset_id='test', table_id='ref_sheet2', project_id=bq_default_project())
        table = self.bq_client.get_table(table_ref)  # API request
        self.assertTrue(
            table.description == "foo bar",
            "The 'description' is not the same"
        )

    def test_run_batch_create_partition_tables(self):
        # A table must be created first; only then can it be partitioned
        batch = {
            "desc": "create partition_table1",
            "tables": [
                {
                    "table_desc": "creating table",
                    "create_table": {
                        "table_id": "partition_table1",
                        "dataset_id": "test",
                        "file": "tests/sql/table1.sql"
                    },
                    "pk": ["col1", "col2"],
                    "mock_data": "sql/table1_mocked.sql"
                },
                {
                    "table_desc": "creating partition table",
                    "create_partition_table": {
                        "table_id": "partition_table1",
                        "dataset_id": "test",
                        "description": "some descriptive text here",
                        "file": "tests/sql/table1.sql",
                        "partition_dates": []
                    },
                    "pk": ["col1", "col2"],
                    "mock_data": "sql/table1_mocked.sql"
                }
            ]
        }
        self.p_ex.run_batch(batch)
        self.assertTrue(
            self.bq_exec.table_exists(table_id='partition_table1', dataset_id="test"),
            "Partition table is created")
        table_ref = self.bq_exec.get_table_ref(dataset_id='test', table_id='partition_table1', project_id=bq_default_project())
        table = self.bq_client.get_table(table_ref)  # API request
        self.assertTrue(
            table.description == "some descriptive text here",
            "The 'description' is not the same"
        )

    def tearDown(self):
        if self.bq_exec.table_exists(table_id='table1', dataset_id='test'):
            self.bq_exec.delete_table(table_id='table1', dataset_id='test')
        if self.bq_exec.table_exists(table_id='table2', dataset_id='test'):
            self.bq_exec.delete_table(table_id='table2', dataset_id='test')
        if self.bq_exec.table_exists(table_id='test_run_batch_table_1', dataset_id='test'):
            self.bq_exec.delete_table(table_id='test_run_batch_table_1', dataset_id='test')
        if self.bq_exec.table_exists(table_id='test_run_batch_table_2', dataset_id='test'):
            self.bq_exec.delete_table(table_id='test_run_batch_table_2', dataset_id='test')
        if self.bq_exec.table_exists(table_id='gs_test_table1', dataset_id='test'):
            self.bq_exec.delete_table(table_id='gs_test_table1', dataset_id='test')
        if self.bq_exec.table_exists(table_id='ref_sheet1', dataset_id='test'):
            self.bq_exec.delete_table(table_id='ref_sheet1', dataset_id='test')
        if self.bq_exec.table_exists(table_id='ref_sheet2', dataset_id='test'):
            self.bq_exec.delete_table(table_id='ref_sheet2', dataset_id='test')
        if self.bq_exec.table_exists(table_id='partition_table1', dataset_id='test'):
            self.bq_exec.delete_table(table_id='partition_table1', dataset_id='test')
""" Facebook Tests """ import unittest import pandas as pd import mock from pandas.testing import assert_series_equal from pandas.testing import assert_frame_equal from pygyver.etl.facebook import transform_campaign_budget from pygyver.etl.facebook import build_predicted_revenue_events, calculate_batches, \ split_events_to_batches, FacebookExecutor from pygyver.etl.dw import BigQueryExecutor from pygyver.etl.dw import read_sql from facebook_business.adobjects.serverside.event_request import EventResponse from facebook_business.exceptions import FacebookRequestError db = BigQueryExecutor() error_json = { "error": { "fbtrace_id": "test_fb_trace_id", "message": "Some generic message", "error_user_msg": "A more detailed message" } } context = mock.Mock().files = {'test': 'test'} def get_predicted_revenue_mock(): sql = read_sql(file='tests/sql/unit_predicted_revenue_mocked.sql')
from pygyver.etl.lib import get_dataset_prefix
from pygyver.etl.dw import BigQueryExecutor

dataset_prefix = get_dataset_prefix()
target_dataset_id = (dataset_prefix if dataset_prefix else "") + "test"

bq = BigQueryExecutor()
bq.create_dataset(target_dataset_id)
bq.create_table(
    dataset_id=target_dataset_id,
    table_id="table1",
    file="tests/sql/table1.sql"
)
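# Note: a script like the one above is what a release's 'python_files' entry points to.
# PipelineExecutor.run_python_file() imports and executes it via importlib, and
# get_dataset_prefix() picks up the executor's dataset_prefix when one is set, so during a
# prefixed run (e.g. dataset_prefix = "2001_") the table lands in "2001_test" rather than "test".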