class DataWorkspaceCatalogueItemsPipeline(_DataWorkspacePipeline):
    """Pipeline meta object for data workspace catalogue items data."""

    source_url = f'{config.DATA_WORKSPACE_BASE_URL}/api/v1/dataset/catalogue-items'
    table_config = TableConfig(
        table_name='dataworkspace__catalogue_items',
        field_mapping=[
            ('purpose', sa.Column('purpose', sa.Text, nullable=False)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('name', sa.Column('name', sa.Text, nullable=False, index=True)),
            (
                'short_description',
                sa.Column('short_description', sa.Text, nullable=False),
            ),
            ('description', sa.Column('description', sa.Text)),
            ('published', sa.Column('published', sa.Boolean, nullable=False)),
            ('created_date', sa.Column('created_date', sa.DateTime, nullable=False)),
            ('published_at', sa.Column('published_at', sa.Date, index=True)),
            (
                'information_asset_owner',
                sa.Column('information_asset_owner', sa.Integer),
            ),
            (
                'information_asset_manager',
                sa.Column('information_asset_manager', sa.Integer),
            ),
            ('enquiries_contact', sa.Column('enquiries_contact', sa.Integer)),
            ('source_tags', sa.Column('source_tags', sa.ARRAY(sa.Text))),
            ('licence', sa.Column('licence', sa.Text)),
            ('personal_data', sa.Column('personal_data', sa.Text)),
            ('retention_policy', sa.Column('retention_policy', sa.Text)),
            (
                'eligibility_criteria',
                sa.Column('eligibility_criteria', sa.ARRAY(sa.Text)),
            ),
            ('slug', sa.Column('slug', sa.Text)),
            (
                'source_tables',
                TableConfig(
                    table_name='dataworkspace__source_tables',
                    transforms=[
                        # Copy the parent catalogue item's id onto each child row.
                        lambda record, table_config, contexts: {
                            **record,
                            'dataset_id': contexts[0]['id'],
                        }
                    ],
                    field_mapping=[
                        ('id', sa.Column('id', UUID, primary_key=True)),
                        ('dataset_id', sa.Column('dataset_id', UUID, nullable=False)),
                        ('name', sa.Column('name', sa.Text, nullable=False)),
                        ('schema', sa.Column('schema', sa.Text, nullable=False)),
                        ('table', sa.Column('table', sa.Text, nullable=False)),
                    ],
                ),
            ),
        ],
    )
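# A minimal sketch (not part of the pipeline; `run_transform` and the sample
# values are hypothetical) of how the nested 'source_tables' transform above
# is assumed to be applied: each child record is passed to the lambda along
# with its chain of parent records as `contexts`, so the parent catalogue
# item's id can be stamped onto the child row.
def run_transform(transform, record, contexts):
    return transform(record, None, contexts)

parent = {'id': 'aaaaaaaa-0000-0000-0000-000000000000'}
child = {'id': 'bbbbbbbb-1111-1111-1111-111111111111', 'name': 'foo', 'schema': 'dit', 'table': 'foo'}
nested_transform = lambda record, table_config, contexts: {
    **record,
    'dataset_id': contexts[0]['id'],
}
assert run_transform(nested_transform, child, [parent])['dataset_id'] == parent['id']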
def test_columns_property_only_returns_immediate_columns(self):
    id_col = ("id", Column("id", Integer))
    name_col = ("name", Column("name", String))
    single_table_config = TableConfig(
        table_name="test",
        field_mapping=[id_col],
    )
    nested_table_config = TableConfig(
        table_name="test",
        field_mapping=[
            id_col,
            (
                "relation",
                TableConfig(
                    table_name="relation",
                    field_mapping=[("foo", Column("foo", String))],
                ),
            ),
            name_col,
        ],
    )

    assert single_table_config.columns == [id_col]
    assert nested_table_config.columns == [id_col, name_col]
class TagsClassifierTrainPipeline(_PipelineDAG):
    schedule_interval = (
        None  # For now we trigger the pipeline manually when training data is uploaded
    )
    table_config = TableConfig(
        table_name='interactions_tags_classifier_metrics',
        field_mapping=[
            ("model_version", sa.Column("model_version", sa.Text)),
            ("model_for_tag", sa.Column("model_for_tag", sa.Text)),
            ("size", sa.Column("size", sa.Integer)),
            ("precisions", sa.Column("precisions", sa.Numeric)),
            ("recalls", sa.Column("recalls", sa.Numeric)),
            ("f1", sa.Column("f1", sa.Numeric)),
            ("accuracy", sa.Column("accuracy", sa.Numeric)),
            ("auc", sa.Column("auc", sa.Numeric)),
        ],
    )

    def get_fetch_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id='train-model',
            python_callable=model_training_with_labelled_data,
            queue='tensorflow',
            provide_context=True,
            op_args=[self.table_config.table_name],
            retries=self.fetch_retries,
        )
def test_insert_data_into_db_using_db_config(mocker, mock_db_conn, s3):
    s3.iter_keys.return_value = [
        ('1', [{"id": 1, "extra": "ignored", "data": "text"}]),
        ('2', [{"id": 2}]),
    ]
    table_config = TableConfig(
        table_name="my-table",
        field_mapping=(
            ("id", sqlalchemy.Column("id", sqlalchemy.Integer(), nullable=False)),
            ("data", sqlalchemy.Column("data", sqlalchemy.String())),
        ),
    )
    mock_table = mock.Mock()
    mocker.patch.object(table_config, '_table', mock_table)
    mocker.patch.object(table_config, '_temp_table', mock_table)

    db_tables.insert_data_into_db(
        target_db="test-db",
        table_config=table_config,
        ts_nodash="123",
    )

    mock_db_conn.execute.assert_has_calls(
        [
            mock.call(mock_table.insert(), data="text", id=1),
            mock.call(mock_table.insert(), data=None, id=2),
        ]
    )
    s3.iter_keys.assert_called_once_with()
class CoronavirusInteractionsDashboardPipeline(_SQLPipelineDAG):
    start_date = datetime(2020, 3, 25)
    dependencies = [
        InteractionsDatasetPipeline,
        AdvisersDatasetPipeline,
        TeamsDatasetPipeline,
        ContactsDatasetPipeline,
    ]
    table_config = TableConfig(
        table_name="coronavirus_interactions_dashboard_data",
        field_mapping=[
            (None, sa.Column("id", sa.Integer, primary_key=True, autoincrement=True)),
            ("interaction_date", sa.Column("interaction_date", sa.Date)),
            ("company_name", sa.Column("company_name", sa.Text)),
            ("company_country", sa.Column("company_country", sa.Text)),
            ("company_link", sa.Column("company_link", sa.Text)),
            ("company_sector", sa.Column("company_sector", sa.Text)),
            ("company_region", sa.Column("company_region", sa.Text)),
            ("subject_of_interaction", sa.Column("subject_of_interaction", sa.Text)),
            ("data_hub_link", sa.Column("data_hub_link", sa.Text)),
            ("team", sa.Column("team", sa.Text)),
            ("role", sa.Column("role", sa.Text)),
            ("policy_areas", sa.Column("policy_areas", sa.ARRAY(sa.Text))),
            ("entered_into_data_hub", sa.Column("entered_into_data_hub", sa.DateTime)),
        ],
    )
    query = '''
class FDIDashboardPipeline(_SQLPipelineDAG):
    dependencies = [CompaniesDatasetPipeline, InvestmentProjectsDatasetPipeline]
    start_date = datetime(2020, 3, 3)
    table_config = TableConfig(
        table_name="fdi_dashboard_data",
        field_mapping=[
            ('fdi_value', sa.Column('fdi_value', sa.Text)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            (
                'investor_company_country',
                sa.Column('investor_company_country', sa.String),
            ),
            ('number_new_jobs', sa.Column('number_new_jobs', sa.Integer)),
            (
                'number_safeguarded_jobs',
                sa.Column('number_safeguarded_jobs', sa.Integer),
            ),
            ('overseas_region', sa.Column('overseas_region', sa.String)),
            ('project_end_date', sa.Column('project_end_date', sa.Date)),
            ('project_link', sa.Column('project_link', sa.String)),
            ('project_reference', sa.Column('project_reference', sa.String)),
            ('project_sector', sa.Column('project_sector', sa.String)),
            ('sector_cluster', sa.Column('sector_cluster', sa.String)),
            ('stage', sa.Column('stage', sa.String)),
            ('status', sa.Column('status', sa.String)),
            ('total_investment', sa.Column('total_investment', sa.Numeric)),
        ],
    )
    query = '''
class HMRCNonEUExports(_HMRCPipeline):
    base_filename = "smke19"
    records_start_year = 2009
    num_csv_fields = (22, 23)
    table_config = TableConfig(
        schema="hmrc",
        table_name="non_eu_exports",
        # https://www.uktradeinfo.com/Statistics/Documents/Data%20Downloads/Tech_Spec_SMKE19.DOC
        field_mapping=[
            (None, sa.Column("id", sa.Integer, primary_key=True, autoincrement=True)),
            (0, sa.Column("comcode", sa.String(9))),
            (1, sa.Column("sitc", sa.String(5))),
            (2, sa.Column("record_type", sa.String(3))),
            (3, sa.Column("cod_sequence", sa.String(3))),
            (4, sa.Column("cod_alpha", sa.String(2))),
            (5, sa.Column("account_mmyy", sa.String(7))),
            (6, sa.Column("port_sequence", sa.String(3))),
            (7, sa.Column("port_alpha", sa.String(3))),
            (8, sa.Column("flag_sequence", sa.String(3))),
            (9, sa.Column("flag_alpha", sa.String(2))),
            (10, sa.Column("trade_indicator", sa.String(1))),
            (11, sa.Column("container", sa.String(3))),
            (12, sa.Column("mode_of_transport", sa.String(3))),
            (13, sa.Column("inland_mot", sa.String(2))),
            (14, sa.Column("golo_sequence", sa.String(3))),
            (15, sa.Column("golo_alpha", sa.String(3))),
            (16, sa.Column("suite_indicator", sa.String(3))),
            (17, sa.Column("procedure_code", sa.String(3))),
            (18, sa.Column("value", sa.BigInteger)),
            (19, sa.Column("quantity_1", sa.BigInteger)),
            (20, sa.Column("quantity_2", sa.BigInteger)),
            (21, sa.Column("industrial_plant_comcode", sa.String(15))),
            (22, sa.Column("_source_name", sa.String())),
        ],
    )
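# A minimal sketch (not the pipeline's real CSV loader) of how the integer
# keys above are presumed to work: each key indexes into a parsed CSV row,
# and a None key means the column (the autoincrement "id" here) takes no
# value from the source. The example row below is made up and truncated.
row = ["010121000", "00111", "000", "003", "FR"]
field_mapping = [(0, "comcode"), (1, "sitc"), (4, "cod_alpha")]
extracted = {name: row[index] for index, name in field_mapping}
assert extracted == {"comcode": "010121000", "sitc": "00111", "cod_alpha": "FR"}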
class HMRCEUImports(_HMRCPipeline):
    base_filename = "smkm46"
    records_start_year = 2009
    num_csv_fields = (17, 18)
    table_config = TableConfig(
        schema="hmrc",
        table_name="eu_imports",
        # https://www.uktradeinfo.com/Statistics/Documents/Data%20Downloads/Tech_Spec_SMKX46.DOC
        field_mapping=[
            (None, sa.Column("id", sa.Integer, primary_key=True, autoincrement=True)),
            (0, sa.Column("comcode", sa.String(9))),
            (1, sa.Column("record_type", sa.String(3))),
            (2, sa.Column("cod_sequence", sa.String(3))),
            (3, sa.Column("cod_alpha", sa.String(2))),
            (4, sa.Column("trade_indicator", sa.String(1))),
            (5, sa.Column("coo_seq", sa.String(3))),
            (6, sa.Column("coo_alpha", sa.String(2))),
            (7, sa.Column("nature_of_transaction", sa.String(3))),
            (8, sa.Column("mode_of_transport", sa.String(3))),
            (9, sa.Column("period_reference", sa.String(7))),
            (10, sa.Column("suite_indicator", sa.String(3))),
            (11, sa.Column("sitc", sa.String(5))),
            (12, sa.Column("ip_comcode", sa.String(9))),
            (13, sa.Column("num_consignments", sa.BigInteger)),
            (14, sa.Column("value", sa.BigInteger)),
            (15, sa.Column("nett_mass", sa.BigInteger)),
            (16, sa.Column("supp_unit", sa.BigInteger)),
            (17, sa.Column("_source_name", sa.String())),
        ],
    )
class EventsDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for EventsDataset."""

    source_url = '{0}/v4/dataset/events-dataset'.format(config.DATAHUB_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='data_hub__events',
        field_mapping=[
            ('address_1', sa.Column('address_1', sa.String)),
            ('address_2', sa.Column('address_2', sa.String)),
            ('address_country__name', sa.Column('address_country', sa.String)),
            ('address_county', sa.Column('address_county', sa.String)),
            ('address_postcode', sa.Column('address_postcode', sa.String)),
            ('address_town', sa.Column('address_town', sa.String)),
            ('created_on', sa.Column('created_on', sa.DateTime)),
            ('end_date', sa.Column('end_date', sa.Date)),
            ('event_type__name', sa.Column('event_type', sa.String)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('lead_team_id', sa.Column('lead_team_id', UUID)),
            ('location_type__name', sa.Column('location_type', sa.String)),
            ('name', sa.Column('name', sa.String)),
            ('notes', sa.Column('notes', sa.Text)),
            ('organiser_id', sa.Column('organiser_id', UUID)),
            ('service_name', sa.Column('service_name', sa.String)),
            ('start_date', sa.Column('start_date', sa.Date)),
            ('team_ids', sa.Column('team_ids', sa.ARRAY(sa.Text))),
            ('uk_region__name', sa.Column('uk_region', sa.String)),
            ('created_by_id', sa.Column('created_by_id', UUID)),
            ('disabled_on', sa.Column('disabled_on', sa.DateTime)),
        ],
    )
class GreatGovUKFormsPipeline(_ActivityStreamPipeline):
    name = "great-gov-uk-forms"
    index = "activities"
    table_config = TableConfig(
        schema="dit",
        table_name="great_gov_uk__forms",
        transforms=[
            # Strip the Activity Stream prefixes so bare ids/types are stored.
            lambda record, table_config, contexts: {
                **record,
                "norm_id": record["object"]["id"].replace(
                    "dit:directoryFormsApi:Submission:", ""
                ),
                "submission_type": record["object"]["attributedTo"]["id"].replace(
                    "dit:directoryFormsApi:SubmissionType:", ""
                ),
                "submission_action": record["object"]["attributedTo"]["type"].replace(
                    "dit:directoryFormsApi:SubmissionAction:", ""
                ),
            }
        ],
        field_mapping=[
            ("norm_id", sa.Column("id", sa.Integer, primary_key=True)),
            (("object", "url"), sa.Column("url", sa.String)),
            (("object", "published"), sa.Column("created_at", sa.DateTime)),
            ("submission_type", sa.Column("submission_type", sa.String)),
            ("submission_action", sa.Column("submission_action", sa.String)),
            (("actor", "dit:emailAddress"), sa.Column("actor_email", sa.String)),
            ("actor", sa.Column("actor", sa.JSON)),
            (
                ("object", "dit:directoryFormsApi:Submission:Data"),
                sa.Column("data", sa.JSON),
            ),
            (
                ("object", "dit:directoryFormsApi:Submission:Meta"),
                sa.Column("meta", sa.JSON),
            ),
        ],
    )
    query = {
        "bool": {
            "filter": [
                {"term": {"object.type": "dit:directoryFormsApi:Submission"}},
                {"term": {"type": "Create"}},
            ]
        }
    }
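# A minimal sketch (record contents made up) of what the transform above
# produces: the Activity Stream prefixes are stripped from the object's id
# and from the attributedTo id/type, leaving bare values for the
# field_mapping to pick up (norm_id presumably being castable to the
# integer "id" column).
record = {
    "object": {
        "id": "dit:directoryFormsApi:Submission:42",
        "attributedTo": {
            "id": "dit:directoryFormsApi:SubmissionType:contact",
            "type": "dit:directoryFormsApi:SubmissionAction:email",
        },
    },
}
norm_id = record["object"]["id"].replace("dit:directoryFormsApi:Submission:", "")
assert norm_id == "42"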
class OMISDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for OMISDataset."""

    source_url = '{0}/v4/dataset/omis-dataset'.format(config.DATAHUB_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='data_hub__orders',
        field_mapping=[
            ('cancellation_reason__name', sa.Column('cancellation_reason', sa.Text)),
            ('cancelled_on', sa.Column('cancelled_date', sa.DateTime)),
            ('company_id', sa.Column('company_id', UUID)),
            ('completed_on', sa.Column('completion_date', sa.DateTime)),
            ('contact_id', sa.Column('contact_id', UUID)),
            ('created_by__dit_team_id', sa.Column('dit_team_id', UUID)),
            ('created_on', sa.Column('created_date', sa.DateTime)),
            ('delivery_date', sa.Column('delivery_date', sa.Date)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('invoice__subtotal_cost', sa.Column('subtotal', sa.Integer)),
            ('paid_on', sa.Column('payment_received_date', sa.DateTime)),
            ('primary_market__name', sa.Column('market', sa.Text)),
            ('quote__accepted_on', sa.Column('quote_accepted_on', sa.DateTime)),
            ('quote__created_on', sa.Column('quote_created_on', sa.DateTime)),
            ('reference', sa.Column('omis_order_reference', sa.String)),
            ('refund_created', sa.Column('refund_created', sa.DateTime)),
            ('refund_total_amount', sa.Column('refund_total_amount', sa.Integer)),
            ('sector_name', sa.Column('sector', sa.String)),
            ('services', sa.Column('services', sa.Text)),
            ('status', sa.Column('order_status', sa.String)),
            ('subtotal_cost', sa.Column('net_price', sa.Numeric)),
            ('total_cost', sa.Column('total_cost', sa.Integer)),
            ('uk_region__name', sa.Column('uk_region', sa.Text)),
            ('vat_cost', sa.Column('vat_cost', sa.Integer)),
            ('created_by_id', sa.Column('created_by_id', UUID)),
        ],
    )
class ConsentPipeline(_ConsentPipeline):
    source_url = "{}/api/v1/person/datahub_export/?limit={}".format(
        config.CONSENT_BASE_URL, config.CONSENT_RESULTS_PER_PAGE
    )
    table_config = TableConfig(
        schema="dit",
        table_name="consent_service__current_consents",
        field_mapping=[
            ("id", sa.Column("id", sa.Integer)),
            ("key", sa.Column("key", sa.String)),
            ("email", sa.Column("email", sa.String)),
            ("phone", sa.Column("phone", sa.String)),
            ("key_type", sa.Column("key_type", sa.String)),
            ("created_at", sa.Column("created_at", sa.DateTime)),
            ("modified_at", sa.Column("modified_at", sa.DateTime)),
            ("current", sa.Column("current", sa.Boolean)),
            (
                "email_marketing_consent",
                sa.Column("email_marketing_consent", sa.Boolean),
            ),
            (
                "phone_marketing_consent",
                sa.Column("phone_marketing_consent", sa.Boolean),
            ),
        ],
    )
class DataWorkspaceApplicationInstancePipeline(_DataWorkspacePipeline):
    """Pipeline meta object for data workspace application instance data."""

    source_url = (
        f'{config.DATA_WORKSPACE_BASE_URL}/api/v1/application-instance/instances'
    )
    table_config = TableConfig(
        table_name='dataworkspace__application_instances',
        field_mapping=[
            ('commit_id', sa.Column('commit_id', sa.Text)),
            ('cpu', sa.Column('cpu', sa.Text)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('memory', sa.Column('memory', sa.Text)),
            ('owner_id', sa.Column('owner_id', sa.Numeric)),
            ('proxy_url', sa.Column('proxy_url', sa.Text)),
            ('public_host', sa.Column('public_host', sa.Text)),
            ('spawner', sa.Column('spawner', sa.Text)),
            (
                'spawner_application_instance_id',
                sa.Column('spawner_application_instance_id', sa.Text),
            ),
            (
                'application_template_name',
                sa.Column('application_template_name', sa.Text),
            ),
            ('spawner_cpu', sa.Column('spawner_cpu', sa.Text)),
            ('spawner_created_at', sa.Column('spawner_created_at', sa.DateTime)),
            ('spawner_memory', sa.Column('spawner_memory', sa.Text)),
            ('spawner_stopped_at', sa.Column('spawner_stopped_at', sa.DateTime)),
            ('state', sa.Column('state', sa.Text)),
        ],
    )
class TagsClassifierPredictionPipeline(_PipelineDAG):
    table_config = TableConfig(
        table_name='interactions_dataset_with_tags',
        field_mapping=[
            ("id", sa.Column("id", sa.Text, primary_key=True)),
            ("policy_feedback_notes", sa.Column("policy_feedback_notes", sa.Text)),
            ("tags_prediction", sa.Column("tags_prediction", sa.Text)),
            ("tag_1", sa.Column("tag_1", sa.Text)),
            (
                "probability_score_tag_1",
                sa.Column("probability_score_tag_1", sa.Numeric),
            ),
            ("tag_2", sa.Column("tag_2", sa.Text)),
            (
                "probability_score_tag_2",
                sa.Column("probability_score_tag_2", sa.Numeric),
            ),
            ("tag_3", sa.Column("tag_3", sa.Text)),
            (
                "probability_score_tag_3",
                sa.Column("probability_score_tag_3", sa.Numeric),
            ),
            ("tag_4", sa.Column("tag_4", sa.Text)),
            (
                "probability_score_tag_4",
                sa.Column("probability_score_tag_4", sa.Numeric),
            ),
            ("tag_5", sa.Column("tag_5", sa.Text)),
            (
                "probability_score_tag_5",
                sa.Column("probability_score_tag_5", sa.Numeric),
            ),
        ],
    )

    controller_pipeline = InteractionsDatasetPipeline
    dependencies = [InteractionsDatasetPipeline]

    query = f"""
        SELECT id, policy_feedback_notes
        FROM "{controller_pipeline.table_config.schema}"."{controller_pipeline.table_config.table_name}"
        WHERE policy_feedback_notes != ''
        AND policy_areas NOTNULL
        AND (
            created_on > current_date - INTERVAL '6 weeks'
            OR modified_on > current_date - INTERVAL '6 weeks'
        )
    """

    def get_fetch_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id='make-prediction',
            python_callable=make_prediction,
            queue='tensorflow',
            provide_context=True,
            op_args=[self.target_db, self.query, self.table_config.table_name],
            retries=self.fetch_retries,
        )
def test_table_property_returns_sqlalchemy_table(self):
    single_table_config = TableConfig(
        table_name="test",
        field_mapping=[("id", Column("id", Integer))],
    )

    table = single_table_config.table

    assert type(table) is sqlalchemy.Table
    assert {col.name for col in table.columns} == {"id"}
class TestDAG(_PandasPipelineWithPollingSupport):
    use_polling = False
    data_getter = lambda: None  # noqa: E731
    table_config = TableConfig(
        table_name="test-table",
        field_mapping=(
            ("id", sa.Column("id", sa.Integer, primary_key=True)),
            ("data", sa.Column("data", sa.Integer)),
        ),
    )
class MinisterialInteractionsDashboardPipeline(_SQLPipelineDAG):
    """
    A dashboard of Data Hub interactions involving Ministers and certain senior staff
    """

    start_date = datetime(2020, 5, 6)
    dependencies = [
        AdvisersDatasetPipeline,
        CompaniesDatasetPipeline,
        InteractionsDatasetPipeline,
        ONSPostcodePipeline,
    ]
    table_config = TableConfig(
        table_name='ministerial_interactions',
        field_mapping=[
            (None, sa.Column('id', sa.Integer, primary_key=True, autoincrement=True)),
            ('interaction_id', sa.Column('interaction_id', UUID)),
            ('interaction_link', sa.Column('interaction_link', sa.Text)),
            ('company_name', sa.Column('company_name', sa.Text)),
            ('company_link', sa.Column('company_link', sa.Text)),
            ('company_country', sa.Column('company_country', sa.Text)),
            ('company_postcode', sa.Column('company_postcode', sa.Text)),
            (
                'company_address_latitude_longitude',
                sa.Column('company_address_latitude_longitude', sa.Text),
            ),
            ('company_uk_region', sa.Column('company_uk_region', sa.Text)),
            (
                'company_number_of_employees',
                sa.Column('company_number_of_employees', sa.Text),
            ),
            ('company_turnover', sa.Column('company_turnover', sa.Text)),
            ('company_one_list_tier', sa.Column('company_one_list_tier', sa.Text)),
            ('company_sector', sa.Column('company_sector', sa.Text)),
            ('interaction_date', sa.Column('interaction_date', sa.Date)),
            ('adviser_id', sa.Column('adviser_id', UUID)),
            ('adviser_name', sa.Column('adviser_name', sa.Text)),
            ('interaction_subject', sa.Column('interaction_subject', sa.Text)),
            ('communication_channel', sa.Column('communication_channel', sa.Text)),
            ('policy_areas', sa.Column('policy_areas', sa.Text)),
            ('policy_feedback_notes', sa.Column('policy_feedback_notes', sa.Text)),
            ('policy_issue_types', sa.Column('policy_issue_types', sa.Text)),
            (
                'dun_and_bradstreet_linked_record',
                sa.Column('dun_and_bradstreet_linked_record', sa.Text),
            ),
            ('service', sa.Column('service', sa.Text)),
        ],
    )
    query = '''
class ZendeskDITTicketsPipeline(_PipelineDAG):
    schedule_interval = "@daily"
    allow_null_columns = True
    use_utc_now_as_source_modified = True
    table_config = TableConfig(
        schema="zendesk",
        table_name="dit_tickets",
        transforms=[transforms_fields_dit],
        field_mapping=[
            ("id", sa.Column("id", sa.BigInteger, primary_key=True)),
            ("via", sa.Column("via", sa.JSON)),
            ("created_at", sa.Column("created_at", sa.DateTime)),
            ("solved_at", sa.Column("solved_at", sa.DateTime)),
            ("updated_at", sa.Column("updated_at", sa.DateTime)),
            (
                "full_resolution_time_in_minutes",
                sa.Column("full_resolution_time_in_minutes", sa.Integer),
            ),
            ("subject", sa.Column("subject", sa.String)),
            ("description", sa.Column("description", sa.String)),
            ("service", sa.Column("service", sa.String)),
            ("priority", sa.Column("priority", sa.String)),
            ("status", sa.Column("status", sa.String)),
            ("recipient", sa.Column("recipient", sa.String)),
            ("assignee_id", sa.Column("assignee_id", sa.BigInteger)),
            ("organization_id", sa.Column("organization_id", sa.BigInteger)),
            ("group_id", sa.Column("group_id", sa.BigInteger)),
            ("tags", sa.Column("tags", sa.ARRAY(sa.String))),
            ("satisfaction_rating", sa.Column("satisfaction_rating", sa.JSON)),
            (
                "sharing_agreement_ids",
                sa.Column("sharing_agreement_ids", sa.ARRAY(sa.BigInteger)),
            ),
            ("brand_id", sa.Column("brand_id", sa.BigInteger)),
            ("e_responder", sa.Column("e_responder", sa.ARRAY(sa.String))),
            ("e_sector", sa.Column("e_sector", sa.ARRAY(sa.String))),
            ("e_policy_area", sa.Column("e_policy_area", sa.ARRAY(sa.String))),
            ("e_dit_old", sa.Column("e_dit_old", sa.String)),
        ],
    )

    def get_fetch_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id="fetch-daily-tickets",
            python_callable=fetch_daily_tickets,
            provide_context=True,
            op_args=[self.table_config.schema, self.table_config.table_name, "dit"],
            retries=self.fetch_retries,
        )
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.table_config = TableConfig(
        schema="hmrc",
        table_name=self.table_name,
        transforms=[
            # This is based on
            # https://www.uktradeinfo.com/media/vrehes3e/tech_spec_sesa16.doc,
            # which has an "a" in the file name which doesn't match the files
            # we use, but it's the best we have.
            lambda record, table_config, contexts: {
                "sitc_2": record[0][0:2],
                "sitc_0": record[0][2:3],
                "year": record[0][3:7],
                "month": record[0][7:9],
                "estimated_value": record[0][9:24],
                "_source_name": record[1],
            }
        ],
        field_mapping=[
            (
                None,
                sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True),
            ),
            ("sitc_2", sa.Column("sitc_2", sa.String(2), nullable=False)),
            ("sitc_0", sa.Column("sitc_0", sa.String(1), nullable=False)),
            ("year", sa.Column("year", sa.SmallInteger, nullable=False)),
            ("month", sa.Column("month", sa.SmallInteger, nullable=False)),
            (
                "estimated_value",
                sa.Column("estimated_value", sa.BigInteger, nullable=False),
            ),
            ("_source_name", sa.Column("_source_name", sa.String, nullable=False)),
        ],
        indexes=[
            LateIndex("sitc_2"),
            LateIndex("sitc_0"),
            LateIndex(["year", "month"]),
            LateIndex("month"),
            LateIndex("_source_name"),
        ],
        check_constraints=[
            sa.CheckConstraint("1 <= month AND month <= 12"),
            sa.CheckConstraint("1500 <= year AND year <= 3000"),
            sa.CheckConstraint("sitc_2 ~ '^[0-9][0-9]$'"),
            sa.CheckConstraint("sitc_0 ~ '^[0-9]$'"),
            sa.CheckConstraint("estimated_value >= 0"),
        ],
    )
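# A minimal sketch (sample values made up) of the fixed-width slicing the
# transform above performs: record[0] is assumed to be one SESA16-style
# line that splits into SITC digits, year, month and a 15-digit value,
# while record[1] carries the source file name.
raw = ("09" + "1" + "2020" + "03" + "000000000012345", "sesa16_2020.txt")
parsed = {
    "sitc_2": raw[0][0:2],            # "09"
    "sitc_0": raw[0][2:3],            # "1"
    "year": raw[0][3:7],              # "2020"
    "month": raw[0][7:9],             # "03"
    "estimated_value": raw[0][9:24],  # "000000000012345"
    "_source_name": raw[1],
}
assert parsed["year"] == "2020" and parsed["month"] == "03"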
def test_related_tables_property_returns_directly_related_tables(self):
    id_col = ("id", Column("id", Integer))
    foo_col = ("foo", Column("foo", Integer))
    bar_col = ("bar", Column("bar", Integer))
    config = TableConfig(
        table_name="test",
        field_mapping=[
            id_col,
            (
                "foo",
                TableConfig(
                    table_name="foo",
                    field_mapping=[
                        foo_col,
                        (
                            "bar",
                            TableConfig(table_name="bar", field_mapping=[bar_col]),
                        ),
                    ],
                ),
            ),
        ],
    )

    assert len(config.related_table_configs) == 1
    assert config.related_table_configs[0][0] == "foo"
    assert type(config.related_table_configs[0][1]) is TableConfig
    assert config.related_table_configs[0][1].table_name == "foo"

    assert len(config.related_table_configs[0][1].related_table_configs)
    assert config.related_table_configs[0][1].related_table_configs[0][0] == "bar"
    assert (
        type(config.related_table_configs[0][1].related_table_configs[0][1])
        is TableConfig
    )
    assert (
        config.related_table_configs[0][1].related_table_configs[0][1].table_name
        == "bar"
    )
def test_tables_property_returns_all_tables(self):
    id_col = ("id", Column("id", Integer))
    name_col = ("name", Column("name", String))
    nested_table_config = TableConfig(
        table_name="test",
        field_mapping=[
            id_col,
            (
                "relation",
                TableConfig(
                    table_name="relation",
                    field_mapping=[("foo", Column("foo", String))],
                ),
            ),
            name_col,
        ],
    )

    tables = nested_table_config.tables

    assert all(type(table) is sqlalchemy.Table for table in tables)
    assert [table.name for table in tables] == ["test", "relation"]
    assert {col.name for col in tables[0].columns} == {"id", "name"}
    assert {col.name for col in tables[1].columns} == {"foo"}
class ExportWinsHVCDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for Export wins HVC data."""

    source_url = '{0}/datasets/hvc'.format(config.EXPORT_WINS_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='export_wins__hvc_dataset',
        field_mapping=[
            ('campaign_id', sa.Column('campaign_id', sa.String)),
            ('financial_year', sa.Column('financial_year', sa.Integer)),
            ('id', sa.Column('id', sa.Integer, primary_key=True)),
            ('name', sa.Column('name', sa.String)),
        ],
    )
class ContactsDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for ContactsDataset."""

    dependencies = [ConsentPipeline]
    source_url = '{0}/v4/dataset/contacts-dataset'.format(config.DATAHUB_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='data_hub__contacts',
        field_mapping=[
            ('address_1', sa.Column('address_1', sa.String)),
            ('address_2', sa.Column('address_2', sa.String)),
            ('address_country__name', sa.Column('address_country', sa.Text)),
            ('address_county', sa.Column('address_county', sa.String)),
            ('address_postcode', sa.Column('address_postcode', sa.String)),
            (
                'address_same_as_company',
                sa.Column('address_same_as_company', sa.Boolean),
            ),
            ('address_town', sa.Column('address_town', sa.String)),
            ('archived', sa.Column('archived', sa.Boolean)),
            ('archived_on', sa.Column('archived_on', sa.DateTime)),
            ('company_id', sa.Column('company_id', UUID)),
            ('created_on', sa.Column('date_added_to_datahub', sa.Date)),
            ('email', sa.Column('email', sa.String)),
            ('email_alternative', sa.Column('email_alternative', sa.String)),
            (None, sa.Column('email_marketing_consent', sa.Boolean, default=False)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('job_title', sa.Column('job_title', sa.String)),
            ('modified_on', sa.Column('modified_on', sa.DateTime)),
            ('name', sa.Column('contact_name', sa.Text)),
            ('notes', sa.Column('notes', sa.Text)),
            ('primary', sa.Column('is_primary', sa.Boolean)),
            ('telephone_alternative', sa.Column('telephone_alternative', sa.String)),
            ('telephone_number', sa.Column('phone', sa.String)),
            ('created_by_id', sa.Column('created_by_id', UUID)),
        ],
    )

    def get_transform_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id='update-contact-email-consent',
            python_callable=update_datahub_contact_consent,
            provide_context=True,
            op_args=[
                self.target_db,
                self.table_config.tables[0],  # pylint: disable=no-member
            ],
            retries=self.fetch_retries,
        )
class DataWorkspaceUserPipeline(_DataWorkspacePipeline):
    """Pipeline meta object for data workspace user data."""

    source_url = f'{config.DATA_WORKSPACE_BASE_URL}/api/v1/account/users'
    table_config = TableConfig(
        table_name='dataworkspace__users',
        field_mapping=[
            ('email', sa.Column('email', sa.Text)),
            ('first_name', sa.Column('first_name', sa.Text)),
            ('id', sa.Column('id', sa.Numeric, primary_key=True)),
            ('is_staff', sa.Column('is_staff', sa.Boolean)),
            ('is_superuser', sa.Column('is_superuser', sa.Boolean)),
            ('last_name', sa.Column('last_name', sa.Text)),
        ],
    )
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.table_config = TableConfig(
        schema='public_health_england',
        table_name=self.table_name,
        field_mapping=[
            ('date', sa.Column('date', sa.Date)),
            ('areaCode', sa.Column('area_code', sa.String)),
            ('areaName', sa.Column('area_name', sa.String)),
            ('newCasesByPublishDate', sa.Column('new_cases', sa.Integer)),
            ('cumCasesByPublishDate', sa.Column('cumulative_cases', sa.Integer)),
            ('newDeathsByPublishDate', sa.Column('new_deaths', sa.Integer)),
            ('cumDeathsByPublishDate', sa.Column('cumulative_deaths', sa.Integer)),
        ],
    )
class ExportWinsBreakdownsDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for Export wins yearly export/non-export value."""

    source_url = '{0}/datasets/breakdowns'.format(config.EXPORT_WINS_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='export_wins__breakdowns_dataset',
        field_mapping=[
            ('breakdown_type', sa.Column('type', sa.String)),
            ('id', sa.Column('id', sa.Integer, primary_key=True)),
            ('value', sa.Column('value', sa.BigInteger)),
            ('win__id', sa.Column('win_id', UUID)),
            ('year', sa.Column('year', sa.Integer)),
        ],
    )
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.table_config = TableConfig(
        schema=self.schema_name,
        table_name=self.table_name,
        field_mapping=[
            ("id", sa.Column("id", sa.Text, primary_key=True)),
            ("match_id", sa.Column("match_id", sa.Integer)),
            ("similarity", sa.Column("similarity", sa.Text)),
        ],
    )
    self.start_date = self.controller_pipeline.start_date
    self.schedule_interval = self.controller_pipeline.schedule_interval
class TestDAG(_PipelineDAG):
    table_config = TableConfig(
        table_name="test-table",
        field_mapping=(
            ("id", sa.Column("id", sa.Integer, primary_key=True)),
            ("data", sa.Column("data", sa.Integer)),
        ),
    )

    def get_fetch_operator(self):
        return PythonOperator(
            task_id="fetch-data",
            python_callable=lambda: None,
            provide_context=True,
        )
class MaxemailCampaignsPipeline(_ActivityStreamPipeline):
    name = "maxemail-campaigns"
    index = "activities"
    table_config = TableConfig(
        schema="dit",
        table_name="maxemail__campaigns",
        field_mapping=[
            (
                ("object", "dit:maxemail:Campaign:id"),
                sa.Column("id", sa.Integer, primary_key=True),
            ),
            (("object", "name"), sa.Column("title", sa.String, nullable=False)),
            (
                ("object", "content"),
                sa.Column("description", sa.String, nullable=False),
            ),
            (
                ("object", "dit:emailSubject"),
                sa.Column("email_subject", sa.String, nullable=False),
            ),
            (
                ("actor", "name"),
                sa.Column("email_from_name", sa.String, nullable=False),
            ),
            (
                ("actor", "dit:emailAddress"),
                sa.Column("email_from_address", sa.String, index=True, nullable=False),
            ),
            (
                ("published",),
                sa.Column("started", sa.DateTime, index=True, nullable=False),
            ),
        ],
    )
    query = {
        "bool": {
            "filter": [{"term": {"object.type": "dit:maxemail:Campaign"}}]
        }
    }
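# A minimal sketch (with a hypothetical `extract` helper and a made-up
# record) of how the tuple keys in the field_mapping above are presumed to
# be resolved: each element is a successive dict lookup on the Activity
# Stream record, so ("published",) just reads the top-level field.
def extract(record, path):
    value = record
    for key in path:
        value = value[key]
    return value

record = {"object": {"name": "March newsletter"}, "published": "2021-03-01T00:00:00Z"}
assert extract(record, ("object", "name")) == "March newsletter"
assert extract(record, ("published",)) == "2021-03-01T00:00:00Z"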
class ExportWinsAdvisersDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for Export wins advisers."""

    source_url = '{0}/datasets/advisors'.format(config.EXPORT_WINS_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='export_wins__advisers_dataset',
        field_mapping=[
            ('hq_team_display', sa.Column('hq_team', sa.String)),
            ('id', sa.Column('id', sa.Integer, primary_key=True)),
            ('location', sa.Column('location', sa.String)),
            ('name', sa.Column('name', sa.String)),
            ('team_type_display', sa.Column('team_type', sa.String)),
            ('win__id', sa.Column('win_id', UUID)),
        ],
    )