class DataWorkspaceCatalogueItemsPipeline(_DataWorkspacePipeline):
    """Pipeline meta object for data workspace catalogue items data."""

    source_url = f'{config.DATA_WORKSPACE_BASE_URL}/api/v1/dataset/catalogue-items'
    table_config = TableConfig(
        table_name='dataworkspace__catalogue_items',
        field_mapping=[
            ('purpose', sa.Column('purpose', sa.Text, nullable=False)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('name', sa.Column('name', sa.Text, nullable=False, index=True)),
            (
                'short_description',
                sa.Column('short_description', sa.Text, nullable=False),
            ),
            ('description', sa.Column('description', sa.Text)),
            ('published', sa.Column('published', sa.Boolean, nullable=False)),
            ('created_date',
             sa.Column('created_date', sa.DateTime, nullable=False)),
            ('published_at', sa.Column('published_at', sa.Date, index=True)),
            (
                'information_asset_owner',
                sa.Column('information_asset_owner', sa.Integer),
            ),
            (
                'information_asset_manager',
                sa.Column('information_asset_manager', sa.Integer),
            ),
            ('enquiries_contact', sa.Column('enquiries_contact', sa.Integer)),
            ('source_tags', sa.Column('source_tags', sa.ARRAY(sa.Text))),
            ('licence', sa.Column('licence', sa.Text)),
            ('personal_data', sa.Column('personal_data', sa.Text)),
            ('retention_policy', sa.Column('retention_policy', sa.Text)),
            (
                'eligibility_criteria',
                sa.Column('eligibility_criteria', sa.ARRAY(sa.Text)),
            ),
            ('slug', sa.Column('slug', sa.Text)),
            (
                'source_tables',
                TableConfig(
                    table_name='dataworkspace__source_tables',
                    transforms=[
                        lambda record, table_config, contexts: {
                            **record,
                            'dataset_id': contexts[0]['id'],
                        }
                    ],
                    field_mapping=[
                        ('id', sa.Column('id', UUID, primary_key=True)),
                        ('dataset_id',
                         sa.Column('dataset_id', UUID, nullable=False)),
                        ('name', sa.Column('name', sa.Text, nullable=False)),
                        ('schema', sa.Column('schema', sa.Text,
                                             nullable=False)),
                        ('table', sa.Column('table', sa.Text, nullable=False)),
                    ],
                ),
            ),
        ],
    )
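A minimal sketch of what the nested transforms lambda above does: contexts carries the chain of parent records, so each child source_tables row gets stamped with its parent dataset's id (the values below are made up):

def stamp_dataset_id(record, table_config, contexts):
    return {**record, 'dataset_id': contexts[0]['id']}

parent_catalogue_item = {'id': '11111111-1111-1111-1111-111111111111'}
source_table = {'id': '22222222-2222-2222-2222-222222222222', 'schema': 'dit', 'table': 'events'}
print(stamp_dataset_id(source_table, None, [parent_catalogue_item]))
# {'id': '2222...', 'schema': 'dit', 'table': 'events', 'dataset_id': '1111...'}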
Example #2
    def test_columns_property_only_returns_immediate_columns(self):
        id_col = ("id", Column("id", Integer))
        name_col = ("name", Column("name", String))

        single_table_config = TableConfig(
            table_name="test",
            field_mapping=[id_col],
        )

        nested_table_config = TableConfig(
            table_name="test",
            field_mapping=[
                id_col,
                (
                    "relation",
                    TableConfig(
                        table_name="relation",
                        field_mapping=[("foo", Column("foo", String))],
                    ),
                ),
                name_col,
            ],
        )

        assert single_table_config.columns == [id_col]
        assert nested_table_config.columns == [id_col, name_col]
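The behaviour asserted here is consistent with a columns property along these lines (a hypothetical sketch, not necessarily the library's actual code):

    @property
    def columns(self):
        # Keep only plain (field, Column) pairs; a nested TableConfig is a
        # related table, not a column of this one.
        return [
            (field, column)
            for field, column in self.field_mapping
            if not isinstance(column, TableConfig)
        ]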
Example #3
class TagsClassifierTrainPipeline(_PipelineDAG):
    schedule_interval = (
        None  # For now we trigger the pipeline manually when training data is uploaded
    )
    table_config = TableConfig(
        table_name='interactions_tags_classifier_metrics',
        field_mapping=[
            ("model_version", sa.Column("model_version", sa.Text)),
            ("model_for_tag", sa.Column("model_for_tag", sa.Text)),
            ("size", sa.Column("size", sa.Integer)),
            ("precisions", sa.Column("precisions", sa.Numeric)),
            ("recalls", sa.Column("recalls", sa.Numeric)),
            ("f1", sa.Column("f1", sa.Numeric)),
            ("accuracy", sa.Column("accuracy", sa.Numeric)),
            ("auc", sa.Column("auc", sa.Numeric)),
        ],
    )

    def get_fetch_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id='train-model',
            python_callable=model_training_with_labelled_data,
            queue='tensorflow',
            provide_context=True,
            op_args=[self.table_config.table_name],
            retries=self.fetch_retries,
        )
Example #4
def test_insert_data_into_db_using_db_config(mocker, mock_db_conn, s3):
    s3.iter_keys.return_value = [
        ('1', [{"id": 1, "extra": "ignored", "data": "text"}]),
        ('2', [{"id": 2}]),
    ]

    table_config = TableConfig(
        table_name="my-table",
        field_mapping=(
            ("id", sqlalchemy.Column("id", sqlalchemy.Integer(), nullable=False)),
            ("data", sqlalchemy.Column("data", sqlalchemy.String())),
        ),
    )
    mock_table = mock.Mock()
    mocker.patch.object(table_config, '_table', mock_table)
    mocker.patch.object(table_config, '_temp_table', mock_table)

    db_tables.insert_data_into_db(
        target_db="test-db", table_config=table_config, ts_nodash="123",
    )

    mock_db_conn.execute.assert_has_calls(
        [
            mock.call(mock_table.insert(), data="text", id=1),
            mock.call(mock_table.insert(), data=None, id=2),
        ]
    )

    s3.iter_keys.assert_called_once_with()
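Read together, the assertions suggest the insert step reduces to something like this sketch (insert_rows is a hypothetical helper, not the repo's code): only mapped fields survive, anything extra in the source record is dropped, and missing fields fall back to None.

def insert_rows(conn, table, field_mapping, records):
    for record in records:
        row = {
            column.name: record.get(field)
            for field, column in field_mapping
            if field is not None
        }
        conn.execute(table.insert(), **row)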
Example #5
class CoronavirusInteractionsDashboardPipeline(_SQLPipelineDAG):
    start_date = datetime(2020, 3, 25)
    dependencies = [
        InteractionsDatasetPipeline,
        AdvisersDatasetPipeline,
        TeamsDatasetPipeline,
        ContactsDatasetPipeline,
    ]

    table_config = TableConfig(
        table_name="coronavirus_interactions_dashboard_data",
        field_mapping=[
            (None,
             sa.Column("id", sa.Integer, primary_key=True,
                       autoincrement=True)),
            ("interaction_date", sa.Column("interaction_date", sa.Date)),
            ("company_name", sa.Column("company_name", sa.Text)),
            ("company_country", sa.Column("company_country", sa.Text)),
            ("company_link", sa.Column("company_link", sa.Text)),
            ("company_sector", sa.Column("company_sector", sa.Text)),
            ("company_region", sa.Column("company_region", sa.Text)),
            ("subject_of_interaction",
             sa.Column("subject_of_interaction", sa.Text)),
            ("data_hub_link", sa.Column("data_hub_link", sa.Text)),
            ("team", sa.Column("team", sa.Text)),
            ("role", sa.Column("role", sa.Text)),
            ("policy_areas", sa.Column("policy_areas", sa.ARRAY(sa.Text))),
            ("entered_into_data_hub",
             sa.Column("entered_into_data_hub", sa.DateTime)),
        ],
    )

    query = '''
Example #6
class FDIDashboardPipeline(_SQLPipelineDAG):
    dependencies = [
        CompaniesDatasetPipeline, InvestmentProjectsDatasetPipeline
    ]
    start_date = datetime(2020, 3, 3)
    table_config = TableConfig(
        table_name="fdi_dashboard_data",
        field_mapping=[
            ('fdi_value', sa.Column('fdi_value', sa.Text)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            (
                'investor_company_country',
                sa.Column('investor_company_country', sa.String),
            ),
            ('number_new_jobs', sa.Column('number_new_jobs', sa.Integer)),
            (
                'number_safeguarded_jobs',
                sa.Column('number_safeguarded_jobs', sa.Integer),
            ),
            ('overseas_region', sa.Column('overseas_region', sa.String)),
            ('project_end_date', sa.Column('project_end_date', sa.Date)),
            ('project_link', sa.Column('project_link', sa.String)),
            ('project_reference', sa.Column('project_reference', sa.String)),
            ('project_sector', sa.Column('project_sector', sa.String)),
            ('sector_cluster', sa.Column('sector_cluster', sa.String)),
            ('stage', sa.Column('stage', sa.String)),
            ('status', sa.Column('status', sa.String)),
            ('total_investment', sa.Column('total_investment', sa.Numeric)),
        ],
    )
    query = '''
Example #7
class HMRCNonEUExports(_HMRCPipeline):
    base_filename = "smke19"
    records_start_year = 2009
    num_csv_fields = (22, 23)
    table_config = TableConfig(
        schema="hmrc",
        table_name="non_eu_exports",
        # https://www.uktradeinfo.com/Statistics/Documents/Data%20Downloads/Tech_Spec_SMKE19.DOC
        field_mapping=[
            (None, sa.Column("id", sa.Integer, primary_key=True, autoincrement=True)),
            (0, sa.Column("comcode", sa.String(9))),
            (1, sa.Column("sitc", sa.String(5))),
            (2, sa.Column("record_type", sa.String(3))),
            (3, sa.Column("cod_sequence", sa.String(3))),
            (4, sa.Column("cod_alpha", sa.String(2))),
            (5, sa.Column("account_mmyy", sa.String(7))),
            (6, sa.Column("port_sequence", sa.String(3))),
            (7, sa.Column("port_alpha", sa.String(3))),
            (8, sa.Column("flag_sequence", sa.String(3))),
            (9, sa.Column("flag_alpha", sa.String(2))),
            (10, sa.Column("trade_indicator", sa.String(1))),
            (11, sa.Column("container", sa.String(3))),
            (12, sa.Column("mode_of_transport", sa.String(3))),
            (13, sa.Column("inland_mot", sa.String(2))),
            (14, sa.Column("golo_sequence", sa.String(3))),
            (15, sa.Column("golo_alpha", sa.String(3))),
            (16, sa.Column("suite_indicator", sa.String(3))),
            (17, sa.Column("procedure_code", sa.String(3))),
            (18, sa.Column("value", sa.BigInteger)),
            (19, sa.Column("quantity_1", sa.BigInteger)),
            (20, sa.Column("quantity_2", sa.BigInteger)),
            (21, sa.Column("industrial_plant_comcode", sa.String(15))),
            (22, sa.Column("_source_name", sa.String())),
        ],
    )
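Unlike the dict-keyed pipelines above, the HMRC mappings key each column by CSV field index, with None marking a generated column (the autoincrement id) that has no source field. A toy illustration of the positional lookup, with made-up values:

csv_row = ['010121000', '00110', '000', '001', 'FR']  # truncated, invented data
positional_mapping = [(0, 'comcode'), (1, 'sitc'), (4, 'cod_alpha')]
parsed = {name: csv_row[index] for index, name in positional_mapping}
print(parsed)  # {'comcode': '010121000', 'sitc': '00110', 'cod_alpha': 'FR'}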
Example #8
class HMRCEUImports(_HMRCPipeline):
    base_filename = "smkm46"
    records_start_year = 2009
    num_csv_fields = (17, 18)
    table_config = TableConfig(
        schema="hmrc",
        table_name="eu_imports",
        # https://www.uktradeinfo.com/Statistics/Documents/Data%20Downloads/Tech_Spec_SMKX46.DOC
        field_mapping=[
            (None, sa.Column("id", sa.Integer, primary_key=True, autoincrement=True)),
            (0, sa.Column("comcode", sa.String(9))),
            (1, sa.Column("record_type", sa.String(3))),
            (2, sa.Column("cod_sequence", sa.String(3))),
            (3, sa.Column("cod_alpha", sa.String(2))),
            (4, sa.Column("trade_indicator", sa.String(1))),
            (5, sa.Column("coo_seq", sa.String(3))),
            (6, sa.Column("coo_alpha", sa.String(2))),
            (7, sa.Column("nature_of_transaction", sa.String(3))),
            (8, sa.Column("mode_of_transport", sa.String(3))),
            (9, sa.Column("period_reference", sa.String(7))),
            (10, sa.Column("suite_indicator", sa.String(3))),
            (11, sa.Column("sitc", sa.String(5))),
            (12, sa.Column("ip_comcode", sa.String(9))),
            (13, sa.Column("num_consignments", sa.BigInteger)),
            (14, sa.Column("value", sa.BigInteger)),
            (15, sa.Column("nett_mass", sa.BigInteger)),
            (16, sa.Column("supp_unit", sa.BigInteger)),
            (17, sa.Column("_source_name", sa.String())),
        ],
    )
Example #9
class EventsDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for EventsDataset."""

    source_url = '{0}/v4/dataset/events-dataset'.format(config.DATAHUB_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='data_hub__events',
        field_mapping=[
            ('address_1', sa.Column('address_1', sa.String)),
            ('address_2', sa.Column('address_2', sa.String)),
            ('address_country__name', sa.Column('address_country', sa.String)),
            ('address_county', sa.Column('address_county', sa.String)),
            ('address_postcode', sa.Column('address_postcode', sa.String)),
            ('address_town', sa.Column('address_town', sa.String)),
            ('created_on', sa.Column('created_on', sa.DateTime)),
            ('end_date', sa.Column('end_date', sa.Date)),
            ('event_type__name', sa.Column('event_type', sa.String)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('lead_team_id', sa.Column('lead_team_id', UUID)),
            ('location_type__name', sa.Column('location_type', sa.String)),
            ('name', sa.Column('name', sa.String)),
            ('notes', sa.Column('notes', sa.Text)),
            ('organiser_id', sa.Column('organiser_id', UUID)),
            ('service_name', sa.Column('service_name', sa.String)),
            ('start_date', sa.Column('start_date', sa.Date)),
            ('team_ids', sa.Column('team_ids', sa.ARRAY(sa.Text))),
            ('uk_region__name', sa.Column('uk_region', sa.String)),
            ('created_by_id', sa.Column('created_by_id', UUID)),
            ('disabled_on', sa.Column('disabled_on', sa.DateTime)),
        ],
    )
Example #10
class GreatGovUKFormsPipeline(_ActivityStreamPipeline):
    name = "great-gov-uk-forms"
    index = "activities"

    table_config = TableConfig(
        schema="dit",
        table_name="great_gov_uk__forms",
        transforms=[
            lambda record, table_config, contexts: {
                **record,
                "norm_id":
                record["object"]["id"].replace(
                    "dit:directoryFormsApi:Submission:", ""),
                "submission_type":
                record["object"]["attributedTo"]["id"].replace(
                    "dit:directoryFormsApi:SubmissionType:", ""),
                "submission_action":
                record["object"]["attributedTo"]["type"].replace(
                    "dit:directoryFormsApi:SubmissionAction:", ""),
            }
        ],
        field_mapping=[
            ("norm_id", sa.Column("id", sa.Integer, primary_key=True)),
            (("object", "url"), sa.Column("url", sa.String)),
            (("object", "published"), sa.Column("created_at", sa.DateTime)),
            ("submission_type", sa.Column("submission_type", sa.String)),
            ("submission_action", sa.Column("submission_action", sa.String)),
            (("actor", "dit:emailAddress"), sa.Column("actor_email",
                                                      sa.String)),
            ("actor", sa.Column("actor", sa.JSON)),
            (
                ("object", "dit:directoryFormsApi:Submission:Data"),
                sa.Column("data", sa.JSON),
            ),
            (
                ("object", "dit:directoryFormsApi:Submission:Meta"),
                sa.Column("meta", sa.JSON),
            ),
        ],
    )

    query = {
        "bool": {
            "filter": [
                {
                    "term": {
                        "object.type": "dit:directoryFormsApi:Submission"
                    }
                },
                {
                    "term": {
                        "type": "Create"
                    }
                },
            ]
        }
    }
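For reference, the transform above only strips the dit:directoryFormsApi: prefixes from the Activity Stream payload; for example (payload values invented):

activity = {
    "object": {
        "id": "dit:directoryFormsApi:Submission:42",
        "attributedTo": {
            "id": "dit:directoryFormsApi:SubmissionType:contact-us",
            "type": "dit:directoryFormsApi:SubmissionAction:email",
        },
    },
}
norm_id = activity["object"]["id"].replace("dit:directoryFormsApi:Submission:", "")
print(norm_id)  # '42', which the field mapping stores as the primary key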
Example #11
class OMISDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for OMISDataset."""

    source_url = '{0}/v4/dataset/omis-dataset'.format(config.DATAHUB_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='data_hub__orders',
        field_mapping=[
            ('cancellation_reason__name', sa.Column('cancellation_reason', sa.Text)),
            ('cancelled_on', sa.Column('cancelled_date', sa.DateTime)),
            ('company_id', sa.Column('company_id', UUID)),
            ('completed_on', sa.Column('completion_date', sa.DateTime)),
            ('contact_id', sa.Column('contact_id', UUID)),
            ('created_by__dit_team_id', sa.Column('dit_team_id', UUID)),
            ('created_on', sa.Column('created_date', sa.DateTime)),
            ('delivery_date', sa.Column('delivery_date', sa.Date)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('invoice__subtotal_cost', sa.Column('subtotal', sa.Integer)),
            ('paid_on', sa.Column('payment_received_date', sa.DateTime)),
            ('primary_market__name', sa.Column('market', sa.Text)),
            ('quote__accepted_on', sa.Column('quote_accepted_on', sa.DateTime)),
            ('quote__created_on', sa.Column('quote_created_on', sa.DateTime)),
            ('reference', sa.Column('omis_order_reference', sa.String)),
            ('refund_created', sa.Column('refund_created', sa.DateTime)),
            ('refund_total_amount', sa.Column('refund_total_amount', sa.Integer)),
            ('sector_name', sa.Column('sector', sa.String)),
            ('services', sa.Column('services', sa.Text)),
            ('status', sa.Column('order_status', sa.String)),
            ('subtotal_cost', sa.Column('net_price', sa.Numeric)),
            ('total_cost', sa.Column('total_cost', sa.Integer)),
            ('uk_region__name', sa.Column('uk_region', sa.Text)),
            ('vat_cost', sa.Column('vat_cost', sa.Integer)),
            ('created_by_id', sa.Column('created_by_id', UUID)),
        ],
    )
Example #12
class ConsentPipeline(_ConsentPipeline):

    source_url = "{}/api/v1/person/datahub_export/?limit={}".format(
        config.CONSENT_BASE_URL, config.CONSENT_RESULTS_PER_PAGE)
    table_config = TableConfig(
        schema="dit",
        table_name="consent_service__current_consents",
        field_mapping=[
            ("id", sa.Column("id", sa.Integer)),
            ("key", sa.Column("key", sa.String)),
            ("email", sa.Column("email", sa.String)),
            ("phone", sa.Column("phone", sa.String)),
            ("key_type", sa.Column("key_type", sa.String)),
            ("created_at", sa.Column("created_at", sa.DateTime)),
            ("modified_at", sa.Column("modified_at", sa.DateTime)),
            ("current", sa.Column("current", sa.Boolean)),
            (
                "email_marketing_consent",
                sa.Column("email_marketing_consent", sa.Boolean),
            ),
            (
                "phone_marketing_consent",
                sa.Column("phone_marketing_consent", sa.Boolean),
            ),
        ],
    )
Example #13
class DataWorkspaceApplicationInstancePipeline(_DataWorkspacePipeline):
    """Pipeline meta object for data workspace application instance data."""

    source_url = (
        f'{config.DATA_WORKSPACE_BASE_URL}/api/v1/application-instance/instances'
    )
    table_config = TableConfig(
        table_name='dataworkspace__application_instances',
        field_mapping=[
            ('commit_id', sa.Column('commit_id', sa.Text)),
            ('cpu', sa.Column('cpu', sa.Text)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('memory', sa.Column('memory', sa.Text)),
            ('owner_id', sa.Column('owner_id', sa.Numeric)),
            ('proxy_url', sa.Column('proxy_url', sa.Text)),
            ('public_host', sa.Column('public_host', sa.Text)),
            ('spawner', sa.Column('spawner', sa.Text)),
            (
                'spawner_application_instance_id',
                sa.Column('spawner_application_instance_id', sa.Text),
            ),
            (
                'application_template_name',
                sa.Column('application_template_name', sa.Text),
            ),
            ('spawner_cpu', sa.Column('spawner_cpu', sa.Text)),
            ('spawner_created_at', sa.Column('spawner_created_at',
                                             sa.DateTime)),
            ('spawner_memory', sa.Column('spawner_memory', sa.Text)),
            ('spawner_stopped_at', sa.Column('spawner_stopped_at',
                                             sa.DateTime)),
            ('state', sa.Column('state', sa.Text)),
        ],
    )
Example #14
class TagsClassifierPredictionPipeline(_PipelineDAG):
    table_config = TableConfig(
        table_name='interactions_dataset_with_tags',
        field_mapping=[
            ("id", sa.Column("id", sa.Text, primary_key=True)),
            ("policy_feedback_notes",
             sa.Column("policy_feedback_notes", sa.Text)),
            ("tags_prediction", sa.Column("tags_prediction", sa.Text)),
            ("tag_1", sa.Column("tag_1", sa.Text)),
            (
                "probability_score_tag_1",
                sa.Column("probability_score_tag_1", sa.Numeric),
            ),
            ("tag_2", sa.Column("tag_2", sa.Text)),
            (
                "probability_score_tag_2",
                sa.Column("probability_score_tag_2", sa.Numeric),
            ),
            ("tag_3", sa.Column("tag_3", sa.Text)),
            (
                "probability_score_tag_3",
                sa.Column("probability_score_tag_3", sa.Numeric),
            ),
            ("tag_4", sa.Column("tag_4", sa.Text)),
            (
                "probability_score_tag_4",
                sa.Column("probability_score_tag_4", sa.Numeric),
            ),
            ("tag_5", sa.Column("tag_5", sa.Text)),
            (
                "probability_score_tag_5",
                sa.Column("probability_score_tag_5", sa.Numeric),
            ),
        ],
    )

    controller_pipeline = InteractionsDatasetPipeline
    dependencies = [InteractionsDatasetPipeline]

    query = f"""
             SELECT id, policy_feedback_notes FROM "{controller_pipeline.table_config.schema}"."{controller_pipeline.table_config.table_name}"
             WHERE policy_feedback_notes != '' AND policy_areas IS NOT NULL
             AND (
                  created_on > current_date - INTERVAL '6 weeks'
                  OR
                  modified_on > current_date - INTERVAL '6 weeks'
                 )
             """

    def get_fetch_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id='make-prediction',
            python_callable=make_prediction,
            queue='tensorflow',
            provide_context=True,
            op_args=[self.target_db, self.query, self.table_config.table_name],
            retries=self.fetch_retries,
        )
Example #15
    def test_table_property_returns_sqlalchemy_table(self):
        single_table_config = TableConfig(
            table_name="test",
            field_mapping=[("id", Column("id", Integer))],
        )

        table = single_table_config.table
        assert type(table) is sqlalchemy.Table
        assert {col.name for col in table.columns} == {"id"}
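A hypothetical table property consistent with that assertion (the real implementation may differ) would materialise the immediate columns into a fresh sqlalchemy.Table:

    @property
    def table(self):
        # Copy the columns so the config can be reused; a Column may only
        # be attached to one Table.
        return sqlalchemy.Table(
            self.table_name,
            sqlalchemy.MetaData(),
            *[column.copy() for _, column in self.columns],
        )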
Example #16
class TestDAG(_PandasPipelineWithPollingSupport):
    use_polling = False
    data_getter = lambda: None  # noqa: E731
    table_config = TableConfig(
        table_name="test-table",
        field_mapping=(
            ("id", sa.Column("id", sa.Integer, primary_key=True)),
            ("data", sa.Column("data", sa.Integer)),
        ),
    )
Example #17
class MinisterialInteractionsDashboardPipeline(_SQLPipelineDAG):
    """
    A dashboard of Data Hub interactions involving Ministers and certain senior staff
    """

    start_date = datetime(2020, 5, 6)
    dependencies = [
        AdvisersDatasetPipeline,
        CompaniesDatasetPipeline,
        InteractionsDatasetPipeline,
        ONSPostcodePipeline,
    ]
    table_config = TableConfig(
        table_name='ministerial_interactions',
        field_mapping=[
            (None,
             sa.Column('id', sa.Integer, primary_key=True,
                       autoincrement=True)),
            ('interaction_id', sa.Column('interaction_id', UUID)),
            ('interaction_link', sa.Column('interaction_link', sa.Text)),
            ('company_name', sa.Column('company_name', sa.Text)),
            ('company_link', sa.Column('company_link', sa.Text)),
            ('company_country', sa.Column('company_country', sa.Text)),
            ('company_postcode', sa.Column('company_postcode', sa.Text)),
            (
                'company_address_latitude_longitude',
                sa.Column('company_address_latitude_longitude', sa.Text),
            ),
            ('company_uk_region', sa.Column('company_uk_region', sa.Text)),
            (
                'company_number_of_employees',
                sa.Column('company_number_of_employees', sa.Text),
            ),
            ('company_turnover', sa.Column('company_turnover', sa.Text)),
            ('company_one_list_tier',
             sa.Column('company_one_list_tier', sa.Text)),
            ('company_sector', sa.Column('company_sector', sa.Text)),
            ('interaction_date', sa.Column('interaction_date', sa.Date)),
            ('adviser_id', sa.Column('adviser_id', UUID)),
            ('adviser_name', sa.Column('adviser_name', sa.Text)),
            ('interaction_subject', sa.Column('interaction_subject', sa.Text)),
            ('communication_channel',
             sa.Column('communication_channel', sa.Text)),
            ('policy_areas', sa.Column('policy_areas', sa.Text)),
            ('policy_feedback_notes',
             sa.Column('policy_feedback_notes', sa.Text)),
            ('policy_issue_types', sa.Column('policy_issue_types', sa.Text)),
            (
                'dun_and_bradstreet_linked_record',
                sa.Column('dun_and_bradstreet_linked_record', sa.Text),
            ),
            ('service', sa.Column('service', sa.Text)),
        ],
    )
    query = '''
Example #18
class ZendeskDITTicketsPipeline(_PipelineDAG):
    schedule_interval = "@daily"
    allow_null_columns = True
    use_utc_now_as_source_modified = True
    table_config = TableConfig(
        schema="zendesk",
        table_name="dit_tickets",
        transforms=[transforms_fields_dit],
        field_mapping=[
            ("id", sa.Column("id", sa.BigInteger, primary_key=True)),
            ("via", sa.Column("via", sa.JSON)),
            ("created_at", sa.Column("created_at", sa.DateTime)),
            ("solved_at", sa.Column("solved_at", sa.DateTime)),
            ("updated_at", sa.Column("updated_at", sa.DateTime)),
            (
                "full_resolution_time_in_minutes",
                sa.Column("full_resolution_time_in_minutes", sa.Integer),
            ),
            ("subject", sa.Column("subject", sa.String)),
            ("description", sa.Column("description", sa.String)),
            ("service", sa.Column("service", sa.String)),
            ("priority", sa.Column("priority", sa.String)),
            ("status", sa.Column("status", sa.String)),
            ("recipient", sa.Column("recipient", sa.String)),
            ("assignee_id", sa.Column("assignee_id", sa.BigInteger)),
            ("organization_id", sa.Column("organization_id", sa.BigInteger)),
            ("group_id", sa.Column("group_id", sa.BigInteger)),
            ("tags", sa.Column("tags", sa.ARRAY(sa.String))),
            ("satisfaction_rating", sa.Column("satisfaction_rating", sa.JSON)),
            (
                "sharing_agreement_ids",
                sa.Column("sharing_agreement_ids", sa.ARRAY(sa.BigInteger)),
            ),
            ("brand_id", sa.Column("brand_id", sa.BigInteger)),
            ("service", sa.Column("service", sa.String)),
            ("e_responder", sa.Column("e_responder", sa.ARRAY(sa.String))),
            ("e_sector", sa.Column("e_sector", sa.ARRAY(sa.String))),
            ("e_policy_area", sa.Column("e_policy_area", sa.ARRAY(sa.String))),
            ("e_dit_old", sa.Column("e_dit_old", sa.String)),
        ],
    )

    def get_fetch_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id="fetch-daily-tickets",
            python_callable=fetch_daily_tickets,
            provide_context=True,
            op_args=[
                self.table_config.schema, self.table_config.table_name, "dit"
            ],
            retries=self.fetch_retries,
        )
Example #19
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.table_config = TableConfig(
            schema="hmrc",
            table_name=self.table_name,
            transforms=[
                # This is based on https://www.uktradeinfo.com/media/vrehes3e/tech_spec_sesa16.doc,
                # which has an "a" in the file name which doesn't match the files we use, but it's the
                # best we have
                lambda record, table_config, contexts: {
                    "sitc_2": record[0][0:2],
                    "sitc_0": record[0][2:3],
                    "year": record[0][3:7],
                    "month": record[0][7:9],
                    "estimated_value": record[0][9:24],
                    "_source_name": record[1],
                }
            ],
            field_mapping=[
                (
                    None,
                    sa.Column(
                        "id", sa.BigInteger, primary_key=True, autoincrement=True
                    ),
                ),
                ("sitc_2", sa.Column("sitc_2", sa.String(2), nullable=False)),
                ("sitc_0", sa.Column("sitc_0", sa.String(1), nullable=False)),
                ("year", sa.Column("year", sa.SmallInteger, nullable=False)),
                ("month", sa.Column("month", sa.SmallInteger, nullable=False)),
                (
                    "estimated_value",
                    sa.Column("estimated_value", sa.BigInteger, nullable=False),
                ),
                ("_source_name", sa.Column("_source_name", sa.String, nullable=False)),
            ],
            indexes=[
                LateIndex("sitc_2"),
                LateIndex("sitc_0"),
                LateIndex(["year", "month"]),
                LateIndex("month"),
                LateIndex("_source_name"),
            ],
            check_constraints=[
                sa.CheckConstraint("1 <= month and month <= 12"),
                sa.CheckConstraint("1500 <= year AND year <= 3000"),
                sa.CheckConstraint("sitc_2 ~ '^[0-9][0-9]$'"),
                sa.CheckConstraint("sitc_0 ~ '^[0-9]$'"),
                sa.CheckConstraint("estimated_value >= 0"),
            ],
        )
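The transform above is plain fixed-width slicing: positions 0-1 hold the 2-digit SITC, position 2 the 1-digit SITC, 3-6 the year, 7-8 the month, and 9-23 the estimated value. With an invented record:

raw = ('012202005000000001234567', 'sesa16_made_up_source.txt')
parsed = {
    'sitc_2': raw[0][0:2],            # '01'
    'sitc_0': raw[0][2:3],            # '2'
    'year': raw[0][3:7],              # '2020'
    'month': raw[0][7:9],             # '05'
    'estimated_value': raw[0][9:24],  # '000000001234567'
    '_source_name': raw[1],
}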
Example #20
    def test_related_tables_property_returns_directly_related_tables(self):
        id_col = ("id", Column("id", Integer))
        foo_col = ("foo", Column("foo", Integer))
        bar_col = ("bar", Column("bar", Integer))
        config = TableConfig(
            table_name="test",
            field_mapping=[
                id_col,
                (
                    "foo",
                    TableConfig(
                        table_name="foo",
                        field_mapping=[
                            foo_col,
                            (
                                "bar",
                                TableConfig(table_name="bar",
                                            field_mapping=[bar_col]),
                            ),
                        ],
                    ),
                ),
            ],
        )

        assert len(config.related_table_configs) == 1
        assert config.related_table_configs[0][0] == "foo"
        assert type(config.related_table_configs[0][1]) is TableConfig
        assert config.related_table_configs[0][1].table_name == "foo"

        nested_config = config.related_table_configs[0][1]
        assert len(nested_config.related_table_configs)
        assert nested_config.related_table_configs[0][0] == "bar"
        assert type(nested_config.related_table_configs[0][1]) is TableConfig
        assert nested_config.related_table_configs[0][1].table_name == "bar"
Example #21
    def test_tables_property_returns_all_tables(self):
        id_col = ("id", Column("id", Integer))
        name_col = ("name", Column("name", String))
        nested_table_config = TableConfig(
            table_name="test",
            field_mapping=[
                id_col,
                (
                    "relation",
                    TableConfig(
                        table_name="relation",
                        field_mapping=[("foo", Column("foo", String))],
                    ),
                ),
                name_col,
            ],
        )

        tables = nested_table_config.tables
        assert all(type(table) is sqlalchemy.Table for table in tables)
        assert [table.name for table in tables] == ["test", "relation"]
        assert {col.name for col in tables[0].columns} == {"id", "name"}
        assert {col.name for col in tables[1].columns} == {"foo"}
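One plausible shape for the tables property exercised above (a sketch, not the library's actual code): the parent table first, then each related table, recursing depth-first.

    @property
    def tables(self):
        return [self.table] + [
            table
            for _, related_config in self.related_table_configs
            for table in related_config.tables
        ]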
Example #22
class ExportWinsHVCDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for Export wins HVC data."""

    source_url = '{0}/datasets/hvc'.format(config.EXPORT_WINS_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='export_wins__hvc_dataset',
        field_mapping=[
            ('campaign_id', sa.Column('campaign_id', sa.String)),
            ('financial_year', sa.Column('financial_year', sa.Integer)),
            ('id', sa.Column('id', sa.Integer, primary_key=True)),
            ('name', sa.Column('name', sa.String)),
        ],
    )
Example #23
class ContactsDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for ContactsDataset."""

    dependencies = [ConsentPipeline]
    source_url = '{0}/v4/dataset/contacts-dataset'.format(config.DATAHUB_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='data_hub__contacts',
        field_mapping=[
            ('address_1', sa.Column('address_1', sa.String)),
            ('address_2', sa.Column('address_2', sa.String)),
            ('address_country__name', sa.Column('address_country', sa.Text)),
            ('address_county', sa.Column('address_county', sa.String)),
            ('address_postcode', sa.Column('address_postcode', sa.String)),
            (
                'address_same_as_company',
                sa.Column('address_same_as_company', sa.Boolean),
            ),
            ('address_town', sa.Column('address_town', sa.String)),
            ('archived', sa.Column('archived', sa.Boolean)),
            ('archived_on', sa.Column('archived_on', sa.DateTime)),
            ('company_id', sa.Column('company_id', UUID)),
            ('created_on', sa.Column('date_added_to_datahub', sa.Date)),
            ('email', sa.Column('email', sa.String)),
            ('email_alternative', sa.Column('email_alternative', sa.String)),
            (None, sa.Column("email_marketing_consent", sa.Boolean, default=False)),
            ('id', sa.Column('id', UUID, primary_key=True)),
            ('job_title', sa.Column('job_title', sa.String)),
            ('modified_on', sa.Column('modified_on', sa.DateTime)),
            ('name', sa.Column('contact_name', sa.Text)),
            ('notes', sa.Column('notes', sa.Text)),
            ('primary', sa.Column('is_primary', sa.Boolean)),
            ('telephone_alternative', sa.Column('telephone_alternative', sa.String)),
            ('telephone_number', sa.Column('phone', sa.String)),
            ('created_by_id', sa.Column('created_by_id', UUID)),
        ],
    )

    def get_transform_operator(self) -> PythonOperator:
        return PythonOperator(
            task_id='update-contact-email-consent',
            python_callable=update_datahub_contact_consent,
            provide_context=True,
            op_args=[
                self.target_db,
                self.table_config.tables[0],  # pylint: disable=no-member
            ],
            retries=self.fetch_retries,
        )
Example #24
class DataWorkspaceUserPipeline(_DataWorkspacePipeline):
    """Pipeline meta object for data workspace user data."""

    source_url = f'{config.DATA_WORKSPACE_BASE_URL}/api/v1/account/users'
    table_config = TableConfig(
        table_name='dataworkspace__users',
        field_mapping=[
            ('email', sa.Column('email', sa.Text)),
            ('first_name', sa.Column('first_name', sa.Text)),
            ('id', sa.Column('id', sa.Numeric, primary_key=True)),
            ('is_staff', sa.Column('is_staff', sa.Boolean)),
            ('is_superuser', sa.Column('is_superuser', sa.Boolean)),
            ('last_name', sa.Column('last_name', sa.Text)),
        ],
    )
Example #25
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.table_config = TableConfig(
            schema='public_health_england',
            table_name=self.table_name,
            field_mapping=[
                ('date', sa.Column('date', sa.Date)),
                ('areaCode', sa.Column('area_code', sa.String)),
                ('areaName', sa.Column('area_name', sa.String)),
                ('newCasesByPublishDate', sa.Column('new_cases', sa.Integer)),
                ('cumCasesByPublishDate', sa.Column('cumulative_cases', sa.Integer)),
                ('newDeathsByPublishDate', sa.Column('new_deaths', sa.Integer)),
                ('cumDeathsByPublishDate', sa.Column('cumulative_deaths', sa.Integer)),
            ],
        )
Example #26
class ExportWinsBreakdownsDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for Export wins yearly export/non-export value."""

    source_url = '{0}/datasets/breakdowns'.format(config.EXPORT_WINS_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='export_wins__breakdowns_dataset',
        field_mapping=[
            ('breakdown_type', sa.Column('type', sa.String)),
            ('id', sa.Column('id', sa.Integer, primary_key=True)),
            ('value', sa.Column('value', sa.BigInteger)),
            ('win__id', sa.Column('win_id', UUID)),
            ('year', sa.Column('year', sa.Integer)),
        ],
    )
Example #27
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.table_config = TableConfig(
            schema=self.schema_name,
            table_name=self.table_name,
            field_mapping=[
                ("id", sa.Column("id", sa.Text, primary_key=True)),
                ("match_id", sa.Column("match_id", sa.Integer)),
                ("similarity", sa.Column("similarity", sa.Text)),
            ],
        )

        self.start_date = self.controller_pipeline.start_date
        self.schedule_interval = self.controller_pipeline.schedule_interval
Example #28
    class TestDAG(_PipelineDAG):
        table_config = TableConfig(
            table_name="test-table",
            field_mapping=(
                ("id", sa.Column("id", sa.Integer, primary_key=True)),
                ("data", sa.Column("data", sa.Integer)),
            ),
        )

        def get_fetch_operator(self):
            return PythonOperator(
                task_id="fetch-data",
                python_callable=lambda: None,
                provide_context=True,
            )
Example #29
class MaxemailCampaignsPipeline(_ActivityStreamPipeline):
    name = "maxemail-campaigns"
    index = "activities"
    table_config = TableConfig(
        schema="dit",
        table_name="maxemail__campaigns",
        field_mapping=[
            (
                ("object", "dit:maxemail:Campaign:id"),
                sa.Column("id", sa.Integer, primary_key=True),
            ),
            (("object", "name"), sa.Column("title", sa.String,
                                           nullable=False)),
            (
                ("object", "content"),
                sa.Column("description", sa.String, nullable=False),
            ),
            (
                ("object", "dit:emailSubject"),
                sa.Column("email_subject", sa.String, nullable=False),
            ),
            (
                ("actor", "name"),
                sa.Column("email_from_name", sa.String, nullable=False),
            ),
            (
                ("actor", "dit:emailAddress"),
                sa.Column("email_from_address",
                          sa.String,
                          index=True,
                          nullable=False),
            ),
            (
                ("published", ),
                sa.Column("started", sa.DateTime, index=True, nullable=False),
            ),
        ],
    )

    query = {
        "bool": {
            "filter": [{
                "term": {
                    "object.type": "dit:maxemail:Campaign"
                }
            }]
        }
    }
Example #30
class ExportWinsAdvisersDatasetPipeline(_DatasetPipeline):
    """Pipeline meta object for Export wins advisers."""

    source_url = '{0}/datasets/advisors'.format(config.EXPORT_WINS_BASE_URL)
    table_config = TableConfig(
        schema='dit',
        table_name='export_wins__advisers_dataset',
        field_mapping=[
            ('hq_team_display', sa.Column('hq_team', sa.String)),
            ('id', sa.Column('id', sa.Integer, primary_key=True)),
            ('location', sa.Column('location', sa.String)),
            ('name', sa.Column('name', sa.String)),
            ('team_type_display', sa.Column('team_type', sa.String)),
            ('win__id', sa.Column('win_id', UUID)),
        ],
    )