    def setUp(self) -> None:

        self.project_id = "fake-recidiviz-project"
        self.mock_view_dataset_name = "my_views_dataset"
        self.mock_dataset = bigquery.dataset.DatasetReference(
            self.project_id, self.mock_view_dataset_name)

        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.project_number_patcher = patch(
            "recidiviz.utils.metadata.project_number")
        self.project_number_patcher.start().return_value = "123456789"

        self.bq_client_patcher = mock.patch(
            "recidiviz.calculator.dataflow_metric_table_manager.BigQueryClientImpl"
        )
        self.mock_client = self.bq_client_patcher.start().return_value

        self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset
        self.mock_dataflow_tables = [
            bigquery.TableReference(self.mock_dataset, "fake_table_1"),
            bigquery.TableReference(self.mock_dataset, "fake_table_2"),
        ]
        self.mock_client.list_tables.return_value = self.mock_dataflow_tables
        self.mock_client.project_id.return_value = self.project_id

        self.fake_dataflow_metrics_dataset = "fake_dataflow_metrics_dataset"
        self.fake_cold_storage_dataset = "fake_cold_storage_dataset"
        self.fake_max_jobs = 10
Example 2
    def __init__(self, project_id, dataset_id, sandbox_dataset_id):
        desc = (
            'Load a lookup of PPI branching rules represented in CSV files. '
            'Store observation rows that violate the rules in a sandbox table. '
            'Stage the cleaned rows in a sandbox table. '
            'Drop and create the observation table with rows from stage.')
        # TODO add depends_on after base classing manual_cleaning_rules.update_questiona_answers_not_mapped_to_omop
        super().__init__(issue_numbers=ISSUE_NUMBERS,
                         description=desc,
                         affected_datasets=[cdr_consts.RDR],
                         project_id=project_id,
                         dataset_id=dataset_id,
                         sandbox_dataset_id=sandbox_dataset_id,
                         affected_tables=[OBSERVATION])
        dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
        sandbox_dataset_ref = bigquery.DatasetReference(
            project_id, sandbox_dataset_id)

        self.rule_paths = PPI_BRANCHING_RULE_PATHS
        self.observation_table = bigquery.Table(
            bigquery.TableReference(dataset_ref, OBSERVATION))
        self.lookup_table = bigquery.TableReference(sandbox_dataset_ref,
                                                    RULES_LOOKUP_TABLE_ID)
        self.backup_table = bigquery.TableReference(
            sandbox_dataset_ref, OBSERVATION_BACKUP_TABLE_ID)
        self.stage_table = bigquery.TableReference(sandbox_dataset_ref,
                                                   OBSERVATION_STAGE_TABLE_ID)
Example 3
    def setUp(self) -> None:

        self.project_id = "fake-recidiviz-project"
        self.mock_view_dataset_name = "my_views_dataset"
        self.mock_dataset = bigquery.dataset.DatasetReference(
            self.project_id, self.mock_view_dataset_name)

        self.mock_dataset_resource = {
            "datasetReference": {
                "projectId": self.project_id,
                "datasetId": self.mock_view_dataset_name,
            },
        }
        self.mock_table_resource = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": self.mock_view_dataset_name,
                "tableId": "fake_table",
            },
        }

        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.project_number_patcher = patch(
            "recidiviz.utils.metadata.project_number")
        self.project_number_patcher.start().return_value = "123456789"

        self.bq_client_patcher = mock.patch(
            "recidiviz.calculator.calculation_data_storage_manager.BigQueryClientImpl"
        )
        self.mock_client = self.bq_client_patcher.start().return_value

        self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset
        self.mock_dataflow_tables = [
            bigquery.TableReference(self.mock_dataset, "fake_table_1"),
            bigquery.TableReference(self.mock_dataset, "fake_table_2"),
        ]
        self.mock_client.list_tables.return_value = self.mock_dataflow_tables
        self.mock_client.project_id.return_value = self.project_id

        self.data_storage_config_patcher = mock.patch(
            "recidiviz.calculator.dataflow_output_storage_config")
        self.mock_data_storage_config = self.data_storage_config_patcher.start()

        self.fake_dataflow_metrics_dataset = "fake_dataflow_metrics_dataset"
        self.fake_cold_storage_dataset = "fake_cold_storage_dataset"
        self.fake_max_jobs = 10

        self.mock_data_storage_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE.return_value = (
            self.fake_max_jobs)
        self.mock_data_storage_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET = (
            self.fake_cold_storage_dataset)

        app = Flask(__name__)
        app.register_blueprint(calculation_data_storage_manager_blueprint)
        app.config["TESTING"] = True
        self.client = app.test_client()
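Example 4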
 def patched_get_table(self, table_id):
     if table_id == self.source_table_id:
         return bigquery.Table(
             bigquery.TableReference(
                 bigquery.DatasetReference('project_name', 'dataset_name'),
                 'source_table'),
             schema=[
                 bigquery.SchemaField(
                     'clientId', 'STRING', 'NULLABLE',
                     ('Unhashed version of the Client ID for a'
                      ' given user associated with any given visit/session.'
                      )),
                 bigquery.SchemaField(
                     'fullVisitorId', 'STRING', 'NULLABLE',
                     'The unique visitor ID (also known as client ID).'),
                 bigquery.SchemaField(
                     'visitNumber', 'STRING', 'NULLABLE',
                     ('The session number for this user. '
                      'If this is the first session, then this is set to 1.'
                      )),
                 bigquery.SchemaField(
                     'totals', 'RECORD', 'NULLABLE',
                     'This section contains aggregate values across the session.',
                     [
                         bigquery.SchemaField(
                             'hits', 'INTEGER', 'NULLABLE',
                             'Total number of hits within the session.'),
                         bigquery.SchemaField(
                             'pageviews', 'INTEGER', 'NULLABLE',
                             'Total number of pageviews within the session.'
                         ),
                         bigquery.SchemaField(
                             'screenviews', 'INTEGER', 'NULLABLE',
                             'Total number of screenviews within the session.'
                         )
                     ])
             ])
     elif table_id == self.target_table_id:
         return bigquery.Table(
             bigquery.TableReference(
                 bigquery.DatasetReference('other_project_name',
                                           'other_dataset_name'),
                 'target_table'),
             schema=[
                 bigquery.SchemaField('clientId', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('fullVisitorId', 'STRING',
                                      'NULLABLE'),
                 bigquery.SchemaField(
                     'totals',
                     'RECORD',
                     'NULLABLE',
                     fields=[
                         bigquery.SchemaField('hits', 'INTEGER',
                                              'NULLABLE'),
                         bigquery.SchemaField('pageviews', 'INTEGER',
                                              'NULLABLE')
                     ])
             ])
Example 5
 def setUp(self):
     super(ResultRecorderTest, self).setUp()
     self.client = _build_mock_client()
     dataset_reference = bigquery.DatasetReference(_PROJECT_ID, _DATASET_ID)
     self.count_results_table_reference = bigquery.TableReference(
         dataset_reference, _TABLE_ID_FOR_COUNTS)
     self.id_results_table_reference = bigquery.TableReference(
         dataset_reference, _TABLE_ID_FOR_RESULTS)
     self.recorder = result_recorder.ResultRecorder(
         client=self.client,
         count_results_table_reference=self.count_results_table_reference,
         item_results_table_reference=self.id_results_table_reference)
Example 6
    def __init__(self,
                 project_id,
                 dataset_id,
                 sandbox_dataset_id,
                 export_date=None,
                 namer=None):
        desc = (f'All new pid/rid mappings will be identified via SQL and '
                f'stored, along with a shift integer, in a sandbox table.  '
                f'The table will be read to load into the primary pipeline '
                f'table, pipeline_tables.primary_pid_rid_mapping.')

        namer = dataset_id if not namer else namer

        super().__init__(issue_numbers=['DC1543'],
                         description=desc,
                         affected_datasets=[cdr_consts.RDR],
                         project_id=project_id,
                         dataset_id=dataset_id,
                         sandbox_dataset_id=sandbox_dataset_id,
                         table_namer=namer)

        # primary table ref
        dataset_ref = bigquery.DatasetReference(project_id, PIPELINE_TABLES)
        self.primary_mapping_table = bigquery.TableReference(
            dataset_ref, PRIMARY_PID_RID_MAPPING)

        # rdr table ref
        dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
        self.rdr_table = bigquery.TableReference(dataset_ref, PID_RID_MAPPING)

        # rdr sandbox table ref
        dataset_ref = bigquery.DatasetReference(project_id, sandbox_dataset_id)
        self.rdr_sandbox_table = bigquery.TableReference(
            dataset_ref, self.sandbox_table_for(PID_RID_MAPPING))

        # store fields as json object
        self.fields = fields_for(PRIMARY_PID_RID_MAPPING,
                                 sub_path='pipeline_tables')

        # set export date
        try:
            self.export_date = validate_bq_date_string(export_date)
            LOGGER.info(f'Using provided export_date: `{export_date}`')
        except (TypeError, ValueError):
            # otherwise, default to using today's date
            LOGGER.warning(
                f"Failed to validate the export_date:  '{export_date}'")
            self.export_date = datetime.now().strftime('%Y-%m-%d')
            LOGGER.warning(f"Setting export_date to now: '{self.export_date}'")
Example 7
    def test_add_missing_fields_to_schema_fields_with_same_name_different_mode(
            self):
        """Tests that the add_missing_fields_to_schema function raises an error when the user is trying to add a field
        with the same name but different mode as an existing field."""
        table_ref = bigquery.TableReference(self.mock_dataset,
                                            self.mock_table_id)
        schema_fields = [
            bigquery.SchemaField('fake_schema_field',
                                 'STRING',
                                 mode="NULLABLE")
        ]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table

        new_schema_fields = [
            bigquery.SchemaField('fake_schema_field',
                                 'STRING',
                                 mode="REQUIRED")
        ]

        with pytest.raises(ValueError):
            self.bq_client.add_missing_fields_to_schema(
                self.mock_dataset_id, self.mock_table_id, new_schema_fields)

        self.mock_client.update_table.assert_not_called()
Example 8
    def test_upload_data_tokyo_non_existing_dataset(self, project_id,
                                                    random_dataset_id,
                                                    bigquery_client):
        from google.cloud import bigquery

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        non_existing_tokyo_dataset = random_dataset_id
        non_existing_tokyo_destination = "{}.to_gbq_test".format(
            non_existing_tokyo_dataset)

        # Initialize table with sample data
        gbq.to_gbq(
            df,
            non_existing_tokyo_destination,
            project_id,
            credentials=self.credentials,
            location="asia-northeast1",
        )

        table = bigquery_client.get_table(
            bigquery.TableReference(
                bigquery.DatasetReference(project_id,
                                          non_existing_tokyo_dataset),
                "to_gbq_test",
            ))
        assert table.num_rows > 0
Example 9
    def create_external_tables(self, bucket_name, prefix, date, tables,
                               project, dataset, table_prefix, version):
        if table_prefix:
            table_prefix += "_"
        else:
            table_prefix = ""

        gcs_loc = f"gs://{bucket_name}/{prefix}/v{version}/{date}"

        client = bigquery.Client(project=project)

        dataset_ref = client.dataset(dataset)

        for leanplum_name in tables:
            table_name = f"{table_prefix}{leanplum_name}_v{version}_{date}"
            logging.info(f"Creating table {table_name}")

            table_ref = bigquery.TableReference(dataset_ref, table_name)
            table = bigquery.Table(table_ref)

            client.delete_table(table, not_found_ok=True)

            external_config = bigquery.ExternalConfig('CSV')
            external_config.source_uris = [f"{gcs_loc}/{leanplum_name}/*"]
            external_config.autodetect = True

            table.external_data_configuration = external_config

            client.create_table(table)
Example 10
    def test_bq_sql_aggregate_integer(self):
        project = 'project'
        dataset_id = 'dataset_id'
        table_id = 'table_id'
        name = 'name'
        i = 0
        dataset_ref = bigquery.DatasetReference(project, dataset_id)
        table_ref = bigquery.TableReference(dataset_ref, table_id)
        f = bigquery.SchemaField(name, 'INTEGER')

        expected = (
            "SELECT 'name' name, 'INTEGER' type, COUNT(name) count, AVG(name) average, "
            'STDDEV_SAMP(name) std, CAST(MAX(name) AS STRING) max, CAST(MIN(name) AS '
            'STRING) min, CAST(APPROX_TOP_COUNT(name, 1)[ORDINAL(1)].value AS STRING) '
            'mode, COUNT(*) - COUNT(name) miss_count, SAFE_DIVIDE(COUNT(*) - COUNT(name), '
            'COUNT(*)) miss_rate, 0 miss_days, COUNT(DISTINCT name) unique_count, '
            'SAFE_DIVIDE(COUNT(DISTINCT name), COUNT(name)) unique_rate, '
            'CAST(APPROX_QUANTILES(name, 4)[ORDINAL(2)] AS STRING) quantile4_1, '
            'CAST(APPROX_QUANTILES(name, 4)[ORDINAL(3)] AS STRING) median, '
            'CAST(APPROX_QUANTILES(name, 4)[ORDINAL(4)] AS STRING) quantile4_3, '
            'CAST(APPROX_QUANTILES(name, 100)[ORDINAL(2)] AS STRING) quantile100_1, '
            'CAST(APPROX_QUANTILES(name, 100)[ORDINAL(100)] AS STRING) quantile100_99, '
            'COUNTIF(name >= 0) not_negatives, COUNTIF(name = 0) zeros, 0 empty_strings, '
            '0 ord FROM dataset_id.table_id')

        actual = bq_sql.aggregate(f, table_ref, i, '""')

        self.assertEqual(actual, expected)
Example 11
def create_bq_view_of_joined_features_and_entities(
        source: BigQuerySource, entity_source: BigQuerySource,
        entity_names: List[str]) -> BigQuerySource:
    """
    Creates BQ view that joins tables from `source` and `entity_source` with join key derived from `entity_names`.
    Returns BigQuerySource with reference to created view.
    """
    bq_client = bigquery.Client()

    source_ref = table_reference_from_string(source.bigquery_options.table_ref)
    entities_ref = table_reference_from_string(
        entity_source.bigquery_options.table_ref)

    destination_ref = bigquery.TableReference(
        bigquery.DatasetReference(source_ref.project, source_ref.dataset_id),
        f"_view_{source_ref.table_id}_{datetime.now():%Y%m%d%H%M%s}",
    )

    view = bigquery.Table(destination_ref)
    view.view_query = JOIN_TEMPLATE.format(
        entities=entities_ref,
        source=source_ref,
        entity_key=" AND ".join(
            [f"source.{e} = entities.{e}" for e in entity_names]),
    )
    view.expires = datetime.now() + timedelta(days=1)
    bq_client.create_table(view)

    return BigQuerySource(
        event_timestamp_column=source.event_timestamp_column,
        created_timestamp_column=source.created_timestamp_column,
        table_ref=f"{view.project}:{view.dataset_id}.{view.table_id}",
        field_mapping=source.field_mapping,
        date_partition_column=source.date_partition_column,
    )
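A hedged usage sketch for the helper above, assuming the older Feast `BigQuerySource` API used elsewhere in this collection; the project, dataset, and `driver_id` entity column below are placeholders, not values from the source.

# Placeholder table references in the "project:dataset.table" format parsed by
# table_reference_from_string; "driver_id" is a made-up entity column.
features = BigQuerySource(
    event_timestamp_column="event_timestamp",
    table_ref="my-project:feast.driver_features")
entities = BigQuerySource(
    event_timestamp_column="event_timestamp",
    table_ref="my-project:feast.driver_entities")

joined_view = create_bq_view_of_joined_features_and_entities(
    features, entities, entity_names=["driver_id"])
print(joined_view.bigquery_options.table_ref)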
Example 12
def upload_csv_data_to_bq_table(client, dataset_id, table_name, fq_file_path,
                                write_disposition):
    """
    Uploads data from local csv file to bigquery table

    :param client: an instantiated bigquery client object
    :param dataset_id: identifies the dataset
    :param table_name: identifies the table name where data needs to be uploaded
    :param fq_file_path: Fully qualified path to the csv file which needs to be uploaded
    :param write_disposition: Write disposition for job choose b/w write_empty, write_append, write_truncate
    :return: job result
    """
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_ref = bigquery.TableReference(dataset_ref, table_name)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = write_disposition

    LOGGER.info(f"Uploading {fq_file_path} data to {dataset_id}.{table_name}")
    with open(fq_file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file,
                                          table_ref,
                                          job_config=job_config)
    try:
        result = job.result()  # Waits for table load to complete.
    except (BadRequest, OSError, AttributeError, TypeError, ValueError) as exp:
        message = f"Unable to load data to table {table_name}"
        LOGGER.exception(message)
        raise exp

    return result
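A minimal usage sketch for the helper above; the project, dataset, table, and CSV path are placeholder values, not names taken from the source.

from google.cloud import bigquery

# Placeholder identifiers, for illustration only.
client = bigquery.Client(project="my-project")
result = upload_csv_data_to_bq_table(
    client,
    dataset_id="lookup_dataset",
    table_name="ppi_branching_rules",
    fq_file_path="/tmp/ppi_branching_rules.csv",
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
print(f"Loaded {result.output_rows} rows")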
Example 13
def stage_entities_to_bq(entity_source: pd.DataFrame, project: str,
                         dataset: str) -> BigQuerySource:
    """
    Stores given (entity) dataframe as new table in BQ. Name of the table generated based on current time.
    Table will expire in 1 day.
    Returns BigQuerySource with reference to created table.
    """
    bq_client = bigquery.Client()
    destination = bigquery.TableReference(
        bigquery.DatasetReference(project, dataset),
        f"_entities_{datetime.now():%Y%m%d%H%M%s}",
    )

    # prevent casting ns -> ms exception inside pyarrow
    entity_source["event_timestamp"] = entity_source[
        "event_timestamp"].dt.floor("ms")

    load_job: bigquery.LoadJob = bq_client.load_table_from_dataframe(
        entity_source, destination)
    load_job.result()  # wait until complete

    dest_table: bigquery.Table = bq_client.get_table(destination)
    dest_table.expires = datetime.now() + timedelta(days=1)
    bq_client.update_table(dest_table, fields=["expires"])

    return BigQuerySource(
        event_timestamp_column="event_timestamp",
        table_ref=
        f"{destination.project}:{destination.dataset_id}.{destination.table_id}",
    )
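A small usage sketch, assuming a pandas dataframe of entity rows; the project, dataset, and `driver_id` column are placeholders, and the `.bigquery_options.table_ref` access mirrors the earlier view-creation example.

from datetime import datetime

import pandas as pd

# Placeholder entity rows; only "event_timestamp" is required by the helper.
entities = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": pd.to_datetime([datetime.utcnow(), datetime.utcnow()]),
})
staged = stage_entities_to_bq(entities, "my-project", "feast_staging")
print(staged.bigquery_options.table_ref)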
Example 14
    def setUp(self):
        self.project_id = 'foo_project'
        self.dataset_id = 'foo_dataset'
        self.sandbox_dataset_id = 'foo_sandbox'
        self.mapping_dataset_id = 'foo_mapping_dataset'
        self.client = None
        self.maxDiff = None

        self.hpo_list = [{
            "hpo_id": "hpo_1",
            "name": "hpo_name_1"
        }, {
            "hpo_id": "hpo_2",
            "name": "hpo_name_2"
        }]

        self.fields = [{
            "type": "integer",
            "name": "foo_id",
            "mode": "nullable",
            "description": "The foo_id used in the foo table."
        }, {
            "type": "string",
            "name": "src_id",
            "mode": "nullable",
            "description": "The provenance of the data associated with the foo_id."
        }]

        mapped_table_names = [
            cdm_table for cdm_table in common.AOU_REQUIRED if cdm_table not in
            [common.PERSON, common.DEATH, common.FACT_RELATIONSHIP]
        ]

        mapping_table_names = [
            gen_ext.MAPPING_PREFIX + cdm_table
            for cdm_table in mapped_table_names
        ]

        dataset_ref = bigquery.DatasetReference(self.project_id,
                                                self.dataset_id)

        self.mapping_table_objs = [
            bigquery.TableReference(dataset_ref, table_name)
            for table_name in mapping_table_names + mapped_table_names
        ]

        self.rule_instance = gen_ext.GenerateExtTables(self.project_id,
                                                       self.dataset_id,
                                                       self.sandbox_dataset_id,
                                                       self.mapping_dataset_id)

        self.assertEqual(self.rule_instance.project_id, self.project_id)
        self.assertEqual(self.rule_instance.dataset_id, self.dataset_id)
        self.assertEqual(self.rule_instance.sandbox_dataset_id,
                         self.sandbox_dataset_id)
Example 15
def upload_to_bq(bq_filename):
    """Uploads json file from GCS to BigQuery"""
    bq_client = bigquery.Client(project=PROJECT_ID)
    logging.debug("uploading to BQ")
    job_id = 'bq_import_{}'.format(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    gcs_file_path = 'gs://' + GCS_BUCKET + '/' + bq_filename

    job_config = bigquery.job.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField(
            'project_id',
            'STRING',
            description='GCP project that hosts the public dataset'),
        bigquery.SchemaField('dataset_id',
                             'STRING',
                             description='BigQuery dataset'),
        bigquery.SchemaField('table_id',
                             'STRING',
                             description='Table id (name) of public dataset'),
        bigquery.SchemaField('dataset_description', 'STRING'),
        bigquery.SchemaField(
            'table_description',
            'STRING',
        ),
        bigquery.SchemaField('table_created',
                             'TIMESTAMP',
                             description='Datetime when table was created'),
        bigquery.SchemaField(
            'table_modified',
            'TIMESTAMP',
            description='Datetime when the table was last modified (rows inserted)'),
        bigquery.SchemaField('table_num_bytes',
                             'INTEGER',
                             description='Size of table in bytes'),
        bigquery.SchemaField('table_num_rows',
                             'INTEGER',
                             description='Number of rows in the table'),
        bigquery.SchemaField(
            'table_type',
            'STRING',
            description='Whether table is normal table, view or BQ ML model'),
        bigquery.SchemaField(
            'table_partitioning_type',
            'STRING',
            description='Whether and how table is partitioned'),
    ]
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    dataset_ref = bq_client.dataset(OUT_DATASET_ID)
    table_ref = bigquery.TableReference(dataset_ref, OUT_TABLE_ID)
    job = bigquery.LoadJob(job_id, [gcs_file_path],
                           client=bq_client,
                           destination=table_ref,
                           job_config=job_config)
    job.result()
    logging.debug("done")
Example 16
def table_reference_from_string(table_ref: str):
    """
    Parses reference string with format "{project}:{dataset}.{table}" into bigquery.TableReference
    """
    project, dataset_and_table = table_ref.split(":")
    dataset, table_id = dataset_and_table.split(".")
    return bigquery.TableReference(bigquery.DatasetReference(project, dataset),
                                   table_id)
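A quick sanity check of the parser above; the reference string is a made-up example.

ref = table_reference_from_string("my-project:analytics.events")
assert ref.project == "my-project"
assert ref.dataset_id == "analytics"
assert ref.table_id == "events"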
Example 17
 def get_table_if_exists(self, table_name):
     table_ref = bigquery.TableReference(self.dataset_ref, table_name)
     try:
         table = self.client.get_table(table_ref)
     except NotFound:
         return None
     else:
         return table
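A usage sketch, assuming an instance (here called `bq_helper`, a made-up name) that exposes the `dataset_ref` and `client` attributes the method relies on.

table = bq_helper.get_table_if_exists("observation")
if table is None:
    print("table does not exist yet")
else:
    print(f"found {table.table_id} with {len(table.schema)} columns")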
Example 18
def get_table_refs(bq_client: bigquery.Client,
                   bq_storage_client: bigquery_storage.BigQueryReadClient,
                   dataset: bigquery.Dataset):
    query = f"select table_id from {dataset.project}.{dataset.dataset_id}.__TABLES__"
    df: pd.DataFrame = bq_client.query(query).to_dataframe(
        bqstorage_client=bq_storage_client)
    for table_id in df.table_id:
        if table_id is not None:
            yield bigquery.TableReference(dataset.reference, table_id)
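A usage sketch for the generator above; the project and dataset names are placeholders, and the storage client is only used to speed up the `to_dataframe` call.

from google.cloud import bigquery, bigquery_storage

# Placeholder project and dataset names.
bq_client = bigquery.Client(project="my-project")
bq_storage_client = bigquery_storage.BigQueryReadClient()
dataset = bq_client.get_dataset("my_dataset")

for table_ref in get_table_refs(bq_client, bq_storage_client, dataset):
    print(table_ref.table_id)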
Example 19
 def _table_from_ft(ft_schema: dict) -> bigquery.Table:
     """Create a local representation of a BigQuery table"""
     # A "TableSchema" is just a sequence of SchemaFields https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.table.Table.html
     schema = list(map(_create_field_schema, ft_schema['columns']))
     table = bigquery.Table(
         bigquery.TableReference(ds, to_safe_name(ft_schema['name'])),
         schema)
     table.description = ft_schema.get('description', '')
     return table
Example 20
def copy_table():
    """Copy temporary table to final destination"""

    logging.info("copy table")
    dataset_ref = bigquery.DatasetReference(GCP_PROJECT, DATASET_NAME)
    table_ref = bigquery.TableReference(dataset_ref, TABLE_NAME)
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, table_ref, job_config=copyjob_config)
Example 21
 def try_get_table():
     try:
         table = bq_client.get_table(
             bigquery.TableReference(
                 bigquery.DatasetReference(bq_project, bq_dataset),
                 bq_table_id))
     except NotFound:
         return None, False
     else:
         return table, True
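Example 22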
    def setUp(self):

        self.project_id = 'fake-recidiviz-project'
        self.mock_view_dataset_name = 'my_views_dataset'
        self.mock_dataset = bigquery.dataset.DatasetReference(
            self.project_id, self.mock_view_dataset_name)

        self.project_id_patcher = patch('recidiviz.utils.metadata.project_id')
        self.project_id_patcher.start().return_value = self.project_id
        self.project_number_patcher = patch(
            'recidiviz.utils.metadata.project_number')
        self.project_number_patcher.start().return_value = '123456789'

        self.bq_client_patcher = mock.patch(
            'recidiviz.calculator.calculation_data_storage_manager.BigQueryClientImpl'
        )
        self.mock_client = self.bq_client_patcher.start().return_value

        self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset
        self.mock_dataflow_tables = [
            bigquery.TableReference(self.mock_dataset, 'fake_table_1'),
            bigquery.TableReference(self.mock_dataset, 'fake_table_2')
        ]
        self.mock_client.list_tables.return_value = self.mock_dataflow_tables
        self.mock_client.project_id.return_value = self.project_id

        self.data_storage_config_patcher = mock.patch(
            'recidiviz.calculator.calculation_data_storage_config')
        self.mock_data_storage_config = self.data_storage_config_patcher.start()

        self.fake_dataflow_metrics_dataset = 'fake_dataflow_metrics_dataset'
        self.fake_cold_storage_dataset = 'fake_cold_storage_dataset'
        self.fake_max_jobs = 10

        self.mock_data_storage_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE.return_value = self.fake_max_jobs
        self.mock_data_storage_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET = self.fake_cold_storage_dataset

        app = Flask(__name__)
        app.register_blueprint(calculation_data_storage_manager_blueprint)
        app.config['TESTING'] = True
        self.client = app.test_client()
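Example 23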
    def test_is_table_definition_in_match_with_bigquery(self):
        dataset_reference = bigquery.DatasetReference('project', 'dataset')
        table_reference = bigquery.TableReference(dataset_reference, 'table')
        schema = [
            bigquery.SchemaField('col1', 'INTEGER'),
            bigquery.SchemaField('col2', 'STRING')
        ]
        table = bigquery.Table(table_reference, schema)

        assert schema_mapper.is_table_definition_in_match_with_bigquery(
            schema, table)
Example 24
def deploy_features_table():
    """Copy the UNION temp features table to final destination"""

    logging.info("copy table")
    target_dataset_ref = bigquery.DatasetReference(GCP_PROJECT,
                                                   BQ_TARGET_DATASET)
    target_table_ref = bigquery.TableReference(target_dataset_ref, 'features')
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, target_table_ref, job_config=copyjob_config)
Example 25
def copy_tables_to_public_dataset():
    """copies all tables from internal dataset to BigQuery public dataset

    Service account which is used for access to public dataset project is stored in GCS.

    """

    gcs = storage.Client(project=GCP_PROJECT)

    bucket = gcs.bucket(GCS_BUCKET)
    sa_blob = bucket.blob(SERVICE_ACCOUNT_FILENAME)
    sa_blob.reload()
    f = BytesIO()
    sa_blob.download_to_file(f)
    sa_content = f.getvalue().decode('utf8')
    f.close()
    sa_content = json.loads(sa_content)
    credentials = service_account.Credentials.from_service_account_info(
        sa_content)

    bq = bigquery.Client(project=GCP_PROJECT, credentials=credentials)

    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED

    source_dataset = bigquery.DatasetReference(GCP_PROJECT, BQ_DATASET)
    destination_dataset = bigquery.DatasetReference(BQ_PUBLIC_PROJECT,
                                                    BQ_PUBLIC_DATASET)
    table_refs = []
    tables_res = bq.list_tables(source_dataset)
    for table_ref in tables_res:
        table_refs.append(table_ref)

    for table_ref in table_refs:
        table_name = table_ref.table_id
        source_table = bigquery.TableReference(source_dataset, table_name)
        destination_table = bigquery.TableReference(destination_dataset,
                                                    table_name)
        bq.copy_table([source_table], destination_table, job_config=job_config)
        logging.info(f"copying table {table_name} to public dataset")
Example 26
    def test_add_missing_fields_to_schema(self):
        """Tests that the add_missing_fields_to_schema function calls the client to update the table."""
        table_ref = bigquery.TableReference(self.mock_dataset, self.mock_table_id)
        schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table

        new_schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]

        self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id, self.mock_table_id, new_schema_fields)

        self.mock_client.update_table.assert_called()
Example 27
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset,
                                        feast_client: Client,
                                        feast_spark_client: SparkClient):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id",
                    description="S2id",
                    value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=
            f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Example 28
    def create_table_with_schema(self, dataset_id, table_id, schema_fields: List[bigquery.SchemaField]) -> \
            bigquery.Table:
        dataset_ref = self.dataset_ref_for_id(dataset_id)

        if self.table_exists(dataset_ref, table_id):
            raise ValueError(
                f"Trying to create a table that already exists: {dataset_id}.{table_id}."
            )

        table_ref = bigquery.TableReference(dataset_ref, table_id)
        table = bigquery.Table(table_ref, schema_fields)

        logging.info("Creating table %s.%s", dataset_id, table_id)
        return self.client.create_table(table)
Example 29
    def create_table(self, schema, dataset_id, table_id):
        """
        :param schema: list of bigquery.SchemaField objects describing the table
        :param dataset_id: identifies the dataset that will contain the table
        :param table_id: identifies the table to create
        :return: None; prints the fully qualified id of the created table
        """
        dataset_ref = self.get_dataset(dataset_id=dataset_id)
        table_ref = bigquery.TableReference(dataset_ref=dataset_ref,
                                            table_id=table_id)
        table = bigquery.Table(table_ref, schema=schema)
        table = self.client.create_table(table)  # API request
        print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                              table.table_id))
Example 30
    def test_remove_unused_fields_from_schema_no_missing_fields(self) -> None:
        """Tests that remove_unused_fields_from_schema() does nothing if there are no missing fields."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id)
        schema_fields = [bigquery.SchemaField("field_1", "STRING")]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table

        new_schema_fields = [bigquery.SchemaField("field_1", "STRING")]

        self.bq_client.remove_unused_fields_from_schema(
            self.mock_dataset_id, self.mock_table_id, new_schema_fields
        )

        self.mock_client.query.assert_not_called()