    def reimport_all(self):
        last_imported = ImportLog.objects.latest_in_category(
            'prescribing').current_at
        self.date = last_imported - relativedelta(years=5)
        while self.date <= last_imported:
            date_str = self.date.strftime('%Y-%m-%d')
            sql = ('SELECT pct AS pct_id, practice AS practice_id, '
                   'bnf_code AS presentation_code, items AS total_items, '
                   'net_cost, actual_cost, quantity, '
                   'FORMAT_TIMESTAMP("%%Y_%%m_%%d", month) AS processing_date '
                   'FROM ebmdatalab.hscic.normalised_prescribing_standard '
                   "WHERE month = '%s'" % date_str)
            table_name = "prescribing_%s" % date_str.replace('-', '_')
            bigquery.query_and_return('ebmdatalab', 'tmp_eu', table_name, sql)
            uri = "gs://ebmdatalab/tmp/%s-*.csv.gz" % table_name
            logger.info("Extracting data for %s" % self.date)
            client = bigquery.bigquery.Client(project='ebmdatalab')
            # reference the month's extract table in the tmp_eu dataset
            dataset = Dataset("tmp_eu", client)
            table = dataset.table(table_name)
            logger.info("Copying data for %s to cloud storage" % self.date)
            bigquery.copy_table_to_gcs(table, uri)
            with tempfile.NamedTemporaryFile(mode='wb') as tmpfile:
                logger.info("Importing data for %s" % self.date)
                bigquery.download_from_gcs(uri, tmpfile.name)
                with transaction.atomic():
                    self.drop_partition()
                    self.create_partition()
                    self.import_prescriptions(tmpfile.name)
                    self.create_partition_indexes()
                    self.add_parent_trigger()
            self.date += relativedelta(months=1)
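For orientation, here is a minimal, standalone sketch of the five-year month window the loop above walks over, using dateutil's relativedelta; the ImportLog lookup is replaced by a hard-coded example date, so the dates shown are purely illustrative.

from datetime import date

from dateutil.relativedelta import relativedelta

# Stand-in for ImportLog.objects.latest_in_category('prescribing').current_at
last_imported = date(2017, 9, 1)

current = last_imported - relativedelta(years=5)
months = []
while current <= last_imported:
    months.append(current.strftime('%Y-%m-%d'))
    current += relativedelta(months=1)

# 61 month-start dates, from '2012-09-01' up to and including '2017-09-01'
print(len(months), months[0], months[-1])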
Example No. 2
    def from_api_repr(cls, resource, client):
        """Factory:  construct a job given its API representation

        .. note::

           This method assumes that the project found in the resource matches
           the client's project.

        :type resource: dict
        :param resource: copy job representation returned from the API

        :type client: :class:`google.cloud.bigquery.client.Client`
        :param client: Client which holds credentials and project
                       configuration for the dataset.

        :rtype: :class:`google.cloud.bigquery.job.CopyJob`
        :returns: Job parsed from ``resource``.
        """
        name, config = cls._get_resource_config(resource)
        dest_config = config['destinationTable']
        dataset = Dataset(dest_config['datasetId'], client)
        destination = Table(dest_config['tableId'], dataset)
        sources = []
        for source_config in config['sourceTables']:
            dataset = Dataset(source_config['datasetId'], client)
            sources.append(Table(source_config['tableId'], dataset))
        job = cls(name, destination, sources, client=client)
        job._set_properties(resource)
        return job
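For context, a hedged sketch of the kind of resource dict this factory expects, modelled on the BigQuery REST representation of a copy job; the project, dataset and table names are illustrative, and field names may vary slightly between library versions.

# Illustrative only: ``from_api_repr`` takes the job name from ``jobReference``
# and the source/destination tables from ``configuration.copy``.
resource = {
    'jobReference': {'projectId': 'my-project', 'jobId': 'copy-job-1'},
    'configuration': {
        'copy': {
            'sourceTables': [
                {'projectId': 'my-project', 'datasetId': 'staging',
                 'tableId': 'events_2017'},
            ],
            'destinationTable': {
                'projectId': 'my-project', 'datasetId': 'archive',
                'tableId': 'events',
            },
        },
    },
}
# job = CopyJob.from_api_repr(resource, client)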
Example No. 3
def create_bigquery_views():
    """Create BigQuery views on the main prescribing data which map
    historic BNF codes to their current equivalent.

    If they already exist, do nothing.

    """
    # We have to create legacy and standard versions of the view, as a
    # legacy query cannot address a standard view, and vice versa, and
    # we use both flavours in our code.
    sql = """
    SELECT
      prescribing.sha AS sha,
      practices.ccg_id AS pct,
      prescribing.practice AS practice,
      COALESCE(bnf_map.current_bnf_code, prescribing.bnf_code)
        AS bnf_code,
      prescribing.bnf_name AS bnf_name,
      prescribing.items AS items,
      prescribing.net_cost AS net_cost,
      prescribing.actual_cost AS actual_cost,
      prescribing.quantity AS quantity,
      prescribing.month AS month
    FROM
      ebmdatalab.hscic.prescribing AS prescribing
    LEFT JOIN
      ebmdatalab.hscic.bnf_map AS bnf_map
    ON
      bnf_map.former_bnf_code = prescribing.bnf_code
    INNER JOIN
      ebmdatalab.hscic.practices  AS practices
    ON practices.code = prescribing.practice
    """
    client = bigquery.client.Client(project='ebmdatalab')
    dataset = Dataset("hscic", client)
    table = dataset.table('normalised_prescribing_standard')
    table.view_query = sql
    table.view_use_legacy_sql = False
    try:
        table.create()
    except Conflict:
        pass
    table = dataset.table('normalised_prescribing_legacy')
    sql = sql.replace('ebmdatalab.hscic.prescribing',
                      '[ebmdatalab:hscic.prescribing]')
    sql = sql.replace(
        'ebmdatalab.hscic.bnf_map',
        '[ebmdatalab:hscic.bnf_map]',
    )
    sql = sql.replace(
        'ebmdatalab.hscic.practices',
        '[ebmdatalab:hscic.practices]',
    )
    table.view_query = sql
    table.view_use_legacy_sql = True
    try:
        table.create()
    except Conflict:
        pass
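A hedged sketch of querying the two views once they exist, using the current google-cloud-bigquery client; the standard-SQL view is addressed with dotted names, while the legacy view needs use_legacy_sql=True and the bracketed [project:dataset.table] form.

from google.cloud import bigquery

client = bigquery.Client(project='ebmdatalab')

standard_sql = """
    SELECT bnf_code, SUM(items) AS items
    FROM ebmdatalab.hscic.normalised_prescribing_standard
    WHERE month = TIMESTAMP('2017-09-01')
    GROUP BY bnf_code
"""
standard_rows = client.query(standard_sql).result()

legacy_sql = """
    SELECT bnf_code, SUM(items) AS items
    FROM [ebmdatalab:hscic.normalised_prescribing_legacy]
    WHERE month = TIMESTAMP('2017-09-01')
    GROUP BY bnf_code
"""
job_config = bigquery.QueryJobConfig()
job_config.use_legacy_sql = True
legacy_rows = client.query(legacy_sql, job_config=job_config).result()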
Example No. 4
def ensure_dataset(
    project_id: str,
    dataset_id: str,
    location: str,
) -> Tuple[Client, Dataset]:
    """Create BigQuery dataset if not exists.

    Arguments:
        project_id {str} -- Project id
        dataset_id {str} -- Dataset id
        location {str} -- Dataset location

    Returns:
        Tuple[Client, Dataset] -- BigQuery Client and Dataset
    """
    client: Client = bigquery.Client(project=project_id, location=location)

    dataset_ref: bigquery.DatasetReference = client.dataset(dataset_id)

    if not dataset_exists(client, dataset_id):
        # Create dataset
        LOGGER.info(
            f'Creating dataset: {project_id}.{dataset_id} in '
            f'location: {location}', )

        client.create_dataset(dataset_ref, exists_ok=True)

        LOGGER.info(
            f'Successfully created dataset: {project_id}.{dataset_id} in '
            f'location: {location}', )

    return client, Dataset(dataset_ref)
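The dataset_exists helper used above is not shown in this example; one plausible implementation, hedged as an assumption, probes the API with get_dataset and treats a 404 as absence.

from google.api_core.exceptions import NotFound
from google.cloud.bigquery import Client


def dataset_exists(client: Client, dataset_id: str) -> bool:
    """Return True if the dataset can be fetched, False on a 404."""
    try:
        # Recent client versions accept a plain dataset ID string here.
        client.get_dataset(dataset_id)
        return True
    except NotFound:
        return False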
Example No. 5
    def test_from_query_job(self):
        from google.cloud.bigquery.dataset import Dataset
        from google.cloud.bigquery.job import QueryJob
        from google.cloud.bigquery._helpers import UDFResource
        DS_NAME = 'DATASET'
        RESOURCE_URI = 'gs://some-bucket/js/lib.js'
        client = _Client(self.PROJECT)
        job = QueryJob(
            self.JOB_NAME,
            self.QUERY,
            client,
            udf_resources=[UDFResource("resourceUri", RESOURCE_URI)])
        dataset = job.default_dataset = Dataset(DS_NAME, client)
        job.use_query_cache = True
        job.use_legacy_sql = True
        klass = self._getTargetClass()

        query = klass.from_query_job(job)

        self.assertEqual(query.name, self.JOB_NAME)
        self.assertEqual(query.query, self.QUERY)
        self.assertIs(query._client, client)
        self.assertIs(query._job, job)
        self.assertEqual(query.udf_resources, job.udf_resources)
        self.assertIs(query.default_dataset, dataset)
        self.assertTrue(query.use_query_cache)
        self.assertTrue(query.use_legacy_sql)
Example No. 6
    def create_dataset_by_name(self, name, expiration_hours=None):
        # type: (str, Optional[float]) -> None
        """Create a new dataset within the current project.

        Args:
          name: The name of the new dataset.
          expiration_hours: The default expiration time for tables within the dataset.
        """
        if name not in self.get_datasets():
            # Initialize the Dataset instead of passing a reference so we can set expiration hours.
            dataset = Dataset(DatasetReference(self.project_id, str(name)))
            if expiration_hours:
                dataset.default_table_expiration_ms = expiration_hours * (60 * 60 * 1000)
            self.create_dataset(dataset)
        else:
            logging.warning('Dataset {} already exists.'.format(name))
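A brief usage sketch; the warehouse wrapper instance is hypothetical, and 72 hours becomes default_table_expiration_ms = 72 * 60 * 60 * 1000 = 259200000 via the conversion above.

# Hypothetical wrapper instance exposing create_dataset_by_name as above.
warehouse.create_dataset_by_name('scratch_tables', expiration_hours=72)
# A second call only logs a warning, because the name is checked against
# get_datasets() before anything is created.
warehouse.create_dataset_by_name('scratch_tables', expiration_hours=72)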
Example No. 7
    def dataset(self, dataset_name):
        """Construct a dataset bound to this client.

        :type dataset_name: str
        :param dataset_name: Name of the dataset.

        :rtype: :class:`google.cloud.bigquery.dataset.Dataset`
        :returns: a new ``Dataset`` instance
        """
        return Dataset(dataset_name, client=self)
    def load_datasets(self, client, project):
        for k, v in (('raw', 'raw_dataset'), ('views', 'view_dataset')):
            dataset = self.settings[v].value
            dataset = Dataset.from_string('{0}.{1}'.format(project, dataset))
            try:
                setattr(DataSets, k, client.get_dataset(dataset))
                cprint('Already have dataset {}'.format(dataset), 'green')
            except NotFound:
                setattr(DataSets, k, client.create_dataset(dataset))
                cprint('Created dataset {}'.format(dataset), 'green')
Example No. 9
def test_create_dataset_w_client_location_w_dataset_location(
        PROJECT, DS_ID, LOCATION):
    PATH = "projects/%s/datasets" % PROJECT
    OTHER_LOCATION = "EU"
    RESOURCE = {
        "datasetReference": {
            "projectId": PROJECT,
            "datasetId": DS_ID
        },
        "etag": "etag",
        "id": "%s:%s" % (PROJECT, DS_ID),
        "location": OTHER_LOCATION,
    }
    client = make_client(location=LOCATION)
    conn = client._connection = make_connection(RESOURCE)

    ds_ref = DatasetReference(PROJECT, DS_ID)
    before = Dataset(ds_ref)
    before.location = OTHER_LOCATION
    after = client.create_dataset(before)

    assert after.dataset_id == DS_ID
    assert after.project == PROJECT
    assert after.etag == RESOURCE["etag"]
    assert after.full_dataset_id == RESOURCE["id"]
    assert after.location == OTHER_LOCATION

    conn.api_request.assert_called_once_with(
        method="POST",
        path="/%s" % PATH,
        data={
            "datasetReference": {
                "projectId": PROJECT,
                "datasetId": DS_ID
            },
            "labels": {},
            "location": OTHER_LOCATION,
        },
        timeout=DEFAULT_TIMEOUT,
    )
Example No. 10
def _item_to_dataset(iterator, resource):
    """Convert a JSON dataset to the native object.

    :type iterator: :class:`~google.cloud.iterator.Iterator`
    :param iterator: The iterator that is currently in use.

    :type resource: dict
    :param resource: An item to be converted to a dataset.

    :rtype: :class:`.Dataset`
    :returns: The next dataset in the page.
    """
    return Dataset.from_api_repr(resource, iterator.client)
    def to_dataset(project, model):
        access_entries = model.access_entries
        if access_entries:
            access_entries = tuple(
                BigQueryAccessEntry.to_access_entry(a) for a in access_entries)
        else:
            access_entries = ()
        dataset_ref = DatasetReference(project, model.dataset_id)
        dataset = Dataset(dataset_ref)
        dataset.friendly_name = model.friendly_name
        dataset.description = model.description
        dataset.default_table_expiration_ms = model.default_table_expiration_ms
        dataset.location = model.location
        dataset.access_entries = access_entries
        dataset.labels = model.labels if model.labels is not None else dict()
        return dataset
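to_dataset only reads a handful of attributes from model, so it can be exercised with a simple stand-in object; the SimpleNamespace below is purely illustrative and not the real model class.

from types import SimpleNamespace

# Only the attributes read by ``to_dataset`` are provided.
model = SimpleNamespace(
    dataset_id='analytics',
    friendly_name='Analytics',
    description='Curated analytics tables',
    default_table_expiration_ms=7 * 24 * 60 * 60 * 1000,  # one week
    location='EU',
    access_entries=None,
    labels={'team': 'data'},
)

# dataset = to_dataset('my-project', model)
# client.create_dataset(dataset)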
    def dataset(self, dataset_name, project=None):
        """Construct a dataset bound to this client.

        :type dataset_name: str
        :param dataset_name: Name of the dataset.

        :type project: str
        :param project: (Optional) project ID for the dataset (defaults to
                        the project of the client).

        :rtype: :class:`google.cloud.bigquery.dataset.Dataset`
        :returns: a new ``Dataset`` instance
        """
        return Dataset(dataset_name, client=self, project=project)
Example No. 14
    def create_dataset(self,
                       dataset  # type: DatasetReference, Dataset
                       ):
        # type: (...) -> None
        """
        Creates a dataset.

        Args:
            dataset: The Dataset or DatasetReference to create.
        """
        if isinstance(dataset, DatasetReference):
            dataset = Dataset(dataset)

        self.gclient.create_dataset(dataset)
Example No. 15
def test_create_dataset_w_custom_property(client, PROJECT, DS_ID):
    # The library should handle sending properties to the API that are not
    # yet part of the library

    path = "/projects/%s/datasets" % PROJECT
    resource = {
        "datasetReference": {
            "projectId": PROJECT,
            "datasetId": DS_ID
        },
        "newAlphaProperty": "unreleased property",
    }
    conn = client._connection = make_connection(resource)

    ds_ref = DatasetReference(PROJECT, DS_ID)
    before = Dataset(ds_ref)
    before._properties["newAlphaProperty"] = "unreleased property"
    after = client.create_dataset(before)

    assert after.dataset_id == DS_ID
    assert after.project == PROJECT
    assert after._properties["newAlphaProperty"] == "unreleased property"

    conn.api_request.assert_called_once_with(
        method="POST",
        path=path,
        data={
            "datasetReference": {
                "projectId": PROJECT,
                "datasetId": DS_ID
            },
            "newAlphaProperty": "unreleased property",
            "labels": {},
        },
        timeout=DEFAULT_TIMEOUT,
    )
Example No. 16
    def list_datasets(self,
                      include_all=False,
                      max_results=None,
                      page_token=None):
        """List datasets for the project associated with this client.

        See:
        https://cloud.google.com/bigquery/docs/reference/v2/datasets/list

        :type include_all: boolean
        :param include_all: True if results include hidden datasets.

        :type max_results: int
        :param max_results: maximum number of datasets to return. If not
                            passed, defaults to a value set by the API.

        :type page_token: str
        :param page_token: opaque marker for the next "page" of datasets. If
                           not passed, the API will return the first page of
                           datasets.

        :rtype: tuple, (list, str)
        :returns: list of :class:`~google.cloud.bigquery.dataset.Dataset`,
                  plus a "next page token" string:  if the token is not None,
                  indicates that more datasets can be retrieved with another
                  call (pass that value as ``page_token``).
        """
        params = {}

        if include_all:
            params['all'] = True

        if max_results is not None:
            params['maxResults'] = max_results

        if page_token is not None:
            params['pageToken'] = page_token

        path = '/projects/%s/datasets' % (self.project, )
        resp = self.connection.api_request(method='GET',
                                           path=path,
                                           query_params=params)
        datasets = [
            Dataset.from_api_repr(resource, self)
            for resource in resp.get('datasets', ())
        ]
        return datasets, resp.get('nextPageToken')
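Because this older client API returns an explicit (datasets, next_page_token) pair rather than an iterator, callers loop on the token themselves; a hedged sketch of that pattern, assuming a client constructed elsewhere.

def list_all_datasets(client, include_all=False):
    """Collect every dataset by following next-page tokens."""
    all_datasets = []
    page_token = None
    while True:
        datasets, page_token = client.list_datasets(
            include_all=include_all, page_token=page_token)
        all_datasets.extend(datasets)
        if page_token is None:
            break
    return all_datasets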
Example No. 17
    def list_datasets(self, include_all=False, max_results=None,
                      page_token=None):
        """List datasets for the project associated with this client.

        See:
        https://cloud.google.com/bigquery/docs/reference/v2/datasets/list

        :type include_all: boolean
        :param include_all: True if results include hidden datasets.

        :type max_results: int
        :param max_results: maximum number of datasets to return. If not
                            passed, defaults to a value set by the API.

        :type page_token: str
        :param page_token: opaque marker for the next "page" of datasets. If
                           not passed, the API will return the first page of
                           datasets.

        :rtype: tuple, (list, str)
        :returns: list of :class:`~google.cloud.bigquery.dataset.Dataset`,
                  plus a "next page token" string:  if the token is not None,
                  indicates that more datasets can be retrieved with another
                  call (pass that value as ``page_token``).
        """
        params = {}

        if include_all:
            params['all'] = True

        if max_results is not None:
            params['maxResults'] = max_results

        if page_token is not None:
            params['pageToken'] = page_token

        path = '/projects/%s/datasets' % (self.project,)
        resp = self.connection.api_request(method='GET', path=path,
                                           query_params=params)
        datasets = [Dataset.from_api_repr(resource, self)
                    for resource in resp.get('datasets', ())]
        return datasets, resp.get('nextPageToken')
Example No. 18
    def dataset(self, name):
        from google.cloud.bigquery.dataset import Dataset
        return Dataset(name, client=self)
Example No. 19
def create_temporary_data_source(source_uri):
    """Create a temporary data source so BigQuery can query the CSV in
    Google Cloud Storage.

    Nothing like this is currently implemented in the
    google-cloud-python library.

    Returns a table reference suitable for using in a BigQuery SQL
    query (legacy format).

    """
    schema = [
        {
            "name": "Regional_Office_Name",
            "type": "string"
        },
        {
            "name": "Regional_Office_Code",
            "type": "string"
        },
        {
            "name": "Area_Team_Name",
            "type": "string"
        },
        {
            "name": "Area_Team_Code",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "PCO_Name",
            "type": "string"
        },
        {
            "name": "PCO_Code",
            "type": "string"
        },
        {
            "name": "Practice_Name",
            "type": "string"
        },
        {
            "name": "Practice_Code",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "BNF_Code",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "BNF_Description",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "Items",
            "type": "integer",
            "mode": "required"
        },
        {
            "name": "Quantity",
            "type": "integer",
            "mode": "required"
        },
        {
            "name": "ADQ_Usage",
            "type": "float"
        },
        {
            "name": "NIC",
            "type": "float",
            "mode": "required"
        },
        {
            "name": "Actual_Cost",
            "type": "float",
            "mode": "required"
        },
    ]
    resource = {
        "tableReference": {
            "tableId": TEMP_SOURCE_NAME
        },
        "externalDataConfiguration": {
            "csvOptions": {
                "skipLeadingRows": "1"
            },
            "sourceFormat": "CSV",
            "sourceUris": [source_uri],
            "schema": {
                "fields": schema
            }
        }
    }
    client = bigquery.client.Client(project='ebmdatalab')
    # delete the table if it exists
    dataset = Dataset("tmp_eu", client)
    table = Table.from_api_repr(resource, dataset)
    try:
        table.delete()
    except NotFound:
        pass
    # Now create it
    path = "/projects/ebmdatalab/datasets/%s/tables" % TEMP_DATASET
    client._connection.api_request(method='POST', path=path, data=resource)
    return "[ebmdatalab:%s.%s]" % (TEMP_DATASET, TEMP_SOURCE_NAME)
Example No. 20
def persist_lines_stream(  # noqa: 211
    client: Client,
    project_id,
    dataset: Dataset,
    lines: TextIO,
    truncate: bool,
    forced_fulltables: list,
    validate_records: bool = True,
    table_suffix: Optional[str] = None,
    table_prefix: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """Stream data into BigQuery.

    Arguments:
        client {Client} -- BigQuery client
        project_id {str} -- Project id
        dataset {Dataset} -- BigQuery dataset
        lines {TextIO} -- Tap stream

    Keyword Arguments:
        truncate {bool} -- Whether to truncate the table
        forced_fulltables {list} -- List of tables to truncate
        validate_records {bool} -- Whether to validate records (default: {True})
        table_suffix {Optional[str]} -- Suffix for tables (default: {None})
        table_prefix {Optional[str]} -- Prefix for tables (default: {None})

    Raises:
        SchemaNotFoundException: If the schema message was not received yet
        InvalidSingerMessage: Invalid Singer message

    Yields:
        Iterator[Optional[str]] -- State
    """
    # Create variables in which we save data in the upcoming loop
    state: Optional[str] = None
    schemas: dict = {}
    key_properties: dict = {}
    tables: dict = {}
    rows: dict = {}
    errors: dict = {}
    table_suffix = table_suffix or ''
    table_prefix = table_prefix or ''

    # For every Singer input message
    for line in lines:
        # Parse the message
        try:
            msg: Union[SchemaMessage, StateMessage,
                       RecordMessage] = (parse_message(line))
        except json.decoder.JSONDecodeError:
            LOGGER.error(f'Unable to parse Singer Message:\n{line}')
            raise

        # There can be several kinds of messages. When inserting data, the
        # schema message comes first
        if isinstance(msg, SchemaMessage):
            # Schema message, create the table
            table_name: str = table_prefix + msg.stream + table_suffix

            # Save the schema and key_properties to use for the record
            # messages that follow
            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties

            tables[table_name] = bigquery.Table(
                dataset.table(table_name),
                schema=build_schema(schemas[table_name]),
            )

            rows[table_name] = 0
            errors[table_name] = None

            dataset_id: str = dataset.dataset_id
            if not table_exists(client, project_id, dataset_id, table_name):
                # Create the table
                client.create_table(tables[table_name])
            elif truncate or table_name in forced_fulltables:
                LOGGER.info(f'Load {table_name} by FULL_TABLE')

                # When truncating is enabled and the table exists, the table
                # has to be recreated. Because of this, we have to wait;
                # otherwise data can be lost, see:
                # https://stackoverflow.com/questions/36846571/
                # bigquery-table-truncation-before-streaming-not-working
                LOGGER.info(f'Deleting table {table_name} because it exists')
                client.delete_table(tables[table_name])
                LOGGER.info(f'Recreating table {table_name}')
                client.create_table(tables[table_name])
                LOGGER.info(
                    'Sleeping for 5 minutes before streaming data, '
                    f'to avoid streaming data loss in {table_name}', )
                time.sleep(FIVE_MINUTES)

        elif isinstance(msg, RecordMessage):
            # Record message
            table_name = table_prefix + msg.stream + table_suffix

            if table_name not in schemas:
                raise SchemaNotFoundException(
                    f'A record for stream {table_name} was encountered before '
                    'a corresponding schema', )

            # Retrieve schema
            schema: dict = schemas[table_name]

            # Retrieve table
            table_ref: TableReference = tables[table_name]

            # Validate the record
            if validate_records:
                # Raises ValidationError if the record has invalid schema
                validate(msg.record, schema)

            # Filter the record
            record_input: Optional[Union[dict, str, list]] = filter_schema(
                schema,
                msg.record,
            )

            # Somewhere in the process, the input record can have decimal
            # values, e.g. "value": Decimal('10.25'). These are not JSON
            # serializable. Therefore, we dump the JSON here, which converts
            # them to strings. Thereafter, we load the dumped JSON so we get a
            # dictionary again, which we can insert into BigQuery
            record_json: str = json.dumps(record_input, cls=DecimalEncoder)
            record: dict = json.loads(record_json)

            # Save the error
            err: Optional[list] = None

            try:
                # Insert record
                err = client.insert_rows(table_ref, [record])
            except Exception as exc:
                LOGGER.error(
                    f'Failed to insert rows for {table_name}: {exc}\n'
                    f'{record}\n{err}', )
                raise

            # Save the errors of the stream and increment its row count,
            # keyed by table name to match the dicts initialised above
            errors[table_name] = err
            rows[table_name] += 1

            state = None

        elif isinstance(msg, StateMessage):
            # State messages
            LOGGER.debug(f'Setting state to {msg.value}')
            state = msg.value

        else:
            raise InvalidSingerMessage(f'Unrecognized Singer Message:\n {msg}')

    for table in errors.keys():
        if errors[table]:
            logging.error(f'Errors: {errors[table]}')
        else:
            logging.info(
                'Loaded {rows} row(s) from {source} into {tab}:{path}'.format(
                    rows=rows[table],
                    source=dataset.dataset_id,
                    tab=table,
                    path=tables[table].path,
                ), )
            yield state
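A hedged sketch of how this generator might be driven from a file of Singer messages; the wiring below leans on ensure_dataset from an earlier example, and echoing the yielded state is what a Singer target normally does so the orchestrator can store bookmarks.

import json

# Illustrative wiring only; ensure_dataset and persist_lines_stream are the
# functions shown elsewhere in this listing.
client, dataset = ensure_dataset('my-project', 'singer_raw', 'EU')

with open('tap_output.jsonl') as lines:
    for state in persist_lines_stream(
        client,
        'my-project',
        dataset,
        lines,
        truncate=False,
        forced_fulltables=[],
        validate_records=True,
    ):
        if state:
            # Emit state so the orchestrator can persist bookmark information.
            print(json.dumps(state))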
Example No. 21
    def get_dataset_metadata(self, dataset: str):
        ## adapt to query information schema
        ds_ref = self.client.dataset(dataset)
        ds = Dataset(ds_ref)
        return ds.to_api_repr()
Example No. 22
def test_create_dataset_w_attrs(client, PROJECT, DS_ID):
    from google.cloud.bigquery.dataset import AccessEntry

    PATH = "projects/%s/datasets" % PROJECT
    DESCRIPTION = "DESC"
    FRIENDLY_NAME = "FN"
    LOCATION = "US"
    USER_EMAIL = "*****@*****.**"
    LABELS = {"color": "red"}
    VIEW = {
        "projectId": "my-proj",
        "datasetId": "starry-skies",
        "tableId": "northern-hemisphere",
    }
    RESOURCE = {
        "datasetReference": {
            "projectId": PROJECT,
            "datasetId": DS_ID
        },
        "etag": "etag",
        "id": "%s:%s" % (PROJECT, DS_ID),
        "description": DESCRIPTION,
        "friendlyName": FRIENDLY_NAME,
        "location": LOCATION,
        "defaultTableExpirationMs": "3600",
        "labels": LABELS,
        "access": [{
            "role": "OWNER",
            "userByEmail": USER_EMAIL
        }, {
            "view": VIEW
        }],
    }
    conn = client._connection = make_connection(RESOURCE)
    entries = [
        AccessEntry("OWNER", "userByEmail", USER_EMAIL),
        AccessEntry(None, "view", VIEW),
    ]

    ds_ref = DatasetReference(PROJECT, DS_ID)
    before = Dataset(ds_ref)
    before.access_entries = entries
    before.description = DESCRIPTION
    before.friendly_name = FRIENDLY_NAME
    before.default_table_expiration_ms = 3600
    before.location = LOCATION
    before.labels = LABELS
    after = client.create_dataset(before)

    assert after.dataset_id == DS_ID
    assert after.project == PROJECT
    assert after.etag == RESOURCE["etag"]
    assert after.full_dataset_id == RESOURCE["id"]
    assert after.description == DESCRIPTION
    assert after.friendly_name == FRIENDLY_NAME
    assert after.location == LOCATION
    assert after.default_table_expiration_ms == 3600
    assert after.labels == LABELS

    conn.api_request.assert_called_once_with(
        method="POST",
        path="/%s" % PATH,
        data={
            "datasetReference": {
                "projectId": PROJECT,
                "datasetId": DS_ID
            },
            "description":
            DESCRIPTION,
            "friendlyName":
            FRIENDLY_NAME,
            "location":
            LOCATION,
            "defaultTableExpirationMs":
            "3600",
            "access": [{
                "role": "OWNER",
                "userByEmail": USER_EMAIL
            }, {
                "view": VIEW
            }],
            "labels":
            LABELS,
        },
        timeout=DEFAULT_TIMEOUT,
    )
Example No. 23
def persist_lines_job(  # noqa: WPS210, WPS211, WPS213, WPS231, WPS238
    client: Client,
    dataset: Dataset,
    lines: TextIO,
    truncate: bool,
    forced_fulltables: list,
    validate_records: bool = True,
    table_suffix: Optional[str] = None,
    table_prefix: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """Perform a load job into BigQuery.

    Arguments:
        client {Client} -- BigQuery client
        dataset {Dataset} -- BigQuery dataset
        lines {TextIO} -- Tap stream

    Keyword Arguments:
        truncate {bool} -- Whether to truncate the table
        forced_fulltables {list} -- List of tables to truncate
        validate_records {bool} -- Whether to validate records (default: {True})
        table_suffix {Optional[str]} -- Suffix for tables (default: {None})
        table_prefix {Optional[str]} -- Prefix for tables (default: {None})

    Raises:
        SchemaNotFoundException: If the schema message was not received yet
        InvalidSingerMessage: Invalid Singer message

    Yields:
        Iterator[Optional[str]] -- State
    """
    # Create variables in which we save data in the upcoming loop
    state: Optional[str] = None
    schemas: dict = {}
    key_properties: dict = {}
    rows: dict = {}
    errors: dict = {}
    table_suffix = table_suffix or ''
    table_prefix = table_prefix or ''

    # For every Singer input message
    for line in lines:
        # Parse the message
        try:
            msg: Union[SchemaMessage, StateMessage, RecordMessage] = (
                parse_message(line)
            )
        except json.decoder.JSONDecodeError:
            LOGGER.error(f'Unable to parse Singer Message:\n{line}')
            raise

        # There can be several kinds of messages. When inserting data, the
        # schema message comes first
        if isinstance(msg, SchemaMessage):
            # Schema message, save schema
            table_name: str = table_prefix + msg.stream + table_suffix

            # Skip schema if already created
            if table_name in rows:
                continue

            # Save schema and setup a temp file for data storage
            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties
            rows[table_name] = TemporaryFile(mode='w+b')
            errors[table_name] = None

        elif isinstance(msg, RecordMessage):
            # Record message
            table_name = table_prefix + msg.stream + table_suffix

            if table_name not in schemas:
                raise SchemaNotFoundException(
                    f'A record for stream {table_name} was encountered before '
                    'a corresponding schema',
                )

            # Retrieve schema
            schema: dict = schemas[table_name]

            # Validate the record
            if validate_records:
                # Raises ValidationError if the record has invalid schema
                validate(msg.record, schema)

            record_input: Optional[Union[dict, str, list]] = filter_schema(
                schema,
                msg.record,
            )

            # Somewhere in the process, the input record can have decimal
            # values, e.g. "value": Decimal('10.25'). These are not JSON
            # serializable. Therefore, we dump the JSON here, which converts
            # them to strings, and buffer the serialised record for the load
            # job below.
            record_str: str = '{rec}\n'.format(
                rec=json.dumps(record_input, cls=DecimalEncoder),
            )

            record: bytes = bytes(record_str, 'UTF-8')

            # Save data to load later
            rows[table_name].write(record)

            state = None

        elif isinstance(msg, StateMessage):
            # State messages
            LOGGER.debug(f'Setting state to {msg.value}')
            state = msg.value

        else:
            raise InvalidSingerMessage(
                f'Unrecognized Singer Message:\n {msg}',
            )

    # After all records are received, set up a load job per stream
    for table in rows.keys():
        # Prepare load job
        key_props: str = key_properties[table]
        load_config: LoadJobConfig = LoadJobConfig()
        load_config.schema = build_schema(
            schemas[table],
            key_properties=key_props,
        )
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        # Overwrite the table if truncate is enabled
        if truncate or table in forced_fulltables:
            LOGGER.info(f'Load {table} by FULL_TABLE')
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        LOGGER.info(f'loading {table} to Bigquery.')

        # Setup load job
        load_job: LoadJob = client.load_table_from_file(
            rows[table],
            dataset.table(table),
            job_config=load_config,
            rewind=True,
        )

        LOGGER.info(f'loading job {load_job.job_id}')

        # Run load job
        try:
            load_job.result()
        except google_exceptions.GoogleAPICallError as err:
            # Parse errors
            LOGGER.error(f'failed to load table {table} from file: {err}')

            if load_job.errors:
                messages: list = [
                    f"reason: {err['reason']}, message: {err['message']}"
                    for err in load_job.errors
                ]
                messages_str: str = '\n'.join(messages)
                LOGGER.error(f'errors:\n{messages_str}')
            raise
        LOGGER.info(
            f'Loaded {load_job.output_rows} row(s) in '
            f'{load_job.destination}',
        )

    yield state
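The batch variant is driven the same way but yields only once, after every buffered stream has been loaded; a hedged sketch reusing the same assumed wiring as the streaming example above.

with open('tap_output.jsonl') as lines:
    # persist_lines_job buffers records per stream and runs one load job per
    # table, so the single yielded value is the final state (or None).
    final_state = next(
        persist_lines_job(
            client,
            dataset,
            lines,
            truncate=False,
            forced_fulltables=[],
        ),
        None,
    )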
Example No. 24
def get_dataset_object(dataset_id, client):
    return Dataset(dataset_id, client)