Example #1
    def from_api_repr(cls, resource, client):
        """Factory:  construct a job given its API representation

        .. note::

           This method assumes that the project found in the resource matches
           the client's project.

        :type resource: dict
        :param resource: dataset job representation returned from the API

        :type client: :class:`google.cloud.bigquery.client.Client`
        :param client: Client which holds credentials and project
                       configuration for the dataset.

        :rtype: :class:`google.cloud.bigquery.job.LoadTableFromStorageJob`
        :returns: Job parsed from ``resource``.
        """
        name, config = cls._get_resource_config(resource)
        dest_config = config['destinationTable']
        dataset = Dataset(dest_config['datasetId'], client)
        destination = Table(dest_config['tableId'], dataset)
        source_urls = config.get('sourceUris', ())
        job = cls(name, destination, source_urls, client=client)
        job._set_properties(resource)
        return job
Example #2
    def download_table_as_file(self, table_id, dest, staging_location,
                               file_type):
        """
        Download a bigquery table as file
        Args:
            table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            staging_location (str): url to staging_location (currently
                support a folder in GCS)
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format
        Returns: (str) path to the downloaded file

        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = 'temp_{}'.format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(table_id)
        job = self.bq.extract_table(src_table,
                                    staging_file_path,
                                    job_config=job_config)

        # await completion
        job.result()

        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.gcs.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
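A short usage sketch for the method above; the `downloader` object is passed in rather than constructed, since only the method itself is shown here, and the table id, destination and bucket names are illustrative.

from feast.sdk.resources.feature_set import FileType  # import path taken from the docstring above

def export_table_to_csv(downloader, table_id, dest, staging_location):
    """Illustrative wrapper; `downloader` is any object exposing the
    download_table_as_file() method defined above."""
    return downloader.download_table_as_file(
        table_id,            # e.g. "my-project.my_dataset.my_table"
        dest,                # e.g. "/tmp/my_table.csv"
        staging_location,    # must be a folder in GCS, e.g. "gs://bucket/exports"
        file_type=FileType.CSV,
    )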
Example #3
    def download_table_as_df(self, full_table_id, staging_location=None):
        """
        Download a BigQuery table as Pandas Dataframe
        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location: url to staging_location (currently
                support a folder in GCS)

        Returns: pandas.DataFrame: dataframe of the training dataset

        """
        if not staging_location:
            table = bigquery.TableReference.from_string(full_table_id)
            rows = self.bqclient.list_rows(table)
            return rows.to_dataframe(bqstorage_client=self.bqstorageclient)

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = DestinationFormat.CSV
        job = self.bqclient.extract_table(Table.from_string(full_table_id),
                                          staging_file_path,
                                          job_config=job_config)

        # await completion
        job.result()
        return gcs_to_df(staging_file_path)
Example #4
    def test_download_table_as_df(self, mocker):
        self._stop_time(mocker)
        mocked_gcs_to_df = mocker.patch(
            "feast.sdk.utils.bq_util.gcs_to_df",
            return_value=None)

        staging_path = "gs://temp/"
        staging_file_name = "temp_0"
        table_id = "project_id.dataset_id.table_id"

        table_dldr = TableDownloader()
        exp_staging_path = os.path.join(staging_path, staging_file_name)

        table_dldr._bq = _Mock_BQ_Client()
        mocker.patch.object(table_dldr._bq, "extract_table",
                            return_value=_Job())

        table_dldr.download_table_as_df(table_id,
                                        staging_location=staging_path)

        assert len(
            table_dldr._bq.extract_table.call_args_list) == 1
        args, kwargs = \
            table_dldr._bq.extract_table.call_args_list[0]
        assert args[0].full_table_id == Table.from_string(
            table_id).full_table_id
        assert args[1] == exp_staging_path
        assert kwargs['job_config'].destination_format == "CSV"
        mocked_gcs_to_df.assert_called_once_with(exp_staging_path)
Example #5
    def _test_download_file(self, mocker, type):
        staging_path = "gs://temp/"
        staging_file_name = "temp_0"
        dst_path = "/tmp/myfile.csv"
        table_id = "project_id.dataset_id.table_id"

        table_dldr = TableDownloader()
        mock_blob = _Blob()
        mocker.patch.object(mock_blob, "download_to_filename")
        table_dldr._bq = _Mock_BQ_Client()
        mocker.patch.object(table_dldr._bq, "extract_table",
                            return_value=_Job())
        table_dldr._gcs = _Mock_GCS_Client()
        mocker.patch.object(table_dldr._gcs, "get_bucket",
                            return_value=_Bucket(mock_blob))

        table_dldr.download_table_as_file(table_id,
                                          dst_path,
                                          staging_location=staging_path,
                                          file_type=type)

        exp_staging_path = os.path.join(staging_path, staging_file_name)
        assert len(
            table_dldr._bq.extract_table.call_args_list) == 1
        args, kwargs = \
            table_dldr._bq.extract_table.call_args_list[0]
        assert args[0].full_table_id == Table.from_string(table_id).full_table_id
        assert args[1] == exp_staging_path
        assert kwargs['job_config'].destination_format == str(type)

        mock_blob.download_to_filename.assert_called_once_with(dst_path)
Example #6
def main(
    project: Optional[str],
    dataset: Optional[str],
    module_path: str,
    apply: bool,
    validate: bool,
) -> None:
    client = create_connection()
    for local_table in set(find_tables(module_path)):
        project = project or local_table.project
        assert project, "Project has not been set."
        dataset = dataset or local_table.dataset
        assert dataset, "Dataset has not been set."

        table_identifier = f"{project}.{dataset}.{local_table.full_table_name()}"
        print(f"Checking migrations for: {table_identifier}")

        try:
            remote_table = client.get_table(table_identifier)
        except NotFound as not_found:
            table_exists_msg = f"Table does not exist in bq: {table_identifier}"
            if validate:
                raise Exception(table_exists_msg) from not_found

            print(table_exists_msg)
            if apply:
                print("Creating table.")
                table = Table(
                    table_identifier,
                    schema=local_table.get_schema_fields(),
                )
                if local_table.time_partitioning:
                    table.time_partitioning = local_table.time_partitioning
                print(client.create_table(table))
        else:
            new_columns = list(
                find_new_columns(local_table.get_schema_fields(),
                                 remote_table.schema))
            if new_columns:
                new_columns_message = f"Found new columns: {new_columns}"
                if validate:
                    raise Exception(new_columns_message)
                print(new_columns_message)
                if apply:
                    print("Applying changes")
                    remote_table.schema = local_table.get_schema_fields()
                    print(client.update_table(remote_table, ["schema"]))
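The migration flow above relies on a `find_new_columns` helper that is not shown. A minimal sketch of what such a comparison might look like, assuming columns are matched purely by field name (the real helper may also handle type or mode changes):

from typing import Iterator, Sequence
from google.cloud.bigquery import SchemaField

def find_new_columns(
    local_schema: Sequence[SchemaField],
    remote_schema: Sequence[SchemaField],
) -> Iterator[SchemaField]:
    """Yield fields defined locally that do not yet exist in the remote table.

    Illustrative only: comparison is by field name, nothing else.
    """
    remote_names = {field.name for field in remote_schema}
    for field in local_schema:
        if field.name not in remote_names:
            yield field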
Example #7
    def populate_table(self, table_path, schema, data=[], make_immediately_available=False,
                       replace_existing_table=False):
        # type: (str, List[SchemaField], Optional[List[Any]], Optional[bool], Optional[bool]) -> None
        """Creates a table and populates it with a list of rows.

        If make_immediately_available is False, the table will be created using streaming inserts.
        Note that streaming inserts are immediately available for querying, but not for exporting or
        copying, so if you need that capability you should set make_immediately_available to True.
        https://cloud.google.com/bigquery/streaming-data-into-bigquery

        If the table is already created, it will raise a RuntimeError, unless replace_existing_table
        is True.

        Args:
          table_path: A string of the form '<dataset id>.<table name>'
              or '<project id>.<dataset id>.<table name>'.
          schema: A list of SchemaFields to represent the table's schema.
          data: A list of rows, each of which corresponds to a row to insert into the table.
          make_immediately_available: If False, the table won't immediately be available for
              copying or exporting, but will be available for querying. If True, after this
              operation returns, it will be available for copying and exporting too.
          replace_existing_table: If set to True, the table at table_path will be deleted and
              recreated if it's already present.

        Raises:
            RuntimeError if the table at table_path is already there and replace_existing_table
                is False
        """
        # Use the Table object so we can pass through the schema.
        table = Table(self.get_table_reference_from_path(table_path), schema)
        if self.table_exists(table):
            if replace_existing_table:
                self.delete_table(table)
            else:
                raise RuntimeError('The table {} already exists.'.format(table_path))
        self.create_table(table)

        if data:
            if make_immediately_available:
                output = cStringIO.StringIO()

                csv_out = csv.writer(output)
                for row in data:
                    csv_out.writerow(row)

                job_config = LoadJobConfig()
                job_config.source_format = 'CSV'  # BigQuery expects the 'CSV' source format, not a MIME type
                # By default this does six retries. It does not accept any other timeout or
                # retry parameters.
                job = self.gclient.load_table_from_file(output, table.reference,
                                                        job_config=job_config,
                                                        rewind=True)
                job.result()

                output.close()
            else:
                self._stream_chunks_of_rows(table, data, schema)
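A hedged usage sketch for `populate_table`; the client object is passed in rather than constructed, since only the method above is shown here, and the dataset, table and rows are made up.

from google.cloud.bigquery import SchemaField

def load_example_scores(bq_client):
    # `bq_client` is any object exposing populate_table() as defined above.
    schema = [
        SchemaField('name', 'STRING'),
        SchemaField('score', 'INTEGER'),
    ]
    rows = [('alice', 10), ('bob', 7)]
    bq_client.populate_table(
        'my_dataset.scores',               # '<dataset id>.<table name>' form
        schema,
        data=rows,
        make_immediately_available=True,   # load job: table can be copied/exported right away
        replace_existing_table=True,       # drop and recreate if it already exists
    )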
Example #8
    def query_to_table(self, query, job_name, dataset_name=None):
        # external logging if required
        if self.log_lambda is not None:
            self.log_lambda(query)
        
        # Do nothing if use_cache
        if BigQueryExporter._use_cache:
            return
        
        #logging
        logging.info('[BigQueryExporter] ['+job_name+'] ::query_to_table start')
        startTime = datetime.now()
        
        # initialize variables
        if dataset_name is None:
            dataset_name = self.dataset_name
        logging.info('[BigQueryExporter] ['+job_name+'] ::dataset is set to %s' % dataset_name )
        bigquery_client = self.bigquery_client
        
        # Point to the dataset and table
        destination_dataset = self.bigquery_client.dataset(dataset_name)
        destination_table = destination_dataset.table(job_name)

        # Create an empty table
        try:
            logging.info('[BigQueryExporter] [' + job_name + '] ::bigquery_client.get_table(%s) ...' % destination_table)
            self.bigquery_client.get_table(destination_table)
            logging.info('[BigQueryExporter] [' + job_name + '] ::bigquery_client.delete_table(%s) ...' % destination_table)
            self.bigquery_client.delete_table(destination_table)
        except Exception:
            # table does not exist yet, so there is nothing to delete
            logging.info('[BigQueryExporter] [' + job_name + '] ::exception point 01, dataset %s ...' % dataset_name)
        logging.info('[BigQueryExporter] [' + job_name + '] ::bigquery_client.create_table( Table( %s ) ) ...' % destination_table)
        self.bigquery_client.create_table(Table(destination_table))
        # destination_table.create()
        
        # Execute the job and save to table
        # unique_id = str(uuid.uuid4())
        # job = bigquery_client.run_async_query(unique_id, query)
        job_config = bigquery.QueryJobConfig()
        job_config.allow_large_results = True
        job_config.use_legacy_sql = False
        job_config.destination = destination_table
        
        logging.info('[BigQueryExporter] [' + job_name + '] ::bigquery_client.query() starts ...')
        logging.info('[BigQueryExporter] ['+job_name+'] ::job_config: %s' % str(job_config) )
        job = self.bigquery_client.query(query, job_config=job_config)
        
        # Wait till the job done
        while not job.done():
            time.sleep(1)
        
        # logging
        timeElapsed = datetime.now() - startTime
        logging.info('[BigQueryExporter] [' + job_name + '] ::query_to_table completed, elapsed {}s'.format(timeElapsed.seconds))
        
        return destination_table
Example #9
    def _use_query_results(self, response_json):
        # NB: be sure to remove the jobReference from the api response used to
        #     create the Table instance.
        response_json_copy = response_json.copy()
        del response_json_copy['jobReference']
        mock_dataset = mock.Mock()
        mock_dataset._client = self._client
        mock_table = Table('mock_table', mock_dataset)
        self._client._connection.api_request.return_value = response_json_copy
        self._client.dataset.return_value = mock_dataset
        mock_dataset.table.return_value = mock_table
Example #10
    def test__row_from_mapping_wo_schema(self):
        from google.cloud.bigquery.table import Table, _TABLE_HAS_NO_SCHEMA
        MAPPING = {'full_name': 'Phred Phlyntstone', 'age': 32}
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        table = Table(table_ref)

        with self.assertRaises(ValueError) as exc:
            self._call_fut(MAPPING, table.schema)

        self.assertEqual(exc.exception.args, (_TABLE_HAS_NO_SCHEMA,))
Example #11
    def create_table(self, is_temporary=False):
        stream_schema_message = self.stream_schema_message

        client = self.open_connection()
        project_id = self.connection_config['project_id']
        dataset_id = self.schema_name
        table_name = self.table_name(stream_schema_message['stream'],
                                     is_temporary,
                                     without_schema=True)

        schema = [
            column_type(name, schema)
            for (name, schema) in self.flatten_schema.items()
        ]

        table = Table('{}.{}.{}'.format(project_id, dataset_id, table_name),
                      schema)
        if is_temporary:
            table.expires = datetime.datetime.now() + datetime.timedelta(
                days=1)

        client.create_table(table, schema)
Example #12
def _item_to_table(iterator, resource):
    """Convert a JSON table to the native object.

    :type iterator: :class:`~google.cloud.iterator.Iterator`
    :param iterator: The iterator that is currently in use.

    :type resource: dict
    :param resource: An item to be converted to a table.

    :rtype: :class:`~google.cloud.bigquery.table.Table`
    :returns: The next table in the page.
    """
    return Table.from_api_repr(resource, iterator.dataset)
Example #13
    def table(self, name, schema=()):
        """Construct a table bound to this dataset.

        :type name: str
        :param name: Name of the table.

        :type schema: list of :class:`google.cloud.bigquery.table.SchemaField`
        :param schema: The table's schema

        :rtype: :class:`google.cloud.bigquery.table.Table`
        :returns: a new ``Table`` instance
        """
        return Table(name, dataset=self, schema=schema)
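This is the legacy (pre-0.28) google-cloud-bigquery API, where datasets and tables are bound to a client. A minimal usage sketch under that assumption, with illustrative project, dataset and field names:

from google.cloud import bigquery
from google.cloud.bigquery import SchemaField

client = bigquery.Client(project='my-project')   # illustrative project
dataset = client.dataset('my_dataset')            # legacy API: Dataset bound to the client
table = dataset.table('people', schema=[
    SchemaField('full_name', 'STRING', mode='REQUIRED'),
    SchemaField('age', 'INTEGER', mode='REQUIRED'),
])
table.create()                                    # legacy Table exposes create()/exists()/delete()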
Example #14
def _item_to_table(iterator, resource):
    """Convert a JSON table to the native object.

    :type iterator: :class:`~google.api.core.page_iterator.Iterator`
    :param iterator: The iterator that is currently in use.

    :type resource: dict
    :param resource: An item to be converted to a table.

    :rtype: :class:`~google.cloud.bigquery.table.Table`
    :returns: The next table in the page.
    """
    return Table.from_api_repr(resource, iterator.dataset)
Example #15
def apply_schema_differences(
    schema_diffs: _SchemaDiffs,
    bigquery_client: BigQueryClient,
) -> None:
    print("Applying changes...")
    for table_identifier, difference in schema_diffs.items():
        if isinstance(difference, MissingTable):
            print("Creating table...")
            table = Table(
                table_identifier,
                schema=difference.local_table.get_schema_fields(),
            )
            if difference.local_table.time_partitioning:
                table.time_partitioning = difference.local_table.time_partitioning
            remote_table = bigquery_client.create_table(table)
            print(remote_table)
        elif isinstance(difference, ExistingTable):
            difference.remote_table.schema = difference.local_table.get_schema_fields(
            )
            print(
                bigquery_client.update_table(difference.remote_table,
                                             ["schema"]))
Example #16
def to_table(dataset_ref, model):
    schema = model.schema
    if schema:
        schema = tuple(
            BigQuerySchemaField.to_schema_field(s) for s in schema)
    else:
        schema = None
    table_ref = TableReference(dataset_ref, model.table_id)
    table = Table(table_ref, schema)
    table.friendly_name = model.friendly_name
    table.description = model.description
    table.expires = model.expires
    table.partitioning_type = model.partitioning_type
    if model.view_use_legacy_sql is not None:
        table.view_use_legacy_sql = model.view_use_legacy_sql
    if model.view_query is not None:
        table.view_query = model.view_query
    table.labels = model.labels if model.labels is not None else dict()
    return table
Example #17
    def download_table_as_file(self,
                               full_table_id,
                               dest,
                               file_type,
                               staging_location=None):
        """
        Download a bigquery table as file
        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format
            staging_location (str, optional): url to staging_location (currently
                support a folder in GCS)
        Returns: (str) path to the downloaded file

        """
        if not staging_location:
            df = self.download_table_as_df(full_table_id)
            if file_type == FileType.CSV:
                df.to_csv(dest, index=False)
            elif file_type == FileType.JSON:
                df.to_json(dest, index=False)
            else:
                raise ValueError(
                    "Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
                )
            return dest

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(full_table_id)
        job = self.bqclient.extract_table(src_table,
                                          staging_file_path,
                                          job_config=job_config)

        # await completion
        job.result()

        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.storageclient.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
Example #18
    def from_api_repr(cls, resource, client):
        """Factory:  construct a job given its API representation

        .. note::

           This method assumes that the project found in the resource matches
           the client's project.

        :type resource: dict
        :param resource: dataset job representation returned from the API

        :type client: :class:`google.cloud.bigquery.client.Client`
        :param client: Client which holds credentials and project
                       configuration for the dataset.

        :rtype: :class:`google.cloud.bigquery.job.CopyJob`
        :returns: Job parsed from ``resource``.
        """
        name, config = cls._get_resource_config(resource)
        dest_config = config['destinationTable']
        dataset = Dataset(dest_config['datasetId'], client)
        destination = Table(dest_config['tableId'], dataset)
        sources = []
        source_configs = config.get('sourceTables')
        if source_configs is None:
            single = config.get('sourceTable')
            if single is None:
                raise KeyError(
                    "Resource missing 'sourceTables' / 'sourceTable'")
            source_configs = [single]
        for source_config in source_configs:
            dataset = Dataset(source_config['datasetId'], client)
            sources.append(Table(source_config['tableId'], dataset))
        job = cls(name, destination, sources, client=client)
        job._set_properties(resource)
        return job
Example #19
    def create_table(self,
                     table  # type: Table, TableReference
                     ):
        # type: (Table) -> None
        """
        Creates a table.

        Args:
            table: The Table or TableReference object to create. Note that if you pass a
                TableReference the table will be created with no schema.
        """
        if isinstance(table, TableReference):
            # Normally you'd pass in the schema here upon Table instantiation
            table = Table(table)

        self.gclient.create_table(table)
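A short sketch of how the two accepted argument types differ, assuming the standard TableReference/Table classes from google.cloud.bigquery; the wrapper instance is passed in because only its create_table() method is shown above, and all names are illustrative.

from google.cloud.bigquery import DatasetReference, SchemaField, Table, TableReference

def create_example_tables(wrapper):
    # `wrapper` is any object exposing create_table() as defined above.
    dataset_ref = DatasetReference('my-project', 'my_dataset')

    # A bare TableReference yields an empty, schemaless table.
    wrapper.create_table(TableReference(dataset_ref, 'raw_events'))

    # A Table lets you attach a schema (and other properties) before creation.
    typed = Table(TableReference(dataset_ref, 'typed_events'),
                  schema=[SchemaField('event_id', 'STRING', mode='REQUIRED')])
    wrapper.create_table(typed)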
Example #20
    def __extract_table_to_shard_folder(self, full_table_id,
                                        staging_location, file_type):
        shard_folder = os.path.join(staging_location,
                                    'temp_%d' % int(round(time.time() * 1000)))
        staging_file_path = os.path.join(shard_folder, "shard_*")

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        job = self.bqclient.extract_table(
            Table.from_string(full_table_id),
            staging_file_path,
            job_config=job_config
        )
        # await completion
        job.result()
        return shard_folder
Example #21
    def test__row_from_mapping_w_schema(self):
        from google.cloud.bigquery.table import Table, SchemaField
        MAPPING = {
            'full_name': 'Phred Phlyntstone',
            'age': 32,
            'colors': ['red', 'green'],
            'extra': 'IGNORED',
        }
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
        age = SchemaField('age', 'INTEGER', mode='REQUIRED')
        colors = SchemaField('colors', 'DATETIME', mode='REPEATED')
        joined = SchemaField('joined', 'STRING', mode='NULLABLE')
        table = Table(table_ref, schema=[full_name, age, colors, joined])

        self.assertEqual(self._call_fut(MAPPING, table.schema),
                         ('Phred Phlyntstone', 32, ['red', 'green'], None))
Example #22
    def test__row_from_mapping_w_invalid_schema(self):
        from google.cloud.bigquery.table import Table, SchemaField
        MAPPING = {
            'full_name': 'Phred Phlyntstone',
            'age': 32,
            'colors': ['red', 'green'],
            'bogus': 'WHATEVER',
        }
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
        age = SchemaField('age', 'INTEGER', mode='REQUIRED')
        colors = SchemaField('colors', 'DATETIME', mode='REPEATED')
        bogus = SchemaField('joined', 'STRING', mode='BOGUS')
        table = Table(table_ref, schema=[full_name, age, colors, bogus])

        with self.assertRaises(ValueError) as exc:
            self._call_fut(MAPPING, table.schema)

        self.assertIn('Unknown field mode: BOGUS', str(exc.exception))
Example #23
    def list_tables(self, max_results=None, page_token=None):
        """List tables for the project associated with this client.

        See:
        https://cloud.google.com/bigquery/docs/reference/v2/tables/list

        :type max_results: int
        :param max_results: maximum number of tables to return. If not
                            passed, defaults to a value set by the API.

        :type page_token: str
        :param page_token: opaque marker for the next "page" of tables. If
                           not passed, the API will return the first page of
                           tables.

        :rtype: tuple, (list, str)
        :returns: list of :class:`google.cloud.bigquery.table.Table`, plus a
                  "next page token" string:  if not ``None``, indicates that
                  more tables can be retrieved with another call (pass that
                  value as ``page_token``).
        """
        params = {}

        if max_results is not None:
            params['maxResults'] = max_results

        if page_token is not None:
            params['pageToken'] = page_token

        path = '/projects/%s/datasets/%s/tables' % (self.project, self.name)
        connection = self._client.connection
        resp = connection.api_request(method='GET',
                                      path=path,
                                      query_params=params)
        tables = [
            Table.from_api_repr(resource, self)
            for resource in resp.get('tables', ())
        ]
        return tables, resp.get('nextPageToken')
Example #24
    def test_ctor(self):
        from google.cloud.bigquery.table import Table

        client = _make_client(project=self.PROJECT)
        source = Table(self.TABLE_REF)
        job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI],
                             client)
        self.assertEqual(job.source.project, self.PROJECT)
        self.assertEqual(job.source.dataset_id, self.DS_ID)
        self.assertEqual(job.source.table_id, self.TABLE_ID)
        self.assertEqual(job.destination_uris, [self.DESTINATION_URI])
        self.assertIs(job._client, client)
        self.assertEqual(job.job_type, self.JOB_TYPE)
        self.assertEqual(job.path,
                         "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID))

        self._verifyInitialReadonlyProperties(job)

        # set/read from resource['configuration']['extract']
        self.assertIsNone(job.compression)
        self.assertIsNone(job.destination_format)
        self.assertIsNone(job.field_delimiter)
        self.assertIsNone(job.print_header)
Example #25
    def list_tables(self, max_results=None, page_token=None):
        """List tables for the project associated with this client.

        See:
        https://cloud.google.com/bigquery/docs/reference/v2/tables/list

        :type max_results: int
        :param max_results: maximum number of tables to return. If not
                            passed, defaults to a value set by the API.

        :type page_token: str
        :param page_token: opaque marker for the next "page" of tables. If
                           not passed, the API will return the first page of
                           tables.

        :rtype: tuple, (list, str)
        :returns: list of :class:`google.cloud.bigquery.table.Table`, plus a
                  "next page token" string:  if not ``None``, indicates that
                  more tables can be retrieved with another call (pass that
                  value as ``page_token``).
        """
        params = {}

        if max_results is not None:
            params['maxResults'] = max_results

        if page_token is not None:
            params['pageToken'] = page_token

        path = '/projects/%s/datasets/%s/tables' % (self.project, self.name)
        connection = self._client.connection
        resp = connection.api_request(method='GET', path=path,
                                      query_params=params)
        tables = [Table.from_api_repr(resource, self)
                  for resource in resp.get('tables', ())]
        return tables, resp.get('nextPageToken')
Example #26
from google.api_core.exceptions import NotFound
from google.cloud.bigquery import SchemaField
from google.cloud.bigquery.table import Table

from pontoz.bigquery.client import client

for pontoz_dataset in client.list_datasets():
    pass
_transactions_ref = pontoz_dataset.table('transactions')
try:
    transactions_table = client.get_table(_transactions_ref)
except NotFound:
    transactions_table = Table(_transactions_ref)

    SCHEMA = [
        SchemaField('id', 'INT64', 'REQUIRED', None, ()),
        SchemaField('sale', 'FLOAT64', 'REQUIRED', None, ()),
        SchemaField('pointz_sale', 'FLOAT64', 'REQUIRED', None, ()),
        SchemaField('year', 'INT64', 'REQUIRED', None, ()),
        SchemaField('month', 'INT64', 'REQUIRED', None, ()),
        SchemaField('day', 'INT64', 'REQUIRED', None, ()),
        SchemaField('store_name', 'string', 'REQUIRED', None, ()),
        SchemaField('store_id', 'INT64', 'REQUIRED', None, ()),
        SchemaField('region_name', 'string', 'REQUIRED', None, ()),
        SchemaField('region_id', 'INT64', 'REQUIRED', None, ()),
        SchemaField('client_name', 'string', 'REQUIRED', None, ()),
        SchemaField('client_id', 'INT64', 'REQUIRED', None, ()),
        SchemaField('segment_name', 'string', 'REQUIRED', None, ()),
    ]
    transactions_table.schema = SCHEMA
    transactions_table = client.create_table(transactions_table)
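Once `transactions_table` exists (fetched or freshly created), rows matching that schema can be streamed in with the same client. A minimal follow-up sketch with made-up values:

errors = client.insert_rows(transactions_table, [{
    'id': 1, 'sale': 120.0, 'pointz_sale': 12.0,
    'year': 2019, 'month': 5, 'day': 17,
    'store_name': 'Store A', 'store_id': 10,
    'region_name': 'South', 'region_id': 3,
    'client_name': 'ACME', 'client_id': 42,
    'segment_name': 'Retail',
}])
assert errors == [], errors   # insert_rows returns a list of per-row insert errors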
Example #27
def create_temporary_data_source(source_uri):
    """Create a temporary data source so BigQuery can query the CSV in
    Google Cloud Storage.

    Nothing like this is currently implemented in the
    google-cloud-python library.

    Returns a table reference suitable for using in a BigQuery SQL
    query (legacy format).

    """
    schema = [
        {
            "name": "Regional_Office_Name",
            "type": "string"
        },
        {
            "name": "Regional_Office_Code",
            "type": "string"
        },
        {
            "name": "Area_Team_Name",
            "type": "string"
        },
        {
            "name": "Area_Team_Code",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "PCO_Name",
            "type": "string"
        },
        {
            "name": "PCO_Code",
            "type": "string"
        },
        {
            "name": "Practice_Name",
            "type": "string"
        },
        {
            "name": "Practice_Code",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "BNF_Code",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "BNF_Description",
            "type": "string",
            "mode": "required"
        },
        {
            "name": "Items",
            "type": "integer",
            "mode": "required"
        },
        {
            "name": "Quantity",
            "type": "integer",
            "mode": "required"
        },
        {
            "name": "ADQ_Usage",
            "type": "float"
        },
        {
            "name": "NIC",
            "type": "float",
            "mode": "required"
        },
        {
            "name": "Actual_Cost",
            "type": "float",
            "mode": "required"
        },
    ]
    resource = {
        "tableReference": {
            "tableId": TEMP_SOURCE_NAME
        },
        "externalDataConfiguration": {
            "csvOptions": {
                "skipLeadingRows": "1"
            },
            "sourceFormat": "CSV",
            "sourceUris": [source_uri],
            "schema": {
                "fields": schema
            }
        }
    }
    client = bigquery.client.Client(project='ebmdatalab')
    # delete the table if it exists
    dataset = Dataset("tmp_eu", client)
    table = Table.from_api_repr(resource, dataset)
    try:
        table.delete()
    except NotFound:
        pass
    # Now create it
    path = "/projects/ebmdatalab/datasets/%s/tables" % TEMP_DATASET
    client._connection.api_request(method='POST', path=path, data=resource)
    return "[ebmdatalab:%s.%s]" % (TEMP_DATASET, TEMP_SOURCE_NAME)
Example #28
def get_table_object(table_id, dataset, *args):
    return Table(table_id, dataset, *args)
Example #29
def bq_insert(rows: List):
    """
    Inserts rows into BigQuery
    :param rows: list of dictionaries which are representing rows
    :return:
    """
    from google.cloud import bigquery

    if not rows:
        logging.error("no rows to upload")
        return
    bq = bigquery.Client(project=GCP_PROJECT)
    table_ref = TableReference.from_string(
        f"{GCP_PROJECT}.live.om_state_latencies")

    schema = [
        {
            "name": "date",
            "type": "DATE"
        },
        {
            "name": "sym",
            "type": "STRING"
        },
        {
            "name": "from_state",
            "type": "STRING"
        },
        {
            "name": "to_state",
            "type": "STRING"
        },
        {
            "name": "count",
            "type": "INTEGER"
        },
        {
            "name": "average",
            "type": "FLOAT"
        },
        {
            "name": "percentile_10",
            "type": "FLOAT"
        },
        {
            "name": "percentile_50",
            "type": "FLOAT"
        },
        {
            "name": "percentile_90",
            "type": "FLOAT"
        },
        {
            "name": "percentile_99",
            "type": "FLOAT"
        },
        {
            "name": "percentile_99_99",
            "type": "FLOAT"
        },
    ]

    table = Table(table_ref)
    table.schema = schema
    table = bq.create_table(table, exists_ok=True)
    logging.info("inserting {} rows".format(len(rows)))
    res = bq.insert_rows(table, rows)
    logging.info(res)