Example #1
def test_natality_tutorial():
    client = bigquery.Client()
    dataset_ref = client.dataset('natality_regression')
    assert not dataset_exists(dataset_ref, client)

    natality_tutorial.run_natality_tutorial()

    assert dataset_exists(dataset_ref, client)

    table = client.get_table(
        bigquery.Table(dataset_ref.table('regression_input')))
    assert table.num_rows > 0

    client.delete_dataset(dataset_ref, delete_contents=True)
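The test assumes a dataset_exists helper that is not shown here. A minimal sketch, assuming the helper simply probes the dataset with get_dataset and treats NotFound as absence:

from google.cloud.exceptions import NotFound

def dataset_exists(dataset_ref, client):
    # Hypothetical helper: return True if the referenced dataset exists.
    try:
        client.get_dataset(dataset_ref)  # API request
        return True
    except NotFound:
        return False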
Example #2
    def create_table(self, dataset_id, table_id, schema):

        # TODO(developer): Set table_id to the ID of the table to create.
        table_id = f"{self.client.project}.{dataset_id}.{table_id}"

        table = bigquery.Table(table_id, schema=schema)
        try:
            table = self.client.create_table(table)  # Make an API request.
            print("Created table {}.{}.{}".format(table.project,
                                                  table.dataset_id,
                                                  table.table_id))
        except Exception as e:
            print(e)
            pass
Example #3
def test_client_query_relax_column(capsys, client, random_table_id):

    schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]

    client.create_table(bigquery.Table(random_table_id, schema=schema))

    client_query_relax_column.client_query_relax_column(
        client, random_table_id)
    out, err = capsys.readouterr()
    assert "2 fields in the schema are required." in out
    assert "0 fields in the schema are now required." in out
Example #4
    def create_table(self, dataset_id, table_name, schema):
        """Creates BigQuery table.

        Args:
            dataset_id (str): BigQuery dataset id.
            table_name (str): BigQuery table name.
            schema (List[google.cloud.bigquery.schema.SchemaField]): Schema
                of the table to be created.
        """

        dataset_ref = self.client.dataset(dataset_id)
        table_ref = dataset_ref.table(table_name)
        table = bigquery.Table(table_ref, schema)
        self.client.create_table(table)
Example #5
def create_ga_kb_exit_rate_by_product():
    schema = [
        bigquery.SchemaField('ga_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitPagePath', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitRate', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('ga_exits', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('ga_pageviews', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('product', 'STRING', mode='NULLABLE')
    ]
    table_ref = dataset_ref.table('ga_kb_exit_rate_by_product')
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)  # API request

    assert table.table_id == 'ga_kb_exit_rate_by_product'
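This snippet depends on module-level client and dataset_ref objects defined elsewhere in the source file. Under that assumption, they could be set up along these lines (the dataset ID here is hypothetical):

from google.cloud import bigquery

client = bigquery.Client()
dataset_ref = client.dataset('ga_kb_reporting')  # hypothetical dataset ID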
Example #6
def create_new_table(client, dataset):
    """
    Function to create table
    :param client: instance of Bigquery client
    :param dataset: instance of dataset
    :return: table instance
    """
    table_id = "{}.{}.corona_cases_table".format(client.project, dataset.dataset_id)
    table = bigquery.Table(table_id)
    table = client.create_table(table, exists_ok=True)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )
    return table
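A hypothetical call, assuming the dataset has already been created:

from google.cloud import bigquery

client = bigquery.Client()
dataset = client.get_dataset('corona_dataset')  # hypothetical dataset ID
table = create_new_table(client, dataset)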
Example #7
def bq_create_table(client, dataset, tablename, schemaref):
    bigquery_client = client
    dataset_ref = bigquery_client.dataset(dataset)

    # Prepares a reference to the table
    table_ref = dataset_ref.table(tablename)

    try:
        bigquery_client.get_table(table_ref)
    except Exception:
        schema = schemaref
        table = bigquery.Table(table_ref, schema=schema)
        table = bigquery_client.create_table(table)
        print('table {} created.'.format(table.table_id))
Example #8
def test_table_insert_rows_explicit_none_insert_ids(capsys, random_table_id,
                                                    client):

    schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]

    table = bigquery.Table(random_table_id, schema=schema)
    table = client.create_table(table)

    mut.table_insert_rows_explicit_none_insert_ids(random_table_id)
    out, err = capsys.readouterr()
    assert "New rows have been added." in out
Example #9
    def make_bq_empty_table(
        self,
        aa_datafeed_header_file_path: str,
        bq_table_id: str,
    ) -> bool:
        """make bq empty table."""
        if len(self.bq_schema) == 0:
            self.__make_schema(aa_datafeed_header_file_path)

        bq_table = self.bq_dataset.table(bq_table_id)
        bq_table_ref = bigquery.Table(bq_table, schema=self.bq_schema)
        self.bigquery_client.create_table(bq_table_ref)

        return True
Example #10
def build_and_copy_contents(client, src_dataset, dest_dataset):
    """
    Uses google client object to copy non-schemaed data to schemaed table.

    :param client: google client object with permissions
    :param src_dataset: The dataset to copy data from
    :param dest_dataset: The dataset to copy data to.  Its tables are
        created with valid schemas before inserting data.
    """
    LOGGER.info(f'Beginning copy of data from unschemaed dataset, '
                f'`{src_dataset}`, to schemaed dataset, `{dest_dataset}`.')
    table_list = client.list_tables(src_dataset)

    for table_item in table_list:
        # create empty schemaed table with client object
        schema_list = get_table_schema(table_item.table_id)
        dest_table = f'{client.project}.{dest_dataset}.{table_item.table_id}'
        dest_table = bigquery.Table(dest_table, schema=schema_list)
        dest_table = client.create_table(dest_table)  # Make an API request.
        LOGGER.info(
            f'Created empty table `{dest_table.project}.{dest_table.dataset_id}.{dest_table.table_id}`'
        )

        fields_name_str = ',\n'.join([item.name for item in schema_list])
        # copy contents from non-schemaed source to schemaed dest
        sql = (
            f'SELECT {fields_name_str} '
            f'FROM `{table_item.project}.{table_item.dataset_id}.{table_item.table_id}`'
        )
        job_config = bigquery.job.QueryJobConfig(
            write_disposition=bigquery.job.WriteDisposition.WRITE_EMPTY,
            priority=bigquery.job.QueryPriority.BATCH,
            destination=dest_table,
            labels={
                'table_name': table_item.table_id,
                'copy_from': table_item.dataset_id,
                'copy_to': dest_dataset
            })
        job_id = (f'schemaed_copy_{table_item.table_id.lower()}_'
                  f'{datetime.now().strftime("%Y%m%d_%H%M%S")}')
        job = client.query(sql, job_config=job_config, job_id=job_id)
        job.result()  # Wait for the job to complete.

        LOGGER.info(
            f'Table contents `{table_item.project}.{table_item.dataset_id}.'
            f'{table_item.table_id}` were copied to `{dest_table.project}.'
            f'{dest_table.dataset_id}.{dest_table.table_id}`')

    LOGGER.info(f'Completed copy of data from unschemaed dataset, '
                f'`{src_dataset}`, to schemaed dataset, `{dest_dataset}`.')
Example #11
def create_gtrends_queries(dataset_name, table_name):
    schema = [
        bigquery.SchemaField('update_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('region', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('original_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('translated_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('query_key_ts', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('search_increase_pct', 'INTEGER',
                             mode='NULLABLE'),
    ]
    dataset_ref = client.dataset(dataset_name)
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)
Example #12
    def test_remove_unused_fields_from_schema_no_missing_fields(self) -> None:
        """Tests that remove_unused_fields_from_schema() does nothing if there are no missing fields."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id)
        schema_fields = [bigquery.SchemaField("field_1", "STRING")]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table

        new_schema_fields = [bigquery.SchemaField("field_1", "STRING")]

        self.bq_client.remove_unused_fields_from_schema(
            self.mock_dataset_id, self.mock_table_id, new_schema_fields
        )

        self.mock_client.query.assert_not_called()
Example #13
    def test_add_missing_fields_to_schema(self) -> None:
        """Tests that the add_missing_fields_to_schema function calls the client to update the table."""
        table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id)
        schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")]
        table = bigquery.Table(table_ref, schema_fields)
        self.mock_client.get_table.return_value = table

        new_schema_fields = [bigquery.SchemaField("new_schema_field", "STRING")]

        self.bq_client.add_missing_fields_to_schema(
            self.mock_dataset_id, self.mock_table_id, new_schema_fields
        )

        self.mock_client.update_table.assert_called()
Example #14
def add_bigquery(temp_data, table_name, table_path, dataset_name, schema):
    """
    Given a dataset name and a table_name (in bigquery), create
    a table in bigquery with schema given in 'schema' and 
    the data stored in Google Storage path 'table_path'.
    
    It deduces the file format (NJSON or CSV) from temp_data.
    """

    if local:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/tmp/key.json'
    bq = bigquery.Client(project='gabinete-compartilhado')

    ds = bq.dataset(dataset_name)

    # Create dataset if non-existent:
    try:
        bq.create_dataset(ds)
    except Exception:  # the dataset may already exist
        pass

    # Create a local object (bigQuery table):
    table_ref = ds.table(table_name)
    table = bigquery.Table(table_ref)

    # Configure the bigquery table:
    if os.path.exists(temp_data):
        external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    elif os.path.exists(temp_data.replace('.json', '.csv')):
        external_config = bigquery.ExternalConfig('CSV')
    else:
        raise Exception('unknown temp_data file extension')

    external_config.schema = schema
    # external_config.autodetect = True
    external_config.ignore_unknown_values = True
    external_config.max_bad_records = 100
    source_uris = [table_path]
    external_config.source_uris = source_uris
    table.external_data_configuration = external_config

    # create table (first delete existent table):
    try:
        bq.delete_table(table)
    except Exception as e:
        print(e)
        pass

    bq.create_table(table)
    print('Table created.')
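A hypothetical invocation, assuming a newline-delimited JSON file staged both locally and in Cloud Storage, and that the module-level local flag is already defined:

from google.cloud import bigquery

schema = [
    bigquery.SchemaField('id', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('value', 'FLOAT', mode='NULLABLE'),
]
add_bigquery('/tmp/sample.json', 'sample_table',
             'gs://my-bucket/sample.json', 'my_dataset', schema)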
Example #15
def check_table_existance(client, full_table_id, schema):
    '''
        @parameters client, full_table_id, schema
        
        Using the full table ID, see if the table exists already and, if not,
        create the empty table.
    '''

    table_exists = None

    try:
        client.get_table(full_table_id)
        table_exists = True
    except NotFound:
        table_exists = False

    if table_exists:
        print(f'{full_table_id} exists. Deleting the existing table.')
        client.delete_table(full_table_id)
        table = bigquery.Table(full_table_id, schema=schema)
        table.clustering_fields = [
            "CHROM", "ID", "analysis_workflow_type", "project_short_name"
        ]
        table = client.create_table(table)
        print(
            f"Created clustered table {table.project}, {table.dataset_id}, {table.table_id}"
        )
    else:
        print(f'{full_table_id} does not exist. Creating the table')
        table = bigquery.Table(full_table_id, schema=schema)
        table.clustering_fields = [
            "CHROM", "ID", "analysis_workflow_type", "project_short_name"
        ]
        table = client.create_table(table)
        print(
            f"Created clustered table {table.project}, {table.dataset_id}, {table.table_id}"
        )
Example #16
    def bigquery_commit(element):
        """
        Initialize bigquery client
        Specify dataset, table and schema
        Append keys to key list
        Commit batches to bigquery in stream format
        """

        source = contact_options.source.get()

        logging.info("Element BigQuery length: {}".format(len(element)))

        client = bigquery.Client(project=PROJECT)
        dataset_ref = client.dataset('campaign_contact')
        table_ref = dataset_ref.table(source)

        schema = [bigquery.SchemaField('id', 'STRING', mode='REQUIRED')]

        # BigQuery table checker function
        def if_tbl_exists(clients, table_refs):
            """Check if table exists, if False, create table"""
            logging.info("Check Table")
            try:
                clients.get_table(table_refs)
                return True
            except NotFound:
                return False

        if if_tbl_exists(client, table_ref) is False:
            table = bigquery.Table(table_ref, schema=schema)
            client.create_table(table)
            logging.info("Table created: {}".format(table_ref))
        else:
            pass

        key_data = []

        for elem in element:
            key_data.append({"id": elem[0][0]})

        if len(key_data) == len(element):
            client.insert_rows_json(table_ref,
                                    key_data,
                                    ignore_unknown_values=True,
                                    skip_invalid_rows=True)
            phone_numbers = []
            logging.info(
                "BigQuery Batch Commit, number of elements: {}".format(
                    len(key_data)))
Example #17
def generate_bigquery_assets(client_bq, bigquery_asset_list):
    """ This function provisions all required BigQuery datasets & tables.
    We combine datasets & tables within a single function because creating
    tables requires the same references used when creating the datasets.
    """

    from common.bigquery_schema import bigquery_table_schema_dict
    from google.cloud.exceptions import NotFound
    from google.cloud import bigquery

    def asset_exists(client, asset_type, reference):

        if asset_type not in ['dataset', 'table']:
            raise TypeError(
                'Error - asset_type must be one of: {dataset, table}!')

        try:
            if asset_type == 'dataset':
                client.get_dataset(reference)
            elif asset_type == 'table':
                client.get_table(reference)
            return True
        except NotFound:
            return False

    # Create dataset if it does not exist..
    for dataset_name in set([bq_asset[0] for bq_asset in bigquery_asset_list]):
        dataset_ref = client_bq.dataset(dataset_name)
        if not asset_exists(client_bq, 'dataset', dataset_ref):
            dataset = bigquery.Dataset(dataset_ref)
            client_bq.create_dataset(dataset)

    # Create table if it does not exist..
    table_list = []
    for bq_asset in bigquery_asset_list:
        dataset_name, table_name, table_schema, table_partition = bq_asset
        table_ref = client_bq.dataset(dataset_name).table(table_name)
        if not asset_exists(client_bq, 'table', table_ref):
            table = bigquery.Table(
                table_ref, schema=bigquery_table_schema_dict[table_schema])
            if table_partition:
                table.time_partitioning = bigquery.TimePartitioning(
                    type_=bigquery.TimePartitioningType.DAY,
                    field=table_partition)
            table_list.append(client_bq.create_table(table))
        else:
            table_list.append(client_bq.get_table(table_ref))

    return table_list
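The function expects bigquery_asset_list to be a list of (dataset_name, table_name, table_schema, table_partition) tuples; a hypothetical example, assuming the schema keys exist in bigquery_table_schema_dict:

bigquery_asset_list = [
    # (dataset_name, table_name, table_schema, table_partition)
    ('logs', 'events_logs', 'logs', 'event_ds'),
    ('events', 'events_native', 'events', None),
]
tables = generate_bigquery_assets(client_bq, bigquery_asset_list)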
Example #18
    def merge_task(country, **context):
        client = bigquery.Client()

        table_ref = create_dataset(
            client,
            destination_dataset_name,
            project=destination_dataset_project_id,
        ).table(destination_table_name)
        if not does_table_exist(client, table_ref):
            table = bigquery.Table(table_ref,
                                   schema=read_bigquery_schema("world_pop"))
            table.range_partitioning = RangePartitioning(
                field="year",
                range_=PartitionRange(start=1900, end=2100, interval=1),
            )
            table.clustering_fields = [
                "geography",
                "geography_polygon",
                "country",
            ]
            client.create_table(table)

        job_config = bigquery.QueryJobConfig()
        job_config.priority = bigquery.QueryPriority.INTERACTIVE

        sql_path = os.path.join(
            dags_folder, "resources/stages/load/sqls/merge_worldpop.sql")
        sql_template = read_file(sql_path)

        execution_date = context["execution_date"]
        year = execution_date.strftime("%Y")
        staging_table_name = "{table}_{country}_{year}".format(
            table=destination_table_name, country=country, year=year)

        template_context = {
            "year": year,
            "country": country,
            "source_table": staging_table_name,
            "source_project_id": staging_dataset_project_id,
            "source_dataset_name": staging_dataset_name,
            "destination_table": destination_table_name,
            "destination_dataset_project_id": destination_dataset_project_id,
            "destination_dataset_name": destination_dataset_name,
        }

        sql = context["task"].render_template(sql_template, template_context)
        job = client.query(sql, location="US", job_config=job_config)
        submit_bigquery_job(job, job_config)
        assert job.state == "DONE"
Example #19
def create_table_with_schema(out_ref, flavor):
    table = bq.Table(out_ref, schema=bq_schema.make_aug_npl_schema(flavor))
    if any(
        [
            out_ref.table_id == table.table_id
            for table in client.list_tables(out_ref.dataset_id)
        ]
    ):
        client.delete_table(table)

        msg.info(
            f"{out_ref.dataset_id}.{out_ref.table_id} already exists. Will overwrite."
        )

    client.create_table(table)
Example #20
def test_table_exists(client, to_delete):
    """Determine if a table exists."""
    DATASET_ID = 'get_table_dataset_{}'.format(_millis())
    TABLE_ID = 'get_table_table_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    dataset = client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table(TABLE_ID)
    table = bigquery.Table(table_ref, schema=SCHEMA)
    table = client.create_table(table)
    to_delete.insert(0, table)

    assert table_exists(client, table_ref)
    assert not table_exists(client, dataset.table('i_dont_exist'))
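Like dataset_exists in the first example, the table_exists helper is assumed rather than shown; a minimal sketch:

from google.cloud.exceptions import NotFound

def table_exists(client, table_ref):
    # Hypothetical helper: return True if the referenced table exists.
    try:
        client.get_table(table_ref)  # API request
        return True
    except NotFound:
        return False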
Example #21
        def create_mapping_table(self):
            """
            Create a mapping table with a mapping table schema.
            """

            # create a false mapping table
            schema = [
                bigquery.SchemaField("person_id", "INTEGER", mode="REQUIRED"),
                bigquery.SchemaField("research_id", "INTEGER",
                                     mode="REQUIRED"),
                bigquery.SchemaField("shift", "INTEGER", mode="REQUIRED"),
            ]

            table = bigquery.Table(self.fq_mapping_tablename, schema=schema)
            table = self.client.create_table(table)  # Make an API request.
Example #22
def table(dataset, scope='function'):
    name = f'{TEST_PROJECT_ID}.{dataset.dataset_id}.{__generate_uuid()}_quickstart_test_table'
    schema = [
        bigquery.SchemaField('name', 'STRING', 'REQUIRED'),
        bigquery.SchemaField('email', 'STRING', 'REQUIRED')
    ]
    table = bigquery_client.create_table(bigquery.Table(name, schema=schema))

    time.sleep(
        2)  # Wait a few seconds for Data Catalog's search index sync/update.
    yield table

    bigquery_client.delete_table(table)
    time.sleep(
        2)  # Wait a few seconds for Data Catalog's search index sync/update.
Example #23
def newCase():
    uid = uuid.uuid4()
    name = request.args.get("name")
    day = datetime.datetime.now().strftime("%Y-%m-%d")
    if not name:
        name = ""

    #Write the new Case details to Datastore
    wazePut = caseModel(uid=str(uid), day=day, name=name)
    wazeKey = wazePut.put()

    #Create the BigQuery Client
    client = bigquery.Client()
    datasetRef = client.dataset(bqDataset)
    tableSuffix = str(uid).replace('-', '_')

    #Create the Jams Table
    jamsTable = 'jams_' + tableSuffix
    tableRef = datasetRef.table(jamsTable)
    table = bigquery.Table(tableRef, schema=jamsSchema)
    table = client.create_table(table)
    assert table.table_id == jamsTable

    #Create the Alerts Table
    alertsTable = 'alerts_' + tableSuffix
    tableRef = datasetRef.table(alertsTable)
    table = bigquery.Table(tableRef, schema=alertsSchema)
    table = client.create_table(table)
    assert table.table_id == alertsTable

    #Create the Irregularities Table
    irregularitiesTable = 'irregularities_' + tableSuffix
    tableRef = datasetRef.table(irregularitiesTable)
    table = bigquery.Table(tableRef, schema=irregularitiesSchema)
    table = client.create_table(table)
    assert table.table_id == irregularitiesTable
Example #24
def create_table(datasetId, tableId):
    bq_client = bigquery.Client()
    dataset_ref = bq_client.dataset(datasetId)
    # Prepares a reference to the table
    table_ref = dataset_ref.table(tableId)
    try:
        bq_client.get_table(table_ref)
    except Exception:  # table does not exist yet
        schema = [
            bigquery.SchemaField('slug', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('environment', 'STRING', mode='REQUIRED')
        ]
        table = bigquery.Table(table_ref, schema=schema)
        table = bq_client.create_table(table)
        return 'table {} created.'.format(table.table_id)
Example #25
def create_view(client, dataset_id, view_id, query_string):

    view_id = f"zapp-case-study.{dataset_id}.{view_id}"
    view = bigquery.Table(view_id)

    # The source table in this example is created from a CSV file in Google
    # Cloud Storage located at
    # `gs://cloud-samples-data/bigquery/us-states/us-states.csv`. It contains
    # 50 US states, while the view returns only those states with names
    # starting with the letter 'W'.
    view.view_query = query_string

    # Make an API request to create the view.
    view = client.create_table(view)
    print(f"Created {view.table_type}: {str(view.reference)}")
Example #26
    def deploy(self, destination_table: str):
        """Deploy the schema to BigQuery named after destination_table."""
        client = bigquery.Client()
        tmp_schema_file = NamedTemporaryFile()
        self.to_json_file(Path(tmp_schema_file.name))
        bigquery_schema = client.schema_from_json(tmp_schema_file.name)

        try:
            # destination table already exists, update schema
            table = client.get_table(destination_table)
            table.schema = bigquery_schema
            client.update_table(table, ["schema"])
        except NotFound:
            table = bigquery.Table(destination_table, schema=bigquery_schema)
            client.create_table(table)
Example #27
def create_table(client: bigquery.Client, dataset_id: str, table_id: str,
                 fields: List[Dict]):
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))
    table_ref = dataset.table(table_id)
    table = bigquery.Table(table_ref, schema=fields)
    table.time_partitioning = bigquery.table.TimePartitioning(
    )  # partition by day

    try:
        table = client.create_table(table)
        print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                              table.table_id))
    except gexceptions.GoogleAPIError as e:
        print("Table {} could not be created: {}. Skipping...".format(
            table_id, e))
Example #28
def test_update_table_require_partition_filter(capsys, random_table_id,
                                               client):

    # Make a partitioned table.
    schema = [bigquery.SchemaField("transaction_timestamp", "TIMESTAMP")]
    table = bigquery.Table(random_table_id, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        field="transaction_timestamp")
    table = client.create_table(table)

    update_table_require_partition_filter.update_table_require_partition_filter(
        random_table_id)
    out, _ = capsys.readouterr()
    assert ("Updated table '{}' with require_partition_filter=True".format(
        random_table_id) in out)
Example #29
def create_table(project, dataset_id, table_id):
    try:
        client = bigquery.Client(project=project)
        dataset_ref = client.dataset(dataset_id)

        schema = [
            bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
        ]
        table_ref = dataset_ref.table(table_id)
        table = bigquery.Table(table_ref, schema=schema)
        table = client.create_table(table)
        print('Table {} created'.format(table_id))
    except Exception:
        print('Table {} already exists'.format(table_id))
Example #30
    def run_query_job(self, query, table_type='flat'):
        client = bigquery.Client()
        table_name = "{p}.{ds}.{t}_{d}".format(
            p=self.gcp_project, ds=self.dataset, t=table_type, d=self.date_shard)
        table_id = bigquery.Table(table_name)
        query_job_config = bigquery.QueryJobConfig(
            destination=table_id,
            dry_run=False,
            use_query_cache=False,
            labels={"queryfunction": "flatteningquery"},  # TODO: apply proper labels
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
        query_job = client.query(query, job_config=query_job_config)
        query_job.result()  # Waits for the job to complete.
        return