def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(
        bigquery_client, dataset_id):
    """Test that a DataFrame containing column with None-type values and int64 datatype
    can be uploaded without specifying a schema.

    https://github.com/googleapis/python-bigquery/issues/22
    """

    table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format(
        bigquery_client.project, dataset_id)
    df_data = collections.OrderedDict([("x",
                                        pandas.Series([1, 2, None, 4],
                                                      dtype="Int64"))])
    dataframe = pandas.DataFrame(df_data, columns=df_data.keys())
    load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
    load_job.result()
    table = bigquery_client.get_table(table_id)
    assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"), )
    assert table.num_rows == 4
Example #2
def gcs_to_gbq():
    # Connect to Google Cloud Storage and set bucket
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(config.gcp['bucket_name'])
    blobs = bucket.list_blobs()
    client = bigquery.Client()
    # Connect to Google BigQuery and define BigQuery options
    dataset_ref = client.dataset(config.gcp['dataset_id'])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.allow_quoted_newlines = True
    job_config.allow_jagged_rows = True
    job_config.autodetect = False
    job_config.skip_leading_rows = 1
    job_config.field_delimiter = "|"
    job_config.source_format = bigquery.SourceFormat.CSV
    # Iterate through all files of the defined Google Cloud Storage bucket
    for blob in blobs:
        # Set dynamic URL for current Cloud Storage file and BigQuery schema file
        uri = 'gs://' + config.gcp['bucket_name'] + '/' + blob.name
        file_name_json = blob.name.replace(".csv", ".json")
        file_name_json_path = os.path.join(config.general['csv_cache_folder'],
                                           file_name_json)
        # Load JSON file for schema and set schema fields for BigQuery
        with open(file_name_json_path) as input_json:
            input_json_config = json.load(input_json)
        job_config.schema = [
            bigquery.SchemaField(item["name"], item["type"])
            for item in input_json_config["fields"]
        ]
        # Set dynamic table name for Google BigQuery
        table_name = blob.name.replace(".csv", "")
        # Create new BigQuery table / replace existing one
        load_job = client.load_table_from_uri(uri,
                                              dataset_ref.table(table_name),
                                              job_config=job_config)
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
    # Send slack message when load job done
    send_message_to_slack(
        ':bigquery: Backend Data Sync: Update DWH Data OK :thumbs-up-green:',
        config.slack_hooks['slack_channel_1'])
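
The loop above assumes each CSV in the bucket has a companion JSON schema file in the local cache folder; the example does not show that file. A minimal sketch of its layout, with illustrative column names, shown as the dict that json.load() would return:

# Hypothetical contents of e.g. orders.json for orders.csv in the bucket
input_json_config = {
    "fields": [
        {"name": "order_id", "type": "INTEGER"},
        {"name": "order_date", "type": "DATE"},
        {"name": "amount", "type": "FLOAT"},
    ]
}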
Example #3
def _construct_schema(uuid):
    """
    Creates a BigQuery Schema for the data set with uuid.
    https://cloud.google.com/bigquery/docs/schemas
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.schema.SchemaField.html
    https://socratadiscovery.docs.apiary.io/#reference/0/find-by-id/search-by-id
    """
    catalog_url = '{0}/api/catalog/v1?ids={1}'.format(URI, uuid)
    response = urllib.request.urlopen(catalog_url, context=context)
    catalog_data = json.load(response)["results"][0]["resource"]

    schema = []
    for i in range(0, len(catalog_data["columns_field_name"])):
        name = catalog_data["columns_field_name"][i]
        field_type = _encode_datatype(catalog_data["columns_datatype"][i])
        description = catalog_data["columns_description"][i]
        schema.append(bigquery.SchemaField(name, field_type, mode='NULLABLE', description=description))

    return schema
Example #4
def prepare_streaming_sink(project_id, bq_dataset, bq_table):

    # create BigQuery table
    schema = [bigquery.SchemaField(field_name, data_type)
              for field_name, data_type in BQ_SCHEMA_DEF.items()]

    bq_client = bigquery.Client(project=project_id)
    dataset = bigquery.Dataset(bq_dataset, bq_client)
    table = bigquery.Table(bq_table, dataset, schema=schema)

    if table.exists():
        print('Deleting BQ Table {}...'.format(bq_table))
        table.delete()

    print('Creating BQ Table {}...'.format(bq_table))
    table.create()

    print('BQ Table {} is up and running'.format(bq_table))
    print("")
Example #5
    def finalize(self, client):
        i_dataset, i_table = self.get_tablename().split('.')
        ischema = client.get_table(client.dataset(i_dataset).table(i_table)).schema
        table = client.get_table(client.dataset(i_dataset + '_deid').table(i_table))
        fields = [field.name for field in table.schema]

        newfields = []
        for field in ischema:
            if field.name in fields:
                temp_field = bq.SchemaField(
                    name=field.name,
                    field_type=field.field_type,
                    description=field.description,
                    mode=field.mode
                )
                newfields.append(temp_field)

        table.schema = newfields
        client.update_table(table, ['schema'])
Example #6
    def test_copy_bq_views_not_table(self, mock_table_exists: mock.MagicMock,
                                     mock_copy_view: mock.MagicMock) -> None:
        """Check that copy_view is not called on a table that does not have a view_query."""
        table_ref = bigquery.TableReference(self.mock_source_dataset,
                                            "table_not_view")
        schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")]
        table_not_view = bigquery.Table(table_ref, schema_fields)
        self.mock_client.list_tables.return_value = [table_not_view]
        self.mock_client.get_table.return_value = table_not_view
        mock_table_exists.side_effect = self.table_exists_side_effect

        copy_bq_views(
            source_project_id=self.mock_source_project_id,
            source_dataset_id=self.mock_source_dataset_id,
            destination_project_id=self.mock_destination_project_id,
            destination_dataset_id=self.mock_destination_dataset_id,
        )

        mock_copy_view.assert_not_called()
Example #7
    def update_table(self):
        if is_exist(self.dataset_id, table_id=self.table_id):
            table = client.get_table(self.table_ref)
            original_schema = table.schema
            new_schema = [
                bigquery.SchemaField(field["name"], field["type"])
                for field in self.parse_file.schema()
            ]

            if not schema_match(original_schema, new_schema):
                print("Recreating table {} with new schema...".format(
                    self.table_id))
                self.create_table()
        else:
            print("table does not exist. Creating {} table...".format(
                self.table_id))
            self.create_table()
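
is_exist, schema_match, and self.parse_file are helpers from that project and are not shown. A minimal sketch of what schema_match might look like, assuming it only compares field names and types:

def schema_match(original_schema, new_schema):
    # Hypothetical helper: schemas are considered equal when the ordered
    # (name, type) pairs match; mode and description are ignored here.
    original = [(field.name, field.field_type) for field in original_schema]
    new = [(field.name, field.field_type) for field in new_schema]
    return original == new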
Example #8
    def _extract_schema(self, df):
        schema = []
        for column_name, dtype in dict(df.dtypes).items():
            type_map = {
                np.dtype('object'): bigquery.SchemaField(column_name, 'STRING'),
                np.dtype('int32'): bigquery.SchemaField(column_name, 'INTEGER'),
                np.dtype('int64'): bigquery.SchemaField(column_name, 'INTEGER'),
                np.dtype('float64'): bigquery.SchemaField(column_name, 'FLOAT'),
                np.dtype('datetime64[ns]'): bigquery.SchemaField(column_name, 'DATE'),
                pd.DatetimeTZDtype(tz='UTC'): bigquery.SchemaField(column_name, 'DATE')
            }
            schema.append(type_map[dtype])

        return schema
Example #9
def uploadToBigQuery(df):
    # Since string columns use the "object" dtype, pass in a (partial) schema
    # to ensure the correct BigQuery data type.
    job_config = bigquery.LoadJobConfig(schema=[
        bigquery.SchemaField("Timestamp", "TIMESTAMP"),
        bigquery.SchemaField("Activity", "STRING"),
        bigquery.SchemaField("Description", "STRING"),
        bigquery.SchemaField("Location", "STRING"),
        bigquery.SchemaField("SoraRank", "INTEGER"),
        bigquery.SchemaField("client_ip", "STRING"),
    ])
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    # Wait for the load job to complete.
    print(job.result())
Example #10
def create_ga_kb_exit_rate_by_product():
    schema = [
        bigquery.SchemaField('ga_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitPagePath', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitRate', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('ga_exits', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('ga_pageviews', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('product', 'STRING', mode='NULLABLE')
    ]
    table_ref = dataset_ref.table('ga_kb_exit_rate_by_product')
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)  # API request

    assert table.table_id == 'ga_kb_exit_rate_by_product'
Example #11
def csv_loader(data, context):
        client = bigquery.Client()
        dataset_id = os.environ['DATASET']
        dataset_ref = client.dataset(dataset_id)
        job_config = bigquery.LoadJobConfig()
        job_config.schema = [
                bigquery.SchemaField('Date', 'DATE'),
                bigquery.SchemaField('Fund', 'STRING'),
                bigquery.SchemaField('Company', 'STRING'),
                bigquery.SchemaField('Ticker', 'STRING'),
                bigquery.SchemaField('CUSIP', 'STRING'),
                bigquery.SchemaField('Shares', 'FLOAT'),
                bigquery.SchemaField('Market_Value', 'FLOAT'),
                bigquery.SchemaField('Weight', 'FLOAT'),
                ]
        job_config.skip_leading_rows = 1
        job_config.source_format = bigquery.SourceFormat.CSV

        # get the URI for uploaded CSV in GCS from 'data'
        uri = f"gs://{os.environ['BUCKET']}/{data['name']}"

        # start the load job
        load_job = client.load_table_from_uri(
                uri,
                dataset_ref.table(os.environ['TABLE']),
                job_config=job_config)

        print(f'Starting job: {load_job.job_id}')
        print(f"Function=csv_loader, Version={os.environ['VERSION']}")
        print(f"Loading file: {data['name']}")

        load_job.result()  # wait for table load to complete.
        print('Job finished.')

        destination_table = client.get_table(dataset_ref.table(os.environ['TABLE']))
        print(f"Table: {os.environ['TABLE']} now have {destination_table.num_rows} rows.")

        time.sleep(1)
        tbl = f"{dataset_id}.{os.environ['TABLE']}"
        dedup(tbl)
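
dedup is a helper that is not shown in the example. A plausible sketch, assuming it simply rewrites the table with duplicate rows removed:

def dedup(tbl):
    # Hypothetical helper: replace the table with its distinct rows.
    client = bigquery.Client()
    query = f"CREATE OR REPLACE TABLE `{tbl}` AS SELECT DISTINCT * FROM `{tbl}`"
    client.query(query).result()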
Example #12
def create_gtrends_queries(dataset_name, table_name):
    schema = [
        bigquery.SchemaField('update_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('region', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('original_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('translated_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('query_key_ts', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('search_increase_pct', 'INTEGER',
                             mode='NULLABLE'),
    ]
    dataset_ref = client.dataset(dataset_name)
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)
Example #13
def stage_imdb_title_principals():
    uri = 'gs://udacity-de/imdb/title.principals.tsv.gz'
    source_format = bigquery.SourceFormat.CSV
    table_id = 'imdb.title_principals'
    schema = [
        bigquery.SchemaField("tconst", "STRING"),
        bigquery.SchemaField("ordering", "INT64"),
        bigquery.SchemaField("nconst", "STRING"),
        bigquery.SchemaField("category", "STRING"),
        bigquery.SchemaField("job", "STRING"),
        bigquery.SchemaField("characters", "STRING"),
    ]
    field_delimiter = '\t'
    load_csv(uri, table_id, schema, field_delimiter)
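
load_csv is a project helper that is not shown. A minimal sketch under the assumption that it wraps load_table_from_uri with an explicit schema, delimiter, and truncate-on-write behavior:

def load_csv(uri, table_id, schema, field_delimiter=','):
    # Hypothetical helper: load a (possibly gzipped) CSV from GCS into the
    # given table, replacing any existing contents.
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        source_format=bigquery.SourceFormat.CSV,
        field_delimiter=field_delimiter,
        skip_leading_rows=1,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()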
Example #14
def push_data_to_bigquery(client, df, dataset_id, table_id, date_column_name):
    table_schema = []
    for name, dtype in zip(df.columns, df.dtypes):
        if name == date_column_name:
            table_schema.append('DATETIME')
        elif dtype.name == 'object':
            table_schema.append('STRING')
        else:
            table_schema.append(str(dtype).upper())

    # Assigning the references
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    # Extracting the column names
    bq_column_names = list(df.columns)

    # Converting the date column
    df[date_column_name] = pd.to_datetime(df[date_column_name])
    df[date_column_name] = df[date_column_name].dt.strftime(
        "%Y-%m-%dT%H:%M:%S")

    # Customise the Jobconfig setup
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    schema_results = []

    # Dynamically creating the SQL Schema From Two Lists (Column Names)
    for name, schema in zip(bq_column_names, table_schema):
        schema_results.append(
            bigquery.SchemaField(name, schema, mode='NULLABLE'))

    job_config.schema = schema_results

    # Running the job inside of a StringIO stream:
    with io.StringIO(df.to_json(orient="records", lines=True)) as source_file:
        job = client.load_table_from_file(source_file,
                                          table_ref,
                                          job_config=job_config)
    job.result()
Example #15
def create_table(client, table_name, schema, project=None, dataset=None, partitioning_type=None, partitioned_field=None, clustering_fields=None):
    """
    Create Table with Schema
    :param client: BQ Client
    :param table_name: Table name
    :param schema: Table Schema
    :param project: default to client.project
    :param dataset: default to client.dataset
    :param partitioning_type: either `time` or `range` partitioning
    :param partitioned_field: field name used for partitioning
    :param clustering_fields: fields to use for clustering
    :return: created table
    """
    partitioning_types = {
        "time" : TimePartitioning(type_= TimePartitioningType.HOUR, field=partitioned_field, require_partition_filter=True),
        "range" : RangePartitioning(range_= PartitionRange(start=0, end=100, interval=10), field=partitioned_field)
    }
    try:
        if project is None:
            project = client.project
        if dataset is None:
            dataset = client.dataset
        logging.info("Project: {}\tDataset: {}\tTable: {}\t\tPartitioning Type:{}".format(project, dataset.dataset_id, table_name, partitioning_type))
        table_id = "{}.{}.{}".format(project, dataset.dataset_id, table_name)
        table = bigquery.Table(table_id, schema=schema)
        if partitioning_type is not None:
            partitioning_type = partitioning_type.lower()
            if partitioning_type == "time":
                logging.info("Table Partitioning: {}".format(partitioning_type))
                schema.append(bigquery.SchemaField("ZONE_PARTITIONTIME","TIMESTAMP"))
                table.schema = schema
                table.time_partitioning = partitioning_types.get(partitioning_type)
            elif partitioning_type == "range":
                table.range_partitioning = partitioning_types.get(partitioning_type)
        if clustering_fields is not None:
            table.clustering_fields = clustering_fields
        client.create_table(table, exists_ok=True)
        table = client.get_table(table)
        logging.info("Table {} created successfully.".format(table_id))
        return table
    except Exception as error:
        raise error
Example #16
def download_race_pitstop(year, round_start, round_end, key_path):
    round_num = round_start
    # Obtain schema for upload
    bigquery_client = create_bigquery_client(key_path)
    dataset = bigquery_client.dataset('F1_Modelling_Raw')
    schema_for_upload = [
        bigquery.SchemaField("year", "INTEGER"),
        bigquery.SchemaField("round", "INTEGER"),
        bigquery.SchemaField("driverId", "STRING"),
        bigquery.SchemaField("stop", "INTEGER"),
        bigquery.SchemaField("lap", "INTEGER"),
        bigquery.SchemaField("time", "TIME"),
        bigquery.SchemaField("duration", "FLOAT")
    ]

    clear_uploads_folder('bigquery_upload')

    while round_num <= round_end:
        print('***Attempting year {0}, round number {1}'.format(year, round_num))
        if len(str(round_num)) == 1:
            round_num_csv_suffix = '0' + str(round_num)
        else:
            round_num_csv_suffix = str(round_num)

        # Query the Ergast API
        print('Round {0}: Attempting to request Pitstops json from Ergast API'.format(round_num))
        response = rq.get('http://ergast.com/api/f1/{0}/{1}/pitstops.json?limit=100'.format(year,round_num))
        pitstops_dict = response.json()
        print('Round {0}: Successfully converted response into pitstops_dict'.format(round_num))

        # Reformat the resulting dataframe
        pitstops_df = pd.DataFrame(data = pitstops_dict['MRData']['RaceTable']['Races'][0]['PitStops'])
        pitstops_df['year'] = year
        pitstops_df['round'] = round_num
        pitstops_df = pitstops_df[['year', 'round', 'driverId', 'stop', 'lap', 'time', 'duration']]

        with open('bigquery_upload/pitstops_{0}{1}.csv'.format(year,round_num_csv_suffix), 'w',newline='') as csv_file:
            pitstops_df.to_csv(csv_file,index=False)
        print('Round {0}: Successfully saved into a csv file'.format(round_num))

        table = dataset.table('Race_Pitstops_{0}{1}'.format(year,round_num_csv_suffix))
        upload_to_bigquery('bigquery_upload/pitstops_{0}{1}.csv'.format(year, round_num_csv_suffix), bigquery_client, schema_for_upload, table)
        print('***Successfully completed upload of year {0}, round {1}'.format(year,round_num))
        round_num += 1
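
create_bigquery_client, clear_uploads_folder, and upload_to_bigquery are helpers from that project and are not shown. A rough sketch of upload_to_bigquery, assuming it loads the local CSV into the given table with the supplied schema:

def upload_to_bigquery(csv_path, bigquery_client, schema_for_upload, table):
    # Hypothetical helper: load the local CSV file into the target table.
    job_config = bigquery.LoadJobConfig(
        schema=schema_for_upload,
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
    )
    with open(csv_path, 'rb') as source_file:
        load_job = bigquery_client.load_table_from_file(
            source_file, table, job_config=job_config)
    load_job.result()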
Example #17
def up(client):
    migration = BigQueryMigration(client)
    dataset = migration.dataset(dataset_name)

    table = migration.client.get_table(dataset.table(table_name))
    orig_schema = table.schema
    new_schema = orig_schema.copy()

    if table.schema[-1].name == 'product_type':
        logging.warning(
            f'product_type already added to {table_name} in {dataset_name} dataset!'
        )
        return dataset

    new_schema.append(bigquery.SchemaField('product_type', 'STRING'))
    table.schema = new_schema

    migration.client.update_table(table, ['schema'])

    return dataset
Example #18
def create_schema_object_from_json(table_name):
    f = open("./{0}.json".format(table_name))
    txt = f.read()
    json_data = json.loads(txt)

    datatype = ''
    schema = []

    for e in json_data:
        if e['type'] == 'INTEGER':
            datatype = 'INT64'
        elif e['type'] == 'FLOAT':
            datatype = 'FLOAT64'
        else:
            datatype = e['type']

        schemaField = bigquery.SchemaField(e['name'], datatype)
        schema.append(schemaField)

    return schema
Example #19
    def create_table(self, dest_dataset, dest_table, list_schema):

        dataset = self.client.dataset(dest_dataset)
        if not dataset.exists():
            print("Dataset {} does not exist.".format(dest_dataset))
            return

        table = dataset.table(dest_table)
        if table.exists():
            print("Table [{}.{}] existed already. Skip creating.".format(
                dest_dataset, dest_table))
            return

        schemas = []
        for item, field_type in list_schema:
            schemas.append(bigquery.SchemaField(item, field_type))
        table.schema = schemas

        table.create()
        print("Empty Table [{}.{}] created.".format(dest_dataset, dest_table))
Example #20
def load_data():
    client = bigquery.Client()
    dataset_id = 'landing'
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    # Allows to update with any new schema/field additions. Optional Extra (Required in this case).
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
    ]
    job_config.schema = [
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("subreddit", "STRING"),
        bigquery.SchemaField("score", "Integer"),
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("url", "STRING"),
        bigquery.SchemaField("comms_num", "Integer"),
        bigquery.SchemaField("created", "DATE"),
    ]
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://dubai_source_api/source-api"

    # API request
    try:
        load_job = client.load_table_from_uri(uri,
                                              dataset_ref.table("dubai_posts"),
                                              job_config=job_config)
    except Exception as e:
        print('Failed to load data due to an unexpected error.')
        print('System abort!')
        print(str(e))
        sys.exit()
    else:
        print("Starting job {}".format(load_job.job_id))

    # Waits for table load to complete.
    load_job.result()
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("dubai_posts"))
    table = "landing.dubai_posts"
    print("Loaded {} rows into {}.".format(destination_table.num_rows, table))
Example #21
def BuildSchema(host, database, user, password, table):
    logging.debug('build schema for table %s in database %s' % (table, database))
    conn = Connect(host, database, user, password)
    cursor = conn.cursor()
    cursor.execute("DESCRIBE %s;" % table)

    tableDecorator = cursor.fetchall()
    schema = []

    for col in tableDecorator:
        colType = col[1].split("(")[0]
        if colType not in bqTypeDict:
            logging.warning("Unknown type detected, using string: %s", str(col[1]))

        field_mode = "NULLABLE" if col[2] == "YES" else "REQUIRED"
        field = bigquery.SchemaField(col[0], bqTypeDict.get(colType, "STRING"), mode=field_mode)

        schema.append(field)

    return tuple(schema)
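
Connect and bqTypeDict are defined elsewhere in that script. A plausible MySQL-to-BigQuery type map, assuming only the common column types need to be covered; anything missing falls back to STRING in BuildSchema above:

# Hypothetical mapping from MySQL column types to BigQuery field types.
bqTypeDict = {
    'int': 'INTEGER',
    'bigint': 'INTEGER',
    'smallint': 'INTEGER',
    'tinyint': 'INTEGER',
    'float': 'FLOAT',
    'double': 'FLOAT',
    'decimal': 'FLOAT',
    'char': 'STRING',
    'varchar': 'STRING',
    'text': 'STRING',
    'date': 'DATE',
    'datetime': 'TIMESTAMP',
    'timestamp': 'TIMESTAMP',
}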
Example #22
def resetOutputTable(bigquery_client, project, dataset, table_name):
    dataset_ref = bigquery_client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)
    try:
        bigquery_client.delete_table(table_ref)
    except NotFound:
        pass
    schema = [
        bigquery.SchemaField('asofdate', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('datasource', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('type', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('mau', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('low90', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('high90', 'FLOAT', mode='REQUIRED'),
    ] + [
        bigquery.SchemaField('p{}'.format(q), 'FLOAT', mode='REQUIRED')
        for q in range(10, 100, 10)
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table)
    return table
Example #23
    def _create_schemafield_list(self, schema_file_path):
        """
        スキーマのjsonを引数にとって、BigQueryロード用のSchemaFieldを生成する
        see: https://cloud.google.com/bigquery/docs/schemas?hl=ja#creating_a_json_schema_file
        """
        with open(schema_file_path, mode='r') as input_file:
            json_file = input_file.read()
            schema_json = json.loads(json_file)

            schema_field_list = []
            for schema in schema_json:
                # FIXME: support modes other than NULLABLE
                schema_field = bigquery.SchemaField(
                    schema['name'],
                    schema['type'],
                    mode="NULLABLE",
                    description=schema['description'])
                schema_field_list.append(schema_field)

        return schema_field_list
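
The schema file read above follows the JSON schema layout from the linked docs. A minimal example of one entry, shown as the Python list that json.loads() would return, assuming description is always present since the code reads it unconditionally:

# Hypothetical contents of a schema file passed to _create_schemafield_list
schema_json_example = [
    {
        "name": "user_id",
        "type": "STRING",
        "description": "Unique identifier for the user",
    },
]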
Example #24
    def test_run_query_write_to_table(self, tmp_path, bigquery_client,
                                      project_id, temporary_dataset):
        query_file_path = tmp_path / "sql" / "test" / "query_v1"
        os.makedirs(query_file_path)

        query_file = query_file_path / "query.sql"
        query_file.write_text("SELECT 'foo' AS a")

        schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
        table = bigquery.Table(f"{project_id}.{temporary_dataset}.query_v1",
                               schema=schema)
        bigquery_client.create_table(table)

        try:
            result = subprocess.check_output(
                [
                    ENTRYPOINT_SCRIPT,
                    "query",
                    "--dataset_id=" + temporary_dataset,
                    "--destination_table=query_v1",
                    "--project_id=" + project_id,
                    "--replace",
                    str(query_file),
                ],
                stderr=subprocess.STDOUT,
            )
            assert b"Current status: DONE" in result
            assert b"No metadata.yaml found for {}" in result

            result = bigquery_client.query(
                f"SELECT a FROM {project_id}.{temporary_dataset}.query_v1"
            ).result()
            assert result.total_rows == 1
            for row in result:
                assert row.a == "foo"
        except subprocess.CalledProcessError as e:
            assert b"No such file or directory: 'bq'" in e.output
            assert b"No metadata.yaml found for {}" in e.output
            assert (
                b'subprocess.check_call(["bq"] + query_arguments, stdin=query_stream)'
                in e.output)
Example #25
    def process(self, element):
        if not self._initialized:
            logging.info(
                '>>> load_sql_to_bq WriteToBigQuery - loading schema list')
            schema = []
            loaded = json.loads(self._schema.get())
            for f in loaded['fields']:
                schema.append(
                    bigquery.SchemaField(f['name'], f['type'], f['mode']))
            self._schema = schema

            logging.info(
                '>>> load_sql_to_bq WriteToBigQuery - initializing client, refs, data set and table'
            )
            dataset = 'pyr_{}_{}'.format(self._client.get(), self._env.get())
            self._client = bigquery.Client(project=self._project)
            self._dataset_ref = self._client.dataset(dataset)
            self._table_ref = self._dataset_ref.table(self._table.get())

            try:
                # Delete the table if it exists - jc 2/19/20
                self._client.delete_table(self._table_ref)
            except Exception:
                pass

            self._client.create_dataset(self._dataset_ref, exists_ok=True)
            self._client.create_table(
                bigquery.Table(self._table_ref, schema=self._schema))
            self._initialized = True

        logging.info(
            '>>> load_sql_to_bq WriteToBigQuery - writing {} records to bigquery'
            .format(len(element)))
        logging.info(
            '>>> load_sql_to_bq WriteToBigQuery - element len is {}'.format(
                len(json.dumps(element, default=str))))
        rs = self._client.insert_rows(self._table_ref, element, self._schema)
        logging.info(
            '>>> load_sql_to_bq WriteToBigQuery - committed {} records to bigquery'
            .format(len(element)))
        logging.info(rs)
Example #26
def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats(
        bigquery_client, dataset_id):
    from google.cloud.bigquery.job import SourceFormat

    table_schema = (bigquery.SchemaField("float_col", "FLOAT"), )
    df_data = collections.OrderedDict([
        (
            "float_col",
            [
                0.14285714285714285,
                0.51428571485748,
                0.87128748,
                1.807960649,
                2.0679610649,
                2.4406779661016949,
                3.7148514257,
                3.8571428571428572,
                1.51251252e40,
            ],
        ),
    ])
    dataframe = pandas.DataFrame(df_data,
                                 dtype="object",
                                 columns=df_data.keys())

    table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format(
        bigquery_client.project, dataset_id)

    job_config = bigquery.LoadJobConfig(schema=table_schema,
                                        source_format=SourceFormat.CSV)
    load_job = bigquery_client.load_table_from_dataframe(dataframe,
                                                         table_id,
                                                         job_config=job_config)
    load_job.result()

    table = bigquery_client.get_table(table_id)
    rows = bigquery_client.list_rows(table_id)
    floats = [r.values()[0] for r in rows]
    assert tuple(table.schema) == table_schema
    assert table.num_rows == 9
    assert floats == df_data["float_col"]
Example #27
    def store_to_bigquery_(context, target):
        from google.cloud import bigquery

        context = Context().context()
        client = bigquery.Client(project=Context().project_id)
        dataset_id, table_id, target = context['global'][
            'dataset_id'], context[target]['table_id_raw']['id'], target
        bucket_path = '{}/'.format(context[target]["location"])
        dataset_ref = client.dataset(dataset_id)
        job_config = bigquery.LoadJobConfig()
        job_config.create_disposition = 'CREATE_IF_NEEDED'
        job_config.schema = [
            bigquery.SchemaField(element, 'STRING')
            for element in context[target]['table_id_raw']['schema']
        ]
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job_config.skip_leading_rows = 1
        uri = 'gs://{}.csv'.format(context[target]['location'])

        try:
            job_config.field_delimiter = ';'
            load_job = client.load_table_from_uri(uri,
                                                  dataset_ref.table(table_id),
                                                  job_config=job_config)
            load_job.result()
        except Exception:
            job_config.field_delimiter = ','
            load_job = client.load_table_from_uri(uri,
                                                  dataset_ref.table(table_id),
                                                  job_config=job_config)
            load_job.result()

        job_config = bigquery.QueryJobConfig()
        table_ref = client.dataset(dataset_id).table(target)
        job_config.destination = table_ref
        job_config.use_legacy_sql = True
        job_config.create_disposition = 'CREATE_IF_NEEDED'
        job_config.write_disposition = 'WRITE_APPEND'
        query = open_query(context[target]['table_id']['keygen'])
        query_job = client.query(query, job_config=job_config)
        results = query_job.result()
Example #28
def test_client_library_upload_from_dataframe(temp_dataset):
    # [START bigquery_migration_client_library_upload_from_dataframe]
    from google.cloud import bigquery
    import pandas

    df = pandas.DataFrame(
        {
            'my_string': ['a', 'b', 'c'],
            'my_int64': [1, 2, 3],
            'my_float64': [4.0, 5.0, 6.0],
            'my_timestamp': [
                pandas.Timestamp("1998-09-04T16:03:14"),
                pandas.Timestamp("2010-09-13T12:03:45"),
                pandas.Timestamp("2015-10-02T16:00:00")
            ],
        }
    )
    client = bigquery.Client()
    table_id = 'my_dataset.new_table'
    # [END bigquery_migration_client_library_upload_from_dataframe]
    table_id = (
        temp_dataset.dataset_id
        + ".test_client_library_upload_from_dataframe"
    )
    # [START bigquery_migration_client_library_upload_from_dataframe]
    # Since string columns use the "object" dtype, pass in a (partial) schema
    # to ensure the correct BigQuery data type.
    job_config = bigquery.LoadJobConfig(schema=[
        bigquery.SchemaField("my_string", "STRING"),
    ])

    job = client.load_table_from_dataframe(
        df, table_id, job_config=job_config
    )

    # Wait for the load job to complete.
    job.result()
    # [END bigquery_migration_client_library_upload_from_dataframe]
    client = bigquery.Client()
    table = client.get_table(table_id)
    assert table.num_rows == 3
Example #29
def load_gcs_to_bq(log_uri):
    dataset_id = get_env_var('DATASET_ID')
    target_table_name = get_env_var('TABLE_NAME')
    client = bigquery.Client()

    temp_table_name = table_name_generator()
    target_table_id = '{}.{}'.format(dataset_id, target_table_name)
    insert_sql = insert_table_query_generator(temp_table_name, target_table_id)

    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        log_uri,
    ]
    external_config.schema = [
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
    ]
    job_config = bigquery.QueryJobConfig(
        table_definitions={temp_table_name: external_config})
    query_job = client.query(insert_sql, job_config=job_config)
    if len(list(query_job)) == 0:
        print("Load {} to {} completed!".format(log_uri, target_table_id))
Example #30
    def AddTable(self):

        try:

            schema_bq = []
            for field, datatype in self.settings['schema'][
                    self.settings['table']].items():
                schema_bq.append(
                    bigquery.SchemaField(field, datatype[0], mode=datatype[1]))

            table_ref = self.client.dataset(self.settings['dataset']).table(
                self.settings['table'])
            table = bigquery.Table(table_ref, schema=schema_bq)
            table = self.client.create_table(table)  # API request

            assert table.table_id == self.settings['table']
            logger.info("Table has been created {}".format(
                self.settings['table']))

        except Exception as e:
            logger.info("Failed adding table {}".format(e))