Example #1
    def write_sentences_to_bq(self):

        dataset_ref = self.client.dataset(dataset_id)
        job_config = bigquery.LoadJobConfig()
        job_config.autodetect = True
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        uri = "gs://dataproc-7e10897a-5391-4ea0-b815-f6e72cf284f7-asia-east1/data/contents/sentences/part-00000-a04d02d9-794e-4546-a9af-38dbb086452f-c000.json"

        try:
            load_job = self.client.load_table_from_uri(
                uri, dataset_ref.table("sentences"),
                job_config=job_config)  # API request
            load_job.result()  # Waits for table load to complete.
            print("Job finished.")
        except Exception as e:
            print(e)
Example #2
    def write_contents_to_bq(self):

        dataset_ref = self.client.dataset(dataset_id)
        job_config = bigquery.LoadJobConfig()
        job_config.autodetect = True
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        uri = "gs://dataproc-7e10897a-5391-4ea0-b815-f6e72cf284f7-asia-east1/data/contents/data_modified.json/part-00000-3873cd9e-dd6c-4e9f-a94c-6362a265d946-c000.json"

        try:
            load_job = self.client.load_table_from_uri(
                uri, dataset_ref.table("contents"),
                job_config=job_config)  # API request
            load_job.result()  # Waits for table load to complete.
            print("Job finished.")
        except Exception as e:
            print(e)
Example #3
    def truncate(self):
        """
        Truncate the table.
        """
        # BigQuery does not support truncate natively, so we will "load" an empty dataset
        # with write disposition of "truncate"
        table_ref = get_table_ref(self.db.client, self.table)
        bq_table = self.db.client.get_table(table_ref)

        # BigQuery wants the schema when we load the data, so we will grab it from the table
        job_config = bigquery.LoadJobConfig()
        job_config.schema = bq_table.schema

        empty_table = Table([])
        self.db.copy(empty_table,
                     self.table,
                     if_exists='truncate',
                     job_config=job_config)
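
A side note on the workaround above: current BigQuery also supports a native TRUNCATE TABLE statement, so a minimal sketch of the simpler alternative (assuming a plain client and a hypothetical fully qualified table id) is:

from google.cloud import bigquery

client = bigquery.Client()
# TRUNCATE TABLE keeps the table and its schema but removes all rows.
client.query("TRUNCATE TABLE `my-project.my_dataset.my_table`").result()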
Example #4
def load_to_bigquery(filename, dataset_id, table_id):
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(source_file,
                                          table_ref,
                                          job_config=job_config)

    job.result()  # Waits for table load to complete.

    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id,
                                              table_id))
Example #5
def save_run_results_in_bq(project_id, dbt_project_name, run_results_path):
    """
    Load run_results json file in BigQuery. As a first step is checked if the table
    already exists in the schema and if not, it's created.

    To fit BQ schema, the field metadata.env (JSON object) must be serialised
    and results.message converted to string,
    because depending on the task it can be an integer or a string
    """
    table_id = f"{project_id}.{dbt_project_name}.{DBT_RUN_RESULTS_TABLE}"
    client = bigquery.Client(project=project_id)
    check_run_results_table(client, table_id)

    run_results = {}
    if run_results_path.startswith("gs://"):
        storage_client = storage.Client()
        bucket, path = _parse_gcs_url(run_results_path)
        bucket = storage_client.get_bucket(bucket)
        blob = bucket.blob(path)
        run_results = json.loads(blob.download_as_string())
    else:
        with open(run_results_path) as run_results_file:
            run_results = json.load(run_results_file)

    if run_results["metadata"]["env"]:
        run_results["metadata"]["env"] = json.dumps(
            run_results["metadata"]["env"])
    else:
        del run_results["metadata"]["env"]
    for item in run_results["results"]:
        item["message"] = str(item["message"])

    data_as_file = io.StringIO(json.dumps(run_results))
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
    job = client.load_table_from_file(data_as_file,
                                      table_id,
                                      job_config=job_config)
    try:
        result = job.result()  # Waits for table load to complete.
        logging.info("Pushed {} rows into run_results table".format(
            result.output_rows))
    except Exception:
        logging.error(f"Error loading run_results to BigQuery: {job.errors}")
Example #6
    def load_comp_boss(self, client, dataset_id, load_data):

        table_id = "{}.{}".format(dataset_id, "comp_boss")

        schema = [
            bq.SchemaField("component_id", "STRING", mode="NULLABLE"),
            bq.SchemaField("component_type_id", "STRING", mode="NULLABLE"),
            bq.SchemaField("type", "STRING", mode="NULLABLE"),
            bq.SchemaField("connection_type_id", "STRING", mode="NULLABLE"),
            bq.SchemaField("outside_shape", "STRING", mode="NULLABLE"),
            bq.SchemaField("base_type", "STRING", mode="NULLABLE"),
            bq.SchemaField("height_over_tube", "STRING", mode="NULLABLE"),
            bq.SchemaField("bolt_pattern_long", "STRING", mode="NULLABLE"),
            bq.SchemaField("bolt_pattern_wide", "STRING", mode="NULLABLE"),
            bq.SchemaField("groove", "STRING", mode="NULLABLE"),
            bq.SchemaField("base_diameter", "STRING", mode="NULLABLE"),
            bq.SchemaField("shoulder_diameter", "STRING", mode="NULLABLE"),
            bq.SchemaField("unique_feature", "STRING", mode="NULLABLE"),
            bq.SchemaField("orientation", "STRING", mode="NULLABLE"),
            bq.SchemaField("weight", "STRING", mode="NULLABLE")
        ]

        client.delete_table(table=table_id, not_found_ok=True)
        table = bq.Table(table_id, schema=schema)
        client.create_table(table, exists_ok=True)

        if load_data:
            job_config = bq.LoadJobConfig()
            job_config.source_format = bq.SourceFormat.CSV
            job_config.skip_leading_rows = 1
            job_config.autodetect = False
            job_config.schema_update_options = []
            job_config.max_bad_records = 0

            with open(self.data_dir + "comp_boss.csv", "rb") as source_file:
                job = client.load_table_from_file(file_obj=source_file,
                                                  destination=table_id,
                                                  job_config=job_config)
            job.result()
            print("Loaded {} rows into {}:{}.".format(job.output_rows,
                                                      dataset_id, table_id))
Example #7
def persist_JSON(json_dict, dataset_id, table_id):
    """
    Persists provided dictionary as a SINGLE row with nested and repeated columns into BigQuery.
    If unfamiliar with nested and repeated columns, refer to https://cloud.google.com/bigquery/docs/nested-repeated.

    :param json_dict: a SINGLE object definition to be persisted.
    :param dataset_id: The Id of an EXISTING BigQuery dataset.
    :param table_id: The Id of the BigQuery table where the JSON is to be persisted.
    If table doesn't exist, it will be created.

    :return: None
    """

    from google.cloud import bigquery
    import os

    _tempFile = "tmp.json"

    save_new_line_delimited_JSON([json_dict], _tempFile)

    client = bigquery.Client()

    dataset_ref = client.dataset(dataset_id)
    dataset_location = client.get_dataset(dataset_ref).location

    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.autodetect = True

    with open(_tempFile, 'rb') as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location=dataset_location,  # Must match the destination dataset location.
            job_config=job_config)  # API request

    job.result()  # Waits for table load to complete.

    print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_id,
                                              table_id))

    os.remove(_tempFile)  # Delete temp json file
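
Hypothetical usage of persist_JSON above: a dict with a repeated nested field, which schema autodetection maps to a REPEATED RECORD column (the dataset and table ids below are placeholders):

row = {
    "customer": "alice",
    "orders": [  # a repeated nested field -> REPEATED RECORD column
        {"sku": "A-1", "qty": 2},
        {"sku": "B-7", "qty": 1},
    ],
}
persist_JSON(row, dataset_id="my_dataset", table_id="customer_orders")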
Example #8
def load_data():
    client = bigquery.Client()
    dataset_id = 'landing'
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    # Allow new schema/field additions on append. Optional in general, but required in this case.
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
    ]
    job_config.schema = [
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("subreddit", "STRING"),
        bigquery.SchemaField("score", "Integer"),
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("url", "STRING"),
        bigquery.SchemaField("comms_num", "Integer"),
        bigquery.SchemaField("created", "DATE"),
    ]
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://dubai_source_api/source-api"

    # API request
    try:
        load_job = client.load_table_from_uri(uri,
                                              dataset_ref.table("dubai_posts"),
                                              job_config=job_config)
    except Exception as e:
        print('Failed to load data due to an unexpected error.')
        print('System abort!')
        print(str(e))
        sys.exit()
    else:
        print("Starting job {}".format(load_job.job_id))

    # Waits for table load to complete.
    load_job.result()
    print("Job finished.")

    destination_table = client.get_table(dataset_ref.table("dubai_posts"))
    table = "landing.dubai_posts"
    print("Loaded {} rows into {}.".format(destination_table.num_rows, table))
Example #9
def bq_load_csv(url):
    import logging
    from google.cloud import bigquery

    logging.info('***In bq_load_csv *** {}'.format(MESSAGE_FROM_PUBSUB))

    # Build the per-product table schemas.
    schema_input = bq_schema()

    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset('test')
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1

    if 'ABI-L1b' in MESSAGE_FROM_PUBSUB['name']: 
      table_ref = dataset_ref.table('abi_l1b_radiance')
      table = bigquery.Table(table_ref,schema=schema_input['abi_l1b_radiance'])

    elif 'ABI-L2-CMIP' in MESSAGE_FROM_PUBSUB['name']:
      table_ref = dataset_ref.table('abi_l2_cmip')
      table = bigquery.Table(table_ref,schema=schema_input['abi_l2_cmip'])

    elif 'ABI-L2-MCMIP' in MESSAGE_FROM_PUBSUB['name']:
      table_ref = dataset_ref.table('abi_l2_mcmip')
      table = bigquery.Table(table_ref,schema=schema_input['abi_l2_mcmip'])

    elif 'GLM-L2-LCFA' in MESSAGE_FROM_PUBSUB['name']:
      table_ref = dataset_ref.table('glm_l2_lcfa')
      table = bigquery.Table(table_ref,schema=schema_input['glm_l2_lcfa'])

    else:
      # Fail fast instead of hitting a NameError on table_ref below.
      raise ValueError('Unrecognized product in {}'.format(MESSAGE_FROM_PUBSUB['name']))

    if not if_table_exists(bigquery_client, table_ref): 
        table =  bigquery_client.create_table(table)

    load_job = bigquery_client.load_table_from_uri(
        url,
        table_ref,
        job_config=job_config)

    assert load_job.job_type == 'load'
    load_job.result()  # Waits for table load to complete.
    assert load_job.state == 'DONE'

    logging.info('**** Bigquery job completed ****')
Example #10
def prep_bq_fs_and_fv(
    bq_source_type: str, ) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = bigquery.Client()
    gcp_project = client.project
    bigquery_dataset = "test_ingestion"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (1000 * 60 * 60 * 24 * 14
                                           )  # 2 weeks in milliseconds
    client.update_dataset(dataset, ["default_table_expiration_ms"])

    df = create_dataset()

    job_config = bigquery.LoadJobConfig()
    table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time())}"
    query = f"SELECT * FROM `{table_ref}`"
    job = client.load_table_from_dataframe(df,
                                           table_ref,
                                           job_config=job_config)
    job.result()

    bigquery_source = BigQuerySource(
        table_ref=table_ref if bq_source_type == "table" else None,
        query=query if bq_source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = get_feature_view(bigquery_source)
    with tempfile.TemporaryDirectory() as repo_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{uuid.uuid4()}",
            provider="gcp",
        )
        fs = FeatureStore(config=config)
        fs.apply([fv])

        yield fs, fv
Example #11
def load_table(client, table_id):
    """Load local CSV file to an external BigQuery table.

    Args:
        client (google.cloud.bigquery.client.Client): BigQuery client object.
        table_id (str): Desired table id in BigQuery.

    Returns:
        None

    """
    path_absolute_output = os.path.join(os.environ['RESULTS_DATA_LOCAL'], '{}.csv'.format(os.environ['SCRIPT']))

    # Set the LoadJobConfig for creating the table.
    job_config = bigquery.LoadJobConfig()
    job_config.skip_leading_rows = 1
    job_config.create_disposition = 'CREATE_IF_NEEDED'

    # Define the schema.
    with open(path_absolute_output) as f:
        headers = f.readline().replace('"', '').replace('\n', '').split(',')
    schema = []
    for i in headers:
        if i in ['province']:
            schema.append(bigquery.SchemaField(i, 'STRING'))
        elif i == 'ts_load':
            schema.append(bigquery.SchemaField(i, 'TIMESTAMP'))
        else:
            schema.append(bigquery.SchemaField(i, 'INTEGER'))
    job_config.schema = schema

    # Load the table.
    with open(path_absolute_output, 'rb') as source_file:
        load_job = client.load_table_from_file(
            source_file,
            table_id,
            job_config=job_config
        )

    # Waits for table load to complete and raises errors, like BadRequest if
    # file in combination with schema has errors.
    result_load_job = load_job.result()
    if result_load_job is not None and result_load_job.error_result is not None:
        raise Exception('Load job result errors: {}.'.format(
                result_load_job.error_result))
Example #12
def load_tables(bq: bigquery.Client, dataset: bigquery.Dataset,
                tables: Iterable[Table]):
    """Load tables for a test."""
    for table in tables:
        destination = dataset.table(table.name)
        job_config = bigquery.LoadJobConfig(
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )

        if table.schema is None:
            # autodetect schema if not provided
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # look for time_partitioning_field in provided schema
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name)
                    break  # stop because there can only be one time partitioning field

        if isinstance(table.source_path, str):
            with open(table.source_path, "rb") as file_obj:
                job = bq.load_table_from_file(file_obj,
                                              destination,
                                              job_config=job_config)
        else:
            file_obj = BytesIO()
            for row in load(*table.source_path):
                file_obj.write(
                    json.dumps(row, default=default_encoding).encode() + b"\n")
            file_obj.seek(0)
            job = bq.load_table_from_file(file_obj,
                                          destination,
                                          job_config=job_config)

        try:
            job.result()
        except BadRequest:
            # print the first 5 errors for debugging
            for row in job.errors[:5]:
                print(row)
            raise
Example #13
def main():
    args = parser.parse_args()
    client = bigquery.Client(args.project)

    experiments = get_experiments()

    destination_table = (
        f"{args.project}.{args.destination_dataset}.{args.destination_table}")

    bq_schema = (
        bigquery.SchemaField("experimenter_slug", "STRING"),
        bigquery.SchemaField("normandy_slug", "STRING"),
        bigquery.SchemaField("type", "STRING"),
        bigquery.SchemaField("status", "STRING"),
        bigquery.SchemaField("start_date", "DATE"),
        bigquery.SchemaField("end_date", "DATE"),
        bigquery.SchemaField("proposed_enrollment", "INTEGER"),
        bigquery.SchemaField("reference_branch", "STRING"),
        bigquery.SchemaField("is_high_population", "BOOL"),
        bigquery.SchemaField(
            "branches",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("slug", "STRING"),
                bigquery.SchemaField("ratio", "INTEGER"),
            ],
        ),
    )

    job_config = bigquery.LoadJobConfig(destination=destination_table,
                                        write_disposition="WRITE_TRUNCATE")
    job_config.schema = bq_schema

    converter = cattr.Converter()
    converter.register_unstructure_hook(
        datetime.datetime, lambda d: d.strftime("%Y-%m-%d"))

    client.load_table_from_json(converter.unstructure(experiments),
                                destination_table,
                                job_config=job_config).result()
    print(f"Loaded {len(experiments)} experiments")
Example #14
    def cloud_storage_to_table(self, bucket_name, filename,
                               dataset_id, table_id, job_config=None,
                               import_format="csv", location="US", **kwargs):
        """Extract table from GoogleStorage and send to BigQuery"""
        self.create_table(dataset_id, table_id)

        dataset_ref = self._client.dataset(dataset_id)
        table_ref = dataset_ref.table(table_id)

        job_config = job_config if job_config else bigquery.LoadJobConfig()
        job_config.source_format = self.FILE_FORMATS.get(import_format)

        return self._client.load_table_from_uri(
            "gs://{}/{}".format(bucket_name, filename),
            table_ref,
            job_config=job_config,
            location=location,
            **kwargs
        ).result()
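
The FILE_FORMATS lookup used above is not shown in this snippet; a plausible sketch, assuming it simply maps short format names to BigQuery source-format constants, would be:

    # Hypothetical FILE_FORMATS class attribute assumed by cloud_storage_to_table.
    FILE_FORMATS = {
        "csv": bigquery.SourceFormat.CSV,
        "json": bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        "avro": bigquery.SourceFormat.AVRO,
        "parquet": bigquery.SourceFormat.PARQUET,
    }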
Example #15
  def load_data(self, gcs_file_path):
    """Loads sample data to sample table.

    Args:
      gcs_file_path: Cloud storage path for CSV.
    """
    destination_table = self._client.get_table(self._table_ref)
    if destination_table.num_rows < 1:
      job_config = bigquery.LoadJobConfig()
      job_config.autodetect = True
      job_config.skip_leading_rows = 1
      job_config.source_format = bigquery.SourceFormat.CSV
      load_job = self._client.load_table_from_uri(
          gcs_file_path, self._table_ref, job_config=job_config)
      load_job.result()  # Waits for table load to complete.
      logger.info('Finished loading data. Job id: "%s"', load_job.job_id)
    else:
      logger.info('Required data already exists in "%s".',
                  destination_table.table_id)
Example #16
def loaddatafromfiletotable(bqclient, newdatasetname, newtablename):

    dataset_ref = newdatasetname
    table = newtablename.table_id
    table_ref = "{}.{}".format(dataset_ref, table)

    jobconfig = bigquery.LoadJobConfig()  # Initialize job configuration.
    jobconfig.source_format = 'CSV'
    jobconfig.skip_leading_rows = 1
    jobconfig.write_disposition = "WRITE_TRUNCATE"
    job = bqclient.load_table_from_uri(loadFilePath,
                                       table_ref,
                                       job_config=jobconfig)
    result = job.result()  # wait for job to complete

    print(" job-type = {}".format(job.job_type))
    print("Load job status")
    print(result.state)
    print("Load job statistics")
Example #17
def load_data_from_gcs(dataset_id, table_id, source):
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = False
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_APPEND'  # WRITE_EMPTY, WRITE_APPEND, WRITE_TRUNCATE

    job = bigquery_client.load_table_from_uri(source,
                                              table_ref,
                                              job_config=job_config)

    job.result()  # Waits for job to complete

    print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_id,
                                              table_id))
Example #18
    def _upload_2_bigquery(self, file_path, table_id):
        job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.CSV,
            skip_leading_rows=1,
            autodetect=True,
            allow_quoted_newlines=True,
            write_disposition="WRITE_TRUNCATE",
        )
        with open(file_path, "rb") as source_file:
            job = self.client.load_table_from_file(source_file,
                                                   table_id,
                                                   job_config=job_config)

        job.result()  # Waits for the job to complete.

        table = self.client.get_table(table_id)  # Make an API request.
        print(
            f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}"
        )
Example #19
def update_bq_table(uri, fn, table_ref, table_schema):
  '''Saves data from a GCS bucket to a BigQuery table'''

  job_config = bigquery.LoadJobConfig()
  job_config.write_disposition = "WRITE_APPEND"
  job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
  
  #job_config.autodetect = True
  job_config.autodetect = False
  job_config.schema = table_schema
  
  orig_rows =  bq_client.get_table(table_ref).num_rows

  load_job = bq_client.load_table_from_uri(uri + fn, table_ref, job_config=job_config)  # API request
  print("Starting job {}".format(load_job.job_id))

  load_job.result()  # Waits for table load to complete.
  destination_table = bq_client.get_table(table_ref)
  print('Loaded {} rows into {}:{}.'.format(destination_table.num_rows-orig_rows, 'sumo', table_ref.table_id))
Example #20
        def load_task():
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(dags_folder, 'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = bigquery.SourceFormat.CSV if file_format == 'csv' else bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri, task=task, file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
Example #21
def upload_csv(table_id):
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
    )
    job_config.write_disposition = 'WRITE_APPEND'
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
        bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
    ]
    with open("interview.csv", "rb") as source_file:
        job = client.load_table_from_file(source_file,
                                          table_id,
                                          job_config=job_config)
    job.result()  # Waits for the job to complete.
    table = client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows and {} columns to {}".format(table.num_rows,
                                                       len(table.schema),
                                                       table_id))
Example #22
    def job_config(self):
        job_config = bigquery.LoadJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.source_format = bigquery.job.SourceFormat.NEWLINE_DELIMITED_JSON
        job_config.autodetect = True

        if self._write_disposition == "truncate":
            job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
        elif self._write_disposition == "append":
            job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_APPEND
        else:
            raise Exception("Unknown BigQuery write disposition")

        if self._partition_column:
            job_config.time_partitioning = bigquery.table.TimePartitioning(
                bigquery.table.TimePartitioningType.DAY, self._partition_column
            )

        return job_config
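
For reference, a standalone sketch of the configuration this method produces for the "truncate" case with a partition column (the column name event_date is an assumption, not part of the original code):

from google.cloud import bigquery

job_config = bigquery.LoadJobConfig(
    create_disposition=bigquery.job.CreateDisposition.CREATE_IF_NEEDED,
    source_format=bigquery.job.SourceFormat.NEWLINE_DELIMITED_JSON,
    autodetect=True,
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE,
    time_partitioning=bigquery.table.TimePartitioning(
        bigquery.table.TimePartitioningType.DAY, "event_date"),
)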
Example #23
def upload_to_bq(gs_path, table_name, source_format='csv'):
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    if source_format == 'csv':
        job_config.source_format = bigquery.SourceFormat.CSV
        job_config.skip_leading_rows = 1
    elif source_format == 'json':
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = 'WRITE_TRUNCATE'
    dataset_ref = client.dataset('covid19')
    load_job = client.load_table_from_uri(
        gs_path, dataset_ref.table(table_name),
        job_config=job_config)
    try:
        load_job.result()
    except google.api_core.exceptions.BadRequest:
        raise ValueError(load_job.errors)
Example #24
def runme():

    dataset_id = 'IanTest'
    table_id = 'ia'
    
    path1 = os.path.dirname(os.path.realpath(__file__))
    parentPath = os.path.dirname(path1)
    #type = sys.argv[1]
    #file = os.path.join(parentPath,"store",type + ".pkl")
    filename = os.path.join(parentPath,"store","adviser.csv")
    creds = os.path.join(parentPath,"creds","backlogger_bq.json")
    client = bigquery.Client.from_service_account_json(creds)

    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = "WRITE_TRUNCATE"
    job_config.create_disposition = "CREATE_IF_NEEDED"

    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            #location="EU",  # Must match the destination dataset location.
            job_config=job_config
        )  # API request


    try:
        job.result()  # Waits for table load to complete.
    except Exception:
        for er in job.errors:
            print(er)

    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))
Example #25
    def load_task(**context):
        dags_folder = os.environ.get("DAGS_FOLDER", "/home/airflow/gcs/dags")
        schema_path = os.path.join(
            dags_folder,
            "resources/stages/load/schemas/{schema}.json".format(
                schema="annual_npp"),
        )
        client = bigquery.Client()
        job_config = bigquery.LoadJobConfig()
        job_config.schema = read_bigquery_schema_from_file(schema_path)
        job_config.source_format = bigquery.SourceFormat.PARQUET
        job_config.write_disposition = "WRITE_TRUNCATE"
        job_config.ignore_unknown_values = True
        job_config.clustering_fields = [
            "geography",
            "geography_polygon",
        ]
        job_config.range_partitioning = RangePartitioning(
            field="year",
            range_=PartitionRange(start=1900, end=2100, interval=1),
        )
        execution_date = context["execution_date"]
        load_table_name = "{table}${partition}".format(
            table=destination_table_name,
            partition=execution_date.strftime("%Y"))
        table_ref = create_dataset(
            client,
            destination_dataset_name,
            project=destination_dataset_project_id,
        ).table(load_table_name)

        load_uri = "gs://{bucket}/{prefix}/annual_npp/parquet/{date}.parquet".format(
            bucket=output_bucket,
            prefix=output_path_prefix,
            date=execution_date.strftime("%Y_%m_%d"),
        )
        load_job = client.load_table_from_uri(
            load_uri,
            table_ref,
            job_config=job_config,
        )
        submit_bigquery_job(load_job, job_config)
        assert load_job.state == "DONE"
Example #26
def load_csv(uri, table_id, schema):
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, drop and recreate it
    )

    load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)

    try:
        load_job.result()
    except BadRequest:
        for error in load_job.errors:
            print('ERROR: {}'.format(error['message']))

    table = client.get_table(table_id)
    print("Loaded {} rows to table {}".format(table.num_rows, table_id))
Example #27
def load_data_to_bigquery(project):
    dataset = 'fashion_mnist'
    train = 'train'
    test = 'test'

    # Create dataset, train and test tables.
    client = bigquery.Client(project=project)
    client.create_dataset(dataset)
    train_table = client.create_table(f'{project}.{dataset}.{train}')
    test_table = client.create_table(f'{project}.{dataset}.{test}')

    # Set job configuration.
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.field_delimiter = "|"
    job_config.schema = [
        bigquery.schema.SchemaField("image_raw", "STRING"),
        bigquery.schema.SchemaField("label", "INTEGER"),
    ]
    # The schema is specified explicitly above, so autodetect is not needed.

    # Load train data to BigQuery.
    with open(_train_file, "rb") as input:
        train_load = client.load_table_from_file(input,
                                                 train_table,
                                                 job_config=job_config)

    # Load test data to BigQuery.
    with open(_test_file, "rb") as input:
        test_load = client.load_table_from_file(input,
                                                test_table,
                                                job_config=job_config)

    # The loading is an async operation. Wait for it to finish.
    train_load.result()
    test_load.result()

    print(
        f'Loaded {train_load.output_rows} rows from {_train_file} into {project}.{dataset}.{train}'
    )
    print(
        f'Loaded {test_load.output_rows} rows from {_test_file} into {project}.{dataset}.{test}'
    )
Example #28
def load_functional_alltypes_parted_data(
        request, bqclient, create_functional_alltypes_parted_table):
    if request.config.getoption("--no-refresh-dataset"):
        return

    table = create_functional_alltypes_parted_table
    load_config = bigquery.LoadJobConfig()
    load_config.write_disposition = "WRITE_TRUNCATE"
    load_config.skip_leading_rows = 1  # skip the header row.
    filepath = download_file(
        "{}/functional_alltypes.csv".format(TESTING_DATA_URI))
    with open(filepath.name, "rb") as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            table,
            job_config=load_config,
        ).result()
    if job.error_result:
        print("error")
Example #29
def cloudsql_to_bigquery(request):
    request_obj = request.get_json(silent=True)
    job_done = False
    if request_obj and request_obj['message'] == os.getenv('TASK_TO_EXECUTE'):
        # Connect to CloudSql
        connection = pymysql.connect(
            unix_socket=os.getenv('CONNECTION_STRING'),
            user=os.getenv('USERNAME'),
            password=os.getenv('PASSWORD'),
            db=os.getenv('DATABASE'),
            cursorclass=pymysql.cursors.DictCursor)
        print('Connection:', connection)
            
        # Query to read data from CloudSql table
        sql_query = 'SELECT * FROM <sql_table_name>'
        
        # Create dataframe reading table data
        dataframe = pb.read_sql(sql_query, connection, index_col='<col_name>Ex:id')
        print(dataframe.head(5))
        
        # Connect to BigQuery client
        bq_client = bigquery.Client()
        bq_dataset = bq_client.dataset('<bq_dataset_name>')
        bq_table_name = bq_dataset.table('<bq_table_name>')
        
        # Create job configurations
        bq_job_config = bigquery.LoadJobConfig()
        bq_job_config.autodetect = True
        bq_job_config.write_disposition = 'WRITE_TRUNCATE' # WRITE_APPEND, WRITE_EMPTY, WRITE_DISPOSITION_UNSPECIFIED
        
        # Load data into BigQuery table
        write_data = bq_client.load_table_from_dataframe(dataframe, bq_table_name, job_config=bq_job_config)
        write_data.result() # Wait for the job to finish
        
        # Print write job task id
        print('Running task {}'.format(write_data))
        job_done = True
        
    # Return a valid response
    if job_done is True:
        return ('Success', 200)
    else:
        return ('Failed', 500)
Example #30
def gcs_to_gbq():
    # Connect to Google Cloud Storage and set bucket
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(config.gcp['bucket_name'])
    blobs = bucket.list_blobs()
    client = bigquery.Client()
    # Connect to Google BigQuery and define BigQuery options
    dataset_ref = client.dataset(config.gcp['dataset_id'])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.allow_quoted_newlines = True
    job_config.allow_jagged_rows = True
    job_config.autodetect = False
    job_config.skip_leading_rows = 1
    job_config.field_delimiter = "|"
    job_config.source_format = bigquery.SourceFormat.CSV
    # Iterate though all files of defined Google Cloud Storage bucket
    for blob in blobs:
        # Set dynamic URL for current Cloud Storage file and BigQuery schema file
        uri = 'gs://' + config.gcp['bucket_name'] + '/' + blob.name
        file_name_json = blob.name.replace(".csv", ".json")
        file_name_json_path = os.path.join(config.general['csv_cache_folder'],
                                           file_name_json)
        # Load the JSON schema file and set the schema fields for BigQuery
        with open(file_name_json_path) as input_json:
            input_json_config = json.load(input_json)
        job_config.schema = [
            bigquery.SchemaField(item["name"], item["type"])
            for item in input_json_config["fields"]
        ]
        # Set dynamic table name for Google BigQuery
        table_name = blob.name.replace(".csv", "")
        # Create new big query table / replace existing
        load_job = client.load_table_from_uri(uri,
                                              dataset_ref.table(table_name),
                                              job_config=job_config)
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
    # Send slack message when load job done
    send_message_to_slack(
        ':bigquery: Backend Data Sync: Update DWH Data OK :thumbs-up-green:',
        config.slack_hooks['slack_channel_1'])
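
For reference, the per-table schema JSON that the loop above reads could look like the following (the file name and fields are hypothetical); json.load turns it into a dict of this shape:

# Hypothetical contents of e.g. orders.json, the schema file paired with orders.csv:
input_json_config = {
    "fields": [
        {"name": "order_id", "type": "INTEGER"},
        {"name": "customer", "type": "STRING"},
        {"name": "order_ts", "type": "TIMESTAMP"},
    ]
}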