Example no. 1
def insertCSV(stock):

    client = bigquery.Client(project_id)

    SCHEMA = [
        SchemaField('symbol', 'STRING', mode='required'),
        SchemaField('date', 'DATE', mode='required'),
        SchemaField('close', 'FLOAT', mode='required'),
        SchemaField('high', 'FLOAT', mode='required'),
        SchemaField('low', 'FLOAT', mode='required'),
        SchemaField('open', 'FLOAT', mode='required'),
        SchemaField('volume', 'INTEGER', mode='required'),
    ]

    table_ref = client.dataset(dataset_id).table(stock)

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = SCHEMA

    with open('Data/%s.csv' % stock, 'rb') as readable:
        # Upload the CSV in binary mode; BigQuery parses it server-side.
        job = client.load_table_from_file(readable,
                                          table_ref,
                                          job_config=load_config)
    job.result()  # wait for the load job to finish
Example no. 2
    def process_response_rows_for_bigquery(self, rows: list,
                                           table_reference: TableReference):
        rows_dataframe = DataFrame.from_records(rows)

        rows_dataframe = concat(
            [rows_dataframe, rows_dataframe['dimensions'].apply(Series)],
            axis=1,
            join='inner')
        rows_dataframe = rows_dataframe.drop(['dimensions'], axis=1)
        rows_dataframe['date'] = rows_dataframe['date'].apply(
            lambda x: x.date())

        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column)
            for column in list(rows_dataframe.columns.values)
        ]

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                rows_dataframe, table_reference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example no. 3
def load_from_file(file_path, table):
    table_ref = dataset.table(table)
    load_config = LoadJobConfig()
    load_config.autodetect = True

    with open(file_path, 'rb') as readable:
        result = client.load_table_from_file(readable, table_ref, job_config=load_config)

    # return result
    return table_ref.dataset_id + '.' + table_ref.table_id
Example no. 4
    def create_table_from_csv(self, dataset, table_name, file_path, schema):
        table_ref = dataset.table(table_name)

        load_config = LoadJobConfig()
        load_config.skip_leading_rows = 1
        load_config.schema = schema

        with open(file_path, 'rb') as readable:
            self.client.load_table_from_file(
                readable, table_ref, job_config=load_config)  # API request

        return
Example no. 5
    def _load_to_bq(self, client, dataset, table_name, table_schema,
                    table_config, key_props, metadata_columns, truncate, rows):
        logger = self.logger
        partition_field = table_config.get("partition_field", None)
        cluster_fields = table_config.get("cluster_fields", None)
        force_fields = table_config.get("force_fields", {})

        schema = build_schema(table_schema,
                              key_properties=key_props,
                              add_metadata=metadata_columns,
                              force_fields=force_fields)
        load_config = LoadJobConfig()
        load_config.ignore_unknown_values = True
        load_config.schema = schema
        if partition_field:
            load_config.time_partitioning = bigquery.table.TimePartitioning(
                type_=bigquery.table.TimePartitioningType.DAY,
                field=partition_field)

        if cluster_fields:
            load_config.clustering_fields = cluster_fields

        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        else:
            logger.info(f"Appending to {table_name}")
            load_config.write_disposition = WriteDisposition.WRITE_APPEND

        logger.info("loading {} to BigQuery".format(table_name))

        load_job = None
        try:
            load_job = client.load_table_from_file(rows,
                                                   dataset.table(table_name),
                                                   job_config=load_config,
                                                   rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            job = load_job.result()
            logger.info(job._properties)

            return job

        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table_name, str(err)))
            if load_job and load_job.errors:
                reason = err.errors[0]["reason"]
                messages = [f"{err['message']}" for err in load_job.errors]
                logger.error("reason: {reason}, errors:\n{e}".format(
                    reason=reason, e="\n".join(messages)))
                err.message = f"reason: {reason}, errors: {';'.join(messages)}"

            raise err
Example no. 6
        def load_task():
            client = Client()
            job_config = LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = SourceFormat.CSV if file_format == 'csv' else SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri,
                task=task,
                file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
Example no. 7
    def _process_data_for_bigquery(self, data: DataFrame,
                                   output_tablereference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                data, output_tablereference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example no. 8
def upload_prevalence_data(client, tmp_f):
    client.load_table_from_file(
        tmp_f,
        "qof.prevalence",
        rewind=True,
        job_config=LoadJobConfig(schema=PREVALENCE_SCHEMA),
    )
Example no. 9
def upload_achievement_data(client, tmp_f):
    client.load_table_from_file(
        tmp_f,
        "qof.achievement",
        rewind=True,
        job_config=LoadJobConfig(schema=ACHIEVEMENT_SCHEMA),
    )
Example no. 10
def gcs_csv_to_table(full_table_id: str, remote_csv_path: str) -> Table:
    """
    Insert CSV from Google Storage to BigQuery Table.

    :param full_table_id: Full ID of a Google BigQuery table.
    :type full_table_id: str
    :param remote_csv_path: Path to uploaded CSV.
    :type remote_csv_path: str
    :returns: Table
    """
    try:
        gcs_csv_uri = f"gs://{GCP_BUCKET_NAME}/{remote_csv_path}"
        job_config = LoadJobConfig(
            autodetect=True,
            skip_leading_rows=1,
            source_format=SourceFormat.CSV,
        )
        load_job = gbq.load_table_from_uri(gcs_csv_uri,
                                           full_table_id,
                                           job_config=job_config)
        LOGGER.info(f"Starting job {load_job.job_id}.")
        LOGGER.info(load_job.result())  # Waits for table load to complete.
        return gbq.get_table(full_table_id)
    except BadRequest as e:
        LOGGER.error(
            f"Invalid GCP request when creating table `{full_table_id}`: {e}")
    except Exception as e:
        LOGGER.error(
            f"Unexpected error when creating table `{full_table_id}`: {e}")
Example no. 11
def store_participant_data(df, project_id, destination_table, schema=None):
    """
    Stores the fetched participant data in a BigQuery dataset. If the
    table doesn't exist, it will create that table. If the table does
    exist, it will append the data onto that designated table.

    :param df: pandas dataframe created to hold participant data fetched from ParticipantSummary API
    :param project_id: identifies the project
    :param destination_table: name of the table to be written in the form of dataset.tablename
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of participant data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            f'Please specify the project in which to create the tables')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(destination_table.split('.')[-1])

    # Dataframe data fields must be of type datetime
    df = set_dataframe_date_fields(df, schema)

    load_job_config = LoadJobConfig(schema=schema)
    job = client.load_table_from_dataframe(df,
                                           destination_table,
                                           job_config=load_job_config)
    job.result()

    return job.job_id
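A usage sketch for the function above, assuming the destination dataset already exists; the schema is passed explicitly here (so get_table_schema is not consulted) and the column names are made up for illustration:

import pandas as pd
from google.cloud.bigquery import SchemaField

df = pd.DataFrame({
    'person_id': [1, 2],  # hypothetical columns matching the schema below
    'sign_up_time': ['2022-01-01T00:00:00Z', '2022-01-02T00:00:00Z'],
})
schema = [
    SchemaField('person_id', 'INTEGER'),
    SchemaField('sign_up_time', 'TIMESTAMP'),
]
job_id = store_participant_data(df,
                                'my-project',
                                'my_dataset.participant_summary',
                                schema=schema)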
Example no. 12
def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json('my-beam-project-b2834963a4ae.json')

    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)

    SCHEMA = [
        SchemaField('Tweets', 'STRING', mode='Nullable'),

    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200


    with open('tweets.csv', 'rb') as readable:
        job = big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    job.result()  # wait for the load to complete before reporting success
    print('tweets file uploaded to big query')
Example no. 13
def loadjob_one(client, dataset_ref, table_name):
    data = [{'keyword': 'dummy-{}'.format(str(time.time())),
             'partition_value': '2019-12-24'}]
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)
    job_config = LoadJobConfig()
    job_config.schema = TABLE_SCHEMA
    result_obj = client.load_table_from_json(data, table_obj, job_config=job_config)

    sleep_time = 1
    while result_obj.done() is False:
        LOGGER.info('waiting for %s second. data insertion.', sleep_time)
        time.sleep(sleep_time)

    if result_obj.errors:
        error_msg = 'Failed to insert: error_msg=%s' % result_obj.errors
        LOGGER.error(error_msg)
        raise FailedInsertingSerpCacheBigQueryException(error_msg)
Example no. 14
    def create_table(self, path, table_from='uri'):
        bp = BQParser(path)
        dataset_name = bp.dataset_name
        table_name = bp.table_name
        skip_leading_rows = bp.skip_leading_rows
        schema = bp.schema

        table_ref = self.client.dataset(dataset_name).table(table_name)
        load_config = LoadJobConfig()
        load_config.skip_leading_rows = skip_leading_rows
        load_config.schema = schema
        file_source = bp.properties.get('inputPath')

        if table_from == 'uri':
            self.client.load_table_from_uri(source_uris=file_source,
                                            destination=table_ref,
                                            job_config=load_config)
        else:
            raise ValueError('Not supported')
Example no. 15
    def push_bq():
        for table in rows.keys():

            table_ref = bigquery_client.dataset(dataset_id).table(table)
            SCHEMA = build_schema(schemas[table])
            load_config = LoadJobConfig()
            load_config.schema = SCHEMA
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            if truncate:
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            rows[table].seek(0)
            logger.info('loading {} to Bigquery.\n'.format(table))
            load_job = bigquery_client.load_table_from_file(
                rows[table], table_ref, job_config=load_config)
            logger.info('loading job {}'.format(load_job.job_id))
            logger.info(load_job.result())
            rows[table] = TemporaryFile(mode='w+b')
Example no. 16
    def __create_load_job_config(
            self, ems_load_job_config: EmsLoadJobConfig) -> LoadJobConfig:
        config = LoadJobConfig()
        config.labels = ems_load_job_config.labels
        config.create_disposition = ems_load_job_config.create_disposition.value
        config.write_disposition = ems_load_job_config.write_disposition.value
        config.schema = _parse_schema_resource(ems_load_job_config.schema)
        config.skip_leading_rows = ems_load_job_config.skip_leading_rows
        return config
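For context, _parse_schema_resource is a private helper from google.cloud.bigquery.schema that converts a REST-style schema resource into SchemaField objects, so EmsLoadJobConfig.schema presumably holds that resource form. A sketch of the shape it consumes, under that assumption:

from google.cloud.bigquery.schema import _parse_schema_resource

# A schema resource is a dict with a top-level "fields" list.
schema_resource = {
    "fields": [
        {"name": "symbol", "type": "STRING", "mode": "REQUIRED"},
        {"name": "close", "type": "FLOAT", "mode": "NULLABLE"},
    ]
}
schema = _parse_schema_resource(schema_resource)  # -> [SchemaField('symbol', ...), ...]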
Example no. 17
def get_table_from_load_job_config(config: bigquery.LoadJobConfig):
    """
    The BigQuery python library does not currently expose destinationTable
    as a property in the LoadJobConfig class. Because of this, you have to
    convert the LoadJobConfig object to the API representation and then
    extract the dictionary's value for key: destinationTable.

    :param config: bigquery.LoadJobConfig
    :return: bigquery.TableReference
    """
    print(f"Retrieving table for LoadJobConfig: {config.to_api_repr()}")
    if config.to_api_repr().get('load'):
        config = config.to_api_repr().get('load')
        if config.get('destinationTable'):
            project_id = config.get('destinationTable').get('projectId')
            dataset_id = config.get('destinationTable').get('datasetId')
            table_id = config.get('destinationTable').get('tableId')
            return bigquery.TableReference.from_string(
                f"{project_id}.{dataset_id}.{table_id}")
    return None
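A usage sketch for the helper above, assuming LoadJobConfig.from_api_repr is available in the client library to rebuild a config whose API representation already carries a destinationTable; the IDs are placeholders:

from google.cloud import bigquery

api_repr = {
    "load": {
        "destinationTable": {
            "projectId": "my-project",
            "datasetId": "my_dataset",
            "tableId": "my_table",
        }
    }
}
config = bigquery.LoadJobConfig.from_api_repr(api_repr)
table_ref = get_table_from_load_job_config(config)  # TableReference for my-project.my_dataset.my_table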
Example no. 18
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
Example no. 19
    def load_data(self, dataframe, dataset_id, table_id, chunksize):
        from google.cloud.bigquery import LoadJobConfig
        from six import BytesIO

        destination_table = self.client.dataset(dataset_id).table(table_id)
        job_config = LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.source_format = 'NEWLINE_DELIMITED_JSON'
        rows = []
        remaining_rows = len(dataframe)

        total_rows = remaining_rows
        self._print("\n\n")

        for index, row in dataframe.reset_index(drop=True).iterrows():
            row_json = row.to_json(force_ascii=False,
                                   date_unit='s',
                                   date_format='iso')
            rows.append(row_json)
            remaining_rows -= 1

            if (len(rows) % chunksize == 0) or (remaining_rows == 0):
                self._print("\rLoad is {0}% Complete".format(
                    ((total_rows - remaining_rows) * 100) / total_rows))

                body = '{}\n'.format('\n'.join(rows))
                if isinstance(body, bytes):
                    body = body.decode('utf-8')
                body = body.encode('utf-8')
                body = BytesIO(body)

                try:
                    self.client.load_table_from_file(
                        body, destination_table,
                        job_config=job_config).result()
                except self.http_error as ex:
                    self.process_http_error(ex)

                rows = []

        self._print("\n")
Example no. 20
def loadDataFromCSV(tablename, global_dataset_ref, filename):

    schema = getTableSchema(tablename, global_dataset_ref)

    table_ref = global_dataset_ref.table(tablename)

    load_config = LoadJobConfig()
    load_config.source_format = bigquery.SourceFormat.CSV
    load_config.schema = schema
    load_config.autodetect = True
    load_config.allow_quoted_newlines = True
    load_config.encoding = 'UTF-8'

    try:
        with open(filename, 'rb') as readable:
            job = bigquery_client.load_table_from_file(readable,
                                                       table_ref,
                                                       location='US',
                                                       job_config=load_config)
    except Exception as e:
        print("Error")
        print(e)
        return  # without a job there is nothing to wait for

    job.result()  # wait for the load job to complete

    print('Loaded {} rows into {}:{}.'.format(job.output_rows,
                                              global_dataset_ref,
                                              table_ref.table_id))

    return


# Testing
# if __name__ == '__main__':
# datasetname = 'Testing'
# tablename = 'SOViews'

# sqlquery = '''SELECT CONCAT(
#   'https://stackoverflow.com/questions/',
#   CAST(id as STRING)) as url,
# view_count
# FROM `bigquery-public-data.stackoverflow.posts_questions`
# WHERE tags like '%google-bigquery%'
# ORDER BY view_count DESC
# LIMIT 10'''

#createDataset(datasetname) #Successfully tested this code 2018-09-24
# global_dataset_ref = getDataset(datasetname) #Successfully tested this code 2018-09-24

#createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
# getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24

# runBigQueryQuery(sqlquery) #Successfully tested this code 2018-09-24

#loadDataFromCSV(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
Example no. 21
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields, which do not conform to a dataframe.
    The data is stored using a JSON file object since it is one of the ways BigQuery expects it.
    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            f'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
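A sketch of calling the function above with an explicit schema containing a RECORD field, which is the data shape it exists for; the field names and destination table are assumptions:

from google.cloud.bigquery import SchemaField

schema = [
    SchemaField('person_id', 'INTEGER', mode='REQUIRED'),
    SchemaField('wearable', 'RECORD', mode='REPEATED', fields=[
        SchemaField('status', 'STRING'),
        SchemaField('authored_time', 'TIMESTAMP'),
    ]),
]
json_data = [
    {'person_id': 1,
     'wearable': [{'status': 'YES', 'authored_time': '2022-01-01T00:00:00Z'}]},
]
job_id = store_digital_health_status_data(
    'my-project',
    json_data,
    'my-project.my_dataset.digital_health_sharing_status',
    schema=schema)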
Example no. 22
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name))

    table_blobs = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        table for table in VOCABULARY_TABLES if table not in table_blobs
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}'
        )

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        # ignore any non-vocabulary files
        if table_name not in VOCABULARY_TABLES:
            continue
        destination = dst_dataset.table(table_name)
        safe_schema = safe_schema_for(table_name)
        job_config = LoadJobConfig()
        job_config.schema = safe_schema
        job_config.skip_leading_rows = 1
        job_config.field_delimiter = FIELD_DELIMITER
        job_config.max_bad_records = MAX_BAD_RECORDS
        job_config.source_format = 'CSV'
        job_config.quote_character = ''
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(source_uri,
                                                 destination,
                                                 job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
Example no. 23
    def load_csv_uri(self, table_id, schemas_orig):
        full_table_id = f'{self.DATASET}.{table_id}'

        job_config = LoadJobConfig(schema=schemas_orig[table_id],
                                   source_format=SourceFormat.CSV,
                                   skip_leading_rows=1)

        uri = f'gs://{self.BUCKET}/{table_id}.csv'
        load_job = self.bq_client.load_table_from_uri(
            uri,
            full_table_id,
            job_config=job_config,
            job_id_prefix=f'{table_id}_')
        print(f'job {load_job.job_id} started')
        load_job.add_done_callback(self.load_job_cb)

        return load_job
Example no. 24
    def load_test_data(self, df, project_id, dataset_id, table):
        """
        Add data to the tables for the rule to run on.

        :param df: a dataframe containing data to insert
        :param project_id
        :param dataset_id
        :param table
        """
        client = get_client(project_id)
        schema = get_table_schema(table)
        schema = [field for field in schema if field.name in list(df.columns)]
        load_job_config = LoadJobConfig(schema=schema)
        load_job = client.load_table_from_dataframe(df,
                                                    f'{dataset_id}.{table}',
                                                    job_config=load_job_config)
        load_job.result()
Example no. 25
    def load_json_file(self, table_id, schemas_orig):
        full_table_id = f'{self.DATASET}.{table_id}'

        job_config = LoadJobConfig(
            schema=schemas_orig[table_id],
            source_format=SourceFormat.NEWLINE_DELIMITED_JSON)

        file_path = f'{table_id}.json'
        with open(file_path, "rb") as source_file:
            load_job = self.bq_client.load_table_from_file(
                source_file,
                full_table_id,
                job_config=job_config,
                job_id_prefix=f'{table_id}_')
        print(f'job {load_job.job_id} started')
        load_job.add_done_callback(self.load_job_cb)

        return load_job
Example no. 26
    def __init__(self,
                 *,
                 table,
                 partition: Optional[str] = None,
                 from_: PackageFileLoader,
                 bq_client: Client,
                 load_job_config: LoadJobConfig = LoadJobConfig()):
        """Constructor of BaseDataLoader

        Args:
            table (Table): table to load data into.
            from_ (PackageFileLoader): specifies where data is.
            bq_client (Client): instance of the BigQuery client to use across the DSL.
            partition (Optional[str], optional): if you plan to load into a specific partition. Used as a decorator.
                Defaults to None.
            load_job_config (LoadJobConfig, optional): Big Query load job config.
                This is the object updated by this DSL. Defaults to LoadJobConfig().
        """
        self.load_job_config = load_job_config
        self.table = table
        self.from_ = from_
        self._bq_client = bq_client
        self.partition = partition
Example no. 27
def sync_bigquery_release_files(request):
    bq = request.find_service(name="gcloud.bigquery")
    # Multiple table names can be specified by separating them with whitespace
    table_names = request.registry.settings[
        "warehouse.release_files_table"].split()

    for table_name in table_names:
        table_schema = bq.get_table(table_name).schema

        # Using the schema to populate the data allows us to automatically
        # set the values to their respective fields rather than assigning
        # values individually
        def populate_data_using_schema(file):
            release = file.release
            project = release.project

            row_data = dict()
            for sch in table_schema:
                # The order of data extraction below is determined based on the
                # classes that are most recently updated
                if hasattr(file, sch.name):
                    field_data = getattr(file, sch.name)
                elif hasattr(release, sch.name) and sch.name == "description":
                    field_data = getattr(release, sch.name).raw
                elif sch.name == "description_content_type":
                    field_data = getattr(release, "description").content_type
                elif hasattr(release, sch.name):
                    field_data = getattr(release, sch.name)
                elif hasattr(project, sch.name):
                    field_data = getattr(project, sch.name)
                else:
                    field_data = None

                if isinstance(field_data, datetime.datetime):
                    field_data = field_data.isoformat()

                # Replacing all empty objects with None ensures proper
                # checks of whether a field is nullable or not
                if not isinstance(field_data, bool) and not field_data:
                    field_data = None

                if field_data is None and sch.mode == "REPEATED":
                    row_data[sch.name] = []
                elif field_data and sch.mode == "REPEATED":
                    # Currently, some of the metadata fields such as
                    # the 'platform' tag are incorrectly classified as a
                    # str instead of a list, hence, this workaround to comply
                    # with PEP 345 and the Core Metadata specifications.
                    # This extra check can be removed once
                    # https://github.com/pypa/warehouse/issues/8257 is fixed
                    if isinstance(field_data, str):
                        row_data[sch.name] = [field_data]
                    else:
                        row_data[sch.name] = list(field_data)
                else:
                    row_data[sch.name] = field_data
            return row_data

        for first, second in product("fedcba9876543210", repeat=2):
            db_release_files = (request.db.query(File.md5_digest).filter(
                File.md5_digest.like(f"{first}{second}%")).yield_per(
                    1000).all())
            db_file_digests = [file.md5_digest for file in db_release_files]

            bq_file_digests = bq.query(
                "SELECT md5_digest "
                f"FROM {table_name} "
                f"WHERE md5_digest LIKE '{first}{second}%'").result()
            bq_file_digests = [
                row.get("md5_digest") for row in bq_file_digests
            ]

            md5_diff_list = list(set(db_file_digests) -
                                 set(bq_file_digests))[:1000]
            if not md5_diff_list:
                # There are no files that need to be synced to BigQuery
                continue

            release_files = (request.db.query(File).join(
                Release, Release.id == File.release_id).filter(
                    File.md5_digest.in_(md5_diff_list)).all())

            json_rows = [
                populate_data_using_schema(file) for file in release_files
            ]

            bq.load_table_from_json(
                json_rows,
                table_name,
                job_config=LoadJobConfig(schema=table_schema)).result()
            break
Example no. 28
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    # try:
    #     dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            dat = bytes(json.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)
            # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8'))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            # tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
            # try:
            #     tables[table] = bigquery_client.create_table(tables[table])
            # except exceptions.Conflict:
            #     pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        load_job = bigquery_client.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info("loading job {}".format(load_job.job_id))
        logger.info(load_job.result())

    # for table in errors.keys():
    #     if not errors[table]:
    #         print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path)
    #     else:
    #         print('Errors:', errors[table], sep=" ")

    return state
Example no. 29
def DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef):
    """
    https://cloud.google.com/bigquery/docs/reference/data-transfer/partner/rpc/google.cloud.bigquery.datatransfer.v1#tabledefinition

    TO

    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html#google.cloud.bigquery.job.LoadJob

    :param dts_tabledef:
    :return:
    """
    from bq_dts import rest_client
    job_config = LoadJobConfig()

    dts_schema = RPCRecordSchema_to_GCloudSchema(dts_tabledef['schema'])
    job_config.schema = dts_schema

    # BQ DTS does not provide controls for the following dispositions
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    if 'format' in dts_tabledef:
        dts_format = dts_tabledef['format']
        source_format = rest_client.BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP[dts_format]
        assert source_format is not None
        job_config.source_format = source_format

    if 'max_bad_records' in dts_tabledef:
        job_config.max_bad_records = dts_tabledef['max_bad_records']

    if 'encoding' in dts_tabledef:
        dts_encoding = dts_tabledef['encoding']
        job_config.encoding = rest_client.BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP[dts_encoding]

    if 'csv_options' in dts_tabledef:
        csv_opts = dts_tabledef['csv_options']
        if 'field_delimiter' in csv_opts:
            job_config.field_delimiter = csv_opts['field_delimiter']
        if 'allow_quoted_newlines' in csv_opts:
            job_config.allow_quoted_newlines = csv_opts['allow_quoted_newlines']
        if 'quote_char' in csv_opts:
            job_config.quote_character = csv_opts['quote_char']
        if 'skip_leading_rows' in csv_opts:
            job_config.skip_leading_rows = csv_opts['skip_leading_rows']

    return job_config
Example no. 30
def persist_lines_job(project_id,
                      dataset_id,
                      lines=None,
                      truncate=False,
                      validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                log_message = ('A record for stream {} was encountered '
                               'before a corresponding schema')
                raise Exception(log_message.format(msg.stream))

            schema = schemas[msg.stream]

            msg.record = convert_dict_keys_to_bigquery_format(
                record=msg.record)

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
            # with a newline character splitting each row.
            dat = bytes(simplejson.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream

            schema = convert_schema_column_names_to_bigquery_format(
                schema=msg.schema)

            schemas[table] = schema
            key_properties[table] = msg.key_properties
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)

        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()

        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))

        load_job = bigquery_client.load_table_from_file(rows[table],
                                                        table_ref,
                                                        job_config=load_config)

        logger.info("loading job {}".format(load_job.job_id))

    return state