def insertCSV(stock):
    client = bigquery.Client(project_id)
    SCHEMA = [
        SchemaField('symbol', 'STRING', mode='required'),
        SchemaField('date', 'DATE', mode='required'),
        SchemaField('close', 'FLOAT', mode='required'),
        SchemaField('high', 'FLOAT', mode='required'),
        SchemaField('low', 'FLOAT', mode='required'),
        SchemaField('open', 'FLOAT', mode='required'),
        SchemaField('volume', 'INTEGER', mode='required'),
    ]
    table_ref = client.dataset(dataset_id).table(stock)
    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = SCHEMA
    with open('Data/%s.csv' % stock, 'rb') as readable:
        # The original also built an unused csv.reader on this handle; it is
        # unnecessary (and csv.reader fails on binary files in Python 3), since
        # the raw file is streamed to BigQuery directly.
        client.load_table_from_file(readable, table_ref, job_config=load_config)
def process_response_rows_for_bigquery(self, rows: list,
                                       table_reference: TableReference):
    rows_dataframe = DataFrame.from_records(rows)
    rows_dataframe = concat(
        [rows_dataframe, rows_dataframe['dimensions'].apply(Series)],
        axis=1, join='inner')
    rows_dataframe = rows_dataframe.drop(['dimensions'], axis=1)
    rows_dataframe['date'] = rows_dataframe['date'].apply(lambda x: x.date())

    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = [
        self._get_schema_for_field(column)
        for column in list(rows_dataframe.columns.values)
    ]

    try:
        load_job = self.bigquery.client.load_table_from_dataframe(
            rows_dataframe, table_reference, job_config=job_config)
        load_job.result()
    except BadRequest as error:
        print(error.errors)
def load_from_file(file_path, table):
    table_ref = dataset.table(table)
    load_config = LoadJobConfig()
    load_config.autodetect = True
    with open(file_path, 'rb') as readable:
        result = client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    # return result
    return table_ref.dataset_id + '.' + table_ref.table_id
def create_table_from_csv(self, dataset, table_name, file_path, schema):
    table_ref = dataset.table(table_name)
    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = schema
    with open(file_path, 'rb') as readable:
        self.client.load_table_from_file(
            readable, table_ref, job_config=load_config)  # API request
    return
def _load_to_bq(self, client, dataset, table_name, table_schema, table_config,
                key_props, metadata_columns, truncate, rows):
    logger = self.logger
    partition_field = table_config.get("partition_field", None)
    cluster_fields = table_config.get("cluster_fields", None)
    force_fields = table_config.get("force_fields", {})

    schema = build_schema(table_schema,
                          key_properties=key_props,
                          add_metadata=metadata_columns,
                          force_fields=force_fields)
    load_config = LoadJobConfig()
    load_config.ignore_unknown_values = True
    load_config.schema = schema
    if partition_field:
        load_config.time_partitioning = bigquery.table.TimePartitioning(
            type_=bigquery.table.TimePartitioningType.DAY,
            field=partition_field)
    if cluster_fields:
        load_config.clustering_fields = cluster_fields

    load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

    if truncate:
        logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
        load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    else:
        logger.info(f"Appending to {table_name}")
        load_config.write_disposition = WriteDisposition.WRITE_APPEND

    logger.info("loading {} to BigQuery".format(table_name))

    load_job = None
    try:
        load_job = client.load_table_from_file(rows,
                                               dataset.table(table_name),
                                               job_config=load_config,
                                               rewind=True)
        logger.info("loading job {}".format(load_job.job_id))
        job = load_job.result()
        logger.info(job._properties)
        return job
    except google_exceptions.BadRequest as err:
        logger.error("failed to load table {} from file: {}".format(
            table_name, str(err)))
        if load_job and load_job.errors:
            reason = err.errors[0]["reason"]
            messages = [f"{err['message']}" for err in load_job.errors]
            logger.error("reason: {reason}, errors:\n{e}".format(
                reason=reason, e="\n".join(messages)))
            err.message = f"reason: {reason}, errors: {';'.join(messages)}"
        raise err
def load_task():
    client = Client()
    job_config = LoadJobConfig()
    schema_path = os.path.join(
        dags_folder,
        'resources/stages/raw/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = (SourceFormat.CSV if file_format == 'csv'
                                else SourceFormat.NEWLINE_DELIMITED_JSON)
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True

    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_location_uri,
        task=task,
        file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref,
                                          job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
def _process_data_for_bigquery(self, data: DataFrame,
                               output_tablereference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')

    try:
        load_job = self.bigquery.client.load_table_from_dataframe(
            data, output_tablereference, job_config=job_config)
        load_job.result()
    except BadRequest as error:
        print(error.errors)
def upload_prevalence_data(client, tmp_f):
    client.load_table_from_file(
        tmp_f,
        "qof.prevalence",
        rewind=True,
        job_config=LoadJobConfig(schema=PREVALENCE_SCHEMA),
    )
def upload_achievement_data(client, tmp_f):
    client.load_table_from_file(
        tmp_f,
        "qof.achievement",
        rewind=True,
        job_config=LoadJobConfig(schema=ACHIEVEMENT_SCHEMA),
    )
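Both QOF helpers above hand a prepared file object straight to load_table_from_file with rewind=True, so the caller does not need to seek back to the start. A minimal usage sketch (the temporary file, its CSV contents, and the default-credentials client are assumptions, not part of the original):

import tempfile

from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials and project
with tempfile.TemporaryFile(mode='w+b') as tmp_f:
    # With no source_format set, LoadJobConfig defaults to CSV, and with no
    # skip_leading_rows the file should contain data rows only (no header).
    tmp_f.write(b'A81001,1200\n')  # placeholder row; real columns must match PREVALENCE_SCHEMA
    upload_prevalence_data(client, tmp_f)  # rewind=True seeks to offset 0 first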
def gcs_csv_to_table(full_table_id: str, remote_csv_path: str) -> Table:
    """Insert CSV from Google Storage to BigQuery Table.

    :param full_table_id: Full ID of a Google BigQuery table.
    :type full_table_id: str
    :param remote_csv_path: Path to uploaded CSV.
    :type remote_csv_path: str

    :returns: Table
    """
    try:
        gcs_csv_uri = f"gs://{GCP_BUCKET_NAME}/{remote_csv_path}"
        job_config = LoadJobConfig(
            autodetect=True,
            skip_leading_rows=1,
            source_format=SourceFormat.CSV,
        )
        load_job = gbq.load_table_from_uri(
            gcs_csv_uri, full_table_id, job_config=job_config)
        LOGGER.info(f"Starting job {load_job.job_id}.")
        LOGGER.info(load_job.result())  # Waits for table load to complete.
        return gbq.get_table(full_table_id)
    except BadRequest as e:
        LOGGER.error(
            f"Invalid GCP request when creating table `{full_table_id}`: {e}")
    except Exception as e:
        LOGGER.error(
            f"Unexpected error when creating table `{full_table_id}`: {e}")
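A hedged invocation sketch for the loader above (the project, dataset, table, and object path are placeholders):

# Hypothetical call: load uploads/sales.csv from GCP_BUCKET_NAME into a
# fully qualified table, letting BigQuery autodetect the schema.
table = gcs_csv_to_table('my-project.my_dataset.sales', 'uploads/sales.csv')
if table is not None:  # None is returned when either except branch fires
    print(f'Loaded {table.num_rows} rows into {table.full_table_id}')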
def store_participant_data(df, project_id, destination_table, schema=None):
    """
    Stores the fetched participant data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does
    exist, it will append the data onto that designated table.

    :param df: pandas dataframe created to hold participant data fetched from
        the ParticipantSummary API
    :param project_id: identifies the project
    :param destination_table: name of the table to be written in the form of
        dataset.tablename
    :param schema: a list of SchemaField objects corresponding to the
        destination table

    :return: returns the bq job_id for the loading of participant data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the tables')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(destination_table.split('.')[-1])

    # Dataframe data fields must be of type datetime
    df = set_dataframe_date_fields(df, schema)

    load_job_config = LoadJobConfig(schema=schema)
    job = client.load_table_from_dataframe(df,
                                           destination_table,
                                           job_config=load_job_config)
    job.result()
    return job.job_id
def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json(
        'my-beam-project-b2834963a4ae.json')
    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)
    SCHEMA = [
        # Mode normalized to the canonical uppercase spelling 'NULLABLE'.
        SchemaField('Tweets', 'STRING', mode='NULLABLE'),
    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')
    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200
    with open('tweets.csv', 'rb') as readable:
        big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    print('tweets file uploaded to big query')
def loadjob_one(client, dataset_ref, table_name):
    data = [{'keyword': 'dummy-{}'.format(str(time.time())),
             'partition_value': '2019-12-24'}]
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)
    job_config = LoadJobConfig()
    job_config.schema = TABLE_SCHEMA
    result_obj = client.load_table_from_json(data, table_obj,
                                             job_config=job_config)
    sleep_time = 1
    while result_obj.done() is False:
        LOGGER.info('waiting for %s second. data insertion.', sleep_time)
        time.sleep(sleep_time)
    if result_obj.errors:
        error_msg = 'Failed to insert: error_msg=%s' % result_obj.errors
        LOGGER.error(error_msg)
        raise FailedInsertingSerpCacheBigQueryException(error_msg)
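A possible call site for loadjob_one, assuming TABLE_SCHEMA and the custom exception are defined at module level (the dataset and table names here are placeholders):

from google.cloud import bigquery

client = bigquery.Client()
dataset_ref = client.dataset('serp_cache')  # placeholder dataset name
loadjob_one(client, dataset_ref, 'keyword_partitions')  # placeholder table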
def create_table(self, path, table_from='uri'):
    bp = BQParser(path)
    dataset_name = bp.dataset_name
    table_name = bp.table_name
    skip_leading_rows = bp.skip_leading_rows
    schema = bp.schema

    table_ref = self.client.dataset(dataset_name).table(table_name)
    load_config = LoadJobConfig()
    load_config.skip_leading_rows = skip_leading_rows
    load_config.schema = schema
    file_source = bp.properties.get('inputPath')

    if table_from == 'uri':
        self.client.load_table_from_uri(source_uris=file_source,
                                        destination=table_ref,
                                        job_config=load_config)
    else:
        raise ValueError('Not supported')
def push_bq():
    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        rows[table].seek(0)
        logger.info('loading {} to Bigquery.\n'.format(table))
        load_job = bigquery_client.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info('loading job {}'.format(load_job.job_id))
        logger.info(load_job.result())
        rows[table] = TemporaryFile(mode='w+b')
def __create_load_job_config(
        self, ems_load_job_config: EmsLoadJobConfig) -> LoadJobConfig:
    config = LoadJobConfig()
    config.labels = ems_load_job_config.labels
    config.create_disposition = ems_load_job_config.create_disposition.value
    config.write_disposition = ems_load_job_config.write_disposition.value
    config.schema = _parse_schema_resource(ems_load_job_config.schema)
    config.skip_leading_rows = ems_load_job_config.skip_leading_rows
    return config
def get_table_from_load_job_config(config: bigquery.LoadJobConfig):
    """
    The BigQuery python library does not currently expose destinationTable
    as a property in the LoadJobConfig class. Because of this, you have to
    convert the LoadJobConfig object to the API representation and then
    extract the dictionary's value for key: destinationTable.

    :param config: bigquery.LoadJobConfig
    :return: bigquery.TableReference
    """
    print(f"Retrieving table for LoadJobConfig: {config.to_api_repr()}")
    if config.to_api_repr().get('load'):
        config = config.to_api_repr().get('load')
        if config.get('destinationTable'):
            project_id = config.get('destinationTable').get('projectId')
            dataset_id = config.get('destinationTable').get('datasetId')
            table_id = config.get('destinationTable').get('tableId')
            return bigquery.TableReference.from_string(
                f"{project_id}.{dataset_id}.{table_id}")
    return None
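Since LoadJobConfig.to_api_repr() nests everything under a top-level 'load' key, a config carrying destinationTable can be reconstructed with from_api_repr and fed to the helper above. A sketch with placeholder IDs:

# Hypothetical round trip: destinationTable is not an exposed attribute of
# LoadJobConfig, but it survives inside the raw API representation.
api_repr = {
    'load': {
        'destinationTable': {
            'projectId': 'my-project',  # placeholder
            'datasetId': 'my_dataset',  # placeholder
            'tableId': 'my_table',      # placeholder
        }
    }
}
config = bigquery.LoadJobConfig.from_api_repr(api_repr)
table_ref = get_table_from_load_job_config(config)
print(table_ref)  # my-project.my_dataset.my_table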
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset.

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
def load_data(self, dataframe, dataset_id, table_id, chunksize):
    from google.cloud.bigquery import LoadJobConfig
    from six import BytesIO

    destination_table = self.client.dataset(dataset_id).table(table_id)
    job_config = LoadJobConfig()
    job_config.write_disposition = 'WRITE_APPEND'
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    rows = []
    remaining_rows = len(dataframe)

    total_rows = remaining_rows
    self._print("\n\n")

    for index, row in dataframe.reset_index(drop=True).iterrows():
        row_json = row.to_json(force_ascii=False,
                               date_unit='s',
                               date_format='iso')
        rows.append(row_json)
        remaining_rows -= 1

        if (len(rows) % chunksize == 0) or (remaining_rows == 0):
            self._print("\rLoad is {0}% Complete".format(
                ((total_rows - remaining_rows) * 100) / total_rows))

            body = '{}\n'.format('\n'.join(rows))
            if isinstance(body, bytes):
                body = body.decode('utf-8')
            body = body.encode('utf-8')
            body = BytesIO(body)

            try:
                self.client.load_table_from_file(
                    body, destination_table,
                    job_config=job_config).result()
            except self.http_error as ex:
                self.process_http_error(ex)

            rows = []

    self._print("\n")
def loadDataFromCSV(tablename, global_dataset_ref, filename):
    schema = getTableSchema(tablename, global_dataset_ref)
    table_ref = global_dataset_ref.table(tablename)

    load_config = LoadJobConfig()
    load_config.source_format = bigquery.SourceFormat.CSV
    load_config.schema = schema
    load_config.autodetect = True
    load_config.allow_quoted_newlines = True
    load_config.encoding = 'UTF-8'

    try:
        with open(filename, 'rb') as readable:
            job = bigquery_client.load_table_from_file(
                readable, table_ref, location='US', job_config=load_config)
        # Wait for completion inside the try block so that `job` is guaranteed
        # to be defined; the original called job.result() after the except
        # clause, which would raise NameError if open() had failed.
        job.result()
        print('Loaded {} rows into {}:{}.'.format(job.output_rows,
                                                  global_dataset_ref,
                                                  table_ref.table_id))
    except Exception as e:
        print("Error")
        print(e)
    return


# Testing
# if __name__ == '__main__':
#     datasetname = 'Testing'
#     tablename = 'SOViews'
#     sqlquery = '''SELECT CONCAT(
#         'https://stackoverflow.com/questions/',
#         CAST(id as STRING)) as url,
#         view_count
#         FROM `bigquery-public-data.stackoverflow.posts_questions`
#         WHERE tags like '%google-bigquery%'
#         ORDER BY view_count DESC
#         LIMIT 10'''

# createDataset(datasetname)  # Successfully tested this code 2018-09-24
# global_dataset_ref = getDataset(datasetname)  # Successfully tested this code 2018-09-24

# createTable(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24
# getTable(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24

# runBigQueryQuery(sqlquery)  # Successfully tested this code 2018-09-24

# loadDataFromCSV(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does
    exist, it will create a partition in the designated table or append to the
    same partition. This is necessary for storing data that has "RECORD" type
    fields, which do not conform to a dataframe. The data is stored using a
    JSON file object, since that is one of the formats BigQuery expects.

    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from
        process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as
        'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the
        destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset.

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name))

    table_blobs = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        table for table in VOCABULARY_TABLES if table not in table_blobs
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}')

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        # ignore any non-vocabulary files
        if table_name not in VOCABULARY_TABLES:
            continue
        destination = dst_dataset.table(table_name)
        safe_schema = safe_schema_for(table_name)
        job_config = LoadJobConfig()
        job_config.schema = safe_schema
        job_config.skip_leading_rows = 1
        job_config.field_delimiter = FIELD_DELIMITER
        job_config.max_bad_records = MAX_BAD_RECORDS
        job_config.source_format = 'CSV'
        job_config.quote_character = ''
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(source_uri,
                                                 destination,
                                                 job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
def load_csv_uri(self, table_id, schemas_orig):
    full_table_id = f'{self.DATASET}.{table_id}'
    job_config = LoadJobConfig(schema=schemas_orig[table_id],
                               source_format=SourceFormat.CSV,
                               skip_leading_rows=1)
    uri = f'gs://{self.BUCKET}/{table_id}.csv'
    load_job = self.bq_client.load_table_from_uri(
        uri, full_table_id, job_config=job_config,
        job_id_prefix=f'{table_id}_')
    print(f'job {load_job.job_id} started')
    load_job.add_done_callback(self.load_job_cb)
    return load_job
def load_test_data(self, df, project_id, dataset_id, table):
    """
    Add data to the tables for the rule to run on.

    :param df: a dataframe containing data to insert
    :param project_id: identifies the project
    :param dataset_id: identifies the dataset
    :param table: identifies the table
    """
    client = get_client(project_id)
    schema = get_table_schema(table)
    schema = [field for field in schema if field.name in list(df.columns)]

    load_job_config = LoadJobConfig(schema=schema)
    load_job = client.load_table_from_dataframe(df,
                                                f'{dataset_id}.{table}',
                                                job_config=load_job_config)
    load_job.result()
def load_json_file(self, table_id, schemas_orig):
    full_table_id = f'{self.DATASET}.{table_id}'
    job_config = LoadJobConfig(
        schema=schemas_orig[table_id],
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON)
    file_path = f'{table_id}.json'
    with open(file_path, "rb") as source_file:
        load_job = self.bq_client.load_table_from_file(
            source_file, full_table_id, job_config=job_config,
            job_id_prefix=f'{table_id}_')
    print(f'job {load_job.job_id} started')
    load_job.add_done_callback(self.load_job_cb)
    return load_job
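Both loaders register self.load_job_cb with add_done_callback, which invokes the callback with the completed future (the LoadJob itself) once the load finishes. The callback is not shown in the original; a minimal sketch of what it might look like:

def load_job_cb(self, load_job):
    # add_done_callback passes the finished future, i.e. the LoadJob.
    if load_job.errors:
        print(f'job {load_job.job_id} failed: {load_job.errors}')
    else:
        print(f'job {load_job.job_id} loaded {load_job.output_rows} rows')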
def __init__(self,
             *,
             table,
             partition: Optional[str] = None,
             from_: PackageFileLoader,
             bq_client: Client,
             load_job_config: LoadJobConfig = LoadJobConfig()):
    """Constructor of BaseDataLoader.

    Args:
        table (Table): table to load data into.
        from_ (PackageFileLoader): specifies where data is.
        bq_client (Client): instance of bigquery client to use across the DSL.
        partition (Optional[str], optional): if you plan to load into a
            specific partition. Used as a decorator. Defaults to None.
        load_job_config (LoadJobConfig, optional): BigQuery load job config.
            This is the object updated by this DSL. Defaults to LoadJobConfig().
    """
    self.load_job_config = load_job_config
    self.table = table
    self.from_ = from_
    self._bq_client = bq_client
    self.partition = partition
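A hedged instantiation sketch for this constructor (the table object, the PackageFileLoader path, and the partition value are illustrative assumptions):

# Hypothetical usage: point the loader at packaged test data and a target
# table, optionally pinning a daily partition.
loader = BaseDataLoader(
    table=table,  # a previously obtained google.cloud.bigquery Table
    from_=PackageFileLoader('tests/data/rows.json'),  # placeholder path
    bq_client=Client(),
    partition='20191224',  # optional: load into this partition
)

Note that the LoadJobConfig() default argument is evaluated once at definition time, so every instance constructed without an explicit load_job_config shares one mutable config object; passing a fresh LoadJobConfig() per call avoids that surprise.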
def sync_bigquery_release_files(request):
    bq = request.find_service(name="gcloud.bigquery")
    # Multiple table names can be specified by separating them with whitespace
    table_names = request.registry.settings[
        "warehouse.release_files_table"].split()
    for table_name in table_names:
        table_schema = bq.get_table(table_name).schema

        # Using the schema to populate the data allows us to automatically
        # set the values to their respective fields rather than assigning
        # values individually
        def populate_data_using_schema(file):
            release = file.release
            project = release.project

            row_data = dict()
            for sch in table_schema:
                # The order of data extraction below is determined based on
                # the classes that are most recently updated
                if hasattr(file, sch.name):
                    field_data = getattr(file, sch.name)
                elif hasattr(release, sch.name) and sch.name == "description":
                    field_data = getattr(release, sch.name).raw
                elif sch.name == "description_content_type":
                    field_data = getattr(release, "description").content_type
                elif hasattr(release, sch.name):
                    field_data = getattr(release, sch.name)
                elif hasattr(project, sch.name):
                    field_data = getattr(project, sch.name)
                else:
                    field_data = None

                if isinstance(field_data, datetime.datetime):
                    field_data = field_data.isoformat()

                # Replacing all empty objects with None ensures proper checks
                # for whether a field is nullable or not
                if not isinstance(field_data, bool) and not field_data:
                    field_data = None

                if field_data is None and sch.mode == "REPEATED":
                    row_data[sch.name] = []
                elif field_data and sch.mode == "REPEATED":
                    # Currently, some of the metadata fields such as
                    # the 'platform' tag are incorrectly classified as a
                    # str instead of a list, hence, this workaround to comply
                    # with PEP 345 and the Core Metadata specifications.
                    # This extra check can be removed once
                    # https://github.com/pypa/warehouse/issues/8257 is fixed
                    if isinstance(field_data, str):
                        row_data[sch.name] = [field_data]
                    else:
                        row_data[sch.name] = list(field_data)
                else:
                    row_data[sch.name] = field_data

            return row_data

        for first, second in product("fedcba9876543210", repeat=2):
            db_release_files = (
                request.db.query(File.md5_digest)
                .filter(File.md5_digest.like(f"{first}{second}%"))
                .yield_per(1000)
                .all())
            db_file_digests = [file.md5_digest for file in db_release_files]

            bq_file_digests = bq.query(
                "SELECT md5_digest "
                f"FROM {table_name} "
                f"WHERE md5_digest LIKE '{first}{second}%'").result()
            bq_file_digests = [
                row.get("md5_digest") for row in bq_file_digests
            ]

            md5_diff_list = list(
                set(db_file_digests) - set(bq_file_digests))[:1000]
            if not md5_diff_list:
                # There are no files that need to be synced to BigQuery
                continue

            release_files = (
                request.db.query(File)
                .join(Release, Release.id == File.release_id)
                .filter(File.md5_digest.in_(md5_diff_list))
                .all())
            json_rows = [
                populate_data_using_schema(file) for file in release_files
            ]

            bq.load_table_from_json(
                json_rows, table_name,
                job_config=LoadJobConfig(schema=table_schema)).result()
            break
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False,
                      validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    # try:
    #     dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a "
                    "corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
            # with a newline character splitting each row.
            dat = bytes(json.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)
            # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8'))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            # tables[table] = bigquery.Table(dataset.table(table),
            #                                schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
            # try:
            #     tables[table] = bigquery_client.create_table(tables[table])
            # except exceptions.Conflict:
            #     pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        load_job = bigquery_client.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info("loading job {}".format(load_job.job_id))
        logger.info(load_job.result())

    # for table in errors.keys():
    #     if not errors[table]:
    #         print('Loaded {} row(s) into {}:{}'.format(
    #             rows[table], dataset_id, table), tables[table].path)
    #     else:
    #         print('Errors:', errors[table], sep=" ")

    return state
def DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef):
    """
    https://cloud.google.com/bigquery/docs/reference/data-transfer/partner/rpc/google.cloud.bigquery.datatransfer.v1#tabledefinition
    TO
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html#google.cloud.bigquery.job.LoadJob

    :param dts_tabledef:
    :return:
    """
    from bq_dts import rest_client
    job_config = LoadJobConfig()

    dts_schema = RPCRecordSchema_to_GCloudSchema(dts_tabledef['schema'])
    job_config.schema = dts_schema

    # BQ DTS does not provide controls for the following dispositions
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    if 'format' in dts_tabledef:
        dts_format = dts_tabledef['format']
        source_format = rest_client.BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP[
            dts_format]
        assert source_format is not None
        job_config.source_format = source_format

    if 'max_bad_records' in dts_tabledef:
        job_config.max_bad_records = dts_tabledef['max_bad_records']

    if 'encoding' in dts_tabledef:
        dts_encoding = dts_tabledef['encoding']
        job_config.encoding = rest_client.BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP[
            dts_encoding]

    if 'csv_options' in dts_tabledef:
        csv_opts = dts_tabledef['csv_options']
        if 'field_delimiter' in csv_opts:
            job_config.field_delimiter = csv_opts['field_delimiter']
        if 'allow_quoted_newlines' in csv_opts:
            job_config.allow_quoted_newlines = csv_opts['allow_quoted_newlines']
        if 'quote_char' in csv_opts:
            job_config.quote_character = csv_opts['quote_char']
        if 'skip_leading_rows' in csv_opts:
            job_config.skip_leading_rows = csv_opts['skip_leading_rows']

    return job_config
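A hedged example of the input shape, inferred from the keys the converter reads (all values are placeholders, and the exact 'schema' payload consumed by RPCRecordSchema_to_GCloudSchema is not shown in the original):

# Hypothetical DTS table definition exercising the CSV branch.
dts_tabledef = {
    'schema': {'fields': []},  # shape assumed; converted by RPCRecordSchema_to_GCloudSchema
    'format': 'CSV',           # mapped via BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP
    'max_bad_records': 10,
    'encoding': 'UTF8',        # mapped via BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP
    'csv_options': {
        'field_delimiter': ',',
        'allow_quoted_newlines': True,
        'quote_char': '"',
        'skip_leading_rows': 1,
    },
}
job_config = DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef)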
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False,
                      validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                log_message = ('A record for stream {} was encountered '
                               'before a corresponding schema')
                raise Exception(log_message.format(msg.stream))

            schema = schemas[msg.stream]
            msg.record = convert_dict_keys_to_bigquery_format(
                record=msg.record)

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
            # with a newline character splitting each row.
            dat = bytes(simplejson.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schema = convert_schema_column_names_to_bigquery_format(
                schema=msg.schema)
            schemas[table] = schema
            key_properties[table] = msg.key_properties
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        load_job = bigquery_client.load_table_from_file(rows[table],
                                                        table_ref,
                                                        job_config=load_config)
        logger.info("loading job {}".format(load_job.job_id))

    return state