def side_effect(request):
  if (request == bigquery.BigqueryTablesGetRequest(
      projectId='project', datasetId='dataset',
      tableId='table__sample_info')):
    raise exceptions.HttpError(response={'status': '404'}, url='', content='')
  return bigquery.Table(tableReference=bigquery.TableReference(
      projectId='project', datasetId='dataset', tableId='table__chr1_part1'))
def parse_table_reference(table, dataset=None, project=None):
  """Parses a table reference into a (project, dataset, table) tuple.

  Args:
    table: The ID of the table. The ID must contain only letters (a-z, A-Z),
      numbers (0-9), or underscores (_). If dataset argument is None then the
      table argument must contain the entire table reference:
      'DATASET.TABLE' or 'PROJECT:DATASET.TABLE'. This argument can be a
      bigquery.TableReference instance in which case dataset and project are
      ignored and the reference is returned as a result. Additionally, for
      date partitioned tables, appending '$YYYYmmdd' to the table name is
      supported, e.g. 'DATASET.TABLE$YYYYmmdd'.
    dataset: The ID of the dataset containing this table or null if the table
      reference is specified entirely by the table argument.
    project: The ID of the project containing this table or null if the table
      reference is specified entirely by the table (and possibly dataset)
      argument.

  Returns:
    A TableReference object from the bigquery API. The object has the
    following attributes: projectId, datasetId, and tableId.

  Raises:
    ValueError: if the table reference as a string does not match the
      expected format.
  """
  if isinstance(table, bigquery.TableReference):
    return table
  elif callable(table):
    return table
  elif isinstance(table, value_provider.ValueProvider):
    return table

  table_reference = bigquery.TableReference()
  # If dataset argument is not specified, the expectation is that the
  # table argument will contain a full table reference instead of just a
  # table name.
  if dataset is None:
    match = re.match(
        r'^((?P<project>.+):)?(?P<dataset>\w+)\.(?P<table>[\w\$]+)$', table)
    if not match:
      raise ValueError(
          'Expected a table reference (PROJECT:DATASET.TABLE or '
          'DATASET.TABLE) instead of %s.' % table)
    table_reference.projectId = match.group('project')
    table_reference.datasetId = match.group('dataset')
    table_reference.tableId = match.group('table')
  else:
    table_reference.projectId = project
    table_reference.datasetId = dataset
    table_reference.tableId = table
  return table_reference
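# --- Usage sketch (illustrative only, not part of the snippet above) ---
# A minimal example of calling parse_table_reference, assuming the same
# `bigquery` messages module imported by the snippets on this page; the
# project/dataset/table names below are made up.
ref = parse_table_reference('my_dataset.my_table')              # DATASET.TABLE
ref = parse_table_reference('my-project:my_dataset.my_table')   # PROJECT:DATASET.TABLE
ref = parse_table_reference(
    'my_table', dataset='my_dataset', project='my-project')     # explicit parts
ref = parse_table_reference('my_dataset.my_table$20200101')     # partition decorator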
def create_table(cls, table_name, data, table_schema):
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=cls.project, datasetId=cls.dataset_id, tableId=table_name),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=cls.project, datasetId=cls.dataset_id, table=table)
  cls.bigquery_client.client.tables.Insert(request)
  cls.bigquery_client.insert_rows(cls.project, cls.dataset_id, table_name, data)
  return table_schema
def expand(self, pcoll):
  table_spec = bigquery.TableReference(
      projectId='iotpubsub-1536350750202',
      datasetId='baybenames',
      # tableId='relation_extraction_data'
      tableId='relation_data_sample')
  return (
      pcoll
      | 'Read input table' >> beam.io.Read(beam.io.BigQuerySource(table_spec))
      | 'Split words' >> beam.ParDo(SplitSentence_Updated_Table())
      | 'Split test and training data' >> beam.Partition(
          lambda element, _: 0 if randint(0, 100) < 80 else 1, 2))
def _create_table(cls, table_name):
  table_schema = bigquery.TableSchema()
  number = bigquery.TableFieldSchema()
  number.name = 'number'
  number.type = 'INTEGER'
  table_schema.fields.append(number)
  string = bigquery.TableFieldSchema()
  string.name = 'string'
  string.type = 'STRING'
  table_schema.fields.append(string)
  time = bigquery.TableFieldSchema()
  time.name = 'time'
  time.type = 'TIME'
  table_schema.fields.append(time)
  datetime = bigquery.TableFieldSchema()
  datetime.name = 'datetime'
  datetime.type = 'DATETIME'
  table_schema.fields.append(datetime)
  rec = bigquery.TableFieldSchema()
  rec.name = 'rec'
  rec.type = 'RECORD'
  rec_datetime = bigquery.TableFieldSchema()
  rec_datetime.name = 'rec_datetime'
  rec_datetime.type = 'DATETIME'
  rec.fields.append(rec_datetime)
  rec_rec = bigquery.TableFieldSchema()
  rec_rec.name = 'rec_rec'
  rec_rec.type = 'RECORD'
  rec_rec_datetime = bigquery.TableFieldSchema()
  rec_rec_datetime.name = 'rec_rec_datetime'
  rec_rec_datetime.type = 'DATETIME'
  rec_rec.fields.append(rec_rec_datetime)
  rec.fields.append(rec_rec)
  table_schema.fields.append(rec)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=cls.project, datasetId=cls.dataset_id, tableId=table_name),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=cls.project, datasetId=cls.dataset_id, table=table)
  cls.bigquery_client.client.tables.Insert(request)
  cls.bigquery_client.insert_rows(
      cls.project, cls.dataset_id, table_name, cls.TABLE_DATA)
def test_table_with_write_disposition_append(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.return_value = table
  client.tables.Insert.return_value = table
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
  with beam.io.BigQuerySink(
      'project:dataset.table',
      write_disposition=write_disposition).writer(client):
    pass
  self.assertTrue(client.tables.Get.called)
  self.assertFalse(client.tables.Delete.called)
  self.assertFalse(client.tables.Insert.called)
def _create_parquet_file(self, blob_name, staging_table_util,
                         destination_prefix):
  """Creates a parquet file from a staging table and stores it in GCS.

  The parquet file is generated using Dataflow, since BigQuery Extract Jobs
  do not support the parquet file type as a destination format.

  Args:
    blob_name(str): Name of the file (or blob) to be generated. Starts with
      'fileType=' and ends with the file extension.
      Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv # pylint: disable=line-too-long
    staging_table_util(benchmark_tools.table_util.TableUtil): Util object for
      interacting with the staging table that the parquet file will be
      generated from.
    destination_prefix(str): String containing the 'gs://' prefix, the bucket
      name, and the path of the file, without the extension. This is needed
      by the WriteToParquet class.
      Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876 # pylint: disable=line-too-long
  """
  logging.info('Attempting to create file {0:s}'.format(blob_name))
  pipeline_args = [
      '--project', self.project_id,
      '--staging_location', self.dataflow_staging_location,
      '--temp_location', self.dataflow_temp_location,
      '--save_main_session',
      '--worker_machine_type', 'n1-highcpu-32',
      '--runner', 'DataflowRunner',
      '--setup_file', './setup.py',
  ]
  options = pipeline_options.PipelineOptions(pipeline_args)
  table_spec = beam_bigquery.TableReference(
      projectId=self.project_id,
      datasetId=self.primitive_staging_dataset_id,
      tableId=staging_table_util.table_id)
  bq_schema = staging_table_util.table.schema
  pa_schema = parquet_util.ParquetUtil(bq_schema).get_pa_translated_schema()
  p = beam.Pipeline(options=options)
  table = (
      p | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec)))
  (table | beam.io.WriteToParquet(
      file_path_prefix=destination_prefix,
      schema=pa_schema,
      file_name_suffix='.parquet',
      num_shards=1,
      shard_name_template='',
  ))
  p.run().wait_until_finish()
  logging.info('Created file: {0:s}'.format(blob_name))
def _setup_new_types_env(self):
  table_schema = bigquery.TableSchema()
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'bytes'
  table_field.type = 'BYTES'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'date'
  table_field.type = 'DATE'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'time'
  table_field.type = 'TIME'
  table_schema.fields.append(table_field)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=self.project,
          datasetId=self.dataset_id,
          tableId=NEW_TYPES_INPUT_TABLE),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=self.project, datasetId=self.dataset_id, table=table)
  self.bigquery_client.client.tables.Insert(request)
  table_data = [{
      'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'
  }, {
      'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'
  }, {
      'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
      'date': '3000-12-31',
      'time': '23:59:59.990000'
  }, {
      'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'
  }]
  # The API Tools bigquery client expects byte values to be base-64 encoded.
  # TODO https://github.com/apache/beam/issues/19073: upgrade to
  # google-cloud-bigquery which does not require handling the encoding in
  # beam.
  for row in table_data:
    row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
  passed, errors = self.bigquery_client.insert_rows(
      self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)
  self.assertTrue(passed, 'Error in BQ setup: %s' % errors)
def test_table_not_empty_and_write_disposition_empty(
    self, patched_time_sleep):
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tabledata.List.return_value = bigquery.TableDataList(totalRows=1)
  write_disposition = beam.io.BigQueryDisposition.WRITE_EMPTY
  with self.assertRaisesRegexp(
      RuntimeError,
      r'Table project:dataset\.table is not empty but write '
      r'disposition is WRITE_EMPTY'):
    with beam.io.BigQuerySink(
        'project:dataset.table',
        write_disposition=write_disposition).writer(client):
      pass
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=False,
      default='output',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:
    bq_details = "justlikethat-294122:log_analysis.aws_log"
    SCHEMA = (
        'bucket:string,date:datetime,operation:string,key:string,'
        'request_uri:string,http_status:string,error_code:string,'
        'bytes_sent:string,total_time:string,turnaround_time:string,'
        'referrer:string,user_agent:string,request_header:string')
    log_parser = AwsLogParser()

    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)
    data = (
        lines
        | 'Parse AWS Log File' >> beam.Map(lambda log: log_parser.process(log)))

    table_spec = bigquery.TableReference(
        projectId='justlikethat-294122',
        datasetId='log_analysis',
        tableId='aws_log_test_1')
    data | beam.io.WriteToBigQuery(
        table_spec,
        schema=SCHEMA,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    # data | 'Write' >> WriteToText(known_args.output)
def test_no_table_and_create_if_needed(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.side_effect = HttpError(
      response={'status': '404'}, url='', content='')
  client.tables.Insert.return_value = table
  create_disposition = beam.io.BigQueryDisposition.CREATE_IF_NEEDED
  with beam.io.BigQuerySink(
      'project:dataset.table',
      schema='somefield:INTEGER',
      create_disposition=create_disposition).writer(client):
    pass
  self.assertTrue(client.tables.Get.called)
  self.assertTrue(client.tables.Insert.called)
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=False,
      default='output',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:
    # quotes = p | beam.Create([
    #     {'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'}
    # ])
    var = AwsLogParser()
    quotes = p | beam.Create(var.my_json())

    table_spec = bigquery.TableReference(
        projectId='justlikethat-294122',
        datasetId='log_analysis',
        tableId='quotes_2')
    table_schema = 'source:STRING, quote:STRING'
    quotes | beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
def test_existing_sample_table(self):
  args = self._make_args([
      '--append', 'False',
      '--output_table', 'project:dataset.table',
      '--sharding_config_path',
      'gcp_variant_transforms/testing/data/sharding_configs/'
      'residual_at_end.yaml'
  ])
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset',
          tableId='table__sample_info'))
  with self.assertRaisesRegexp(
      ValueError, 'project:dataset.table__sample_info already exists'):
    self._options.validate(args, client)
def _create_table(self, project_id, dataset_id, table_id, schema,
                  additional_parameters=None):
  additional_parameters = additional_parameters or {}
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=project_id, datasetId=dataset_id, tableId=table_id),
      schema=schema,
      **additional_parameters)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=project_id, datasetId=dataset_id, table=table)
  response = self.client.tables.Insert(request)
  logging.debug("Created the table with id %s", table_id)
  # The response is a bigquery.Table instance.
  return response
def test_table_empty_and_write_disposition_empty(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.return_value = table
  client.tabledata.List.return_value = bigquery.TableDataList(totalRows=0)
  client.tables.Insert.return_value = table
  write_disposition = beam.io.BigQueryDisposition.WRITE_EMPTY
  with beam.io.BigQuerySink(
      'project:dataset.table',
      write_disposition=write_disposition).writer(client):
    pass
  self.assertTrue(client.tables.Get.called)
  self.assertTrue(client.tabledata.List.called)
  self.assertFalse(client.tables.Delete.called)
  self.assertFalse(client.tables.Insert.called)
def create_table_new_types(self, table_name):
  table_schema = bigquery.TableSchema()
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'bytes'
  table_field.type = 'BYTES'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'date'
  table_field.type = 'DATE'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'time'
  table_field.type = 'TIME'
  table_schema.fields.append(table_field)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=self.project,
          datasetId=self.dataset_id,
          tableId=table_name),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=self.project, datasetId=self.dataset_id, table=table)
  self.bigquery_client.client.tables.Insert(request)
  table_data = [{
      'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'
  }, {
      'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'
  }, {
      'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
      'date': '3000-12-31',
      'time': '23:59:59'
  }, {
      'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'
  }]
  # The bigquery client expects base64 encoded bytes.
  for row in table_data:
    row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
  self.bigquery_client.insert_rows(
      self.project, self.dataset_id, table_name, table_data)
def test_table_exist(self):
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'))
  self.assertEqual(
      bigquery_util.table_exist(client, 'project', 'dataset', 'table'), True)
  client.tables.Get.side_effect = exceptions.HttpError(
      response={'status': '404'}, url='', content='')
  self.assertEqual(
      bigquery_util.table_exist(client, 'project', 'dataset', 'table'), False)
  client.tables.Get.side_effect = exceptions.HttpError(
      response={'status': '401'}, url='', content='')
  self.assertRaises(
      exceptions.HttpError, bigquery_util.table_exist, client, 'project',
      'dataset', 'table')
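# --- Hedged sketch (not the project's actual bigquery_util implementation) ---
# The test above implies a table_exist helper that issues tables.Get, returns
# True on success, returns False when the table is missing (HTTP 404), and
# re-raises any other HttpError. Something along these lines would satisfy
# the mocked client used in the test:
def table_exist(client, project_id, dataset_id, table_id):
  try:
    client.tables.Get(bigquery.BigqueryTablesGetRequest(
        projectId=project_id, datasetId=dataset_id, tableId=table_id))
  except exceptions.HttpError as e:
    if e.response['status'] == '404':
      return False
    raise
  return True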
def run(argv=None, save_main_session=True):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=False,
      default='output',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  with beam.Pipeline(options=pipeline_options) as p:
    quotes = p | beam.Create([{
        'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'
    }, {
        'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'
    }])

    table_spec = bigquery.TableReference(
        projectId='justlikethat-294122',
        datasetId='mydataset',
        tableId='quotes')
    table_schema = 'source:STRING, quote:STRING'
    quotes | beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    quotes | WriteToText(known_args.output)
def create_table(cls, table_name):
  table_schema = bigquery.TableSchema()
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'number'
  table_field.type = 'INTEGER'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'str'
  table_field.type = 'STRING'
  table_schema.fields.append(table_field)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=cls.project, datasetId=cls.dataset_id, tableId=table_name),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=cls.project, datasetId=cls.dataset_id, table=table)
  cls.bigquery_client.client.tables.Insert(request)
  cls.bigquery_client.insert_rows(
      cls.project, cls.dataset_id, table_name, cls.TABLE_DATA)
def test_dofn_client_start_bundle_called(self):
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project_id', datasetId='dataset_id', tableId='table_id'))
  create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
  fn = beam.io.gcp.bigquery.BigQueryWriteFn(
      table_id='table_id',
      dataset_id='dataset_id',
      project_id='project_id',
      batch_size=2,
      schema='month:INTEGER',
      create_disposition=create_disposition,
      write_disposition=write_disposition,
      client=client)
  fn.start_bundle()
  self.assertTrue(client.tables.Get.called)
def test_dofn_client_process_performs_batching(self):
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project_id', datasetId='dataset_id', tableId='table_id'))
  client.tabledata.InsertAll.return_value = (
      bigquery.TableDataInsertAllResponse(insertErrors=[]))
  create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
  fn = beam.io.gcp.bigquery.BigQueryWriteFn(
      batch_size=2,
      create_disposition=create_disposition,
      write_disposition=write_disposition,
      kms_key=None,
      test_client=client)
  fn.process(('project_id:dataset_id.table_id', {'month': 1}))
  # InsertRows not called as batch size is not hit yet.
  self.assertFalse(client.tabledata.InsertAll.called)
def test_rows_are_written(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.return_value = table
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
  client.insert_rows_json.return_value = []

  with beam.io.BigQuerySink(
      'project:dataset.table',
      write_disposition=write_disposition).writer(client) as writer:
    writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

  sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
  client.insert_rows_json.assert_called_with(
      gcp_bigquery.TableReference(
          gcp_bigquery.DatasetReference('project', 'dataset'), 'table'),
      json_rows=[sample_row],
      row_ids=['_1'],
      skip_invalid_rows=True)
def run():
  argv = [
      '--project={0}'.format(PROJECT),
      '--job_name={}'.format('wcount' + get_time()),
      '--save_main_session',
      '--region=europe-west1',
      '--requirements_file=requirements.txt',
      '--staging_location={}/staging/'.format(BUCKET),
      '--temp_location={}/staging/'.format(BUCKET),
      '--runner=DataflowRunner'
  ]
  p = beam.Pipeline(argv=argv)  # sys.argv
  # input = ''
  table_spec = bigquery.TableReference(
      projectId='patstat2016a', datasetId='raw', tableId='tls2012_cp')
  output_prefix = '{}/countWord/fullCW'.format(BUCKET)
  query = (
      'SELECT appln_title_lg, appln_title, appln_auth, year '
      'FROM `patstat2016a.raw.tls2012_cp` '
      'WHERE appln_title_lg="en"')

  (p
   # Reading from a query is more efficient here than reading the whole table
   # and filtering afterwards.
   | 'ReadTable' >> beam.io.Read(
       beam.io.BigQuerySource(query=query, use_standard_sql=True))
   # | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec))
   # | 'FilterLg' >> beam.Filter(lambda line: line['appln_title_lg'] == 'en')
   | 'FormatPairTitle' >> beam.Map(lambda line: formatPairTitle(line))
   | 'FormatPairWord' >> beam.ParDo(SplitAndPairWithKey())
   | 'GroupAndSum' >> beam.CombinePerKey(sum)
   # Tuple-unpacking lambdas are Python 2 only; index the (key, count) pair
   # instead.
   | 'FilterSW' >> beam.Filter(lambda wc: filterStopWords(wc[0][-1]))
   | 'FormatCSV' >> beam.Map(lambda pair: formatCSV(pair))
   | 'Write' >> beam.io.WriteToText(output_prefix, file_name_suffix='.txt'))

  p.run()
def run(argv=None, save_main_session=True):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=False,
      default='output',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  with beam.Pipeline(options=pipeline_options) as p:
    utility = Utility()
    quotes = (
        p
        | 'Read' >> ReadFromText(known_args.input)
        | 'ParDo Dealings' >> beam.ParDo(Split()))

    table_spec = bigquery.TableReference(
        projectId='justlikethat-294122',
        datasetId='log_analysis',
        tableId='quotes')
    table_schema = 'source:STRING, quote:STRING'
    quotes | beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
def __init__(self):
  self.table_spec = bigquery.TableReference(
      projectId='justlikethat-294122',
      datasetId='medium_dataset',
      tableId='mytable')
  self.schema = (
      'bucket:string,date:datetime,remote_ip:string,operation:string,'
      'key:string,request_uri:string,http_status:string,error_code:string,'
      'bytes_sent:string,object_size:string,total_time:string,'
      'turn_aroundtime:string,referrer:string')
  self.log = {
      # 'bucket_owner': '',
      'bucket': '',
      'date': '',
      # 'time_offset': '',
      'remote_ip': '',
      # 'request_arn': '',
      # 'request_id': '',
      'operation': '',
      'key': '',
      'request_uri': '',
      'http_status': '',
      'error_code': '',
      'bytes_sent': '',
      'object_size': '',
      'total_time': '',
      'turn_aroundtime': '',
      'referrer': '',
      # 'user_agent': '',
      # 'version_id': '',
      # 'host_id': '',
      # 'signature_version': '',
      # 'cipher_suite': '',
      # 'authentication_type': '',
      # 'host_header': '',
      # 'tls_version': ''
  }
def test_dofn_client_process_flush_called(self):
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project_id', datasetId='dataset_id', tableId='table_id'))
  client.tabledata.InsertAll.return_value = (
      bigquery.TableDataInsertAllResponse(insertErrors=[]))
  create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
  fn = beam.io.gcp.bigquery.BigQueryWriteFn(
      batch_size=2,
      create_disposition=create_disposition,
      write_disposition=write_disposition,
      kms_key=None,
      test_client=client)
  fn.start_bundle()
  fn.process(('project_id:dataset_id.table_id', ({'month': 1}, 'insertid1')))
  fn.process(('project_id:dataset_id.table_id', ({'month': 2}, 'insertid2')))
  # InsertRows called as batch size is hit.
  self.assertTrue(client.tabledata.InsertAll.called)
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=False,
      default='output',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  with beam.Pipeline(options=pipeline_options) as p:
    # obj = Utility()
    table_spec = bigquery.TableReference(
        projectId='justlikethat-294122',
        datasetId='log_analysis',
        tableId='quotes')
    table_schema = 'source:STRING, quote:STRING'
    data_ingestion = dataingestion()
    (p
     | 'Read from a File' >> beam.io.ReadFromText(known_args.input)
     | 'String To BigQuery Row' >> beam.Map(
         lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> WriteToBigQuery(
         table_spec,
         schema='source:STRING, quote:STRING',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
def create_table(self, table_name):
  table_schema = bigquery.TableSchema()
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'bytes'
  table_field.type = 'BYTES'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'date'
  table_field.type = 'DATE'
  table_schema.fields.append(table_field)
  table_field = bigquery.TableFieldSchema()
  table_field.name = 'time'
  table_field.type = 'TIME'
  table_schema.fields.append(table_field)
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId=self.project,
          datasetId=self.dataset_id,
          tableId=table_name),
      schema=table_schema)
  request = bigquery.BigqueryTablesInsertRequest(
      projectId=self.project, datasetId=self.dataset_id, table=table)
  self.bigquery_client.client.tables.Insert(request)
def test_dofn_client_start_bundle_called(self):
  client = mock.Mock()
  client.tables.Get.return_value = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project_id', datasetId='dataset_id', tableId='table_id'))
  create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
  schema = {
      'fields': [{'name': 'month', 'type': 'INTEGER', 'mode': 'NULLABLE'}]}
  fn = beam.io.gcp.bigquery.BigQueryWriteFn(
      table_id='table_id',
      dataset_id='dataset_id',
      project_id='project_id',
      batch_size=2,
      schema=schema,
      create_disposition=create_disposition,
      write_disposition=write_disposition,
      kms_key=None,
      test_client=client)
  fn.start_bundle()
  self.assertTrue(client.tables.Get.called)
def test_rows_are_written(self):
  client = mock.Mock()
  table = bigquery.Table(
      tableReference=bigquery.TableReference(
          projectId='project', datasetId='dataset', tableId='table'),
      schema=bigquery.TableSchema())
  client.tables.Get.return_value = table
  write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

  insert_response = mock.Mock()
  insert_response.insertErrors = []
  client.tabledata.InsertAll.return_value = insert_response

  with beam.io.BigQuerySink(
      'project:dataset.table',
      write_disposition=write_disposition).writer(client) as writer:
    writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

  sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
  expected_rows = []
  json_object = bigquery.JsonObject()
  for k, v in iteritems(sample_row):
    json_object.additionalProperties.append(
        bigquery.JsonObject.AdditionalProperty(key=k, value=to_json_value(v)))
  expected_rows.append(
      bigquery.TableDataInsertAllRequest.RowsValueListEntry(
          insertId='_1',  # First row ID generated with prefix ''
          json=json_object))
  client.tabledata.InsertAll.assert_called_with(
      bigquery.BigqueryTabledataInsertAllRequest(
          projectId='project',
          datasetId='dataset',
          tableId='table',
          tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
              rows=expected_rows,
              skipInvalidRows=False,
          )))