def json_compliance_exception(self, value):
    """Assert that encoding ``value`` in a FLOAT column raises the compliance error.

    Builds a one-column schema ('f', FLOAT), wraps ``value`` in a single-cell
    ``TableRow`` and expects ``TableRowJsonCoder.encode`` to raise a
    ``ValueError`` whose message contains ``JSON_COMPLIANCE_ERROR``.

    Args:
        value: The cell value, converted with ``to_json_value`` before encoding.
    """
    schema = bigquery.TableSchema(
        fields=[bigquery.TableFieldSchema(name='f', type='FLOAT')])
    coder = TableRowJsonCoder(table_schema=schema)
    test_row = bigquery.TableRow(
        f=[bigquery.TableCell(v=to_json_value(value))])

    # Setup stays outside the context manager so that only the encode call
    # can satisfy the assertion. assertRaisesRegexp was a deprecated alias
    # (removed in Python 3.12); assertRaisesRegex is the supported name.
    with self.assertRaisesRegex(ValueError, re.escape(JSON_COMPLIANCE_ERROR)):
        coder.encode(test_row)
def _make_schema(fields):
    """Build a ``bigquery.TableSchema`` from nested field tuples.

    Args:
        fields: Iterable of 3-item entries ``(name, type, nested_fields)``.
            When ``nested_fields`` is truthy it is itself an iterable of such
            entries and becomes the sub-fields of that column.

    Returns:
        A ``bigquery.TableSchema`` whose ``fields`` mirror the input order.
    """
    def _to_field(spec):
        # Convert one (name, type, nested) entry, recursing into sub-fields.
        column = bigquery.TableFieldSchema()
        column.name, column.type, children = spec
        if children:
            column.fields = [_to_field(child) for child in children]
        return column

    result = bigquery.TableSchema()
    result.fields = [_to_field(spec) for spec in fields]
    return result
def test_get_or_create_table(self):
    """get_or_create_table yields 'table_id' when Get first returns None.

    The mocked client returns ``None`` then 'table_id' from ``tables.Get`` and
    'table_id' from ``tables.Insert``.
    """
    schema = bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(name='b', type='BOOLEAN', mode='REQUIRED'),
    ])

    client = mock.Mock()
    client.tables.Get.side_effect = [None, 'table_id']
    client.tables.Insert.return_value = 'table_id'
    wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

    new_table = wrapper.get_or_create_table(
        'project_id', 'dataset_id', 'table_id', schema, False, False)

    self.assertEqual(new_table, 'table_id')
def _get_table_schema(self):
    # type: () -> bigquery.TableSchema
    """Build a sample schema: five INFO columns plus a repeated call record.

    Returns:
        A ``bigquery.TableSchema`` with columns IB, IBR, II, IF, IS followed by
        the ``ColumnKeyConstants.CALLS`` record holding FB, FI and FS.
    """
    info_columns = (
        ('IB', TableFieldConstants.TYPE_BOOLEAN,
         TableFieldConstants.MODE_NULLABLE),
        ('IBR', TableFieldConstants.TYPE_BOOLEAN,
         TableFieldConstants.MODE_REPEATED),
        ('II', TableFieldConstants.TYPE_INTEGER,
         TableFieldConstants.MODE_NULLABLE),
        ('IF', TableFieldConstants.TYPE_FLOAT,
         TableFieldConstants.MODE_REPEATED),
        ('IS', TableFieldConstants.TYPE_STRING,
         TableFieldConstants.MODE_REPEATED),
    )
    format_columns = (
        ('FB', TableFieldConstants.TYPE_BOOLEAN,
         TableFieldConstants.MODE_NULLABLE),
        ('FI', TableFieldConstants.TYPE_INTEGER,
         TableFieldConstants.MODE_NULLABLE),
        ('FS', TableFieldConstants.TYPE_STRING,
         TableFieldConstants.MODE_REPEATED),
    )

    schema = bigquery.TableSchema()
    for column_name, column_type, column_mode in info_columns:
        schema.fields.append(
            bigquery.TableFieldSchema(
                name=column_name,
                type=column_type,
                mode=column_mode,
                description='INFO foo desc'))

    # Call record.
    call_record = bigquery.TableFieldSchema(
        name=ColumnKeyConstants.CALLS,
        type=TableFieldConstants.TYPE_RECORD,
        mode=TableFieldConstants.MODE_REPEATED,
        description='One record for each call.')
    for column_name, column_type, column_mode in format_columns:
        call_record.fields.append(
            bigquery.TableFieldSchema(
                name=column_name,
                type=column_type,
                mode=column_mode,
                description='FORMAT foo desc'))
    schema.fields.append(call_record)

    return schema
def test_generate_header_fields_from_schema_date_type(self):
    """A schema holding only the 'Date' partition column gives an empty header."""
    partition_column = bigquery.TableFieldSchema(
        name='partition_date_please_ignore',
        type='Date',
        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
        description='Column required by BigQuery partitioning logic.')
    schema = bigquery.TableSchema()
    schema.fields.append(partition_column)

    actual_header = schema_converter.generate_header_fields_from_schema(schema)

    # Neither infos nor formats are expected for the partition column.
    empty_header = vcf_header_io.VcfHeader(
        infos=OrderedDict(), formats=OrderedDict())
    self.assertEqual(actual_header, empty_header)
def get_table_schema():
    """Formulate the schema for the destination table.

    Returns:
        A ``bigquery.TableSchema`` with a required string column named by
        ``SAMPLE`` and a required integer column named by ``CLUSTER``.
    """
    columns = ((SAMPLE, 'string'), (CLUSTER, 'integer'))

    from apache_beam.io.gcp.internal.clients import bigquery  # pylint: disable=wrong-import-order, wrong-import-position

    destination_schema = bigquery.TableSchema()
    for column_name, column_type in columns:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.type = column_type
        column.mode = 'required'
        destination_schema.fields.append(column)
    return destination_schema
def test_get_or_create_table_race_condition(self):
    """get_or_create_table yields 'table_id' even when Insert raises HTTP 409.

    The mocked ``tables.Insert`` raises ``HttpError`` with status '409' and
    the mocked ``tables.Get`` yields ``None`` first, then 'table_id'.
    """
    schema = bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(name='b', type='BOOLEAN', mode='REQUIRED'),
    ])
    conflict = HttpError(response={'status': '409'}, url='', content='')

    client = mock.Mock()
    client.tables.Insert.side_effect = conflict
    client.tables.Get.side_effect = [None, 'table_id']
    wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

    new_table = wrapper.get_or_create_table(
        'project_id', 'dataset_id', 'table_id', schema, False, False)

    self.assertEqual(new_table, 'table_id')
def competition_schema(self):
    """Return the competition table schema; every column is REQUIRED."""
    columns = (
        ('timestamp', 'TIMESTAMP'),
        ('id', 'STRING'),
        ('name', 'STRING'),
        ('region', 'STRING'),
        ('market_count', 'INTEGER'),
    )
    return bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(
            name=column_name, type=column_type, mode='REQUIRED')
        for column_name, column_type in columns
    ])
def test_simple_schemas(self):
    """check_schema_equal: each schema equals itself and differs from the next."""
    empty_schema = bigquery.TableSchema(fields=[])
    flat_schema = bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(name="a", mode="NULLABLE", type="INT64"),
    ])
    inner_field = bigquery.TableFieldSchema(
        name="c", mode="REQUIRED", type="BOOL")
    nested_schema = bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(
            name="b", mode="REPEATED", type="RECORD", fields=[inner_field]),
    ])

    # Assertion order matches the order in which the schemas are introduced.
    self.assertTrue(check_schema_equal(empty_schema, empty_schema))

    self.assertTrue(check_schema_equal(flat_schema, flat_schema))
    self.assertFalse(check_schema_equal(empty_schema, flat_schema))

    self.assertTrue(check_schema_equal(nested_schema, nested_schema))
    self.assertFalse(check_schema_equal(flat_schema, nested_schema))
def test_get_variant_query_no_region(self):
    """With genomic_regions=None the query is a plain SELECT on the input table."""
    args = self._create_mock_args(
        input_table='my_bucket:my_dataset.my_table', genomic_regions=None)

    reference_name_column = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
        type=bigquery_util.TableFieldConstants.TYPE_STRING,
        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
        description='Reference name.')
    schema = bigquery.TableSchema()
    schema.fields.append(reference_name_column)

    actual_query = bq_to_vcf._get_variant_query(args, schema)

    expected_query = (
        'SELECT reference_name FROM '
        '`my_bucket.my_dataset.my_table`')
    self.assertEqual(actual_query, expected_query)
def test_descriptions(self):
    """
    Schemas that differ only in field descriptions compare unequal by
    default and equal when ignore_descriptions=True.
    """
    described_a = bigquery.TableFieldSchema(
        name="a", mode="REQUIRED", type="FLOAT64", description="Field A")
    undescribed_b = bigquery.TableFieldSchema(
        name="b", mode="REQUIRED", type="INT64")
    schema1 = bigquery.TableSchema(fields=[described_a, undescribed_b])

    redescribed_a = bigquery.TableFieldSchema(
        name="a",
        mode="REQUIRED",
        type="FLOAT64",
        description="Field A is for Apple")
    described_b = bigquery.TableFieldSchema(
        name="b", mode="REQUIRED", type="INT64", description="Field B")
    schema2 = bigquery.TableSchema(fields=[redescribed_a, described_b])

    self.assertFalse(check_schema_equal(schema1, schema2))
    self.assertTrue(
        check_schema_equal(schema1, schema2, ignore_descriptions=True))
def run(input_topic, output_table, window_size=1.0, pipeline_args=None):
    """Stream web-server log lines from Pub/Sub into a BigQuery table.

    Args:
        input_topic: Pub/Sub topic passed to ``beam.io.ReadFromPubSub``.
        output_table: Destination table passed to ``beam.io.WriteToBigQuery``.
        window_size: Accepted for interface compatibility; not used in this
            function.
        pipeline_args: Extra arguments forwarded to ``PipelineOptions``.
    """
    options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True)

    with beam.Pipeline(options=options) as pipeline:
        from apache_beam.io.gcp.internal.clients import bigquery

        # Every output column is a nullable string.
        table_schema = bigquery.TableSchema()
        for column_name in ('host', 'time', 'request', 'status', 'size'):
            column = bigquery.TableFieldSchema()
            column.name = column_name
            column.type = 'string'
            column.mode = 'nullable'
            table_schema.fields.append(column)

        _ = (
            pipeline
            | "read data from pubsub" >> beam.io.ReadFromPubSub(
                topic=input_topic).with_output_types(bytes)
            | "lines" >> beam.Map(lambda x: x.decode("utf-8"))
            | "Parse server log data" >> beam.ParDo(ParseApacheServerLog())
            | "Write to Big Query" >> beam.io.WriteToBigQuery(
                output_table,
                schema=table_schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
        )
def _create_table(cls, table_name):
    """Create ``table_name`` with scalar and nested DATETIME columns, then load rows.

    The schema has columns number, string, time, datetime and a record 'rec'
    containing 'rec_datetime' plus a nested record 'rec_rec' with
    'rec_rec_datetime'. After the table is inserted, ``cls.TABLE_DATA`` is
    written with ``insert_rows``.

    Args:
        table_name: Table id used for both the insert request and the rows.
    """
    def _column(column_name, column_type, children=()):
        # Build one field; mode is left unset, as only name/type are assigned.
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.type = column_type
        for child in children:
            column.fields.append(child)
        return column

    inner_record = _column(
        'rec_rec', 'RECORD', [_column('rec_rec_datetime', 'DATETIME')])
    outer_record = _column(
        'rec', 'RECORD', [_column('rec_datetime', 'DATETIME'), inner_record])

    table_schema = bigquery.TableSchema()
    top_level_columns = (
        _column('number', 'INTEGER'),
        _column('string', 'STRING'),
        _column('time', 'TIME'),
        _column('datetime', 'DATETIME'),
        outer_record,
    )
    for column in top_level_columns:
        table_schema.fields.append(column)

    table_reference = bigquery.TableReference(
        projectId=cls.project, datasetId=cls.dataset_id, tableId=table_name)
    table = bigquery.Table(
        tableReference=table_reference, schema=table_schema)
    insert_request = bigquery.BigqueryTablesInsertRequest(
        projectId=cls.project, datasetId=cls.dataset_id, table=table)
    cls.bigquery_client.client.tables.Insert(insert_request)

    cls.bigquery_client.insert_rows(
        cls.project, cls.dataset_id, table_name, cls.TABLE_DATA)
def test_get_or_create_table_invalid_tablename(self):
    """get_or_create_table raises ValueError for each invalid table id.

    Covers an id with a hyphen, an id with a space and a 1025-character id.
    """
    for table_id in ('big-query', 'table name', 'a' * 1025):
        client = mock.Mock()
        client.tables.Get.side_effect = [None]
        wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)
        schema = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(
                name='b', type='BOOLEAN', mode='REQUIRED'),
        ])

        with self.assertRaises(ValueError):
            wrapper.get_or_create_table(
                'project_id', 'dataset_id', table_id, schema, False, False)
def expand(self, pcoll):
    """Parse JSON lines and append them to the 'relation_extraction_data' table.

    Args:
        pcoll: Input collection fed to ``beam.ParDo(Parse_json())``.

    Returns:
        The result of applying the parse and ``WriteToBigQuery`` steps.
    """
    column_names = (
        'head',
        'head_type',
        'relation',
        'tail',
        'tail_type',
        'sentence',
    )

    # Every column is a nullable string.
    table_schema = bigquery.TableSchema()
    for column_name in column_names:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.type = 'string'
        column.mode = 'nullable'
        table_schema.fields.append(column)

    return (
        pcoll
        | 'Parse the json lines' >> beam.ParDo(Parse_json())
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            table='relation_extraction_data',
            dataset=DATASET_ID,
            project=PROJECT_ID,
            schema=table_schema,  # Pass the defined table_schema
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def test_get_query_columns(self):
    """_get_query_columns returns only REFERENCE_NAME for this two-column schema."""
    nullable = bigquery_util.TableFieldConstants.MODE_NULLABLE
    reference_name = bigquery_util.ColumnKeyConstants.REFERENCE_NAME

    schema = bigquery.TableSchema()
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=reference_name,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=nullable,
            description='Reference name.'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name='partition_date_please_ignore',
            type='Date',
            mode=nullable,
            description='Column required by BigQuery partitioning logic.'))

    # The partition column is expected to be absent from the result.
    self.assertEqual(bq_to_vcf._get_query_columns(schema), [reference_name])
def test_table_schema_parsing_end_to_end(self):
    """Converting a schema to its dict form and back yields an equal schema."""
    def _nullable(column_name, column_type, **extra):
        # All columns in this test share the NULLABLE mode.
        return bigquery.TableFieldSchema(
            name=column_name, type=column_type, mode='NULLABLE', **extra)

    string_field = _nullable('s', 'STRING')
    nested_field = _nullable('x', 'INTEGER')
    number_field = _nullable('n', 'INTEGER')
    record_field = _nullable('r', 'RECORD', fields=[nested_field])
    schema = bigquery.TableSchema(
        fields=[string_field, number_field, record_field])

    dict_schema = beam.io.gcp.bigquery.WriteToBigQuery.get_dict_table_schema(
        schema)
    round_tripped = beam.io.gcp.bigquery.BigQueryWriteFn.get_table_schema(
        dict_schema)

    self.assertEqual(round_tripped, schema)
def get_bq_schema():
    """Return the BigQuery schema for the offer table; every column is NULLABLE.

    Returns:
        A ``bigquery.TableSchema`` with one field per (name, type) entry
        below, in the listed order.
    """
    columns = (
        ('offer_id', 'INTEGER'),
        ('campaign_id', 'INTEGER'),
        ('crm_product_id', 'INTEGER'),
        ('sku', 'STRING'),
        ('offer_name', 'STRING'),
        ('display_txt', 'STRING'),
        ('display_flag', 'INTEGER'),
        ('offer_type_id', 'INTEGER'),
        ('effective_from_date', 'TIMESTAMP'),
        ('effective_to_date', 'TIMESTAMP'),
        ('status', 'INTEGER'),
        ('is_exported', 'INTEGER'),
        ('exported_time', 'TIMESTAMP'),
        ('create_time', 'TIMESTAMP'),
        ('setting_time', 'TIMESTAMP'),
        ('create_by', 'INTEGER'),
        ('setting_by', 'INTEGER'),
        ('recommend', 'INTEGER'),
        ('internet_code', 'STRING'),
        ('need_synchronize', 'INTEGER'),
        ('is_web_exported', 'INTEGER'),
        ('limit_times', 'INTEGER'),
        ('qty', 'INTEGER'),
        ('special_price', 'FLOAT'),
        ('what_reward_definition_id', 'INTEGER'),
        ('times', 'INTEGER'),
        ('pos_sku_for_sap', 'STRING'),
        ('pos_sku_for_sap_time', 'TIMESTAMP'),
        ('calculate_in_report', 'INTEGER'),
        ('is_dragon_exported', 'INTEGER'),
        ('no_need_synchronize_dragon', 'INTEGER'),
        ('available_country', 'INTEGER'),
        ('external_id', 'INTEGER'),
        ('valid_online', 'INTEGER'),
        ('valid_offline', 'INTEGER'),
        ('valid_mobile_app', 'INTEGER'),
        ('country_id', 'INTEGER'),
    )

    table_schema = bigquery.TableSchema()
    for column_name, column_type in columns:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.type = column_type
        column.mode = 'NULLABLE'
        table_schema.fields.append(column)
    return table_schema
def test_table_with_write_disposition_append(self):
    """With WRITE_APPEND, opening the writer calls Get but not Delete or Insert."""
    table_reference = bigquery.TableReference(
        projectId='project', datasetId='dataset', tableId='table')
    existing_table = bigquery.Table(
        tableReference=table_reference, schema=bigquery.TableSchema())

    client = mock.Mock()
    client.tables.Get.return_value = existing_table
    client.tables.Insert.return_value = existing_table

    sink = beam.io.BigQuerySink(
        'project:dataset.table',
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
    with sink.writer(client):
        pass

    tables = client.tables
    self.assertTrue(tables.Get.called)
    self.assertFalse(tables.Delete.called)
    self.assertFalse(tables.Insert.called)
def market_schema(self):
    """Return the market table schema; only 'start_time' is NULLABLE."""
    columns = (
        ('timestamp', 'TIMESTAMP', 'REQUIRED'),
        ('id', 'STRING', 'REQUIRED'),
        ('name', 'STRING', 'REQUIRED'),
        ('total_matched', 'FLOAT', 'REQUIRED'),
        ('event_id', 'STRING', 'REQUIRED'),
        ('start_time', 'TIMESTAMP', 'NULLABLE'),
    )
    return bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(
            name=column_name, type=column_type, mode=column_mode)
        for column_name, column_type, column_mode in columns
    ])
def test_get_annotation_names_multiple_annotations(self):
    """_extract_annotation_names maps each annotation record to its field names.

    The schema holds one alternate-bases record with two annotation records,
    'CSQ_1' and 'CSQ_2', each carrying two string sub-fields.
    """
    constants = bigquery_util.TableFieldConstants

    def _annotation_record(record_name, sub_field_names):
        # Repeated RECORD whose children are nullable STRING fields.
        record = bigquery.TableFieldSchema(
            name=record_name,
            type=constants.TYPE_RECORD,
            mode=constants.MODE_REPEATED,
            description='desc')
        for sub_field_name in sub_field_names:
            record.fields.append(
                bigquery.TableFieldSchema(
                    name=sub_field_name,
                    type=constants.TYPE_STRING,
                    mode=constants.MODE_NULLABLE,
                    description='desc.'))
        return record

    alternate_bases_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
        type=constants.TYPE_RECORD,
        mode=constants.MODE_REPEATED,
        description='One record for each alternate base (if any).')
    alternate_bases_record.fields.append(
        _annotation_record('CSQ_1', ['allele', 'Consequence']))
    alternate_bases_record.fields.append(
        _annotation_record('CSQ_2', ['allele', 'IMPACT']))

    schema = bigquery.TableSchema()
    schema.fields.append(alternate_bases_record)

    expected_names = {
        'CSQ_1': ['allele', 'Consequence'],
        'CSQ_2': ['allele', 'IMPACT'],
    }
    self.assertEqual(
        bq_to_vcf._extract_annotation_names(schema), expected_names)
def build_bq_schema():
    """Return the schema: text, created_at, sentiment and a nested 'job' record.

    Every field, including the record and its children, has mode 'nullable'.
    """
    def _nullable(column_name, column_type):
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.type = column_type
        column.mode = 'nullable'
        return column

    table_schema = bigquery.TableSchema()
    top_level_columns = (
        ('text', 'string'),
        ('created_at', 'datetime'),
        ('sentiment', 'integer'),
    )
    for column_name, column_type in top_level_columns:
        table_schema.fields.append(_nullable(column_name, column_type))

    # nested field
    job_field = _nullable('job', 'record')
    job_columns = (
        ('job_id', 'string'),
        ('query', 'string'),
        ('created_at', 'datetime'),
    )
    for column_name, column_type in job_columns:
        job_field.fields.append(_nullable(column_name, column_type))
    table_schema.fields.append(job_field)

    return table_schema
def _setup_new_types_env(self):
    """Create NEW_TYPES_INPUT_TABLE with BYTES/DATE/TIME columns and load four rows.

    Fails the test when ``insert_rows`` reports that the insert did not pass.
    """
    table_schema = bigquery.TableSchema()
    column_types = (('bytes', 'BYTES'), ('date', 'DATE'), ('time', 'TIME'))
    for column_name, column_type in column_types:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.type = column_type
        table_schema.fields.append(column)

    table_reference = bigquery.TableReference(
        projectId=self.project,
        datasetId=self.dataset_id,
        tableId=NEW_TYPES_INPUT_TABLE)
    table = bigquery.Table(
        tableReference=table_reference, schema=table_schema)
    insert_request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(insert_request)

    raw_rows = (
        (b'xyw', '2011-01-01', '23:59:59.999999'),
        (b'abc', '2000-01-01', '00:00:00'),
        (b'\xe4\xbd\xa0\xe5\xa5\xbd', '3000-12-31', '23:59:59.990000'),
        (b'\xab\xac\xad', '2000-01-01', '00:00:00'),
    )
    # the API Tools bigquery client expects byte values to be base-64 encoded
    # TODO https://github.com/apache/beam/issues/19073: upgrade to
    # google-cloud-bigquery which does not require handling the encoding in
    # beam
    table_data = [
        {
            'bytes': base64.b64encode(payload).decode('utf-8'),
            'date': date_value,
            'time': time_value,
        }
        for payload, date_value, time_value in raw_rows
    ]

    passed, errors = self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)
    self.assertTrue(passed, 'Error in BQ setup: %s' % errors)
def __init__(self, row1, row2="", delimiter=","):
    """Store the inputs used to derive a BigQuery schema from a dataset.

    The schema is based on the first two rows of the dataset: a flat schema
    is fully described by the first row (the second is still parsed to make
    sure), while a nested schema is discovered by reading the values of the
    second row.

    This constructor only records its arguments and starts with an empty
    ``bigquery.TableSchema`` and an empty ``json_columns`` list.

    Args:
        row1: String that contains the column names of the file.
        row2: String that contains the first row of actual data.
        delimiter: Stored as ``self.delimiter``; presumably the separator
            between values in the rows (verify against the parsing code).
    """
    self.row1 = row1
    self.row2 = row2
    self.delimiter = delimiter
    self.json_columns = []
    self.table_schema = bigquery.TableSchema()
def create_schema(fields):
    """Build a nullable ``bigquery.TableSchema`` from a list of column names.

    Args:
        fields: Iterable of column names. "cookies" becomes a STRING column,
            "visitDate" a DATE column and every other name an INTEGER column.

    Returns:
        A ``bigquery.TableSchema`` with one field per name, in input order.
    """
    # Columns whose type is not the INTEGER default.
    special_types = {"cookies": "STRING", "visitDate": "DATE"}

    table_schema = bigquery.TableSchema()
    for column_name in fields:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.mode = "nullable"
        column.type = special_types.get(column_name, "INTEGER")
        table_schema.fields.append(column)
    return table_schema
def test_table_not_empty_and_write_disposition_empty(
        self, patched_time_sleep):
    """Opening a WRITE_EMPTY writer on a non-empty table raises RuntimeError.

    The mocked client reports an existing table with ``totalRows=1``.

    Args:
        patched_time_sleep: Not used in the body; presumably injected by a
            patch decorator that is outside this block (verify on the
            decorated definition).
    """
    table_reference = bigquery.TableReference(
        projectId='project', datasetId='dataset', tableId='table')
    client = mock.Mock()
    client.tables.Get.return_value = bigquery.Table(
        tableReference=table_reference, schema=bigquery.TableSchema())
    client.tabledata.List.return_value = bigquery.TableDataList(totalRows=1)
    write_disposition = beam.io.BigQueryDisposition.WRITE_EMPTY

    # assertRaisesRegexp was a deprecated alias (removed in Python 3.12);
    # assertRaisesRegex is the supported name.
    with self.assertRaisesRegex(
            RuntimeError,
            r'Table project:dataset\.table is not empty but write '
            r'disposition is WRITE_EMPTY'):
        with beam.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client):
            pass
def runner_schema(self):
    """Return the runner table schema; every column is REQUIRED."""
    columns = (
        ('timestamp', 'TIMESTAMP'),
        ('selection_id', 'INTEGER'),
        ('name', 'STRING'),
        ('market_id', 'STRING'),
        ('event_id', 'STRING'),
        ('sort_priority', 'INTEGER'),
        ('handicap', 'FLOAT'),
    )
    return bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(
            name=column_name, type=column_type, mode='REQUIRED')
        for column_name, column_type in columns
    ])
def test_row_as_table_row(self):
    """TableRowJsonCoder encodes a mixed-type row to JSON and decodes it back.

    Also checks that a coder built without a schema can decode the output.
    """
    # (column name, column type, cell value), in schema order.
    columns = [
        ('s', 'STRING', 'abc'),
        ('i', 'INTEGER', 123),
        ('f', 'FLOAT', 123.456),
        ('b', 'BOOLEAN', True),
        ('n', 'NUMERIC', decimal.Decimal('987654321.987654321')),
        ('r', 'RECORD', {'a': 'b'}),
        ('g', 'GEOGRAPHY', 'LINESTRING(1 2, 3 4, 5 6, 7 8)'),
    ]
    expected_json = (
        '{"s": "abc", '
        '"i": 123, '
        '"f": 123.456, '
        '"b": true, '
        '"n": "987654321.987654321", '
        '"r": {"a": "b"}, '
        '"g": "LINESTRING(1 2, 3 4, 5 6, 7 8)"}')

    schema = bigquery.TableSchema(fields=[
        bigquery.TableFieldSchema(name=column_name, type=column_type)
        for column_name, column_type, _ in columns
    ])
    coder = TableRowJsonCoder(table_schema=schema)

    def _cell_value(raw):
        # Decimals are converted through their string form.
        if isinstance(raw, decimal.Decimal):
            raw = str(raw)
        return to_json_value(raw)

    test_row = bigquery.TableRow(f=[
        bigquery.TableCell(v=_cell_value(cell_value))
        for _, _, cell_value in columns
    ])

    self.assertEqual(expected_json, coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    self.assertEqual(
        test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
def test_generate_header_fields_from_schema_invalid_description(self):
    """A newline in a field description appears as a space in the header info."""
    field_with_newline = bigquery.TableFieldSchema(
        name='invalid_description',
        type=bigquery_util.TableFieldConstants.TYPE_STRING,
        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
        description='Desc\nThis is added intentionally.')
    schema = bigquery.TableSchema()
    schema.fields.append(field_with_newline)

    actual_header = schema_converter.generate_header_fields_from_schema(schema)

    expected_info = Info(
        'invalid_description',
        1,
        'String',
        'Desc This is added intentionally.',
        None,
        None)
    expected_header = vcf_header_io.VcfHeader(
        infos=OrderedDict([('invalid_description', expected_info)]),
        formats=OrderedDict())
    self.assertEqual(actual_header, expected_header)
def test_no_table_and_create_if_needed(self):
    """With CREATE_IF_NEEDED and a 404 from Get, the writer calls Get and Insert."""
    table_reference = bigquery.TableReference(
        projectId='project', datasetId='dataset', tableId='table')
    created_table = bigquery.Table(
        tableReference=table_reference, schema=bigquery.TableSchema())
    not_found = HttpError(response={'status': '404'}, url='', content='')

    client = mock.Mock()
    client.tables.Get.side_effect = not_found
    client.tables.Insert.return_value = created_table

    sink = beam.io.BigQuerySink(
        'project:dataset.table',
        schema='somefield:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    with sink.writer(client):
        pass

    self.assertTrue(client.tables.Get.called)
    self.assertTrue(client.tables.Insert.called)