Ejemplo n.º 1
0
 def json_compliance_exception(self, value):
     """Assert that encoding ``value`` in a FLOAT column raises ValueError.

     A one-column schema (``f: FLOAT``) is built, ``value`` is wrapped in a
     single-cell ``TableRow`` and the row is encoded with
     ``TableRowJsonCoder``. The sequence must raise a ``ValueError`` whose
     message matches ``JSON_COMPLIANCE_ERROR`` literally.

     Args:
         value: The cell value, converted with ``to_json_value``.
     """
     # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
     # which was removed from unittest in Python 3.12.
     with self.assertRaisesRegex(ValueError,
                                 re.escape(JSON_COMPLIANCE_ERROR)):
         schema_definition = [('f', 'FLOAT')]
         schema = bigquery.TableSchema(fields=[
             bigquery.TableFieldSchema(name=k, type=v)
             for k, v in schema_definition
         ])
         coder = TableRowJsonCoder(table_schema=schema)
         test_row = bigquery.TableRow(
             f=[bigquery.TableCell(v=to_json_value(value))])
         coder.encode(test_row)
Ejemplo n.º 2
0
    def _make_schema(fields):
        """Build a ``bigquery.TableSchema`` from nested field triples.

        Args:
            fields: Iterable of ``(name, type, nested_fields)`` triples. A
                truthy ``nested_fields`` is itself an iterable of such
                triples and becomes the sub-fields of that column.

        Returns:
            A ``bigquery.TableSchema`` whose ``fields`` mirror ``fields``.
        """
        def _convert(specs):
            # Recursively turn every triple into a TableFieldSchema.
            converted = []
            for spec in specs:
                column = bigquery.TableFieldSchema()
                column.name, column.type, children = spec
                if children:
                    column.fields = _convert(children)
                converted.append(column)
            return converted

        result = bigquery.TableSchema()
        result.fields = _convert(fields)
        return result
 def test_get_or_create_table(self):
     """Check that get_or_create_table returns 'table_id' with a mocked client.

     ``tables.Get`` yields ``None`` and then ``'table_id'``;
     ``tables.Insert`` yields ``'table_id'``.
     """
     api_client = mock.Mock()
     api_client.tables.Insert.return_value = 'table_id'
     api_client.tables.Get.side_effect = [None, 'table_id']
     bq_wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(api_client)

     required_bool = bigquery.TableFieldSchema(
         name='b', type='BOOLEAN', mode='REQUIRED')
     schema = bigquery.TableSchema(fields=[required_bool])

     result = bq_wrapper.get_or_create_table(
         'project_id', 'dataset_id', 'table_id', schema, False, False)
     self.assertEqual(result, 'table_id')
    def _get_table_schema(self):
        # type: () -> bigquery.TableSchema
        """Build the schema fixture: five INFO columns and a call record.

        Returns:
            A ``bigquery.TableSchema`` with columns IB, IBR, II, IF and IS,
            followed by the repeated ``ColumnKeyConstants.CALLS`` record that
            holds FB, FI and FS.
        """
        def _field(name, field_type, mode, description):
            return bigquery.TableFieldSchema(name=name,
                                             type=field_type,
                                             mode=mode,
                                             description=description)

        nullable = TableFieldConstants.MODE_NULLABLE
        repeated = TableFieldConstants.MODE_REPEATED

        info_columns = (
            ('IB', TableFieldConstants.TYPE_BOOLEAN, nullable),
            ('IBR', TableFieldConstants.TYPE_BOOLEAN, repeated),
            ('II', TableFieldConstants.TYPE_INTEGER, nullable),
            ('IF', TableFieldConstants.TYPE_FLOAT, repeated),
            ('IS', TableFieldConstants.TYPE_STRING, repeated),
        )
        format_columns = (
            ('FB', TableFieldConstants.TYPE_BOOLEAN, nullable),
            ('FI', TableFieldConstants.TYPE_INTEGER, nullable),
            ('FS', TableFieldConstants.TYPE_STRING, repeated),
        )

        schema = bigquery.TableSchema()
        for name, field_type, mode in info_columns:
            schema.fields.append(
                _field(name, field_type, mode, 'INFO foo desc'))

        # Call record.
        call_record = _field(ColumnKeyConstants.CALLS,
                             TableFieldConstants.TYPE_RECORD,
                             repeated,
                             'One record for each call.')
        for name, field_type, mode in format_columns:
            call_record.fields.append(
                _field(name, field_type, mode, 'FORMAT foo desc'))
        schema.fields.append(call_record)
        return schema
    def test_generate_header_fields_from_schema_date_type(self):
        """A schema with only the partition 'Date' column gives an empty header."""
        partition_column = bigquery.TableFieldSchema(
            name='partition_date_please_ignore',
            type='Date',
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Column required by BigQuery partitioning logic.')
        schema = bigquery.TableSchema()
        schema.fields.append(partition_column)

        actual_header = schema_converter.generate_header_fields_from_schema(
            schema)

        # Neither INFO nor FORMAT entries are expected.
        empty_header = vcf_header_io.VcfHeader(infos=OrderedDict(),
                                               formats=OrderedDict())
        self.assertEqual(actual_header, empty_header)
def get_table_schema():
  """Build the schema of the destination table.

  Returns:
    A ``bigquery.TableSchema`` with two required columns: ``SAMPLE`` (string)
    and ``CLUSTER`` (integer).
  """
  columns = ((SAMPLE, 'string', 'required'), (CLUSTER, 'integer', 'required'))
  from apache_beam.io.gcp.internal.clients import bigquery  # pylint: disable=wrong-import-order, wrong-import-position
  schema = bigquery.TableSchema()
  for name, kind, mode in columns:
    schema.fields.append(
        bigquery.TableFieldSchema(name=name, type=kind, mode=mode))
  return schema
Ejemplo n.º 7
0
 def test_get_or_create_table_race_condition(self):
     """get_or_create_table returns 'table_id' although tables.Insert fails.

     ``tables.Insert`` raises an ``HttpError`` with status 409, while
     ``tables.Get`` yields ``None`` and then ``'table_id'``.
     """
     conflict = HttpError(response={'status': '409'}, url='', content='')
     api_client = mock.Mock()
     api_client.tables.Insert.side_effect = conflict
     api_client.tables.Get.side_effect = [None, 'table_id']
     bq_wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(api_client)

     required_bool = bigquery.TableFieldSchema(
         name='b', type='BOOLEAN', mode='REQUIRED')
     schema = bigquery.TableSchema(fields=[required_bool])

     result = bq_wrapper.get_or_create_table(
         'project_id', 'dataset_id', 'table_id', schema, False, False)
     self.assertEqual(result, 'table_id')
Ejemplo n.º 8
0
 def competition_schema(self):
     """Return the competition table schema; every column is REQUIRED."""
     columns = (
         ('timestamp', 'TIMESTAMP'),
         ('id', 'STRING'),
         ('name', 'STRING'),
         ('region', 'STRING'),
         ('market_count', 'INTEGER'),
     )
     return bigquery.TableSchema(fields=[
         bigquery.TableFieldSchema(name=name, type=kind, mode='REQUIRED')
         for name, kind in columns
     ])
Ejemplo n.º 9
0
    def test_simple_schemas(self):
        """check_schema_equal on an empty, a flat and a nested schema.

        Each schema must equal itself; the empty/flat and flat/nested pairs
        must compare unequal.
        """
        empty = bigquery.TableSchema(fields=[])
        self.assertTrue(check_schema_equal(empty, empty))

        int_column = bigquery.TableFieldSchema(
            name="a", mode="NULLABLE", type="INT64")
        flat = bigquery.TableSchema(fields=[int_column])
        self.assertTrue(check_schema_equal(flat, flat))
        self.assertFalse(check_schema_equal(empty, flat))

        inner_column = bigquery.TableFieldSchema(
            name="c", mode="REQUIRED", type="BOOL")
        record_column = bigquery.TableFieldSchema(
            name="b", mode="REPEATED", type="RECORD", fields=[inner_column])
        nested = bigquery.TableSchema(fields=[record_column])
        self.assertTrue(check_schema_equal(nested, nested))
        self.assertFalse(check_schema_equal(flat, nested))
Ejemplo n.º 10
0
 def test_get_variant_query_no_region(self):
   """With genomic_regions=None the query is a plain SELECT on the table."""
   args = self._create_mock_args(
       input_table='my_bucket:my_dataset.my_table',
       genomic_regions=None)
   reference_name_field = bigquery.TableFieldSchema(
       name=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
       type=bigquery_util.TableFieldConstants.TYPE_STRING,
       mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
       description='Reference name.')
   schema = bigquery.TableSchema()
   schema.fields.append(reference_name_field)

   expected_query = ('SELECT reference_name FROM '
                     '`my_bucket.my_dataset.my_table`')
   self.assertEqual(bq_to_vcf._get_variant_query(args, schema),
                    expected_query)
Ejemplo n.º 11
0
  def test_descriptions(self):
    """Description differences are ignored only with ignore_descriptions=True.

    Both schemas have the same names, modes and types; only the field
    descriptions differ.
    """
    def _required(name, field_type, **extra):
      return bigquery.TableFieldSchema(
          name=name, mode="REQUIRED", type=field_type, **extra)

    terse = bigquery.TableSchema(
        fields=[
            _required("a", "FLOAT64", description="Field A"),
            _required("b", "INT64"),
        ])
    verbose = bigquery.TableSchema(
        fields=[
            _required("a", "FLOAT64", description="Field A is for Apple"),
            _required("b", "INT64", description="Field B"),
        ])

    self.assertFalse(check_schema_equal(terse, verbose))
    self.assertTrue(
        check_schema_equal(terse, verbose, ignore_descriptions=True))
def run(input_topic, output_table, window_size=1.0, pipeline_args=None):
  """Stream parsed Apache server logs from Pub/Sub into BigQuery.

  Args:
    input_topic: Pub/Sub topic handed to ``beam.io.ReadFromPubSub``.
    output_table: Destination table handed to ``beam.io.WriteToBigQuery``.
    window_size: Not used by this function; kept so that existing callers
      passing it keep working.
    pipeline_args: Extra arguments forwarded to ``PipelineOptions``.
  """
  pipeline_options = PipelineOptions(
      pipeline_args, streaming=True, save_main_session=True
  )
  with beam.Pipeline(options=pipeline_options) as p:
    from apache_beam.io.gcp.internal.clients import bigquery

    # Every output column is a nullable string, so build them in one loop.
    table_schema = bigquery.TableSchema()
    for column_name in ('host', 'time', 'request', 'status', 'size'):
      column = bigquery.TableFieldSchema()
      column.name = column_name
      column.type = 'string'
      column.mode = 'nullable'
      table_schema.fields.append(column)

    # The resulting PCollection is never read afterwards, so it is not bound
    # to a name.
    (
        p
        | "read data from pubsub" >> beam.io.ReadFromPubSub(
            topic=input_topic).with_output_types(bytes)
        | "lines" >> beam.Map(lambda x: x.decode("utf-8"))
        | "Parse server log data" >> beam.ParDo(ParseApacheServerLog())
        | "Write to Big Query" >> beam.io.WriteToBigQuery(
            output_table,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
    )
Ejemplo n.º 13
0
    def _create_table(cls, table_name):
        """Create ``table_name`` under ``cls.dataset_id`` and insert ``cls.TABLE_DATA``.

        The schema has four scalar columns (number, string, time, datetime)
        and a ``rec`` RECORD holding ``rec_datetime`` plus a nested
        ``rec_rec`` RECORD with ``rec_rec_datetime``.

        Args:
            table_name: Id of the table to create and fill.
        """
        def _column(name, kind, children=()):
            # Name and type are assigned as attributes; sub-fields, if any,
            # are appended in the given order.
            spec = bigquery.TableFieldSchema()
            spec.name = name
            spec.type = kind
            for child in children:
                spec.fields.append(child)
            return spec

        inner_record = _column(
            'rec_rec', 'RECORD', [_column('rec_rec_datetime', 'DATETIME')])
        outer_record = _column(
            'rec', 'RECORD',
            [_column('rec_datetime', 'DATETIME'), inner_record])

        table_schema = bigquery.TableSchema()
        top_level_columns = (
            _column('number', 'INTEGER'),
            _column('string', 'STRING'),
            _column('time', 'TIME'),
            _column('datetime', 'DATETIME'),
            outer_record,
        )
        for top_level in top_level_columns:
            table_schema.fields.append(top_level)

        table_reference = bigquery.TableReference(
            projectId=cls.project,
            datasetId=cls.dataset_id,
            tableId=table_name)
        table = bigquery.Table(
            tableReference=table_reference, schema=table_schema)
        insert_request = bigquery.BigqueryTablesInsertRequest(
            projectId=cls.project, datasetId=cls.dataset_id, table=table)
        cls.bigquery_client.client.tables.Insert(insert_request)
        cls.bigquery_client.insert_rows(
            cls.project, cls.dataset_id, table_name, cls.TABLE_DATA)
Ejemplo n.º 14
0
    def test_get_or_create_table_invalid_tablename(self):
        """get_or_create_table raises ValueError for each table id tried here."""
        for bad_table_id in ('big-query', 'table name', 'a' * 1025):
            # A fresh mocked client and wrapper per table id.
            api_client = mock.Mock()
            api_client.tables.Get.side_effect = [None]
            bq_wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(
                api_client)
            schema = bigquery.TableSchema(fields=[
                bigquery.TableFieldSchema(
                    name='b', type='BOOLEAN', mode='REQUIRED')
            ])

            with self.assertRaises(ValueError):
                bq_wrapper.get_or_create_table(
                    'project_id', 'dataset_id', bad_table_id, schema,
                    False, False)
Ejemplo n.º 15
0
    def expand(self, pcoll):
        """Parse the incoming JSON lines and write them to BigQuery.

        Args:
            pcoll: Input PCollection, fed to ``beam.ParDo(Parse_json())``.

        Returns:
            The result of applying ``beam.io.WriteToBigQuery`` for table
            ``relation_extraction_data`` with six nullable string columns.
        """
        column_names = (
            'head',
            'head_type',
            'relation',
            'tail',
            'tail_type',
            'sentence',
        )
        table_schema = bigquery.TableSchema()
        for column_name in column_names:
            column = bigquery.TableFieldSchema()
            column.name = column_name
            column.type = 'string'
            column.mode = 'nullable'
            table_schema.fields.append(column)

        parsed = pcoll | 'Parse the json lines' >> beam.ParDo(Parse_json())
        sink = beam.io.WriteToBigQuery(
            table='relation_extraction_data',
            dataset=DATASET_ID,
            project=PROJECT_ID,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
        return parsed | 'WriteToBigQuery' >> sink
Ejemplo n.º 16
0
 def test_get_query_columns(self):
   """_get_query_columns returns only the reference name column here.

   The schema also holds a 'Date' partition column, which must not appear
   in the result.
   """
   nullable = bigquery_util.TableFieldConstants.MODE_NULLABLE
   reference_name_field = bigquery.TableFieldSchema(
       name=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
       type=bigquery_util.TableFieldConstants.TYPE_STRING,
       mode=nullable,
       description='Reference name.')
   partition_field = bigquery.TableFieldSchema(
       name='partition_date_please_ignore',
       type='Date',
       mode=nullable,
       description='Column required by BigQuery partitioning logic.')

   schema = bigquery.TableSchema()
   for column in (reference_name_field, partition_field):
     schema.fields.append(column)

   self.assertEqual(
       bq_to_vcf._get_query_columns(schema),
       [bigquery_util.ColumnKeyConstants.REFERENCE_NAME])
Ejemplo n.º 17
0
 def test_table_schema_parsing_end_to_end(self):
   """A schema survives get_dict_table_schema followed by get_table_schema."""
   def _nullable(name, field_type, **extra):
     return bigquery.TableFieldSchema(
         name=name, type=field_type, mode='NULLABLE', **extra)

   string_field = _nullable('s', 'STRING')
   nested_field = _nullable('x', 'INTEGER')
   number_field = _nullable('n', 'INTEGER')
   record_field = _nullable('r', 'RECORD', fields=[nested_field])
   schema = bigquery.TableSchema(
       fields=[string_field, number_field, record_field])

   as_dict = beam.io.gcp.bigquery.WriteToBigQuery.get_dict_table_schema(
       schema)
   round_tripped = beam.io.gcp.bigquery.BigQueryWriteFn.get_table_schema(
       as_dict)
   self.assertEqual(round_tripped, schema)
    def get_bq_schema():
        """Return the offer table schema; every column is NULLABLE.

        Returns:
            A ``bigquery.TableSchema`` whose columns follow the order of
            ``columns`` below.
        """
        columns = (
            ('offer_id', 'INTEGER'),
            ('campaign_id', 'INTEGER'),
            ('crm_product_id', 'INTEGER'),
            ('sku', 'STRING'),
            ('offer_name', 'STRING'),
            ('display_txt', 'STRING'),
            ('display_flag', 'INTEGER'),
            ('offer_type_id', 'INTEGER'),
            ('effective_from_date', 'TIMESTAMP'),
            ('effective_to_date', 'TIMESTAMP'),
            ('status', 'INTEGER'),
            ('is_exported', 'INTEGER'),
            ('exported_time', 'TIMESTAMP'),
            ('create_time', 'TIMESTAMP'),
            ('setting_time', 'TIMESTAMP'),
            ('create_by', 'INTEGER'),
            ('setting_by', 'INTEGER'),
            ('recommend', 'INTEGER'),
            ('internet_code', 'STRING'),
            ('need_synchronize', 'INTEGER'),
            ('is_web_exported', 'INTEGER'),
            ('limit_times', 'INTEGER'),
            ('qty', 'INTEGER'),
            ('special_price', 'FLOAT'),
            ('what_reward_definition_id', 'INTEGER'),
            ('times', 'INTEGER'),
            ('pos_sku_for_sap', 'STRING'),
            ('pos_sku_for_sap_time', 'TIMESTAMP'),
            ('calculate_in_report', 'INTEGER'),
            ('is_dragon_exported', 'INTEGER'),
            ('no_need_synchronize_dragon', 'INTEGER'),
            ('available_country', 'INTEGER'),
            ('external_id', 'INTEGER'),
            ('valid_online', 'INTEGER'),
            ('valid_offline', 'INTEGER'),
            ('valid_mobile_app', 'INTEGER'),
            ('country_id', 'INTEGER'),
        )

        table_schema = bigquery.TableSchema()
        for column_name, column_type in columns:
            column = bigquery.TableFieldSchema()
            column.name = column_name
            column.type = column_type
            column.mode = 'NULLABLE'
            table_schema.fields.append(column)
        return table_schema
Ejemplo n.º 19
0
 def test_table_with_write_disposition_append(self):
     """With WRITE_APPEND the writer calls tables.Get but not Delete/Insert.

     The mocked client returns an existing table from ``tables.Get``.
     """
     table_reference = bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table')
     existing_table = bigquery.Table(
         tableReference=table_reference, schema=bigquery.TableSchema())

     client = mock.Mock()
     client.tables.Get.return_value = existing_table
     client.tables.Insert.return_value = existing_table

     sink = beam.io.BigQuerySink(
         'project:dataset.table',
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
     # Entering and leaving the writer context is all the test needs.
     with sink.writer(client):
         pass

     self.assertTrue(client.tables.Get.called)
     self.assertFalse(client.tables.Delete.called)
     self.assertFalse(client.tables.Insert.called)
Ejemplo n.º 20
0
 def market_schema(self):
     """Return the market table schema.

     All columns are REQUIRED except ``start_time``, which is NULLABLE.
     """
     columns = (
         ('timestamp', 'TIMESTAMP', 'REQUIRED'),
         ('id', 'STRING', 'REQUIRED'),
         ('name', 'STRING', 'REQUIRED'),
         ('total_matched', 'FLOAT', 'REQUIRED'),
         ('event_id', 'STRING', 'REQUIRED'),
         ('start_time', 'TIMESTAMP', 'NULLABLE'),
     )
     return bigquery.TableSchema(fields=[
         bigquery.TableFieldSchema(name=name, type=kind, mode=mode)
         for name, kind, mode in columns
     ])
 def test_get_annotation_names_multiple_annotations(self):
     """_extract_annotation_names maps each annotation record to its fields.

     The alternate-bases record holds two annotation records, CSQ_1 and
     CSQ_2, each with two nullable string sub-fields.
     """
     constants = bigquery_util.TableFieldConstants

     def _repeated_record(name, description):
         return bigquery.TableFieldSchema(
             name=name,
             type=constants.TYPE_RECORD,
             mode=constants.MODE_REPEATED,
             description=description)

     def _annotation(name, sub_field_names):
         record = _repeated_record(name, 'desc')
         for sub_field_name in sub_field_names:
             record.fields.append(
                 bigquery.TableFieldSchema(
                     name=sub_field_name,
                     type=constants.TYPE_STRING,
                     mode=constants.MODE_NULLABLE,
                     description='desc.'))
         return record

     schema = bigquery.TableSchema()
     alternate_bases_record = _repeated_record(
         bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
         'One record for each alternate base (if any).')
     alternate_bases_record.fields.append(
         _annotation('CSQ_1', ['allele', 'Consequence']))
     alternate_bases_record.fields.append(
         _annotation('CSQ_2', ['allele', 'IMPACT']))
     schema.fields.append(alternate_bases_record)

     expected_names = {
         'CSQ_1': ['allele', 'Consequence'],
         'CSQ_2': ['allele', 'IMPACT']
     }
     self.assertEqual(
         bq_to_vcf._extract_annotation_names(schema), expected_names)
Ejemplo n.º 22
0
def build_bq_schema():
    """Build the table schema: three scalar columns and a nested ``job`` record.

    Returns:
        A ``bigquery.TableSchema`` with nullable columns ``text``,
        ``created_at`` and ``sentiment`` plus the nullable ``job`` record
        holding ``job_id``, ``query`` and ``created_at``.
    """
    def _nullable(name, kind):
        column = bigquery.TableFieldSchema()
        column.name = name
        column.type = kind
        column.mode = 'nullable'
        return column

    table_schema = bigquery.TableSchema()
    scalar_columns = (
        ('text', 'string'),
        ('created_at', 'datetime'),
        ('sentiment', 'integer'),
    )
    for name, kind in scalar_columns:
        table_schema.fields.append(_nullable(name, kind))

    # nested field
    job_field = _nullable('job', 'record')
    job_columns = (
        ('job_id', 'string'),
        ('query', 'string'),
        ('created_at', 'datetime'),
    )
    for name, kind in job_columns:
        job_field.fields.append(_nullable(name, kind))
    table_schema.fields.append(job_field)

    return table_schema
Ejemplo n.º 23
0
 def _setup_new_types_env(self):
     """Create NEW_TYPES_INPUT_TABLE (BYTES, DATE, TIME) and insert four rows.

     Fails the test if ``insert_rows`` reports that the insert did not pass.
     """
     table_schema = bigquery.TableSchema()
     for column_name, column_type in (('bytes', 'BYTES'),
                                      ('date', 'DATE'),
                                      ('time', 'TIME')):
         column = bigquery.TableFieldSchema()
         column.name = column_name
         column.type = column_type
         table_schema.fields.append(column)

     table_reference = bigquery.TableReference(
         projectId=self.project,
         datasetId=self.dataset_id,
         tableId=NEW_TYPES_INPUT_TABLE)
     table = bigquery.Table(
         tableReference=table_reference, schema=table_schema)
     insert_request = bigquery.BigqueryTablesInsertRequest(
         projectId=self.project, datasetId=self.dataset_id, table=table)
     self.bigquery_client.client.tables.Insert(insert_request)

     raw_rows = (
         (b'xyw', '2011-01-01', '23:59:59.999999'),
         (b'abc', '2000-01-01', '00:00:00'),
         (b'\xe4\xbd\xa0\xe5\xa5\xbd', '3000-12-31', '23:59:59.990000'),
         (b'\xab\xac\xad', '2000-01-01', '00:00:00'),
     )
     # the API Tools bigquery client expects byte values to be base-64 encoded
     # TODO https://github.com/apache/beam/issues/19073: upgrade to
     # google-cloud-bigquery which does not require handling the encoding in
     # beam
     table_data = [{
         'bytes': base64.b64encode(raw_bytes).decode('utf-8'),
         'date': date_value,
         'time': time_value
     } for raw_bytes, date_value, time_value in raw_rows]

     passed, errors = self.bigquery_client.insert_rows(
         self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)
     self.assertTrue(passed, 'Error in BQ setup: %s' % errors)
Ejemplo n.º 24
0
    def __init__(self, row1, row2="", delimiter=","):
        """Store the first two rows and start with an empty BigQuery schema.

        The schema is meant to be derived from the first two rows of the
        dataset:
        - for a flat schema only the first row matters, but the second row
          still has to be parsed to make sure;
        - for a nested schema the nesting is discovered by reading the values
          of the second row.

        Args:
            row1: String that contains the column names of the file.
            row2: String that contains the first row of actual data.
            delimiter: Stored as ``self.delimiter``; presumably the column
                separator of both rows -- confirm against the parsing code.
        """
        self.row1 = row1
        self.row2 = row2
        self.delimiter = delimiter
        self.json_columns = []
        self.table_schema = bigquery.TableSchema()
Ejemplo n.º 25
0
def create_schema(fields):
    """Build a schema with one nullable column per entry of ``fields``.

    Args:
        fields: Iterable of column names.

    Returns:
        A ``bigquery.TableSchema`` where ``cookies`` is a STRING,
        ``visitDate`` is a DATE and every other column is an INTEGER.
    """
    special_types = {"cookies": "STRING", "visitDate": "DATE"}

    table_schema = bigquery.TableSchema()
    for column_name in fields:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.mode = "nullable"
        # Anything without a special type defaults to INTEGER.
        column.type = special_types.get(column_name, "INTEGER")
        table_schema.fields.append(column)
    return table_schema
Ejemplo n.º 26
0
 def test_table_not_empty_and_write_disposition_empty(
     self, patched_time_sleep):
   """WRITE_EMPTY on a table that already has a row raises RuntimeError.

   The mocked client returns an existing table from ``tables.Get`` and a
   ``TableDataList`` with ``totalRows=1`` from ``tabledata.List``.

   Args:
     patched_time_sleep: Not used in the body; presumably injected by a
       patch decorator outside this view -- confirm at the definition site.
   """
   client = mock.Mock()
   client.tables.Get.return_value = bigquery.Table(
       tableReference=bigquery.TableReference(
           projectId='project', datasetId='dataset', tableId='table'),
       schema=bigquery.TableSchema())
   client.tabledata.List.return_value = bigquery.TableDataList(totalRows=1)
   write_disposition = beam.io.BigQueryDisposition.WRITE_EMPTY
   # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
   # which was removed from unittest in Python 3.12.
   with self.assertRaisesRegex(
       RuntimeError, r'Table project:dataset\.table is not empty but write '
                     r'disposition is WRITE_EMPTY'):
     with beam.io.BigQuerySink(
         'project:dataset.table',
         write_disposition=write_disposition).writer(client):
       pass
Ejemplo n.º 27
0
 def runner_schema(self):
     """Return the runner table schema; every column is REQUIRED."""
     columns = (
         ('timestamp', 'TIMESTAMP'),
         ('selection_id', 'INTEGER'),
         ('name', 'STRING'),
         ('market_id', 'STRING'),
         ('event_id', 'STRING'),
         ('sort_priority', 'INTEGER'),
         ('handicap', 'FLOAT'),
     )
     return bigquery.TableSchema(fields=[
         bigquery.TableFieldSchema(name=name, type=kind, mode='REQUIRED')
         for name, kind in columns
     ])
Ejemplo n.º 28
0
  def test_row_as_table_row(self):
    """TableRowJsonCoder encodes a mixed-type row to JSON and decodes it back.

    Covers STRING, INTEGER, FLOAT, BOOLEAN, NUMERIC, RECORD and GEOGRAPHY
    columns; the Decimal value is stringified before conversion.
    """
    column_specs = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                    ('b', 'BOOLEAN'), ('n', 'NUMERIC'), ('r', 'RECORD'),
                    ('g', 'GEOGRAPHY')]
    cell_values = [
        'abc',
        123,
        123.456,
        True,
        decimal.Decimal('987654321.987654321'),
        {'a': 'b'},
        'LINESTRING(1 2, 3 4, 5 6, 7 8)',
    ]
    expected_json = (
        '{"s": "abc", '
        '"i": 123, '
        '"f": 123.456, '
        '"b": true, '
        '"n": "987654321.987654321", '
        '"r": {"a": "b"}, '
        '"g": "LINESTRING(1 2, 3 4, 5 6, 7 8)"}')

    schema_fields = []
    for column_name, column_type in column_specs:
      schema_fields.append(
          bigquery.TableFieldSchema(name=column_name, type=column_type))
    coder = TableRowJsonCoder(
        table_schema=bigquery.TableSchema(fields=schema_fields))

    def _as_json_value(cell):
      # Decimals go through str() first; everything else is converted as is.
      plain = str(cell) if isinstance(cell, decimal.Decimal) else cell
      return to_json_value(plain)

    cells = [bigquery.TableCell(v=_as_json_value(cell)) for cell in cell_values]
    test_row = bigquery.TableRow(f=cells)

    self.assertEqual(expected_json, coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    schemaless_coder = TableRowJsonCoder()
    self.assertEqual(
        test_row, schemaless_coder.decode(coder.encode(test_row)))
    def test_generate_header_fields_from_schema_invalid_description(self):
        """A newline in a column description shows up as a space in the header."""
        column_with_newline = bigquery.TableFieldSchema(
            name='invalid_description',
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Desc\nThis is added intentionally.')
        schema = bigquery.TableSchema()
        schema.fields.append(column_with_newline)

        actual_header = schema_converter.generate_header_fields_from_schema(
            schema)

        expected_info = Info('invalid_description', 1, 'String',
                             'Desc This is added intentionally.', None, None)
        expected_header = vcf_header_io.VcfHeader(
            infos=OrderedDict([('invalid_description', expected_info)]),
            formats=OrderedDict())
        self.assertEqual(actual_header, expected_header)
Ejemplo n.º 30
0
 def test_no_table_and_create_if_needed(self):
     """With CREATE_IF_NEEDED the writer calls both tables.Get and Insert.

     ``tables.Get`` is mocked to raise an ``HttpError`` with status 404.
     """
     table_reference = bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table')
     created_table = bigquery.Table(
         tableReference=table_reference, schema=bigquery.TableSchema())

     client = mock.Mock()
     client.tables.Get.side_effect = HttpError(
         response={'status': '404'}, url='', content='')
     client.tables.Insert.return_value = created_table

     sink = beam.io.BigQuerySink(
         'project:dataset.table',
         schema='somefield:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
     # Entering and leaving the writer context is all the test needs.
     with sink.writer(client):
         pass

     self.assertTrue(client.tables.Get.called)
     self.assertTrue(client.tables.Insert.called)