def _setup_new_types_env(self):
   """Create the new-types input table and load its fixture rows.

   The table gets one BYTES, one DATE and one TIME column. It is inserted
   with ``self.bigquery_client.client.tables.Insert`` and the rows are then
   sent with ``self.bigquery_client.insert_rows``.
   """
   schema = bigquery.TableSchema()
   for column_name, column_type in (('bytes', 'BYTES'), ('date', 'DATE'),
                                    ('time', 'TIME')):
     column = bigquery.TableFieldSchema()
     column.name = column_name
     column.type = column_type
     schema.fields.append(column)

   table_ref = bigquery.TableReference(
       projectId=self.project,
       datasetId=self.dataset_id,
       tableId=NEW_TYPES_INPUT_TABLE)
   insert_request = bigquery.BigqueryTablesInsertRequest(
       projectId=self.project,
       datasetId=self.dataset_id,
       table=bigquery.Table(tableReference=table_ref, schema=schema))
   self.bigquery_client.client.tables.Insert(insert_request)

   raw_rows = [
       (b'xyw', '2011-01-01', '23:59:59.999999'),
       (b'abc', '2000-01-01', '00:00:00'),
       (b'\xe4\xbd\xa0\xe5\xa5\xbd', '3000-12-31', '23:59:59.990000'),
       (b'\xab\xac\xad', '2000-01-01', '00:00:00'),
   ]
   # the API Tools bigquery client expects byte values to be base-64 encoded
   # TODO BEAM-4850: upgrade to google-cloud-bigquery which does not require
   # handling the encoding in beam
   rows = [{
       'bytes': base64.b64encode(payload).decode('utf-8'),
       'date': date_value,
       'time': time_value,
   } for payload, date_value, time_value in raw_rows]
   self.bigquery_client.insert_rows(
       self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, rows)
 def _setup_new_types_env(self):
     """Create the new-types input table and load three fixture rows.

     The table gets one BYTES, one DATE and one TIME column. The byte
     literals are handed to ``insert_rows`` unchanged; no encoding step is
     performed here.
     """
     schema = bigquery.TableSchema()
     for column_name, column_type in (('bytes', 'BYTES'), ('date', 'DATE'),
                                      ('time', 'TIME')):
         column = bigquery.TableFieldSchema()
         column.name = column_name
         column.type = column_type
         schema.fields.append(column)

     table_ref = bigquery.TableReference(
         projectId=self.project,
         datasetId=self.dataset_id,
         tableId=NEW_TYPES_INPUT_TABLE)
     insert_request = bigquery.BigqueryTablesInsertRequest(
         projectId=self.project,
         datasetId=self.dataset_id,
         table=bigquery.Table(tableReference=table_ref, schema=schema))
     self.bigquery_client.client.tables.Insert(insert_request)

     column_names = ('bytes', 'date', 'time')
     row_values = (
         (b'xyw=', '2011-01-01', '23:59:59.999999'),
         (b'abc=', '2000-01-01', '00:00:00'),
         (b'dec=', '3000-12-31', '23:59:59.990000'),
     )
     rows = [dict(zip(column_names, values)) for values in row_values]
     self.bigquery_client.insert_rows(
         self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, rows)
    def test_add_format_fields_reserved_field(self):
        """Check the FORMAT entry produced for the 'GQ' call sub-field.

        Two cases are exercised: a field carrying its own description, which
        is expected to be reused, and a field with an empty description,
        which is expected to yield 'Conditional genotype quality'.
        """
        cases = (
            ('bigquery desc', 'bigquery desc'),
            ('', 'Conditional genotype quality'),
        )
        for field_description, expected_description in cases:
            calls_record = bigquery.TableFieldSchema(
                name=bigquery_util.ColumnKeyConstants.CALLS,
                type=bigquery_util.TableFieldConstants.TYPE_RECORD,
                mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
                description='One record for each call.')
            gq_field = bigquery.TableFieldSchema(
                name='GQ',
                type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description=field_description)
            calls_record.fields.append(gq_field)

            actual_formats = OrderedDict()
            schema_converter._add_format_fields(calls_record, actual_formats)

            expected_formats = OrderedDict()
            expected_formats['GQ'] = createFormat('GQ', 1, 'Integer',
                                                  expected_description)
            self.assertEqual(actual_formats, expected_formats)
예제 #4
0
  def test_simple_schemas(self):
    """Each schema must equal itself; structurally different ones must not."""
    empty_schema = bigquery.TableSchema(fields=[])

    int_column = bigquery.TableFieldSchema(
        name="a", mode="NULLABLE", type="INT64")
    flat_schema = bigquery.TableSchema(fields=[int_column])

    bool_column = bigquery.TableFieldSchema(
        name="c", mode="REQUIRED", type="BOOL")
    record_column = bigquery.TableFieldSchema(
        name="b", mode="REPEATED", type="RECORD", fields=[bool_column])
    nested_schema = bigquery.TableSchema(fields=[record_column])

    self.assertTrue(check_schema_equal(empty_schema, empty_schema))

    self.assertTrue(check_schema_equal(flat_schema, flat_schema))
    self.assertFalse(check_schema_equal(empty_schema, flat_schema))

    self.assertTrue(check_schema_equal(nested_schema, nested_schema))
    self.assertFalse(check_schema_equal(flat_schema, nested_schema))
    def test_add_info_fields_reserved_field(self):
        """Check the INFO entry produced for the 'AA' field.

        A field with its own description is expected to keep it; a field
        with an empty description is expected to yield 'Ancestral allele'.
        """
        cases = (
            ('bigquery desc', 'bigquery desc'),
            ('', 'Ancestral allele'),
        )
        for field_description, expected_description in cases:
            aa_field = bigquery.TableFieldSchema(
                name='AA',
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description=field_description)

            actual_infos = OrderedDict()
            schema_converter._add_info_fields(aa_field, actual_infos)

            expected_infos = OrderedDict()
            expected_infos['AA'] = Info('AA', 1, 'String',
                                        expected_description, None, None)
            self.assertEqual(actual_infos, expected_infos)
    def test_add_info_fields_from_alternate_bases_schema_compatibility(self):
        """An Integer 'AF' sub-field must raise unless incompatibility is allowed.

        Without ``allow_incompatible_schema`` a ValueError is expected; with
        it set to True an Integer 'AF' INFO entry is expected.
        """
        constants = bigquery_util.TableFieldConstants
        alternate_bases = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
            type=constants.TYPE_RECORD,
            mode=constants.MODE_REPEATED,
            description='One record for each alternate base (if any).')
        af_field = bigquery.TableFieldSchema(
            name='AF',
            type=constants.TYPE_INTEGER,
            mode=constants.MODE_NULLABLE,
            description='desc')
        alternate_bases.fields.append(af_field)

        # Default mode: the conflicting type is rejected.
        with self.assertRaises(ValueError):
            schema_converter._add_info_fields(alternate_bases, OrderedDict())

        # Lenient mode: the field is accepted as declared.
        lenient_infos = OrderedDict()
        schema_converter._add_info_fields(
            alternate_bases, lenient_infos, allow_incompatible_schema=True)

        expected_infos = OrderedDict()
        expected_infos['AF'] = Info('AF', field_counts['A'], 'Integer',
                                    'desc', None, None)
        self.assertEqual(lenient_infos, expected_infos)
예제 #7
0
 def test_get_or_create_table_intermittent_exception(self):
     """get_or_create_table must return 'table_id' despite one failing Insert.

     The mocked ``tables.Insert`` raises an HttpError with status 408 on the
     first call and yields 'table_id' on the second.
     """
     timeout_error = HttpError(
         response={'status': '408'}, url='', content='')
     client = mock.Mock()
     client.tables.Insert.side_effect = [timeout_error, 'table_id']
     client.tables.Get.side_effect = [None, 'table_id']
     wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

     bool_column = bigquery.TableFieldSchema(
         name='b', type='BOOLEAN', mode='REQUIRED')
     schema = bigquery.TableSchema(fields=[bool_column])

     created_table = wrapper.get_or_create_table(
         'project_id', 'dataset_id', 'table_id', schema, False, False)
     self.assertEqual(created_table, 'table_id')
예제 #8
0
  def test_get_beam_bigquery_schema(self) -> None:
    """Test making a bigquery schema for beam's table writing."""
    field_specs = {
        'string_field': ('string', 'nullable'),
        'int_field': ('integer', 'repeated'),
    }

    actual_schema = beam_tables._get_beam_bigquery_schema(field_specs)

    # Build the expected schema by hand, in the same column order.
    expected_schema = beam_bigquery.TableSchema()
    expected_columns = (
        ('string_field', 'string', 'nullable'),
        ('int_field', 'integer', 'repeated'),
    )
    for column_name, column_type, column_mode in expected_columns:
      expected_field = beam_bigquery.TableFieldSchema()
      expected_field.name = column_name
      expected_field.type = column_type
      expected_field.mode = column_mode
      expected_schema.fields.append(expected_field)

    self.assertEqual(actual_schema, expected_schema)
예제 #9
0
    def test_write_row(self):
        """Write one row with AvroRowWriter and read it back with fastavro."""
        stamp_column = bigquery.TableFieldSchema(
            name='stamp', type='TIMESTAMP')
        number_column = bigquery.TableFieldSchema(
            name='number', type='FLOAT', mode='REQUIRED')
        schema = bigquery.TableSchema(fields=[stamp_column, number_column])
        stamp = datetime.datetime(2020, 2, 25, 12, 0, 0, tzinfo=pytz.utc)
        row = {'stamp': stamp, 'number': float('NaN')}

        with io.BytesIO() as buf:
            # close() is patched out so the buffer stays readable once
            # AvroRowWriter has been closed.
            with mock.patch.object(buf, 'close') as mock_close:
                writer = AvroRowWriter(buf, schema)
                writer.write(row)
                writer.close()

                mock_close.assert_called_once()

            buf.seek(0)
            records = list(fastavro.reader(buf))

        self.assertEqual(len(records), 1)
        self.assertTrue(math.isnan(records[0]['number']))
        self.assertEqual(records[0]['stamp'], stamp)
예제 #10
0
def create_schema(fields):
    """Build a TableSchema with one nullable column per entry of ``fields``.

    Args:
        fields: iterable of column names.

    Returns:
        A bigquery.TableSchema in which "cookies" is a STRING column,
        "visitDate" is a DATE column and every other name is INTEGER.
    """
    special_types = {"cookies": "STRING", "visitDate": "DATE"}

    schema = bigquery.TableSchema()
    for column_name in fields:
        column = bigquery.TableFieldSchema()
        column.name = column_name
        column.mode = "nullable"
        column.type = special_types.get(column_name, "INTEGER")
        schema.fields.append(column)
    return schema
예제 #11
0
 def test_get_or_create_table(self):
   """get_or_create_table must return what the mocked Insert returns."""
   client = mock.Mock()
   client.tables.Insert.return_value = 'table_id'
   client.tables.Get.side_effect = [None, 'table_id']
   wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

   bool_column = bigquery.TableFieldSchema(
       name='b', type='BOOLEAN', mode='REQUIRED')
   schema = bigquery.TableSchema(fields=[bool_column])

   created_table = wrapper.get_or_create_table(
       'project_id', 'dataset_id', 'table_id', schema, False, False)
   self.assertEqual(created_table, 'table_id')
    def test_generate_header_fields_from_schema_invalid_description(self):
        """A newline inside a description is expected to come out as a space."""
        field = bigquery.TableFieldSchema(
            name='invalid_description',
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Desc\nThis is added intentionally.')
        schema = bigquery.TableSchema()
        schema.fields.append(field)

        actual_header = schema_converter.generate_header_fields_from_schema(
            schema)

        expected_infos = OrderedDict()
        expected_infos['invalid_description'] = Info(
            'invalid_description', 1, 'String',
            'Desc This is added intentionally.', None, None)
        expected_header = vcf_header_io.VcfHeader(
            infos=expected_infos, formats=OrderedDict())
        self.assertEqual(actual_header, expected_header)
예제 #13
0
  def test_row_as_table_row(self):
    """Encode a TableRow to JSON and decode it again.

    Covers STRING, INTEGER, FLOAT, BOOLEAN, NUMERIC, RECORD and GEOGRAPHY
    columns, and decoding both with and without a schema on the coder.
    """
    columns = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
               ('b', 'BOOLEAN'), ('n', 'NUMERIC'), ('r', 'RECORD'),
               ('g', 'GEOGRAPHY')]
    values = [
        'abc',
        123,
        123.456,
        True,
        decimal.Decimal('987654321.987654321'),
        {'a': 'b'},
        'LINESTRING(1 2, 3 4, 5 6, 7 8)',
    ]
    expected_json = (
        '{"s": "abc", '
        '"i": 123, '
        '"f": 123.456, '
        '"b": true, '
        '"n": "987654321.987654321", '
        '"r": {"a": "b"}, '
        '"g": "LINESTRING(1 2, 3 4, 5 6, 7 8)"}')

    schema_fields = [
        bigquery.TableFieldSchema(name=column_name, type=column_type)
        for column_name, column_type in columns
    ]
    coder = TableRowJsonCoder(
        table_schema=bigquery.TableSchema(fields=schema_fields))

    def as_json_value(value):
      # Decimals are converted through their string form.
      if isinstance(value, decimal.Decimal):
        value = str(value)
      return to_json_value(value)

    cells = [bigquery.TableCell(v=as_json_value(value)) for value in values]
    test_row = bigquery.TableRow(f=cells)

    self.assertEqual(expected_json, coder.encode(test_row))
    self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
    # A coder without schema can still decode.
    self.assertEqual(
        test_row, TableRowJsonCoder().decode(coder.encode(test_row)))
 def test_get_annotation_names_multiple_annotations(self):
     """Two annotation records under alternate bases must both be extracted.

     The expected result maps each record name to its sub-field names in
     declaration order.
     """
     constants = bigquery_util.TableFieldConstants

     def make_annotation_record(record_name, sub_field_names):
         # One repeated RECORD holding a nullable STRING per sub-field name.
         record = bigquery.TableFieldSchema(
             name=record_name,
             type=constants.TYPE_RECORD,
             mode=constants.MODE_REPEATED,
             description='desc')
         for sub_field_name in sub_field_names:
             record.fields.append(
                 bigquery.TableFieldSchema(
                     name=sub_field_name,
                     type=constants.TYPE_STRING,
                     mode=constants.MODE_NULLABLE,
                     description='desc.'))
         return record

     alternate_bases = bigquery.TableFieldSchema(
         name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
         type=constants.TYPE_RECORD,
         mode=constants.MODE_REPEATED,
         description='One record for each alternate base (if any).')
     alternate_bases.fields.append(
         make_annotation_record('CSQ_1', ['allele', 'Consequence']))
     alternate_bases.fields.append(
         make_annotation_record('CSQ_2', ['allele', 'IMPACT']))

     schema = bigquery.TableSchema()
     schema.fields.append(alternate_bases)

     expected_names = {
         'CSQ_1': ['allele', 'Consequence'],
         'CSQ_2': ['allele', 'IMPACT']
     }
     self.assertEqual(
         bq_to_vcf._extract_annotation_names(schema), expected_names)
예제 #15
0
def build_bq_schema():
    """Return a TableSchema with three flat columns and one nested record.

    Top-level columns are ``text`` (string), ``created_at`` (datetime),
    ``sentiment`` (integer) and ``job`` (record). The ``job`` record holds
    ``job_id`` (string), ``query`` (string) and ``created_at`` (datetime).
    Every field is nullable.
    """

    def make_field(field_name, field_type):
        # All fields in this schema share the 'nullable' mode.
        field = bigquery.TableFieldSchema()
        field.name = field_name
        field.type = field_type
        field.mode = 'nullable'
        return field

    schema = bigquery.TableSchema()
    top_level = (
        ('text', 'string'),
        ('created_at', 'datetime'),
        ('sentiment', 'integer'),
    )
    for field_name, field_type in top_level:
        schema.fields.append(make_field(field_name, field_type))

    # nested field
    job = make_field('job', 'record')
    job_members = (
        ('job_id', 'string'),
        ('query', 'string'),
        ('created_at', 'datetime'),
    )
    for field_name, field_type in job_members:
        job.fields.append(make_field(field_name, field_type))
    schema.fields.append(job)

    return schema
예제 #16
0
    def _json_to_table_schema(self, from_json, child_schema):
        """Recursively append one field per key of ``from_json``.

        Args:
            from_json: the json object (a mapping) to convert
            child_schema: the schema object whose ``fields`` list receives
                the generated fields
        Returns:
            ``child_schema`` itself, after the fields have been appended.
            Dict values become 'record' fields with their own nested
            fields; every other value becomes a 'string' field.
        """
        for key, value in from_json.items():
            node = bigquery.TableFieldSchema()
            node.name = key
            node.mode = 'nullable'
            if isinstance(value, dict):
                node.type = "record"
                # Fill the nested fields before attaching the node.
                self._json_to_table_schema(value, node)
            else:
                node.type = "string"
            child_schema.fields.append(node)

        return child_schema
예제 #17
0
  def test_merge_field_schemas_merge_record_fields(self):
    """Merging two call records must give one record holding both sub-fields."""
    sub_field_types = {
        'FB': TableFieldConstants.TYPE_BOOLEAN,
        'GQ': TableFieldConstants.TYPE_INTEGER,
    }

    def make_call_record(*sub_field_names):
      # A fresh calls record containing the requested FORMAT sub-fields.
      record = bigquery.TableFieldSchema(
          name=ColumnKeyConstants.CALLS,
          type=TableFieldConstants.TYPE_RECORD,
          mode=TableFieldConstants.MODE_REPEATED,
          description='One record for each call.')
      for sub_field_name in sub_field_names:
        record.fields.append(bigquery.TableFieldSchema(
            name=sub_field_name,
            type=sub_field_types[sub_field_name],
            mode=TableFieldConstants.MODE_NULLABLE,
            description='FORMAT foo desc'))
      return record

    first_schemas = [make_call_record('FB')]
    second_schemas = [make_call_record('GQ')]
    expected_schemas = [make_call_record('FB', 'GQ')]

    self.assertEqual(
        bigquery_util._get_merged_field_schemas(first_schemas,
                                                second_schemas),
        expected_schemas)
예제 #18
0
 def runner_schema(self):
     """Return a TableSchema of seven columns, all with mode REQUIRED.

     Columns, in order: timestamp, selection_id, name, market_id, event_id,
     sort_priority and handicap.
     """
     columns = (
         ('timestamp', 'TIMESTAMP'),
         ('selection_id', 'INTEGER'),
         ('name', 'STRING'),
         ('market_id', 'STRING'),
         ('event_id', 'STRING'),
         ('sort_priority', 'INTEGER'),
         ('handicap', 'FLOAT'),
     )
     schema_fields = [
         bigquery.TableFieldSchema(
             name=column_name, type=column_type, mode='REQUIRED')
         for column_name, column_type in columns
     ]
     return bigquery.TableSchema(fields=schema_fields)
예제 #19
0
    def test_row_as_table_row(self):
        """Encode a TableRow to JSON and decode it again.

        Covers STRING, INTEGER, FLOAT, BOOLEAN and RECORD columns, and
        decoding both with and without a schema on the coder.
        """
        columns = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                   ('b', 'BOOLEAN'), ('r', 'RECORD')]
        row_values = ['abc', 123, 123.456, True, {'a': 'b'}]
        expected_json = '{"s": "abc", "i": 123, "f": 123.456, "b": true, "r": {"a": "b"}}'

        schema_fields = [
            bigquery.TableFieldSchema(name=column_name, type=column_type)
            for column_name, column_type in columns
        ]
        coder = TableRowJsonCoder(
            table_schema=bigquery.TableSchema(fields=schema_fields))

        cells = [
            bigquery.TableCell(v=to_json_value(value)) for value in row_values
        ]
        test_row = bigquery.TableRow(f=cells)

        self.assertEqual(expected_json, coder.encode(test_row))
        self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
        # A coder without schema can still decode.
        self.assertEqual(test_row,
                         TableRowJsonCoder().decode(coder.encode(test_row)))
예제 #20
0
    def test_generate_header_fields_from_schema_schema_compatibility(self):
        """An Integer 'AA' field must raise unless incompatibility is allowed.

        Without ``allow_incompatible_schema`` a ValueError is expected; with
        it set to True the header is expected to carry an Integer 'AA' INFO.
        """
        aa_field = bigquery.TableFieldSchema(
            name='AA',
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='desc')
        conflicting_schema = bigquery.TableSchema()
        conflicting_schema.fields.append(aa_field)

        # Default mode: the conflicting type is rejected.
        with self.assertRaises(ValueError):
            bigquery_vcf_schema_converter.generate_header_fields_from_schema(
                conflicting_schema)

        # Lenient mode: the field is accepted as declared.
        actual_header = (
            bigquery_vcf_schema_converter.generate_header_fields_from_schema(
                conflicting_schema, allow_incompatible_schema=True))

        expected_infos = OrderedDict()
        expected_infos['AA'] = Info('AA', 1, 'Integer', 'desc', None, None)
        expected_header = vcf_header_io.VcfHeader(
            infos=expected_infos, formats=OrderedDict())
        self.assertEqual(actual_header, expected_header)
예제 #21
0
    def parseSchema(self):
        """ Reads the schema of the given rows using row2 as a check for hidden
        json strings.

        Every column named in ``self.row1`` is appended to
        ``self.table_schema`` as a nullable string field. When ``self.row2``
        is not empty, the value at the same position is handed to
        ``self._is_json``; a truthy result turns the column into a
        ``record`` whose nested fields are produced by
        ``self._json_to_table_schema``.

        Args:

        Returns:
            A tuple (json_columns, table_schema) where
            - json_columns is a list of the columns [{'name':<COLUMN_NAME>, 'is_json':<CONTAINS_A_JSON_STRING>}]
            - table_schema is the value returned by ``self.getSchema()``
        """
        column_names = self.row1.split(self.delimiter)

        # Tokenise the sample row once: the result does not depend on the
        # column being processed, so there is no reason to redo it per column.
        data_values = None
        if self.row2 != "":
            # re.escape stops delimiters such as ']', '^', '-' or '\' from
            # being read as character-class syntax.
            # NOTE(review): the trailing look-behind is hard-coded to ','
            # regardless of self.delimiter - confirm that this is intended.
            data_values = re.findall(
                r'(?:[^\s' + re.escape(self.delimiter) +
                '"]|"(?:\\.|[^"])*"|(?<=,))+', self.row2)

        for idx, cn in enumerate(column_names):
            column_schema = bigquery.TableFieldSchema()
            column_schema.name = cn
            column_schema.type = "string"
            column_schema.mode = 'nullable'
            is_json = False

            if data_values is not None:
                # NOTE(review): str.decode('string-escape') only exists on
                # Python 2; this line needs a replacement before the code
                # can run on Python 3.
                check_json = self._is_json(
                    data_values[idx].decode('string-escape').strip('"'))
                if check_json:
                    # Update the schema type and mark the column as json
                    column_schema.type = 'record'
                    is_json = True

                    # Converts JSON to nested fields appended to the column;
                    # the helper returns the object it was given.
                    column_schema = self._json_to_table_schema(
                        check_json, column_schema)

            # Add the column name to the list and whether it is json or not
            self.json_columns.append({'name': cn, 'is_json': is_json})

            # Add the field to the schema
            self.table_schema.fields.append(column_schema)

        return self.json_columns, self.getSchema()
예제 #22
0
    def expand(self, pcoll):
        """Apply Parse_json to ``pcoll`` and write the result to BigQuery.

        The target table is 'relation_extraction_data' in DATASET_ID /
        PROJECT_ID. It is created if needed and rows are appended. All six
        columns are nullable strings.
        """
        table_schema = bigquery.TableSchema()
        column_names = ('head', 'head_type', 'relation', 'tail', 'tail_type',
                        'sentence')
        for column_name in column_names:
            column = bigquery.TableFieldSchema()
            column.name = column_name
            column.type = 'string'
            column.mode = 'nullable'
            table_schema.fields.append(column)

        parsed = pcoll | 'Parse the json lines' >> beam.ParDo(Parse_json())

        dispositions = beam.io.BigQueryDisposition
        bigquery_sink = beam.io.WriteToBigQuery(
            table='relation_extraction_data',
            dataset=DATASET_ID,
            project=PROJECT_ID,
            schema=table_schema,  # Pass the defined table_schema
            create_disposition=dispositions.CREATE_IF_NEEDED,
            write_disposition=dispositions.WRITE_APPEND)
        return parsed | 'WriteToBigQuery' >> bigquery_sink
예제 #23
0
 def test_merge_field_schemas_same_id_no_conflicts(self):
   """A field present in both lists must appear once in the merged result.

   Expected order: the fields of the first list, then the fields that
   appear only in the second list.
   """
   ii_spec = ('II', TableFieldConstants.TYPE_INTEGER,
              TableFieldConstants.MODE_NULLABLE)
   ifr_spec = ('IFR', TableFieldConstants.TYPE_FLOAT,
               TableFieldConstants.MODE_REPEATED)
   ab_spec = ('AB', TableFieldConstants.TYPE_FLOAT,
              TableFieldConstants.MODE_NULLABLE)

   def build_fields(*specs):
     # Fresh TableFieldSchema objects for every call.
     return [
         bigquery.TableFieldSchema(
             name=field_name,
             type=field_type,
             mode=field_mode,
             description='INFO foo desc')
         for field_name, field_type, field_mode in specs
     ]

   merged_field_schemas = bigquery_util._get_merged_field_schemas(
       build_fields(ii_spec, ifr_spec), build_fields(ii_spec, ab_spec))
   expected_merged_field_schemas = build_fields(ii_spec, ifr_spec, ab_spec)
   self.assertEqual(merged_field_schemas, expected_merged_field_schemas)
예제 #24
0
 def event_schema(self):
     """Return a TableSchema of seven columns, all with mode REQUIRED.

     Columns, in order: timestamp, id, name, timezone, market_count,
     open_date_str and country_code.
     """
     columns = (
         ('timestamp', 'TIMESTAMP'),
         ('id', 'STRING'),
         ('name', 'STRING'),
         ('timezone', 'STRING'),
         ('market_count', 'INTEGER'),
         ('open_date_str', 'STRING'),
         ('country_code', 'STRING'),
     )
     schema_fields = [
         bigquery.TableFieldSchema(
             name=column_name, type=column_type, mode='REQUIRED')
         for column_name, column_type in columns
     ]
     return bigquery.TableSchema(fields=schema_fields)
예제 #25
0
  def test_get_or_create_table_invalid_tablename(self):
    """get_or_create_table must raise ValueError for each invalid table id.

    Tried ids: one with a dash, one with a space and one of 1025 characters.
    """
    for table_id in ('big-query', 'table name', 'a' * 1025):
      client = mock.Mock()
      client.tables.Get.side_effect = [None]
      wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)

      bool_column = bigquery.TableFieldSchema(
          name='b', type='BOOLEAN', mode='REQUIRED')
      schema = bigquery.TableSchema(fields=[bool_column])

      with self.assertRaises(ValueError):
        wrapper.get_or_create_table(
            'project_id', 'dataset_id', table_id, schema, False, False)
예제 #26
0
def _get_beam_bigquery_schema(
    fields: Dict[str, Tuple[str, str]]) -> beam_bigquery.TableSchema:
  """Build a beam bigquery table schema from a field description mapping.

  Args:
    fields: dict of {'field_name': ['column_type', 'column_mode']}

  Returns:
    A table schema with one field per dict entry, in the dict's iteration
    order.
  """
  schema = beam_bigquery.TableSchema()

  for column_name, column_spec in fields.items():
    column_type, column_mode = column_spec
    column = beam_bigquery.TableFieldSchema()
    column.name = column_name
    column.type = column_type
    column.mode = column_mode
    schema.fields.append(column)

  return schema
예제 #27
0
 def market_schema(self):
     """Return a TableSchema of six columns.

     timestamp, id, name, total_matched and event_id are REQUIRED;
     start_time is NULLABLE.
     """
     columns = (
         ('timestamp', 'TIMESTAMP', 'REQUIRED'),
         ('id', 'STRING', 'REQUIRED'),
         ('name', 'STRING', 'REQUIRED'),
         ('total_matched', 'FLOAT', 'REQUIRED'),
         ('event_id', 'STRING', 'REQUIRED'),
         ('start_time', 'TIMESTAMP', 'NULLABLE'),
     )
     schema_fields = [
         bigquery.TableFieldSchema(
             name=column_name, type=column_type, mode=column_mode)
         for column_name, column_type, column_mode in columns
     ]
     return bigquery.TableSchema(fields=schema_fields)
예제 #28
0
    def _parse_schema_field(field):
        """Parse a single schema field from dictionary.

        Args:
          field: Dictionary object containing serialized schema. 'name' and
            'type' are read unconditionally; 'mode', 'description' and
            'fields' are optional.

        Returns:
          A TableFieldSchema for a single column in BigQuery.
        """
        parsed = bigquery.TableFieldSchema()
        parsed.name = field['name']
        parsed.type = field['type']
        # A missing mode falls back to NULLABLE.
        parsed.mode = field['mode'] if 'mode' in field else 'NULLABLE'
        if 'description' in field:
            parsed.description = field['description']
        if 'fields' in field:
            # Nested fields are parsed recursively.
            parsed.fields = [
                _parse_schema_field(child) for child in field['fields']
            ]
        return parsed
def get_bigquery_schema():
    """Return a bigquery schema.

    The schema has three nullable string columns (topics, title, content)
    followed by two repeated float columns (title_embed, content_embed).
    """
    from apache_beam.io.gcp.internal.clients import bigquery

    column_specs = (
        ('topics', 'string', 'nullable'),
        ('title', 'string', 'nullable'),
        ('content', 'string', 'nullable'),
        ('title_embed', 'float', 'repeated'),
        ('content_embed', 'float', 'repeated'),
    )

    schema = bigquery.TableSchema()
    for column_name, column_type, column_mode in column_specs:
        field = bigquery.TableFieldSchema()
        field.name = column_name
        field.type = column_type
        field.mode = column_mode
        schema.fields.append(field)

    return schema
    def _get_table_schema(self):
        # type: () -> bigquery.TableSchema
        """Return a fixture schema with flat, nested and doubly nested fields.

        Layout: ``field_1`` (nullable string), ``field_2`` (repeated
        integer) and the repeated record ``record_1``, which holds a
        nullable boolean, a repeated float and the repeated record
        ``record_1-record_2`` with a single nullable boolean.
        """
        # Record field, two level deep.
        inner_record = bigquery.TableFieldSchema(
            name='record_1-record_2',
            type=Consts.TYPE_RECORD,
            mode=Consts.MODE_REPEATED)
        inner_record.fields.append(
            bigquery.TableFieldSchema(
                name='record_1-record_2_field_1',
                type=Consts.TYPE_BOOLEAN,
                mode=Consts.MODE_NULLABLE))

        # Record field.
        outer_record = bigquery.TableFieldSchema(
            name='record_1',
            type=Consts.TYPE_RECORD,
            mode=Consts.MODE_REPEATED)
        outer_members = (
            bigquery.TableFieldSchema(
                name='record_1_field_1',
                type=Consts.TYPE_BOOLEAN,
                mode=Consts.MODE_NULLABLE),
            bigquery.TableFieldSchema(
                name='record_1_field_2',
                type=Consts.TYPE_FLOAT,
                mode=Consts.MODE_REPEATED),
            inner_record,
        )
        for member in outer_members:
            outer_record.fields.append(member)

        schema = bigquery.TableSchema()
        top_level = (
            bigquery.TableFieldSchema(
                name='field_1',
                type=Consts.TYPE_STRING,
                mode=Consts.MODE_NULLABLE),
            bigquery.TableFieldSchema(
                name='field_2',
                type=Consts.TYPE_INTEGER,
                mode=Consts.MODE_REPEATED),
            outer_record,
        )
        for field in top_level:
            schema.fields.append(field)
        return schema