Example No. 1
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = df.io.BigQueryDisposition.WRITE_APPEND

        insert_response = mock.Mock()
        insert_response.insertErrors = []
        client.tabledata.InsertAll.return_value = insert_response

        with df.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        expected_rows = []
        json_object = bigquery.JsonObject()
        for k, v in sample_row.iteritems():
            json_object.additionalProperties.append(
                bigquery.JsonObject.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
        expected_rows.append(
            bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                insertId='_1',  # First row ID generated with prefix ''
                json=json_object))
        client.tabledata.InsertAll.assert_called_with(
            bigquery.BigqueryTabledataInsertAllRequest(
                projectId='project',
                datasetId='dataset',
                tableId='table',
                tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                    rows=expected_rows)))
Example No. 2
 def test_table_with_write_disposition_append(self):
     client = mock.Mock()
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table'),
                            schema=bigquery.TableSchema())
     client.tables.Get.return_value = table
     client.tables.Insert.return_value = table
     write_disposition = df.io.BigQueryDisposition.WRITE_APPEND
     with df.io.BigQuerySink(
             'project:dataset.table',
             write_disposition=write_disposition).writer(client):
         pass
     self.assertTrue(client.tables.Get.called)
     self.assertFalse(client.tables.Delete.called)
     self.assertFalse(client.tables.Insert.called)
Example No. 3
 def test_no_table_and_create_if_needed(self):
     client = mock.Mock()
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table'),
                            schema=bigquery.TableSchema())
     client.tables.Get.side_effect = HttpError(response={'status': '404'},
                                               url='',
                                               content='')
     client.tables.Insert.return_value = table
     create_disposition = df.io.BigQueryDisposition.CREATE_IF_NEEDED
     with df.io.BigQuerySink(
             'project:dataset.table',
             schema='somefield:INTEGER',
             create_disposition=create_disposition).writer(client):
         pass
     self.assertTrue(client.tables.Get.called)
     self.assertTrue(client.tables.Insert.called)
Example No. 4
 def test_table_empty_and_write_disposition_empty(self):
     client = mock.Mock()
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table'),
                            schema=bigquery.TableSchema())
     client.tables.Get.return_value = table
     client.tabledata.List.return_value = bigquery.TableDataList(
         totalRows=0)
     client.tables.Insert.return_value = table
     write_disposition = df.io.BigQueryDisposition.WRITE_EMPTY
     with df.io.BigQuerySink(
             'project:dataset.table',
             write_disposition=write_disposition).writer(client):
         pass
     self.assertTrue(client.tables.Get.called)
     self.assertTrue(client.tabledata.List.called)
     self.assertFalse(client.tables.Delete.called)
     self.assertFalse(client.tables.Insert.called)
Example No. 5
 def get_test_rows(self):
     now = time.time()
     expected_rows = [{
         'i': 1,
         's': 'abc',
         'f': 2.3,
         'b': True,
         't': now
     }, {
         'i': 10,
         's': 'xyz',
         'f': -3.14,
         'b': False
     }]
     schema = bigquery.TableSchema(fields=[
         bigquery.TableFieldSchema(
             name='b', type='BOOLEAN', mode='REQUIRED'),
         bigquery.TableFieldSchema(name='f', type='FLOAT', mode='REQUIRED'),
         bigquery.TableFieldSchema(
             name='i', type='INTEGER', mode='REQUIRED'),
         bigquery.TableFieldSchema(name='s', type='STRING',
                                   mode='REQUIRED'),
         bigquery.TableFieldSchema(
             name='t', type='TIMESTAMP', mode='NULLABLE')
     ])
     table_rows = [
         bigquery.TableRow(f=[
             bigquery.TableCell(v=to_json_value('true')),
             bigquery.TableCell(v=to_json_value(str(2.3))),
             bigquery.TableCell(v=to_json_value(str(1))),
             bigquery.TableCell(v=to_json_value('abc')),
             # For timestamps we cannot use str() because it would truncate
             # the number representing the timestamp.
             bigquery.TableCell(v=to_json_value('%f' % now))
         ]),
         bigquery.TableRow(f=[
             bigquery.TableCell(v=to_json_value('false')),
             bigquery.TableCell(v=to_json_value(str(-3.14))),
             bigquery.TableCell(v=to_json_value(str(10))),
             bigquery.TableCell(v=to_json_value('xyz')),
             bigquery.TableCell(v=None)
         ])
     ]
     return table_rows, schema, expected_rows
Example No. 6
    def test_row_as_table_row(self):
        schema_definition = [('s', 'STRING'), ('i', 'INTEGER'), ('f', 'FLOAT'),
                             ('b', 'BOOLEAN')]
        schema = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(name=k, type=v)
            for k, v in schema_definition
        ])
        coder = TableRowJsonCoder(table_schema=schema)
        test_row = bigquery.TableRow(f=[
            bigquery.TableCell(v=to_json_value(e))
            for e in ['abc', 123, 123.456, True]
        ])

        self.assertEqual('{"s": "abc", "i": 123, "f": 123.456, "b": true}',
                         coder.encode(test_row))
        self.assertEqual(test_row, coder.decode(coder.encode(test_row)))
        # A coder without schema can still decode.
        self.assertEqual(test_row,
                         TableRowJsonCoder().decode(coder.encode(test_row)))
Example No. 7
 def test_table_not_empty_and_write_disposition_empty(self):
     client = mock.Mock()
     client.tables.Get.return_value = bigquery.Table(
         tableReference=bigquery.TableReference(projectId='project',
                                                datasetId='dataset',
                                                tableId='table'),
         schema=bigquery.TableSchema())
     client.tabledata.List.return_value = bigquery.TableDataList(
         totalRows=1)
     write_disposition = df.io.BigQueryDisposition.WRITE_EMPTY
     with self.assertRaises(RuntimeError) as exn:
         with df.io.BigQuerySink(
                 'project:dataset.table',
                 write_disposition=write_disposition).writer(client):
             pass
     self.assertEqual(
         exn.exception.message,
         'Table project:dataset.table is not empty but write disposition is '
         'WRITE_EMPTY.')
Example No. 8
 def test_nested_schema_as_json(self):
     string_field = bigquery.TableFieldSchema(name='s',
                                              type='STRING',
                                              mode='NULLABLE',
                                              description='s description')
     number_field = bigquery.TableFieldSchema(name='n',
                                              type='INTEGER',
                                              mode='REQUIRED',
                                              description='n description')
     record_field = bigquery.TableFieldSchema(
         name='r',
         type='RECORD',
         mode='REQUIRED',
         description='r description',
         fields=[string_field, number_field])
     schema = bigquery.TableSchema(fields=[record_field])
     sink = df.io.BigQuerySink('dataset.table', schema=schema)
     self.assertEqual(
         {
             'fields': [{
                 'name': 'r',
                 'type': 'RECORD',
                 'mode': 'REQUIRED',
                 'description': 'r description',
                 'fields': [{
                     'name': 's',
                     'type': 'STRING',
                     'mode': 'NULLABLE',
                     'description': 's description'
                 }, {
                     'name': 'n',
                     'type': 'INTEGER',
                     'mode': 'REQUIRED',
                     'description': 'n description'
                 }]
             }]
         }, json.loads(sink.schema_as_json()))
Example No. 9
def run(argv=None):
    """Run the workflow."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    from google.cloud.dataflow.internal.clients import bigquery  # pylint: disable=g-import-not-at-top

    table_schema = bigquery.TableSchema()

    # Fields that use standard types.
    kind_schema = bigquery.TableFieldSchema()
    kind_schema.name = 'kind'
    kind_schema.type = 'string'
    kind_schema.mode = 'nullable'
    table_schema.fields.append(kind_schema)

    full_name_schema = bigquery.TableFieldSchema()
    full_name_schema.name = 'fullName'
    full_name_schema.type = 'string'
    full_name_schema.mode = 'required'
    table_schema.fields.append(full_name_schema)

    age_schema = bigquery.TableFieldSchema()
    age_schema.name = 'age'
    age_schema.type = 'integer'
    age_schema.mode = 'nullable'
    table_schema.fields.append(age_schema)

    gender_schema = bigquery.TableFieldSchema()
    gender_schema.name = 'gender'
    gender_schema.type = 'string'
    gender_schema.mode = 'nullable'
    table_schema.fields.append(gender_schema)

    # A nested field
    phone_number_schema = bigquery.TableFieldSchema()
    phone_number_schema.name = 'phoneNumber'
    phone_number_schema.type = 'record'
    phone_number_schema.mode = 'nullable'

    area_code = bigquery.TableFieldSchema()
    area_code.name = 'areaCode'
    area_code.type = 'integer'
    area_code.mode = 'nullable'
    phone_number_schema.fields.append(area_code)

    number = bigquery.TableFieldSchema()
    number.name = 'number'
    number.type = 'integer'
    number.mode = 'nullable'
    phone_number_schema.fields.append(number)
    table_schema.fields.append(phone_number_schema)

    # A repeated field.
    children_schema = bigquery.TableFieldSchema()
    children_schema.name = 'children'
    children_schema.type = 'string'
    children_schema.mode = 'repeated'
    table_schema.fields.append(children_schema)

    def create_random_record(record_id):
        return {
            'kind': 'kind' + record_id,
            'fullName': 'fullName' + record_id,
            'age': int(record_id) * 10,
            'gender': 'male',
            'phoneNumber': {
                'areaCode': int(record_id) * 100,
                'number': int(record_id) * 100000
            },
            'children': [
                'child' + record_id + '1', 'child' + record_id + '2',
                'child' + record_id + '3'
            ]
        }

    # pylint: disable=expression-not-assigned
    record_ids = p | df.Create('CreateIDs', ['1', '2', '3', '4', '5'])
    records = record_ids | df.Map('CreateRecords', create_random_record)
    records | df.io.Write(
        'write',
        df.io.BigQuerySink(
            known_args.output,
            schema=table_schema,
            create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Run the pipeline (all operations are deferred until run() is called).
    p.run()
Example No. 10
  def __init__(self, table, dataset=None, project=None, schema=None,
               create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=BigQueryDisposition.WRITE_EMPTY,
               validate=False, coder=None):
    """Initialize a BigQuerySink.

    Args:
      table: The ID of the table. The ID must contain only letters
        (a-z, A-Z), numbers (0-9), or underscores (_). If the dataset argument
        is None, then the table argument must contain the entire table
        reference specified as: 'DATASET.TABLE' or 'PROJECT:DATASET.TABLE'.
      dataset: The ID of the dataset containing this table, or None if the
        table reference is specified entirely by the table argument.
      project: The ID of the project containing this table, or None if the
        table reference is specified entirely by the table argument.
      schema: The schema to be used if the BigQuery table to write has to be
        created. This can be either specified as a 'bigquery.TableSchema' object
        or a single string of the form 'field1:type1,field2:type2,field3:type3'
        that defines a comma separated list of fields. Here 'type' should
        specify the BigQuery type of the field. Single string based schemas do
        not support nested fields, repeated fields, or specifying a BigQuery
        mode for fields (mode will always be set to 'NULLABLE').
      create_disposition: A string describing what happens if the table does
        not exist. Possible values are:
        - BigQueryDisposition.CREATE_IF_NEEDED: create the table if it does
          not exist.
        - BigQueryDisposition.CREATE_NEVER: fail the write if the table does
          not exist.
      write_disposition: A string describing what happens if the table already
        has some data. Possible values are:
        - BigQueryDisposition.WRITE_TRUNCATE: delete existing rows.
        - BigQueryDisposition.WRITE_APPEND: add to existing rows.
        - BigQueryDisposition.WRITE_EMPTY: fail the write if the table is not
          empty.
      validate: If True, various checks will be done when the sink gets
        initialized (e.g., is the table present given the disposition
        arguments?).
        This should be True for most scenarios in order to catch errors as early
        as possible (pipeline construction instead of pipeline execution). It
        should be False if the table is created during pipeline execution by a
        previous step.
      coder: The coder for the table rows if serialized to disk. If None, then
        the default coder is RowAsDictJsonCoder, which will interpret every
        element written to the sink as a dictionary that will be JSON serialized
        as a line in a file. This argument needs a value only in special cases
        when writing table rows as dictionaries is not desirable.

    Raises:
      TypeError: if the schema argument is not a string or a TableSchema object.
      ValueError: if the table reference as a string does not match the
        expected format.
    """
    self.table_reference = _parse_table_reference(table, dataset, project)
    # Transform the table schema into a bigquery.TableSchema instance.
    if isinstance(schema, basestring):
      # TODO(silviuc): Should add a regex-based validation of the format.
      table_schema = bigquery.TableSchema()
      schema_list = [s.strip(' ') for s in schema.split(',')]
      for field_and_type in schema_list:
        field_name, field_type = field_and_type.split(':')
        field_schema = bigquery.TableFieldSchema()
        field_schema.name = field_name
        field_schema.type = field_type
        field_schema.mode = 'NULLABLE'
        table_schema.fields.append(field_schema)
      self.table_schema = table_schema
    elif schema is None:
      # TODO(silviuc): Should check that table exists if no schema specified.
      self.table_schema = schema
    elif isinstance(schema, bigquery.TableSchema):
      self.table_schema = schema
    else:
      raise TypeError('Unexpected schema argument: %s.' % schema)

    self.create_disposition = BigQueryDisposition.validate_create(
        create_disposition)
    self.write_disposition = BigQueryDisposition.validate_write(
        write_disposition)
    self.validate = validate
    self.coder = coder or RowAsDictJsonCoder()
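
As a usage sketch of the constructor documented above (not taken from the examples on this page; it assumes the same google.cloud.dataflow / df.io API used throughout, and the table reference and schema string are placeholder values):

import google.cloud.dataflow as df

# Placeholder table reference and schema string, for illustration only.
# A single-string schema is expanded into NULLABLE fields by __init__,
# as the parsing loop above shows.
sink = df.io.BigQuerySink(
    'project:dataset.table',
    schema='name:STRING,age:INTEGER',
    create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=df.io.BigQueryDisposition.WRITE_APPEND)

# Drive the sink from a pipeline, mirroring the style of Example No. 9.
p = df.Pipeline(argv=[])
records = p | df.Create('CreateRecords', [{'name': 'abc', 'age': 1}])
records | df.io.Write('WriteToBQ', sink)  # pylint: disable=expression-not-assigned
p.run()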