Code Example #1
def test_datalab_load_table_from_gcs_csv(to_delete):
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    import google.datalab.bigquery as bq

    # Create the dataset
    dataset_id = 'import_sample'
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]
    # Use a unique dataset ID to avoid collisions when running tests
    import time  # needed for the timestamp-based suffix below
    dataset_id = 'test_dataset_{}'.format(int(time.time() * 1000))
    to_delete.append(dataset_id)
    # [START bigquery_migration_datalab_load_table_from_gcs_csv]
    bq.Dataset(dataset_id).create()

    # Create the table
    schema = [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'post_abbr', 'type': 'STRING'},
    ]
    table = bq.Table(
        '{}.us_states'.format(dataset_id)).create(schema=schema)
    table.load(
        'gs://cloud-samples-data/bigquery/us-states/us-states.csv',
        mode='append',
        source_format='csv',
        csv_options=bq.CSVOptions(skip_leading_rows=1)
    )  # Waits for the job to complete
    # [END bigquery_migration_datalab_load_table_from_gcs_csv]

    assert table.length == 50
Code Example #2
    def execute(self, context):
        table = bq.Table(self.table, context=None)
        if not table.exists():
            table.create(schema=self.schema)

        kwargs = {}
        if 'delimiter' in self.csv_options:
            kwargs['delimiter'] = self.csv_options['delimiter']
        if 'skip' in self.csv_options:
            kwargs['skip_leading_rows'] = self.csv_options['skip']
        if 'strict' in self.csv_options:
            kwargs['allow_jagged_rows'] = self.csv_options['strict']
        if 'quote' in self.csv_options:
            kwargs['quote'] = self.csv_options['quote']
        csv_options = bq.CSVOptions(**kwargs)

        job = table.load(
            self.path,
            mode=self.mode,
            source_format=('csv' if self.format == 'csv' else
                           'NEWLINE_DELIMITED_JSON'),
            csv_options=csv_options,
            ignore_unknown_values=not self.csv_options.get('strict'))

        if job.failed:
            raise Exception('Load failed: %s' % str(job.fatal_error))
        elif job.errors:
            raise Exception('Load completed with errors: %s' % str(job.errors))

        return {'result': job.result()}
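
Note: the operator above reads its CSV settings from a plain dict and only passes along the keys it recognizes. As a hedged illustration (the dict below is hypothetical, inferred from the keys read in execute() above, and not part of the original project), a csv_options value handed to such an operator might look like:

    # Hypothetical csv_options dict; each key maps onto a bq.CSVOptions argument
    # exactly as in execute() above.
    csv_options = {
        'delimiter': ',',   # -> CSVOptions(delimiter=...)
        'skip': 1,          # -> CSVOptions(skip_leading_rows=...)
        'strict': False,    # -> CSVOptions(allow_jagged_rows=...) and ignore_unknown_values=not strict
        'quote': '"',       # -> CSVOptions(quote=...)
    }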
Code Example #3
    def execute(self, context):
        if self._table:
            pydatalab_context = google.datalab.Context.default()
            table = bq.Table(self._table, context=pydatalab_context)

        if self._mode == 'create':
            if table.exists():
                raise Exception(
                    "%s already exists; mode should be \'append\' or \'overwrite\'"
                    % self._table)
            if not self._schema:
                raise Exception(
                    '%s does not exist, and no schema specified in cell; cannot load.'
                    % self._table)
            table.create(schema=self._schema)
        elif not table.exists():
            raise Exception('%s does not exist; mode should be \'create\'' %
                            self._table)

        csv_options = bq.CSVOptions(
            delimiter=self._csv_options.get('delimiter'),
            skip_leading_rows=self._csv_options.get('skip'),
            allow_jagged_rows=self._csv_options.get('strict'),
            quote=self._csv_options.get('quote'))
        job = table.load(
            self._path,
            mode=self._mode,
            source_format=('csv' if self._format == 'csv' else
                           'NEWLINE_DELIMITED_JSON'),
            csv_options=csv_options,
            ignore_unknown_values=not self._csv_options.get('strict'))
        if job.failed:
            raise Exception('Load failed: %s' % str(job.fatal_error))
        elif job.errors:
            raise Exception('Load completed with errors: %s' % str(job.errors))
Code Example #4
File: _bigquery.py  Project: abhinavrpatel/pydatalab
def _load_cell(args, cell_body):
    """Implements the BigQuery load magic used to load data from GCS to a table.

   The supported syntax is:

       %bq load <optional args>

  Args:
    args: the arguments following '%bq load'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    A message about whether the load succeeded or failed.
  """
    name = args['table']
    table = _get_table(name)
    if not table:
        table = bigquery.Table(name)

    if args['mode'] == 'create':
        if table.exists():
            raise Exception(
                'table %s already exists; use "append" or "overwrite" as mode.'
                % name)
        if not cell_body or 'schema' not in cell_body:
            raise Exception(
                'Table does not exist, and no schema specified in cell; cannot load.'
            )

        env = google.datalab.utils.commands.notebook_environment()
        config = google.datalab.utils.commands.parse_config(
            cell_body, env, False)
        schema = config['schema']
        # schema can be an instance of bigquery.Schema.
        # For example, user can run "my_schema = bigquery.Schema.from_data(df)" in a previous cell and
        # specify "schema: $my_schema" in cell input.
        if not isinstance(schema, bigquery.Schema):
            jsonschema.validate(config, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
            schema = bigquery.Schema(schema)
        table.create(schema=schema)
    elif not table.exists():
        raise Exception('table %s does not exist; use "create" as mode.' %
                        name)

    csv_options = bigquery.CSVOptions(delimiter=args['delimiter'],
                                      skip_leading_rows=args['skip'],
                                      allow_jagged_rows=not args['strict'],
                                      quote=args['quote'])
    job = table.load(args['path'],
                     mode=args['mode'],
                     source_format=args['format'],
                     csv_options=csv_options,
                     ignore_unknown_values=not args['strict'])
    if job.failed:
        raise Exception('Load failed: %s' % str(job.fatal_error))
    elif job.errors:
        raise Exception('Load completed with errors: %s' % str(job.errors))
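
The docstring above only names the syntax ("%bq load <optional args>" plus an optional YAML/JSON cell body). As a rough, unverified sketch: the flag names below are inferred from the args keys this function reads ('table', 'mode', 'path', 'format', 'skip', ...) and from the 'schema' key it expects in the cell body; the actual names come from the magic's argument parser elsewhere in _bigquery.py. It is shown as a cell magic so the schema body can follow, and the bucket path is made up.

    %%bq load --table my_dataset.us_states --mode create --format csv --skip 1 --path gs://my-bucket/us-states.csv
    schema:
      - name: name
        type: STRING
      - name: post_abbr
        type: STRING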
Code Example #5
    def _create_external_data_source(self, skip_header_rows):
        import google.datalab.bigquery as bq
        df = self.browse(1, None)
        # read each column as STRING because we only want to sample rows.
        schema_train = bq.Schema([{
            'name': name,
            'type': 'STRING'
        } for name in df.keys()])
        options = bq.CSVOptions(
            skip_leading_rows=(1 if skip_header_rows == True else 0))
        return bq.ExternalDataSource(self.path,
                                     csv_options=options,
                                     schema=schema_train,
                                     max_bad_records=0)
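
The helper above only constructs the federated data source. As a minimal sketch of how its return value is typically consumed (variable names and the SQL are illustrative; Code Example #6 below shows the same pattern inside an operator):

    import google.datalab.bigquery as bq

    # Assume `external_source` is the bq.ExternalDataSource returned by
    # _create_external_data_source() above; 'csv_data' is an arbitrary name.
    query = bq.Query(sql='SELECT COUNT(*) AS n FROM csv_data',
                     data_sources={'csv_data': external_source})
    job = query.execute()   # runs the query against the CSV file in GCS
    rows = job.result()     # results are available once the job completes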
Code Example #6
    def execute(self, context):
        if self.data_source:
            kwargs = {}
            if self.csv_options:
                csv_kwargs = {}
                if 'delimiter' in self.csv_options:
                    csv_kwargs['delimiter'] = self.csv_options['delimiter']
                if 'skip' in self.csv_options:
                    csv_kwargs['skip_leading_rows'] = self.csv_options['skip']
                if 'strict' in self.csv_options:
                    csv_kwargs['allow_jagged_rows'] = self.csv_options[
                        'strict']
                if 'quote' in self.csv_options:
                    csv_kwargs['quote'] = self.csv_options['quote']
                kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs)

            if self.format:
                kwargs['source_format'] = self.format

            if self.max_bad_records:
                kwargs['max_bad_records'] = self.max_bad_records

            external_data_source = bq.ExternalDataSource(source=self.path,
                                                         schema=bq.Schema(
                                                             self.schema),
                                                         **kwargs)
            query = bq.Query(
                sql=self.sql,
                data_sources={self.data_source: external_data_source})
        else:
            query = bq.Query(sql=self.sql)

        # use_cache is False since this is most likely the case in pipeline scenarios
        # allow_large_results can be True only if table is specified (i.e. when it's not None)
        kwargs = {}
        if self.mode is not None:
            kwargs['mode'] = self.mode
        output_options = bq.QueryOutput.table(
            name=self.table,
            use_cache=False,
            allow_large_results=self.table is not None,
            **kwargs)
        query_params = bq.Query.get_query_parameters(self.parameters)
        job = query.execute(output_options=output_options,
                            query_params=query_params)

        # Returning the table-name here makes it available for downstream task instances.
        return {'table': job.result().full_name}