Example 1
def run_analysis(args):
    """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError: if the schema contains unknown types.
  """
    import google.datalab.bigquery as bq
    if args.bigquery_table:
        table = bq.Table(args.bigquery_table)
        schema_list = table.schema._bq_schema
    else:
        schema_list = json.loads(
            file_io.read_file_to_string(args.schema_file).decode())
        table = bq.ExternalDataSource(source=args.input_file_pattern,
                                      schema=bq.Schema(schema_list))

    # Check the schema is supported.
    for col_schema in schema_list:
        col_type = col_schema['type'].lower()
        if col_type not in ('string', 'integer', 'float'):
            raise ValueError('Schema contains an unsupported type %s.' %
                             col_type)

    run_numerical_analysis(table, schema_list, args)
    run_categorical_analysis(table, schema_list, args)

    # Save a copy of the schema to the output location.
    file_io.write_string_to_file(
        os.path.join(args.output_dir, SCHEMA_FILE),
        json.dumps(schema_list, indent=2, separators=(',', ': ')))
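
For context, the schema file read above is a JSON list of column descriptors. A minimal sketch of what json.loads(...) yields for a hypothetical three-column schema (column names invented for illustration); any type outside string/integer/float would trigger the ValueError raised in the loop:

schema_list = [
    {'name': 'col1', 'type': 'INTEGER'},
    {'name': 'col2', 'type': 'FLOAT'},
    {'name': 'col3', 'type': 'STRING'},
]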
Example 2
def _datasource_cell(args, cell_body):
  """Implements the BigQuery datasource cell magic for ipython notebooks.

  The supported syntax is
  %%bq datasource --name <var> --paths <url> [--format <CSV|JSON>]
  <schema>

  Args:
    args: the optional arguments following '%%bq datasource'
    cell_body: the datasource's schema in json/yaml
  """
  name = args['name']
  paths = args['paths']
  data_format = (args['format'] or 'CSV').lower()
  compressed = args['compressed'] or False

  # Get the source schema from the cell body
  record = google.datalab.utils.commands.parse_config(
      cell_body, google.datalab.utils.commands.notebook_environment(), as_dict=False)

  jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
  schema = bigquery.Schema(record['schema'])

  # Finally build the datasource object
  datasource = bigquery.ExternalDataSource(source=paths, source_format=data_format,
                                           compressed=compressed, schema=schema)
  google.datalab.utils.commands.notebook_environment()[name] = datasource
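
A hedged usage sketch of this cell magic, following the syntax in the docstring (the variable name, GCS path, and column names below are invented for illustration):

%%bq datasource --name my_source --paths gs://my-bucket/data/*.csv --format CSV
schema:
  - name: col1
    type: STRING
  - name: col2
    type: INTEGER

The YAML cell body is parsed into a record whose 'schema' entry is validated against BigQuerySchema.TABLE_SCHEMA_SCHEMA and wrapped in a bigquery.Schema, exactly as the code above does.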
Example 3
    def test_numerics(self):
        """Build a BQ table, and then call analyze on it."""
        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        project_id = dl.Context.default().project_id
        dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        table_name = 'temp_table'
        full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

        output_folder = tempfile.mkdtemp()

        try:
            # Make a dataset, a table, and insert data.
            db = bq.Dataset((project_id, dataset_name))
            db.create()

            table = bq.Table(full_table_name)
            table.create(schema=bq.Schema(schema), overwrite=True)

            data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
            table.insert(data)

            analyze_data.run_cloud_analysis(output_dir=output_folder,
                                            csv_file_pattern=None,
                                            bigquery_table=full_table_name,
                                            schema=schema,
                                            features={
                                                'col1': {
                                                    'transform': 'scale'
                                                },
                                                'col2': {
                                                    'transform': 'identity'
                                                }
                                            })

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze_data.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
            db.delete(delete_contents=True)
Example 4
def _load_cell(args, cell_body):
    """Implements the BigQuery load magic used to load data from GCS to a table.

   The supported syntax is:

       %bq load <optional args>

  Args:
    args: the arguments following '%bq load'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    A message about whether the load succeeded or failed.
  """
    name = args['table']
    table = _get_table(name)
    if not table:
        table = bigquery.Table(name)

    if args['mode'] == 'create':
        if table.exists():
            raise Exception(
                'table %s already exists; use "append" or "overwrite" as mode.'
                % name)
        if not cell_body or 'schema' not in cell_body:
            raise Exception(
                'Table does not exist, and no schema specified in cell; cannot load.'
            )

        env = google.datalab.utils.commands.notebook_environment()
        config = google.datalab.utils.commands.parse_config(
            cell_body, env, False)
        schema = config['schema']
        # schema can be an instance of bigquery.Schema.
        # For example, user can run "my_schema = bigquery.Schema.from_data(df)" in a previous cell and
        # specify "schema: $my_schema" in cell input.
        if not isinstance(schema, bigquery.Schema):
            jsonschema.validate(config, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
            schema = bigquery.Schema(schema)
        table.create(schema=schema)
    elif not table.exists():
        raise Exception('table %s does not exist; use "create" as mode.' %
                        name)

    csv_options = bigquery.CSVOptions(delimiter=args['delimiter'],
                                      skip_leading_rows=args['skip'],
                                      allow_jagged_rows=not args['strict'],
                                      quote=args['quote'])
    job = table.load(args['path'],
                     mode=args['mode'],
                     source_format=args['format'],
                     csv_options=csv_options,
                     ignore_unknown_values=not args['strict'])
    if job.failed:
        raise Exception('Load failed: %s' % str(job.fatal_error))
    elif job.errors:
        raise Exception('Load completed with errors: %s' % str(job.errors))
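
A hedged usage sketch of the load magic in create mode; the table name, GCS path, and schema are invented, and the flag spellings are inferred from the args keys used above:

%%bq load --table mydataset.mytable --mode create --path gs://my-bucket/data/*.csv --format csv
schema:
  - name: col1
    type: STRING
  - name: col2
    type: INTEGER

In create mode the cell body must supply the schema (inline as above, or as a $variable bound to a bigquery.Schema built in an earlier cell); in append or overwrite mode the table must already exist.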
Example 5
 def test_table_cell_create(self, mock_table, mock_default_context):
   args = {'command': 'create', 'name': 'test-table', 'overwrite': None}
   cell_body = {
     'schema': [
       {'name': 'col1', 'type': 'int64', 'mode': 'NULLABLE', 'description': 'description1'},
       {'name': 'col1', 'type': 'STRING', 'mode': 'required', 'description': 'description1'}
     ]
   }
   bq.commands._bigquery._table_cell(args, json.dumps(cell_body))
   call_kwargs = mock_table.return_value.create.call_args[1]
   self.assertEqual(None, call_kwargs['overwrite'])
   self.assertEqual(bq.Schema(cell_body['schema']), call_kwargs['schema'])
Example 6
 def _create_external_data_source(self, skip_header_rows):
     import google.datalab.bigquery as bq
     df = self.browse(1, None)
     # read each column as STRING because we only want to sample rows.
     schema_train = bq.Schema([{
         'name': name,
         'type': 'STRING'
     } for name in df.keys()])
     options = bq.CSVOptions(
          skip_leading_rows=(1 if skip_header_rows else 0))
     return bq.ExternalDataSource(self.path,
                                  csv_options=options,
                                  schema=schema_train,
                                  max_bad_records=0)
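
A hedged sketch of how the returned data source might be consumed, mirroring the data_sources pattern used in the other examples here ('csv_data' stands in for an instance of the class above, and the 'data' alias is arbitrary):

import google.datalab.bigquery as bq

ext_source = csv_data._create_external_data_source(skip_header_rows=True)
sample_df = bq.Query('SELECT * FROM data LIMIT 10',
                     data_sources={'data': ext_source}).execute().result().to_dataframe()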
Example 7
    def test_table_cell_describe(self, mock_get_table, mock_default_context):
        args = {'command': 'describe', 'name': 'test-table', 'overwrite': None}
        mock_get_table.return_value = None
        with self.assertRaisesRegexp(Exception, 'Could not find table'):
            bq.commands._bigquery._table_cell(args, None)

        mock_get_table.return_value = bq.Table('project.test.table')
        schema = bq.Schema([{'name': 'col1', 'type': 'string'}])
        mock_get_table.return_value._schema = schema
        rendered = bq.commands._bigquery._table_cell(args, None)
        expected_html1 = 'bq.renderSchema(dom, [{"type": "string", "name": "col1"}]);'
        expected_html2 = 'bq.renderSchema(dom, [{"name": "col1", "type": "string"}]);'
        self.assertTrue(expected_html1 in rendered
                        or expected_html2 in rendered)
Example 8
    def execute(self, context):
        if self.data_source:
            kwargs = {}
            if self.csv_options:
                csv_kwargs = {}
                if 'delimiter' in self.csv_options:
                    csv_kwargs['delimiter'] = self.csv_options['delimiter']
                if 'skip' in self.csv_options:
                    csv_kwargs['skip_leading_rows'] = self.csv_options['skip']
                if 'strict' in self.csv_options:
                    csv_kwargs['allow_jagged_rows'] = self.csv_options[
                        'strict']
                if 'quote' in self.csv_options:
                    csv_kwargs['quote'] = self.csv_options['quote']
                kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs)

            if self.format:
                kwargs['source_format'] = self.format

            if self.max_bad_records:
                kwargs['max_bad_records'] = self.max_bad_records

            external_data_source = bq.ExternalDataSource(source=self.path,
                                                         schema=bq.Schema(
                                                             self.schema),
                                                         **kwargs)
            query = bq.Query(
                sql=self.sql,
                data_sources={self.data_source: external_data_source})
        else:
            query = bq.Query(sql=self.sql)

        # use_cache is False since this is most likely the case in pipeline scenarios
        # allow_large_results can be True only if table is specified (i.e. when it's not None)
        kwargs = {}
        if self.mode is not None:
            kwargs['mode'] = self.mode
        output_options = bq.QueryOutput.table(name=self.table,
                                              use_cache=False,
                                              allow_large_results=self.table is not None,
                                              **kwargs)
        query_params = bq.Query.get_query_parameters(self.parameters)
        job = query.execute(output_options=output_options,
                            query_params=query_params)

        # Returning the table-name here makes it available for downstream task instances.
        return {'table': job.result().full_name}
Example 9
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema,
                       inverted_features):
  """Use BigQuery to analyze input date.

  Only one of csv_file_pattern or bigquery_table should be non-None.

  Args:
    output_dir: output folder
    csv_file_pattern: list of csv file paths, may contain wildcards
    bigquery_table: project_id.dataset_name.table_name
    schema: schema list
    inverted_features: inverted_features dict
  """

  def _execute_sql(sql, table):
    """Runs a BigQuery job and dowloads the results into local memeory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
    import google.datalab.bigquery as bq
    if isinstance(table, bq.ExternalDataSource):
      query = bq.Query(sql, data_sources={'csv_table': table})
    else:
      query = bq.Query(sql)
    return query.execute().result().to_dataframe()

  import google.datalab.bigquery as bq
  if bigquery_table:
    table_name = '`%s`' % bigquery_table
    table = None
  else:
    table_name = 'csv_table'
    table = bq.ExternalDataSource(
        source=csv_file_pattern,
        schema=bq.Schema(schema))

  # Make a copy of inverted_features and update the target transform to be
  # identity or one hot depending on the schema.
  inverted_features_target = copy.deepcopy(inverted_features)
  for name, transform_set in six.iteritems(inverted_features_target):
    if transform_set == set([constant.TARGET_TRANSFORM]):
      target_schema = next(col['type'].lower() for col in schema if col['name'] == name)
      if target_schema in constant.NUMERIC_SCHEMA:
        inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
      else:
        inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

  numerical_vocab_stats = {}
  for col_name, transform_set in six.iteritems(inverted_features_target):
    sys.stdout.write('Analyzing column %s...' % col_name)
    sys.stdout.flush()
    # All transforms in transform_set require the same analysis. So look
    # at the first transform.
    transform_name = next(iter(transform_set))
    if (transform_name in constant.CATEGORICAL_TRANSFORMS or
       transform_name in constant.TEXT_TRANSFORMS):
      if transform_name in constant.TEXT_TRANSFORMS:
        # Split strings on space, then extract labels and how many rows each
        # token is in. This is done by making two temp tables:
        #   SplitTable: each text row is made into an array of strings. The
        #       array may contain repeat tokens
        #   TokenTable: SplitTable with repeated tokens removed per row.
        # Then to flatten the arrays, TokenTable has to be joined with itself.
        # See the sections 'Flattening Arrays' and 'Filtering Arrays' at
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays
        sql = ('WITH SplitTable AS '
               '         (SELECT SPLIT({name}, \' \') as token_array FROM {table}), '
               '     TokenTable AS '
               '         (SELECT ARRAY(SELECT DISTINCT x '
               '                       FROM UNNEST(token_array) AS x) AS unique_tokens_per_row '
               '          FROM SplitTable) '
               'SELECT token, COUNT(token) as token_count '
               'FROM TokenTable '
               'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token '
               'WHERE LENGTH(token) > 0 '
               'GROUP BY token '
               'ORDER BY token_count DESC, token ASC').format(name=col_name,
                                                              table=table_name)
      else:
        # Extract label and frequency
        sql = ('SELECT {name} as token, count(*) as count '
               'FROM {table} '
               'WHERE {name} IS NOT NULL '
               'GROUP BY {name} '
               'ORDER BY count DESC, token ASC').format(name=col_name,
                                                        table=table_name)

      df = _execute_sql(sql, table)

      # Save the vocab
      string_buff = six.StringIO()
      df.to_csv(string_buff, index=False, header=False)
      file_io.write_string_to_file(
          os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % col_name),
          string_buff.getvalue())
      numerical_vocab_stats[col_name] = {'vocab_size': len(df)}

      # free memory
      del string_buff
      del df
    elif transform_name in constant.NUMERIC_TRANSFORMS:
      # get min/max/average
      sql = ('SELECT max({name}) as max_value, min({name}) as min_value, '
             'avg({name}) as avg_value from {table}').format(name=col_name,
                                                             table=table_name)
      df = _execute_sql(sql, table)
      numerical_vocab_stats[col_name] = {'min': df.iloc[0]['min_value'],
                                         'max': df.iloc[0]['max_value'],
                                         'mean': df.iloc[0]['avg_value']}
    sys.stdout.write('done.\n')
    sys.stdout.flush()

  # get num examples
  sql = 'SELECT count(*) as num_examples from {table}'.format(table=table_name)
  df = _execute_sql(sql, table)
  num_examples = df.iloc[0]['num_examples']

  # Write the stats file.
  stats = {'column_stats': numerical_vocab_stats, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, constant.STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
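
For reference, a hedged sketch of what the stats file written above might contain, assuming one numeric column and one text column (column names and values invented; compare the assertions in the test_numerics example earlier):

{
  "column_stats": {
    "num_col": {"max": 99.0, "min": 0.0, "mean": 49.5},
    "text_col": {"vocab_size": 1024}
  },
  "num_examples": 100
}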
Example 10
def _table_cell(args, cell_body):
  """Implements the BigQuery table magic subcommand used to operate on tables

   The supported syntax is:
   %%bq tables <command> <args>

  Commands:
    {list, create, delete, describe, view}

  Args:
    args: the optional arguments following '%%bq tables command'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The HTML rendering for the list of tables.
  """
  if args['command'] == 'list':
    filter_ = args['filter'] if args['filter'] else '*'
    if args['dataset']:
      if args['project'] is None:
        datasets = [bigquery.Dataset(args['dataset'])]
      else:
        context = google.datalab.Context(args['project'],
                                         google.datalab.Context.default().credentials)
        datasets = [bigquery.Dataset(args['dataset'], context)]
    else:
      default_context = google.datalab.Context.default()
      context = google.datalab.Context(default_context.project_id, default_context.credentials)
      if args['project']:
        context.set_project_id(args['project'])
      datasets = bigquery.Datasets(context)

    tables = []
    for dataset in datasets:
      tables.extend([table.full_name
                     for table in dataset if fnmatch.fnmatch(table.full_name, filter_)])

    return _render_list(tables)

  elif args['command'] == 'create':
    if cell_body is None:
      print('Failed to create %s: no schema specified' % args['name'])
    else:
      try:
        record = google.datalab.utils.commands.parse_config(
            cell_body, google.datalab.utils.commands.notebook_environment(), as_dict=False)
        jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
        schema = bigquery.Schema(record['schema'])
        bigquery.Table(args['name']).create(schema=schema, overwrite=args['overwrite'])
      except Exception as e:
        print('Failed to create table %s: %s' % (args['name'], e))

  elif args['command'] == 'describe':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)

    html = _repr_html_table_schema(table.schema)
    return IPython.core.display.HTML(html)

  elif args['command'] == 'delete':
    try:
      bigquery.Table(args['name']).delete()
    except Exception as e:
      print('Failed to delete table %s: %s' % (args['name'], e))

  elif args['command'] == 'view':
    name = args['name']
    table = _get_table(name)
    if not table:
      raise Exception('Could not find table %s' % name)
    return table
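
A hedged usage sketch of the create and describe subcommands (dataset, table, and column names invented; flag spellings inferred from the args keys above):

%%bq tables create --name mydataset.mytable
schema:
  - name: col1
    type: STRING
  - name: col2
    type: INTEGER

%%bq tables describe --name mydataset.mytable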
Example 11
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema,
                       features):
    """Use BigQuery to analyze input date.

  Only one of csv_file_pattern or bigquery_table should be non-None.

  Args:
    output_dir: output folder
    csv_file_pattern: csv file path, may contain wildcards
    bigquery_table: project_id.dataset_name.table_name
    schema: schema list
    features: features dict
  """
    def _execute_sql(sql, table):
        """Runs a BigQuery job and dowloads the results into local memeory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
        import google.datalab.bigquery as bq
        if isinstance(table, bq.ExternalDataSource):
            query = bq.Query(sql, data_sources={'csv_table': table})
        else:
            query = bq.Query(sql)
        return query.execute().result().to_dataframe()

    import google.datalab.bigquery as bq
    if bigquery_table:
        table_name = '`%s`' % bigquery_table
        table = None
    else:
        table_name = 'csv_table'
        table = bq.ExternalDataSource(source=csv_file_pattern,
                                      schema=bq.Schema(schema))

    numerical_vocab_stats = {}

    for col_schema in schema:
        col_name = col_schema['name']
        col_type = col_schema['type'].lower()
        transform = features[col_name]['transform']

        # Map the target transform into one_hot or identity.
        if transform == TARGET_TRANSFORM:
            if col_type == STRING_SCHEMA:
                transform = ONE_HOT_TRANSFORM
            elif col_type in NUMERIC_SCHEMA:
                transform = IDENTITY_TRANSFORM
            else:
                raise ValueError('Unknown schema type')

        if transform in (TEXT_TRANSFORMS + CATEGORICAL_TRANSFORMS):
            if transform in TEXT_TRANSFORMS:
                # Split strings on space, then extract labels and how many rows each
                # token is in. This is done by making two temp tables:
                #   SplitTable: each text row is made into an array of strings. The
                #       array may contain repeat tokens
                #   TokenTable: SplitTable with repeated tokens removed per row.
                # Then to flatten the arrays, TokenTable has to be joined with itself.
                # See the sections 'Flattening Arrays' and 'Filtering Arrays' at
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays
                sql = (
                    'WITH SplitTable AS '
                    '         (SELECT SPLIT({name}, \' \') as token_array FROM {table}), '
                    '     TokenTable AS '
                    '         (SELECT ARRAY(SELECT DISTINCT x '
                    '                       FROM UNNEST(token_array) AS x) AS unique_tokens_per_row '
                    '          FROM SplitTable) '
                    'SELECT token, COUNT(token) as token_count '
                    'FROM TokenTable '
                    'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token '
                    'WHERE LENGTH(token) > 0 '
                    'GROUP BY token '
                    'ORDER BY token_count DESC, token ASC').format(
                        name=col_name, table=table_name)
            else:
                # Extract label and frequency
                sql = ('SELECT {name} as token, count(*) as count '
                       'FROM {table} '
                       'WHERE {name} IS NOT NULL '
                       'GROUP BY {name} '
                       'ORDER BY count DESC, token ASC').format(
                           name=col_name, table=table_name)

            df = _execute_sql(sql, table)

            # Save the vocab
            string_buff = six.StringIO()
            df.to_csv(string_buff, index=False, header=False)
            file_io.write_string_to_file(
                os.path.join(output_dir, VOCAB_ANALYSIS_FILE % col_name),
                string_buff.getvalue())
            numerical_vocab_stats[col_name] = {'vocab_size': len(df)}

            # free memory
            del string_buff
            del df
        elif transform in NUMERIC_TRANSFORMS:
            # get min/max/average
            sql = (
                'SELECT max({name}) as max_value, min({name}) as min_value, '
                'avg({name}) as avg_value from {table}').format(
                    name=col_name, table=table_name)
            df = _execute_sql(sql, table)
            numerical_vocab_stats[col_name] = {
                'min': df.iloc[0]['min_value'],
                'max': df.iloc[0]['max_value'],
                'mean': df.iloc[0]['avg_value']
            }
        elif transform == IMAGE_TRANSFORM:
            pass
        elif transform == KEY_TRANSFORM:
            pass
        else:
            raise ValueError('Unknown transform %s' % transform)

    # get num examples
    sql = 'SELECT count(*) as num_examples from {table}'.format(
        table=table_name)
    df = _execute_sql(sql, table)
    num_examples = df.iloc[0]['num_examples']

    # Write the stats file.
    stats = {
        'column_stats': numerical_vocab_stats,
        'num_examples': num_examples
    }
    file_io.write_string_to_file(
        os.path.join(output_dir, STATS_FILE),
        json.dumps(stats, indent=2, separators=(',', ': ')))
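
As a point of reference, each per-column vocab file written above is a headerless CSV of (token, count) rows ordered by descending count, since the dataframe is dumped with index=False and header=False. A hedged sketch of its first few lines for a hypothetical categorical column:

red,5120
blue,3033
green,1207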