def run_analysis(args):
  """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError: if the schema contains unknown types.
  """
  import google.datalab.bigquery as bq
  if args.bigquery_table:
    table = bq.Table(args.bigquery_table)
    schema_list = table.schema._bq_schema
  else:
    schema_list = json.loads(
        file_io.read_file_to_string(args.schema_file).decode())
    table = bq.ExternalDataSource(source=args.input_file_pattern,
                                  schema=bq.Schema(schema_list))

  # Check the schema is supported.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type not in ('string', 'integer', 'float'):
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  run_numerical_analysis(table, schema_list, args)
  run_categorical_analysis(table, schema_list, args)

  # Save a copy of the schema to the output location.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema_list, indent=2, separators=(',', ': ')))
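# A minimal usage sketch for run_analysis, assuming it is driven by argparse.
# The attribute names below are exactly the ones run_analysis reads; the
# bucket paths and values are hypothetical.
import argparse

args = argparse.Namespace(
    bigquery_table=None,  # or 'project_id.dataset_name.table_name'
    schema_file='gs://my-bucket/schema.json',        # hypothetical path
    input_file_pattern='gs://my-bucket/data*.csv',   # hypothetical pattern
    output_dir='gs://my-bucket/analysis')            # hypothetical folder
run_analysis(args)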
def _datasource_cell(args, cell_body):
  """Implements the BigQuery datasource cell magic for ipython notebooks.

  The supported syntax is
    %%bq datasource --name <var> --paths <url> [--format <CSV|JSON>]
    <schema>

  Args:
    args: the optional arguments following '%%bq datasource'
    cell_body: the datasource's schema in json/yaml
  """
  name = args['name']
  paths = args['paths']
  data_format = (args['format'] or 'CSV').lower()
  compressed = args['compressed'] or False

  # Get the source schema from the cell body
  record = google.datalab.utils.commands.parse_config(
      cell_body, google.datalab.utils.commands.notebook_environment(),
      as_dict=False)
  jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
  schema = bigquery.Schema(record['schema'])

  # Finally build the datasource object
  datasource = bigquery.ExternalDataSource(source=paths,
                                           source_format=data_format,
                                           compressed=compressed,
                                           schema=schema)
  google.datalab.utils.commands.notebook_environment()[name] = datasource
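# A hedged usage sketch of the '%%bq datasource' cell magic handled above.
# In a Datalab notebook cell (variable name, paths, and columns are
# hypothetical), the cell body is the YAML/JSON schema that gets validated
# against BigQuerySchema.TABLE_SCHEMA_SCHEMA:
#
#   %%bq datasource --name taxi_trips --paths gs://my-bucket/trips*.csv --format CSV
#   schema:
#     - name: pickup_time
#       type: TIMESTAMP
#     - name: fare
#       type: FLOAT
#
# After the cell runs, the notebook variable 'taxi_trips' holds the
# bigquery.ExternalDataSource and can be referenced by later queries.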
def _create_external_data_source(self, skip_header_rows):
  import google.datalab.bigquery as bq

  df = self.browse(1, None)

  # Read each column as STRING because we only want to sample rows.
  schema_train = bq.Schema([{'name': name, 'type': 'STRING'}
                            for name in df.keys()])

  options = bq.CSVOptions(skip_leading_rows=(1 if skip_header_rows else 0))
  return bq.ExternalDataSource(self.path,
                               csv_options=options,
                               schema=schema_train,
                               max_bad_records=0)
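# A minimal standalone sketch of the same idea, with a hypothetical CSV path
# and column names: declare every column as STRING so BigQuery never fails on
# type coercion while sampling rows from the raw file.
import google.datalab.bigquery as bq

column_names = ['id', 'text', 'score']  # hypothetical header
schema = bq.Schema([{'name': n, 'type': 'STRING'} for n in column_names])
options = bq.CSVOptions(skip_leading_rows=1)  # skip the header row
source = bq.ExternalDataSource('gs://my-bucket/data.csv',
                               csv_options=options,
                               schema=schema,
                               max_bad_records=0)
# 'data' in the SQL refers to the external source via the data_sources map.
sample = bq.Query('SELECT * FROM data LIMIT 5',
                  data_sources={'data': source}).execute().result()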
def execute(self, context):
  if self.data_source:
    kwargs = {}
    if self.csv_options:
      csv_kwargs = {}
      if 'delimiter' in self.csv_options:
        csv_kwargs['delimiter'] = self.csv_options['delimiter']
      if 'skip' in self.csv_options:
        csv_kwargs['skip_leading_rows'] = self.csv_options['skip']
      if 'strict' in self.csv_options:
        csv_kwargs['allow_jagged_rows'] = self.csv_options['strict']
      if 'quote' in self.csv_options:
        csv_kwargs['quote'] = self.csv_options['quote']
      kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs)

    if self.format:
      kwargs['source_format'] = self.format

    if self.max_bad_records:
      kwargs['max_bad_records'] = self.max_bad_records

    external_data_source = bq.ExternalDataSource(source=self.path,
                                                 schema=bq.Schema(self.schema),
                                                 **kwargs)
    query = bq.Query(sql=self.sql,
                     data_sources={self.data_source: external_data_source})
  else:
    query = bq.Query(sql=self.sql)

  # use_cache is False since this is most likely the case in pipeline
  # scenarios. allow_large_results can be True only if table is specified
  # (i.e. when it's not None).
  kwargs = {}
  if self.mode is not None:
    kwargs['mode'] = self.mode
  output_options = bq.QueryOutput.table(
      name=self.table, use_cache=False,
      allow_large_results=self.table is not None, **kwargs)
  query_params = bq.Query.get_query_parameters(self.parameters)
  job = query.execute(output_options=output_options,
                      query_params=query_params)

  # Returning the table name here makes it available to downstream task
  # instances.
  return {'table': job.result().full_name}
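# A hedged sketch of the csv_options dict this operator consumes. Only the
# four keys below are read; the values shown are hypothetical, and the
# comments describe the CSVOptions argument each key is mapped onto.
csv_options = {
    'delimiter': ',',  # passed through as CSVOptions(delimiter=...)
    'skip': 1,         # becomes CSVOptions(skip_leading_rows=...)
    'strict': False,   # becomes CSVOptions(allow_jagged_rows=...)
    'quote': '"',      # passed through as CSVOptions(quote=...)
}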
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema,
                       inverted_features):
  """Use BigQuery to analyze input data.

  Only one of csv_file_pattern or bigquery_table should be non-None.

  Args:
    output_dir: output folder
    csv_file_pattern: list of csv file paths, may contain wildcards
    bigquery_table: project_id.dataset_name.table_name
    schema: schema list
    inverted_features: inverted_features dict
  """

  def _execute_sql(sql, table):
    """Runs a BigQuery job and downloads the results into local memory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
    import google.datalab.bigquery as bq
    if isinstance(table, bq.ExternalDataSource):
      query = bq.Query(sql, data_sources={'csv_table': table})
    else:
      query = bq.Query(sql)
    return query.execute().result().to_dataframe()

  import google.datalab.bigquery as bq
  if bigquery_table:
    table_name = '`%s`' % bigquery_table
    table = None
  else:
    table_name = 'csv_table'
    table = bq.ExternalDataSource(source=csv_file_pattern,
                                  schema=bq.Schema(schema))

  # Make a copy of inverted_features and update the target transform to be
  # identity or one hot depending on the schema.
  inverted_features_target = copy.deepcopy(inverted_features)
  for name, transform_set in six.iteritems(inverted_features_target):
    if transform_set == set([constant.TARGET_TRANSFORM]):
      target_schema = next(col['type'].lower() for col in schema
                           if col['name'] == name)
      if target_schema in constant.NUMERIC_SCHEMA:
        inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
      else:
        inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

  numerical_vocab_stats = {}

  for col_name, transform_set in six.iteritems(inverted_features_target):
    sys.stdout.write('Analyzing column %s...' % col_name)
    sys.stdout.flush()

    # All transforms in transform_set require the same analysis, so look at
    # the first transform.
    transform_name = next(iter(transform_set))
    if (transform_name in constant.CATEGORICAL_TRANSFORMS or
        transform_name in constant.TEXT_TRANSFORMS):
      if transform_name in constant.TEXT_TRANSFORMS:
        # Split strings on space, then extract labels and how many rows each
        # token is in. This is done by making two temp tables:
        #   SplitTable: each text row is made into an array of strings. The
        #       array may contain repeated tokens.
        #   TokenTable: SplitTable with repeated tokens removed per row.
        # Then to flatten the arrays, TokenTable has to be joined with itself.
        # See the sections 'Flattening Arrays' and 'Filtering Arrays' at
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays
        sql = ('WITH SplitTable AS '
               '  (SELECT SPLIT({name}, \' \') as token_array FROM {table}), '
               'TokenTable AS '
               '  (SELECT ARRAY(SELECT DISTINCT x '
               '                FROM UNNEST(token_array) AS x) '
               '       AS unique_tokens_per_row '
               '   FROM SplitTable) '
               'SELECT token, COUNT(token) as token_count '
               'FROM TokenTable '
               'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token '
               'WHERE LENGTH(token) > 0 '
               'GROUP BY token '
               'ORDER BY token_count DESC, token ASC').format(
                   name=col_name, table=table_name)
      else:
        # Extract label and frequency
        sql = ('SELECT {name} as token, count(*) as count '
               'FROM {table} '
               'WHERE {name} IS NOT NULL '
               'GROUP BY {name} '
               'ORDER BY count DESC, token ASC').format(
                   name=col_name, table=table_name)

      df = _execute_sql(sql, table)

      # Save the vocab
      string_buff = six.StringIO()
      df.to_csv(string_buff, index=False, header=False)
      file_io.write_string_to_file(
          os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % col_name),
          string_buff.getvalue())
      numerical_vocab_stats[col_name] = {'vocab_size': len(df)}

      # Free memory
      del string_buff
      del df
    elif transform_name in constant.NUMERIC_TRANSFORMS:
      # Get min/max/average
      sql = ('SELECT max({name}) as max_value, min({name}) as min_value, '
             'avg({name}) as avg_value from {table}').format(
                 name=col_name, table=table_name)
      df = _execute_sql(sql, table)
      numerical_vocab_stats[col_name] = {'min': df.iloc[0]['min_value'],
                                         'max': df.iloc[0]['max_value'],
                                         'mean': df.iloc[0]['avg_value']}
    sys.stdout.write('done.\n')
    sys.stdout.flush()

  # Get the number of examples.
  sql = 'SELECT count(*) as num_examples from {table}'.format(table=table_name)
  df = _execute_sql(sql, table)
  num_examples = df.iloc[0]['num_examples']

  # Write the stats file.
  stats = {'column_stats': numerical_vocab_stats, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, constant.STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
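# A worked example (hypothetical data) of the text-transform SQL above.
# Given a text column with three rows:
#   'red blue'
#   'red red green'
#   'blue'
# SplitTable holds ['red','blue'], ['red','red','green'], ['blue'];
# TokenTable de-duplicates per row to ['red','blue'], ['red','green'],
# ['blue']; and the final CROSS JOIN UNNEST + GROUP BY counts rows per token:
#   blue  2
#   red   2
#   green 1
# Note token_count is a document frequency, not a total token frequency:
# 'red' appears three times overall but in only two rows.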
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema,
                       features):
  """Use BigQuery to analyze input data.

  Only one of csv_file_pattern or bigquery_table should be non-None.

  Args:
    output_dir: output folder
    csv_file_pattern: csv file path, may contain wildcards
    bigquery_table: project_id.dataset_name.table_name
    schema: schema list
    features: features dict
  """

  def _execute_sql(sql, table):
    """Runs a BigQuery job and downloads the results into local memory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
    import google.datalab.bigquery as bq
    if isinstance(table, bq.ExternalDataSource):
      query = bq.Query(sql, data_sources={'csv_table': table})
    else:
      query = bq.Query(sql)
    return query.execute().result().to_dataframe()

  import google.datalab.bigquery as bq
  if bigquery_table:
    table_name = '`%s`' % bigquery_table
    table = None
  else:
    table_name = 'csv_table'
    table = bq.ExternalDataSource(source=csv_file_pattern,
                                  schema=bq.Schema(schema))

  numerical_vocab_stats = {}

  for col_schema in schema:
    col_name = col_schema['name']
    col_type = col_schema['type'].lower()
    transform = features[col_name]['transform']

    # Map the target transform into one_hot or identity.
    if transform == TARGET_TRANSFORM:
      if col_type == STRING_SCHEMA:
        transform = ONE_HOT_TRANSFORM
      elif col_type in NUMERIC_SCHEMA:
        transform = IDENTITY_TRANSFORM
      else:
        raise ValueError('Unknown schema type %s' % col_type)

    if transform in (TEXT_TRANSFORMS + CATEGORICAL_TRANSFORMS):
      if transform in TEXT_TRANSFORMS:
        # Split strings on space, then extract labels and how many rows each
        # token is in. This is done by making two temp tables:
        #   SplitTable: each text row is made into an array of strings. The
        #       array may contain repeated tokens.
        #   TokenTable: SplitTable with repeated tokens removed per row.
        # Then to flatten the arrays, TokenTable has to be joined with itself.
        # See the sections 'Flattening Arrays' and 'Filtering Arrays' at
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays
        sql = ('WITH SplitTable AS '
               '  (SELECT SPLIT({name}, \' \') as token_array FROM {table}), '
               'TokenTable AS '
               '  (SELECT ARRAY(SELECT DISTINCT x '
               '                FROM UNNEST(token_array) AS x) '
               '       AS unique_tokens_per_row '
               '   FROM SplitTable) '
               'SELECT token, COUNT(token) as token_count '
               'FROM TokenTable '
               'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token '
               'WHERE LENGTH(token) > 0 '
               'GROUP BY token '
               'ORDER BY token_count DESC, token ASC').format(
                   name=col_name, table=table_name)
      else:
        # Extract label and frequency
        sql = ('SELECT {name} as token, count(*) as count '
               'FROM {table} '
               'WHERE {name} IS NOT NULL '
               'GROUP BY {name} '
               'ORDER BY count DESC, token ASC').format(
                   name=col_name, table=table_name)

      df = _execute_sql(sql, table)

      # Save the vocab
      string_buff = six.StringIO()
      df.to_csv(string_buff, index=False, header=False)
      file_io.write_string_to_file(
          os.path.join(output_dir, VOCAB_ANALYSIS_FILE % col_name),
          string_buff.getvalue())
      numerical_vocab_stats[col_name] = {'vocab_size': len(df)}

      # Free memory
      del string_buff
      del df
    elif transform in NUMERIC_TRANSFORMS:
      # Get min/max/average
      sql = ('SELECT max({name}) as max_value, min({name}) as min_value, '
             'avg({name}) as avg_value from {table}').format(
                 name=col_name, table=table_name)
      df = _execute_sql(sql, table)
      numerical_vocab_stats[col_name] = {'min': df.iloc[0]['min_value'],
                                         'max': df.iloc[0]['max_value'],
                                         'mean': df.iloc[0]['avg_value']}
    elif transform == IMAGE_TRANSFORM:
      pass  # No cloud analysis is needed for image columns.
    elif transform == KEY_TRANSFORM:
      pass  # No cloud analysis is needed for key columns.
    else:
      raise ValueError('Unknown transform %s' % transform)

  # Get the number of examples.
  sql = 'SELECT count(*) as num_examples from {table}'.format(table=table_name)
  df = _execute_sql(sql, table)
  num_examples = df.iloc[0]['num_examples']

  # Write the stats file.
  stats = {'column_stats': numerical_vocab_stats, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
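# A minimal usage sketch for this version of run_cloud_analysis, which takes
# a features dict keyed by column name. All paths are hypothetical, and the
# 'scale' / 'bag_of_words' transform names are assumed members of
# NUMERIC_TRANSFORMS / TEXT_TRANSFORMS in this module.
schema = [{'name': 'target', 'type': 'STRING'},
          {'name': 'num_col', 'type': 'FLOAT'},
          {'name': 'text_col', 'type': 'STRING'}]
features = {'target': {'transform': TARGET_TRANSFORM},
            'num_col': {'transform': 'scale'},          # numeric transform
            'text_col': {'transform': 'bag_of_words'}}  # text transform
run_cloud_analysis(output_dir='gs://my-bucket/analysis',
                   csv_file_pattern='gs://my-bucket/data*.csv',
                   bigquery_table=None,
                   schema=schema,
                   features=features)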