def run_analysis(args):
  """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError: if the schema contains unknown types.
  """
  import google.datalab.bigquery as bq
  if args.bigquery_table:
    table = bq.Table(args.bigquery_table)
    schema_list = table.schema._bq_schema
  else:
    schema_list = json.loads(
        file_io.read_file_to_string(args.schema_file).decode())
    table = bq.ExternalDataSource(source=args.input_file_pattern,
                                  schema=bq.Schema(schema_list))

  # Check the schema is supported.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type not in ('string', 'integer', 'float'):
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  run_numerical_analysis(table, schema_list, args)
  run_categorical_analysis(table, schema_list, args)

  # Save a copy of the schema to the output location.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema_list, indent=2, separators=(',', ': ')))
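# A minimal usage sketch for run_analysis, assuming it is driven by argparse.
# The attribute names below are exactly the ones run_analysis reads; the
# bucket paths and values are hypothetical.
import argparse

args = argparse.Namespace(
    bigquery_table=None,  # or 'project_id.dataset_name.table_name'
    schema_file='gs://my-bucket/schema.json',        # hypothetical path
    input_file_pattern='gs://my-bucket/data*.csv',   # hypothetical pattern
    output_dir='gs://my-bucket/analysis')            # hypothetical folder
run_analysis(args)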
def _datasource_cell(args, cell_body):
  """Implements the BigQuery datasource cell magic for ipython notebooks.

  The supported syntax is
    %%bq datasource --name <var> --paths <url> [--format <CSV|JSON>]
    <schema>

  Args:
    args: the optional arguments following '%%bq datasource'
    cell_body: the datasource's schema in json/yaml
  """
  name = args['name']
  paths = args['paths']
  data_format = (args['format'] or 'CSV').lower()
  compressed = args['compressed'] or False

  # Get the source schema from the cell body
  record = google.datalab.utils.commands.parse_config(
      cell_body, google.datalab.utils.commands.notebook_environment(),
      as_dict=False)
  jsonschema.validate(record, BigQuerySchema.TABLE_SCHEMA_SCHEMA)
  schema = bigquery.Schema(record['schema'])

  # Finally build the datasource object
  datasource = bigquery.ExternalDataSource(source=paths,
                                           source_format=data_format,
                                           compressed=compressed,
                                           schema=schema)
  google.datalab.utils.commands.notebook_environment()[name] = datasource
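# A hedged usage sketch of the '%%bq datasource' cell magic handled above.
# In a Datalab notebook cell (variable name, paths, and columns are
# hypothetical), the cell body is the YAML/JSON schema that gets validated
# against BigQuerySchema.TABLE_SCHEMA_SCHEMA:
#
#   %%bq datasource --name taxi_trips --paths gs://my-bucket/trips*.csv --format CSV
#   schema:
#     - name: pickup_time
#       type: TIMESTAMP
#     - name: fare
#       type: FLOAT
#
# After the cell runs, the notebook variable 'taxi_trips' holds the
# bigquery.ExternalDataSource and can be referenced by later queries.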
def _create_external_data_source(self, skip_header_rows):
  import google.datalab.bigquery as bq

  df = self.browse(1, None)

  # Read each column as STRING because we only want to sample rows.
  schema_train = bq.Schema([{'name': name, 'type': 'STRING'}
                            for name in df.keys()])

  options = bq.CSVOptions(skip_leading_rows=(1 if skip_header_rows else 0))
  return bq.ExternalDataSource(self.path,
                               csv_options=options,
                               schema=schema_train,
                               max_bad_records=0)
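# A minimal standalone sketch of the same idea, with a hypothetical CSV path
# and column names: declare every column as STRING so BigQuery never fails on
# type coercion while sampling rows from the raw file.
import google.datalab.bigquery as bq

column_names = ['id', 'text', 'score']  # hypothetical header
schema = bq.Schema([{'name': n, 'type': 'STRING'} for n in column_names])
options = bq.CSVOptions(skip_leading_rows=1)  # skip the header row
source = bq.ExternalDataSource('gs://my-bucket/data.csv',
                               csv_options=options,
                               schema=schema,
                               max_bad_records=0)
# 'data' in the SQL refers to the external source via the data_sources map.
sample = bq.Query('SELECT * FROM data LIMIT 5',
                  data_sources={'data': source}).execute().result()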
def execute(self, context):
  if self.data_source:
    kwargs = {}
    if self.csv_options:
      csv_kwargs = {}
      if 'delimiter' in self.csv_options:
        csv_kwargs['delimiter'] = self.csv_options['delimiter']
      if 'skip' in self.csv_options:
        csv_kwargs['skip_leading_rows'] = self.csv_options['skip']
      if 'strict' in self.csv_options:
        csv_kwargs['allow_jagged_rows'] = self.csv_options['strict']
      if 'quote' in self.csv_options:
        csv_kwargs['quote'] = self.csv_options['quote']
      kwargs['csv_options'] = bq.CSVOptions(**csv_kwargs)

    if self.format:
      kwargs['source_format'] = self.format

    if self.max_bad_records:
      kwargs['max_bad_records'] = self.max_bad_records

    external_data_source = bq.ExternalDataSource(source=self.path,
                                                 schema=bq.Schema(self.schema),
                                                 **kwargs)
    query = bq.Query(sql=self.sql,
                     data_sources={self.data_source: external_data_source})
  else:
    query = bq.Query(sql=self.sql)

  # use_cache is False since this is most likely the case in pipeline
  # scenarios. allow_large_results can be True only if table is specified
  # (i.e. when it's not None).
  kwargs = {}
  if self.mode is not None:
    kwargs['mode'] = self.mode
  output_options = bq.QueryOutput.table(
      name=self.table, use_cache=False,
      allow_large_results=self.table is not None, **kwargs)
  query_params = bq.Query.get_query_parameters(self.parameters)
  job = query.execute(output_options=output_options,
                      query_params=query_params)

  # Returning the table name here makes it available to downstream task
  # instances.
  return {'table': job.result().full_name}
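# A hedged sketch of the csv_options dict this operator consumes. Only the
# four keys below are read; the values shown are hypothetical, and the
# comments describe the CSVOptions argument each key is mapped onto.
csv_options = {
    'delimiter': ',',  # passed through as CSVOptions(delimiter=...)
    'skip': 1,         # becomes CSVOptions(skip_leading_rows=...)
    'strict': False,   # becomes CSVOptions(allow_jagged_rows=...)
    'quote': '"',      # passed through as CSVOptions(quote=...)
}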
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema,
                       inverted_features):
  """Use BigQuery to analyze input data.

  Only one of csv_file_pattern or bigquery_table should be non-None.

  Args:
    output_dir: output folder
    csv_file_pattern: list of csv file paths, may contain wildcards
    bigquery_table: project_id.dataset_name.table_name
    schema: schema list
    inverted_features: inverted_features dict
  """

  def _execute_sql(sql, table):
    """Runs a BigQuery job and downloads the results into local memory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
    import google.datalab.bigquery as bq
    if isinstance(table, bq.ExternalDataSource):
      query = bq.Query(sql, data_sources={'csv_table': table})
    else:
      query = bq.Query(sql)
    return query.execute().result().to_dataframe()

  import google.datalab.bigquery as bq
  if bigquery_table:
    table_name = '`%s`' % bigquery_table
    table = None
  else:
    table_name = 'csv_table'
    table = bq.ExternalDataSource(source=csv_file_pattern,
                                  schema=bq.Schema(schema))

  # Make a copy of inverted_features and update the target transform to be
  # identity or one hot depending on the schema.
  inverted_features_target = copy.deepcopy(inverted_features)
  for name, transform_set in six.iteritems(inverted_features_target):
    if transform_set == set([constant.TARGET_TRANSFORM]):
      target_schema = next(col['type'].lower() for col in schema
                           if col['name'] == name)
      if target_schema in constant.NUMERIC_SCHEMA:
        inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
      else:
        inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

  numerical_vocab_stats = {}

  for col_name, transform_set in six.iteritems(inverted_features_target):
    sys.stdout.write('Analyzing column %s...' % col_name)
    sys.stdout.flush()

    # All transforms in transform_set require the same analysis, so look at
    # the first transform.
    transform_name = next(iter(transform_set))
    if (transform_name in constant.CATEGORICAL_TRANSFORMS or
        transform_name in constant.TEXT_TRANSFORMS):
      if transform_name in constant.TEXT_TRANSFORMS:
        # Split strings on space, then extract labels and how many rows each
        # token is in. This is done by making two temp tables:
        #   SplitTable: each text row is made into an array of strings. The
        #       array may contain repeated tokens.
        #   TokenTable: SplitTable with repeated tokens removed per row.
        # Then to flatten the arrays, TokenTable has to be joined with itself.
        # See the sections 'Flattening Arrays' and 'Filtering Arrays' at
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays
        sql = ('WITH SplitTable AS '
               '  (SELECT SPLIT({name}, \' \') as token_array FROM {table}), '
               'TokenTable AS '
               '  (SELECT ARRAY(SELECT DISTINCT x '
               '                FROM UNNEST(token_array) AS x) '
               '       AS unique_tokens_per_row '
               '   FROM SplitTable) '
               'SELECT token, COUNT(token) as token_count '
               'FROM TokenTable '
               'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token '
               'WHERE LENGTH(token) > 0 '
               'GROUP BY token '
               'ORDER BY token_count DESC, token ASC').format(
                   name=col_name, table=table_name)
      else:
        # Extract label and frequency
        sql = ('SELECT {name} as token, count(*) as count '
               'FROM {table} '
               'WHERE {name} IS NOT NULL '
               'GROUP BY {name} '
               'ORDER BY count DESC, token ASC').format(
                   name=col_name, table=table_name)

      df = _execute_sql(sql, table)

      # Save the vocab
      string_buff = six.StringIO()
      df.to_csv(string_buff, index=False, header=False)
      file_io.write_string_to_file(
          os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % col_name),
          string_buff.getvalue())
      numerical_vocab_stats[col_name] = {'vocab_size': len(df)}

      # Free memory
      del string_buff
      del df
    elif transform_name in constant.NUMERIC_TRANSFORMS:
      # Get min/max/average
      sql = ('SELECT max({name}) as max_value, min({name}) as min_value, '
             'avg({name}) as avg_value from {table}').format(
                 name=col_name, table=table_name)
      df = _execute_sql(sql, table)
      numerical_vocab_stats[col_name] = {'min': df.iloc[0]['min_value'],
                                         'max': df.iloc[0]['max_value'],
                                         'mean': df.iloc[0]['avg_value']}
    sys.stdout.write('done.\n')
    sys.stdout.flush()

  # Get the number of examples.
  sql = 'SELECT count(*) as num_examples from {table}'.format(table=table_name)
  df = _execute_sql(sql, table)
  num_examples = df.iloc[0]['num_examples']

  # Write the stats file.
  stats = {'column_stats': numerical_vocab_stats, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, constant.STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
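# A worked example (hypothetical data) of the text-transform SQL above.
# Given a text column with three rows:
#   'red blue'
#   'red red green'
#   'blue'
# SplitTable holds ['red','blue'], ['red','red','green'], ['blue'];
# TokenTable de-duplicates per row to ['red','blue'], ['red','green'],
# ['blue']; and the final CROSS JOIN UNNEST + GROUP BY counts rows per token:
#   blue  2
#   red   2
#   green 1
# Note token_count is a document frequency, not a total token frequency:
# 'red' appears three times overall but in only two rows.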
def run_cloud_analysis(output_dir, csv_file_pattern, bigquery_table, schema,
                       features):
  """Use BigQuery to analyze input data.

  Only one of csv_file_pattern or bigquery_table should be non-None.

  Args:
    output_dir: output folder
    csv_file_pattern: csv file path, may contain wildcards
    bigquery_table: project_id.dataset_name.table_name
    schema: schema list
    features: features dict
  """

  def _execute_sql(sql, table):
    """Runs a BigQuery job and downloads the results into local memory.

    Args:
      sql: a SQL string
      table: bq.ExternalDataSource or bq.Table

    Returns:
      A Pandas dataframe.
    """
    import google.datalab.bigquery as bq
    if isinstance(table, bq.ExternalDataSource):
      query = bq.Query(sql, data_sources={'csv_table': table})
    else:
      query = bq.Query(sql)
    return query.execute().result().to_dataframe()

  import google.datalab.bigquery as bq
  if bigquery_table:
    table_name = '`%s`' % bigquery_table
    table = None
  else:
    table_name = 'csv_table'
    table = bq.ExternalDataSource(source=csv_file_pattern,
                                  schema=bq.Schema(schema))

  numerical_vocab_stats = {}

  for col_schema in schema:
    col_name = col_schema['name']
    col_type = col_schema['type'].lower()
    transform = features[col_name]['transform']

    # Map the target transform into one_hot or identity.
    if transform == TARGET_TRANSFORM:
      if col_type == STRING_SCHEMA:
        transform = ONE_HOT_TRANSFORM
      elif col_type in NUMERIC_SCHEMA:
        transform = IDENTITY_TRANSFORM
      else:
        raise ValueError('Unknown schema type %s' % col_type)

    if transform in (TEXT_TRANSFORMS + CATEGORICAL_TRANSFORMS):
      if transform in TEXT_TRANSFORMS:
        # Split strings on space, then extract labels and how many rows each
        # token is in. This is done by making two temp tables:
        #   SplitTable: each text row is made into an array of strings. The
        #       array may contain repeated tokens.
        #   TokenTable: SplitTable with repeated tokens removed per row.
        # Then to flatten the arrays, TokenTable has to be joined with itself.
        # See the sections 'Flattening Arrays' and 'Filtering Arrays' at
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays
        sql = ('WITH SplitTable AS '
               '  (SELECT SPLIT({name}, \' \') as token_array FROM {table}), '
               'TokenTable AS '
               '  (SELECT ARRAY(SELECT DISTINCT x '
               '                FROM UNNEST(token_array) AS x) '
               '       AS unique_tokens_per_row '
               '   FROM SplitTable) '
               'SELECT token, COUNT(token) as token_count '
               'FROM TokenTable '
               'CROSS JOIN UNNEST(TokenTable.unique_tokens_per_row) as token '
               'WHERE LENGTH(token) > 0 '
               'GROUP BY token '
               'ORDER BY token_count DESC, token ASC').format(
                   name=col_name, table=table_name)
      else:
        # Extract label and frequency
        sql = ('SELECT {name} as token, count(*) as count '
               'FROM {table} '
               'WHERE {name} IS NOT NULL '
               'GROUP BY {name} '
               'ORDER BY count DESC, token ASC').format(
                   name=col_name, table=table_name)

      df = _execute_sql(sql, table)

      # Save the vocab
      string_buff = six.StringIO()
      df.to_csv(string_buff, index=False, header=False)
      file_io.write_string_to_file(
          os.path.join(output_dir, VOCAB_ANALYSIS_FILE % col_name),
          string_buff.getvalue())
      numerical_vocab_stats[col_name] = {'vocab_size': len(df)}

      # Free memory
      del string_buff
      del df
    elif transform in NUMERIC_TRANSFORMS:
      # Get min/max/average
      sql = ('SELECT max({name}) as max_value, min({name}) as min_value, '
             'avg({name}) as avg_value from {table}').format(
                 name=col_name, table=table_name)
      df = _execute_sql(sql, table)
      numerical_vocab_stats[col_name] = {'min': df.iloc[0]['min_value'],
                                         'max': df.iloc[0]['max_value'],
                                         'mean': df.iloc[0]['avg_value']}
    elif transform == IMAGE_TRANSFORM:
      pass  # No cloud analysis is needed for image columns.
    elif transform == KEY_TRANSFORM:
      pass  # No cloud analysis is needed for key columns.
    else:
      raise ValueError('Unknown transform %s' % transform)

  # Get the number of examples.
  sql = 'SELECT count(*) as num_examples from {table}'.format(table=table_name)
  df = _execute_sql(sql, table)
  num_examples = df.iloc[0]['num_examples']

  # Write the stats file.
  stats = {'column_stats': numerical_vocab_stats, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
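# A minimal usage sketch for this version of run_cloud_analysis, which takes
# a features dict keyed by column name. All paths are hypothetical, and the
# 'scale' / 'bag_of_words' transform names are assumed members of
# NUMERIC_TRANSFORMS / TEXT_TRANSFORMS in this module.
schema = [{'name': 'target', 'type': 'STRING'},
          {'name': 'num_col', 'type': 'FLOAT'},
          {'name': 'text_col', 'type': 'STRING'}]
features = {'target': {'transform': TARGET_TRANSFORM},
            'num_col': {'transform': 'scale'},          # numeric transform
            'text_col': {'transform': 'bag_of_words'}}  # text transform
run_cloud_analysis(output_dir='gs://my-bucket/analysis',
                   csv_file_pattern='gs://my-bucket/data*.csv',
                   bigquery_table=None,
                   schema=schema,
                   features=features)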