def _pipeline_cell(args, cell_body):
  """Implements the BigQuery cell magic used to validate, execute or deploy BQ pipelines.

  The supported syntax is:

    %%bigquery pipeline [-q|--sql <query identifier>] <other args> <action>
    [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the arguments following '%bigquery pipeline'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The QueryResultsTable
  """
  if args['action'] == 'deploy':
    raise Exception('Deploying a pipeline is not yet supported')

  # Build an environment containing only the UDF objects from the notebook, so that
  # queries in the pipeline can reference them.
  env = {}
  for key, value in _utils.notebook_environment().iteritems():
    if isinstance(value, gcp.bigquery._udf.UDF):
      env[key] = value

  query = _get_query_argument(args, cell_body, env)
  if args['verbose']:
    print query.sql
  if args['action'] == 'dryrun':
    print query.sql
    result = query.execute_dry_run()
    return gcp.bigquery._query_stats.QueryStats(total_bytes=result['totalBytesProcessed'],
                                                is_cached=result['cacheHit'])
  if args['action'] == 'run':
    return query.execute(args['target'], table_mode=args['mode'], use_cache=not args['nocache'],
                         allow_large_results=args['large']).results
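# For illustration, a hypothetical notebook cell exercising the pipeline magic above.
# 'logs_query' is an assumed name for a Query object defined earlier in the notebook;
# only the -q|--sql flag is documented in the docstring, the remaining flags are read
# from 'args' ('target', 'mode', 'nocache', 'large') and their exact spellings are
# defined by the argument parser elsewhere in this module:
#
#   %%bigquery pipeline -q logs_query run
#
# With 'dryrun' in place of 'run', the cell prints the SQL and returns a QueryStats
# object instead of executing the query.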
def bigquery(line, cell=None):
  """Implements the bigquery cell magic for ipython notebooks.

  The supported syntax is:

    %%bigquery <command> [<args>]
    <cell>

  or:

    %bigquery <command> [<args>]

  Use %bigquery --help for a list of commands, or %bigquery <command> --help for help
  on a specific command.

  Args:
    line: the magic line.
    cell: the body of the notebook cell.
  Returns:
    The result of processing the magic.
  """
  namespace = {}
  if line.find('$') >= 0:
    # We likely have variables to expand; get the appropriate context.
    namespace = _utils.notebook_environment()
  return _utils.handle_magic_line(line, cell, _bigquery_parser, namespace=namespace)
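# A minimal sketch of the two invocation forms handled by the dispatcher above. A '$'
# in the magic line triggers variable expansion against the notebook environment,
# which is why the namespace is only fetched when one is present ('my_query' is a
# hypothetical notebook variable):
#
#   %bigquery dryrun -q $my_query      # line magic; '$my_query' resolves via namespace
#
#   %%bigquery execute                 # cell magic; the cell body follows
#   SELECT 42 AS answer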
def _create_cell(args, cell_body):
  """Implements the BigQuery cell magic used to create datasets and tables.

  The supported syntax is:

    %%bigquery create dataset -n|--name <name> [-f|--friendly <friendlyname>]
    [<description>]

  or:

    %%bigquery create table -n|--name <tablename> [--overwrite]
    [<YAML or JSON cell_body defining schema to use for tables>]

  Args:
    args: the arguments following '%bigquery create <command>'.
    cell_body: optional contents of the cell, interpreted as a dataset description
        or a table schema.
  """
  if args['command'] == 'dataset':
    try:
      gcp.bigquery.DataSet(args['name']).create(friendly_name=args['friendly'],
                                                description=cell_body)
    except Exception as e:
      print 'Failed to create dataset %s: %s' % (args['name'], e)
  else:
    if cell_body is None:
      print 'Failed to create %s: no schema specified' % args['name']
    else:
      try:
        record = _utils.parse_config(cell_body, _utils.notebook_environment(), as_dict=False)
        schema = gcp.bigquery.Schema(record)
        gcp.bigquery.Table(args['name']).create(schema=schema, overwrite=args['overwrite'])
      except Exception as e:
        print 'Failed to create table %s: %s' % (args['name'], e)
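# An illustrative pair of cells for the create magic. The table schema is the YAML
# form consumed by _utils.parse_config; the dataset, table and field names here are
# assumptions, not names from this module:
#
#   %%bigquery create dataset -n mydataset -f "Request logs"
#   A dataset holding raw request logs.
#
#   %%bigquery create table -n mydataset.logs --overwrite
#   - name: timestamp
#     type: TIMESTAMP
#   - name: status
#     type: INTEGER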
def _sample_cell(args, cell_body):
  """Implements the bigquery sample cell magic for ipython notebooks.

  Args:
    args: the optional arguments following '%%bigquery sample'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The results of executing the sampling query, or a profile of the sample data.
  """
  env = _utils.notebook_environment()
  query = None
  table = None
  view = None
  if args['query']:
    query = _get_query_argument(args, cell_body, env)
  elif args['table']:
    table = _get_table(args['table'])
  elif args['view']:
    view = _utils.get_notebook_item(args['view'])
    if not isinstance(view, gcp.bigquery.View):
      raise Exception('%s is not a view' % args['view'])
  else:
    query = gcp.bigquery.Query(cell_body, values=env)

  count = args['count']
  method = args['method']
  if method == 'random':
    sampling = gcp.bigquery.Sampling.random(percent=args['percent'], count=count)
  elif method == 'hashed':
    sampling = gcp.bigquery.Sampling.hashed(field_name=args['field'], percent=args['percent'],
                                            count=count)
  elif method == 'sorted':
    ascending = args['order'] == 'ascending'
    sampling = gcp.bigquery.Sampling.sorted(args['field'], ascending=ascending, count=count)
  else:
    # 'limit' and any unrecognized method fall back to the default LIMIT-based sampling.
    sampling = gcp.bigquery.Sampling.default(count=count)

  if query:
    results = query.sample(sampling=sampling)
  elif view:
    results = view.sample(sampling=sampling)
  else:
    results = table.sample(sampling=sampling)
  if args['verbose']:
    print results.sql
  if args['profile']:
    return _utils.profile_df(results.to_dataframe())
  else:
    return results
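# Example cells for the sampling paths above. Each --method value selects one of the
# gcp.bigquery.Sampling constructors; the flag spellings are inferred from the args
# keys read in _sample_cell and the table and field names are hypothetical:
#
#   %%bigquery sample --table mydataset.logs --method random --percent 5
#
#   %%bigquery sample --method hashed --field user_id --count 100
#   SELECT user_id, status FROM [mydataset.logs]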
def _execute_cell(args, cell_body):
  """Implements the BigQuery cell magic used to execute BQ queries.

  The supported syntax is:

    %%bigquery execute [-q|--sql <query identifier>] <other args>
    [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the arguments following '%bigquery execute'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The QueryResultsTable
  """
  query = _get_query_argument(args, cell_body, _utils.notebook_environment())
  if args['verbose']:
    print query.sql
  return query.execute(args['target'], table_mode=args['mode'], use_cache=not args['nocache'],
                       allow_large_results=args['large']).results
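# A hypothetical execute cell. Inline SQL in the body is compiled against the
# notebook environment; alternatively, -q names an existing Query object. The
# 'target', 'mode', 'nocache' and 'large' args control the destination table and
# caching behavior of the resulting query job:
#
#   %%bigquery execute
#   SELECT status, COUNT(*) AS n FROM [mydataset.logs] GROUP BY status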
def _dryrun_cell(args, cell_body):
  """Implements the BigQuery cell magic used to dry run BQ queries.

  The supported syntax is:

    %%bigquery dryrun [-q|--sql <query identifier>]
    [<YAML or JSON cell_body or inline SQL>]

  Args:
    args: the arguments following '%bigquery dryrun'.
    cell_body: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The response wrapped in a DryRunStats object
  """
  query = _get_query_argument(args, cell_body, _utils.notebook_environment())
  if args['verbose']:
    print query.sql
  result = query.execute_dry_run()
  return gcp.bigquery._query_stats.QueryStats(total_bytes=result['totalBytesProcessed'],
                                              is_cached=result['cacheHit'])
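# A sketch of a dryrun cell (table name hypothetical). No query job is run; BigQuery
# returns the bytes that would be processed and whether the result is already cached,
# which _dryrun_cell wraps in a QueryStats object:
#
#   %%bigquery dryrun
#   SELECT COUNT(*) FROM [mydataset.logs]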
def _udf_cell(args, js):
  """Implements the bigquery_udf cell magic for ipython notebooks.

  The supported syntax is:

    %%bigquery udf --module <var>
    <js function>

  Args:
    args: the optional arguments following '%%bigquery udf'.
    js: the UDF declaration (inputs and outputs) and implementation in javascript.
  Returns:
    None. The resulting UDF object is bound to <var> in the notebook environment.
  """
  variable_name = args['module']
  if not variable_name:
    raise Exception('Declaration must be of the form %%bigquery udf --module <variable name>')

  # Parse out the input and output specification.
  spec_pattern = r'\{\{([^}]+)\}\}'
  spec_part_pattern = r'[a-z_][a-z0-9_]*'

  specs = re.findall(spec_pattern, js)
  if len(specs) < 2:
    raise Exception('The JavaScript must declare the input row and output emitter parameters '
                    'using valid jsdoc format comments.\n'
                    'The input row param declaration must be typed as {{field:type, field2:type}} '
                    'and the output emitter param declaration must be typed as '
                    'function({{field:type, field2:type}}).')

  inputs = []
  input_spec_parts = re.findall(spec_part_pattern, specs[0], flags=re.IGNORECASE)
  if len(input_spec_parts) % 2 != 0:
    raise Exception('Invalid input row param declaration. The jsdoc type expression must '
                    'define an object with field and type pairs.')
  for n, t in zip(input_spec_parts[0::2], input_spec_parts[1::2]):
    inputs.append((n, t))

  outputs = []
  output_spec_parts = re.findall(spec_part_pattern, specs[1], flags=re.IGNORECASE)
  if len(output_spec_parts) % 2 != 0:
    raise Exception('Invalid output emitter param declaration. The jsdoc type expression must '
                    'define a function accepting an object with field and type pairs.')
  for n, t in zip(output_spec_parts[0::2], output_spec_parts[1::2]):
    outputs.append((n, t))

  # Look for imports. We use a non-standard @import keyword; we could alternatively use @requires.
  # Object names can contain any characters except \r and \n.
  import_pattern = r'@import[\s]+(gs://[a-z\d][a-z\d_\.\-]*[a-z\d]/[^\n\r]+)'
  imports = re.findall(import_pattern, js)

  # Split the cell if necessary. We look for a 'function(' with no name and a header comment
  # block with @param and assume this is the primary function, up to a closing '}' at the start
  # of the line. The remaining cell content is used as support code. Note the whitespace
  # around the function keyword is matched with \s*, so 'function (' is also accepted.
  split_pattern = r'(.*)(/\*.*?@param.*?@param.*?\*/\s*\n\s*function\s*\(.*?^}\n?)(.*)'
  parts = re.match(split_pattern, js, re.MULTILINE | re.DOTALL)
  support_code = ''
  if parts:
    support_code = (parts.group(1) + parts.group(3)).strip()
    if len(support_code):
      js = parts.group(2)

  # Finally build the UDF object.
  udf = gcp.bigquery.UDF(inputs, outputs, variable_name, js, support_code, imports)
  _utils.notebook_environment()[variable_name] = udf
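# A sketch of a UDF cell that satisfies the parsing above: the jsdoc comment carries
# two {{field:type, ...}} specs (the input row first, then the output emitter), and
# the unnamed 'function(' following the @param comment marks the primary function.
# The module name, field names and body are illustrative:
#
#   %%bigquery udf --module extract_status
#   /**
#    * @param {{timestamp: timestamp, line: string}} r the input row
#    * @param function({{status: integer}}) emitter the output emitter
#    */
#   function(r, emitter) {
#     emitter({status: parseInt(r.line.substr(9, 3))});
#   }
#
# After the cell runs, 'extract_status' is a gcp.bigquery.UDF in the notebook
# environment and can be called from SQL in later query cells.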