def _extract_line(args):
    """Implements the BigQuery extract magic used to extract table data to GCS.

    The supported syntax is:

        %bigquery extract -S|--source <table> -D|--destination <url> <other_args>

    Args:
      args: the arguments following '%bigquery extract'. The entries read here
          are 'source', 'destination', 'format', 'compress', 'delimiter' and
          'header'.

    Returns:
      Nothing. Success is silent; every failure is reported by raising.

    Raises:
      Exception: if the source name resolves to nothing, if it is a Table that
          does not exist, if the extract job failed, or if the job completed
          with errors.
    """
    name = args['source']

    # A notebook variable takes precedence; otherwise treat the name as a
    # BigQuery table name.
    source = _utils.get_notebook_item(name) or _get_table(name)

    if not source:
        raise Exception('No source named %s found' % name)
    if isinstance(source, gcp.bigquery.Table) and not source.exists():
        raise Exception('Table %s does not exist' % name)

    destination = args['destination']
    if args['format'] == 'csv':
        export_format = 'CSV'
    else:
        export_format = 'NEWLINE_DELIMITED_JSON'

    job = source.extract(destination,
                         format=export_format,
                         compress=args['compress'],
                         csv_delimiter=args['delimiter'],
                         csv_header=args['header'])

    # A fatal failure is reported in preference to non-fatal errors.
    if job.failed:
        raise Exception('Extract failed: %s' % str(job.fatal_error))
    if job.errors:
        raise Exception('Extract completed with errors: %s' % str(job.errors))
def _get_query_argument(args, cell, env):
    """Resolve the query a cell magic should operate on.

    The query is named by args['query']. If that name refers to a BigQuery
    Query it is returned as is. Otherwise the referenced item (a SqlModule or
    SqlStatement) may contain variable references; those are resolved through
    gcp.data.SqlModule, and the configuration parsed from the cell is then
    layered on top of the resulting environment.

    When no 'query' argument is present the cell itself is taken to be inline
    SQL.

    Args:
      args: the dictionary of magic arguments.
      cell: the cell contents; variable value overrides when args has a 'query'
          value, or inline SQL otherwise.
      env: a dictionary that is used for looking up variable values.

    Returns:
      A Query object.

    Raises:
      Exception: if there is neither a 'query' argument nor a string cell body.
    """
    query_name = args.get('query')

    if query_name is None:
        # No named query, so the cell has to carry the SQL text itself.
        if isinstance(cell, basestring):
            return gcp.bigquery.Query(cell, values=env)
        raise Exception('Expected a --query argument or inline SQL')

    referenced = _utils.get_notebook_item(query_name)
    if isinstance(referenced, gcp.bigquery.Query):
        # Queries are already expanded; nothing left to resolve.
        return referenced

    # Expand the referenced item into a BQ Query.
    overrides = _utils.parse_config(cell, env)
    statement, resolved_env = gcp.data.SqlModule.get_sql_statement_with_environment(
        referenced, overrides)
    if cell:
        # The parsed config acts both as a fallback and as an override.
        resolved_env.update(overrides)
    return gcp.bigquery.Query(statement, values=resolved_env)
def _sample_cell(args, cell_body):
    """Implements the bigquery sample cell magic for ipython notebooks.

    The data source is chosen from the first of args['query'], args['table']
    and args['view'] that is set; if none is, the cell body is used as inline
    SQL.

    Args:
      args: the optional arguments following '%%bigquery sample'. The entries
          read here are 'query', 'table', 'view', 'count', 'method', 'percent',
          'field', 'order', 'verbose' and 'profile'.
      cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.

    Returns:
      The results of executing the sampling query, or a profile of the sample
      data when args['profile'] is set.

    Raises:
      Exception: if args['table'] names a table that cannot be found, or if
          args['view'] does not refer to a View.
    """
    env = _utils.notebook_environment()
    query = None
    table = None
    view = None

    if args['query']:
        query = _get_query_argument(args, cell_body, env)
    elif args['table']:
        table = _get_table(args['table'])
        if table is None:
            # _get_table returns None for a missing table; fail here with a
            # clear message instead of an AttributeError on None further down.
            raise Exception('Table %s does not exist' % args['table'])
    elif args['view']:
        view = _utils.get_notebook_item(args['view'])
        if not isinstance(view, gcp.bigquery.View):
            raise Exception('%s is not a view' % args['view'])
    else:
        query = gcp.bigquery.Query(cell_body, values=env)

    count = args['count']
    method = args['method']
    if method == 'random':
        sampling = gcp.bigquery.Sampling.random(percent=args['percent'], count=count)
    elif method == 'hashed':
        sampling = gcp.bigquery.Sampling.hashed(field_name=args['field'],
                                                percent=args['percent'],
                                                count=count)
    elif method == 'sorted':
        ascending = args['order'] == 'ascending'
        sampling = gcp.bigquery.Sampling.sorted(args['field'],
                                                ascending=ascending,
                                                count=count)
    else:
        # 'limit' and any unrecognised method both use the default sampling.
        sampling = gcp.bigquery.Sampling.default(count=count)

    if query:
        results = query.sample(sampling=sampling)
    elif view:
        results = view.sample(sampling=sampling)
    else:
        results = table.sample(sampling=sampling)

    if args['verbose']:
        # Parenthesised single argument: prints identically as a Python 2
        # print statement and as a print() call.
        print(results.sql)

    if args['profile']:
        return _utils.profile_df(results.to_dataframe())
    return results
def _get_schema(name):
    """Given a variable or table name, get the Schema if it exists.

    Args:
      name: the name of a notebook variable or of a BigQuery table.

    Returns:
      The item itself when it is a Schema, the item's 'schema' attribute when
      that attribute is a Schema, and None otherwise.
    """
    # Prefer a notebook variable; fall back to a table lookup by name.
    item = _utils.get_notebook_item(name) or _get_table(name)

    if isinstance(item, gcp.bigquery.Schema):
        return item

    carries_schema = (hasattr(item, 'schema') and
                      isinstance(item.schema, gcp.bigquery._schema.Schema))
    return item.schema if carries_schema else None
def _get_table(name):
    """Given a variable or table name, get a Table if it exists.

    Args:
      name: the name of the Table or a variable referencing the Table.

    Returns:
      The Table, if found; None otherwise.
    """
    # A notebook variable that already holds a Table wins outright.
    candidate = _utils.get_notebook_item(name)
    if isinstance(candidate, gcp.bigquery.Table):
        return candidate

    # Otherwise treat the name as a BQ table name, consulting the cache first.
    try:
        return _table_cache[name]
    except KeyError:
        pass

    fresh = gcp.bigquery.Table(name)
    if not fresh.exists():
        return None

    # Only tables known to exist are remembered.
    _table_cache[name] = fresh
    return fresh
def _render_chart_control(control, control_id, label, value):
    """Render the HTML for a single chart control.

    Args:
      control: the dictionary describing the control ('type', 'choices', 'min',
          'max' and 'step' are read here, depending on the control type).
      control_id: the DOM id to give the control.
      label: the text shown next to the control.
      value: the initial value, or None when neither the control nor the
          defaults supplied one.

    Returns:
      A (html, registered_id, value) tuple: the control markup, the id to hand
      to the charting script (for 'set' controls this is the id with the number
      of choices appended), and the value after type-specific defaulting.

    Raises:
      Exception: if the control type is unknown or its settings are invalid.
    """
    # The user should usually specify the type, but default to 'textbox' for
    # strings and 'set' for lists.
    if isinstance(value, basestring):
        default_kind = 'textbox'
    elif isinstance(value, list):
        default_kind = 'set'
    else:
        default_kind = None
    kind = control.get('type', default_kind)
    registered_id = control_id

    # NOTE(review): labels, choices and values are interpolated into the markup
    # without HTML escaping; confirm that is intended for notebook-authored input.
    if kind == 'picker':
        choices = control.get('choices', value)
        if not isinstance(choices, list) or len(choices) == 0:
            raise Exception('picker control must specify a nonempty set of choices')
        if value is None:
            value = choices[0]
        choices_html = ''.join(
            "<option value=\"%s\" %s>%s</option>" %
            (choice, ("selected=\"selected\"" if choice == value else ''), choice)
            for choice in choices)
        control_html = "{label}<select disabled id=\"{id}\">{choices}</select>".format(
            label=label, id=control_id, choices=choices_html)
    elif kind == 'set':
        # Multi-picker; implemented as checkboxes.
        # TODO(gram): consider using "name" property of the control to group
        # checkboxes. That way we can save the code of constructing and parsing
        # control Ids with sequential numbers in it. Multiple checkboxes can
        # share the same name.
        choices = control.get('choices', value)
        if not isinstance(choices, list) or len(choices) == 0:
            raise Exception('set control must specify a nonempty set of choices')
        if value is None:
            value = choices
        # The registered id carries the checkbox count.
        registered_id = '%s:%d' % (control_id, len(choices))
        # TODO(gram): we may want a 'Submit/Refresh button as we may not want to
        # rerun query on each checkbox change.
        item_template = (
            ' <div> <label> <input type="checkbox" id="{id}" value="{choice}"'
            ' {checked} disabled> {choice} </label> </div> ')
        choices_html = ''.join(
            item_template.format(id='%s:%d' % (control_id, index),
                                 choice=choice,
                                 checked="checked" if choice in value else '')
            for index, choice in enumerate(choices))
        control_html = "{label}<div>{choices}</div>".format(label=label,
                                                            choices=choices_html)
    elif kind == 'checkbox':
        control_html = (
            ' <label> <input type="checkbox" id="{id}" {checked} disabled>'
            ' {label} </label> ').format(label=label, id=control_id,
                                         checked="checked" if value else '')
    elif kind == 'slider':
        low = control.get('min', None)
        high = control.get('max', None)
        if low is None or high is None:
            raise Exception('slider control must specify a min and max value')
        if high <= low:
            raise Exception('slider control must specify a min value less than max value')
        # Integer bounds step by 1; otherwise split the range into ten steps.
        if isinstance(low, int) and isinstance(high, int):
            default_step = 1
        else:
            default_step = (high - low) / 10.0
        step = control.get('step', default_step)
        if value is None:
            value = low
        control_html = (
            ' {label} <input type="text" class="gchart-slider_value"'
            ' id="{id}_value" value="{value}" disabled/>'
            ' <input type="range" class="gchart-slider" id="{id}" min="{min}"'
            ' max="{max}" step="{step}" value="{value}" disabled/> ').format(
                label=label, id=control_id, value=value, min=low, max=high, step=step)
    elif kind == 'textbox':
        if value is None:
            value = ''
        control_html = "{label}<input type=\"text\" value=\"{value}\" id=\"{id}\" disabled/>".format(
            label=label, value=value, id=control_id)
    else:
        raise Exception(
            'Unknown control type %s (expected picker, slider, checkbox, textbox or set)' % kind)

    return control_html, registered_id, value


def _chart_cell(args, cell):
    """Implements the chart cell magic: renders a chart with optional controls.

    The cell body is parsed as chart options. An optional 'variables' entry in
    those options describes input controls (picker, set, checkbox, slider or
    textbox); it is removed from the options before they are passed to the
    charting script, and the controls' initial values form the environment used
    to fetch the first batch of data.

    Args:
      args: the arguments following the chart magic. The entries read here are
          'data' (the data source), 'fields' (defaults to '*' when empty) and
          'chart' (the chart type).
      cell: the cell contents, parsed as chart options via _utils.parse_config.

    Returns:
      An IPython.core.display.HTML object holding the chart container, the
      controls and the script that renders the chart.

    Raises:
      Exception: if the chart options do not parse to a dictionary, or if a
          control definition is invalid.
    """
    source = args['data']
    ipy = IPython.get_ipython()
    chart_options = _utils.parse_config(cell, ipy.user_ns)
    if chart_options is None:
        chart_options = {}
    elif not isinstance(chart_options, dict):
        raise Exception("Could not parse chart options")

    fields = args['fields'] if args['fields'] else '*'
    div_id = _html.Html.next_id()
    env = {}
    controls_html = ''
    controls_ids = []

    if 'variables' in chart_options:
        # Pop rather than read, to make sure GCharts doesn't see them.
        variables = chart_options.pop('variables')
        try:
            item = _utils.get_notebook_item(source)
            _, defaults = gcp.data.SqlModule.get_sql_statement_with_environment(item, '')
        except Exception:
            # Best effort: a source with no resolvable defaults is not an error.
            defaults = {}

        rendered_controls = []
        for varname, control in variables.items():
            label = control.get('label', varname)
            control_id = div_id + '__' + varname
            value = control.get('value', defaults.get(varname, None))
            control_html, registered_id, value = _render_chart_control(
                control, control_id, label, value)
            controls_ids.append(registered_id)
            env[varname] = value
            rendered_controls.append(
                "<div class=\"gchart-control\">{control}</div>\n".format(control=control_html))
        controls_html = "<div class=\"gchart-controls\">{controls}</div>".format(
            controls=''.join(rendered_controls))

    html_template = (
        ' <div class="bqgc-container"> {controls}'
        ' <div class="bqgc{extra_class}" id="{id}"> </div> </div>'
        ' <script>'
        " require(['extensions/charting', 'element!{id}',"
        " 'style!/static/extensions/charting.css'],"
        ' function(charts, dom) {{'
        " charts.render(dom, {{chartStyle:'{chart_type}', dataName:'{source}',"
        " fields:'{fields}'}}, {options}, {data}, {control_ids});"
        ' }} ); </script> ')

    chart_type = args['chart']
    # Paged tables fetch a first page of 25 rows; other charts pass -1.
    count = 25 if chart_type == 'paged_table' else -1
    data, _ = _utils.get_data(source, fields, env, 0, count)

    # TODO(gram): check if we need to augment env with user_ns
    return IPython.core.display.HTML(
        html_template.format(
            controls=controls_html,
            id=div_id,
            chart_type=chart_type,
            extra_class=" bqgc-controlled" if controls_html else '',
            source=_utils.get_data_source_index(source),
            fields=fields,
            options=json.dumps(chart_options, cls=gcp._util.JSONEncoder),
            data=json.dumps(data, cls=gcp._util.JSONEncoder),
            # Serialise as JSON so the ids form a valid JavaScript array; the
            # Python list repr can contain u'...' literals under Python 2.
            control_ids=json.dumps(controls_ids)))