Example #1
0
def _extract_line(args):
  """Handles the '%bigquery extract' line magic, which exports table data to GCS.

   The supported syntax is:

       %bigquery extract -S|--source <table> -D|--destination <url> <other_args>

  Args:
    args: the arguments following '%bigquery extract'. The 'source', 'destination',
        'format', 'compress', 'delimiter' and 'header' entries are read.
  Returns:
    Nothing; the function falls off the end when the extract job reports no errors.
  Raises:
    Exception: if no source with the given name is found, if the source is a Table that
        does not exist, or if the extract job failed or completed with errors.
  """
  source_name = args['source']
  # A notebook item with this name wins; otherwise treat the name as a table name.
  source = _utils.get_notebook_item(source_name) or _get_table(source_name)

  if not source:
    raise Exception('No source named %s found' % source_name)
  if isinstance(source, gcp.bigquery.Table) and not source.exists():
    raise Exception('Table %s does not exist' % source_name)

  destination = args['destination']
  # Anything other than 'csv' is exported as newline-delimited JSON.
  if args['format'] == 'csv':
    export_format = 'CSV'
  else:
    export_format = 'NEWLINE_DELIMITED_JSON'

  job = source.extract(destination,
                       format=export_format,
                       compress=args['compress'],
                       csv_delimiter=args['delimiter'],
                       csv_header=args['header'])
  if job.failed:
    raise Exception('Extract failed: %s' % str(job.fatal_error))
  if job.errors:
    raise Exception('Extract completed with errors: %s' % str(job.errors))
Example #2
0
def _extract_line(args):
  """Handles the '%bigquery extract' line magic, which exports table data to GCS.

   The supported syntax is:

       %bigquery extract -S|--source <table> -D|--destination <url> <other_args>

  Args:
    args: the arguments following '%bigquery extract'. The 'source', 'destination',
        'format', 'compress', 'delimiter' and 'header' entries are read.
  Returns:
    Nothing; the function falls off the end when the extract job reports no errors.
  Raises:
    Exception: if no source with the given name is found, if the source is a Table that
        does not exist, or if the extract job failed or completed with errors.
  """
  source_name = args['source']
  # A notebook item with this name wins; otherwise treat the name as a table name.
  source = _utils.get_notebook_item(source_name) or _get_table(source_name)

  if not source:
    raise Exception('No source named %s found' % source_name)
  if isinstance(source, gcp.bigquery.Table) and not source.exists():
    raise Exception('Table %s does not exist' % source_name)

  destination = args['destination']
  # Anything other than 'csv' is exported as newline-delimited JSON.
  if args['format'] == 'csv':
    export_format = 'CSV'
  else:
    export_format = 'NEWLINE_DELIMITED_JSON'

  job = source.extract(destination,
                       format=export_format,
                       compress=args['compress'],
                       csv_delimiter=args['delimiter'],
                       csv_header=args['header'])
  if job.failed:
    raise Exception('Extract failed: %s' % str(job.fatal_error))
  if job.errors:
    raise Exception('Extract completed with errors: %s' % str(job.errors))
Example #3
0
def _get_query_argument(args, cell, env):
  """ Resolve the query that a cell magic should operate on.

  When args carries a 'query' entry, that name is looked up as a notebook item. An item
  that is already a BQ Query is returned unchanged. Any other item is passed, together
  with the configuration parsed from the cell, to
  SqlModule.get_sql_statement_with_environment, which yields the statement and its
  variable values; when the cell is non-empty the parsed configuration is then applied
  on top of those values before the Query is built.

  When args has no 'query' entry, the cell itself is taken to be inline SQL.

  Args:
    args: the dictionary of magic arguments.
    cell: the cell contents; variable value overrides when args has a 'query' value,
        inline SQL otherwise.
    env: a dictionary that is used for looking up variable values.
  Returns:
    A Query object.
  Raises:
    Exception: if there is no 'query' argument and the cell is not a string.
  """
  query_name = args.get('query')

  if query_name is not None:
    resolved = _utils.get_notebook_item(query_name)
    if isinstance(resolved, gcp.bigquery.Query):
      # Queries are already expanded; nothing further to resolve.
      return resolved

    # Build an expanded BQ Query from the SQL item and the cell's configuration.
    overrides = _utils.parse_config(cell, env)
    statement, values = gcp.data.SqlModule.get_sql_statement_with_environment(resolved,
                                                                              overrides)
    if cell:
      # The configuration serves as both a fallback and an override.
      values.update(overrides)
    return gcp.bigquery.Query(statement, values=values)

  # No --query argument: the cell must hold the SQL text itself.
  if isinstance(cell, basestring):
    return gcp.bigquery.Query(cell, values=env)
  raise Exception('Expected a --query argument or inline SQL')
Example #4
0
def _sample_cell(args, cell_body):
  """Implements the bigquery sample cell magic for ipython notebooks.

  The data to sample is chosen from the first of these that is set: args['query'],
  args['table'], args['view']; if none is set the cell body is used as the query.

  Args:
    args: the optional arguments following '%%bigquery sample'. The 'query', 'table',
        'view', 'count', 'method', 'percent', 'field', 'order', 'verbose' and 'profile'
        entries are read.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The results of executing the sampling query, or a profile of the sample data.
  Raises:
    Exception: if args['table'] names a table that cannot be found, or if args['view']
        does not refer to a View.
  """

  env = _utils.notebook_environment()
  query = None
  table = None
  view = None

  if args['query']:
    query = _get_query_argument(args, cell_body, env)
  elif args['table']:
    table = _get_table(args['table'])
    if table is None:
      # _get_table returns None for a missing table; fail here with a clear message
      # rather than with an AttributeError on None when sampling below.
      raise Exception('Table %s does not exist' % args['table'])
  elif args['view']:
    view = _utils.get_notebook_item(args['view'])
    if not isinstance(view, gcp.bigquery.View):
      raise Exception('%s is not a view' % args['view'])
  else:
    query = gcp.bigquery.Query(cell_body, values=env)

  count = args['count']
  method = args['method']
  if method == 'random':
    sampling = gcp.bigquery.Sampling.random(percent=args['percent'], count=count)
  elif method == 'hashed':
    sampling = gcp.bigquery.Sampling.hashed(field_name=args['field'],
                                            percent=args['percent'],
                                            count=count)
  elif method == 'sorted':
    ascending = args['order'] == 'ascending'
    sampling = gcp.bigquery.Sampling.sorted(args['field'],
                                            ascending=ascending,
                                            count=count)
  else:
    # 'limit' and any unrecognized method both use the default sampling.
    sampling = gcp.bigquery.Sampling.default(count=count)

  if query:
    results = query.sample(sampling=sampling)
  elif view:
    results = view.sample(sampling=sampling)
  else:
    results = table.sample(sampling=sampling)

  if args['verbose']:
    # Single-argument parenthesized form prints the same under Python 2 and Python 3.
    print(results.sql)
  if args['profile']:
    return _utils.profile_df(results.to_dataframe())
  return results
Example #5
0
def _get_schema(name):
  """ Look up the Schema for a variable or table name.

  Args:
    name: the name of a notebook variable or of a table.
  Returns:
    The item itself if it is a Schema, the item's 'schema' attribute if that is a
    Schema, and None otherwise.
  """
  # Prefer a notebook item with this name; fall back to a table lookup.
  found = _utils.get_notebook_item(name) or _get_table(name)

  if isinstance(found, gcp.bigquery.Schema):
    return found

  carries_schema = (hasattr(found, 'schema') and
                    isinstance(found.schema, gcp.bigquery._schema.Schema))
  return found.schema if carries_schema else None
Example #6
0
def _get_schema(name):
  """ Look up the Schema for a variable or table name.

  Args:
    name: the name of a notebook variable or of a table.
  Returns:
    The item itself if it is a Schema, the item's 'schema' attribute if that is a
    Schema, and None otherwise.
  """
  # Prefer a notebook item with this name; fall back to a table lookup.
  found = _utils.get_notebook_item(name) or _get_table(name)

  if isinstance(found, gcp.bigquery.Schema):
    return found

  carries_schema = (hasattr(found, 'schema') and
                    isinstance(found.schema, gcp.bigquery._schema.Schema))
  return found.schema if carries_schema else None
Example #7
0
def _get_table(name):
  """ Resolve a variable or table name to a Table, if one exists.

  Args:
    name: the name of the Table or a variable referencing the Table.
  Returns:
    The Table if found; None if the name is neither a Table variable nor an existing
    table.
  """
  # A notebook variable that already holds a Table is used directly.
  candidate = _utils.get_notebook_item(name)
  if isinstance(candidate, gcp.bigquery.Table):
    return candidate

  # Otherwise the name is a BQ table name; serve it from the cache when possible.
  try:
    return _table_cache[name]
  except KeyError:
    pass

  # Not cached yet: only tables that actually exist are cached and returned.
  table = gcp.bigquery.Table(name)
  if not table.exists():
    return None
  _table_cache[name] = table
  return table
Example #8
0
def _get_table(name):
  """ Resolve a variable or table name to a Table, if one exists.

  Args:
    name: the name of the Table or a variable referencing the Table.
  Returns:
    The Table if found; None if the name is neither a Table variable nor an existing
    table.
  """
  # A notebook variable that already holds a Table is used directly.
  candidate = _utils.get_notebook_item(name)
  if isinstance(candidate, gcp.bigquery.Table):
    return candidate

  # Otherwise the name is a BQ table name; serve it from the cache when possible.
  try:
    return _table_cache[name]
  except KeyError:
    pass

  # Not cached yet: only tables that actually exist are cached and returned.
  table = gcp.bigquery.Table(name)
  if not table.exists():
    return None
  _table_cache[name] = table
  return table
Example #9
0
def _chart_cell(args, cell):
  """Builds the HTML/JavaScript that renders a chart for a data source.

  The cell is parsed as chart options. An optional 'variables' entry in those options
  describes controls (picker, set, checkbox, slider or textbox) that are emitted as
  disabled HTML inputs next to the chart; each control's value is also placed in the
  environment used to fetch the initial chart data.

  Args:
    args: the parsed magic arguments; the 'data', 'fields' and 'chart' entries are read.
    cell: the cell contents holding the chart options.
  Returns:
    An IPython.core.display.HTML object containing the chart markup and script.
  Raises:
    Exception: if the chart options do not parse to a dictionary, or a control
        definition is invalid (unknown type, missing choices, bad slider range).
  """
  source = args['data']
  ipy = IPython.get_ipython()
  chart_options = _utils.parse_config(cell, ipy.user_ns)
  if chart_options is None:
    chart_options = {}
  elif not isinstance(chart_options, dict):
    raise Exception("Could not parse chart options")
  fields = args['fields'] if args['fields'] else '*'
  div_id = _html.Html.next_id()
  env = {}
  controls_html = ''
  controls_ids = []
  if 'variables' in chart_options:
    # Remove the variables from the options, just to make sure GCharts doesn't see them.
    variables = chart_options.pop('variables')
    try:
      item = _utils.get_notebook_item(source)
      _, defaults = gcp.data.SqlModule.get_sql_statement_with_environment(item, '')
    except Exception:
      # Best effort: without a resolvable SQL statement there are simply no defaults.
      defaults = {}
    for varname, control in variables.items():
      label = control.get('label', varname)
      control_id = div_id + '__' + varname
      controls_ids.append(control_id)
      value = control.get('value', defaults.get(varname, None))
      # The user should usually specify the type but we will default to 'textbox' for strings
      # and 'set' for lists.
      if isinstance(value, basestring):
        default_type = 'textbox'
      elif isinstance(value, list):
        default_type = 'set'
      else:
        default_type = None
      control_type = control.get('type', default_type)

      if control_type == 'picker':
        choices = control.get('choices', value)
        if not isinstance(choices, list) or len(choices) == 0:
          raise Exception('picker control must specify a nonempty set of choices')
        if value is None:
          value = choices[0]
        choices_html = ''
        for choice in choices:
          choices_html += "<option value=\"%s\" %s>%s</option>" % \
              (choice, ("selected=\"selected\"" if choice == value else ''), choice)
        control_html = "{label}<select disabled id=\"{id}\">{choices}</select>"\
            .format(label=label, id=control_id, choices=choices_html)
      elif control_type == 'set':  # Multi-picker; implemented as checkboxes.
        # TODO(gram): consider using "name" property of the control to group checkboxes. That
        # way we can save the code of constructing and parsing control Ids with sequential
        #  numbers in it. Multiple checkboxes can share the same name.
        choices = control.get('choices', value)
        if not isinstance(choices, list) or len(choices) == 0:
          raise Exception('set control must specify a nonempty set of choices')
        if value is None:
          value = choices
        choices_html = ''
        controls_ids[-1] = '%s:%d' % (control_id, len(choices))  # replace ID to include count.
        for i, choice in enumerate(choices):
          checked = choice in value
          choice_id = '%s:%d' % (control_id, i)
          # TODO(gram): we may want a 'Submit/Refresh button as we may not want to rerun
          # query on each checkbox change.
          choices_html += """
            <div>
              <label>
                <input type="checkbox" id="{id}" value="{choice}" {checked} disabled>
                {choice}
              </label>
            </div>
          """.format(id=choice_id, choice=choice, checked="checked" if checked else '')
        control_html = "{label}<div>{choices}</div>".format(label=label, choices=choices_html)
      elif control_type == 'checkbox':
        control_html = """
              <label>
                <input type="checkbox" id="{id}" {checked} disabled>
                {label}
              </label>
          """.format(label=label, id=control_id, checked="checked" if value else '')
      elif control_type == 'slider':
        min_value = control.get('min', None)
        max_value = control.get('max', None)
        if min_value is None or max_value is None:
          raise Exception('slider control must specify a min and max value')
        if max_value <= min_value:
          raise Exception('slider control must specify a min value less than max value')
        # Integer ranges step by 1; otherwise split the range into ten steps.
        if isinstance(min_value, int) and isinstance(max_value, int):
          default_step = 1
        else:
          default_step = (max_value - min_value) / 10.0
        step = control.get('step', default_step)
        if value is None:
          value = min_value
        control_html = """
          {label}
          <input type="text" class="gchart-slider_value" id="{id}_value" value="{value}" disabled/>
          <input type="range" class="gchart-slider" id="{id}" min="{min}" max="{max}" step="{step}"
              value="{value}" disabled/>
        """.format(label=label, id=control_id, value=value, min=min_value, max=max_value,
                   step=step)
      elif control_type == 'textbox':
        if value is None:
          value = ''
        control_html = "{label}<input type=\"text\" value=\"{value}\" id=\"{id}\" disabled/>"\
            .format(label=label, value=value, id=control_id)
      else:
        raise Exception(
            'Unknown control type %s (expected picker, slider, checkbox, textbox or set)'
            % control_type)

      env[varname] = value
      controls_html += "<div class=\"gchart-control\">{control}</div>\n"\
          .format(control=control_html)

    controls_html = "<div class=\"gchart-controls\">{controls}</div>".format(controls=controls_html)

  _HTML_TEMPLATE = """
    <div class="bqgc-container">
      {controls}
      <div class="bqgc{extra_class}" id="{id}">
      </div>
    </div>
    <script>
      require(['extensions/charting', 'element!{id}', 'style!/static/extensions/charting.css'],
        function(charts, dom) {{
          charts.render(dom, {{chartStyle:'{chart_type}', dataName:'{source}', fields:'{fields}'}},
            {options}, {data}, {control_ids});
        }}
      );
    </script>
  """

  chart_type = args['chart']
  # Paged tables fetch a first page of 25 rows; other charts pass -1 as the count.
  count = 25 if chart_type == 'paged_table' else -1
  data, _ = _utils.get_data(source, fields, env, 0, count)

  # TODO(gram): check if we need to augment env with user_ns
  return IPython.core.display.HTML(
    _HTML_TEMPLATE.format(controls=controls_html,
                      id=div_id,
                      chart_type=chart_type,
                      extra_class=" bqgc-controlled" if len(controls_html) else '',
                      source=_utils.get_data_source_index(source),
                      fields=fields,
                      options=json.dumps(chart_options, cls=gcp._util.JSONEncoder),
                      data=json.dumps(data, cls=gcp._util.JSONEncoder),
                      # Emit the ids as a JSON array: str() of a list holding unicode ids
                      # yields [u'...'], which is not valid JavaScript.
                      control_ids=json.dumps(controls_ids)))