Exemple #1
0
def _chart_cell(args, cell):
    """Build the HTML/JavaScript snippet that renders a chart for a cell magic.

    Args:
        args: the parsed magic arguments; 'data', 'fields' and 'chart' are read.
        cell: the cell contents, parsed (against the user namespace) as the chart options.
    Returns:
        An IPython HTML display object wrapping the generated markup.
    """
    data_source = args['data']
    options = _utils.parse_config(cell, IPython.get_ipython().user_ns)
    if options is None:
        options = {}
    # A missing/empty field list falls back to the '*' wildcard.
    field_spec = args['fields'] or '*'

    template = u"""
    <div class="bqgc" id="%s">
    </div>
    <script>
      require(['extensions/charting', 'element!%s', 'style!/static/extensions/charting.css'],
        function(charts, dom) {
          charts.render(dom, {chartStyle:'%s', dataName:'%s', fields:'%s'}, %s, %s);
        }
      );
    </script>
  """
    element_id = _html.Html.next_id()
    style = args['chart']
    # Paged tables request 25 rows; every other chart passes -1
    # (presumably "no limit" -- confirm against _utils.get_data).
    if style == 'paged_table':
        row_count = 25
    else:
        row_count = -1
    rows, _ = _utils.get_data(data_source, field_spec, 0, row_count)

    # Order matches the %s placeholders in the template above.
    substitutions = (
        element_id,
        element_id,
        style,
        _utils.get_data_source_index(data_source),
        field_spec,
        json.dumps(options, cls=gcp._util.JSONEncoder),
        json.dumps(rows, cls=gcp._util.JSONEncoder),
    )
    return IPython.core.display.HTML(template % substitutions)
Exemple #2
0
def _get_query_argument(args, cell, env):
  """Resolve the query that a cell magic should operate on.

  When args has no 'query' entry the cell itself must be a string of inline SQL. Otherwise
  args['query'] is looked up as a notebook item: a BigQuery Query is returned unchanged,
  anything else is expanded into a Query via the SqlModule helper.

  Args:
    args: the dictionary of magic arguments.
    cell: inline SQL when no 'query' argument is given; otherwise the variable overrides.
    env: a dictionary used for looking up variable values.
  Returns:
    A Query object.
  Raises:
    Exception: if there is neither a 'query' argument nor a string cell.
  """
  query_name = args.get('query')
  if query_name is not None:
    candidate = _get_notebook_item(query_name)
    if isinstance(candidate, gcp.bigquery.Query):
      # Already an expanded query; nothing more to resolve.
      return candidate

    # Anything else gets expanded into a BQ Query.
    env = _utils.parse_config(cell, env)
    statement, env = gcp.data.SqlModule.get_sql_statement_with_environment(candidate, env)
    if cell:
      env.update(cell)
    return gcp.bigquery.Query(statement, values=env)

  # No --query argument: the cell has to carry the SQL text itself.
  if isinstance(cell, basestring):
    return gcp.bigquery.Query(cell, values=env)
  raise Exception('Expected a --query argument or inline SQL')
Exemple #3
0
def _create_cell(args, cell_body):
  """Handle the '%%bigquery create' cell magic, which makes a dataset or a table.

  Dataset form (the cell body is passed as the description):

    %%bigquery create dataset -n|--name <name> [-f|--friendly <friendlyname>]
    [<description>]

  Table form (the cell body is the schema):

    %%bigquery create table -n|--name <tablename> [--overwrite]
    [<YAML or JSON cell_body defining schema to use for tables>]

  Failures are reported by printing a message; they are not raised to the caller.

  Args:
    args: the parsed arguments following '%bigquery create <command>'.
    cell_body: the cell contents; a description for datasets, a schema for tables.
  """
  creating_dataset = args['command'] == 'dataset'
  name = args['name']

  if creating_dataset:
    try:
      dataset = gcp.bigquery.DataSet(name)
      dataset.create(friendly_name=args['friendly'], description=cell_body)
    except Exception as e:
      print('Failed to create dataset %s: %s' % (name, e))
    return

  # Any command other than 'dataset' is treated as a table creation.
  if cell_body is None:
    print('Failed to create %s: no schema specified' % name)
    return

  try:
    environment = _utils.notebook_environment()
    record = _utils.parse_config(cell_body, environment, as_dict=False)
    table_schema = gcp.bigquery.Schema(record)
    gcp.bigquery.Table(name).create(schema=table_schema, overwrite=args['overwrite'])
  except Exception as e:
    print('Failed to create table %s: %s' % (name, e))
Exemple #4
0
def _pipeline_cell(args, config):
  """Handle the '%%bigquery pipeline' cell magic (dry run or run of a BQ pipeline).

  Syntax:
    %%bigquery pipeline -q|--sql <query identifier> <other args> <action>
    <config>

  Only notebook values that are UDF FunctionCall objects are made available as the
  environment for resolving the query.

  Args:
    args: the arguments following '%bigquery pipeline'.
    config: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    A QueryStats object for the 'dryrun' action, the `results` of the executed query for
    the 'run' action, and None for any other action.
  Raises:
    Exception: if the action is 'deploy', which is not supported.
  """
  action = args['action']
  if action == 'deploy':
    raise Exception('Deploying a pipeline is not yet supported')

  udf_call_type = gcp.bigquery._udf.FunctionCall
  env = {name: item for name, item in _notebook_environment().iteritems()
         if isinstance(item, udf_call_type)}

  config = _utils.parse_config(config, env)
  query = _get_query_argument(args, config, env)
  if args['verbose']:
    print(query.sql)

  if action == 'dryrun':
    # The SQL is printed for a dry run regardless of the verbose flag.
    print(query.sql)
    dry_run = query.execute_dry_run()
    bytes_processed = dry_run['totalBytesProcessed']
    cache_hit = dry_run['cacheHit']
    return gcp.bigquery._query_stats.QueryStats(total_bytes=bytes_processed,
                                                is_cached=cache_hit)

  if action == 'run':
    execution = query.execute(args['target'],
                              table_mode=args['mode'],
                              use_cache=not args['nocache'],
                              allow_large_results=args['large'])
    return execution.results
Exemple #5
0
def _get_query_argument(args, cell, env):
  """Work out which query a cell magic refers to and return it as a Query.

  Without a 'query' argument the cell must be a string holding inline SQL. With one, the
  named notebook item is fetched; if it is already a BigQuery Query it is returned as is,
  otherwise it is expanded through the SqlModule helper, with the parsed cell config
  supplied to the helper and then applied again on top of the resulting environment.

  Args:
    args: the dictionary of magic arguments.
    cell: inline SQL when no 'query' argument is given; otherwise the variable overrides.
    env: a dictionary used for looking up variable values.
  Returns:
    A Query object.
  Raises:
    Exception: if there is neither a 'query' argument nor a string cell.
  """
  query_ref = args.get('query')
  if query_ref is None:
    # No --query: the cell itself has to be the SQL text.
    if isinstance(cell, basestring):
      return gcp.bigquery.Query(cell, values=env)
    raise Exception('Expected a --query argument or inline SQL')

  resolved = _utils.get_notebook_item(query_ref)
  if not isinstance(resolved, gcp.bigquery.Query):
    # Not yet an expanded query: build one from the SQL statement and its environment.
    overrides = _utils.parse_config(cell, env)
    statement, env = gcp.data.SqlModule.get_sql_statement_with_environment(resolved, overrides)
    if cell:
      # The overrides act both as a fallback (above) and as an override (here).
      env.update(overrides)
    resolved = gcp.bigquery.Query(statement, values=env)
  return resolved
Exemple #6
0
def _pipeline_cell(args, config):
  """Implement the '%%bigquery pipeline' cell magic for dry-running or running a pipeline.

  Syntax:
    %%bigquery pipeline -q|--sql <query identifier> <other args> <action>
    <config>

  The query is resolved against an environment containing only the notebook values that
  are UDF FunctionCall objects.

  Args:
    args: the arguments following '%bigquery pipeline'.
    config: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    QueryStats for action 'dryrun'; the executed query's `results` for action 'run';
    None when the action is anything else.
  Raises:
    Exception: for action 'deploy', which is not yet supported.
  """
  requested = args['action']
  if requested == 'deploy':
    raise Exception('Deploying a pipeline is not yet supported')

  # Keep only the UDF function calls from the notebook environment.
  env = dict((key, value) for key, value in _notebook_environment().iteritems()
             if isinstance(value, gcp.bigquery._udf.FunctionCall))

  parsed = _utils.parse_config(config, env)
  query = _get_query_argument(args, parsed, env)
  if args['verbose']:
    print(query.sql)

  if requested == 'dryrun':
    # A dry run always echoes the SQL, independent of the verbose flag.
    print(query.sql)
    stats = query.execute_dry_run()
    return gcp.bigquery._query_stats.QueryStats(
        total_bytes=stats['totalBytesProcessed'], is_cached=stats['cacheHit'])

  if requested == 'run':
    use_cache = not args['nocache']
    return query.execute(args['target'], table_mode=args['mode'], use_cache=use_cache,
                         allow_large_results=args['large']).results
Exemple #7
0
def _chart_cell(args, cell):
  """Produce the HTML and script needed to draw a chart from a data source.

  Args:
    args: the parsed magic arguments; 'data', 'fields' and 'chart' are read.
    cell: the cell contents, parsed (against the user namespace) as the chart options.
  Returns:
    An IPython HTML display object containing the chart container and render script.
  """
  data_source = args['data']
  shell = IPython.get_ipython()
  options = _utils.parse_config(cell, shell.user_ns)
  if options is None:
    options = {}
  # Fall back to the '*' wildcard when no fields were requested.
  wanted_fields = args['fields'] or '*'

  markup = """
    <div class="bqgc" id="%s">
    </div>
    <script>
      require(['extensions/charting', 'element!%s', 'style!/static/extensions/charting.css'],
        function(charts, dom) {
          charts.render(dom, {chartStyle:'%s', dataName:'%s', fields:'%s'}, %s, %s);
        }
      );
    </script>
  """
  container_id = _html.Html.next_id()
  chart_style = args['chart']
  # 25 rows for a paged table; -1 otherwise (presumably "no limit" -- confirm).
  if chart_style == 'paged_table':
    limit = 25
  else:
    limit = -1
  rows, _ = _utils.get_data(data_source, wanted_fields, 0, limit)

  source_index = _utils.get_data_source_index(data_source)
  options_json = json.dumps(options, cls=gcp._util.JSONEncoder)
  # The data is dumped without ASCII-escaping, unlike the options.
  rows_json = json.dumps(rows, ensure_ascii=False, cls=gcp._util.JSONEncoder)
  rendered = markup % (container_id, container_id, chart_style, source_index, wanted_fields,
                       options_json, rows_json)
  return IPython.core.display.HTML(rendered)
Exemple #8
0
def _sample_cell(args, cell_body):
  """Sample rows from a query, table or view for the '%%bigquery sample' cell magic.

  The thing to sample is taken from the first of args['query'], args['table'] and
  args['view'] that is set; if none is set, the cell body is used as inline SQL.

  Args:
    args: the optional arguments following '%%bigquery sample'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    The value returned by the sample() method of the chosen query, view or table.
  Raises:
    Exception: if args['view'] names a notebook item that is not a view.
  """
  env = _notebook_environment()
  query = table = view = None

  if args['query']:
    query = _get_query_argument(args, _utils.parse_config(cell_body, env), env)
  elif args['table']:
    table = _get_table(args['table'])
  elif args['view']:
    view = _get_notebook_item(args['view'])
    if not isinstance(view, gcp.bigquery.View):
      raise Exception('%s is not a view' % args['view'])
  else:
    query = gcp.bigquery.Query(cell_body, values=env)

  count = args['count']
  method = args['method']
  strategies = gcp.bigquery.Sampling
  if method == 'random':
    sampling = strategies.random(percent=args['percent'], count=count)
  elif method == 'hashed':
    sampling = strategies.hashed(field_name=args['field'], percent=args['percent'],
                                 count=count)
  elif method == 'sorted':
    is_ascending = args['order'] == 'ascending'
    sampling = strategies.sorted(args['field'], ascending=is_ascending, count=count)
  else:
    # 'limit' and any unrecognized method both use the default sampling.
    sampling = strategies.default(count=count)

  # Prefer the query, then the view, then the table (by truthiness).
  target = query or view or table
  results = target.sample(sampling=sampling)
  if args['verbose']:
    print(results.sql)
  return results
Exemple #9
0
def _sample_cell(args, cell_body):
  """Implement the '%%bigquery sample' cell magic.

  Picks what to sample from args['query'], args['table'] or args['view'] (checked in that
  order), falling back to treating the cell body as inline SQL, then applies the sampling
  strategy named by args['method'].

  Args:
    args: the optional arguments following '%%bigquery sample'.
    cell_body: optional contents of the cell interpreted as SQL, YAML or JSON.
  Returns:
    Whatever the sample() call on the selected query, view or table returns.
  Raises:
    Exception: if the item named by args['view'] is not a view.
  """
  env = _notebook_environment()
  query, table, view = None, None, None

  if args['query']:
    overrides = _utils.parse_config(cell_body, env)
    query = _get_query_argument(args, overrides, env)
  elif args['table']:
    table = _get_table(args['table'])
  elif args['view']:
    view = _get_notebook_item(args['view'])
    if not isinstance(view, gcp.bigquery.View):
      raise Exception('%s is not a view' % args['view'])
  else:
    query = gcp.bigquery.Query(cell_body, values=env)

  count = args['count']
  method = args['method']
  if method == 'random':
    sampling = gcp.bigquery.Sampling.random(percent=args['percent'], count=count)
  elif method == 'hashed':
    sampling = gcp.bigquery.Sampling.hashed(
        field_name=args['field'], percent=args['percent'], count=count)
  elif method == 'sorted':
    ascending = args['order'] == 'ascending'
    sampling = gcp.bigquery.Sampling.sorted(args['field'], ascending=ascending, count=count)
  else:
    # Covers 'limit' as well as any other method name.
    sampling = gcp.bigquery.Sampling.default(count=count)

  if query:
    sampled_from = query
  elif view:
    sampled_from = view
  else:
    sampled_from = table
  results = sampled_from.sample(sampling=sampling)

  if args['verbose']:
    print(results.sql)
  return results
Exemple #10
0
def _chart_cell(args, cell):
  """Render a 'gcharts' chart for the given data source.

  Args:
    args: the parsed magic arguments; 'data', 'chart' and 'fields' are read.
    cell: the cell contents, parsed (against the user namespace) as the chart options.
  Returns:
    An IPython HTML display object wrapping the markup from _utils.chart_html.
  Raises:
    Exception: if the parsed chart options are neither None nor a dictionary.
  """
  data_source = args['data']
  options = _utils.parse_config(cell, IPython.get_ipython().user_ns)
  if options is None:
    options = {}
  elif not isinstance(options, dict):
    raise Exception("Could not parse chart options")

  style = args['chart']
  # Fall back to the '*' wildcard when no fields were requested.
  wanted_fields = args['fields'] or '*'
  markup = _utils.chart_html('gcharts', style, source=data_source, chart_options=options,
                             fields=wanted_fields)
  return IPython.core.display.HTML(markup)
Exemple #11
0
def _get_chart_data(line, cell_body=''):
  """Fetch a range of rows from a registered chart data source.

  The line is split on whitespace into:
    <data source index> <fields> [<first row> [<count>]]
  where first row defaults to 0 and count defaults to -1.

  Any failure is reported through gcp._util.print_exception_with_last_stack and the data
  falls back to an empty dictionary.

  Args:
    line: the magic's argument line, in the form described above.
    cell_body: optional cell contents, parsed (against the user namespace) into the
        environment passed to the data fetch.
  """
  # NOTE(review): the visible body computes `data` but never returns or uses it;
  # presumably this excerpt is truncated -- confirm against the full source.
  try:
    args = line.strip().split()
    source = _utils._data_sources[int(args[0])]
    fields = args[1]
    first_row = int(args[2]) if len(args) > 2 else 0
    count = int(args[3]) if len(args) > 3 else -1
    env = _utils.parse_config(cell_body, IPython.get_ipython().user_ns)
    data, _ = _utils.get_data(source, fields, env, first_row, count)
  except Exception as e:  # 'as' form: valid on Python 2.6+ and 3, unlike 'except X, e'.
    gcp._util.print_exception_with_last_stack(e)
    data = {}
Exemple #12
0
def _view(args, cell):
  """Show the first lines of a CSV source as an HTML table, or profile them.

  Args:
    args: the parsed magic arguments; 'input', 'count' and 'profile' are read. A missing
        or falsy 'count' means 5 lines.
    cell: optional cell contents; if its parsed config has a 'columns' entry, the
        comma-separated names in it are used as the column headers.
  Returns:
    The result of _utils.profile_df when args['profile'] is set, otherwise an IPython
    HTML display object with the rows rendered as a table (without the index).
  """
  csv = gcp.data.Csv(args['input'])
  num_lines = int(args['count'] or 5)
  headers = None
  if cell:
    ipy = IPython.get_ipython()
    config = _utils.parse_config(cell, ipy.user_ns)
    # parse_config can hand back None (presumably when nothing could be parsed -- confirm);
    # treat that as "no headers given" instead of failing on the membership test.
    if config is not None and 'columns' in config:
      headers = [name.strip() for name in config['columns'].split(',')]
  df = pd.DataFrame(csv.browse(num_lines, headers))
  if args['profile']:
    # TODO(gram): We need to generate a schema and type-convert the columns before this
    # will be useful for CSV
    return _utils.profile_df(df)
  return IPython.core.display.HTML(df.to_html(index=False))
Exemple #13
0
def _chart_cell(args, cell):
    """Build the display object for a 'gcharts' chart over a data source.

    Args:
        args: the parsed magic arguments; 'data', 'chart' and 'fields' are read.
        cell: the cell contents, parsed (against the user namespace) as the chart options.
    Returns:
        An IPython HTML display object wrapping the markup from _utils.chart_html.
    Raises:
        Exception: if the parsed chart options are neither None nor a dictionary.
    """
    data_source = args['data']
    shell = IPython.get_ipython()
    options = _utils.parse_config(cell, shell.user_ns)
    if options is None:
        options = {}
    elif not isinstance(options, dict):
        raise Exception("Could not parse chart options")

    keyword_args = {
        'source': data_source,
        'chart_options': options,
    }
    style = args['chart']
    # Use the '*' wildcard when no explicit field list was given.
    keyword_args['fields'] = args['fields'] or '*'
    return IPython.core.display.HTML(_utils.chart_html('gcharts', style, **keyword_args))
Exemple #14
0
def _execute_cell(args, config):
  """Handle the '%%bigquery execute' cell magic by running the referenced query.

  Syntax:
    %%bigquery execute -q|--sql <query identifier> <other args>
    <config>

  Args:
    args: the arguments following '%bigquery execute'.
    config: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The `results` attribute of the object returned by executing the query.
  """
  environment = _notebook_environment()
  overrides = _utils.parse_config(config, environment)
  query = _get_query_argument(args, overrides, environment)
  if args['verbose']:
    print(query.sql)

  execution = query.execute(args['target'],
                            table_mode=args['mode'],
                            use_cache=not args['nocache'],
                            allow_large_results=args['large'])
  return execution.results
Exemple #15
0
def _execute_cell(args, config):
  """Run the query named by a '%%bigquery execute' cell magic.

  Syntax:
    %%bigquery execute -q|--sql <query identifier> <other args>
    <config>

  The cell config is parsed against the notebook environment and used to resolve the
  query; the SQL is echoed first when args['verbose'] is set.

  Args:
    args: the arguments following '%bigquery execute'.
    config: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    The `results` of the executed query.
  """
  env = _notebook_environment()
  parsed = _utils.parse_config(config, env)
  query = _get_query_argument(args, parsed, env)
  if args['verbose']:
    print(query.sql)

  destination = args['target']
  use_cache = not args['nocache']
  return query.execute(destination, table_mode=args['mode'], use_cache=use_cache,
                       allow_large_results=args['large']).results
Exemple #16
0
def _dryrun_cell(args, config):
  """Handle the '%%bigquery dryrun' cell magic by dry-running the referenced query.

  Syntax:
    %%bigquery dryrun -q|--sql <query identifier>
    <config>

  Args:
    args: the argument following '%bigquery dryrun'.
    config: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    A QueryStats object built from the dry run's 'totalBytesProcessed' and 'cacheHit'.
  """
  environment = _notebook_environment()
  overrides = _utils.parse_config(config, environment)
  query = _get_query_argument(args, overrides, environment)

  if args['verbose']:
    print(query.sql)

  dry_run = query.execute_dry_run()
  bytes_processed = dry_run['totalBytesProcessed']
  cache_hit = dry_run['cacheHit']
  return gcp.bigquery._query_stats.QueryStats(total_bytes=bytes_processed,
                                              is_cached=cache_hit)
Exemple #17
0
def _dryrun_cell(args, config):
  """Dry-run the query named by a '%%bigquery dryrun' cell magic.

  Syntax:
    %%bigquery dryrun -q|--sql <query identifier>
    <config>

  The cell config is parsed against the notebook environment and used to resolve the
  query; the SQL is echoed first when args['verbose'] is set.

  Args:
    args: the argument following '%bigquery dryrun'.
    config: optional contents of the cell interpreted as YAML or JSON.
  Returns:
    A QueryStats object carrying the bytes processed and the cache-hit flag reported by
    the dry run.
  """
  env = _notebook_environment()
  parsed = _utils.parse_config(config, env)
  query = _get_query_argument(args, parsed, env)

  if args['verbose']:
    print(query.sql)

  stats = query.execute_dry_run()
  return gcp.bigquery._query_stats.QueryStats(
      total_bytes=stats['totalBytesProcessed'], is_cached=stats['cacheHit'])
Exemple #18
0
def _chart_cell(args, cell):
  """Render a chart, optionally with HTML controls bound to query variables.

  The cell is parsed as the chart options. If the options contain a 'variables' entry it
  is removed from the options and one HTML control (picker, set, checkbox, slider or
  textbox) is generated per variable. Each control's initial value is placed in the
  environment that is passed to the data fetch.

  Args:
    args: the parsed magic arguments; 'data', 'fields' and 'chart' are read.
    cell: the cell contents holding the chart options.
  Returns:
    An IPython HTML display object containing the controls, the chart container and the
    script that calls charts.render.
  Raises:
    Exception: if the options are not a dictionary, or a control specification is invalid
        (unknown type, missing/empty choices, or a missing or inverted slider range).
  """
  source = args['data']
  ipy = IPython.get_ipython()
  chart_options = _utils.parse_config(cell, ipy.user_ns)
  if chart_options is None:
    chart_options = {}
  elif not isinstance(chart_options, dict):
    raise Exception("Could not parse chart options")
  fields = args['fields'] if args['fields'] else '*'
  div_id = _html.Html.next_id()
  env = {}
  controls_html = ''
  controls_ids = []
  if 'variables' in chart_options:
    # Remove the variables from the options, just to make sure GCharts doesn't see them.
    variables = chart_options.pop('variables')
    try:
      item = _utils.get_notebook_item(source)
      _, defaults = gcp.data.SqlModule.get_sql_statement_with_environment(item, '')
    except Exception:
      # Best effort only: without defaults, controls rely on their own 'value' entries.
      defaults = {}
    for varname, control in variables.items():
      label = control.get('label', varname)
      control_id = div_id + '__' + varname
      controls_ids.append(control_id)
      value = control.get('value', defaults.get(varname, None))
      # The user should usually specify the type but we will default to 'textbox' for strings
      # and 'set' for lists.
      if isinstance(value, basestring):
        control_type = 'textbox'
      elif isinstance(value, list):
        control_type = 'set'
      else:
        control_type = None
      control_type = control.get('type', control_type)

      if control_type == 'picker':
        choices = control.get('choices', value)
        if not isinstance(choices, list) or len(choices) == 0:
          raise Exception('picker control must specify a nonempty set of choices')
        if value is None:
          value = choices[0]
        choices_html = ''
        for choice in choices:
          choices_html += "<option value=\"%s\" %s>%s</option>" % \
              (choice, ("selected=\"selected\"" if choice == value else ''), choice)
        control_html = "{label}<select disabled id=\"{id}\">{choices}</select>"\
            .format(label=label, id=control_id, choices=choices_html)
      elif control_type == 'set':  # Multi-picker; implemented as checkboxes.
        # TODO(gram): consider using "name" property of the control to group checkboxes. That
        # way we can save the code of constructing and parsing control Ids with sequential
        #  numbers in it. Multiple checkboxes can share the same name.
        choices = control.get('choices', value)
        if not isinstance(choices, list) or len(choices) == 0:
          raise Exception('set control must specify a nonempty set of choices')
        if value is None:
          value = choices
        choices_html = ''
        controls_ids[-1] = '%s:%d' % (control_id, len(choices))  # replace ID to include count.
        for i, choice in enumerate(choices):
          checked = choice in value
          choice_id = '%s:%d' % (control_id, i)
          # TODO(gram): we may want a 'Submit/Refresh button as we may not want to rerun
          # query on each checkbox change.
          choices_html += """
            <div>
              <label>
                <input type="checkbox" id="{id}" value="{choice}" {checked} disabled>
                {choice}
              </label>
            </div>
          """.format(id=choice_id, choice=choice, checked="checked" if checked else '')
        control_html = "{label}<div>{choices}</div>".format(label=label, choices=choices_html)
      elif control_type == 'checkbox':
        control_html = """
              <label>
                <input type="checkbox" id="{id}" {checked} disabled>
                {label}
              </label>
          """.format(label=label, id=control_id, checked="checked" if value else '')
      elif control_type == 'slider':
        min_value = control.get('min', None)
        max_value = control.get('max', None)
        if min_value is None or max_value is None:
          raise Exception('slider control must specify a min and max value')
        if max_value <= min_value:
          raise Exception('slider control must specify a min value less than max value')
        # Integer ranges step by 1; otherwise default to a tenth of the range.
        if isinstance(min_value, int) and isinstance(max_value, int):
          default_step = 1
        else:
          default_step = (max_value - min_value) / 10.0
        step = control.get('step', default_step)
        if value is None:
          value = min_value
        control_html = """
          {label}
          <input type="text" class="gchart-slider_value" id="{id}_value" value="{value}" disabled/>
          <input type="range" class="gchart-slider" id="{id}" min="{min}" max="{max}" step="{step}"
              value="{value}" disabled/>
        """.format(label=label, id=control_id, value=value, min=min_value, max=max_value,
                   step=step)
      elif control_type == 'textbox':
        if value is None:
          value = ''
        control_html = "{label}<input type=\"text\" value=\"{value}\" id=\"{id}\" disabled/>"\
            .format(label=label, value=value, id=control_id)
      else:
        raise Exception(
            'Unknown control type %s (expected picker, slider, checkbox, textbox or set)'
            % control_type)

      # The control's initial value becomes the variable's value for the data fetch.
      env[varname] = value
      controls_html += "<div class=\"gchart-control\">{control}</div>\n"\
          .format(control=control_html)

    controls_html = "<div class=\"gchart-controls\">{controls}</div>".format(controls=controls_html)

  _HTML_TEMPLATE = """
    <div class="bqgc-container">
      {controls}
      <div class="bqgc{extra_class}" id="{id}">
      </div>
    </div>
    <script>
      require(['extensions/charting', 'element!{id}', 'style!/static/extensions/charting.css'],
        function(charts, dom) {{
          charts.render(dom, {{chartStyle:'{chart_type}', dataName:'{source}', fields:'{fields}'}},
            {options}, {data}, {control_ids});
        }}
      );
    </script>
  """

  chart_type = args['chart']
  count = 25 if chart_type == 'paged_table' else -1
  data, _ = _utils.get_data(source, fields, env, 0, count)

  # TODO(gram): check if we need to augment env with user_ns
  return IPython.core.display.HTML(
    _HTML_TEMPLATE.format(controls=controls_html,
                      id=div_id,
                      chart_type=chart_type,
                      extra_class=" bqgc-controlled" if controls_html else '',
                      source=_utils.get_data_source_index(source),
                      fields=fields,
                      options=json.dumps(chart_options, cls=gcp._util.JSONEncoder),
                      data=json.dumps(data, cls=gcp._util.JSONEncoder),
                      # Emit the ids as a JSON array: str() of a Python list yields
                      # u'...' prefixes for unicode ids, which is not valid JavaScript.
                      control_ids=json.dumps(controls_ids)))