def view_results(request, id, first_row=0): """ Returns the view for the results of the QueryHistory with the given id. The query results MUST be ready. To display query results, one should always go through the watch_query view. If ``first_row`` is 0, restarts (if necessary) the query read. Otherwise, just spits out a warning if first_row doesn't match the servers conception. Multiple readers will produce a confusing interaction here, and that's known. It understands the ``context`` GET parameter. (See watch_query().) """ # Coerce types; manage arguments id = int(id) first_row = long(first_row) start_over = (first_row == 0) # Retrieve models from database query_history = models.QueryHistory.objects.get(id=id) handle = QueryHandle(id=query_history.server_id, log_context=query_history.log_context) context = _parse_query_context(request.GET.get('context')) # Retrieve query results try: results = db_utils.db_client().fetch(handle, start_over) assert results.ready, 'Trying to display result that is not yet ready. Query id %s' % (id,) # We display the "Download" button only when we know # that there are results: downloadable = (first_row > 0 or len(results.data) > 0) fetch_error = False except BeeswaxException, ex: fetch_error = True error_message, log = expand_exception(ex)
def explain_directly(request, query_str, query_msg, design):
    """
    Run EXPLAIN on ``query_msg`` and render the explanation page.

    ``query_str`` is the raw query text shown to the user; ``design`` is
    attached to the page as its query context.
    """
    plan = db_utils.db_client().explain(query_msg)
    return render('explain.mako', request, {
        'query': query_str,
        'explanation': plan.textual,
        'query_context': ("design", design),
    })
def watch_query(request, id): """ Wait for the query to finish and (by default) displays the results of query id. It understands the optional GET params: on_success_url If given, it will be displayed when the query is successfully finished. Otherwise, it will display the view query results page by default. context A string of "name:data" that describes the context that generated this query result. It may be: - "table":"<table_name>" - "design":<design_id> All other GET params will be passed to on_success_url (if present). """ # Coerce types; manage arguments id = int(id) query_history = authorized_get_history(request, id, must_exist=True) # GET param: context. context_param = request.GET.get('context', '') # GET param: on_success_url. Default to view_results results_url = urlresolvers.reverse(view_results, kwargs=dict(id=str(id), first_row=0, last_result_len=0)) on_success_url = request.GET.get('on_success_url') if not on_success_url: on_success_url = results_url # Get the server_id server_id, state = _get_server_id_and_state(query_history) query_history.save_state(state) # Query finished? if state == QueryHistory.STATE.expired: raise PopupException(_("The result of this query has expired.")) elif state == QueryHistory.STATE.available: return format_preserving_redirect(request, on_success_url, request.GET) elif state == QueryHistory.STATE.failed: # When we fetch, Beeswax server will throw us a BeeswaxException, which has the # log we want to display. return format_preserving_redirect(request, results_url, request.GET) # Still running log = db_utils.db_client(query_history.get_query_server()).get_log(server_id) # Keep waiting # - Translate context into something more meaningful (type, data) context = _parse_query_context(context_param) return render('watch_wait.mako', request, { 'query': query_history, 'fwd_params': request.GET.urlencode(), 'log': log, 'hadoop_jobs': _parse_out_hadoop_jobs(log), 'query_context': context, })
def data_generator(query_model, formatter):
    """
    data_generator(query_model, formatter) -> generator object

    Return a generator object for a csv. The first line is the column names.

    This is similar to export_csvxls.generator, but has one or two extra
    complexities.
    """
    global _DATA_WAIT_SLEEP
    is_first_row = True
    # Row index we expect the server's next batch to start at.
    next_row = 0
    results = None
    handle = QueryHandle(query_model.server_id, query_model.log_context)

    yield formatter.init_doc()

    while True:
        # Make sure that we have the next batch of ready results
        while results is None or not results.ready:
            # start_over only on the very first fetch; later fetches continue.
            results = db_utils.db_client().fetch(handle, start_over=is_first_row)
            if not results.ready:
                time.sleep(_DATA_WAIT_SLEEP)

        # Someone is reading the results concurrently. Abort.
        # But unfortunately, this current generator will produce incomplete data.
        if next_row != results.start_row:
            msg = 'Error: Potentially incomplete results as an error occur during data retrieval.'
            yield formatter.format_row([msg])
            err = ('Detected another client retrieving results for %s. '
                   'Expect next row being %s and got %s. Aborting'
                   % (query_model.server_id, next_row, results.start_row))
            LOG.error(err)
            raise RuntimeError(err)

        if is_first_row:
            # First pass emits only the header; the same batch's data rows
            # are emitted on the next loop iteration.
            is_first_row = False
            yield formatter.format_header(results.columns)
        else:
            for i, row in enumerate(results.data):
                # TODO(bc): Hive seems to always return tab delimited row data.
                #           What if a cell has a tab?
                row = row.split('\t')
                try:
                    yield formatter.format_row(row)
                except TooBigToDownloadException, ex:
                    LOG.error(ex)
                    # Exceeded limit. Stop.
                    results.has_more = False
                    break
            if results.has_more:
                next_row += len(results.data)
                # Force a refetch of the next batch.
                results = None
            else:
                yield formatter.fini_doc()
                break
def data_generator(query_model, formatter):
    """
    data_generator(query_model, formatter) -> generator object

    Return a generator object for a csv. The first line is the column names.

    This is similar to export_csvxls.generator, but has one or two extra
    complexities.
    """
    global _DATA_WAIT_SLEEP
    is_first_row = True
    # Row index we expect the server's next batch to start at.
    next_row = 0
    results = None
    handle = QueryHandle(query_model.server_id, query_model.log_context)

    yield formatter.init_doc()

    while True:
        # Make sure that we have the next batch of ready results
        while results is None or not results.ready:
            # start_over only on the very first fetch; fetch_size=-1 asks the
            # server for its maximum batch size.
            results = db_utils.db_client(query_model.get_query_server()).fetch(handle, start_over=is_first_row, fetch_size=-1)
            if not results.ready:
                time.sleep(_DATA_WAIT_SLEEP)

        # Someone is reading the results concurrently. Abort.
        # But unfortunately, this current generator will produce incomplete data.
        if next_row != results.start_row:
            msg = _('Error: Potentially incomplete results as an error occurred during data retrieval.')
            yield formatter.format_row([msg])
            err = (_('Detected another client retrieving results for %(server_id)s. '
                     'Expected next row to be %(row)s and got %(start_row)s. Aborting')
                   % {'server_id': query_model.server_id,
                      'row': next_row,
                      'start_row': results.start_row})
            LOG.error(err)
            raise RuntimeError(err)

        if is_first_row:
            # First pass emits only the header; the same batch's data rows
            # are emitted on the next loop iteration.
            is_first_row = False
            yield formatter.format_header(results.columns)
        else:
            for i, row in enumerate(results.data):
                # TODO(bc): Hive seems to always return tab delimited row data.
                #           What if a cell has a tab?
                row = row.split('\t')
                try:
                    yield formatter.format_row(row)
                except TooBigToDownloadException, ex:
                    LOG.error(ex)
                    # Exceeded limit. Stop.
                    results.has_more = False
                    break
            if results.has_more:
                next_row += len(results.data)
                # Force a refetch of the next batch.
                results = None
            else:
                yield formatter.fini_doc()
                break
def expand_exception(exc):
    """
    expand_exception(exc) -> (error msg, log message)

    Turn a server exception into a displayable (error message, log) pair.
    Fetching the log is best-effort: any failure talking to the server
    falls back to a placeholder string rather than propagating.
    """
    try:
        log = db_utils.db_client().get_log(exc.log_context)
    except Exception:
        # Always show something, even if server has died on the job.
        # Narrowed from a bare ``except:`` so system-exiting exceptions
        # (KeyboardInterrupt, SystemExit) are no longer swallowed.
        log = "Could not retrieve log."
    if not exc.message:
        error_message = "Unknown exception."
    else:
        error_message = exc.message
    return error_message, log
def expand_exception(exc):
    """
    expand_exception(exc) -> (error msg, log message)

    Turn a server exception into a displayable (error message, log) pair,
    forcing the message to unicode. Fetching the log is best-effort: any
    failure talking to the server falls back to a placeholder string.
    """
    try:
        log = db_utils.db_client().get_log(exc.log_context)
    except Exception:
        # Always show something, even if server has died on the job.
        # Narrowed from a bare ``except:`` so system-exiting exceptions
        # (KeyboardInterrupt, SystemExit) are no longer swallowed.
        log = _("Could not retrieve log.")
    if not exc.message:
        error_message = _("Unknown exception.")
    else:
        # Guard against badly-encoded server messages.
        error_message = force_unicode(exc.message, strings_only=True, errors='replace')
    return error_message, log
def configuration(request):
    """
    Render the server configuration page.

    On POST with a valid QueryServerForm, fetches the selected server's
    default configuration (optionally including Hadoop settings); on GET,
    shows an empty form.

    Bug fix: ``config_values`` was previously unbound when a POST carried
    an invalid form, raising NameError at render time. It now defaults to
    an empty dict on every path.
    """
    config_values = {}
    if request.method == 'POST':
        server_form = QueryServerForm(request.POST)
        if server_form.is_valid():
            query_server = db_utils.get_query_server(server_form.cleaned_data["server"])
            config_values = db_utils.db_client(query_server).get_default_configuration(
                bool(request.REQUEST.get("include_hadoop", False)))
    else:
        server_form = QueryServerForm()

    return render("configuration.mako", request,
                  {'config_values': config_values, 'server_form': server_form})
def view_results(request, id, first_row=0, last_result_len=0): """ Returns the view for the results of the QueryHistory with the given id. The query results MUST be ready. To display query results, one should always go through the watch_query view. If ``first_row`` is 0, restarts (if necessary) the query read. Otherwise, just spits out a warning if first_row doesn't match the servers conception. Multiple readers will produce a confusing interaction here, and that's known. It understands the ``context`` GET parameter. (See watch_query().) """ # Coerce types; manage arguments id = int(id) first_row = long(first_row) start_over = (first_row == 0) query_history = authorized_get_history(request, id, must_exist=True) handle = QueryHandle(id=query_history.server_id, log_context=query_history.log_context) context = _parse_query_context(request.GET.get('context')) # Retrieve query results try: results = db_utils.db_client(query_history.get_query_server()).fetch( handle, start_over, -1) assert results.ready, _( 'Trying to display result that is not yet ready. Query id %(id)s' ) % { 'id': id } # We display the "Download" button only when we know # that there are results: downloadable = (first_row > 0 or len(results.data) > 0) fetch_error = False except BeeswaxException, ex: fetch_error = True error_message, log = expand_exception(ex)
def save_results(request, id):
    """
    Save the results of a query to an HDFS directory.

    GET shows the form implicitly via fallthrough; POST validates the
    result state and either moves the result files to a target directory
    or creates a new table from them (CTAS).
    """
    id = int(id)
    query_history = models.QueryHistory.objects.get(id=id)
    # Only the query owner may save its results.
    if query_history.owner != request.user:
        raise PopupException('This action is only available to the user who submitted the query.')
    _, state = _get_server_id_and_state(query_history)
    query_history.save_state(state)
    error_msg, log = None, None

    if request.method == 'POST':
        # Make sure the result is available.
        # Note that we may still hit errors during the actual save
        if state != models.QueryHistory.STATE.available:
            if state in (models.QueryHistory.STATE.failed, models.QueryHistory.STATE.expired):
                msg = 'This query has %s. Results unavailable.' % (state,)
            else:
                msg = 'The result of this query is not available yet.'
            raise PopupException(msg)

        form = beeswax.forms.SaveResultsForm(request.POST)

        # Cancel goes back to results
        if request.POST.get('cancel'):
            return format_preserving_redirect(request, '/beeswax/watch/%s' % (id,))

        if form.is_valid():
            # Do save
            # 1. Get the results metadata
            assert request.POST.get('save')
            handle = QueryHandle(id=query_history.server_id, log_context=query_history.log_context)
            try:
                result_meta = db_utils.db_client().get_results_metadata(handle)
            except QueryNotFoundException, ex:
                LOG.exception(ex)
                raise PopupException('Cannot find query.')
            if result_meta.table_dir:
                # Strip scheme/host from the HDFS URI, keep only the path.
                result_meta.table_dir = request.fs.urlsplit(result_meta.table_dir)[2]

            # 2. Check for partitioned tables
            if result_meta.table_dir is None:
                raise PopupException(
                    'Saving results from a partitioned table is not supported. '
                    'You may copy from the HDFS location manually.')

            # 3. Actual saving of results
            try:
                if form.cleaned_data['save_target'] == form.SAVE_TYPE_DIR:
                    # To dir
                    if result_meta.in_tablename:
                        raise PopupException(
                            'Saving results from a table to a directory is not supported. '
                            'You may copy from the HDFS location manually.')
                    target_dir = form.cleaned_data['target_dir']
                    request.fs.rename_star(result_meta.table_dir, target_dir)
                    LOG.debug("Moved results from %s to %s" % (result_meta.table_dir, target_dir))
                    # Files are gone from the result dir: mark the result expired.
                    query_history.save_state(models.QueryHistory.STATE.expired)
                    fb_url = location_to_url(request, target_dir, strict=False)
                    popup = PopupWithJframe('Query results stored in %s' % (target_dir,),
                                            launch_app_name='FileBrowser',
                                            launch_app_url=fb_url)
                    return render_injected(list_query_history(request), popup)
                elif form.cleaned_data['save_target'] == form.SAVE_TYPE_TBL:
                    # To new table
                    try:
                        return _save_results_ctas(request, query_history,
                                                  form.cleaned_data['target_table'], result_meta)
                    except BeeswaxException, bex:
                        LOG.exception(bex)
                        error_msg, log = expand_exception(bex)
            except IOError, ex:
                # Filesystem move failed; surface the error on the form page.
                LOG.exception(ex)
                error_msg = str(ex)
def configuration(request):
    """
    Render the server's default configuration values.

    The ``include_hadoop`` request parameter, when truthy, asks the server
    to include Hadoop settings as well.
    """
    include_hadoop = bool(request.REQUEST.get("include_hadoop", False))
    values = db_utils.db_client().get_default_configuration(include_hadoop)
    return render("configuration.mako", request, {'config_values': values})
# NOTE(review): fragment — tail of a view_results() variant; the enclosing
# def/try is outside this span, so indentation is reconstructed here.
fetch_error = True
error_message, log = expand_exception(ex)

# Handle errors
if fetch_error:
    return render('watch_results.mako', request, {
        'query': query_history,
        'error': True,
        'error_message': error_message,
        'log': log,
        'hadoop_jobs': _parse_out_hadoop_jobs(log),
        'query_context': context,
        'can_save': False,
    })

# Fetch the server log for display alongside the results.
log = db_utils.db_client().get_log(query_history.server_id)

# Build one download URL per supported export format.
download_urls = {}
if downloadable:
    for format in common.DL_FORMATS:
        download_urls[format] = urlresolvers.reverse(
            download, kwargs=dict(id=str(id), format=format))
save_form = beeswax.forms.SaveResultsForm()

# Display the results
return render('watch_results.mako', request, {
    'error': False,
    'query': query_history,
    # Materialize, for easier testability.
    'results': list(parse_results(results.data)),
    'has_more': results.has_more,
def watch_query(request, id):
    """
    Wait for query ``id`` to finish and (by default) display its results.

    Optional GET parameters:

      on_success_url
        Redirect target once the query finishes successfully. Defaults to
        the view_results page.

      context
        A "name:data" string describing what generated this query result,
        either "table":"<table_name>" or "design":<design_id>.

    All remaining GET parameters are forwarded to on_success_url.
    """
    id = int(id)
    query_history = authorized_get_history(request, id, must_exist=True)

    raw_context = request.GET.get('context', '')

    # Default success destination: the results view for this query.
    results_url = urlresolvers.reverse(
        view_results, kwargs=dict(id=str(id), first_row=0, last_result_len=0))
    on_success_url = request.GET.get('on_success_url') or results_url

    # Refresh and persist the query's current state.
    server_id, state = _get_server_id_and_state(query_history)
    query_history.save_state(state)

    # Done, one way or another?
    if state == QueryHistory.STATE.expired:
        raise PopupException(_("The result of this query has expired."))
    if state == QueryHistory.STATE.available:
        return format_preserving_redirect(request, on_success_url, request.GET)
    if state == QueryHistory.STATE.failed:
        # Redirect to the results view: fetching there raises a
        # BeeswaxException carrying the log we want to display.
        return format_preserving_redirect(request, results_url, request.GET)

    # Still running: show the wait page with the server log so far.
    log = db_utils.db_client(query_history.get_query_server()).get_log(server_id)
    return render('watch_wait.mako', request, {
        'query': query_history,
        'fwd_params': request.GET.urlencode(),
        'log': log,
        'hadoop_jobs': _parse_out_hadoop_jobs(log),
        'query_context': _parse_query_context(raw_context),
    })
error_message, log = expand_exception(ex) # Handle errors if fetch_error: return render( 'watch_results.mako', request, { 'query': query_history, 'error': True, 'error_message': error_message, 'log': log, 'hadoop_jobs': _parse_out_hadoop_jobs(log), 'query_context': context, 'can_save': False, }) log = db_utils.db_client(query_history.get_query_server()).get_log( query_history.server_id) download_urls = {} if downloadable: for format in common.DL_FORMATS: download_urls[format] = urlresolvers.reverse(download, kwargs=dict( id=str(id), format=format)) save_form = SaveResultsForm() has_more = True last_result_len = long(last_result_len) if (last_result_len != 0 and len(results.data) != last_result_len) or len( results.data) == 0: has_more = False # Display the results
def save_results(request, id):
    """
    Save the results of a query to an HDFS directory.

    GET shows the form implicitly via fallthrough; POST validates the
    result state and either moves the result files to a target directory
    or creates a new table from them (CTAS).
    """
    id = int(id)
    query_history = models.QueryHistory.objects.get(id=id)
    # Only the query owner may save its results.
    if query_history.owner != request.user:
        raise PopupException(_('This action is only available to the user who submitted the query.'))
    server_id, state = _get_server_id_and_state(query_history)
    query_history.save_state(state)
    error_msg, log = None, None

    if request.method == 'POST':
        # Make sure the result is available.
        # Note that we may still hit errors during the actual save
        if state != models.QueryHistory.STATE.available:
            if state in (models.QueryHistory.STATE.failed, models.QueryHistory.STATE.expired):
                msg = _('This query has %(state)s. Results unavailable.') % {'state': state}
            else:
                msg = _('The result of this query is not available yet.')
            raise PopupException(msg)

        form = beeswax.forms.SaveResultsForm(request.POST)

        # Cancel goes back to results
        if request.POST.get('cancel'):
            return format_preserving_redirect(request, '/beeswax/watch/%s' % (id,))

        if form.is_valid():
            # Do save
            # 1. Get the results metadata
            assert request.POST.get('save')
            handle = QueryHandle(id=query_history.server_id, log_context=query_history.log_context)
            try:
                result_meta = db_utils.db_client().get_results_metadata(handle)
            except QueryNotFoundException, ex:
                LOG.exception(ex)
                raise PopupException(_('Cannot find query.'))
            if result_meta.table_dir:
                # Strip scheme/host from the HDFS URI, keep only the path.
                result_meta.table_dir = request.fs.urlsplit(result_meta.table_dir)[2]

            # 2. Check for partitioned tables
            if result_meta.table_dir is None:
                raise PopupException(_('Saving results from a partitioned table is not supported. You may copy from the HDFS location manually.'))

            # 3. Actual saving of results
            try:
                if form.cleaned_data['save_target'] == form.SAVE_TYPE_DIR:
                    # To dir
                    if result_meta.in_tablename:
                        raise PopupException(_('Saving results from a table to a directory is not supported. You may copy from the HDFS location manually.'))
                    target_dir = form.cleaned_data['target_dir']
                    request.fs.rename_star(result_meta.table_dir, target_dir)
                    LOG.debug("Moved results from %s to %s" % (result_meta.table_dir, target_dir))
                    # Files are gone from the result dir: mark the result expired.
                    query_history.save_state(models.QueryHistory.STATE.expired)
                    return HttpResponse(urlresolvers.reverse('filebrowser.views.view', kwargs={'path': target_dir}))
                elif form.cleaned_data['save_target'] == form.SAVE_TYPE_TBL:
                    # To new table
                    try:
                        return _save_results_ctas(request, query_history, form.cleaned_data['target_table'], result_meta)
                    except BeeswaxException, bex:
                        LOG.exception(bex)
                        error_msg, log = expand_exception(bex)
            except WebHdfsException, ex:
                raise PopupException(_('The table could not be saved.'), detail=ex)
            except IOError, ex:
                # Filesystem move failed; surface the error on the form page.
                LOG.exception(ex)
                error_msg = str(ex)