def check_status(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['query_status'] = get_api(request, snippet).check_status(notebook, snippet)
    response['status'] = 0
  except SessionExpired:
    response['status'] = 'expired'
    raise
  except QueryExpired:
    response['status'] = 'expired'
    raise
  finally:
    if response['status'] == 0 and snippet['status'] != response['query_status']:
      status = response['query_status']['status']
    elif response['status'] == 'expired':
      status = 'expired'
    else:
      status = 'failed'
    if notebook['type'].startswith('query'):
      nb_doc = Document2.objects.get(id=notebook['id'])
      nb_doc.can_write_or_exception(request.user)
      nb = Notebook(document=nb_doc).get_data()
      nb['snippets'][0]['status'] = status
      nb_doc.update_data(nb)
      nb_doc.save()

  return JsonResponse(response)

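# Nearly every view in this corpus repeats the same decoding of the JSON-encoded
# 'notebook' and 'snippet' POST parameters. A minimal sketch of a shared helper that
# could factor that out (hypothetical; not part of the original modules):
import json

def _parse_notebook_and_snippet(request):
  """Return the (notebook, snippet) dicts sent by the editor front-end."""
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  return notebook, snippet
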
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    # TODO get schema from explain query
    pass

  return JsonResponse(format_)

def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['handle'] = get_api(request, snippet).execute(notebook, snippet)
  finally:
    if notebook['type'].startswith('query-'):
      _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]
      if 'handle' in response:  # No failure
        _snippet['result']['handle'] = response['handle']
        _snippet['result']['statements_count'] = response['handle']['statements_count']
      else:
        _snippet['status'] = 'failed'
      history = _historify(notebook, request.user)
      response['history_id'] = history.id
      response['history_uuid'] = history.uuid

  # Materialize and HTML escape results
  if response['handle'].get('sync') and response['handle']['result'].get('data'):
    response['handle']['result']['data'] = escape_rows(response['handle']['result']['data'])

  response['status'] = 0

  return JsonResponse(response)

def get_logs(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  startFrom = request.POST.get('from')
  startFrom = int(startFrom) if startFrom else None
  size = request.POST.get('size')
  size = int(size) if size else None

  db = get_api(request, snippet)

  full_log = smart_str(request.POST.get('full_log', ''))
  logs = db.get_log(notebook, snippet, startFrom=startFrom, size=size)
  full_log += logs

  jobs = db.get_jobs(notebook, snippet, full_log)

  response['logs'] = logs.strip()
  response['progress'] = min(db.progress(snippet, full_log), 99) if snippet['status'] != 'available' and snippet['status'] != 'success' else 100
  response['jobs'] = jobs
  response['isFullLogs'] = snippet.get('interface') == 'oozie'
  response['status'] = 0

  return JsonResponse(response)

def get_logs(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  startFrom = request.POST.get('from')
  startFrom = int(startFrom) if startFrom else None
  size = request.POST.get('size')
  size = int(size) if size else None

  db = get_api(request, snippet)
  logs = db.get_log(notebook, snippet, startFrom=startFrom, size=size)

  jobs = json.loads(request.POST.get('jobs', '[]'))

  # Get any new jobs from current logs snippet
  new_jobs = db.get_jobs(notebook, snippet, logs)

  # Append new jobs to known jobs and get the unique set
  if new_jobs:
    all_jobs = jobs + new_jobs
    jobs = dict((job['name'], job) for job in all_jobs).values()

  # Retrieve full log for job progress parsing
  full_log = request.POST.get('full_log', logs)

  response['logs'] = logs
  response['progress'] = db.progress(snippet, full_log) if snippet['status'] != 'available' and snippet['status'] != 'success' else 100
  response['jobs'] = jobs
  response['status'] = 0

  return JsonResponse(response)

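# The job merge above keeps the most recent entry per job name. A standalone
# illustration of that dict-keyed de-duplication (sample values are made up):
old_jobs = [{'name': 'application_1', 'percentJob': 50}]
new_jobs = [{'name': 'application_1', 'percentJob': 80}, {'name': 'application_2', 'percentJob': 10}]

merged = dict((job['name'], job) for job in old_jobs + new_jobs).values()
# 'application_1' appears once, with the newer 80% progress, alongside 'application_2'.
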
def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
  snippet = notebook['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  elif action == 'insert_as_query':
    sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
    'notebooks_json': json.dumps([editor.get_data()]),
    'options_json': json.dumps({
      'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
      'mode': 'editor',
      'success_url': success_url
    }),
    'editor_type': editor_type,
  })

def export_result(request):
  response = {'status': -1, 'message': _('Exporting result failed.')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', 'hdfs-file'))
  destination = json.loads(request.POST.get('destination', ''))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file':
    if overwrite and request.fs.exists(destination):
      if request.fs.isfile(destination):
        request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
      else:
        raise ValidationError(_("The target path is a directory"))
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
  elif data_format == 'hive-table':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0
  elif data_format == 'hdfs-directory':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0

  return JsonResponse(response)

def get_logs(request):
  response = {"status": -1}

  notebook = json.loads(request.POST.get("notebook", "{}"))
  snippet = json.loads(request.POST.get("snippet", "{}"))

  startFrom = request.POST.get("from")
  startFrom = int(startFrom) if startFrom else None
  size = request.POST.get("size")
  size = int(size) if size else None

  db = get_api(request, snippet)

  full_log = str(request.POST.get("full_log", ""))
  logs = db.get_log(notebook, snippet, startFrom=startFrom, size=size)
  full_log += logs

  jobs = db.get_jobs(notebook, snippet, full_log)

  response["logs"] = logs.strip()
  response["progress"] = db.progress(snippet, full_log) if snippet["status"] != "available" and snippet["status"] != "success" else 100
  response["jobs"] = jobs
  response["isFullLogs"] = snippet.get("interface") == "oozie"
  response["status"] = 0

  return JsonResponse(response)

def get_logs(request):
  response = {"status": -1}

  notebook = json.loads(request.POST.get("notebook", "{}"))
  snippet = json.loads(request.POST.get("snippet", "{}"))

  startFrom = request.POST.get("from")
  startFrom = int(startFrom) if startFrom else None
  size = request.POST.get("size")
  size = int(size) if size else None

  db = get_api(request.user, snippet, request.fs, request.jt)
  logs = db.get_log(notebook, snippet, startFrom=startFrom, size=size)

  jobs = json.loads(request.POST.get("jobs", "[]"))

  # Get any new jobs from current logs snippet
  new_jobs = db.get_jobs(notebook, snippet, logs)

  # Append new jobs to known jobs and get the unique set
  if new_jobs:
    all_jobs = jobs + new_jobs
    jobs = dict((job["name"], job) for job in all_jobs).values()

  response["logs"] = logs
  response["progress"] = db.progress(snippet, logs) if snippet["status"] != "available" and snippet["status"] != "success" else 100
  response["jobs"] = jobs
  response["status"] = 0

  return JsonResponse(response)

def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['handle'] = get_api(request, snippet).execute(notebook, snippet)
  finally:
    if notebook['type'].startswith('query-'):
      _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]
      if 'handle' in response:  # No failure
        _snippet['result']['handle'] = response['handle']
      else:
        _snippet['status'] = 'failed'
      history = _historify(notebook, request.user)
      response['history_id'] = history.id
      response['history_uuid'] = history.uuid
      if notebook['isSaved']:  # Keep track of history of saved queries
        response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid

  # Materialize and HTML escape results
  if response['handle'].get('sync') and response['handle']['result'].get('data'):
    response['handle']['result']['data'] = escape_rows(response['handle']['result']['data'])

  response['status'] = 0

  return JsonResponse(response)

def dt_logout(request, next_page=None):
  """Log out the user"""
  username = request.user.get_username()
  request.audit = {
    'username': username,
    'operation': 'USER_LOGOUT',
    'operationText': 'Logged out user: %s' % username
  }

  # Close Impala session on logout
  session_app = "impala"
  if request.user.has_hue_permission(action='access', app=session_app):
    session = {"type": session_app, "sourceMethod": "dt_logout"}
    try:
      get_api(request, session).close_session(session)
    except Exception as e:
      LOG.warn("Error closing Impala session: %s" % e)

def close_session(request):
  response = {"status": -1}

  session = json.loads(request.POST.get("session", "{}"))

  response["session"] = get_api(request, {"type": session["type"]}).close_session(session=session)
  response["status"] = 0

  return JsonResponse(response)

def explain(request):
  response = {"status": -1}

  notebook = json.loads(request.POST.get("notebook", "{}"))
  snippet = json.loads(request.POST.get("snippet", "{}"))

  response = get_api(request, snippet).explain(notebook, snippet)

  return JsonResponse(response)

def close_session(request):
  response = {'status': -1}

  session = json.loads(request.POST.get('session', '{}'))

  response['session'] = get_api(request, {'type': session['type']}).close_session(session=session)
  response['status'] = 0

  return JsonResponse(response)

def explain(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response = get_api(request, snippet).explain(notebook, snippet)

  return JsonResponse(response)

def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response['handle'] = get_api(request.user, snippet).execute(notebook, snippet)
  response['status'] = 0

  return JsonResponse(response)

def fetch_result_metadata(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response['result'] = get_api(request, snippet).fetch_result_metadata(notebook, snippet)
  response['status'] = 0

  return JsonResponse(response)

def cancel_statement(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response['result'] = get_api(request, snippet).cancel(notebook, snippet)
  response['status'] = 0

  return JsonResponse(response)

def check_status(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response['query_status'] = get_api(request, snippet).check_status(notebook, snippet)
  response['status'] = 0

  return JsonResponse(response)

def check_status(request):
  response = {"status": -1}

  notebook = json.loads(request.POST.get("notebook", "{}"))
  snippet = json.loads(request.POST.get("snippet", "{}"))

  response["query_status"] = get_api(request.user, snippet, request.fs, request.jt).check_status(notebook, snippet)
  response["status"] = 0

  return JsonResponse(response)

def cancel_statement(request):
  response = {"status": -1}

  notebook = json.loads(request.POST.get("notebook", "{}"))
  snippet = json.loads(request.POST.get("snippet", "{}"))

  response["result"] = get_api(request.user, snippet, request.fs, request.jt).cancel(notebook, snippet)
  response["status"] = 0

  return JsonResponse(response)

def fetch_result_metadata(request):
  response = {"status": -1}

  notebook = json.loads(request.POST.get("notebook", "{}"))
  snippet = json.loads(request.POST.get("snippet", "{}"))

  response["result"] = get_api(request.user, snippet, request.fs, request.jt).fetch_result_metadata(notebook, snippet)
  response["status"] = 0

  return JsonResponse(response)

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':  # Only support open query history
    # TODO get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
      "sample": sample['rows'][:4],
      "sample_cols": sample.meta,
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample.meta
      ]
    }

  return JsonResponse(format_)

def fetch_result_data(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  rows = json.loads(request.POST.get('rows', '100'))
  start_over = json.loads(request.POST.get('startOver', 'false'))

  response['result'] = get_api(request.user, snippet).fetch_result(notebook, snippet, rows, start_over)
  response['status'] = 0

  return JsonResponse(response)

def close_notebook(request):
  response = {'status': -1, 'result': []}

  notebook = json.loads(request.POST.get('notebook', '{}'))

  for session in notebook['sessions']:
    try:
      response['result'].append(get_api(request.user, session, request.fs, request.jt).close_session(session))
    except QueryExpired:
      pass
    except Exception as e:
      LOG.exception('Error closing session %s' % str(e))

def close_notebook(request):
  response = {'status': -1, 'result': []}

  notebook = json.loads(request.POST.get('notebook', '{}'))

  for session in [_s for _s in notebook['sessions'] if _s['type'] in ('scala', 'spark', 'pyspark', 'sparkr')]:
    try:
      response['result'].append(get_api(request, session).close_session(session))
    except QueryExpired:
      pass
    except Exception as e:
      LOG.exception('Error closing session %s' % str(e))

def upgrade_session_properties(request, notebook):
  # Upgrade session data if using old format
  data = notebook.get_data()

  for session in data.get("sessions", []):
    api = get_api(request, session)
    if "type" in session and hasattr(api, "upgrade_properties"):
      properties = session.get("properties", None)
      session["properties"] = api.upgrade_properties(session["type"], properties)

  notebook.data = json.dumps(data)
  return notebook

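# upgrade_properties() is implemented by the per-engine API objects. A minimal sketch
# of the expected contract, assuming properties are a list of {'key': ..., 'value': ...}
# dicts and a hypothetical get_default_properties() helper supplies current defaults:
def upgrade_properties(self, lang='hive', properties=None):
  upgraded = dict((p['key'], p) for p in self.get_default_properties(lang))  # hypothetical helper
  # Carry over values the old session had already customized.
  for prop in properties or []:
    if prop['key'] in upgraded:
      upgraded[prop['key']]['value'] = prop.get('value')
  return list(upgraded.values())
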
def statement_risk(request):
  response = {'status': -1, 'message': ''}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  api = get_api(request, snippet)

  response['query_complexity'] = api.statement_risk(notebook, snippet)
  response['status'] = 0

  return JsonResponse(response)

def upgrade_session_properties(request, notebook):
  # Upgrade session data if using old format
  data = notebook.get_data()

  for session in data.get('sessions', []):
    api = get_api(request, session)
    if 'type' in session and hasattr(api, 'upgrade_properties'):
      properties = session.get('properties', None)
      session['properties'] = api.upgrade_properties(session['type'], properties)

  notebook.data = json.dumps(data)
  return notebook

def create_session(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  session = json.loads(request.POST.get('session', '{}'))

  properties = session.get('properties', [])

  response['session'] = get_api(request, session).create_session(lang=session['type'], properties=properties)
  response['status'] = 0

  return JsonResponse(response)

def execute(request):
  response = {'status': -1}
  result = None

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    try:
      response['handle'] = get_api(request, snippet).execute(notebook, snippet)

      # Retrieve and remove the result from the handle
      if response['handle'].get('sync'):
        result = response['handle'].pop('result')
    finally:
      if notebook['type'].startswith('query-'):
        _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]
        if 'handle' in response:  # No failure
          _snippet['result']['handle'] = response['handle']
          _snippet['result']['statements_count'] = response['handle'].get('statements_count', 1)
          _snippet['result']['statement_id'] = response['handle'].get('statement_id', 0)
          _snippet['result']['handle']['statement'] = response['handle'].get('statement', snippet['statement'])  # For non HS2, as non multi query yet
        else:
          _snippet['status'] = 'failed'

        history = _historify(notebook, request.user)
        response['history_id'] = history.id
        response['history_uuid'] = history.uuid
        if notebook['isSaved']:  # Keep track of history of saved queries
          response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid
  except QueryError as ex:  # We inject the history information from _historify() to the failed queries
    if response.get('history_id'):
      ex.extra['history_id'] = response['history_id']
    if response.get('history_uuid'):
      ex.extra['history_uuid'] = response['history_uuid']
    if response.get('history_parent_uuid'):
      ex.extra['history_parent_uuid'] = response['history_parent_uuid']
    raise ex

def fields(self, dashboard):
  database, table = self._get_database_table_names(dashboard)
  snippet = {'type': self.engine}

  table_metadata = get_api(MockRequest(self.user), snippet).autocomplete(snippet, database, table)

  return {
    'schema': {
      'fields': dict([
        (col['name'], {
          'name': str(escape(col['name'])),
          'type': str(col['type']),
          'uniqueKey': col.get('primary_key') == 'true',
          # 'dynamicBase': False,
          'indexed': False,
          'stored': True,
          'required': col.get('primary_key') == 'true'
        })
        for col in table_metadata['extended_columns']
      ])
    }
  }

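# MockRequest stands in for a real Django request so that get_api() can be called from
# dashboard code outside the HTTP cycle (also used by datasets() and stats() below).
# A minimal sketch of its assumed shape; the real class may carry more attributes:
class MockRequest(object):
  def __init__(self, user, fs=None, jt=None):
    self.user = user  # get_api() reads the user for impersonation/permissions
    self.fs = fs      # filesystem client, when the engine needs one
    self.jt = jt      # job tracker handle, for older APIs
    self.POST = {}
    self.GET = {}
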
def browse(request, database, table, partition_spec=None):
  snippet = {'type': 'hive'}

  statement = get_api(request, snippet).get_browse_query(snippet, database, table, partition_spec)
  editor_type = snippet['type']

  if request.method == 'POST':
    notebook = make_notebook(name='Execute and watch', editor_type=editor_type, statement=statement, status='ready-execute', is_task=True)
    return JsonResponse(notebook.execute(request, batch=False))
  else:
    editor = make_notebook(name='Browse', editor_type=editor_type, statement=statement, status='ready-execute')

    return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
        'languages': get_ordered_interpreters(request.user),
        'mode': 'editor',
        'editor_type': editor_type
      }),
      'editor_type': editor_type,
    })

def autocomplete(request, server=None, database=None, table=None, column=None, nested=None):
  response = {'status': -1}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    autocomplete_data = get_api(request, snippet).autocomplete(snippet, database, table, column, nested)
    response.update(autocomplete_data)
  except QueryExpired:
    pass

  response['status'] = 0

  return JsonResponse(response)

def check_status(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  if not snippet:
    nb_doc = Document2.objects.get_by_uuid(user=request.user, uuid=notebook['id'])
    notebook = Notebook(document=nb_doc).get_data()
    snippet = notebook['snippets'][0]

  try:
    response['query_status'] = get_api(request, snippet).check_status(notebook, snippet)
    response['status'] = 0
  except SessionExpired:
    response['status'] = 'expired'
    raise
  except QueryExpired:
    response['status'] = 'expired'
    raise
  finally:
    if response['status'] == 0 and snippet['status'] != response['query_status']:
      status = response['query_status']['status']
    elif response['status'] == 'expired':
      status = 'expired'
    else:
      status = 'failed'
    if notebook['type'].startswith('query') or notebook.get('isManaged'):
      nb_doc = Document2.objects.get(id=notebook['id'])
      if nb_doc.can_write(request.user):
        nb = Notebook(document=nb_doc).get_data()
        if status != nb['snippets'][0]['status']:
          nb['snippets'][0]['status'] = status
          nb_doc.update_data(nb)
          nb_doc.save()

  return JsonResponse(response)

def create_session(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  session = json.loads(request.POST.get('session', '{}'))

  properties = session.get('properties', [])

  # If not properties look for previously used notebook session
  if not properties:
    old_session = [_session for _session in notebook['sessions'] if _session['type'] == session['type']]
    if any(old_session) and 'properties' in old_session[0]:
      properties = old_session[0]['properties']

  response['session'] = get_api(request, session).create_session(lang=session['type'], properties=properties)
  response['status'] = 0

  return JsonResponse(response)

def download(request):
  if not ENABLE_DOWNLOAD.get():
    return serve_403_error(request)

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  file_format = request.POST.get('format', 'csv')

  response = get_api(request, snippet).download(notebook, snippet, file_format)

  if response:
    request.audit = {
      'operation': 'DOWNLOAD',
      'operationText': 'User %s downloaded results from %s as %s' % (request.user.username, _get_snippet_name(notebook), file_format),
      'allowed': True
    }

  return response

def _get_api(request):
  file_format = json.loads(request.POST.get('source', request.POST.get('fileFormat', '{}')))
  options = None
  query_server = None

  if file_format['rdbmsMode'] == 'customRdbms':
    type = 'custom'
    if file_format['rdbmsType'] == 'jdbc':
      name = file_format['rdbmsHostname']  # We make sure it's unique as name is the cache key
      interface = file_format['rdbmsType']
      options = {
        'driver': file_format['rdbmsJdbcDriver'],
        'url': file_format['rdbmsHostname'],
        'user': file_format['rdbmsUsername'],
        'password': file_format['rdbmsPassword']
      }
    else:
      interface = 'rdbms'
      query_server = {
        'server_name': file_format['rdbmsType'],
        'server_host': file_format['rdbmsHostname'],
        'server_port': int(file_format['rdbmsPort'] or '3306'),
        'username': file_format['rdbmsUsername'],
        'password': file_format['rdbmsPassword'],
        'options': {},
        'alias': file_format['rdbmsType']
      }
      name = 'rdbms:%(server_name)s://%(server_host)s:%(server_port)s' % query_server  # We make sure it's unique as name is the cache key
  else:
    if file_format['rdbmsType'] == 'jdbc':
      type = file_format['rdbmsJdbcDriverName'] and file_format['rdbmsJdbcDriverName'].lower()
    else:
      type = file_format['rdbmsType']
      query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    name = type
    interface = file_format['inputFormat']

  return get_api(request, {
    'type': type,
    'interface': interface,
    'options': options,
    'query_server': query_server,
    'name': name
  })

def get_logs(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  startFrom = request.POST.get('from')
  startFrom = int(startFrom) if startFrom else None
  size = request.POST.get('size')
  size = int(size) if size else None

  db = get_api(request, snippet)
  logs = db.get_log(notebook, snippet, startFrom=startFrom, size=size)

  jobs = json.loads(request.POST.get('jobs', '[]'))

  # Get any new jobs from current logs snippet
  new_jobs = db.get_jobs(notebook, snippet, logs)

  # Append new jobs to known jobs and get the unique set
  if new_jobs:
    all_jobs = jobs + new_jobs
    jobs = dict((job['name'], job) for job in all_jobs).values()

  # Retrieve full log for job progress parsing
  full_log = request.POST.get('full_log', logs)

  response['logs'] = logs.strip()
  response['progress'] = db.progress(snippet, full_log) if snippet['status'] != 'available' and snippet['status'] != 'success' else 100
  response['jobs'] = jobs
  response['status'] = 0

  return JsonResponse(response)

def export_result(request):
  response = {'status': -1, 'message': _('Exporting result failed.')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', 'hdfs-file'))
  destination = json.loads(request.POST.get('destination', ''))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file':
    if request.fs.isdir(destination):
      if notebook.get('name'):
        destination += '/%(name)s.csv' % notebook
      else:
        destination += '/%(type)s-%(id)s.csv' % notebook
    if overwrite and request.fs.exists(destination):
      request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
  elif data_format == 'hive-table':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0
  elif data_format == 'hdfs-directory':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0
  elif data_format == 'search-index':
    notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
    response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
    response['status'] = 0

  return JsonResponse(response)

def browse(request, database, table):
  snippet = {'type': 'hive'}

  sql_select = get_api(request, snippet).get_select_star_query(snippet, database, table)
  editor_type = snippet['type']

  editor = make_notebook(name='Browse', editor_type=editor_type, statement=sql_select, status='ready-execute')

  return render('editor.mako', request, {
    'notebooks_json': json.dumps([editor.get_data()]),
    'options_json': json.dumps({
      'languages': get_ordered_interpreters(request.user),
      'mode': 'editor',
      'editor_type': editor_type
    }),
    'editor_type': editor_type,
  })

def check_status(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['query_status'] = get_api(request, snippet).check_status(notebook, snippet)
    response['status'] = 0
  finally:
    if response['status'] == 0 and snippet['status'] != response['query_status']:
      status = response['query_status']['status']
    else:
      status = 'failed'
    nb_doc = Document2.objects.get(id=notebook['id'])
    nb_doc.can_write_or_exception(request.user)
    nb = Notebook(document=nb_doc).get_data()
    nb['snippets'][0]['status'] = status
    nb_doc.update_data(nb)
    nb_doc.save()

  return JsonResponse(response)

def run_morphline(self, request, collection_name, morphline, input_path, query=None, start_time=None, lib_path=None):
  workspace_path = self._upload_workspace(morphline)

  task = make_notebook(
    name=_('Indexing into %s') % collection_name,
    editor_type='notebook',
    on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
    pub_sub_url='assist.collections.refresh',
    is_task=True,
    is_notebook=True,
    last_executed=start_time
  )

  if query:
    q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
    notebook_data = q.get_data()
    snippet = notebook_data['snippets'][0]

    api = get_api(request, snippet)

    destination = '__hue_%s' % notebook_data['uuid'][:4]
    location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
    sql, _success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
    input_path = '${nameNode}%s' % location

    task.add_hive_snippet(snippet['database'], sql)

  client = SolrClient(self.user)

  extra_args = ['-Dmapreduce.job.user.classpath.first=true'] if client.is_solr_six_or_more() else []

  task.add_java_snippet(
    clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
    app_jar=lib_path if lib_path is not None else CONFIG_INDEXER_LIBS_PATH.get(),
    arguments=extra_args + [
      u'--morphline-file',
      u'morphline.conf',
      u'--output-dir',
      u'${nameNode}/user/%s/indexer' % self.username,
      u'--log4j',
      u'log4j.properties',
      u'--go-live',
      u'--zk-host',
      client.get_zookeeper_host(),
      u'--collection',
      collection_name,
      input_path,
    ],
    files=[
      {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
      {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
    ]
  )

  return task.execute(request, batch=True)

def close_statement_async(notebook, snippet, **kwargs):
  request = _get_request(**kwargs)
  get_api(request, snippet).close_statement(notebook, snippet)

def cancel_async(notebook, snippet, **kwargs):
  request = _get_request(**kwargs)
  get_api(request, snippet).cancel(notebook, snippet)

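# close_statement_async() and cancel_async() run in background tasks, so _get_request()
# must rebuild a request-like object from the serialized task arguments. A minimal
# sketch under the assumption that the kwargs carry a POST dict and a user id (the
# explicit postdict/user_id variants further below suggest this shape):
from django.contrib.auth.models import User

def _get_request(postdict=None, user_id=None):
  request = MockRequest(user=User.objects.get(id=user_id))  # MockRequest as sketched above
  request.POST = postdict or {}
  return request
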
def export_result(request):
  response = {'status': -1, 'message': _('Exporting result failed.')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', 'hdfs-file'))
  destination = json.loads(request.POST.get('destination', ''))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))
  is_embedded = json.loads(request.POST.get('is_embedded', 'false'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file':  # Blocking operation, like downloading
    if request.fs.isdir(destination):
      if notebook.get('name'):
        destination += '/%(name)s.csv' % notebook
      else:
        destination += '/%(type)s-%(id)s.csv' % notebook
    if overwrite and request.fs.exists(destination):
      request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'hive-table':
    if is_embedded:
      sql, success_url = api.export_data_as_table(notebook, snippet, destination)

      task = make_notebook(
        name=_('Export %s query to table %s') % (snippet['type'], destination),
        description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
        editor_type=snippet['type'],
        statement=sql,
        status='ready-execute',
        database=snippet['database'],
        on_success_url=success_url,
        is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'hdfs-directory':
    if is_embedded:
      sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)

      task = make_notebook(
        name=_('Export %s query to directory') % snippet['type'],
        description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
        editor_type=snippet['type'],
        statement=sql,
        status='ready-execute',
        database=snippet['database'],
        on_success_url=success_url,
        is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'search-index':
    if is_embedded:
      if destination == '__hue__':
        destination = _get_snippet_name(notebook, unique=True, table_format=True)
        live_indexing = True
      else:
        live_indexing = False

      sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True)

      from indexer.api3 import _index  # Will be moved to the lib
      from indexer.file_format import HiveFormat
      from indexer.fields import Field

      file_format = {
        'name': 'col',
        'inputFormat': 'query',
        'format': {'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001'},
        "sample": '',
        "columns": [
          Field(col['name'].rsplit('.')[-1], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in sample['meta']
        ]
      }

      if live_indexing:
        file_format['inputFormat'] = 'hs2_handle'
        file_format['fetch_handle'] = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)
        response['rowcount'] = _index(request, file_format, destination, query=notebook['uuid'])
        response['watch_url'] = reverse('search:browse', kwargs={'name': destination})
        response['status'] = 0
      else:
        response = _index(request, file_format, destination, query=notebook['uuid'])
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0

    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to Search index: %s' % (request.user.username, destination),
      'allowed': True
    }

  return JsonResponse(response)

def export_result(request):
  response = {'status': -1, 'message': _('Success')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', 'hdfs-file'))
  destination = urllib.unquote(json.loads(request.POST.get('destination', '')))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))
  is_embedded = json.loads(request.POST.get('is_embedded', 'false'))
  start_time = json.loads(request.POST.get('start_time', '-1'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file':  # Blocking operation, like downloading
    if request.fs.isdir(destination):
      if notebook.get('name'):
        destination += '/%(name)s.csv' % notebook
      else:
        destination += '/%(type)s-%(id)s.csv' % notebook
    if overwrite and request.fs.exists(destination):
      request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'hive-table':
    if is_embedded:
      sql, success_url = api.export_data_as_table(notebook, snippet, destination)

      task = make_notebook(
        name=_('Export %s query to table %s') % (snippet['type'], destination),
        description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
        editor_type=snippet['type'],
        statement=sql,
        status='ready',
        database=snippet['database'],
        on_success_url=success_url,
        last_executed=start_time,
        is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'hdfs-directory':
    if is_embedded:
      sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)

      task = make_notebook(
        name=_('Export %s query to directory') % snippet['type'],
        description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
        editor_type=snippet['type'],
        statement=sql,
        status='ready-execute',
        database=snippet['database'],
        on_success_url=success_url,
        last_executed=start_time,
        is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format in ('search-index', 'dashboard'):
    # Open the result in the Dashboard via a SQL sub-query or the Import wizard (quick vs scalable)
    if is_embedded:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))

      if data_format == 'dashboard':
        engine = notebook['type'].replace('query-', '')
        response['watch_url'] = reverse('dashboard:browse', kwargs={'name': notebook_id}) + '?source=query&engine=%(engine)s' % {'engine': engine}
        response['status'] = 0
      else:
        sample = get_api(request, snippet).fetch_result(notebook, snippet, rows=4, start_over=True)
        for col in sample['meta']:
          col['type'] = HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')

        response['status'] = 0
        response['id'] = notebook_id
        response['name'] = _get_snippet_name(notebook)
        response['source_type'] = 'query'
        response['target_type'] = 'index'
        response['target_path'] = destination
        response['sample'] = list(sample['data'])
        response['columns'] = [
          Field(col['name'], col['type']).to_dict() for col in sample['meta']
        ]
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0

  if response.get('status') != 0:
    response['message'] = _('Exporting result failed.')

  return JsonResponse(response)

    try:
      response['result'].append(get_api(request, session).close_session(session))
    except QueryExpired:
      pass
    except Exception as e:
      LOG.exception('Error closing session %s' % str(e))

  for snippet in [_s for _s in notebook['snippets'] if _s['type'] in ('hive', 'impala')]:
    try:
      if snippet['status'] != 'running':
        response['result'].append(get_api(request, snippet).close_statement(snippet))
      else:
        LOG.info('Not closing SQL snippet as still running.')
    except QueryExpired:
      pass
    except Exception as e:
      LOG.exception('Error closing statement %s' % str(e))

  response['status'] = 0
  response['message'] = _('Notebook closed successfully')

  return JsonResponse(response)


@require_POST
@check_document_access_permission()
def close_statement_async(notebook, snippet, postdict=None, user_id=None):
  request = _get_request(postdict, user_id)
  get_api(request, snippet).close_statement(notebook, snippet)

def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
  snippet = notebook['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'])
  elif action == 'insert_as_query':
    # TODO: checks/workarounds in case of non impersonation or Sentry
    # TODO: keep older simpler way in case of known not many rows?
    sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url)
  elif action == 'index_query':
    if destination == '__hue__':
      destination = _get_snippet_name(notebook, unique=True, table_format=True)
      live_indexing = True
    else:
      live_indexing = False

    sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='')
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')

    sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True)

    from indexer.api3 import _index  # Will be moved to the lib
    from indexer.file_format import HiveFormat
    from indexer.fields import Field

    file_format = {
      'name': 'col',
      'inputFormat': 'query',
      'format': {'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001'},
      "sample": '',
      "columns": [
        Field(col['name'].rsplit('.')[-1], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample['meta']
      ]
    }

    if live_indexing:
      file_format['inputFormat'] = 'hs2_handle'
      file_format['fetch_handle'] = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)

    job_handle = _index(request, file_format, destination, query=notebook['uuid'])

    if live_indexing:
      return redirect(reverse('search:browse', kwargs={'name': destination}))
    else:
      return redirect(reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']}))
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
    'notebooks_json': json.dumps([editor.get_data()]),
    'options_json': json.dumps({
      'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
      'mode': 'editor',
      'editor_type': editor_type,
      'success_url': success_url
    }),
    'editor_type': editor_type,
  })

def cancel_async(notebook, snippet, postdict=None, user_id=None):
  request = _get_request(postdict, user_id)
  get_api(request, snippet).cancel(notebook, snippet)

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':  # Only support open query history
    # TODO get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
      "sample": sample['rows'][:4],
      "sample_cols": sample.meta,
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample.meta
      ]
    }
  elif file_format['inputFormat'] == 'rdbms':
    query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    db = rdbms.get(request.user, query_server=query_server)
    sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data(mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName'])
    table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False)

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in table_metadata
      ]
    }

  return JsonResponse(format_)

def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib_unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    kwargs['separator'] = source['format']['fieldSeparator']
    path = urllib_unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']

      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      # Assumes handle still live
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass  # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warning('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
    raise e

  return {
    'status': 0,
    'on_success_url': reverse('indexer:indexes', kwargs={'index': index_name}),
    'pub_sub_url': 'assist.collections.refresh',
    'errors': errors
  }

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = check_encoding(stream.read(10000))
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warning('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], col['type']).to_dict()
        for col in sample['full_headers']
      ]
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      data = get_topic_data(request.user, file_format.get('kafkaSelectedTopics'))

      kafkaFieldNames = [col['name'] for col in data['full_headers']]
      kafkaFieldTypes = [col['type'] for col in data['full_headers']]
      topics_data = data['rows']

      format_ = {
        "sample": topics_data,
        "columns": [
          Field(col, 'string', unique=False).to_dict()
          for col in kafkaFieldNames
        ]
      }

      # data = """%(kafkaFieldNames)s
      # %(data)s""" % {
      #   'kafkaFieldNames': ','.join(kafkaFieldNames),
      #   'data': '\n'.join([','.join(cols) for cols in topics_data])
      # }
      # stream = string_io()
      # stream.write(data)

      # _convert_format(file_format["format"], inverse=True)

      # indexer = MorphlineIndexer(request.user, request.fs)
      # format_ = indexer.guess_field_types({
      #   "file": {
      #     "stream": stream,
      #     "name": file_format['path']
      #   },
      #   "format": file_format['format']
      # })
      # type_mapping = dict(
      #   list(
      #     zip(kafkaFieldNames, kafkaFieldTypes)
      #   )
      # )

      # for col in format_['columns']:
      #   col['keyType'] = type_mapping[col['name']]
      #   col['type'] = type_mapping[col['name']]
    elif file_format['streamSelection'] == 'flume':
      if 'hue-httpd/access_log' in file_format['channelSourcePath']:
        columns = [
          {'name': 'id', 'type': 'string', 'unique': True},
          {'name': 'client_ip', 'type': 'string'},
          {'name': 'time', 'type': 'date'},
          {'name': 'request', 'type': 'string'},
          {'name': 'code', 'type': 'plong'},
          {'name': 'bytes', 'type': 'plong'},
          {'name': 'method', 'type': 'string'},
          {'name': 'url', 'type': 'string'},
          {'name': 'protocol', 'type': 'string'},
          {'name': 'app', 'type': 'string'},
          {'name': 'subapp', 'type': 'string'}
        ]
      else:
        columns = [{'name': 'message', 'type': 'string'}]

      format_ = {
        "sample": [['...'] * len(columns)] * 4,
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string'), unique=col.get('unique')).to_dict()
          for col in columns
        ]
      }
  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      table_metadata = [{
        'name': column['name'],
        'type': column['type']
      } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']]
      query = 'SELECT %s FROM %s LIMIT 4' % (', '.join([col['name'] for col in table_metadata]), file_format['streamObject'])
      print(query)

      try:
        records = sf.query_all(query)
      except SalesforceRefusedRequest as e:
        raise PopupException(message=str(e))

      format_ = {
        "sample": [list(row.values())[1:] for row in records['records']],
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in table_metadata
        ]
      }
    else:
      raise PopupException(_('Connector format not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  return JsonResponse(format_)

def datasets(self, show_all=False):
  snippet = {'type': self.engine}

  # Ideally from left assist at some point instead
  databases = get_api(MockRequest(self.user), snippet).autocomplete(snippet)['databases']
  database = databases and 'default' not in databases and databases[0] or 'default'

  return [
    database + '.' + table['name']
    for table in get_api(MockRequest(self.user), snippet).autocomplete(snippet, database=database)['tables_meta']
  ]

def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
  snippet = notebook['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  elif action == 'insert_as_query':
    sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  elif action == 'index_query':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='')
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')

    sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True)

    from indexer.api3 import _index  # Will be moved to the lib in next commit
    from indexer.file_format import HiveFormat
    from indexer.fields import Field

    file_format = {
      'name': 'col',
      'inputFormat': 'query',
      'format': {'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001'},
      "sample": '',
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample['meta']
      ]
    }

    job_handle = _index(request, file_format, destination, query=notebook['uuid'])
    return redirect(reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']}))
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
    'notebooks_json': json.dumps([editor.get_data()]),
    'options_json': json.dumps({
      'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
      'mode': 'editor',
      'success_url': success_url
    }),
    'editor_type': editor_type,
  })

def download(request):
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  file_format = request.POST.get('format', 'csv')

  return get_api(request, snippet).download(notebook, snippet, file_format)

  response = {'status': -1, 'result': []}

  notebook = json.loads(request.POST.get('notebook', '{}'))

  for session in [_s for _s in notebook['sessions'] if _s['type'] in ('scala', 'spark', 'pyspark', 'sparkr')]:
    try:
      response['result'].append(get_api(request, session).close_session(session))
    except QueryExpired:
      pass
    except Exception as e:
      LOG.exception('Error closing session %s' % str(e))

  for snippet in [_s for _s in notebook['snippets'] if _s['type'] in ('hive', 'impala')]:
    try:
      if snippet['status'] != 'running':
        response['result'].append(get_api(request, snippet).close_statement(snippet))
      else:
        LOG.info('Not closing SQL snippet as still running.')
    except QueryExpired:
      pass
    except Exception as e:
      LOG.exception('Error closing statement %s' % str(e))

  response['status'] = 0
  response['message'] = _('Notebook closed successfully')

  return JsonResponse(response)


@require_POST
@check_document_access_permission()
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib.unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = chardet.detect(stream.read(10000)).get('encoding')
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }

def stats(self, dashboard, fields):
  database, table = self._get_database_table_names(dashboard)

  # TODO: check column stats to go faster
  sql = "SELECT MIN(`%(field)s`), MAX(`%(field)s`) FROM `%(database)s`.`%(table)s`" % {
    'field': fields[0],
    'database': database,
    'table': table
  }

  editor = make_notebook(
    name='Execute and watch',
    editor_type=self.engine,
    statement=sql,
    database=database,
    status='ready-execute',
    skip_historify=True
    # async=False
  )

  request = MockRequest(self.user)
  snippet = {'type': self.engine}

  response = editor.execute(request)

  if 'handle' in response:
    snippet['result'] = response

    if response['handle'].get('sync'):
      result = response['result']
    else:
      timeout_sec = 20  # To move to Notebook API
      sleep_interval = 0.5
      curr = time.time()
      end = curr + timeout_sec

      api = get_api(request, snippet)

      while curr <= end:
        status = api.check_status(dashboard, snippet)
        if status['status'] == 'available':
          result = api.fetch_result(dashboard, snippet, rows=10, start_over=True)
          api.close_statement(snippet)
          break
        time.sleep(sleep_interval)
        curr = time.time()

      if curr > end:
        try:
          api.cancel_operation(snippet)
        except Exception as e:
          LOG.warning("Failed to cancel query: %s" % e)
          api.close_statement(snippet)
          raise OperationTimeout(e)

  stats = list(result['data'])
  min_value, max_value = stats[0]

  if not isinstance(min_value, numbers.Number):
    min_value = min_value.replace(' ', 'T') + 'Z'
    max_value = max_value.replace(' ', 'T') + 'Z'

  return {
    'stats': {
      'stats_fields': {
        fields[0]: {
          'min': min_value,
          'max': max_value
        }
      }
    }
  }

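# The polling loop in stats() is a reusable pattern: re-check the statement until it is
# 'available' or a deadline passes, then cancel and clean up. A standalone sketch of
# that pattern (function name and defaults are illustrative, not from the original):
import time

def wait_for_result(api, notebook, snippet, timeout_sec=20, sleep_interval=0.5):
  deadline = time.time() + timeout_sec
  while time.time() <= deadline:
    if api.check_status(notebook, snippet)['status'] == 'available':
      result = api.fetch_result(notebook, snippet, rows=10, start_over=True)
      api.close_statement(snippet)
      return result
    time.sleep(sleep_interval)
  # Deadline passed: best-effort cancel, then surface the timeout.
  try:
    api.cancel_operation(snippet)
  except Exception as e:
    LOG.warning("Failed to cancel query: %s" % e)
  api.close_statement(snippet)
  raise OperationTimeout()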