def execute_and_watch(request): notebook_id = request.GET.get('editor', request.GET.get('notebook')) snippet_id = int(request.GET['snippet']) action = request.GET['action'] destination = request.GET['destination'] notebook = Notebook(document=Document2.objects.get( id=notebook_id)).get_data() snippet = notebook['snippets'][snippet_id] editor_type = snippet['type'] api = get_api(request, snippet) if action == 'save_as_table': sql, success_url = api.export_data_as_table(notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database']) elif action == 'insert_as_query': # TODO: checks/workarounds in case of non-impersonation or Sentry # TODO: keep the older, simpler way when the row count is known to be small? sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url) elif action == 'index_query': if destination == '__hue__': destination = _get_snippet_name(notebook, unique=True, table_format=True) live_indexing = True else: live_indexing = False sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='') editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute') sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True) from indexer.api3 import _index # Will be moved to the lib from indexer.file_format import HiveFormat from indexer.fields import Field file_format = { 'name': 'col', 'inputFormat': 'query', 'format': { 'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001' }, "sample": '', "columns": [ Field( col['name'].rsplit('.')[-1], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in sample['meta'] ] } if live_indexing: file_format['inputFormat'] = 'hs2_handle' file_format['fetch_handle'] = lambda rows, start_over: get_api( request, snippet).fetch_result( notebook, snippet, rows=rows, start_over=start_over) job_handle = _index(request, file_format, destination, query=notebook['uuid']) if live_indexing: return redirect( reverse('search:browse', kwargs={'name': destination})) else: return redirect( reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']})) else: raise PopupException(_('Action %s is unknown') % action) return render( 'editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{ "name": "%s SQL" % editor_type.title(), "type": editor_type }], 'mode': 'editor', 'editor_type': editor_type, 'success_url': success_url }), 'editor_type': editor_type, })
def __init__(self): geo_ip_operation = get_operator("geo_ip").get_default_operation() geo_ip_operation['settings']["/country/names/en"] = True geo_ip_operation['settings']["/city/names/en"] = True geo_ip_operation['settings']["/location/latitude"] = True geo_ip_operation['settings']["/location/longitude"] = True geo_ip_operation['fields'] += [ Field("country", "string").to_dict(), Field("city", "string").to_dict(), Field("latitude", "double").to_dict(), Field("longitude", "double").to_dict() ] self._fields = [ Field("date", "date"), Field("component", "string"), Field("log_level", "string"), Field("details", "string"), Field("message", "text_general"), Field("ip", "string", [geo_ip_operation]), Field("user", "string"), Field("http_method", "string"), Field("path", "string"), Field("protocol", "string") ]
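# The constructor above assembles a log schema from Field objects plus a geo_ip
# enrichment operation attached to the 'ip' column. As a rough illustration of the
# same pattern, here is a minimal sketch with a stand-in SchemaField class; it is
# hypothetical and only mirrors the Field(name, type, operations).to_dict() shape
# used above, it is not the real indexer.fields.Field.
class SchemaField(object):

    def __init__(self, name, field_type, operations=None):
        self.name = name
        self.field_type = field_type
        self.operations = operations or []

    def to_dict(self):
        return {
            'name': self.name,
            'type': self.field_type,
            'operations': self.operations,
        }


# Example: a schema where the 'ip' field carries a geo_ip-style operation.
geo_ip_operation = {'type': 'geo_ip', 'settings': {'/country/names/en': True}}
fields = [
    SchemaField('date', 'date'),
    SchemaField('ip', 'string', [geo_ip_operation]),
    SchemaField('message', 'text_general'),
]
print([f.to_dict() for f in fields])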
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib.unquote(file_format["path"]) stream = request.fs.open(path) encoding = chardet.detect(stream.read(10000)).get('encoding') stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warning( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, }
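# The 'file' branch above sniffs the byte encoding of the first 10000 bytes with
# chardet before seeking back and parsing. A minimal standalone sketch of that
# detection step; the path and the 'utf-8' fallback are illustrative assumptions.
import chardet


def detect_encoding(path, sample_size=10000, default='utf-8'):
    with open(path, 'rb') as f:
        prefix = f.read(sample_size)
    guess = chardet.detect(prefix).get('encoding')
    return guess or default


print(detect_encoding('/tmp/sample.csv'))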
def export_result(request): response = {'status': -1, 'message': _('Success')} # Passed by check_document_access_permission but unused by APIs notebook = json.loads(request.POST.get('notebook', '{}')) snippet = json.loads(request.POST.get('snippet', '{}')) data_format = json.loads(request.POST.get('format', '"hdfs-file"')) destination = json.loads(request.POST.get('destination', '""')) overwrite = json.loads(request.POST.get('overwrite', 'false')) is_embedded = json.loads(request.POST.get('is_embedded', 'false')) start_time = json.loads(request.POST.get('start_time', '-1')) api = get_api(request, snippet) if data_format == 'hdfs-file': # Blocking operation, like downloading if request.fs.isdir(destination): if notebook.get('name'): destination += '/%(name)s.csv' % notebook else: destination += '/%(type)s-%(id)s.csv' % notebook if overwrite and request.fs.exists(destination): request.fs.do_as_user(request.user.username, request.fs.rmtree, destination) response['watch_url'] = api.export_data_as_hdfs_file( snippet, destination, overwrite) response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'hive-table': if is_embedded: sql, success_url = api.export_data_as_table( notebook, snippet, destination) task = make_notebook(name=_('Export %s query to table %s') % (snippet['type'], destination), description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url), editor_type=snippet['type'], statement=sql, status='ready', database=snippet['database'], on_success_url=success_url, last_executed=start_time, is_task=True) response = task.execute(request) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse( 'notebook:execute_and_watch' ) + '?action=save_as_table&notebook=' + str( notebook_id) + '&snippet=0&destination=' + destination response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'hdfs-directory': if is_embedded: sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) task = make_notebook(name=_('Export %s query to directory') % snippet['type'], description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url), editor_type=snippet['type'], statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url, last_executed=start_time, is_task=True) response = task.execute(request) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse( 'notebook:execute_and_watch' ) + '?action=insert_as_query&notebook=' + str( notebook_id) + '&snippet=0&destination=' + destination response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'search-index': # Open results in a Dashboard via either a SQL sub-query or a Solr index built by the wizard (quick indexing or a Morphline job) if is_embedded: if destination == '__hue__': notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) engine = notebook['type'].replace('query-', '') response['watch_url'] = reverse( 'dashboard:browse', kwargs={ 'name': notebook_id }) + '?source=query&engine=%(engine)s' % { 
'engine': engine } response['status'] = 0 return JsonResponse( response ) # Currently does not live-index into Solr, but uses a SQL sub-query destination = _get_snippet_name(notebook, unique=True, table_format=True) live_indexing = True else: live_indexing = False sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True) from indexer.api3 import _index # Will be moved to the lib from indexer.file_format import HiveFormat from indexer.fields import Field file_format = { 'name': 'col', 'inputFormat': 'query', 'format': { 'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001' }, "sample": '', "columns": [ Field( col['name'].rsplit('.')[-1], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string')).to_dict() for col in sample['meta'] ] } if live_indexing: file_format['inputFormat'] = 'hs2_handle' file_format['fetch_handle'] = lambda rows, start_over: get_api( request, snippet).fetch_result( notebook, snippet, rows=rows, start_over=start_over) response['rowcount'] = _index(request, file_format, destination, query=notebook['uuid'], start_time=start_time) response['watch_url'] = reverse('search:browse', kwargs={'name': destination}) response['status'] = 0 else: response = _index(request, file_format, destination, query=notebook['uuid'], start_time=start_time) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse( 'notebook:execute_and_watch' ) + '?action=index_query&notebook=' + str( notebook_id) + '&snippet=0&destination=' + destination response['status'] = 0 if response.get('status') != 0: response['message'] = _('Exporting result failed.') request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to Search index: %s' % (request.user.username, destination), 'allowed': True } return JsonResponse(response)
def execute_and_watch(request): notebook_id = request.GET.get('editor', request.GET.get('notebook')) snippet_id = int(request.GET['snippet']) action = request.GET['action'] destination = request.GET['destination'] notebook = Notebook(document=Document2.objects.get( id=notebook_id)).get_data() snippet = notebook['snippets'][snippet_id] editor_type = snippet['type'] api = get_api(request, snippet) if action == 'save_as_table': sql, success_url = api.export_data_as_table(notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database']) elif action == 'insert_as_query': sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database']) elif action == 'index_query': sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='') editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute') sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True) from indexer.api3 import _index # Will be moved to the lib in the next commit from indexer.file_format import HiveFormat from indexer.fields import Field file_format = { 'name': 'col', 'inputFormat': 'query', 'format': { 'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001' }, "sample": '', "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in sample['meta'] ] } job_handle = _index(request, file_format, destination, query=notebook['uuid']) return redirect( reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']})) else: raise PopupException(_('Action %s is unknown') % action) return render( 'editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{ "name": "%s SQL" % editor_type.title(), "type": editor_type }], 'mode': 'editor', 'editor_type': editor_type, 'success_url': success_url }), 'editor_type': editor_type, })
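# In the live-indexing path above, file_format['fetch_handle'] is a lambda that
# closes over request, notebook and snippet so the indexer can page through the
# result set on demand. A sketch of the same idea using functools.partial; the
# fetch_result function here is a stand-in with the same argument shape as the
# calls above, not the real notebook API.
from functools import partial


def fetch_result(notebook, snippet, rows, start_over):
    # Placeholder body: a real implementation would call the query engine.
    return {'data': [['...']] * rows, 'has_more': False}


def make_fetch_handle(notebook, snippet):
    # Pre-bind the notebook context; callers only pass the paging arguments.
    return partial(fetch_result, notebook, snippet)


fetch_handle = make_fetch_handle({'uuid': 'demo'}, {'type': 'hive'})
first_page = fetch_handle(rows=100, start_over=True)
next_page = fetch_handle(rows=100, start_over=False)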
db = rdbms.get(request.user, query_server=query_server) sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data( mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName']) table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in table_metadata ] } elif file_format['inputFormat'] == 'stream': # Note: mocked here, should come from SFDC or Kafka API or sampling job if file_format['streamSelection'] == 'kafka': data = """%(kafkaFieldNames)s %(data)s""" % { 'kafkaFieldNames': file_format.get('kafkaFieldNames', ''), 'data': '\n'.join([ ','.join(['...'] * len( file_format.get('kafkaFieldTypes', '').split(',')))
def export_result(request): response = {'status': -1, 'message': _('Success')} # Passed by check_document_access_permission but unused by APIs notebook = json.loads(request.POST.get('notebook', '{}')) snippet = json.loads(request.POST.get('snippet', '{}')) data_format = json.loads(request.POST.get('format', '"hdfs-file"')) destination = urllib_unquote( json.loads(request.POST.get('destination', '""'))) overwrite = json.loads(request.POST.get('overwrite', 'false')) is_embedded = json.loads(request.POST.get('is_embedded', 'false')) start_time = json.loads(request.POST.get('start_time', '-1')) api = get_api(request, snippet) if data_format == 'hdfs-file': # Blocking operation, like downloading if request.fs.isdir(destination): if notebook.get('name'): destination += '/%(name)s.csv' % notebook else: destination += '/%(type)s-%(id)s.csv' % notebook if overwrite and request.fs.exists(destination): request.fs.do_as_user(request.user.username, request.fs.rmtree, destination) response['watch_url'] = api.export_data_as_hdfs_file( snippet, destination, overwrite) response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'hive-table': if is_embedded: sql, success_url = api.export_data_as_table( notebook, snippet, destination) task = make_notebook(name=_('Export %s query to table %s') % (snippet['type'], destination), description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url), editor_type=snippet['type'], statement=sql, status='ready', database=snippet['database'], on_success_url=success_url, last_executed=start_time, is_task=True) response = task.execute(request) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse( 'notebook:execute_and_watch' ) + '?action=save_as_table&notebook=' + str( notebook_id) + '&snippet=0&destination=' + destination response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination), 'allowed': True } elif data_format == 'hdfs-directory': if destination.lower().startswith("abfs"): destination = abfspath(destination) if is_embedded: sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) task = make_notebook(name=_('Export %s query to directory') % snippet['type'], description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url), editor_type=snippet['type'], statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url, last_executed=start_time, is_task=True) response = task.execute(request) else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse( 'notebook:execute_and_watch' ) + '?action=insert_as_query&notebook=' + str( notebook_id) + '&snippet=0&destination=' + destination response['status'] = 0 request.audit = { 'operation': 'EXPORT', 'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination), 'allowed': True } elif data_format in ('search-index', 'dashboard'): # Open the result in the Dashboard via a SQL sub-query or the Import wizard (quick vs scalable) if is_embedded: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) if data_format == 'dashboard': engine = notebook['type'].replace('query-', '') response['watch_url'] = reverse( 
'dashboard:browse', kwargs={ 'name': notebook_id }) + '?source=query&engine=%(engine)s' % { 'engine': engine } response['status'] = 0 else: sample = get_api(request, snippet).fetch_result(notebook, snippet, rows=4, start_over=True) for col in sample['meta']: col['type'] = HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string') response['status'] = 0 response['id'] = notebook_id response['name'] = _get_snippet_name(notebook) response['source_type'] = 'query' response['target_type'] = 'index' response['target_path'] = destination response['sample'] = list(sample['data']) response['columns'] = [ Field(col['name'], col['type']).to_dict() for col in sample['meta'] ] else: notebook_id = notebook['id'] or request.GET.get( 'editor', request.GET.get('notebook')) response['watch_url'] = reverse( 'notebook:execute_and_watch' ) + '?action=index_query&notebook=' + str( notebook_id) + '&snippet=0&destination=' + destination response['status'] = 0 if response.get('status') != 0: response['message'] = _('Exporting result failed.') return JsonResponse(response)
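# The watch_url values above are assembled by string concatenation
# ('?action=...&notebook=...&snippet=0&destination=...'), which is easy to break
# when the destination contains '&', '=' or non-ASCII characters. A sketch of the
# same query string built with urlencode; base_url would come from
# reverse('notebook:execute_and_watch'), and build_watch_url is an illustrative
# helper, not part of Hue.
from urllib.parse import urlencode  # Python 2: from urllib import urlencode


def build_watch_url(base_url, notebook_id, destination, action, snippet=0):
    params = {
        'action': action,
        'notebook': notebook_id,
        'snippet': snippet,
        'destination': destination,
    }
    return '%s?%s' % (base_url, urlencode(params))


print(build_watch_url('/notebook/execute_and_watch', 123,
                      'default.sales_2020', 'save_as_table'))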
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'localfile': path = urllib_unquote(file_format['path']) with open(path, 'r') as local_file: reader = csv.reader(local_file) csv_data = list(reader) if file_format['format']['hasHeader']: sample = csv_data[1:5] column_row = [ re.sub('[^0-9a-zA-Z]+', '_', col) for col in csv_data[0] ] else: sample = csv_data[:4] column_row = [ 'field_' + str(count + 1) for count, col in enumerate(sample[0]) ] field_type_guesses = [] for count, col in enumerate(column_row): column_samples = [ sample_row[count] for sample_row in sample if len(sample_row) > count ] field_type_guess = guess_field_type_from_samples( column_samples) field_type_guesses.append(field_type_guess) columns = [ Field(column_row[count], field_type_guesses[count]).to_dict() for count, col in enumerate(column_row) ] format_ = {'columns': columns, 'sample': sample} elif file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib_unquote(file_format["path"]) if path[-3:] == 'xls' or path[-4:] == 'xlsx': path = excel_to_csv_file_name_change(path) stream = request.fs.open(path) encoding = check_encoding(stream.read(10000)) LOG.debug('File %s encoding is %s' % (path, encoding)) stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_ and format_['sample']: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warning( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, } elif file_format['inputFormat'] == 'rdbms': api = _get_api(request) sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName']) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field(col['name'], col['type']).to_dict() for col in 
sample['full_headers'] ] } elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': data = get_topic_data(request.user, file_format.get('kafkaSelectedTopics')) kafkaFieldNames = [col['name'] for col in data['full_headers']] kafkaFieldTypes = [col['type'] for col in data['full_headers']] topics_data = data['rows'] format_ = { "sample": topics_data, "columns": [ Field(col, 'string', unique=False).to_dict() for col in kafkaFieldNames ] } elif file_format['streamSelection'] == 'flume': if 'hue-httpd/access_log' in file_format['channelSourcePath']: columns = [{ 'name': 'id', 'type': 'string', 'unique': True }, { 'name': 'client_ip', 'type': 'string' }, { 'name': 'time', 'type': 'date' }, { 'name': 'request', 'type': 'string' }, { 'name': 'code', 'type': 'plong' }, { 'name': 'bytes', 'type': 'plong' }, { 'name': 'method', 'type': 'string' }, { 'name': 'url', 'type': 'string' }, { 'name': 'protocol', 'type': 'string' }, { 'name': 'app', 'type': 'string' }, { 'name': 'subapp', 'type': 'string' }] else: columns = [{'name': 'message', 'type': 'string'}] format_ = { "sample": [['...'] * len(columns)] * 4, "columns": [ Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string'), unique=col.get('unique')).to_dict() for col in columns ] } elif file_format['inputFormat'] == 'connector': if file_format['connectorSelection'] == 'sfdc': sf = Salesforce(username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken']) table_metadata = [{ 'name': column['name'], 'type': column['type'] } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']] query = 'SELECT %s FROM %s LIMIT 4' % (', '.join( [col['name'] for col in table_metadata]), file_format['streamObject']) print(query) try: records = sf.query_all(query) except SalesforceRefusedRequest as e: raise PopupException(message=str(e)) format_ = { "sample": [list(row.values())[1:] for row in records['records']], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string')).to_dict() for col in table_metadata ] } else: raise PopupException( _('Connector format not recognized: %(connectorSelection)s') % file_format) else: raise PopupException( _('Input format not recognized: %(inputFormat)s') % file_format) return JsonResponse(format_)
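# The 'localfile' branch above delegates per-column type inference to
# guess_field_type_from_samples over the first few sample rows. As a rough,
# hypothetical illustration of that kind of sampling heuristic (not Hue's
# implementation, and covering only long/double/string):
def guess_type_from_samples(samples):

    def all_match(cast):
        for value in samples:
            if value in ('', None):
                continue
            try:
                cast(value)
            except (TypeError, ValueError):
                return False
        return True

    if not samples:
        return 'string'
    if all_match(int):
        return 'long'
    if all_match(float):
        return 'double'
    return 'string'


print(guess_type_from_samples(['1', '2', '42']))     # long
print(guess_type_from_samples(['1.5', '2', 'n/a']))  # string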
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) stream = request.fs.open(file_format["path"]) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format[ 'inputFormat'] == 'query': # Only support open query history # TODO get schema from explain query, which is not possible notebook = Notebook(document=Document2.objects.get( id=file_format['query'])).get_data() snippet = notebook['snippets'][0] sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True) format_ = { "sample": sample['rows'][:4], "sample_cols": sample.meta, "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in sample.meta ] } elif file_format['inputFormat'] == 'rdbms': query_server = rdbms.get_query_server_config( server=file_format['rdbmsType']) db = rdbms.get(request.user, query_server=query_server) sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data( mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName']) table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in table_metadata ] } return JsonResponse(format_)
def __init__(self): self._fields = [ Field("clientip", "string"), Field("ident", "string"), Field("auth", "string"), Field("timestamp", "date"), Field("verb", "string"), Field("request", "string"), Field("httpversion", "double"), Field("rawrequest", "long"), Field("response", "long"), Field("bytes", "long"), Field("referrer", "string"), Field("field_line", "text_general") ]
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib_unquote(file_format["path"]) stream = request.fs.open(path) encoding = check_encoding(stream.read(10000)) stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_ and format_['sample']: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warn( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, } elif file_format['inputFormat'] == 'rdbms': api = _get_api(request) sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName']) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field(col['name'], col['type']).to_dict() for col in sample['full_headers'] ] } elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': if file_format.get( 'kafkaSelectedTopics') == 'NavigatorAuditEvents': kafkaFieldNames = [ 'id', 'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator', 'ip', 'name', 'objectType', 'objType', 'objUsageType', 'operationParams', 'operationText', 'op', 'opText', 'path', 'perms', 'privilege', 'qualifier', 'QUERY_ID', 'resourcePath', 'service', 'SESSION_ID', 'solrVersion', 'src', 'status', 'subOperation', 'tableName', 'table', 'time', 'type', 'url', 'user' ] kafkaFieldTypes = ['string'] * len(kafkaFieldNames) kafkaFieldNames.append('timeDate') kafkaFieldTypes.append('date') else: # Note: mocked here, should come from SFDC or Kafka API or sampling job kafkaFieldNames = file_format.get('kafkaFieldNames', '').split(',') kafkaFieldTypes = file_format.get('kafkaFieldTypes', '').split(',') data = 
"""%(kafkaFieldNames)s %(data)s""" % { 'kafkaFieldNames': ','.join(kafkaFieldNames), 'data': '\n'.join( [','.join(['...'] * len(kafkaFieldTypes))] * 5) } stream = string_io() stream.write(data) _convert_format(file_format["format"], inverse=True) indexer = MorphlineIndexer(request.user, request.fs) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes))) for col in format_['columns']: col['keyType'] = type_mapping[col['name']] col['type'] = type_mapping[col['name']] elif file_format['streamSelection'] == 'flume': if 'hue-httpd/access_log' in file_format['channelSourcePath']: columns = [{ 'name': 'id', 'type': 'string', 'unique': True }, { 'name': 'client_ip', 'type': 'string' }, { 'name': 'time', 'type': 'date' }, { 'name': 'request', 'type': 'string' }, { 'name': 'code', 'type': 'plong' }, { 'name': 'bytes', 'type': 'plong' }, { 'name': 'method', 'type': 'string' }, { 'name': 'url', 'type': 'string' }, { 'name': 'protocol', 'type': 'string' }, { 'name': 'app', 'type': 'string' }, { 'name': 'subapp', 'type': 'string' }] else: columns = [{'name': 'message', 'type': 'string'}] format_ = { "sample": [['...'] * len(columns)] * 4, "columns": [ Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string'), unique=col.get('unique')).to_dict() for col in columns ] } elif file_format['inputFormat'] == 'connector': if file_format['connectorSelection'] == 'sfdc': sf = Salesforce(username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken']) table_metadata = [{ 'name': column['name'], 'type': column['type'] } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']] query = 'SELECT %s FROM %s LIMIT 4' % (', '.join( [col['name'] for col in table_metadata]), file_format['streamObject']) print(query) try: records = sf.query_all(query) except SalesforceRefusedRequest as e: raise PopupException(message=str(e)) format_ = { "sample": [list(row.values())[1:] for row in records['records']], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string')).to_dict() for col in table_metadata ] } else: raise PopupException( _('Connector format not recognized: %(connectorSelection)s') % file_format) else: raise PopupException( _('Input format not recognized: %(inputFormat)s') % file_format) return JsonResponse(format_)
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) stream = request.fs.open(file_format["path"]) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warning( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, }
elif file_format['inputFormat'] == 'rdbms': api = _get_api(request) sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName']) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field(col['name'], col['type']).to_dict() for col in sample['full_headers'] ] } elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents': kafkaFieldNames = [ 'id', 'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator', 'ip', 'name', 'objectType', 'objType', 'objUsageType', 'operationParams', 'operationText', 'op', 'opText', 'path', 'perms', 'privilege', 'qualifier', 'QUERY_ID', 'resourcePath', 'service', 'SESSION_ID', 'solrVersion', 'src', 'status', 'subOperation', 'tableName', 'table', 'time', 'type',