Example #1
def action_parameters(request):
  response = {'status': -1}
  parameters = set()

  try:
    node_data = json.loads(request.POST.get('node', '{}'))

    parameters = parameters.union(set(Node(node_data).find_parameters()))

    script_path = node_data.get('properties', {}).get('script_path', {})
    if script_path:
      script_path = script_path.replace('hdfs://', '')

      if request.fs.do_as_user(request.user, request.fs.exists, script_path):
        data = request.fs.do_as_user(request.user, request.fs.read, script_path, 0, 16 * 1024 ** 2)

        if node_data['type'] in ('hive', 'hive2'):
          parameters = parameters.union(set(find_dollar_braced_variables(data)))
        elif node_data['type'] == 'pig':
          parameters = parameters.union(set(find_dollar_variables(data)))
    elif node_data['type'] == 'hive-document':
      notebook = Notebook(document=Document2.objects.get_by_uuid(user=request.user, uuid=node_data['properties']['uuid']))
      parameters = parameters.union(set(find_dollar_braced_variables(notebook.get_str())))

    response['status'] = 0
    response['parameters'] = list(parameters)
  except Exception, e:
    response['message'] = str(e)

  return JsonResponse(response)
Example #2
def get_history(request):
  response = {'status': -1}

  doc_type = request.GET.get('doc_type')
  limit = min(request.GET.get('len', 50), 100)

  response['status'] = 0
  history = []
  for doc in Document2.objects.get_history(doc_type='query-%s' % doc_type, user=request.user).order_by('-last_modified')[:limit]:
    notebook = Notebook(document=doc).get_data()
    if 'snippets' in notebook:
      history.append({
        'name': doc.name,
        'id': doc.id,
        'uuid': doc.uuid,
        'type': doc.type,
        'data': {
            'statement_raw': notebook['snippets'][0]['statement_raw'][:1001],
            'lastExecuted':  notebook['snippets'][0]['lastExecuted'],
            'status':  notebook['snippets'][0]['status'],
            'parentUuid': notebook.get('parentUuid', '')
        } if notebook['snippets'] else {},
        'absoluteUrl': doc.get_absolute_url(),
      })
    else:
      LOG.error('Incomplete History Notebook: %s' % notebook)
  response['history'] = history
  response['message'] = _('History fetched')

  return JsonResponse(response)
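
For context, a hedged sketch of how this view might be exercised from a Django test case; the route name notebook:get_history is an assumption, not confirmed by the source:

  def test_get_history_sketch(self):
    # Hypothetical route name; the parameters mirror what the view above reads.
    response = self.client.get(reverse('notebook:get_history'), {'doc_type': 'hive', 'len': 25})
    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('history' in data, data)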
Example #3
def notebook(request):
    notebook_id = request.GET.get("notebook")

    if notebook_id:
        notebook = Notebook(document=Document2.objects.get(id=notebook_id))
    else:
        notebook = Notebook()

    autocomplete_base_url = ""
    try:
        autocomplete_base_url = reverse("beeswax:api_autocomplete_databases", kwargs={})
    except:
        LOG.exception("failed to get autocomplete base url")

    is_yarn_mode = False
    try:
        from spark.conf import LIVY_SERVER_SESSION_KIND

        is_yarn_mode = LIVY_SERVER_SESSION_KIND.get()
    except:
        LOG.exception("Spark is not enabled")

    return render(
        "notebook.mako",
        request,
        {
            "notebooks_json": json.dumps([notebook.get_data()]),
            "options_json": json.dumps(
                {"languages": get_interpreters(request.user), "session_properties": SparkApi.PROPERTIES}
            ),
            "autocomplete_base_url": autocomplete_base_url,
            "is_yarn_mode": is_yarn_mode,
        },
    )
Example #4
def notebook(request):
  notebook_id = request.GET.get('notebook')

  if notebook_id:
    notebook = Notebook(document=Document2.objects.get(id=notebook_id))
  else:
    notebook = Notebook()

  autocomplete_base_url = ''
  try:
    autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={})
  except:
    LOG.exception('failed to get autocomplete base url')

  return render('notebook.mako', request, {
      'notebooks_json': json.dumps([notebook.get_data()]),
      'options_json': json.dumps({
          'languages': get_interpreters(),
          'snippet_placeholders' : {
              'sql': _('Example: 1 + 1, or press CTRL + space'),
              'spark': _('Example: 1 + 1, or press CTRL + space'),
              'pyspark': _('Example: 1 + 1, or press CTRL + space'),
              'impala': _('Example: SELECT * FROM tablename, or press CTRL + space'),
              'hive': _('Example: SELECT * FROM tablename, or press CTRL + space'),
              'r': _('Example: 1 + 1, or press CTRL + space')
          },
          'session_properties': SparkApi.PROPERTIES
      }),
      'autocomplete_base_url': autocomplete_base_url,
      'is_yarn_mode': LIVY_SERVER_SESSION_KIND.get()
  })
Example #5
File: views.py Project: RoxC/hue
def editor(request):
  editor_id = request.GET.get('editor')

  if editor_id:
    editor = Notebook(document=Document2.objects.get(id=editor_id))
  else:
    editor = Notebook()
    data = editor.get_data()
    data['name'] = 'Hive SQL Editor'
    data['snippets'] = json.loads('[{"id":"c111cbb4-f475-4050-c5a1-02df6c31e3d8","name":"","type":"hive","editorMode":"text/x-hiveql","statement_raw":"Example: SELECT * FROM tablename, or press CTRL + space","codemirrorSize":100,"status":"ready","properties":{"settings":[],"files":[]},"variables":[],"variableNames":[],"statement":"Example: SELECT * FROM tablename, or press CTRL + space","result":{"id":"149347d9-3ae7-8d06-4cc8-d4bce5e72dc8","type":"table","hasResultset":true,"handle":{},"meta":[],"cleanedMeta":[],"fetchedOnce":false,"startTime":"2015-07-17T20:38:21.970Z","endTime":"2015-07-17T20:38:21.970Z","executionTime":0,"cleanedNumericMeta":[],"cleanedStringMeta":[],"cleanedDateTimeMeta":[],"data":[],"logs":"","logLines":0,"errors":"","hasSomeResults":false},"showGrid":true,"showChart":false,"showLogs":false,"progress":0,"size":12,"offset":0,"isLoading":false,"klass":"snippet card card-widget","editorKlass":"editor span12","resultsKlass":"results hive","errorsKlass":"results hive alert alert-error","chartType":"bars","chartSorting":"none","chartYMulti":[],"chartData":[],"tempChartOptions":{},"isLeftPanelVisible":false,"codeVisible":true,"settingsVisible":false,"checkStatusTimeout":null}]')
    editor.data = json.dumps(data)

  autocomplete_base_url = ''
  try:
    autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={})
  except:
    LOG.exception('failed to get autocomplete base url')

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "Hive SQL", "type": "hive"}],
          'snippet_placeholders' : {
              'scala': _('Example: 1 + 1, or press CTRL + space'),
              'python': _('Example: 1 + 1, or press CTRL + space'),
              'impala': _('Example: SELECT * FROM tablename, or press CTRL + space'),
              'hive': _('Example: SELECT * FROM tablename, or press CTRL + space'),
              'text': _('<h2>This is a text snippet</h2>Type your text here')
          }
      }),
      'autocomplete_base_url': autocomplete_base_url,
  })
Example #6
def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id))
  snippet = notebook.get_data()['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  elif action == 'insert_as_query':
    sql, success_url = api.export_large_data_to_hdfs(snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
          'success_url': success_url
      }),
      'editor_type': editor_type,
  })
Example #7
def notebook(request):
  notebook_id = request.GET.get('notebook')

  if notebook_id:
    notebook = Notebook(document=Document2.objects.get(id=notebook_id))
  else:
    notebook = Notebook()

  autocomplete_base_url = ''
  try:
    autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={})
  except:
    LOG.exception('failed to get autocomplete base url')

  is_yarn_mode = False
  try:
    from spark.conf import LIVY_SERVER_SESSION_KIND
    is_yarn_mode = LIVY_SERVER_SESSION_KIND.get()
  except:
    LOG.exception('Spark is not enabled')

  return render('notebook.mako', request, {
      'notebooks_json': json.dumps([notebook.get_data()]),
      'options_json': json.dumps({
          'languages': get_interpreters(request.user),
          'session_properties': SparkApi.PROPERTIES,
      }),
      'autocomplete_base_url': autocomplete_base_url,
      'is_yarn_mode': is_yarn_mode
  })
Example #8
def editor(request):
  editor_id = request.GET.get('editor')
  editor_type = request.GET.get('type', 'hive')

  if editor_id:
    editor = Notebook(document=Document2.objects.get(id=editor_id))
  else:
    editor = Notebook()
    data = editor.get_data()
    data['name'] = 'Untitled %s Query' % editor_type.title()
    data['type'] = 'query-%s' % editor_type
    editor.data = json.dumps(data)

  autocomplete_base_url = ''
  try:
    autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={})
  except:
    LOG.exception('failed to get autocomplete base url')

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
      }),
      'editor_type': editor_type,
      'autocomplete_base_url': autocomplete_base_url,
  })
Example #9
def make_notebook2(name='Browse', description='', is_saved=False, snippets=None):

  from notebook.connectors.hiveserver2 import HS2Api

  editor = Notebook()

  _snippets = []

  for snippet in snippets:
    default_properties = {
        'files': [],
        'functions': [],
        'settings': []
    }

    # Merge snippet-provided properties over the defaults (otherwise default_properties is unused).
    default_properties.update(snippet.get('properties', {}))
    snippet['properties'] = default_properties

    if snippet['type'] == 'hive':
      pass
    elif snippet['type'] == 'impala':
      pass
    elif snippet['type'] == 'java':
      pass

    _snippets.append(snippet)

  data = {
    'name': name,
    'uuid': str(uuid.uuid4()),
    'description': description,
    'sessions': [
      {
         'type': _snippet['type'],
         'properties': HS2Api.get_properties(_snippet['type']),
         'id': None
      } for _snippet in _snippets # Non unique types currently
    ],
    'selectedSnippet': _snippets[0]['type'],
    'type': 'notebook',
    'showHistory': False,
    'isSaved': is_saved,
    'snippets': [
      {
         'status': _snippet.get('status', 'ready'),
         'id': str(uuid.uuid4()),
         'statement_raw': _snippet.get('statement', ''),
         'statement': _snippet.get('statement', ''),
         'type': _snippet.get('type'),
         'properties': _snippet['properties'],
         'name': name,
         'database': _snippet.get('database'),
         'result': {}
      } for _snippet in _snippets
    ]
  }

  editor.data = json.dumps(data)

  return editor
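
A usage sketch for make_notebook2; the snippet dicts below are illustrative (not from the source) and assume each carries at least a type:

  # Hypothetical call site: build a two-snippet notebook and read back its JSON payload.
  snippets = [
    {'type': 'hive', 'statement': 'SELECT 1', 'properties': {}, 'database': 'default'},
    {'type': 'impala', 'statement': 'SELECT 2', 'properties': {}, 'database': 'default'},
  ]
  editor = make_notebook2(name='Sample', description='Two snippets', snippets=snippets)
  data = json.loads(editor.data)  # round-trips the payload serialized by make_notebook2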
Example #10
def open_notebook(request):
    response = {"status": -1}

    notebook_id = request.GET.get("notebook")
    notebook = Notebook(document=Document2.objects.get(id=notebook_id))

    response["status"] = 0
    response["notebook"] = notebook.get_json()
    response["message"] = _("Notebook loaded successfully")
Example #11
def open_notebook(request):
  response = {'status': -1}

  notebook_id = request.GET.get('notebook')
  notebook = Notebook(document=Document2.objects.get(id=notebook_id))

  response['status'] = 0
  response['notebook'] = notebook.get_json()
  response['message'] = _('Notebook loaded successfully')

  return JsonResponse(response)
Example #12
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready',
                  files=None, functions=None, settings=None):

  from notebook.connectors.hiveserver2 import HS2Api

  editor = Notebook()

  properties = HS2Api.get_properties(editor_type)

  if editor_type == 'hive':
    if files is not None:
      _update_property_value(properties, 'files', files)

    if functions is not None:
      _update_property_value(properties, 'functions', functions)

    if settings is not None:
      _update_property_value(properties, 'settings', settings)
  elif editor_type == 'impala':
    if settings is not None:
      _update_property_value(properties, 'settings', settings)

  editor.data = json.dumps({
    'name': name,
    'description': description,
    'sessions': [
      {
         'type': editor_type,
         'properties': properties,
         'id': None
      }
    ],
    'selectedSnippet': editor_type,
    'type': 'query-%s' % editor_type,
    'showHistory': True,

    'snippets': [
      {
         'status': status,
         'id': str(uuid.uuid4()),
         'statement_raw': statement,
         'statement': statement,
         'type': editor_type,
         'properties': {
            'files': [] if files is None else files,
            'functions': [] if functions is None else functions,
            'settings': [] if settings is None else settings
         },
         'name': name,
         'database': 'default',
         'result': {}
      }
    ]
  })
  
  return editor
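
A minimal call-site sketch for make_notebook; the statement is illustrative, and serializing via get_data() follows the pattern of the views in this file:

  # Hypothetical: build a ready-to-execute Hive editor and serialize it for a page context.
  editor = make_notebook(
      name='Ad hoc query',
      editor_type='hive',
      statement='SELECT COUNT(*) FROM web_logs',
      status='ready-execute'
  )
  notebooks_json = json.dumps([editor.get_data()])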
Example #13
def open_notebook(request):
  response = {'status': -1}

  notebook_id = request.GET.get('notebook')
  notebook = Notebook(document=Document2.objects.get(id=notebook_id))

  response['status'] = 0
  response['notebook'] = notebook.get_json()
  response['message'] = _('Notebook saved !')

  return JsonResponse(response)
Example #14
def extract_archive_in_hdfs(request, upload_path, file_name):

  _upload_extract_archive_script_to_hdfs(request.fs)

  shell_notebook = Notebook()
  shell_notebook.add_shell_snippet(
      shell_command='extract_archive_in_hdfs.sh',
      arguments=[{'value': '-u=' + upload_path}, {'value': '-f=' + file_name}],
      archives=[],
      files=[{'value': '/user/' + DEFAULT_USER.get() + '/common/extract_archive_in_hdfs.sh'}, {"value": upload_path + '/' + file_name}],
      env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}])
  return shell_notebook.execute(request, batch=True)
Example #15
def browse(request, database, table):
  editor_type = request.GET.get('type', 'hive')

  snippet = {'type': editor_type}
  sql_select = get_api(request.user, snippet, request.fs, request.jt).get_select_star_query(snippet, database, table)

  editor = Notebook()
  editor.data = json.dumps({
    'description':'',
    'sessions':[
      {
         'type':'hive',
         'properties':[

         ],
         'id':None
      }
    ],
    'selectedSnippet':'hive',
    'type': 'query-%s' % editor_type,

    'snippets':[
      {
         'status':'ready-execute',
         'id':'e8b323b3-88ef-3a84-6264-af11fa5fbefb',
         'statement_raw': sql_select,
         'statement': sql_select,
         'type': editor_type,
         'properties':{
            'files':[
            ],
            'settings':[
            ]
         },
         'name': 'Browse',
         'database':'default',
         'result':{  }
      }
    ],
    'name':'Browse'
  })

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
      }),
      'editor_type': editor_type,
  })
Example #16
def get_external_statement(request):
  response = {'status': -1, 'message': ''}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  if snippet.get('statementType') == 'file':
    response['statement'] = _get_statement_from_file(request.user, request.fs, snippet)
  elif snippet.get('statementType') == 'document':
    notebook = Notebook(Document2.objects.get_by_uuid(user=request.user, uuid=snippet['associatedDocumentUuid'], perm_type='read'))
    response['statement'] = notebook.get_str()

  response['status'] = 0

  return JsonResponse(response)
Example #17
def get_history(request):
    response = {"status": -1}

    doc_type = request.GET.get("doc_type")
    doc_text = request.GET.get("doc_text")
    limit = min(request.GET.get("len", 50), 100)

    docs = Document2.objects.get_history(doc_type="query-%s" % doc_type, user=request.user)

    if doc_text:
        docs = docs.filter(
            Q(name__icontains=doc_text) | Q(description__icontains=doc_text) | Q(search__icontains=doc_text)
        )

    history = []
    for doc in docs.order_by("-last_modified")[:limit]:
        notebook = Notebook(document=doc).get_data()
        if "snippets" in notebook:
            statement = _get_statement(notebook)
            history.append(
                {
                    "name": doc.name,
                    "id": doc.id,
                    "uuid": doc.uuid,
                    "type": doc.type,
                    "data": {
                        "statement": statement[:1001] if statement else "",
                        "lastExecuted": notebook["snippets"][0]["lastExecuted"],
                        "status": notebook["snippets"][0]["status"],
                        "parentSavedQueryUuid": notebook.get("parentSavedQueryUuid", ""),
                    }
                    if notebook["snippets"]
                    else {},
                    "absoluteUrl": doc.get_absolute_url(),
                }
            )
        else:
            LOG.error("Incomplete History Notebook: %s" % notebook)
    response["history"] = sorted(history, key=lambda row: row["data"]["lastExecuted"], reverse=True)
    response["message"] = _("History fetched")
    response["status"] = 0

    return JsonResponse(response)
Example #18
def get_history(request):
  response = {'status': -1}

  doc_type = request.GET.get('doc_type')
  doc_text = request.GET.get('doc_text')
  limit = min(request.GET.get('len', 50), 100)

  docs = Document2.objects.get_history(doc_type='query-%s' % doc_type, user=request.user)

  if doc_text:
    docs = docs.filter(Q(name__icontains=doc_text) | Q(description__icontains=doc_text))

  history = []
  for doc in docs.order_by('-last_modified')[:limit]:
    notebook = Notebook(document=doc).get_data()
    if 'snippets' in notebook:
      try:
        statement = notebook['snippets'][0]['result']['handle']['statement']
        if type(statement) == dict: # Old format
          statement = notebook['snippets'][0]['statement_raw']
      except KeyError: # Old format
        statement = notebook['snippets'][0]['statement_raw']
      history.append({
        'name': doc.name,
        'id': doc.id,
        'uuid': doc.uuid,
        'type': doc.type,
        'data': {
            'statement': statement[:1001] if statement else '',
            'lastExecuted': notebook['snippets'][0]['lastExecuted'],
            'status':  notebook['snippets'][0]['status'],
            'parentSavedQueryUuid': notebook.get('parentSavedQueryUuid', '')
        } if notebook['snippets'] else {},
        'absoluteUrl': doc.get_absolute_url(),
      })
    else:
      LOG.error('Incomplete History Notebook: %s' % notebook)
  response['history'] = sorted(history, key=lambda row: row['data']['lastExecuted'], reverse=True)
  response['message'] = _('History fetched')
  response['status'] = 0

  return JsonResponse(response)
Example #19
def create_notebook(request):
  response = {'status': -1}

  editor_type = request.POST.get('type', 'notebook')
  directory_uuid = request.POST.get('directory_uuid')

  editor = Notebook()
  data = editor.get_data()

  if editor_type != 'notebook':
    data['name'] = ''
    data['type'] = 'query-%s' % editor_type  # TODO: Add handling for non-SQL types

  data['directoryUuid'] = directory_uuid
  editor.data = json.dumps(data)

  response['notebook'] = editor.get_data()
  response['status'] = 0

  return JsonResponse(response)
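
A hedged sketch of calling this API from a test client; the route name notebook:create_notebook is an assumption:

  def test_create_notebook_sketch(self):
    # Hypothetical route name; POST parameters mirror what the view above reads.
    response = self.client.post(reverse('notebook:create_notebook'), {'type': 'impala', 'directory_uuid': ''})
    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_equal('query-impala', data['notebook']['type'], data)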
Example #20
def create_notebook(request):
    response = {"status": -1}

    editor_type = request.POST.get("type", "notebook")
    directory_uuid = request.POST.get("directory_uuid")

    editor = Notebook()
    data = editor.get_data()

    if editor_type != "notebook":
        data["name"] = ""
        data["type"] = "query-%s" % editor_type  # TODO: Add handling for non-SQL types

    data["directoryUuid"] = directory_uuid
    editor.data = json.dumps(data)

    response["notebook"] = editor.get_data()
    response["status"] = 0

    return JsonResponse(response)
Example #21
def editor(request):
  editor_id = request.GET.get('editor')
  editor_type = request.GET.get('type', 'hive')

  if editor_id:
    editor = Notebook(document=Document2.objects.get(id=editor_id))
  else:
    editor = Notebook()
    data = editor.get_data()
    data['name'] = 'Unsaved %s Query' % editor_type.title()
    data['type'] = 'query-%s' % editor_type
    editor.data = json.dumps(data)

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
      }),
      'editor_type': editor_type,
  })
Example #22
def editor(request):
  editor_id = request.GET.get('editor')
  editor_type = request.GET.get('type', 'hive')

  if editor_id:  # Open existing saved editor document
    editor = Notebook(document=Document2.objects.get(id=editor_id))
    editor_type = editor.get_data()['type'].rsplit('-', 1)[-1]
    editor = upgrade_session_properties(request, notebook=editor)
  else:  # Create new editor
    editor = Notebook()
    data = editor.get_data()

    data['name'] = ''
    data['type'] = 'query-%s' % editor_type  # TODO: Add handling for non-SQL types
    editor.data = json.dumps(data)

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
          'is_optimizer_enabled': has_optimizer(),
      }),
      'editor_type': editor_type,
  })
Example #23
def extract_archive_in_hdfs(request, upload_path, file_name):
  _upload_extract_archive_script_to_hdfs(request.fs)

  output_path = upload_path + '/' + file_name.split('.')[0]
  start_time = json.loads(request.POST.get('start_time', '-1'))

  shell_notebook = Notebook(
      name=_('HDFS Extraction of %(upload_path)s/%(file_name)s') % {'upload_path': upload_path, 'file_name': file_name},
      isManaged=True,
      onSuccessUrl=reverse('filebrowser.views.view', kwargs={'path': output_path})
  )

  shell_notebook.add_shell_snippet(
      shell_command='extract_archive_in_hdfs.sh',
      arguments=[{'value': '-u=' + upload_path}, {'value': '-f=' + file_name}, {'value': '-o=' + output_path}],
      archives=[],
      files=[{'value': '/user/' + DEFAULT_USER.get() + '/common/extract_archive_in_hdfs.sh'}, {"value": upload_path + '/' + urllib.quote(file_name)}],
      env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}],
      last_executed=start_time
  )

  return shell_notebook.execute(request, batch=True)
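
A call-site sketch; the paths are illustrative. The helper returns whatever Notebook.execute returns for the batch submission, which callers can forward as the API response:

  # Hypothetical paths: extract /user/demo/uploads/data.zip next to the archive.
  response = extract_archive_in_hdfs(request, upload_path='/user/demo/uploads', file_name='data.zip')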
Example #24
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None):
  editor = Notebook()

  editor.data = json.dumps({
    'name': name,
    'description': description,
    'sessions': [
      {
         'type': editor_type,
         'properties': [

         ],
         'id': None
      }
    ],
    'selectedSnippet': editor_type,
    'type': 'query-%s' % editor_type,
    'showHistory': True,

    'snippets': [
      {
         'status': status,
         'id': str(uuid.uuid4()),
         'statement_raw': statement,
         'statement': statement,
         'type': editor_type,
         'properties': {
            'files': [] if files is None else files,
            'functions': [] if functions is None else functions,
            'settings': [] if settings is None else settings
         },
         'name': name,
         'database': 'default',
         'result': {}
      }
    ]
  })
  
  return editor
Example #25
def notebook(request):
  notebook_id = request.GET.get('notebook')

  if notebook_id:
    notebook = Notebook(document=Document2.objects.get(id=notebook_id))
  else:
    notebook = Notebook()

  is_yarn_mode = False
  try:
    from spark.conf import LIVY_SERVER_SESSION_KIND
    is_yarn_mode = LIVY_SERVER_SESSION_KIND.get()
  except:
    LOG.exception('Spark is not enabled')

  return render('notebook.mako', request, {
      'notebooks_json': json.dumps([notebook.get_data()]),
      'options_json': json.dumps({
          'languages': get_interpreters(request.user),
          'session_properties': SparkApi.PROPERTIES,
      }),
      'is_yarn_mode': is_yarn_mode
  })
Example #26
def compress_files_in_hdfs(request, file_names, upload_path, archive_name):

  _upload_compress_files_script_to_hdfs(request.fs)

  files = [{"value": upload_path + '/' + file_name} for file_name in file_names]
  files.append({'value': '/user/' + DEFAULT_USER.get() + '/common/compress_files_in_hdfs.sh'})
  start_time = json.loads(request.POST.get('start_time', '-1'))

  shell_notebook = Notebook(
    name=_('HDFS Compression to %(upload_path)s/hue_compressed.zip') % {'upload_path': upload_path},
    isManaged=True,
    onSuccessUrl=reverse('filebrowser.views.view', kwargs={'path': upload_path})
  )

  shell_notebook.add_shell_snippet(
      shell_command='compress_files_in_hdfs.sh',
      arguments=[{'value': '-u=' + upload_path}, {'value': '-f=' + ','.join(file_names)}, {'value': '-n=' + archive_name}],
      archives=[],
      files=files,
      env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}],
      last_executed=start_time
  )

  return shell_notebook.execute(request, batch=True)
Example #27
def editor(request):
  editor_id = request.GET.get('editor')

  if editor_id:
    editor = Notebook(document=Document2.objects.get(id=editor_id))
  else:
    editor = Notebook()
    data = editor.get_data()
    data['name'] = 'My SQL query'
    editor.data = json.dumps(data)

  autocomplete_base_url = ''
  try:
    autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={})
  except:
    LOG.exception('failed to get autocomplete base url')

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "Hive SQL", "type": "hive"}]
      }),
      'autocomplete_base_url': autocomplete_base_url,
  })
Example #28
def _get_statement(notebook):
    if notebook['snippets'] and len(notebook['snippets']) > 0:
        return Notebook.statement_with_variables(notebook['snippets'][0])
    return ''
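
A minimal illustration of the input _get_statement expects, assuming Notebook.statement_with_variables substitutes ${var} placeholders from the snippet's variables list:

    # Illustrative input: one snippet carrying a single variable.
    notebook = {
        'snippets': [{
            'statement_raw': 'SELECT * FROM ${tbl}',
            'variables': [{'name': 'tbl', 'value': 'web_logs'}]
        }]
    }
    statement = _get_statement(notebook)  # delegates to Notebook.statement_with_variables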
Example #29
def make_notebook(name='Browse',
                  description='',
                  editor_type='hive',
                  statement='',
                  status='ready',
                  files=None,
                  functions=None,
                  settings=None,
                  is_saved=False,
                  database='default',
                  snippet_properties=None,
                  batch_submit=False,
                  on_success_url=None,
                  skip_historify=False,
                  is_task=False,
                  last_executed=-1,
                  is_notebook=False,
                  pub_sub_url=None):
    '''
    skip_historify: do not add the task to the query history, e.g. for a SQL Dashboard query.
    isManaged: true when the operation is managed by Hue (include_managed=True in the document), e.g. exporting a query result or dropping tables.
    '''
    from notebook.connectors.hiveserver2 import HS2Api

    editor = Notebook()
    if snippet_properties is None:
        snippet_properties = {}

    if editor_type == 'hive':
        sessions_properties = HS2Api.get_properties(editor_type)
        if files is not None:
            _update_property_value(sessions_properties, 'files', files)

        if functions is not None:
            _update_property_value(sessions_properties, 'functions', functions)

        if settings is not None:
            _update_property_value(sessions_properties, 'settings', settings)
    elif editor_type == 'impala':
        sessions_properties = HS2Api.get_properties(editor_type)
        if settings is not None:
            _update_property_value(sessions_properties, 'settings', settings)
    elif editor_type == 'java':
        sessions_properties = []  # Java options
    else:
        sessions_properties = []

    data = {
        'name': name,
        'uuid': str(uuid.uuid4()),
        'description': description,
        'sessions': [{
            'type': editor_type,
            'properties': sessions_properties,
            'id': None
        }],
        'selectedSnippet': editor_type,
        'type': 'notebook' if is_notebook else 'query-%s' % editor_type,
        'showHistory': True,
        'isSaved': is_saved,
        'onSuccessUrl': on_success_url,
        'pubSubUrl': pub_sub_url,
        'skipHistorify': skip_historify,
        'isManaged': is_task,
        'snippets': [{
            'status': status,
            'id': str(uuid.uuid4()),
            'statement_raw': statement,
            'statement': statement,
            'type': editor_type,
            'wasBatchExecuted': batch_submit,
            'lastExecuted': last_executed,
            'properties': {
                'files': [] if files is None else files,
                'functions': [] if functions is None else functions,
                'settings': [] if settings is None else settings
            },
            'name': name,
            'database': database,
            'result': {
                'handle': {}
            },
            'variables': []
        }] if not is_notebook else []
    }

    if snippet_properties:
        data['snippets'][0]['properties'].update(snippet_properties)

    editor.data = json.dumps(data)

    return editor
Example #30
def _set_search_field(notebook_doc):
    notebook = Notebook(document=notebook_doc).get_data()
    statement = _get_statement(notebook)
    notebook_doc.search = statement
    return notebook_doc
Example #31
def make_notebook2(name='Browse',
                   description='',
                   is_saved=False,
                   snippets=None):

    from notebook.connectors.hiveserver2 import HS2Api

    editor = Notebook()

    _snippets = []

    for snippet in snippets:
        default_properties = {'files': [], 'functions': [], 'settings': []}

        default_properties.update(snippet['properties'])
        snippet['properties'] = default_properties

        if snippet['type'] == 'hive':
            pass
        elif snippet['type'] == 'impala':
            pass
        elif snippet['type'] == 'java':
            pass

        _snippets.append(snippet)

    data = {
        'name': name,
        'uuid': str(uuid.uuid4()),
        'description': description,
        'sessions': [
            {
                'type': _snippet['type'],
                'properties': HS2Api.get_properties(_snippet['type']),
                'id': None
            } for _snippet in _snippets  # Non-unique types currently
        ],
        'selectedSnippet': _snippets[0]['type'],
        'type': 'notebook',
        'showHistory': False,
        'isSaved': is_saved,
        'snippets': [{
            'status': _snippet.get('status', 'ready'),
            'id': str(uuid.uuid4()),
            'statement_raw': _snippet.get('statement', ''),
            'statement': _snippet.get('statement', ''),
            'type': _snippet.get('type'),
            'properties': _snippet['properties'],
            'name': name,
            'database': _snippet.get('database'),
            'result': {},
            'variables': []
        } for _snippet in _snippets]
    }

    editor.data = json.dumps(data)

    return editor
Example #32
def execute_and_watch(request):
    notebook_id = request.GET.get('editor', request.GET.get('notebook'))
    snippet_id = int(request.GET['snippet'])
    action = request.GET['action']
    destination = request.GET['destination']

    notebook = Notebook(document=Document2.objects.get(
        id=notebook_id)).get_data()
    snippet = notebook['snippets'][snippet_id]
    editor_type = snippet['type']

    api = get_api(request, snippet)

    if action == 'save_as_table':
        sql, success_url = api.export_data_as_table(notebook, snippet,
                                                    destination)
        editor = make_notebook(name='Execute and watch',
                               editor_type=editor_type,
                               statement=sql,
                               status='ready-execute',
                               database=snippet['database'])
    elif action == 'insert_as_query':
        # TODO: checks/workarounds in case of non impersonation or Sentry
        # TODO: keep older simpler way in case of known not many rows?
        sql, success_url = api.export_large_data_to_hdfs(
            notebook, snippet, destination)
        editor = make_notebook(name='Execute and watch',
                               editor_type=editor_type,
                               statement=sql,
                               status='ready-execute',
                               database=snippet['database'],
                               on_success_url=success_url)
    elif action == 'index_query':
        if destination == '__hue__':
            destination = _get_snippet_name(notebook,
                                            unique=True,
                                            table_format=True)
            live_indexing = True
        else:
            live_indexing = False

        sql, success_url = api.export_data_as_table(notebook,
                                                    snippet,
                                                    destination,
                                                    is_temporary=True,
                                                    location='')
        editor = make_notebook(name='Execute and watch',
                               editor_type=editor_type,
                               statement=sql,
                               status='ready-execute')

        sample = get_api(request, snippet).fetch_result(notebook,
                                                        snippet,
                                                        0,
                                                        start_over=True)

        from indexer.api3 import _index  # Will be moved to the lib
        from indexer.file_format import HiveFormat
        from indexer.fields import Field

        file_format = {
            'name': 'col',
            'inputFormat': 'query',
            'format': {
                'quoteChar': '"',
                'recordSeparator': '\n',
                'type': 'csv',
                'hasHeader': False,
                'fieldSeparator': '\u0001'
            },
            "sample": '',
            "columns": [
                Field(col['name'].rsplit('.')[-1],
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
                for col in sample['meta']
            ]
        }

        if live_indexing:
            file_format['inputFormat'] = 'hs2_handle'
            file_format['fetch_handle'] = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over)

        job_handle = _index(request,
                            file_format,
                            destination,
                            query=notebook['uuid'])

        if live_indexing:
            return redirect(
                reverse('search:browse', kwargs={'name': destination}))
        else:
            return redirect(
                reverse('oozie:list_oozie_workflow',
                        kwargs={'job_id': job_handle['handle']['id']}))
    else:
        raise PopupException(_('Action %s is unknown') % action)

    return render(
        'editor.mako', request, {
            'notebooks_json': json.dumps([editor.get_data()]),
            'options_json': json.dumps({
                'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
                'mode': 'editor',
                'editor_type': editor_type,
                'success_url': success_url
            }),
            'editor_type': editor_type,
        })
Example #33
def execute_and_watch(request):
    notebook_id = request.GET.get('editor', request.GET.get('notebook'))
    snippet_id = int(request.GET['snippet'])
    action = request.GET['action']
    destination = request.GET['destination']

    notebook = Notebook(document=Document2.objects.get(
        id=notebook_id)).get_data()
    snippet = notebook['snippets'][snippet_id]
    editor_type = snippet['type']

    api = get_api(request, snippet)

    if action == 'save_as_table':
        sql, success_url = api.export_data_as_table(notebook, snippet,
                                                    destination)
        editor = make_notebook(name='Execute and watch',
                               editor_type=editor_type,
                               statement=sql,
                               status='ready-execute',
                               database=snippet['database'])
    elif action == 'insert_as_query':
        sql, success_url = api.export_large_data_to_hdfs(
            notebook, snippet, destination)
        editor = make_notebook(name='Execute and watch',
                               editor_type=editor_type,
                               statement=sql,
                               status='ready-execute',
                               database=snippet['database'])
    elif action == 'index_query':
        sql, success_url = api.export_data_as_table(notebook,
                                                    snippet,
                                                    destination,
                                                    is_temporary=True,
                                                    location='')
        editor = make_notebook(name='Execute and watch',
                               editor_type=editor_type,
                               statement=sql,
                               status='ready-execute')

        sample = get_api(request, snippet).fetch_result(notebook,
                                                        snippet,
                                                        0,
                                                        start_over=True)

        from indexer.api3 import _index  # Will be moved to the lib in next commit
        from indexer.file_format import HiveFormat
        from indexer.fields import Field

        file_format = {
            'name': 'col',
            'inputFormat': 'query',
            'format': {
                'quoteChar': '"',
                'recordSeparator': '\n',
                'type': 'csv',
                'hasHeader': False,
                'fieldSeparator': '\u0001'
            },
            "sample": '',
            "columns": [
                Field(col['name'],
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
                for col in sample['meta']
            ]
        }

        job_handle = _index(request,
                            file_format,
                            destination,
                            query=notebook['uuid'])
        return redirect(
            reverse('oozie:list_oozie_workflow',
                    kwargs={'job_id': job_handle['handle']['id']}))
    else:
        raise PopupException(_('Action %s is unknown') % action)

    return render(
        'editor.mako', request, {
            'notebooks_json': json.dumps([editor.get_data()]),
            'options_json': json.dumps({
                'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
                'mode': 'editor',
                'editor_type': editor_type,
                'success_url': success_url
            }),
            'editor_type': editor_type,
        })
Example #34
def _get_document_helper(request, uuid, with_data, with_dependencies, path):
    if uuid:
        if uuid.isdigit():
            document = Document2.objects.document(user=request.user,
                                                  doc_id=uuid)
        else:
            document = Document2.objects.get_by_uuid(user=request.user,
                                                     uuid=uuid)
    else:  # Find by path
        document = Document2.objects.get_by_path(user=request.user, path=path)

    response = {
        'document': document.to_dict(),
        'parent': document.parent_directory.to_dict() if document.parent_directory else None,
        'children': [],
        'dependencies': [],
        'dependents': [],
        'data': '',
        'status': 0
    }

    response['user_perms'] = {
        'can_read': document.can_read(request.user),
        'can_write': document.can_write(request.user)
    }

    if with_data:
        data = json.loads(document.data)
        # Upgrade session properties for Hive and Impala
        if document.type.startswith('query'):
            from notebook.models import upgrade_session_properties
            notebook = Notebook(document=document)
            notebook = upgrade_session_properties(request, notebook)
            data = json.loads(notebook.data)
            if document.type == 'query-pig':  # Import correctly from before Hue 4.0
                properties = data['snippets'][0]['properties']
                if 'hadoopProperties' not in properties:
                    properties['hadoopProperties'] = []
                if 'parameters' not in properties:
                    properties['parameters'] = []
                if 'resources' not in properties:
                    properties['resources'] = []
            if data.get('uuid') != document.uuid:  # Old format < 3.11
                data['uuid'] = document.uuid

        response['data'] = data

    if with_dependencies:
        response['dependencies'] = [
            dependency.to_dict() for dependency in document.dependencies.all()
        ]
        response['dependents'] = [
            dependent.to_dict() for dependent in document.dependents.exclude(
                is_history=True).all()
        ]

    # Get children documents if this is a directory
    if document.is_directory:
        directory = Directory.objects.get(id=document.id)

        # If this is the user's home directory, fetch shared docs too
        if document.is_home_directory:
            children = directory.get_children_and_shared_documents(
                user=request.user)
            response.update(
                _filter_documents(request, queryset=children, flatten=True))
        else:
            children = directory.get_children_documents()
            response.update(
                _filter_documents(request, queryset=children, flatten=False))

    # Paginate and serialize Results
    if 'documents' in response:
        response.update(_paginate(request, queryset=response['documents']))
        # Rename documents to children
        response['children'] = response.pop('documents')
        response['children'] = [doc.to_dict() for doc in response['children']]

    return response
Example #35
def make_notebook(name='Browse',
                  description='',
                  editor_type='hive',
                  statement='',
                  status='ready',
                  files=None,
                  functions=None,
                  settings=None,
                  is_saved=False,
                  database='default'):

    from notebook.connectors.hiveserver2 import HS2Api

    editor = Notebook()

    properties = HS2Api.get_properties(editor_type)

    if editor_type == 'hive':
        if files is not None:
            _update_property_value(properties, 'files', files)

        if functions is not None:
            _update_property_value(properties, 'functions', functions)

        if settings is not None:
            _update_property_value(properties, 'settings', settings)
    elif editor_type == 'impala':
        if settings is not None:
            _update_property_value(properties, 'settings', settings)

    editor.data = json.dumps({
        'name': name,
        'description': description,
        'sessions': [{
            'type': editor_type,
            'properties': properties,
            'id': None
        }],
        'selectedSnippet': editor_type,
        'type': 'query-%s' % editor_type,
        'showHistory': True,
        'isSaved': is_saved,
        'snippets': [{
            'status': status,
            'id': str(uuid.uuid4()),
            'statement_raw': statement,
            'statement': statement,
            'type': editor_type,
            'properties': {
                'files': [] if files is None else files,
                'functions': [] if functions is None else functions,
                'settings': [] if settings is None else settings
            },
            'name': name,
            'database': database,
            'result': {}
        }]
    })

    return editor
Example #36
def _get_notebook(user, notebook, operation_id):
    if operation_id and not notebook:
        nb_doc = Document2.objects.get_by_uuid(user=user, uuid=operation_id)
        notebook = Notebook(document=nb_doc).get_data()

    return notebook
Example #37
    def test_delete_notebook(self):
        trash_notebook_json = """
        {
          "selectedSnippet": "hive",
          "showHistory": false,
          "description": "Test Hive Query",
          "name": "Test Hive Query",
          "sessions": [
              {
                  "type": "hive",
                  "properties": [],
                  "id": null
              }
          ],
          "type": "query-hive",
          "id": null,
          "snippets": [{"id": "e069ef32-5c95-4507-b961-e79c090b5abf","type":"hive","status":"ready","database":"default",""" \
              """"statement":"select * from web_logs","statement_raw":"select * from web_logs","variables":[],"properties":""" \
              """{"settings":[],"files":[],"functions":[]},"result":{}}],
          "uuid": "8a20da5f-b69c-4843-b17d-dea5c74c41d1"
      }
      """

        # Assert that the notebook is first saved
        response = self.client.post(reverse('notebook:save_notebook'),
                                    {'notebook': trash_notebook_json})
        data = json.loads(response.content)
        assert_equal(0, data['status'], data)

        # Test that deleting it moves it to the user's Trash folder
        notebook_doc = Document2.objects.get(id=data['id'])
        trash_notebooks = [Notebook(notebook_doc).get_data()]
        response = self.client.post(reverse('notebook:delete'),
                                    {'notebooks': json.dumps(trash_notebooks)})
        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_equal('Trashed 1 notebook(s)', data['message'], data)

        response = self.client.get('/desktop/api2/doc', {'path': '/.Trash'})
        data = json.loads(response.content)
        trash_uuids = [doc['uuid'] for doc in data['children']]
        assert_true(notebook_doc.uuid in trash_uuids, data)

        # Test that any errors are reported in the response
        nonexistant_doc = {
            "id": 12345,
            "uuid": "ea22da5f-b69c-4843-b17d-dea5c74c41d1",
            "selectedSnippet": "hive",
            "showHistory": False,
            "description": "Test Hive Query",
            "name": "Test Hive Query",
            "sessions": [{
                "type": "hive",
                "properties": [],
                "id": None,
            }],
            "type": "query-hive",
            "snippets": [{
                "id": "e069ef32-5c95-4507-b961-e79c090b5abf",
                "type": "hive",
                "status": "ready",
                "database": "default",
                "statement": "select * from web_logs",
                "statement_raw": "select * from web_logs",
                "variables": [],
                "properties": {
                    "settings": [],
                    "files": [],
                    "functions": []
                },
                "result": {}
            }]
        }
        trash_notebooks = [nonexistant_doc]
        response = self.client.post(reverse('notebook:delete'),
                                    {'notebooks': json.dumps(trash_notebooks)})
        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_equal(
            'Trashed 0 notebook(s) and failed to delete 1 notebook(s).',
            data['message'], data)
        assert_equal(['ea22da5f-b69c-4843-b17d-dea5c74c41d1'], data['errors'])
Example #38
def _small_indexing(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    for field in fields:
        for operation in field['operations']:
            if operation['type'] == 'split':
                field['multiValued'] = True  # Solr requires multiValued to be set when splitting
                kwargs['f.%(name)s.split' % field] = 'true'
                kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ','

    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [field for field in fields if field['name'] not in skip_fields]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get('id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [field['name'] for field in fields if field['name'] != 'hue_id']
            fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(
                notebook, snippet, rows=rows, start_over=start_over)  # Assumes handle still live
            rows = searcher.update_data_from_hive(
                index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception, e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception, e2:
            LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
        raise e  # Assumed: re-raise the original failure after cleanup, so callers still see the error
Example #39
def _get_query(self, name):
  nb_doc = Document2.objects.document(user=self.user, doc_id=name)
  notebook = Notebook(document=nb_doc).get_data()
  snippet = notebook['snippets'][0]
  return snippet['statement'].strip(';')
Example #40
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib.unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = chardet.detect(stream.read(10000)).get('encoding')
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warn(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }

    return JsonResponse(format_)
Example #41
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = Indexer(request.user, request.fs)
        stream = request.fs.open(file_format["path"])
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": file_format['path']
            },
            "format": file_format['format']
        })
    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':  # Only support open query history
        # TODO get schema from explain query, which is not possible
        notebook = Notebook(document=Document2.objects.get(
            id=file_format['query'])).get_data()
        snippet = notebook['snippets'][0]
        sample = get_api(request, snippet).fetch_result(notebook,
                                                        snippet,
                                                        4,
                                                        start_over=True)

        format_ = {
            "sample": sample['rows'][:4],
            "sample_cols": sample.meta,
            "columns": [
                Field(col['name'],
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
                for col in sample.meta
            ]
        }

    return JsonResponse(format_)
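
For reference, a hedged sketch of the `fileFormat` payloads this view expects; the key names come from the branches above, while the endpoint URL, CSRF handling, and the exact `format` sub-keys are assumptions that vary by deployment:

# Hypothetical client-side payloads for guess_field_types() above.
import json
import requests  # assumption: calling a running Hue over plain HTTP

GUESS_URL = 'https://hue.example.com/indexer/api/indexer/guess_field_types'  # assumed path

file_payload = {
    'inputFormat': 'file',
    'path': '/user/demo/web_logs.csv',
    'format': {'type': 'csv', 'hasHeader': True},  # handed to _convert_format()
}
table_payload = {
    'inputFormat': 'table',
    'databaseName': 'default',
    'tableName': 'web_logs',
}
query_payload = {
    'inputFormat': 'query',
    'query': 42,  # document id of an open query in the history (this variant reads it directly)
}

for payload in (file_payload, table_payload, query_payload):
    resp = requests.post(GUESS_URL, data={'fileFormat': json.dumps(payload)})
    print(resp.json().get('columns'))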
Example #42
def get_history(request):
    response = {'status': -1}

    doc_type = request.GET.get('doc_type')
    doc_text = request.GET.get('doc_text')
    connector_id = request.GET.get('doc_connector')
    page = min(int(request.GET.get('page', 1)), 100)
    limit = min(int(request.GET.get('limit', 50)), 100)
    is_notification_manager = request.GET.get('is_notification_manager',
                                              'false') == 'true'

    if is_notification_manager:
        docs = Document2.objects.get_tasks_history(user=request.user)
    else:
        docs = Document2.objects.get_history(doc_type='query-%s' % doc_type,
                                             connector_id=connector_id,
                                             user=request.user)

    if doc_text:
        docs = docs.filter(
            Q(name__icontains=doc_text) | Q(description__icontains=doc_text)
            | Q(search__icontains=doc_text))

    # Paginate
    docs = docs.order_by('-last_modified')
    response['count'] = docs.count()
    docs = __paginate(page, limit, queryset=docs)['documents']

    history = []
    for doc in docs:
        notebook = Notebook(document=doc).get_data()
        if 'snippets' in notebook:
            statement = notebook['description'] if is_notification_manager else _get_statement(notebook)
            history.append({
                'name': doc.name,
                'id': doc.id,
                'uuid': doc.uuid,
                'type': doc.type,
                'data': {
                    'statement': statement[:1001] if statement else '',
                    'lastExecuted': notebook['snippets'][0].get('lastExecuted', -1),
                    'status': notebook['snippets'][0]['status'],
                    'parentSavedQueryUuid': notebook.get('parentSavedQueryUuid', '')
                } if notebook['snippets'] else {},
                'absoluteUrl': doc.get_absolute_url(),
            })
        else:
            LOG.error('Incomplete History Notebook: %s' % notebook)
    response['history'] = sorted(history,
                                 key=lambda row: row['data']['lastExecuted'],
                                 reverse=True)
    response['message'] = _('History fetched')
    response['status'] = 0

    return JsonResponse(response)
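
A hedged sketch of paging through this endpoint from a client (the URL is an assumption; the query parameters are exactly the ones the view reads):

# Hypothetical client for get_history() above.
import requests

HISTORY_URL = 'https://hue.example.com/notebook/api/get_history'  # assumed path

params = {
    'doc_type': 'hive',   # the view prefixes this to 'query-hive'
    'doc_text': 'sales',  # optional name/description/search filter
    'page': 1,            # capped at 100 by the view
    'limit': 50,          # likewise capped at 100
}
resp = requests.get(HISTORY_URL, params=params).json()
for entry in resp['history']:
    print(entry['data']['statement'], entry['absoluteUrl'])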
Example #43
def make_notebook(name='Browse',
                  description='',
                  editor_type='hive',
                  statement='',
                  status='ready',
                  files=None,
                  functions=None,
                  settings=None,
                  is_saved=False,
                  database='default',
                  snippet_properties=None,
                  batch_submit=False,
                  on_success_url=None,
                  skip_historify=False,
                  is_task=False,
                  last_executed=-1,
                  is_notebook=False,
                  pub_sub_url=None,
                  result_properties={},
                  namespace=None,
                  compute=None,
                  is_presentation_mode=False):
    '''
  skip_historify: do not add the task to the query history. e.g. SQL Dashboard
  is_task / isManaged: true when being a managed by Hue operation (include_managed=True in document), e.g. exporting query result, dropping some tables
  '''
    from notebook.connectors.hiveserver2 import HS2Api

    if has_connectors():
        interpreter = get_interpreter(connector_type=editor_type)
        editor_connector = editor_type
        editor_type = interpreter['dialect']
    else:
        editor_connector = editor_type

    editor = Notebook()
    if snippet_properties is None:
        snippet_properties = {}

    if editor_type == 'hive':
        sessions_properties = HS2Api.get_properties(editor_type)
        if files is not None:
            _update_property_value(sessions_properties, 'files', files)

        if functions is not None:
            _update_property_value(sessions_properties, 'functions', functions)

        if settings is not None:
            _update_property_value(sessions_properties, 'settings', settings)
    elif editor_type == 'impala':
        sessions_properties = HS2Api.get_properties(editor_type)
        if settings is not None:
            _update_property_value(sessions_properties, 'settings', settings)
    elif editor_type == 'java':
        sessions_properties = []  # Java options
    else:
        sessions_properties = []

    data = {
        'name': name,
        'uuid': str(uuid.uuid4()),
        'description': description,
        'sessions': [{
            'type': editor_connector,
            'properties': sessions_properties,
            'id': None
        }],
        'selectedSnippet': editor_connector,  # TODO: might need update in notebook.ko.js
        'type': 'notebook' if is_notebook else 'query-%s' % editor_type,
        'showHistory': True,
        'isSaved': is_saved,
        'onSuccessUrl': urllib_quote(on_success_url.encode('utf-8'), safe=SAFE_CHARACTERS_URI) if on_success_url else None,
        'pubSubUrl': pub_sub_url,
        'skipHistorify': skip_historify,
        'isPresentationModeDefault': is_presentation_mode,
        'isManaged': is_task,
        'snippets': [{
            'status': status,
            'id': str(uuid.uuid4()),
            'statement_raw': statement,
            'statement': statement,
            'type': editor_connector,
            'wasBatchExecuted': batch_submit,
            'lastExecuted': last_executed,
            'properties': {
                'files': [] if files is None else files,
                'functions': [] if functions is None else functions,
                'settings': [] if settings is None else settings
            },
            'name': name,
            'database': database,
            'namespace': namespace if namespace else {},
            'compute': compute if compute else {},
            'result': {
                'handle': {}
            },
            'variables': []
        }] if not is_notebook else []
    }

    if snippet_properties:
        data['snippets'][0]['properties'].update(snippet_properties)
    if result_properties:
        data['snippets'][0]['result'].update(result_properties)

    editor.data = json.dumps(data)

    return editor
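
A hedged usage sketch for the factory above: building a managed Hive statement and submitting it. `Notebook.execute(request, batch=...)` is an assumption based on how Hue drives these generated notebooks elsewhere; `request` is a live Django request:

# Hypothetical usage of make_notebook() above.
def drop_table(request, database='default', table='web_logs'):
    job = make_notebook(
        name='Drop %s.%s' % (database, table),
        editor_type='hive',
        statement='DROP TABLE IF EXISTS `%s`.`%s`' % (database, table),
        status='ready',
        is_task=True,  # Hue-managed operation (isManaged in the data dict)
        on_success_url='/metastore/tables/%s' % database,
    )
    return job.execute(request, batch=False)  # assumed helper on Notebook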
Example #44
    def get_default(self, user, name, engine='solr', source='data'):
        fields = self.fields_data(user, name, engine, source=source)
        id_field = [field['name'] for field in fields if field.get('isId')]

        if id_field:
            id_field = id_field[0]
        else:
            id_field = ''  # Schemaless might not have an id

        if source == 'query':
            nb_doc = Document2.objects.document(user=user, doc_id=name)
            notebook = Notebook(document=nb_doc).get_data()
            label = _get_snippet_name(notebook, unique=True)
        else:
            label = name

        TEMPLATE = {
            "extracode": escape(
                "<style type=\"text/css\">\nem {\n  font-weight: bold;\n  background-color: yellow;\n}</style>\n\n<script>\n</script>"
            ),
            "highlighting": [""],
            "properties": {
                "highlighting_enabled": True
            },
            "template": """
      <div class="row-fluid">
        <div class="row-fluid">
          <div class="span12">%s</div>
        </div>
        <br/>
      </div>""" % ' '.join(['{{%s}}' % field['name'] for field in fields]),
            "isGridLayout": True,
            "showFieldList": True,
            "showGrid": True,
            "showChart": False,
            "chartSettings": {
                'chartType': 'bars',
                'chartSorting': 'none',
                'chartScatterGroup': None,
                'chartScatterSize': None,
                'chartScope': 'world',
                'chartX': None,
                'chartYSingle': None,
                'chartYMulti': [],
                'chartData': [],
                'chartMapLabel': None,
            },
            "fieldsAttributes": [self._make_gridlayout_header_field(field) for field in fields],
            "fieldsSelected": [],
            "leafletmap": {
                'latitudeField': None,
                'longitudeField': None,
                'labelField': None
            },
            "rows": 25,
        }

        FACETS = []

        return {
            'id': None,
            'name': name,
            'engine': engine,
            'source': source,
            'label': label,
            'enabled': False,
            'template': TEMPLATE,
            'facets': FACETS,
            'fields': fields,
            'idField': id_field,
        }
Example #45
def _execute_notebook(request, notebook, snippet):
    response = {'status': -1}
    result = None
    history = None
    active_executable = None

    historify = (notebook['type'] != 'notebook'
                 or snippet.get('wasBatchExecuted')
                 ) and not notebook.get('skipHistorify')

    try:
        try:
            sessions = notebook.get('sessions') and notebook['sessions']  # Session reference for snippet execution without persisting it
            active_executable = json.loads(request.POST.get('executable', '{}'))  # Editor v2
            # TODO: Use statement, database etc. from active_executable

            if historify:
                history = _historify(notebook, request.user)
                notebook = Notebook(document=history).get_data()

            interpreter = get_api(request, snippet)
            if snippet.get('interface') == 'sqlalchemy':
                interpreter.options['session'] = sessions[0]

            with opentracing.tracer.start_span('interpreter') as span:
                # interpreter.execute needs the sessions, but we don't want to persist them
                pre_execute_sessions = notebook['sessions']
                notebook['sessions'] = sessions
                response['handle'] = interpreter.execute(notebook, snippet)
                notebook['sessions'] = pre_execute_sessions

            # Retrieve and remove the result from the handle
            if response['handle'].get('sync'):
                result = response['handle'].pop('result')
        finally:
            if historify:
                _snippet = [
                    s for s in notebook['snippets'] if s['id'] == snippet['id']
                ][0]

                if 'id' in active_executable:  # Editor v2
                    # notebook_executable is the 1-to-1 match of active_executable in the notebook structure
                    notebook_executable = [
                        e for e in _snippet['executor']['executables']
                        if e['id'] == active_executable['id']
                    ][0]
                    if 'handle' in response:
                        notebook_executable['handle'] = response['handle']
                    if history:
                        notebook_executable['history'] = {
                            'id': history.id,
                            'uuid': history.uuid
                        }
                        notebook_executable['operationId'] = history.uuid

                if 'handle' in response:  # No failure
                    if 'result' not in _snippet:  # Editor v2
                        _snippet['result'] = {}
                    _snippet['result']['handle'] = response['handle']
                    _snippet['result']['statements_count'] = response['handle'].get('statements_count', 1)
                    _snippet['result']['statement_id'] = response['handle'].get('statement_id', 0)
                    _snippet['result']['handle']['statement'] = response['handle'].get(
                        'statement', snippet['statement']).strip()  # For non HS2, as non multi query yet
                else:
                    _snippet['status'] = 'failed'

                if history:  # If _historify failed, history will be None.
                    # If we get Atomic block exception, something underneath interpreter.execute() crashed and is not handled.
                    history.update_data(notebook)
                    history.save()

                    response['history_id'] = history.id
                    response['history_uuid'] = history.uuid
                    if notebook['isSaved']:  # Keep track of history of saved queries
                        response['history_parent_uuid'] = history.dependencies.filter(
                            type__startswith='query-').latest('last_modified').uuid
    except QueryError as ex:  # We inject the history information from _historify() to the failed queries
        if response.get('history_id'):
            ex.extra['history_id'] = response['history_id']
        if response.get('history_uuid'):
            ex.extra['history_uuid'] = response['history_uuid']
        if response.get('history_parent_uuid'):
            ex.extra['history_parent_uuid'] = response['history_parent_uuid']
        raise ex

    # Inject and HTML escape results
    if result is not None:
        response['result'] = result
        response['result']['data'] = escape_rows(result['data'])

    response['status'] = 0

    return response
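
For context, a hedged sketch of the thin API view that typically fronts `_execute_notebook` (the view name and any permission decorators are assumptions; the POST fields mirror what the function reads):

# Hypothetical wrapper view for _execute_notebook() above.
import json
from django.http import JsonResponse

def execute(request):
    notebook = json.loads(request.POST.get('notebook', '{}'))
    snippet = json.loads(request.POST.get('snippet', '{}'))
    # Editor v2 callers can also post an 'executable' blob; _execute_notebook
    # reads that from request.POST itself.

    response = _execute_notebook(request, notebook, snippet)

    return JsonResponse(response)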
Example #46
def _execute_notebook(request, notebook, snippet):
    response = {'status': -1}
    result = None
    history = None

    historify = (notebook['type'] != 'notebook'
                 or snippet.get('wasBatchExecuted')
                 ) and not notebook.get('skipHistorify')

    try:
        try:
            session = notebook.get('sessions') and notebook['sessions'][0]  # Session reference for snippet execution without persisting it
            if historify:
                history = _historify(notebook, request.user)
                notebook = Notebook(document=history).get_data()

            interpreter = get_api(request, snippet)
            if snippet.get('interface') == 'sqlalchemy':
                interpreter.options['session'] = session

            response['handle'] = interpreter.execute(notebook, snippet)

            # Retrieve and remove the result from the handle
            if response['handle'].get('sync'):
                result = response['handle'].pop('result')
        finally:
            if historify:
                _snippet = [
                    s for s in notebook['snippets'] if s['id'] == snippet['id']
                ][0]
                if 'handle' in response:  # No failure
                    _snippet['result']['handle'] = response['handle']
                    _snippet['result']['statements_count'] = response['handle'].get('statements_count', 1)
                    _snippet['result']['statement_id'] = response['handle'].get('statement_id', 0)
                    _snippet['result']['handle']['statement'] = response['handle'].get(
                        'statement', snippet['statement']).strip()  # For non HS2, as non multi query yet
                else:
                    _snippet['status'] = 'failed'

                if history:  # If _historify failed, history will be None
                    history.update_data(notebook)
                    history.save()

                    response['history_id'] = history.id
                    response['history_uuid'] = history.uuid
                    if notebook['isSaved']:  # Keep track of history of saved queries
                        response['history_parent_uuid'] = history.dependencies.filter(
                            type__startswith='query-').latest('last_modified').uuid
    except QueryError as ex:  # We inject the history information from _historify() to the failed queries
        if response.get('history_id'):
            ex.extra['history_id'] = response['history_id']
        if response.get('history_uuid'):
            ex.extra['history_uuid'] = response['history_uuid']
        if response.get('history_parent_uuid'):
            ex.extra['history_parent_uuid'] = response['history_parent_uuid']
        raise ex

    # Inject and HTML escape results
    if result is not None:
        response['result'] = result
        response['result']['data'] = escape_rows(result['data'])

    response['status'] = 0

    return response
Example #47
def _get_snippet(user, notebook, snippet, operation_id):
  if operation_id or not snippet:
    nb_doc = Document2.objects.get_by_uuid(user=user, uuid=operation_id or notebook['uuid'])
    notebook = Notebook(document=nb_doc).get_data()
    snippet = notebook['snippets'][0]
  return snippet
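
A hedged sketch of the caller pattern this helper supports (the view name and POST field names are illustrative assumptions):

# Hypothetical caller of _get_snippet() above: status/result API views post a
# notebook/snippet pair, or just an operationId pointing at a history document.
import json

def check_status(request):
    notebook = json.loads(request.POST.get('notebook', '{}'))
    snippet = json.loads(request.POST.get('snippet', '{}'))
    operation_id = request.POST.get('operationId')

    # Falls back to the persisted document when only an operation id is given
    snippet = _get_snippet(request.user, notebook, snippet, operation_id)
    return snippet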
Example #48
def export_documents(request):
    if request.GET.get('documents'):
        selection = json.loads(request.GET.get('documents'))
    else:
        selection = json.loads(request.POST.get('documents'))

    include_history = request.GET.get('history', 'false') == 'true'

    # Only export documents the user has permissions to read
    docs = Document2.objects.documents(user=request.user, perms='both', include_history=True, include_trashed=True).\
      filter(id__in=selection).order_by('-id')

    # Add any dependencies to the set of exported documents
    export_doc_set = _get_dependencies(docs, include_history=include_history)

    # For directories, add any children docs to the set of exported documents
    export_doc_set.update(_get_dependencies(docs, deps_mode=False))

    # Get PKs of documents to export
    doc_ids = [doc.pk for doc in export_doc_set]
    num_docs = len(doc_ids)

    if len(selection) == 1 and num_docs >= len(selection) and docs[0].name:
        filename = docs[0].name
    else:
        filename = 'hue-documents-%s-(%s)' % (
            datetime.today().strftime('%Y-%m-%d'), num_docs)

    f = string_io()

    if doc_ids:
        doc_ids = ','.join(map(str, doc_ids))
        management.call_command('dumpdata',
                                'desktop.Document2',
                                primary_keys=doc_ids,
                                indent=2,
                                use_natural_foreign_keys=True,
                                verbosity=2,
                                stdout=f)

    if request.GET.get('format') == 'json':
        return JsonResponse(f.getvalue(), safe=False)
    elif request.GET.get('format') == 'zip':
        zfile = zipfile.ZipFile(f, 'w')
        zfile.writestr("hue.json", f.getvalue())
        for doc in docs:
            if doc.type == 'notebook':
                try:
                    from spark.models import Notebook
                    zfile.writestr("notebook-%s-%s.txt" % (doc.name, doc.id),
                                   smart_str(Notebook(document=doc).get_str()))
                except Exception as e:
                    LOG.exception(e)
        zfile.close()
        response = HttpResponse(content_type="application/zip")
        response["Content-Length"] = len(f.getvalue())
        response['Content-Disposition'] = 'attachment; filename="%s.zip"' % filename
        response.write(f.getvalue())
        return response
    else:
        return make_response(f.getvalue(), 'json', filename)
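
A hedged sketch of driving this export from a client (the endpoint path is an assumption; `documents`, `format`, and `history` are the parameters the view reads):

# Hypothetical client-side export of two documents as a zip archive.
import json
import requests

EXPORT_URL = 'https://hue.example.com/desktop/api2/doc/export'  # assumed path

resp = requests.get(EXPORT_URL, params={
    'documents': json.dumps([1001, 1002]),  # ids of the documents to export
    'format': 'zip',
    'history': 'false',  # set to 'true' to pull in dependent history docs
})
with open('hue-documents.zip', 'wb') as out:
    out.write(resp.content)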
Example #49
def get_document(request):
  """
  Returns the document or directory found for the given uuid or path and current user.
  If a directory is found, return any children documents too.
  Optional params:
    page=<n>    - Controls pagination. Defaults to 1.
    limit=<n>   - Controls limit per page. Defaults to all.
    type=<type> - Show documents of given type(s) (directory, query-hive, query-impala, query-mysql, etc). Default to all.
    sort=<key>  - Sort by the attribute <key>, which is one of:
                    "name", "type", "owner", "last_modified"
                  Accepts the form "-last_modified", which sorts in descending order.
                  Default to "-last_modified".
    text=<frag> - Search for fragment "frag" in names and descriptions.
    data=<false|true> - Return all the data of the document. Default to false.
    dependencies=<false|true> - Return all the dependencies and dependents of the document. Default to false.
  """
  path = request.GET.get('path', '/')
  uuid = request.GET.get('uuid')
  with_data = request.GET.get('data', 'false').lower() == 'true'
  with_dependencies = request.GET.get('dependencies', 'false').lower() == 'true'

  if uuid:
    if uuid.isdigit():
      document = Document2.objects.document(user=request.user, doc_id=uuid)
    else:
      document = Document2.objects.get_by_uuid(user=request.user, uuid=uuid)
  else:  # Find by path
    document = Document2.objects.get_by_path(user=request.user, path=path)

  response = {
    'document': document.to_dict(),
    'parent': document.parent_directory.to_dict() if document.parent_directory else None,
    'children': [],
    'dependencies': [],
    'dependents': [],
    'data': '',
    'status': 0
  }

  response['user_perms'] = {
    'can_read': document.can_read(request.user),
    'can_write': document.can_write(request.user)
  }

  if with_data:
    data = json.loads(document.data)
    # Upgrade session properties for Hive and Impala
    if document.type.startswith('query'):
      notebook = Notebook(document=document)
      notebook = upgrade_session_properties(request, notebook)
      data = json.loads(notebook.data)
      if data.get('uuid') != document.uuid: # Old format < 3.11
        data['uuid'] = document.uuid

    response['data'] = data

  if with_dependencies:
    response['dependencies'] = [dependency.to_dict() for dependency in document.dependencies.all()]
    response['dependents'] = [dependent.to_dict() for dependent in document.dependents.all()]

  # Get children documents if this is a directory
  if document.is_directory:
    directory = Directory.objects.get(id=document.id)

    # If this is the user's home directory, fetch shared docs too
    if document.is_home_directory:
      children = directory.get_children_and_shared_documents(user=request.user)
    else:
      children = directory.get_children_documents()

    # Filter and order results
    response.update(_filter_documents(request, queryset=children, flatten=False))

  # Paginate and serialize Results
  if 'documents' in response:
    response.update(_paginate(request, queryset=response['documents']))
    # Rename documents to children
    response['children'] = response.pop('documents')
    response['children'] = [doc.to_dict() for doc in response['children']]

  return JsonResponse(response)
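
The docstring above doubles as a small API reference; here is a hedged request sketch against it (endpoint path assumed, parameter names taken from the docstring):

# Hypothetical call to get_document() above.
import requests

DOC_URL = 'https://hue.example.com/desktop/api2/doc/get'  # assumed path

resp = requests.get(DOC_URL, params={
    'uuid': '8a20da5f-b69c-4843-b17d-dea5c74c41d1',  # or pass path=/my/dir
    'data': 'true',          # include the document body
    'dependencies': 'true',  # include dependencies and dependents
    'sort': '-last_modified',
}).json()
print(resp['document']['name'], resp['user_perms'])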
Example #50
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready',
                  files=None, functions=None, settings=None, is_saved=False, database='default', snippet_properties=None, batch_submit=False,
                  on_success_url=None):
  from notebook.connectors.hiveserver2 import HS2Api

  editor = Notebook()
  if snippet_properties is None:
    snippet_properties = {}

  if editor_type == 'hive':
    sessions_properties = HS2Api.get_properties(editor_type)
    if files is not None:
      _update_property_value(sessions_properties, 'files', files)

    if functions is not None:
      _update_property_value(sessions_properties, 'functions', functions)

    if settings is not None:
      _update_property_value(sessions_properties, 'settings', settings)
  elif editor_type == 'impala':
    sessions_properties = HS2Api.get_properties(editor_type)
    if settings is not None:
      _update_property_value(sessions_properties, 'settings', settings)
  elif editor_type == 'java':
    sessions_properties = [] # Java options
  else:
    sessions_properties = []

  data = {
    'name': name,
    'uuid': str(uuid.uuid4()),
    'description': description,
    'sessions': [
      {
         'type': editor_type,
         'properties': sessions_properties,
         'id': None
      }
    ],
    'selectedSnippet': editor_type,
    'type': 'query-%s' % editor_type,
    'showHistory': True,
    'isSaved': is_saved,
    'onSuccessUrl': on_success_url,
    'snippets': [
      {
         'status': status,
         'id': str(uuid.uuid4()),
         'statement_raw': statement,
         'statement': statement,
         'type': editor_type,
         'wasBatchExecuted': batch_submit,
         'properties': {
            'files': [] if files is None else files,
            'functions': [] if functions is None else functions,
            'settings': [] if settings is None else settings
         },
         'name': name,
         'database': database,
         'result': {'handle':{}},
         'variables': []
      }
    ]
  }

  if snippet_properties:
    data['snippets'][0]['properties'].update(snippet_properties)

  editor.data = json.dumps(data)

  return editor
Example #51
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = chardet.detect(stream.read(10000)).get('encoding')
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warn(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample": list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
                kafkaFieldNames = [
                    'id', 'additionalInfo', 'allowed', 'collectionName',
                    'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst',
                    'entityId', 'family', 'impersonator', 'ip', 'name',
                    'objectType', 'objType', 'objUsageType', 'operationParams',
                    'operationText', 'op', 'opText', 'path', 'perms',
                    'privilege', 'qualifier', 'QUERY_ID', 'resourcePath',
                    'service', 'SESSION_ID', 'solrVersion', 'src', 'status',
                    'subOperation', 'tableName', 'table', 'time', 'type',
                    'url', 'user'
                ]
                kafkaFieldTypes = ['string'] * len(kafkaFieldNames)
                kafkaFieldNames.append('timeDate')
                kafkaFieldTypes.append('date')
            else:
                # Note: mocked here, should come from SFDC or Kafka API or sampling job
                kafkaFieldNames = file_format.get('kafkaFieldNames',
                                                  '').split(',')
                kafkaFieldTypes = file_format.get('kafkaFieldTypes',
                                                  '').split(',')

            data = """%(kafkaFieldNames)s
%(data)s""" % {
                'kafkaFieldNames': ','.join(kafkaFieldNames),
                'data': '\n'.join(
                    [','.join(['...'] * len(kafkaFieldTypes))] * 5)
            }
            stream = string_io()
            stream.write(data)

            _convert_format(file_format["format"], inverse=True)

            indexer = MorphlineIndexer(request.user, request.fs)
            format_ = indexer.guess_field_types({
                "file": {
                    "stream": stream,
                    "name": file_format['path']
                },
                "format": file_format['format']
            })
            type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))

            for col in format_['columns']:
                col['keyType'] = type_mapping[col['name']]
                col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [{
                    'name': 'id',
                    'type': 'string',
                    'unique': True
                }, {
                    'name': 'client_ip',
                    'type': 'string'
                }, {
                    'name': 'time',
                    'type': 'date'
                }, {
                    'name': 'request',
                    'type': 'string'
                }, {
                    'name': 'code',
                    'type': 'plong'
                }, {
                    'name': 'bytes',
                    'type': 'plong'
                }, {
                    'name': 'method',
                    'type': 'string'
                }, {
                    'name': 'url',
                    'type': 'string'
                }, {
                    'name': 'protocol',
                    'type': 'string'
                }, {
                    'name': 'app',
                    'type': 'string'
                }, {
                    'name': 'subapp',
                    'type': 'string'
                }]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample": [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
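
A hedged sketch of a `fileFormat` payload for the Kafka stream branch above; the field names and types are illustrative, and in this mocked branch they come straight from the submitted form:

# Hypothetical 'stream' payload for guess_field_types() above (Kafka branch).
import json

kafka_payload = {
    'inputFormat': 'stream',
    'streamSelection': 'kafka',
    'kafkaSelectedTopics': 'web_logs_topic',       # anything but 'NavigatorAuditEvents'
    'kafkaFieldNames': 'id,client_ip,code,bytes',  # comma-separated; the view splits them
    'kafkaFieldTypes': 'string,string,plong,plong',
    'path': 'kafka://web_logs_topic',              # only used as the stream's display name
    'format': {'type': 'csv', 'hasHeader': True},  # handed to _convert_format()
}
body = {'fileFormat': json.dumps(kafka_payload)}  # POST body for the view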
Example #52
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        stream = request.fs.open(file_format["path"])
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": file_format['path']
            },
            "format": file_format['format']
        })
    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warn(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }

    return JsonResponse(format_)
Example #53
def make_notebook(name='Browse',
                  description='',
                  editor_type='hive',
                  statement='',
                  status='ready',
                  files=None,
                  functions=None,
                  settings=None,
                  is_saved=False,
                  database='default',
                  snippet_properties=None,
                  batch_submit=False):
    from notebook.connectors.hiveserver2 import HS2Api

    editor = Notebook()
    if snippet_properties is None:
        snippet_properties = {}

    if editor_type == 'hive':
        sessions_properties = HS2Api.get_properties(editor_type)
        if files is not None:
            _update_property_value(sessions_properties, 'files', files)

        if functions is not None:
            _update_property_value(sessions_properties, 'functions', functions)

        if settings is not None:
            _update_property_value(sessions_properties, 'settings', settings)
    elif editor_type == 'impala':
        sessions_properties = HS2Api.get_properties(editor_type)
        if settings is not None:
            _update_property_value(sessions_properties, 'settings', settings)
    elif editor_type == 'java':
        sessions_properties = []  # Java options
    else:
        sessions_properties = []

    data = {
        'name': name,
        'uuid': str(uuid.uuid4()),
        'description': description,
        'sessions': [{
            'type': editor_type,
            'properties': sessions_properties,
            'id': None
        }],
        'selectedSnippet': editor_type,
        'type': 'query-%s' % editor_type,
        'showHistory': True,
        'isSaved': is_saved,
        'snippets': [{
            'status': status,
            'id': str(uuid.uuid4()),
            'statement_raw': statement,
            'statement': statement,
            'type': editor_type,
            'wasBatchExecuted': batch_submit,
            'properties': {
                'files': [] if files is None else files,
                'functions': [] if functions is None else functions,
                'settings': [] if settings is None else settings
            },
            'name': name,
            'database': database,
            'result': {},
            'variables': []
        }]
    }

    if snippet_properties:
        data['snippets'][0]['properties'].update(snippet_properties)

    editor.data = json.dumps(data)

    return editor
Example #54
def _small_indexing(user, fs, client, source, destination, index_name):
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        path = urllib.unquote(source["path"])
        stats = fs.stats(path)
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)

    fields = indexer.get_field_list(destination['columns'])
    _create_solr_collection(user, fs, client, destination, index_name, kwargs)

    if source['inputFormat'] == 'file':
        path = urllib.unquote(source["path"])
        data = fs.read(path, 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        elif source['inputFormat'] == 'manual':
            pass  # No need to do anything
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception as e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))
        raise e  # Re-raise the original indexing error after the cleanup attempt
Example #55
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = check_encoding(stream.read(10000))
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warn(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample": list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            data = get_topic_data(request.user,
                                  file_format.get('kafkaSelectedTopics'))

            kafkaFieldNames = [col['name'] for col in data['full_headers']]
            kafkaFieldTypes = [col['type'] for col in data['full_headers']]
            topics_data = data['rows']

            format_ = {
                "sample": topics_data,
                "columns": [
                    Field(col, 'string', unique=False).to_dict()
                    for col in kafkaFieldNames
                ]
            }


#       data = """%(kafkaFieldNames)s
# %(data)s""" % {
#         'kafkaFieldNames': ','.join(kafkaFieldNames),
#         'data': '\n'.join([','.join(cols) for cols in topics_data])
#       }
#       stream = string_io()
#       stream.write(data)

#       _convert_format(file_format["format"], inverse=True)

#       indexer = MorphlineIndexer(request.user, request.fs)

#       format_ = indexer.guess_field_types({
#         "file": {
#             "stream": stream,
#             "name": file_format['path']
#         },
#         "format": file_format['format']
#       })
#       type_mapping = dict(
#         list(
#           zip(kafkaFieldNames, kafkaFieldTypes)
#         )
#       )

#       for col in format_['columns']:
#         col['keyType'] = type_mapping[col['name']]
#         col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [{
                    'name': 'id',
                    'type': 'string',
                    'unique': True
                }, {
                    'name': 'client_ip',
                    'type': 'string'
                }, {
                    'name': 'time',
                    'type': 'date'
                }, {
                    'name': 'request',
                    'type': 'string'
                }, {
                    'name': 'code',
                    'type': 'plong'
                }, {
                    'name': 'bytes',
                    'type': 'plong'
                }, {
                    'name': 'method',
                    'type': 'string'
                }, {
                    'name': 'url',
                    'type': 'string'
                }, {
                    'name': 'protocol',
                    'type': 'string'
                }, {
                    'name': 'app',
                    'type': 'string'
                }, {
                    'name': 'subapp',
                    'type': 'string'
                }]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample": [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)