Example #1
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = Indexer(request.user, request.fs)
        stream = request.fs.open(file_format["path"])
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": file_format['path']
            },
            "format": file_format['format']
        })
    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type,
                                                          'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format[
            'inputFormat'] == 'query':  # Only support open query history
        # TODO get schema from explain query, which is not possible
        notebook = Notebook(document=Document2.objects.get(
            id=file_format['query'])).get_data()
        snippet = notebook['snippets'][0]
        sample = get_api(request, snippet).fetch_result(notebook,
                                                        snippet,
                                                        4,
                                                        start_over=True)

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in sample.meta
            ]
        }

    return JsonResponse(format_)
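
All of these examples build Solr/Hive column descriptors with Field(name, type) and serialize them with to_dict(), sometimes passing an operations list (see Example #18) or a unique keyword. As a point of reference, below is a minimal stand-in sketch of that interface, inferred only from the calls shown on this page; the real class is indexer.fields.Field and its actual defaults and dict keys may differ.

# Minimal stand-in sketch of the Field interface these examples rely on,
# inferred from the calls on this page. Illustrative only; the real class
# lives in indexer.fields and may carry more attributes and logic.
class Field(object):
    def __init__(self, name='new_field', field_type_name='string', operations=None, unique=False):
        self.name = name
        self.field_type_name = field_type_name
        self.operations = operations or []
        self.unique = unique

    def to_dict(self):
        # Keys below are assumed for illustration, not taken from the source.
        return {
            'name': self.name,
            'type': self.field_type_name,
            'operations': self.operations,
            'unique': self.unique,
        }


# Usage mirroring the examples: serialize a couple of columns for a format_ payload.
columns = [Field('id', 'long').to_dict(), Field('message', 'text_general').to_dict()]
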
Example #2
    def test_generate_geo_ip_morphline(self):
        geo_ip_dict = get_operator('geo_ip').get_default_operation()

        geo_ip_dict['fields'] = [
            Field("test_field_1", "string").to_dict(),
            Field("test_field_2", "string").to_dict()
        ]

        self._test_generate_field_operation_morphline(geo_ip_dict)
Example #3
  def test_generate_extract_uri_components_operation_morphline(self):
    extract_uri_dict = get_operator('extract_uri_components').get_default_operation()

    extract_uri_dict['fields'] = [
        Field("test_field_1", "string").to_dict(),
        Field("test_field_2", "string").to_dict()
      ]

    self._test_generate_field_operation_morphline(extract_uri_dict)
Example #4
    def test_generate_grok_operation_morphline(self):
        grok_dict = get_operator('grok').get_default_operation()

        grok_dict['fields'] = [
            Field("test_field_1", "string").to_dict(),
            Field("test_field_2", "string").to_dict()
        ]

        self._test_generate_field_operation_morphline(grok_dict)
Example #5
    def test_generate_split_operation_morphline(self):
        split_dict = get_operator('split').get_default_operation()

        split_dict['fields'] = [
            Field("test_field_1", "string").to_dict(),
            Field("test_field_2", "string").to_dict()
        ]

        self._test_generate_field_operation_morphline(split_dict)
Example #6
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    # Only support non expired query history. Otherwise would need to get schema without executing a query.
    notebook = Notebook(document=Document2.objects.document(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
        "sample": sample['rows'][:4],
        "sample_cols": sample.meta,
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in sample.meta
        ]
    }
  elif file_format['inputFormat'] == 'rdbms':
    query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    db = rdbms.get(request.user, query_server=query_server)
    sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data(mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName'])
    table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False)

    format_ = {
        "sample": list(sample['rows'])[:4],
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in table_metadata
        ]
    }

  return JsonResponse(format_)
Example #7
  def test_generate_translate_morphline(self):
    translate_dict = get_operator('translate').get_default_operation()

    translate_dict['fields'] = [
      Field("test_field_1", "string").to_dict(),
      Field("test_field_2", "string").to_dict()
    ]

    translate_dict['settings']['mapping'].append({"key":"key","value":"value"})

    self._test_generate_field_operation_morphline(translate_dict)
Example #8
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']

    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in columns['extended_columns']
      ]
    format_ = {
        "sample": sample,
        "columns": columns,
    }
Example #9
def _importer(request, prefill):
    source_type = request.GET.get('sourceType') or get_cluster_config(
        request.user)['default_sql_interpreter']

    return render(
        'importer.mako', request, {
            'is_embeddable':
            request.GET.get('is_embeddable', False),
            'fields_json':
            json.dumps({
                'solr': [field.name for field in FIELD_TYPES],
                'hive': HIVE_TYPES,
                'hivePrimitive': HIVE_PRIMITIVE_TYPES
            }),
            'operators_json':
            json.dumps([operator.to_dict() for operator in OPERATORS]),
            'file_types_json':
            json.dumps([
                format_.format_info()
                for format_ in get_file_indexable_format_types()
            ]),
            'default_field_type':
            json.dumps(Field().to_dict()),
            'prefill':
            json.dumps(prefill),
            'source_type':
            source_type
        })
Example #10
def indexer(request):
    if not request.user.has_hue_permission(action="access", app='search'):
        raise PopupException(_('Missing permission.'), error_code=403)

    searcher = SolrClient(request.user)
    indexes = searcher.get_indexes()

    for index in indexes:
        index['isSelected'] = False

    return render(
        'indexer.mako', request, {
            'is_embeddable':
            request.GET.get('is_embeddable', False),
            'indexes_json':
            json.dumps(indexes),
            'fields_json':
            json.dumps([field.name for field in FIELD_TYPES]),
            'operators_json':
            json.dumps([operator.to_dict() for operator in OPERATORS]),
            'file_types_json':
            json.dumps([
                format_.format_info()
                for format_ in get_file_indexable_format_types()
            ]),
            'default_field_type':
            json.dumps(Field().to_dict())
        })
Example #11
 def __init__(self):
     self._fields = [
         Field("timestamp", "string"),
         Field("timestamp8601", "string"),
         Field("facility", "string"),
         Field("priority", "string"),
         Field("logsource", "string"),
         Field("program", "string"),
         Field("pid", "string"),
         Field("message", "text_general"),
     ]
Example #12
def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
  snippet = notebook['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'])
  elif action == 'insert_as_query':
    sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'])
  elif action == 'index_query':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='')
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')

    sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True)

    from indexer.api3 import _index # Will be moved to the lib in next commit
    from indexer.file_format import HiveFormat
    from indexer.fields import Field

    file_format = {
        'name': 'col',
        'inputFormat': 'query',
        'format': {'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001'},
        "sample": '',
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in sample['meta']
        ]
    }

    job_handle = _index(request, file_format, destination, query=notebook['uuid'])
    return redirect(reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']}))
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
          'editor_type': editor_type,
          'success_url': success_url
      }),
      'editor_type': editor_type,
  })
Example #13
  def _guess_fields(self, sample):
    header = self._guess_field_names(sample)
    types = self._guess_field_types(self._sample_rows)

    if len(header) == len(types):
      # create the fields
      fields = [Field(header[i], types[i]) for i in range(len(header))]
    else:
      # likely failed to guess correctly
      LOG.warn("Guess field types failed - number of headers didn't match number of predicted types.")
      fields = []

    return fields
Example #14
 def __init__(self):
     self._fields = [
         Field("timestamp", "string"),
         Field("pid", "long"),
         Field("loglevel", "string"),
         Field("progname", "string"),
         Field("message", "text_general"),
         Field("field_line", "text_general")
     ]
Example #15
  def get_instance(cls, file_stream, format_):
    sample_data, sample_lines = cls._get_sample(file_stream)

    fields = []

    for field in format_["fields"]:
      fields.append(Field(
        name=field["name"],
        field_type_name=cls.FIELD_TYPE_TRANSLATE.get(field['type'], 'string')
      ))

    return cls(**{
      "delimiter": ',',
      "line_terminator": '\n',
      "quote_char": '"',
      "has_header": False,
      "sample": sample_data,
      "fields": format_["fields"]
    })
Example #16
def indexer(request):
    searcher = IndexController(request.user)
    indexes = searcher.get_indexes()

    for index in indexes:
        index['isSelected'] = False

    return render(
        'indexer.mako', request, {
            'indexes_json':
            json.dumps(indexes),
            'fields_json':
            json.dumps([field.name for field in FIELD_TYPES]),
            'operators_json':
            json.dumps([operator.to_dict() for operator in OPERATORS]),
            'file_types_json':
            json.dumps(
                [format_.format_info() for format_ in get_format_types()]),
            'default_field_type':
            json.dumps(Field().to_dict())
        })
Example #17
          for col in columns['extended_columns']
      ]
    format_ = {
        "sample": sample,
        "columns": columns,
    }
  elif file_format['inputFormat'] == 'rdbms':
    query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    db = rdbms.get(request.user, query_server=query_server)
    sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data(mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName'])
    table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False)

    format_ = {
        "sample": list(sample['rows'])[:4],
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in table_metadata
        ]
    }
  elif file_format['inputFormat'] == 'stream':
    # Note: mocked here, should come from SFDC or Kafka API or sampling job
    if file_format['streamSelection'] == 'kafka':
      data = """%(kafkaFieldNames)s
%(data)s""" % {
        'kafkaFieldNames': file_format.get('kafkaFieldNames', ''),
        'data': '\n'.join([','.join(['...'] * len(file_format.get('kafkaFieldTypes', '').split(',')))] * 5)
      }
      stream = StringIO.StringIO()
      stream.write(data)

      _convert_format(file_format["format"], inverse=True)
Example #18
    def __init__(self):
        geo_ip_operation = get_operator("geo_ip").get_default_operation()

        geo_ip_operation['settings']["/country/names/en"] = True
        geo_ip_operation['settings']["/city/names/en"] = True
        geo_ip_operation['settings']["/location/latitude"] = True
        geo_ip_operation['settings']["/location/longitude"] = True

        geo_ip_operation['fields'] += [
            Field("country", "string").to_dict(),
            Field("city", "string").to_dict(),
            Field("latitude", "double").to_dict(),
            Field("longitude", "double").to_dict()
        ]

        self._fields = [
            Field("date", "date"),
            Field("component", "string"),
            Field("log_level", "string"),
            Field("details", "string"),
            Field("message", "text_general"),
            Field("ip", "string", [geo_ip_operation]),
            Field("user", "string"),
            Field("http_method", "string"),
            Field("path", "string"),
            Field("protocol", "string")
        ]
Example #19
 def __init__(self):
     self._fields = [
         Field("clientip", "string"),
         Field("ident", "string"),
         Field("auth", "string"),
         Field("timestamp", "date"),
         Field("verb", "string"),
         Field("request", "string"),
         Field("httpversion", "double"),
         Field("rawrequest", "long"),
         Field("response", "long"),
         Field("bytes", "long"),
         Field("referrer", "string"),
         Field("field_line", "text_general")
     ]
Example #20
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = chardet.detect(stream.read(10000)).get('encoding')
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type,
                                                          'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warn(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample":
            list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            if file_format.get(
                    'kafkaSelectedTopics') == 'NavigatorAuditEvents':
                kafkaFieldNames = [
                    'id', 'additionalInfo', 'allowed', 'collectionName',
                    'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst',
                    'entityId', 'family', 'impersonator', 'ip', 'name',
                    'objectType', 'objType', 'objUsageType', 'operationParams',
                    'operationText', 'op', 'opText', 'path', 'perms',
                    'privilege', 'qualifier', 'QUERY_ID', 'resourcePath',
                    'service', 'SESSION_ID', 'solrVersion', 'src', 'status',
                    'subOperation', 'tableName', 'table', 'time', 'type',
                    'url', 'user'
                ]
                kafkaFieldTypes = ['string'] * len(kafkaFieldNames)
                kafkaFieldNames.append('timeDate')
                kafkaFieldTypes.append('date')
            else:
                # Note: mocked here, should come from SFDC or Kafka API or sampling job
                kafkaFieldNames = file_format.get('kafkaFieldNames',
                                                  '').split(',')
                kafkaFieldTypes = file_format.get('kafkaFieldTypes',
                                                  '').split(',')

            data = """%(kafkaFieldNames)s
%(data)s""" % {
                'kafkaFieldNames': ','.join(kafkaFieldNames),
                'data': '\n'.join(
                    [','.join(['...'] * len(kafkaFieldTypes))] * 5)
            }
            stream = string_io()
            stream.write(data)

            _convert_format(file_format["format"], inverse=True)

            indexer = MorphlineIndexer(request.user, request.fs)
            format_ = indexer.guess_field_types({
                "file": {
                    "stream": stream,
                    "name": file_format['path']
                },
                "format": file_format['format']
            })
            type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))

            for col in format_['columns']:
                col['keyType'] = type_mapping[col['name']]
                col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [{
                    'name': 'id',
                    'type': 'string',
                    'unique': True
                }, {
                    'name': 'client_ip',
                    'type': 'string'
                }, {
                    'name': 'time',
                    'type': 'date'
                }, {
                    'name': 'request',
                    'type': 'string'
                }, {
                    'name': 'code',
                    'type': 'plong'
                }, {
                    'name': 'bytes',
                    'type': 'plong'
                }, {
                    'name': 'method',
                    'type': 'string'
                }, {
                    'name': 'url',
                    'type': 'string'
                }, {
                    'name': 'protocol',
                    'type': 'string'
                }, {
                    'name': 'app',
                    'type': 'string'
                }, {
                    'name': 'subapp',
                    'type': 'string'
                }]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample":
                [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
Example #21
      columns = [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in columns['extended_columns']
      ]
    format_ = {
        "sample": sample,
        "columns": columns,
    }
  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
          Field(col['name'], col['type']).to_dict()
          for col in sample['full_headers']
      ]
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
        kafkaFieldNames = [
          'id',
          'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db',
          'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator',
          'ip', 'name', 'objectType', 'objType', 'objUsageType',
          'operationParams', 'operationText', 'op', 'opText', 'path',
          'perms', 'privilege', 'qualifier', 'QUERY_ID', 'resourcePath',
          'service', 'SESSION_ID', 'solrVersion', 'src', 'status',
          'subOperation', 'tableName', 'table', 'time', 'type',
Example #22
def execute_and_watch(request):
  notebook_id = request.GET.get('editor', request.GET.get('notebook'))
  snippet_id = int(request.GET['snippet'])
  action = request.GET['action']
  destination = request.GET['destination']

  notebook = Notebook(document=Document2.objects.get(id=notebook_id)).get_data()
  snippet = notebook['snippets'][snippet_id]
  editor_type = snippet['type']

  api = get_api(request, snippet)

  if action == 'save_as_table':
    sql, success_url = api.export_data_as_table(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'])
  elif action == 'insert_as_query':
    # TODO: checks/workarounds in case of non impersonation or Sentry
    # TODO: keep older simpler way in case of known not many rows?
    sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url)
  elif action == 'index_query':
    if destination == '__hue__':
      destination = _get_snippet_name(notebook, unique=True, table_format=True)
      live_indexing = True
    else:
      live_indexing = False

    sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='')
    editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute')

    sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True)

    from indexer.api3 import _index # Will be moved to the lib
    from indexer.file_format import HiveFormat
    from indexer.fields import Field

    file_format = {
        'name': 'col',
        'inputFormat': 'query',
        'format': {'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001'},
        "sample": '',
        "columns": [
            Field(col['name'].rsplit('.')[-1], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in sample['meta']
        ]
    }

    if live_indexing:
      file_format['inputFormat'] = 'hs2_handle'
      file_format['fetch_handle'] = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)

    job_handle = _index(request, file_format, destination, query=notebook['uuid'])

    if live_indexing:
      return redirect(reverse('search:browse', kwargs={'name': destination}))
    else:
      return redirect(reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']}))
  else:
    raise PopupException(_('Action %s is unknown') % action)

  return render('editor.mako', request, {
      'notebooks_json': json.dumps([editor.get_data()]),
      'options_json': json.dumps({
          'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}],
          'mode': 'editor',
          'editor_type': editor_type,
          'success_url': success_url
      }),
      'editor_type': editor_type,
  })
Example #23
def export_result(request):
    response = {'status': -1, 'message': _('Exporting result failed.')}

    # Passed by check_document_access_permission but unused by APIs
    notebook = json.loads(request.POST.get('notebook', '{}'))
    snippet = json.loads(request.POST.get('snippet', '{}'))
    data_format = json.loads(request.POST.get('format', '"hdfs-file"'))
    destination = json.loads(request.POST.get('destination', '""'))
    overwrite = json.loads(request.POST.get('overwrite', 'false'))
    is_embedded = json.loads(request.POST.get('is_embedded', 'false'))

    api = get_api(request, snippet)

    if data_format == 'hdfs-file':  # Blocking operation, like downloading
        if request.fs.isdir(destination):
            if notebook.get('name'):
                destination += '/%(name)s.csv' % notebook
            else:
                destination += '/%(type)s-%(id)s.csv' % notebook
        if overwrite and request.fs.exists(destination):
            request.fs.do_as_user(request.user.username, request.fs.rmtree,
                                  destination)
        response['watch_url'] = api.export_data_as_hdfs_file(
            snippet, destination, overwrite)
        response['status'] = 0
        request.audit = {
            'operation':
            'EXPORT',
            'operationText':
            'User %s exported to HDFS destination: %s' %
            (request.user.username, destination),
            'allowed':
            True
        }
    elif data_format == 'hive-table':
        if is_embedded:
            sql, success_url = api.export_data_as_table(
                notebook, snippet, destination)

            task = make_notebook(name=_('Export %s query to table %s') %
                                 (snippet['type'], destination),
                                 description=_('Query %s to %s') %
                                 (_get_snippet_name(notebook), success_url),
                                 editor_type=snippet['type'],
                                 statement=sql,
                                 status='ready-execute',
                                 database=snippet['database'],
                                 on_success_url=success_url,
                                 is_task=True)
            response = task.execute(request)
        else:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))
            response['watch_url'] = reverse(
                'notebook:execute_and_watch'
            ) + '?action=save_as_table&notebook=' + str(
                notebook_id) + '&snippet=0&destination=' + destination
            response['status'] = 0
        request.audit = {
            'operation':
            'EXPORT',
            'operationText':
            'User %s exported to Hive table: %s' %
            (request.user.username, destination),
            'allowed':
            True
        }
    elif data_format == 'hdfs-directory':
        if is_embedded:
            sql, success_url = api.export_large_data_to_hdfs(
                notebook, snippet, destination)

            task = make_notebook(name=_('Export %s query to directory') %
                                 snippet['type'],
                                 description=_('Query %s to %s') %
                                 (_get_snippet_name(notebook), success_url),
                                 editor_type=snippet['type'],
                                 statement=sql,
                                 status='ready-execute',
                                 database=snippet['database'],
                                 on_success_url=success_url,
                                 is_task=True)
            response = task.execute(request)
        else:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))
            response['watch_url'] = reverse(
                'notebook:execute_and_watch'
            ) + '?action=insert_as_query&notebook=' + str(
                notebook_id) + '&snippet=0&destination=' + destination
            response['status'] = 0
        request.audit = {
            'operation':
            'EXPORT',
            'operationText':
            'User %s exported to HDFS directory: %s' %
            (request.user.username, destination),
            'allowed':
            True
        }
    elif data_format == 'search-index':
        if is_embedded:
            if destination == '__hue__':
                destination = _get_snippet_name(notebook,
                                                unique=True,
                                                table_format=True)
                live_indexing = True
            else:
                live_indexing = False

            sample = get_api(request, snippet).fetch_result(notebook,
                                                            snippet,
                                                            0,
                                                            start_over=True)

            from indexer.api3 import _index  # Will be moved to the lib
            from indexer.file_format import HiveFormat
            from indexer.fields import Field

            file_format = {
                'name':
                'col',
                'inputFormat':
                'query',
                'format': {
                    'quoteChar': '"',
                    'recordSeparator': '\n',
                    'type': 'csv',
                    'hasHeader': False,
                    'fieldSeparator': '\u0001'
                },
                "sample":
                '',
                "columns": [
                    Field(
                        col['name'].rsplit('.')[-1],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')).to_dict()
                    for col in sample['meta']
                ]
            }

            if live_indexing:
                file_format['inputFormat'] = 'hs2_handle'
                file_format['fetch_handle'] = lambda rows, start_over: get_api(
                    request, snippet).fetch_result(
                        notebook, snippet, rows=rows, start_over=start_over)
                response['rowcount'] = _index(request,
                                              file_format,
                                              destination,
                                              query=notebook['uuid'])
                response['watch_url'] = reverse('search:browse',
                                                kwargs={'name': destination})
                response['status'] = 0
            else:
                response = _index(request,
                                  file_format,
                                  destination,
                                  query=notebook['uuid'])
        else:
            notebook_id = notebook['id'] or request.GET.get(
                'editor', request.GET.get('notebook'))
            response['watch_url'] = reverse(
                'notebook:execute_and_watch'
            ) + '?action=index_query&notebook=' + str(
                notebook_id) + '&snippet=0&destination=' + destination
            response['status'] = 0
        request.audit = {
            'operation':
            'EXPORT',
            'operationText':
            'User %s exported to Search index: %s' %
            (request.user.username, destination),
            'allowed':
            True
        }

    return JsonResponse(response)
Example #24
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = check_encoding(stream.read(10000))
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type,
                                                          'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warning(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample":
            list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            data = get_topic_data(request.user,
                                  file_format.get('kafkaSelectedTopics'))

            kafkaFieldNames = [col['name'] for col in data['full_headers']]
            kafkaFieldTypes = [col['type'] for col in data['full_headers']]
            topics_data = data['rows']

            format_ = {
                "sample":
                topics_data,
                "columns": [
                    Field(col, 'string', unique=False).to_dict()
                    for col in kafkaFieldNames
                ]
            }


#       data = """%(kafkaFieldNames)s
# %(data)s""" % {
#         'kafkaFieldNames': ','.join(kafkaFieldNames),
#         'data': '\n'.join([','.join(cols) for cols in topics_data])
#       }
#       stream = string_io()
#       stream.write(data)

#       _convert_format(file_format["format"], inverse=True)

#       indexer = MorphlineIndexer(request.user, request.fs)

#       format_ = indexer.guess_field_types({
#         "file": {
#             "stream": stream,
#             "name": file_format['path']
#         },
#         "format": file_format['format']
#       })
#       type_mapping = dict(
#         list(
#           zip(kafkaFieldNames, kafkaFieldTypes)
#         )
#       )

#       for col in format_['columns']:
#         col['keyType'] = type_mapping[col['name']]
#         col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [{
                    'name': 'id',
                    'type': 'string',
                    'unique': True
                }, {
                    'name': 'client_ip',
                    'type': 'string'
                }, {
                    'name': 'time',
                    'type': 'date'
                }, {
                    'name': 'request',
                    'type': 'string'
                }, {
                    'name': 'code',
                    'type': 'plong'
                }, {
                    'name': 'bytes',
                    'type': 'plong'
                }, {
                    'name': 'method',
                    'type': 'string'
                }, {
                    'name': 'url',
                    'type': 'string'
                }, {
                    'name': 'protocol',
                    'type': 'string'
                }, {
                    'name': 'app',
                    'type': 'string'
                }, {
                    'name': 'subapp',
                    'type': 'string'
                }]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample":
                [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
Example #25
def export_result(request):
  response = {'status': -1, 'message': _('Success')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', '"hdfs-file"'))
  destination = urllib.unquote(json.loads(request.POST.get('destination', '""')))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))
  is_embedded = json.loads(request.POST.get('is_embedded', 'false'))
  start_time = json.loads(request.POST.get('start_time', '-1'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file': # Blocking operation, like downloading
    if request.fs.isdir(destination):
      if notebook.get('name'):
        destination += '/%(name)s.csv' % notebook
      else:
        destination += '/%(type)s-%(id)s.csv' % notebook
    if overwrite and request.fs.exists(destination):
      request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'hive-table':
    if is_embedded:
      sql, success_url = api.export_data_as_table(notebook, snippet, destination)

      task = make_notebook(
        name=_('Export %s query to table %s') % (snippet['type'], destination),
        description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
        editor_type=snippet['type'],
        statement=sql,
        status='ready',
        database=snippet['database'],
        on_success_url=success_url,
        last_executed=start_time,
        is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format == 'hdfs-directory':
    if is_embedded:
      sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)

      task = make_notebook(
        name=_('Export %s query to directory') % snippet['type'],
        description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
        editor_type=snippet['type'],
        statement=sql,
        status='ready-execute',
        database=snippet['database'],
        on_success_url=success_url,
        last_executed=start_time,
        is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination),
      'allowed': True
    }
  elif data_format in ('search-index', 'dashboard'):
    # Open the result in the Dashboard via a SQL sub-query or the Import wizard (quick vs scalable)
    if is_embedded:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))

      if data_format == 'dashboard':
        engine = notebook['type'].replace('query-', '')
        response['watch_url'] = reverse('dashboard:browse', kwargs={'name': notebook_id}) + '?source=query&engine=%(engine)s' % {'engine': engine}
        response['status'] = 0
      else:
        sample = get_api(request, snippet).fetch_result(notebook, snippet, rows=4, start_over=True)
        for col in sample['meta']:
          col['type'] = HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')

        response['status'] = 0
        response['id'] = notebook_id
        response['name'] = _get_snippet_name(notebook)
        response['source_type'] = 'query'
        response['target_type'] = 'index'
        response['target_path'] = destination
        response['sample'] = list(sample['data'])
        response['columns'] = [
            Field(col['name'], col['type']).to_dict() for col in sample['meta']
        ]
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + '&snippet=0&destination=' + destination
      response['status'] = 0

    if response.get('status') != 0:
      response['message'] = _('Exporting result failed.')

  return JsonResponse(response)
Example #26
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib.unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = chardet.detect(stream.read(10000)).get('encoding')
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": path
        },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']

    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in columns['extended_columns']
      ]
    format_ = {
        "sample": sample,
        "columns": columns,
    }