Example #1
def _index(request, file_format, collection_name, query=None):
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif file_format['inputFormat'] == 'hs2_handle':
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)
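A hedged sketch of the file_format dict that _index() above appears to consume, inferred only from the keys it reads ('inputFormat', 'path', 'columns', 'databaseName'/'tableName'); the concrete field names and paths are illustrative, not taken from Hue:

# Hypothetical payload; only the keys are grounded in the code above.
example_file_format = {
    'inputFormat': 'file',                # 'table' and 'hs2_handle' are also handled above
    'path': '/user/hue/data/logs.csv',    # illustrative HDFS path
    'columns': [{'name': 'message', 'type': 'string'}],
}
# _index(request, example_file_format, 'logs_demo') would then ensure the
# collection exists and submit the morphline job for that file.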
Example #2
File: api.py Project: dulems/hue-1
def autocomplete(request):
  searcher = CollectionManagerController(request.user)
  autocomplete = searcher.get_autocomplete()

  massaged_collections = []

  for collection in autocomplete['collections']:
    massaged_collections.append({
      'name': collection,
      'isCollection': True,
      'isConfig': False,
    })

  for config in autocomplete['configs']:
    massaged_collections.append({
      'name': config,
      'isCollection': False,
      'isConfig': True,
    })

  response = {
    'status': 0,
    'collections': massaged_collections
  }

  return JsonResponse(response)
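For reference, the JSON emitted by autocomplete() above has the following shape; the collection and config names are made up:

# Illustrative response shape, matching the dicts built in the view above.
expected = {
    'status': 0,
    'collections': [
        {'name': 'tweets_demo', 'isCollection': True, 'isConfig': False},
        {'name': 'managedTemplate', 'isCollection': False, 'isConfig': True},
    ],
}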
Example #3
File: api.py Project: zlcken/hue
def collections_import(request):
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}

    collection = json.loads(request.POST.get('collection', '{}'))

    if collection:
        searcher = CollectionManagerController(request.user)
        unique_key, fields = searcher.get_fields(collection.get('name'))

        # Create collection and metadata.
        hue_collection, created = Collection.objects.get_or_create(
            name=collection.get('name'),
            solr_properties='{}',
            is_enabled=True,
            user=request.user)
        properties_dict = hue_collection.properties_dict
        properties_dict['data_type'] = 'separated'
        properties_dict['field_order'] = [field_name for field_name in fields]
        hue_collection.properties = json.dumps(properties_dict)
        hue_collection.save()

        response['status'] = 0
        response['message'] = _('Collection created!')
    else:
        response['message'] = _('Collection missing.')

    return JsonResponse(response)
Example #4
File: api.py Project: zlcken/hue
def collections_data(request, collection):
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}

    source = request.POST.get('source')

    if source == 'file':
        searcher = CollectionManagerController(request.user)

        searcher.update_data_from_hdfs(
            request.fs,
            collection,
            None,
            request.POST.get('path'),
            request.POST.get('type'),
            separator=request.POST.get('separator'),
            quote_character=request.POST.get('quote'))

        response['status'] = 0
        response['message'] = _('Index imported!')
    else:
        response['message'] = _('Unsupported source %s') % source

    return JsonResponse(response)
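A sketch of posting to collections_data() above for the 'file' source with Django's test client; the URL is hypothetical and authentication is omitted, but the parameter names mirror what the view reads:

import json
from django.test import Client

client = Client()
resp = client.post('/indexer/api/collections/logs_demo/data/', {  # URL is illustrative
    'source': 'file',
    'path': '/user/hue/data/logs.csv',
    'type': 'separated',
    'separator': ',',
    'quote': '"',
})
print(json.loads(resp.content))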
Example #5
File: api.py Project: zlcken/hue
def collections_remove(request):
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}

    collections = json.loads(request.POST.get('collections', '[]'))

    if not collections:
        response['message'] = _('No collections to remove.')

    if response.get('message', None) is None:
        searcher = CollectionManagerController(request.user)
        solr_collections = searcher.get_collections()

        for collection in collections:
            if collection.get('name') in solr_collections:
                # Remove collection and instancedir
                searcher.delete_collection(collection.get('name'),
                                           collection.get('isCoreOnly'))

        response['status'] = 0
        response['message'] = _('Collections removed!')

    return JsonResponse(response)
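The 'collections' parameter read by collections_remove() above is a JSON-encoded list; a minimal sketch of building it (the collection name is illustrative):

import json

# Each entry carries the two keys the view reads: 'name' and 'isCoreOnly'.
payload = {
    'collections': json.dumps([
        {'name': 'logs_demo', 'isCoreOnly': False},
    ])
}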
Example #6
def index_file(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    _convert_format(file_format["format"], inverse=True)
    collection_name = file_format["name"]
    indexer = Indexer(request.user, request.fs)
    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        schema_fields += [{"name": unique_field, "type": "string"}]

    morphline = indexer.generate_morphline_config(collection_name, file_format,
                                                  unique_field)

    collection_manager = CollectionManagerController(request.user)
    if not collection_manager.collection_exists(collection_name):
        collection_manager.create_collection(collection_name,
                                             schema_fields,
                                             unique_key_field=unique_field)

    job_id = indexer.run_morphline(collection_name, morphline,
                                   file_format["path"])

    return JsonResponse({"jobId": job_id})
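index_file() above reads a JSON-encoded 'fileFormat' POST parameter; a hedged sketch of submitting one with Django's test client (the URL, the 'format' sub-dict shape, and the values are assumptions, only the top-level keys come from the view):

import json
from django.test import Client

file_format = {
    'name': 'logs_demo',                  # becomes the collection name
    'format': {'type': 'csv'},            # passed through _convert_format(); shape assumed
    'path': '/user/hue/data/logs.csv',
    'columns': [{'name': 'message', 'type': 'string'}],
}
resp = Client().post('/indexer/api/index/file/',            # URL is illustrative
                     {'fileFormat': json.dumps(file_format)})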
Example #7
    def test_end_to_end(self):
        if not is_live_cluster():
            raise SkipTest()

        cluster = shared_cluster()
        fs = cluster.fs
        collection_name = "test_collection"
        indexer = Indexer("test", fs=fs, jt=cluster.jt)
        input_loc = "/tmp/test.csv"

        # upload the test file to hdfs
        fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

        # open a filestream for the file on hdfs
        stream = fs.open(input_loc)

        # guess the format of the file
        file_type_format = indexer.guess_format(
            {'file': {
                "stream": stream,
                "name": "test.csv"
            }})

        field_types = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": "test.csv"
            },
            "format": file_type_format
        })

        format_ = field_types.copy()
        format_['format'] = file_type_format

        # find a field name available to use for the record's uuid
        unique_field = indexer.get_unique_field(format_)
        is_unique_generated = indexer.is_unique_generated(format_)

        # generate morphline
        morphline = indexer.generate_morphline_config(collection_name, format_,
                                                      unique_field)

        schema_fields = indexer.get_kept_field_list(format_['columns'])
        if is_unique_generated:
            schema_fields += [{"name": unique_field, "type": "string"}]

        # create the collection from the specified fields
        collection_manager = CollectionManagerController("test")
        if collection_manager.collection_exists(collection_name):
            collection_manager.delete_collection(collection_name, None)
        collection_manager.create_collection(collection_name,
                                             schema_fields,
                                             unique_key_field=unique_field)

        # index the file
        indexer.run_morphline(collection_name, morphline, input_loc)
Example #8
    def test_collections_fields(self):
        db = CollectionManagerController(self.user)

        db.get_fields('log_analytics_demo')
        resp = self.client.post(reverse('indexer:install_examples'))
        content = json.loads(resp.content)

        assert_equal(content.get('status'), 0)
        assert_equal(content.get('fields'), 0)
        assert_equal(content.get('unique_key'), 0)
Example #9
def collections_create(request):
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}

    collection = json.loads(request.POST.get('collection', '{}'))

    if collection:
        searcher = CollectionManagerController(request.user)

        # Create instance directory, collection, and add fields
        searcher.create_collection(collection.get('name'),
                                   collection.get('fields', []),
                                   collection.get('uniqueKeyField'),
                                   collection.get('df'))

        try:
            if request.POST.get('source') == 'file':
                # Index data
                searcher.update_data_from_hdfs(
                    request.fs,
                    collection.get('name'),
                    collection.get('fields', []),
                    request.POST.get('path'),
                    request.POST.get('type'),
                    separator=request.POST.get('separator'),
                    quote_character=request.POST.get('quote'))

            elif request.POST.get('source') == 'hive':
                # Run a custom hive query and post data to collection
                from beeswax.server import dbms

                db = dbms.get(request.user)

                database = request.POST.get('database')
                table = request.POST.get('table')
                columns = [
                    field['name'] for field in collection.get('fields', [])
                ]

                searcher.update_data_from_hive(db, collection.get('name'),
                                               database, table,
                                               columns)  # Not up to date

            response['status'] = 0
            response['message'] = _('Collection created!')
        except Exception as e:
            LOG.error(e)
            raise
    else:
        response['message'] = _('Collection missing.')

    return JsonResponse(response)
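A sketch of the POST body collections_create() above handles for the 'file' source; the parameter names mirror the view, while the names, types, and path are illustrative:

import json

payload = {
    'collection': json.dumps({
        'name': 'logs_demo',
        'fields': [{'name': 'message', 'type': 'text_general'}],
        'uniqueKeyField': 'id',
        'df': 'text',
    }),
    'source': 'file',                     # 'hive' would instead use 'database' and 'table'
    'path': '/user/hue/data/logs.csv',
    'type': 'separated',
    'separator': ',',
    'quote': '"',
}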
Example #10
    def test_create_collection(self):
        db = CollectionManagerController(self.user)

        name = get_db_prefix(name='solr') + 'test_create_collection'
        fields = [{'name': 'my_test', 'type': 'text'}]

        try:
            db.create_collection(name,
                                 fields,
                                 unique_key_field='id',
                                 df='text')
        finally:
            db.delete_collection(name, core=False)
Example #11
File: api.py Project: dulems/hue-1
def collections_fields(request, collection):
  if request.method != 'GET':
    raise PopupException(_('GET request required.'))

  response = {}

  searcher = CollectionManagerController(request.user)
  unique_key, fields = searcher.get_fields(collection)

  response['status'] = 0
  response['fields'] = [(field, fields[field]['type'], fields[field].get('indexed', None), fields[field].get('stored', None)) for field in fields]
  response['unique_key'] = unique_key

  return JsonResponse(response)
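Each entry in the 'fields' list returned by collections_fields() above is a (name, type, indexed, stored) tuple, which JsonResponse serializes as a list; an illustrative response with made-up field names:

expected = {
    'status': 0,
    'unique_key': 'id',
    'fields': [
        ['id', 'string', True, True],
        ['message', 'text_general', True, True],
    ],
}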
Example #12
  def setup_class(cls):

    if not is_live_cluster():
      raise SkipTest()

    cls.client = make_logged_in_client(username='******', is_superuser=False)
    cls.user = User.objects.get(username='******')
    add_to_group('test')
    grant_access("test", "test", "indexer")

    cls.db = CollectionManagerController(cls.user)

    resp = cls.client.post(reverse('indexer:install_examples'), {'data': 'log_analytics_demo'})
    content = json.loads(resp.content)

    assert_equal(content.get('status'), 0)
Example #13
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']

      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over) # Assumes handle still live
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception, e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception, e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
Example #14
  def test_end_to_end(self):
    if not is_live_cluster() or True: # Skipping as requires morphlines libs to be set up
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")
    collection_name = "test_collection"
    indexer = MorphlineIndexer("test", fs=fs, jt=cluster.jt, solr_client=self.solr_client)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      schema_fields += [{"name": unique_field, "type": "string"}]


    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
Example #15
File: api.py Project: dulems/hue-1
def collections_update(request, collection):
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  response = {'status': -1}

  collection = json.loads(request.POST.get('collection', '{}'))

  if not collection:
    response['message'] = _('No collection to update.')

  if response.get('message', None) is None:
    searcher = CollectionManagerController(request.user)
    searcher.update_collection(collection.get('name'), collection.get('fields', []))

    response['status'] = 0
    response['message'] = _('Collection updated!')

  return JsonResponse(response)
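The 'collection' parameter parsed by collections_update() above is a JSON object carrying the collection name and the updated field list; a minimal illustrative payload:

import json

payload = {
    'collection': json.dumps({
        'name': 'logs_demo',
        'fields': [{'name': 'severity', 'type': 'string'}],
    })
}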
Example #16
File: api.py Project: zlcken/hue
def collections(request):
    searcher = CollectionManagerController(request.user)
    solr_collections = searcher.get_collections()
    massaged_collections = []

    for collection in solr_collections:
        massaged_collections.append({
            'name': collection,
            'isCoreOnly': solr_collections[collection]['isCoreOnly'],
            'isAlias': solr_collections[collection].get('isAlias', False),
            'collections': solr_collections[collection].get('collections', []),
        })

    response = {'status': 0, 'collections': massaged_collections}

    return JsonResponse(response)
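An illustrative shape of the JSON returned by collections() above (the collection name is made up):

expected = {
    'status': 0,
    'collections': [
        {'name': 'logs_demo', 'isCoreOnly': False, 'isAlias': False, 'collections': []},
    ],
}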
Example #17
    def test_get_collections(self):
        db = CollectionManagerController(self.user)
        db.get_collections()
Example #18
def _small_indexing(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception, e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception, e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))
Example #19
    def test_collection_exists(self):
        db = CollectionManagerController(self.user)
        assert_false(db.collection_exists('does_not_exist'))
Example #20
    def test_is_solr_cloud_mode(self):
        assert_true(CollectionManagerController(self.user).is_solr_cloud_mode())