Exemple #1
0
def collections_data(request, collection):
  """Index data from an HDFS file into the existing collection (POST only)."""
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  result = {'status': -1}
  data_source = request.POST.get('source')

  if data_source != 'file':
    # Only HDFS file imports are supported by this endpoint.
    result['message'] = _('Unsupported source %s') % data_source
    return JsonResponse(result)

  manager = CollectionManagerController(request.user)
  manager.update_data_from_hdfs(
      request.fs,
      collection,
      None,
      request.POST.get('path'),
      request.POST.get('type'),
      separator=request.POST.get('separator'),
      quote_character=request.POST.get('quote'))

  result['status'] = 0
  result['message'] = _('Index imported!')
  return JsonResponse(result)
Exemple #2
0
def _index(request, file_format, collection_name, query=None):
  """Create the Solr collection for ``file_format`` if needed, then run the morphline job."""
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The uuid field is synthesized, so it must be added to the schema explicitly.
    schema_fields.append({"name": unique_field, "type": "string"})

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # Resolve the input path depending on where the data comes from.
  input_format = file_format['inputFormat']
  if input_format == 'table':
    table_metadata = dbms.get(request.user).get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)
Exemple #3
0
def collections_import(request):
  """Register an existing Solr collection in Hue with its metadata (POST only)."""
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  result = {'status': -1}
  collection = json.loads(request.POST.get('collection', '{}'))

  if not collection:
    result['message'] = _('Collection missing.')
    return JsonResponse(result)

  manager = CollectionManagerController(request.user)
  unique_key, fields = manager.get_fields(collection.get('name'))

  # Create collection and metadata.
  hue_collection, created = Collection.objects.get_or_create(name=collection.get('name'), solr_properties='{}', is_enabled=True, user=request.user)
  props = hue_collection.properties_dict
  props['data_type'] = 'separated'
  props['field_order'] = list(fields)
  hue_collection.properties = json.dumps(props)
  hue_collection.save()

  result['status'] = 0
  result['message'] = _('Collection created!')
  return JsonResponse(result)
Exemple #4
0
def index_file(request):
  """Start a morphline indexing job for the posted file format description."""
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]

  indexer = Indexer(request.user, request.fs)
  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The uuid field is synthesized, so it must be added to the schema explicitly.
    schema_fields.append({"name": unique_field, "type": "string"})

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  manager = CollectionManagerController(request.user)
  if not manager.collection_exists(collection_name):
    manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # Resolve the input path: Hive table location or plain HDFS path.
  if file_format['inputFormat'] == 'table':
    table_metadata = dbms.get(request.user).get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  else:
    input_path = file_format["path"]

  # TODO if query generate insert
  job_handle = indexer.run_morphline(request, collection_name, morphline, input_path)
  return JsonResponse(job_handle)
Exemple #5
0
def autocomplete(request):
  """Return the Solr collections and configs available for autocompletion."""
  suggestions = CollectionManagerController(request.user).get_autocomplete()

  # Collections first, then configs, matching the backend ordering.
  entries = [
    {'name': name, 'isCollection': True, 'isConfig': False}
    for name in suggestions['collections']
  ]
  entries += [
    {'name': name, 'isCollection': False, 'isConfig': True}
    for name in suggestions['configs']
  ]

  return JsonResponse({
    'status': 0,
    'collections': entries
  })
Exemple #6
0
  def test_collections_fields(self):
    """Fetching fields then installing examples should report a clean status."""
    controller = CollectionManagerController(self.user)
    controller.get_fields('log_analytics')

    resp = self.client.post(reverse('indexer:install_examples'))
    payload = json.loads(resp.content)

    assert_equal(0, payload.get('status'))
    assert_equal(0, payload.get('fields'))
    assert_equal(0, payload.get('unique_key'))
Exemple #7
0
  def test_create_collection(self):
    """Creating a collection with one text field should succeed; always clean up."""
    controller = CollectionManagerController(self.user)
    collection_name = get_db_prefix(name='solr') + 'test_create_collection'

    try:
      controller.create_collection(collection_name, [{'name': 'my_test', 'type': 'text'}], unique_key_field='id', df='text')
    finally:
      controller.delete_collection(collection_name, core=False)
Exemple #8
0
def collections_fields(request, collection):
  """Return the field schema (name, type, indexed, stored) of a collection (GET only)."""
  if request.method != 'GET':
    raise PopupException(_('GET request required.'))

  searcher = CollectionManagerController(request.user)
  unique_key, fields = searcher.get_fields(collection)

  field_rows = []
  for name in fields:
    attrs = fields[name]
    field_rows.append((name, attrs['type'], attrs.get('indexed', None), attrs.get('stored', None)))

  return JsonResponse({
    'status': 0,
    'fields': field_rows,
    'unique_key': unique_key,
  })
Exemple #9
0
def index_file(request):
  """Create a collection for the posted file format and index the file via morphline."""
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]

  indexer = Indexer(request.user, request.fs)
  unique_field = indexer.get_uuid_name(file_format)

  # Schema: synthesized uuid field first, then the kept input columns.
  schema_fields = [{"name": unique_field, "type": "string"}]
  schema_fields.extend(indexer.get_kept_field_list(file_format['columns']))

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  manager = CollectionManagerController(request.user)
  if not manager.collection_exists(collection_name):
    manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])
  return JsonResponse({"jobId": job_id})
Exemple #10
0
def collections(request):
  """List the Solr collections, cores and aliases visible to the user."""
  solr_collections = CollectionManagerController(request.user).get_collections()

  entries = [{
    'name': name,
    'isCoreOnly': info['isCoreOnly'],
    'isAlias': info.get('isAlias', False),
    'collections': info.get('collections', []),
  } for name, info in solr_collections.items()]

  return JsonResponse({
    'status': 0,
    'collections': entries
  })
Exemple #11
0
def collections_update(request, collection):
  """
  Update the fields of an existing Solr collection (POST only).

  The ``collection`` URL argument is not used for the update itself; the
  target collection is taken from the JSON-encoded ``collection`` POST field.
  """
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  response = {'status': -1}

  # Fix: the parsed payload previously rebound (shadowed) the `collection`
  # URL parameter; use a distinct name so both stay visible.
  posted_collection = json.loads(request.POST.get('collection', '{}'))

  if not posted_collection:
    response['message'] = _('No collection to update.')
  else:
    searcher = CollectionManagerController(request.user)
    searcher.update_collection(posted_collection.get('name'), posted_collection.get('fields', []))

    response['status'] = 0
    response['message'] = _('Collection updated!')

  return JsonResponse(response)
Exemple #12
0
def collections_remove(request):
  """Delete the posted Solr collections and their instance directories (POST only)."""
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  response = {'status': -1}
  collections = json.loads(request.POST.get('collections', '[]'))

  if not collections:
    response['message'] = _('No collections to remove.')
    return JsonResponse(response)

  searcher = CollectionManagerController(request.user)
  existing = searcher.get_collections()

  for entry in collections:
    name = entry.get('name')
    if name in existing:
      # Remove collection and instancedir
      searcher.delete_collection(name, entry.get('isCoreOnly'))

  response['status'] = 0
  response['message'] = _('Collections removed!')
  return JsonResponse(response)
Exemple #13
0
def collections_create(request):
  """
  Create a new Solr collection from a JSON-encoded POST payload and,
  optionally, index initial data from an HDFS file or a Hive table.
  """
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  response = {'status': -1}

  collection = json.loads(request.POST.get('collection', '{}'))

  if collection:
    searcher = CollectionManagerController(request.user)

    # Create instance directory, collection, and add fields
    searcher.create_collection(collection.get('name'), collection.get('fields', []), collection.get('uniqueKeyField'), collection.get('df'))

    try:
      if request.POST.get('source') == 'file':
        # Index data
        searcher.update_data_from_hdfs(request.fs,
                                       collection.get('name'),
                                       collection.get('fields', []),
                                       request.POST.get('path'),
                                       request.POST.get('type'),
                                       separator=request.POST.get('separator'),
                                       quote_character=request.POST.get('quote'))

      elif request.POST.get('source') == 'hive':
        # Run a custom hive query and post data to collection
        from beeswax.server import dbms

        db = dbms.get(request.user)

        database = request.POST.get('database')
        table = request.POST.get('table')
        columns = [field['name'] for field in collection.get('fields', [])]

        searcher.update_data_from_hive(db, collection.get('name'), database, table, columns)

      response['status'] = 0
      response['message'] = _('Collection created!')
    except Exception as e:  # fix: was the Python 2-only `except Exception, e` syntax
      LOG.error(e)
      raise
  else:
    # Fix: match the sibling views, which report a message when no payload is given.
    response['message'] = _('Collection missing.')

  # Fix: the view previously fell off the end and returned None; every sibling
  # view returns a JsonResponse, and Django requires an HttpResponse.
  return JsonResponse(response)
Exemple #14
0
  def test_end_to_end(self):
    if not is_live_cluster() or True:  # Skipping as requires morphline libs to be setup
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")
    collection_name = "test_collection"
    indexer = Indexer("test", fs=fs, jt=cluster.jt)
    hdfs_path = "/tmp/test.csv"

    # Upload the fixture CSV to HDFS and open a stream on it.
    fs.create(hdfs_path, data=TestIndexer.simpleCSVString, overwrite=True)
    stream = fs.open(hdfs_path)

    # Guess the file format, then its field types.
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
    field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # Pick a field name available to use for the record's uuid.
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # Generate the morphline config and derive the collection schema.
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      schema_fields.append({"name": unique_field, "type": "string"})

    # Recreate the collection from the derived schema.
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # Run the morphline job to index the file.
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, hdfs_path)
Exemple #15
0
  def test_end_to_end(self):
    fs = cluster.get_hdfs()
    collection_name = "test_collection"
    indexer = Indexer("test", fs)
    hdfs_path = "/tmp/test.csv"

    # Upload the fixture CSV to HDFS and open a stream on it.
    fs.create(hdfs_path, data=IndexerTest.simpleCSVString, overwrite=True)
    stream = fs.open(hdfs_path)

    # Guess the file format, then its field types.
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
    field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # Pick a field name available to use for the record's uuid.
    unique_field = indexer.get_uuid_name(format_)

    # Generate the morphline config, then build the schema with the uuid field first.
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = [{"name": unique_field, "type": "string"}]
    schema_fields.extend(indexer.get_kept_field_list(format_['columns']))

    # Recreate the collection from the derived schema.
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # Run the morphline job to index the file.
    indexer.run_morphline(collection_name, morphline, hdfs_path)
Exemple #16
0
def _small_indexing(user, fs, client, source, destination, index_name):
    """
    Synchronously index a small data set into the Solr index ``index_name``.

    Depending on ``source['inputFormat']``:
      - 'file': reads up to MAX_UPLOAD_SIZE bytes from HDFS and posts them.
      - 'query': streams the rows of a saved notebook query into the index.
      - 'manual': creates the collection only, indexes nothing.

    On any indexing failure the freshly created index is deleted before the
    exception is re-raised. Returns a dict with status, success URL and any
    per-document indexing errors reported by Solr.
    """
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        path = urllib_unquote(source["path"])
        stats = fs.stats(path)
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)

    fields = indexer.get_field_list(destination['columns'])
    _create_solr_collection(user, fs, client, destination, index_name, kwargs)

    # Fix: `data` used to be left unbound unless inputFormat was 'file', which
    # made the generic indexing branch below crash with NameError for other
    # formats (e.g. 'table'). Default to None and raise a clear error instead.
    data = None
    if source['inputFormat'] == 'file':
        path = urllib_unquote(source["path"])
        data = fs.read(path, 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        # Tolerant processor: skip bad documents instead of aborting the batch.
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        elif source['inputFormat'] == 'manual':
            pass  # No need to do anything
        elif data is None:
            raise PopupException(_('Unsupported input format: %s') % source['inputFormat'])
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception as e:
        # Best-effort cleanup of the partially created index before re-raising.
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))
        raise e

    return {
        'status': 0,
        'on_success_url': reverse('indexer:indexes',
                                  kwargs={'index': index_name}),
        'pub_sub_url': 'assist.collections.refresh',
        'errors': errors
    }
Exemple #17
0
 def test_is_solr_cloud_mode(self):
     """The controller should detect SolrCloud mode on the test cluster."""
     controller = CollectionManagerController(self.user)
     assert_true(controller.is_solr_cloud_mode())
Exemple #18
0
 def test_collection_exists(self):
     """A collection that was never created must not be reported as existing."""
     controller = CollectionManagerController(self.user)
     exists = controller.collection_exists('does_not_exist')
     assert_false(exists)
Exemple #19
0
 def test_get_collections(self):
     """Listing collections must not raise."""
     CollectionManagerController(self.user).get_collections()
Exemple #20
0
def _small_indexing(user, fs, client, source, destination, index_name):
    """
    Synchronously index a small data set into the Solr index ``index_name``.

    Builds the CSV-update options (field names, split/separator per-field
    operations, skipped fields, generated row id, header handling), creates
    the index if missing, then posts either the file contents or the rows of
    a saved query. On failure the partially created index is deleted and the
    exception is re-raised.
    """
    # `x and x[0] or None` idiom: first element when the list is non-empty, else None.
    unique_key_field = destination['indexerPrimaryKey'] and destination[
        'indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination[
        'indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    for field in fields:
        for operation in field['operations']:
            if operation['type'] == 'split':
                field[
                    'multiValued'] = True  # Solr requires multiValued to be set when splitting
                kwargs['f.%(name)s.split' % field] = 'true'
                kwargs['f.%(name)s.separator' %
                       field] = operation['settings']['splitChar'] or ','

    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        # No primary key selected: generate a Solr row id per document.
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    # Fix: `data` used to be left unbound unless inputFormat was 'file', which
    # made the generic indexing branch below crash with NameError for other
    # formats. Default to None and raise a clear error instead.
    data = None
    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        # Tolerant processor: skip bad documents instead of aborting the batch.
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        elif data is None:
            raise PopupException(_('Unsupported input format: %s') % source['inputFormat'])
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception as e:  # fix: was the Python 2-only `except Exception, e` syntax
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))
        # Fix: the exception was previously swallowed here, silently reporting
        # success to callers; the sibling implementation re-raises, so do the same.
        raise e
Exemple #21
0
 def test_collection_exists(self):
   """A never-created collection must not be reported as existing."""
   controller = CollectionManagerController(self.user)
   assert_false(controller.collection_exists('does_not_exist'))
Exemple #22
0
 def test_get_collections(self):
   """Listing collections must not raise."""
   CollectionManagerController(self.user).get_collections()
    def test_end_to_end(self):
        if not is_live_cluster() or True:  # Skipping as requires morphline libs to be setup
            raise SkipTest()

        cluster = shared_cluster()
        fs = cluster.fs
        make_logged_in_client(username="******",
                              groupname="default",
                              recreate=True,
                              is_superuser=False)
        user = User.objects.get(username="******")
        collection_name = "test_collection"
        indexer = Indexer("test", fs=fs, jt=cluster.jt)
        hdfs_path = "/tmp/test.csv"

        # Upload the fixture CSV to HDFS and open a stream on it.
        fs.create(hdfs_path, data=TestIndexer.simpleCSVString, overwrite=True)
        stream = fs.open(hdfs_path)

        # Guess the file format, then its field types.
        file_type_format = indexer.guess_format(
            {'file': {"stream": stream, "name": "test.csv"}})
        field_types = indexer.guess_field_types(
            {"file": {"stream": stream, "name": "test.csv"},
             "format": file_type_format})

        format_ = field_types.copy()
        format_['format'] = file_type_format

        # Pick a field name available to use for the record's uuid.
        unique_field = indexer.get_unique_field(format_)
        is_unique_generated = indexer.is_unique_generated(format_)

        # Generate the morphline config and derive the collection schema.
        morphline = indexer.generate_morphline_config(collection_name, format_,
                                                      unique_field)

        schema_fields = indexer.get_kept_field_list(format_['columns'])
        if is_unique_generated:
            schema_fields.append({"name": unique_field, "type": "string"})

        # Recreate the collection from the derived schema.
        collection_manager = CollectionManagerController("test")
        if collection_manager.collection_exists(collection_name):
            collection_manager.delete_collection(collection_name, None)
        collection_manager.create_collection(collection_name,
                                             schema_fields,
                                             unique_key_field=unique_field)

        # Run the morphline job to index the file.
        indexer.run_morphline(
            MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt),
            collection_name, morphline, hdfs_path)