Ejemplo n.º 1
0
def collections_create(request):
    """Create a search collection and optionally load data into it.

    Expects a POST request whose ``collection`` parameter is a JSON document.
    A non-empty document triggers creation of the collection through
    ``CollectionManagerController``; the ``source`` POST parameter then
    selects whether data is indexed from a file (``'file'``) or from a Hive
    table (``'hive'``). Any other ``source`` value only creates the
    collection.

    Returns a ``JsonResponse`` built from a dict with ``status`` (0 once the
    collection has been created and, when requested, indexed; -1 when no
    collection was posted) and a ``message``.

    Raises ``PopupException`` when the request is not a POST. Errors raised
    while indexing are logged and propagated unchanged.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}
    collection = json.loads(request.POST.get('collection', '{}'))

    # Nothing usable was posted: report it and stop here.
    if not collection:
        response['message'] = _('Collection missing.')
        return JsonResponse(response)

    searcher = CollectionManagerController(request.user)
    name = collection.get('name')

    # Set up the instance directory and the collection with its fields.
    searcher.create_collection(
        name,
        collection.get('fields', []),
        collection.get('uniqueKeyField'),
        collection.get('df'),
    )

    try:
        source = request.POST.get('source')

        if source == 'file':
            # Load the file's content into the freshly created collection.
            searcher.update_data_from_hdfs(
                request.fs,
                name,
                collection.get('fields', []),
                request.POST.get('path'),
                request.POST.get('type'),
                separator=request.POST.get('separator'),
                quote_character=request.POST.get('quote'),
            )
        elif source == 'hive':
            # Post the data of the selected Hive table to the collection.
            from beeswax.server import dbms

            db = dbms.get(request.user)
            columns = [
                field['name'] for field in collection.get('fields', [])
            ]
            searcher.update_data_from_hive(
                db,
                name,
                request.POST.get('database'),
                request.POST.get('table'),
                columns,
            )  # Not up to date

        response['status'] = 0
        response['message'] = _('Collection created!')
    except Exception as e:
        # Keep a trace in the logs, then let the caller see the failure.
        LOG.error(e)
        raise

    return JsonResponse(response)
Ejemplo n.º 2
0
def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Make sure the target index exists, then index the described input into it.

  The index is created with ``SolrClient`` when ``collection_name`` does not
  exist yet. Its fields come from the ``fields`` POST parameter when present,
  otherwise from the kept columns of ``file_format`` (plus the unique field
  when ``MorphlineIndexer`` reports it as generated).

  ``file_format['inputFormat']`` then drives the indexing:
    * ``'hs2_handle'``: rows are pushed through
      ``CollectionManagerController.update_data_from_hive`` and its result is
      returned directly; no morphline is generated.
    * ``'table'``: the input path is the table's ``path_location``.
    * ``'file'``: the input path is ``file_format['path']`` prefixed with
      ``${nameNode}``.
    * anything else: the input path is ``None``.

  For every format except ``'hs2_handle'`` the result of
  ``MorphlineIndexer.run_morphline`` is returned.
  """
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  unique_is_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if unique_is_generated:
    # A generated key is not among the kept columns, so declare it explicitly.
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)
  index_missing = not client.exists(collection_name)
  if index_missing:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
    )

  input_format = file_format['inputFormat']

  if input_format == 'hs2_handle':
    # Rows are read from a fetch handle instead of going through a morphline.
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])

  if input_format == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
    request,
    collection_name,
    morphline,
    input_path,
    query,
    start_time=start_time,
    lib_path=lib_path
  )
Ejemplo n.º 3
0
def collections_create(request):
  """Create a search collection from a POST request and optionally index data into it.

  The ``collection`` POST parameter is parsed as JSON. When it is non-empty,
  the collection is created through ``CollectionManagerController`` and,
  depending on the ``source`` POST parameter, populated from a file
  (``'file'``) or from a Hive table (``'hive'``). Any other ``source`` value
  only creates the collection.

  Raises:
    PopupException: if the request method is not POST.
    Exception: any error raised while indexing is logged and re-raised as is.

  NOTE(review): ``response`` is filled in but never returned by the code
  visible here, and an empty ``collection`` is not reported -- this looks like
  a truncated excerpt; confirm against the complete function.
  """
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  # Pessimistic default; switched to 0 only after everything succeeded.
  response = {'status': -1}

  collection = json.loads(request.POST.get('collection', '{}'))

  if collection:
    searcher = CollectionManagerController(request.user)

    # Create instance directory, collection, and add fields
    searcher.create_collection(collection.get('name'), collection.get('fields', []), collection.get('uniqueKeyField'), collection.get('df'))

    try:
      if request.POST.get('source') == 'file':
        # Index data
        searcher.update_data_from_hdfs(request.fs,
                                       collection.get('name'),
                                       collection.get('fields', []),
                                       request.POST.get('path'),
                                       request.POST.get('type'),
                                       separator=request.POST.get('separator'),
                                       quote_character=request.POST.get('quote'))

      elif request.POST.get('source') == 'hive':
        # Run a custom hive query and post data to collection
        from beeswax.server import dbms

        db = dbms.get(request.user)

        database = request.POST.get('database')
        table = request.POST.get('table')
        columns = [field['name'] for field in collection.get('fields', [])]

        searcher.update_data_from_hive(db, collection.get('name'), database, table, columns)

      response['status'] = 0
      response['message'] = _('Collection created!')
    # "except X as e" is accepted by Python 2.6+ and Python 3; the former
    # "except X, e" spelling is a syntax error on Python 3.
    except Exception as e:
      LOG.error(e)
      raise
Ejemplo n.º 4
0
def _small_indexing(user, fs, client, source, destination, index_name):
  """Create the Solr collection ``index_name`` and index a small input into it.

  Steps performed by the visible code:
    * For inputs other than ``'manual'``, ``'table'`` and ``'query_handle'``,
      the size of ``source['path']`` is checked against ``MAX_UPLOAD_SIZE``.
    * The collection is created through ``_create_solr_collection``.
    * ``'file'`` inputs are read with ``fs.read(path, 0, MAX_UPLOAD_SIZE)``.
    * ``'query'`` inputs fetch the result of the first snippet of the saved
      notebook and push the rows with ``update_data_from_hive``.
    * ``'manual'`` inputs index nothing.
    * Every other input is sent to ``client.index`` and the error messages
      found in the response header are collected into ``errors``.
    * If indexing raises, the collection is deleted again; a failure of that
      clean-up is only logged.

  Raises:
    PopupException: if the source file is larger than ``MAX_UPLOAD_SIZE``.

  NOTE(review): ``data`` is only bound for ``'file'`` inputs, so ``'table'``
  or ``'query_handle'`` inputs reaching the final ``else`` branch would hit a
  ``NameError`` -- confirm callers never route those formats here.
  NOTE(review): the code visible here neither re-raises the indexing error nor
  returns ``errors``; this looks like a truncated excerpt -- confirm against
  the complete function.
  """
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    # Paths arrive URL-quoted; decode before hitting the filesystem.
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      # source['query'] is either a dict carrying an 'id' or the id itself.
      query_id = source['query']['id'] if source['query'].get('id') else source['query']

      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over) # Assumes handle still live
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  # "except X as e" is accepted by Python 2.6+ and Python 3; the former
  # "except X, e" spelling is a syntax error on Python 3.
  except Exception as e:
    # Best-effort clean-up of the collection created above.
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
Ejemplo n.º 5
0
def _small_indexing(user, fs, client, source, destination, index_name):
    """Create the Solr index ``index_name`` if needed and index a small input into it.

    Steps performed by the visible code:
      * The unique key and default field are the first entries of
        ``destination['indexerPrimaryKey']`` and
        ``destination['indexerDefaultField']`` (``None`` when empty or falsy).
      * For inputs other than ``'manual'``, ``'table'`` and
        ``'query_handle'``, the size of ``source['path']`` is checked against
        ``MAX_UPLOAD_SIZE``.
      * Indexing options (``fieldnames``, ``skip``, ``rowid``, ``header`` /
        ``skipLines``, ``processor``) are collected in ``kwargs``.
      * Fields whose ``keep`` flag is falsy are listed in ``skip`` and left
        out of the index schema.
      * Without a unique key, a ``hue_id`` string field is added and used as
        the row id.
      * The index is created with ``client.create_index`` when it does not
        exist yet.
      * ``'query'`` inputs fetch the result of the first snippet of the saved
        notebook and push the rows with ``update_data_from_hive``; every other
        input is sent to ``client.index`` and the error messages found in the
        response header are collected into ``errors``.
      * If indexing raises, the index is deleted again; a failure of that
        clean-up is only logged.

    Raises:
      PopupException: if the source file is larger than ``MAX_UPLOAD_SIZE``.

    NOTE(review): ``data`` is only bound for ``'file'`` inputs, so any other
    non-``'query'`` input (including ``'manual'``) reaching the ``else``
    branch would hit a ``NameError`` -- confirm callers never route those
    formats here.
    NOTE(review): the code visible here neither re-raises the indexing error
    nor returns ``errors``; this looks like a truncated excerpt -- confirm
    against the complete function.
    """
    unique_key_field = destination['indexerPrimaryKey'] and destination[
        'indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination[
        'indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    # 'fieldnames' lists every column, including the ones skipped below.
    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        # No primary key chosen: fall back to a 'hue_id' row id field.
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'

    try:
        if source['inputFormat'] == 'query':
            # source['query'] is either a dict carrying an 'id' or the id itself.
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    # "except X as e" is accepted by Python 2.6+ and Python 3; the former
    # "except X, e" spelling is a syntax error on Python 3.
    except Exception as e:
        # Best-effort clean-up of the index created above.
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))