Example #1
def index_file(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]

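  # Determine the unique key field and whether the indexer must generate it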
  indexer = Indexer(request.user, request.fs)
  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

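  # Build the schema from the kept columns, appending the generated unique key field if needed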
  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

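  # Create the Solr collection if it does not already exist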
  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

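  # Resolve the input path: a Hive table's storage location or a plain file path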
  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  else:
    input_path = file_format["path"]

  job_handle = indexer.run_morphline(request, collection_name, morphline, input_path)  # TODO: if query, generate insert
  return JsonResponse(job_handle)
Example #2
def _index(request, file_format, collection_name, query=None):
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

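  # Resolve the input path depending on the input format (Hive table, file, or none)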
  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)
Example #3
def _index(request, file_format, collection_name, query=None):
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
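  # For an open HiveServer2 result handle, write the fetched rows straight into the collection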
  elif file_format['inputFormat'] == 'hs2_handle':
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)
Example #4
def index_file(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    _convert_format(file_format["format"], inverse=True)
    collection_name = file_format["name"]
    indexer = Indexer(request.user, request.fs)
    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        schema_fields += [{"name": unique_field, "type": "string"}]

    morphline = indexer.generate_morphline_config(collection_name, file_format,
                                                  unique_field)

    collection_manager = CollectionManagerController(request.user)
    if not collection_manager.collection_exists(collection_name):
        collection_manager.create_collection(collection_name,
                                             schema_fields,
                                             unique_key_field=unique_field)

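    # Run the Morphline job to index the file at the given path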
    job_id = indexer.run_morphline(collection_name, morphline,
                                   file_format["path"])

    return JsonResponse({"jobId": job_id})
Example #5
    def test_end_to_end(self):
        if not is_live_cluster():
            raise SkipTest()

        cluster = shared_cluster()
        fs = cluster.fs
        collection_name = "test_collection"
        indexer = Indexer("test", fs=fs, jt=cluster.jt)
        input_loc = "/tmp/test.csv"

        # upload the test file to hdfs
        fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

        # open a filestream for the file on hdfs
        stream = fs.open(input_loc)

        # guess the format of the file
        file_type_format = indexer.guess_format(
            {'file': {
                "stream": stream,
                "name": "test.csv"
            }})

        field_types = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": "test.csv"
            },
            "format": file_type_format
        })

        format_ = field_types.copy()
        format_['format'] = file_type_format

        # find a field name available to use for the record's uuid
        unique_field = indexer.get_unique_field(format_)
        is_unique_generated = indexer.is_unique_generated(format_)

        # generate morphline
        morphline = indexer.generate_morphline_config(collection_name, format_,
                                                      unique_field)

        schema_fields = indexer.get_kept_field_list(format_['columns'])
        if is_unique_generated:
            schema_fields += [{"name": unique_field, "type": "string"}]

        # create the collection from the specified fields
        collection_manager = CollectionManagerController("test")
        if collection_manager.collection_exists(collection_name):
            collection_manager.delete_collection(collection_name, None)
        collection_manager.create_collection(collection_name,
                                             schema_fields,
                                             unique_key_field=unique_field)

        # index the file
        indexer.run_morphline(collection_name, morphline, input_loc)
Example #6
  def test_create_collection(self):
    db = CollectionManagerController(self.user)

    name = get_db_prefix(name='solr') + 'test_create_collection'
    fields = [{'name': 'my_test', 'type': 'text'}]

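      # Always delete the collection afterwards, even if creation fails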
    try:
      db.create_collection(name, fields, unique_key_field='id', df='text')
    finally:
      db.delete_collection(name, core=False)
Example #7
def collections_create(request):
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}

    collection = json.loads(request.POST.get('collection', '{}'))

    if collection:
        searcher = CollectionManagerController(request.user)

        # Create instance directory, collection, and add fields
        searcher.create_collection(collection.get('name'),
                                   collection.get('fields', []),
                                   collection.get('uniqueKeyField'),
                                   collection.get('df'))

        try:
            if request.POST.get('source') == 'file':
                # Index data
                searcher.update_data_from_hdfs(
                    request.fs,
                    collection.get('name'),
                    collection.get('fields', []),
                    request.POST.get('path'),
                    request.POST.get('type'),
                    separator=request.POST.get('separator'),
                    quote_character=request.POST.get('quote'))

            elif request.POST.get('source') == 'hive':
                # Run a custom hive query and post data to collection
                from beeswax.server import dbms

                db = dbms.get(request.user)

                database = request.POST.get('database')
                table = request.POST.get('table')
                columns = [
                    field['name'] for field in collection.get('fields', [])
                ]

                searcher.update_data_from_hive(db, collection.get('name'),
                                               database, table,
                                               columns)  # Not up to date

            response['status'] = 0
            response['message'] = _('Collection created!')
        except Exception as e:
            LOG.error(e)
            raise
    else:
        response['message'] = _('Collection missing.')

    return JsonResponse(response)
Example #8
    def test_create_collection(self):
        db = CollectionManagerController(self.user)

        name = get_db_prefix(name='solr') + 'test_create_collection'
        fields = [{'name': 'my_test', 'type': 'text'}]

        try:
            db.create_collection(name,
                                 fields,
                                 unique_key_field='id',
                                 df='text')
        finally:
            db.delete_collection(name, core=False)
Example #9
  def test_end_to_end(self):
    if not is_live_cluster() or True:  # Skipping as it requires the morphlines libs to be set up
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")
    collection_name = "test_collection"
    indexer = Indexer("test", fs=fs, jt=cluster.jt)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      schema_fields += [{"name": unique_field, "type": "string"}]


    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
Example #10
  def test_end_to_end(self):
    if not is_live_cluster() or True:  # Skipping as it requires the morphlines libs to be set up
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")
    collection_name = "test_collection"
    indexer = MorphlineIndexer("test", fs=fs, jt=cluster.jt, solr_client=self.solr_client)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      schema_fields += [{"name": unique_field, "type": "string"}]


    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
Example #11
def index_file(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]
  indexer = Indexer(request.user, request.fs)
  unique_field = indexer.get_uuid_name(file_format)
  schema_fields = [{"name": unique_field, "type": "string"}] + \
    indexer.get_kept_field_list(file_format['columns'])

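  # Generate the Morphline config that drives the indexing job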
  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])

  return JsonResponse({"jobId": job_id})
Example #12
def collections_create(request):
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  response = {'status': -1}

  collection = json.loads(request.POST.get('collection', '{}'))

  if collection:
    searcher = CollectionManagerController(request.user)

    # Create instance directory, collection, and add fields
    searcher.create_collection(collection.get('name'), collection.get('fields', []), collection.get('uniqueKeyField'), collection.get('df'))

    try:
      if request.POST.get('source') == 'file':
        # Index data
        searcher.update_data_from_hdfs(request.fs,
                                       collection.get('name'),
                                       collection.get('fields', []),
                                       request.POST.get('path'),
                                       request.POST.get('type'),
                                       separator=request.POST.get('separator'),
                                       quote_character=request.POST.get('quote'))

      elif request.POST.get('source') == 'hive':
        # Run a custom hive query and post data to collection
        from beeswax.server import dbms

        db = dbms.get(request.user)

        database = request.POST.get('database')
        table = request.POST.get('table')
        columns = [field['name'] for field in collection.get('fields', [])]

        searcher.update_data_from_hive(db, collection.get('name'), database, table, columns)

      response['status'] = 0
      response['message'] = _('Collection created!')
    except Exception as e:
      LOG.error(e)
      raise
  else:
    response['message'] = _('Collection missing.')

  return JsonResponse(response)
Example #13
  def test_end_to_end(self):
    fs = cluster.get_hdfs()
    collection_name = "test_collection"
    indexer = Indexer("test", fs)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=IndexerTest.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_uuid_name(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = [{"name": unique_field, "type": "string"}] + indexer.get_kept_field_list(format_['columns'])

    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(collection_name, morphline, input_loc)