def collections_create(request):
  """Create a new Solr collection and optionally index data into it.

  Expects a POST with a JSON 'collection' spec (name, fields, uniqueKeyField,
  df) plus an optional 'source' ('file' or 'hive') describing data to index.
  Returns a JsonResponse with 'status' (0 on success, -1 otherwise) and 'message'.
  """
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  result = {'status': -1}
  spec = json.loads(request.POST.get('collection', '{}'))

  if not spec:
    result['message'] = _('Collection missing.')
    return JsonResponse(result)

  manager = CollectionManagerController(request.user)

  # Create instance directory, collection, and add fields
  manager.create_collection(
      spec.get('name'),
      spec.get('fields', []),
      spec.get('uniqueKeyField'),
      spec.get('df')
  )

  try:
    source = request.POST.get('source')
    if source == 'file':
      # Index data from an HDFS file
      manager.update_data_from_hdfs(
          request.fs,
          spec.get('name'),
          spec.get('fields', []),
          request.POST.get('path'),
          request.POST.get('type'),
          separator=request.POST.get('separator'),
          quote_character=request.POST.get('quote')
      )
    elif source == 'hive':
      # Run a custom hive query and post data to collection
      from beeswax.server import dbms
      db = dbms.get(request.user)
      database = request.POST.get('database')
      table = request.POST.get('table')
      columns = [field['name'] for field in spec.get('fields', [])]
      manager.update_data_from_hive(db, spec.get('name'), database, table, columns)  # Not up to date

    result['status'] = 0
    result['message'] = _('Collection created!')
  except Exception as e:
    LOG.error(e)
    raise

  return JsonResponse(result)
def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Index the given input (Hive table, HDFS file or HS2 handle) into a Solr collection.

  Creates the collection if it does not exist, then either streams rows from a
  live HS2 handle or generates and runs a morphline indexing job.
  """
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if indexer.is_unique_generated(file_format):
    # The unique key is synthesized, so add it to the schema explicitly.
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)
  if not client.exists(collection_name):
    client.create_index(
        name=collection_name,
        fields=request.POST.get('fields', schema_fields),
        unique_key_field=unique_field
    )

  input_format = file_format['inputFormat']
  if input_format == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif input_format == 'hs2_handle':
    # Stream directly from a live query handle: no morphline job needed.
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def collections_create(request):
  """Create a new Solr collection and optionally index data into it.

  Expects a POST with a JSON 'collection' spec (name, fields, uniqueKeyField,
  df) plus an optional 'source' ('file' or 'hive') describing data to index.
  Returns a JsonResponse with 'status' (0 on success, -1 otherwise) and 'message'.

  NOTE(review): converted the Python-2-only `except Exception, e` to the
  `as e` form used by the sibling implementation of this view, and added the
  missing `else` branch / JsonResponse return so the view does not implicitly
  return None — both changes align this duplicate with the other
  collections_create in this file; confirm which copy is canonical.
  """
  if request.method != 'POST':
    raise PopupException(_('POST request required.'))

  response = {'status': -1}
  collection = json.loads(request.POST.get('collection', '{}'))

  if collection:
    searcher = CollectionManagerController(request.user)

    # Create instance directory, collection, and add fields
    searcher.create_collection(collection.get('name'), collection.get('fields', []), collection.get('uniqueKeyField'), collection.get('df'))

    try:
      if request.POST.get('source') == 'file':
        # Index data
        searcher.update_data_from_hdfs(
            request.fs,
            collection.get('name'),
            collection.get('fields', []),
            request.POST.get('path'),
            request.POST.get('type'),
            separator=request.POST.get('separator'),
            quote_character=request.POST.get('quote')
        )
      elif request.POST.get('source') == 'hive':
        # Run a custom hive query and post data to collection
        from beeswax.server import dbms
        db = dbms.get(request.user)
        database = request.POST.get('database')
        table = request.POST.get('table')
        columns = [field['name'] for field in collection.get('fields', [])]
        searcher.update_data_from_hive(db, collection.get('name'), database, table, columns)

      response['status'] = 0
      response['message'] = _('Collection created!')
    except Exception as e:
      LOG.error(e)
      raise
  else:
    response['message'] = _('Collection missing.')

  return JsonResponse(response)
def _small_indexing(user, fs, client, source, destination, index_name):
  """Create a Solr collection and directly index a small data set into it.

  Supports 'query' (fetch rows from a live notebook query), 'manual' (create
  only) and file-like inputs (upload the raw data, capped at MAX_UPLOAD_SIZE).
  On failure the freshly created index is deleted as best-effort cleanup.

  NOTE(review): converted Python-2-only `except Exception, e` clauses to the
  `as e` form (valid on both Python 2.6+ and 3). `urllib.unquote` is the
  Python 2 API — confirm target interpreter before porting further.
  NOTE(review): the caught exception is swallowed after cleanup and `errors`
  is never returned from the visible body — looks unfinished; left unchanged.
  """
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])

  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  # Initialize so the fallback indexing branch below cannot hit a NameError
  # when inputFormat is neither 'file' nor 'query'/'manual'.
  data = None
  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      # Assumes the query handle is still live
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)

      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO: warn about truncation when rows == MAX_ROWS
    elif source['inputFormat'] == 'manual':
      pass  # Collection created above; nothing to index
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    # Best-effort cleanup: remove the collection we just created.
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
def _small_indexing(user, fs, client, source, destination, index_name):
  """Create a Solr index from the destination spec and push a small data set into it.

  Builds CSV-indexing options (field names, skipped fields, generated row id,
  header handling), creates the index if needed, then either streams rows from
  a live notebook query or uploads the raw file data (capped at
  MAX_UPLOAD_SIZE). On failure the freshly created index is deleted as
  best-effort cleanup.

  NOTE(review): converted Python-2-only `except Exception, e` clauses to the
  `as e` form (valid on both Python 2.6+ and 3).
  NOTE(review): the caught exception is swallowed after cleanup and `errors`
  is never returned from the visible body — looks unfinished; left unchanged.
  """
  unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
  df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    stats = fs.stats(source["path"])
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  skip_fields = [field['name'] for field in fields if not field['keep']]

  kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
  if skip_fields:
    kwargs['skip'] = ','.join(skip_fields)
    fields = [field for field in fields if field['name'] not in skip_fields]

  if not unique_key_field:
    # Generate a synthetic row id when no primary key was chosen.
    unique_key_field = 'hue_id'
    fields += [{"name": unique_key_field, "type": "string"}]
    kwargs['rowid'] = unique_key_field

  if not destination['hasHeader']:
    kwargs['header'] = 'false'
  else:
    kwargs['skipLines'] = 1

  if not client.exists(index_name):
    client.create_index(
        name=index_name,
        config_name=destination.get('indexerConfigSet'),
        fields=fields,
        unique_key_field=unique_key_field,
        df=df,
        shards=destination['indexerNumShards'],
        replication=destination['indexerReplicationFactor']
    )

  # Initialize so the fallback indexing branch below cannot hit a NameError
  # when inputFormat is neither 'file' nor 'query' (e.g. 'manual').
  data = None
  if source['inputFormat'] == 'file':
    data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      # Assumes the query handle is still live
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)

      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO: warn about truncation when rows == MAX_ROWS
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    # Best-effort cleanup: remove the collection we just created.
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))