def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Index a table or HDFS file into `collection_name` via a morphline MapReduce job.

  Creates the Solr collection first when it does not exist yet.
  """
  morphline_indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = morphline_indexer.get_unique_field(file_format)
  schema_fields = morphline_indexer.get_kept_field_list(file_format['columns'])
  if morphline_indexer.is_unique_generated(file_format):
    # The unique key is synthesized, so it is not part of the source columns.
    schema_fields.append({"name": unique_field, "type": "string"})

  client = SolrClient(user=request.user)
  if not client.exists(collection_name):
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )

  input_format = file_format['inputFormat']
  if input_format == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % urllib.unquote(file_format["path"])
  else:
    input_path = None

  morphline = morphline_indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return morphline_indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def delete_collection(self, name, core):
  """
  Delete solr collection/core and instance dir
  """
  if core:
    # Removing cores (non-cloud) is not supported here.
    raise PopupException(_('Cannot remove Solr cores.'))

  api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())
  client = SolrClient(self.user)

  if not api.remove_collection(name):
    raise PopupException(_('Could not remove collection. Check error logs for more info.'))

  # Delete instance directory.
  try:
    root_node = '%s/%s' % (ZK_SOLR_CONFIG_NAMESPACE, name)
    with ZookeeperClient(hosts=client.get_zookeeper_host(), read_only=False) as zc:
      zc.delete_path(root_node)
  except Exception as e:
    # Re-create collection so that we don't have an orphan config
    api.add_collection(name)
    raise PopupException(_('Error in deleting Solr configurations.'), detail=e)
def update_data_from_hive(self, collection_or_core_name, columns, fetch_handle):
  """Stream rows from a Hive fetch handle into a Solr index.

  Batches of FETCH_BATCH rows are pulled from `fetch_handle` and indexed as CSV,
  each row prefixed with a generated row id, up to MAX_ROWS rows total.

  Raises PopupException when indexing fails or any unexpected error occurs.
  """
  MAX_ROWS = 10000
  FETCH_BATCH = 1000

  row_count = 0
  has_more = True
  client = SolrClient(self.user)

  try:
    while row_count < MAX_ROWS and has_more:
      # The second argument asks the handle to (re)start fetching on the first call.
      result = fetch_handle(FETCH_BATCH, row_count == 0)
      has_more = result['has_more']

      if result['data']:
        kwargs = {}
        dataset = tablib.Dataset()
        dataset.append(columns)

        for i, row in enumerate(result['data']):
          # Prefix a synthetic row id; map None/empty cells to '' (or 0 for numbers).
          dataset.append([row_count + i] + [
            cell if cell else (0 if isinstance(cell, numbers.Number) else '')
            for cell in row
          ])

        if not client.index(name=collection_or_core_name, data=dataset.csv, **kwargs):
          raise PopupException(_('Could not update index. Check error logs for more info.'))

        row_count += len(dataset)
  except Exception as e:  # Fixed Py2-only 'except Exception, e' syntax
    raise PopupException(_('Could not update index: %s') % e)
def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Index the given source into `collection_name`, creating the index if needed."""
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if indexer.is_unique_generated(file_format):
    # Generated unique key is not among the source columns; add it explicitly.
    schema_fields.append({"name": unique_field, "type": "string"})

  client = SolrClient(user=request.user)
  if not client.exists(collection_name):
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
    )

  input_format = file_format['inputFormat']
  if input_format == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif input_format == 'hs2_handle':
    # Rows are streamed straight from an open HiveServer2 handle; no morphline job.
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def indexer(request):
  """Render the indexer landing page with the list of Solr indexes."""
  if not request.user.has_hue_permission(action="access", app='search'):
    raise PopupException(_('Missing permission.'), error_code=403)

  searcher = SolrClient(request.user)

  indexes = searcher.get_indexes()
  for index in indexes:
    index['isSelected'] = False

  context = {
    'is_embeddable': request.GET.get('is_embeddable', False),
    'indexes_json': json.dumps(indexes),
    'fields_json': json.dumps([field.name for field in FIELD_TYPES]),
    'operators_json': json.dumps([operator.to_dict() for operator in OPERATORS]),
    'file_types_json': json.dumps([format_.format_info() for format_ in get_file_indexable_format_types()]),
    'default_field_type': json.dumps(Field().to_dict())
  }
  return render('indexer.mako', request, context)
def datasets(self, show_all=False):
  """Return the names of the Solr indexes visible to this user."""
  client = SolrClient(user=self.user)
  include_cores = show_all or not client.is_solr_cloud_mode()  # True if non Solr Cloud
  return [index['name'] for index in client.get_indexes(include_cores=include_cores)]
def get_tables(self, database, table_names=None):
  """Expose Solr indexes as table-like entries for the assist panel.

  :param database: unused here; present for interface parity with SQL backends.
  :param table_names: optional name filter, currently ignored. Default changed
    from a mutable `[]` (shared across calls — classic Python pitfall) to None.
  """
  searcher = SolrClient(self.user)

  return [{
    'name': table['name'],
    'comment': '',
    # Solr aliases map to views, plain collections/cores map to tables.
    'type': 'View' if table['type'] == 'alias' else 'Table'
  } for table in searcher.get_indexes()]
def list_configs(request):
  """Return the available Solr config sets as JSON."""
  client = SolrClient(user=request.user)
  configs = client.list_configs()
  return JsonResponse({'status': 0, 'configs': configs})
def list_indexes(request):
  """Return all Solr collections as JSON."""
  client = SolrClient(user=request.user)
  collections = client.get_indexes()
  return JsonResponse({'status': 0, 'collections': collections})
def collections(request):
  """Return the list of Solr collections for the current user."""
  solr = SolrClient(user=request.user)
  indexes = solr.get_indexes()
  return JsonResponse({'status': 0, 'collections': indexes})
def run_morphline(self, request, collection_name, morphline, input_path, query=None, start_time=None, lib_path=None):
  """Submit a MapReduceIndexerTool job indexing `input_path` into `collection_name`.

  Builds a background notebook task. When `query` (a Document2 uuid) is given,
  the query is first materialized into a temporary Hive table whose HDFS
  location becomes the job input.
  """
  # Upload morphline.conf + log4j.properties to a workspace dir on HDFS.
  workspace_path = self._upload_workspace(morphline)

  task = make_notebook(
    name=_('Indexing into %s') % collection_name,
    editor_type='notebook',
    on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
    pub_sub_url='assist.collections.refresh',
    is_task=True,
    is_notebook=True,
    last_executed=start_time
  )

  if query:
    q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
    notebook_data = q.get_data()
    snippet = notebook_data['snippets'][0]

    api = get_api(request, snippet)

    # Short uuid prefix keeps the temp table/dir names unique enough per export.
    destination = '__hue_%s' % notebook_data['uuid'][:4]
    location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
    sql, _success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
    input_path = '${nameNode}%s' % location

    # The Hive CTAS snippet runs before the Java indexing snippet below.
    task.add_hive_snippet(snippet['database'], sql)

  client = SolrClient(self.user)

  # Solr 6+ needs user classpath precedence to avoid jar conflicts in MR.
  extra_args = ['-Dmapreduce.job.user.classpath.first=true'] if client.is_solr_six_or_more() else []

  task.add_java_snippet(
    clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
    app_jar=lib_path if lib_path is not None else CONFIG_INDEXER_LIBS_PATH.get(),
    arguments=extra_args + [
      u'--morphline-file',
      u'morphline.conf',
      u'--output-dir',
      u'${nameNode}/user/%s/indexer' % self.username,
      u'--log4j',
      u'log4j.properties',
      u'--go-live',
      u'--zk-host',
      client.get_zookeeper_host(),
      u'--collection',
      collection_name,
      input_path,
    ],
    files=[
      {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
      {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
    ]
  )

  return task.execute(request, batch=True)
def index(request):
  """Index the POSTed `data` payload into the named collection."""
  index_name = request.POST.get('name')
  payload = request.POST.get('data')

  client = SolrClient(request.user)
  client.index(index_name, payload)

  return JsonResponse({'status': 0, 'message': _('Data added')})
def config_index(request):
  """Return the pretty-printed config of the named index as JSON."""
  index_name = request.POST.get('name')
  client = SolrClient(user=request.user)
  config = json.dumps(client.get_config(index_name), indent=2)
  return JsonResponse({'status': 0, 'config': config})
def sample_index(request):
  """Return a sample of documents from the named index as JSON."""
  index_name = request.POST.get('name')
  client = SolrClient(user=request.user)
  docs = client.sample_index(index_name)['response']['docs']
  return JsonResponse({'status': 0, 'sample': docs})
def setup_class(cls):
  """Create the test user/client and reset cached SolrClient properties."""
  cls.client = make_logged_in_client(username='******', is_superuser=False)
  cls.user = User.objects.get(username='******')
  add_to_group('test')
  grant_access("test", "test", "indexer")

  # Declare the module-level caches so intent is explicit; the actual reset
  # is done by SolrClient._reset_properties() below.
  global _IS_SOLR_CLOUD
  global _IS_SOLR_6_OR_MORE
  global _IS_SOLR_WITH_HDFS
  global _ZOOKEEPER_HOST

  SolrClient._reset_properties()
def list_index(request):
  """Return the schema of the named index as JSON."""
  response = {'status': -1}

  # NOTE(review): request.REQUEST (merged GET+POST) was removed in Django 1.9 —
  # confirm the Django version in use, or switch explicitly to POST/GET.
  name = request.REQUEST.get('name')

  client = SolrClient(user=request.user)

  response['schema'] = client.list_schema(name)
  response['name'] = name
  response['status'] = 0

  return JsonResponse(response)
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  """Index a table, file or stream into `collection_name` via a batch job.

  Streams are delegated to Flume or Envelope; tables and files go through a
  morphline MapReduce job. With 'show_command' set, returns the generated
  commands instead of starting the job (Flume branch only).
  """
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # Generated unique key is not part of the source columns.
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name) and not request.POST.get('show_command'):  # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    # Flume stream: either preview the generated config or start the agent.
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    # Non-Flume streams go through an Envelope job.
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
    request,
    collection_name,
    morphline,
    input_path,
    query,
    start_time=start_time,
    lib_path=lib_path
  )
def create_alias(request):
  """Create or update a Solr alias over the POSTed collections."""
  alias_name = request.POST.get('alias', '')
  collection_list = json.loads(request.POST.get('collections', '[]'))

  SolrClient(request.user).create_alias(alias_name, collection_list)

  return JsonResponse({'status': 0, 'message': _('Alias created or modified!')})
def create_alias(request):
  """Create or update a Solr alias and echo its description back to the caller."""
  alias_name = request.POST.get('name', '')
  collection_list = json.loads(request.POST.get('collections', '[]'))

  client = SolrClient(request.user)
  result = client.create_alias(alias_name, collection_list)

  return JsonResponse({
    'status': 0,
    'response': result,
    'alias': {'name': alias_name, 'type': 'alias', 'collections': collection_list, 'isSelected': False},
    'message': _('Alias created or modified!')
  })
def importer_submit(request):
  """Dispatch an import job (index, database, sqoop or table) from the wizard."""
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if source['inputFormat'] == 'file':
    source['path'] = request.fs.netnormpath(source['path']) if source['path'] else source['path']

  if destination['ouputFormat'] in ('database', 'table'):
    destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob']:
      # Batch morphline job path.
      _convert_format(source["format"], inverse=True)
      job_handle = _index(request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    else:
      # Direct (small) indexing path.
      client = SolrClient(request.user)
      job_handle = _create_index(request.user, request.fs, client, source, destination, index_name)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  # NOTE(review): an rdbms source with an outputFormat outside
  # ('file', 'table', 'hbase') leaves job_handle unbound — confirm upstream
  # validation guarantees this cannot happen.
  return JsonResponse(job_handle)
def __init__(self, username, fs=None, jt=None, solr_client=None):
  """Build an indexer bound to `username`; a SolrClient is created lazily here
  when none is supplied."""
  self.fs = fs
  self.jt = jt
  self.username = username
  self.user = User.objects.get(username=username)  # To clean
  if solr_client is None:
    solr_client = SolrClient(self.user)
  self.solr_client = solr_client
def __init__(self, user, fs=None, jt=None, solr_client=None):
  """Build an indexer bound to a User object; a SolrClient is created here
  when none is supplied."""
  self.fs = fs
  self.jt = jt
  self.user = user
  self.username = user.username
  if solr_client is None:
    solr_client = SolrClient(self.user)
  self.solr_client = solr_client
def test_get_ensemble_upstream_solr(self):
  """Upstream Solr cloud mock: Solr 6+, no HDFS, embedded ZK on 9983."""
  solr = SolrClient(self.user, api=MockSolrUpstreamCloudApi())
  solr._reset_properties()

  assert_true(solr.is_solr_cloud_mode())
  assert_true(solr.is_solr_six_or_more())
  assert_false(solr.is_solr_with_hdfs())
  assert_equal('localhost:9983', solr.get_zookeeper_host())
def test_get_ensemble_cdh_solr(self):
  """CDH Solr cloud mock: pre-6 Solr on HDFS, ZK with a /solr chroot."""
  solr = SolrClient(self.user, api=MockSolrCdhCloudHdfsApi())
  solr._reset_properties()

  assert_true(solr.is_solr_cloud_mode())
  assert_false(solr.is_solr_six_or_more())
  assert_true(solr.is_solr_with_hdfs())
  assert_equal('hue.com:2181/solr', solr.get_zookeeper_host())
def importer_submit(request):
  """Dispatch an import job (stream, index, database, sqoop or table) from the wizard.

  Also relocates a plain-HDFS source file into a scratch dir when importing
  into a managed table, and records an EXPORT audit entry on the request.
  """
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if source['inputFormat'] == 'file':
    if source['path']:
      path = urllib.unquote(source['path'])
      source['path'] = request.fs.netnormpath(path)
      parent_path = request.fs.parent_path(path)
      stats = request.fs.stats(parent_path)
      split = urlparse(path)
      # Only for HDFS, import data and non-external table
      if split.scheme in ('', 'hdfs') and destination['importData'] and destination['useDefaultLocation'] and oct(stats["mode"])[-1] != '7':
        # Parent dir is not world-writable: move the file to a scratch dir first.
        # Fixed: '00777' is a Python-2-only octal literal (syntax error on Py3);
        # '0o777' has the identical value and is valid on both.
        user_scratch_dir = request.fs.get_home_dir() + '/.scratchdir'
        request.fs.do_as_user(request.user, request.fs.mkdir, user_scratch_dir, 0o777)
        request.fs.do_as_user(request.user, request.fs.rename, source['path'], user_scratch_dir)
        source['path'] = user_scratch_dir + '/' + source['path'].split('/')[-1]

  if destination['ouputFormat'] in ('database', 'table'):
    destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

  if source['inputFormat'] == 'stream':
    job_handle = _envelope_job(request, source, destination, start_time=start_time, lib_path=destination['indexerJobLibPath'])
  elif destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob']:
      # Batch morphline job path.
      _convert_format(source["format"], inverse=True)
      job_handle = _large_indexing(request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    else:
      # Direct (small) indexing path.
      client = SolrClient(request.user)
      job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  elif source['inputFormat'] == 'altus':
    # BDR copy or DistCP + DDL + Sentry DDL copy
    pass
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  request.audit = {
    'operation': 'EXPORT',
    'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
      'username': request.user.username,
      'inputFormat': source['inputFormat'],
      'ouputFormat': destination['ouputFormat'],
      'name': destination['name'],
    },
    'allowed': True
  }

  return JsonResponse(job_handle)
def get_field_list(self, field_data, is_converting_types=False):
  """Flatten a nested field spec into a flat list, breadth-first.

  Each field's nested operation output fields are enqueued after the fields at
  the current level. When `is_converting_types` is set, each field's type is
  ported in place via SolrClient._port_field_types.

  :param field_data: iterable of field dicts, each with 'operations' -> [{'fields': [...]}].
  :return: flat list of the same field dicts, in BFS order.
  """
  fields = []
  queue = deque(field_data)

  while queue:  # idiomatic truthiness check instead of len(queue)
    curr_field = queue.popleft()
    # Removed dead self-assignment: curr_field['type'] = curr_field['type']
    if is_converting_types:
      SolrClient._port_field_types(curr_field)
    fields.append(curr_field)

    for operation in curr_field["operations"]:
      queue.extend(operation["fields"])

  return fields
def setUp(self):
  """Per-test fixture: non-superuser client, a mocked CDH cloud SolrClient,
  and the new-indexer flag forced on."""
  self.c = make_logged_in_client(is_superuser=False)
  grant_access("test", "test", "indexer")
  add_to_group("test")
  self.user = User.objects.get(username='******')

  self.solr_client = SolrClient(self.user, api=MockSolrCdhCloudHdfsApi())

  # Saved so tearDown can restore the original config value.
  self.finish = ENABLE_NEW_INDEXER.set_for_testing(True)
def importer_submit(request):
  """Dispatch an import job (index, database or table) from the wizard.

  The non-job index path indexes the file contents directly into Solr,
  creating the index and a generated row id field when needed.
  """
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob']:
      # Batch morphline job path.
      _convert_format(source["format"], inverse=True)
      job_handle = _index(request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    else:
      # Direct indexing path: read the file and POST it to Solr.
      client = SolrClient(request.user)

      # 'x and x[0] or None' is the old-style conditional for first-or-None.
      unique_key_field = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
      df = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
      kwargs = {}

      stats = request.fs.stats(source["path"])
      if stats.size > MAX_UPLOAD_SIZE:
        raise PopupException(_('File size is too large to handle!'))

      indexer = MorphlineIndexer(request.user, request.fs)
      fields = indexer.get_kept_field_list(source['columns'])
      if not unique_key_field:
        # No unique key chosen: synthesize one and ask Solr to fill it per row.
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

      if not client.exists(index_name):
        client.create_index(name=index_name, fields=fields, unique_key_field=unique_key_field, df=df)

      data = request.fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
      client.index(name=index_name, data=data, **kwargs)

      job_handle = {'status': 0, 'on_success_url': reverse('search:browse', kwargs={'name': index_name})}
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  return JsonResponse(job_handle)
def importer_submit(request):
  """Dispatch an import job (index, database, sqoop or table) from the wizard,
  recording an EXPORT audit entry on the request."""
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if source['inputFormat'] == 'file':
    if source['path']:
      # Paths arrive URL-encoded from the browser.
      path = urllib.unquote(source['path'])
      source['path'] = request.fs.netnormpath(path)

  if destination['ouputFormat'] in ('database', 'table'):
    destination['nonDefaultLocation'] = request.fs.netnormpath(
      destination['nonDefaultLocation']) if destination[
      'nonDefaultLocation'] else destination['nonDefaultLocation']

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob']:
      # Batch morphline job path.
      _convert_format(source["format"], inverse=True)
      job_handle = _large_indexing(
        request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    else:
      # Direct (small) indexing path.
      client = SolrClient(request.user)
      job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  request.audit = {
    'operation': 'EXPORT',
    'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
      'username': request.user.username,
      'inputFormat': source['inputFormat'],
      'ouputFormat': destination['ouputFormat'],
      'name': destination['name'],
    },
    'allowed': True
  }

  return JsonResponse(job_handle)
def update_data_from_hive(self, collection_or_core_name, columns, fetch_handle, indexing_options=None):
  """Stream rows from a Hive fetch handle into a Solr index.

  Pulls batches of FETCH_BATCH rows from `fetch_handle` and indexes them as
  CSV, up to MAX_ROWS rows total. Returns the number of rows indexed
  (header rows included in the count, as before).
  """
  MAX_ROWS = 10000
  FETCH_BATCH = 1000

  if indexing_options is None:
    indexing_options = {}

  client = SolrClient(self.user)
  row_count = 0
  has_more = True

  try:
    while has_more and row_count < MAX_ROWS:
      # The second argument asks the handle to (re)start fetching on the first call.
      result = fetch_handle(FETCH_BATCH, row_count == 0)
      has_more = result['has_more']

      if not result['data']:
        continue

      dataset = tablib.Dataset()
      dataset.append(columns)
      for row in result['data']:
        # Map None/empty cells to '' (or 0 for numeric cells).
        cleaned = [cell if cell else (0 if isinstance(cell, numbers.Number) else '') for cell in row]
        dataset.append(cleaned)

      if not client.index(name=collection_or_core_name, data=dataset.csv, **indexing_options):
        raise PopupException(_('Could not index the data. Check error logs for more info.'))

      row_count += len(dataset)
  except Exception as e:
    raise PopupException(_('Could not update index: %s') % e)

  return row_count
def datasets(self, show_all=False):
  """Return the names of the Solr indexes visible to this user."""
  client = SolrClient(user=self.user)

  if not client.is_solr_cloud_mode():  # True if non Solr Cloud
    show_all = True

  names = []
  for index in client.get_indexes(include_cores=show_all):
    names.append(index['name'])
  return names