def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name) and not request.POST.get('show_command'):  # if destination['isTargetExisting']:
    client.create_index(
        name=collection_name,
        fields=request.POST.get('fields', schema_fields),
        unique_key_field=unique_field
        # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
      request,
      collection_name,
      morphline,
      input_path,
      query,
      start_time=start_time,
      lib_path=lib_path
  )
def datasets(self, show_all=False):  # True if non Solr Cloud
  client = SolrClient(user=self.user)
  show_all = show_all or not client.is_solr_cloud_mode()
  return [index['name'] for index in client.get_indexes(include_cores=show_all)]
def importer_submit(request):
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if source['inputFormat'] == 'file':
    if source['path']:
      path = urllib_unquote(source['path'])
      source['path'] = request.fs.netnormpath(path)

  if destination['ouputFormat'] in ('database', 'table'):
    destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
        if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
      _convert_format(source["format"], inverse=True)
      job_handle = _large_indexing(
          request,
          source,
          index_name,
          start_time=start_time,
          lib_path=destination['indexerJobLibPath'],
          destination=destination
      )
    else:
      client = SolrClient(request.user)
      job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
  elif source['inputFormat'] in ('stream', 'connector') or destination['ouputFormat'] == 'stream':
    job_handle = _envelope_job(request, source, destination, start_time=start_time, lib_path=destination['indexerJobLibPath'])
  elif source['inputFormat'] == 'altus':
    # BDR copy or DistCP + DDL + Sentry DDL copy
    pass
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('database', 'file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
          'username': request.user.username,
          'inputFormat': source['inputFormat'],
          'ouputFormat': destination['ouputFormat'],
          'name': destination['name'],
      },
      'allowed': True
  }

  return JsonResponse(job_handle)
def importer_submit(request):
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  file_encoding = None
  if source['inputFormat'] == 'file':
    if source['path']:
      path = urllib_unquote(source['path'])
      if path[-3:] == 'xls' or path[-4:] == 'xlsx':
        path = excel_to_csv_file_name_change(path)
      source['path'] = request.fs.netnormpath(path)
      stream = request.fs.open(path)
      file_encoding = check_encoding(stream.read(10000))

  if destination['ouputFormat'] in ('database', 'table') and request.fs is not None:
    destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
        if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
      _convert_format(source["format"], inverse=True)
      job_handle = _large_indexing(
          request,
          source,
          index_name,
          start_time=start_time,
          lib_path=destination['indexerJobLibPath'],
          destination=destination
      )
    else:
      client = SolrClient(request.user)
      job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
  elif destination['ouputFormat'] == 'stream-table':
    args = {
        'source': source,
        'destination': destination,
        'start_time': start_time,
        'dry_run': request.POST.get('show_command')
    }
    api = FlinkIndexer(request.user, request.fs)

    job_nb = api.create_table_from_kafka(**args)

    if request.POST.get('show_command'):
      job_handle = {'status': 0, 'commands': job_nb}
    else:
      job_handle = job_nb.execute(request, batch=False)
  elif source['inputFormat'] == 'altus':
    # BDR copy or DistCP + DDL + Sentry DDL copy
    pass
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('database', 'file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  elif destination['ouputFormat'] == 'big-table':
    args = {
        'request': request,
        'source': source,
        'destination': destination,
        'start_time': start_time,
        'dry_run': request.POST.get('show_command')
    }
    api = PhoenixIndexer(request.user, request.fs)

    job_nb = api.create_table_from_file(**args)

    if request.POST.get('show_command'):
      job_handle = {'status': 0, 'commands': job_nb}
    else:
      job_handle = job_nb.execute(request, batch=False)
  else:
    if source['inputFormat'] == 'localfile':
      job_handle = _create_table_from_local(request, source, destination, start_time)
    else:
      # TODO: if inputFormat is 'stream' and tableFormat is 'kudu' --> create Table only
      job_handle = _create_table(request, source, destination, start_time, file_encoding)

  request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
          'username': request.user.username,
          'inputFormat': source['inputFormat'],
          'ouputFormat': destination['ouputFormat'],
          'name': destination['name'],
      },
      'allowed': True
  }

  return JsonResponse(job_handle)
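# A minimal sketch of the POST body that importer_submit() above reads, reconstructed from the
# keys it accesses; the concrete values (paths, names, formats) are hypothetical examples only,
# not the full set of fields the importer UI sends.
import json

example_post_data = {
    'source': json.dumps({
        'inputFormat': 'file',                  # e.g. 'file', 'localfile', 'rdbms', 'stream', 'altus'
        'path': '/user/demo/sample.csv',        # hypothetical HDFS path, URL-quoted in practice
        'format': {'type': 'csv'},              # consumed by _convert_format() in the index branch
        'columns': [],
    }),
    'destination': json.dumps({
        'outputFormat': 'table',                # e.g. 'table', 'index', 'database', 'stream-table', 'big-table'
        'name': 'default.sample',               # target table or index name
        'nonDefaultLocation': None,
        'indexerRunJob': False,
        'indexerJobLibPath': None,
        'columns': [],
    }),
    'start_time': json.dumps(-1),
}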
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = '/tmp/envelope-0.5.0.jar'  # Hardcoded jar path; overrides the lib_path argument
  input_path = None

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
    properties = {'format': 'json'}
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'sfdc':
      properties = {
          'streamSelection': file_format['streamSelection'],
          'streamUsername': file_format['streamUsername'],
          'streamPassword': file_format['streamPassword'],
          'streamToken': file_format['streamToken'],
          'streamEndpointUrl': file_format['streamEndpointUrl'],
          'streamObject': file_format['streamObject'],
      }
    elif file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
          "brokers": manager.get_kafka_brokers(),
          "output_table": "impala::%s" % collection_name,
          "topics": file_format['kafkaSelectedTopics'],
          "kafkaFieldType": file_format['kafkaFieldType'],
          "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
          "kafkaFieldNames": file_format['kafkaFieldNames'],
          "kafkaFieldTypes": file_format['kafkaFieldTypes']
      }

  if destination['outputFormat'] == 'table':
    if destination['isTargetExisting']:
      # Todo: check if format matches
      pass
    else:
      sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).get_str()
      print(sql)
    if destination['tableFormat'] == 'kudu':
      manager = ManagerApi()
      properties["output_table"] = "impala::%s" % collection_name
      properties["kudu_master"] = manager.get_kudu_master()
    else:
      properties['output_table'] = collection_name
  elif destination['outputFormat'] == 'file':
    properties['path'] = file_format["path"]
    properties['format'] = file_format['tableFormat']  # or csv
  elif destination['outputFormat'] == 'index':
    properties['collectionName'] = collection_name
    properties['connection'] = SOLR_URL.get()
    if destination['isTargetExisting']:
      # Todo: check if format matches
      pass
    else:
      client = SolrClient(request.user)
      kwargs = {}
      _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)

  properties["app_name"] = 'Data Ingest'
  properties["inputFormat"] = file_format['inputFormat']
  properties["ouputFormat"] = destination['ouputFormat']
  properties["streamSelection"] = file_format["streamSelection"]

  envelope = indexer.generate_config(properties)

  return indexer.run(request, collection_name, envelope, input_path, start_time=start_time, lib_path=lib_path)
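# For reference, a sketch of the consolidated `properties` dict that the Kafka-to-Kudu path of
# _envelope_job() above would hand to EnvelopeIndexer.generate_config(); the broker, topic and
# field values are hypothetical placeholders.
example_envelope_properties = {
    'brokers': 'broker-1:9092,broker-2:9092',   # ManagerApi().get_kafka_brokers()
    'topics': 'web_logs',                       # file_format['kafkaSelectedTopics']
    'kafkaFieldType': 'delimited',
    'kafkaFieldDelimiter': ',',
    'kafkaFieldNames': 'id,message',
    'kafkaFieldTypes': 'int,string',
    'output_table': 'impala::web_logs',         # "impala::%s" % collection_name
    'kudu_master': 'kudu-master.example.com',   # ManagerApi().get_kudu_master()
    'app_name': 'Data Ingest',
    'inputFormat': 'stream',
    'ouputFormat': 'table',                     # key spelling follows the workaround used above
    'streamSelection': 'kafka',
}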
def handle(self, *args, **options):
  self.user = install_sample_user()
  self.client = SolrClient(self.user)

  collection = options['data']

  if collection == 'twitter_demo':
    LOG.info("Installing twitter collection")
    path = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            '../../../../../../../apps/search/examples/collections/solr_configs_twitter_demo/index_data.csv'
        )
    )
    self._setup_collection_from_csv(
        {
            'name': 'twitter_demo',
            'fields': self._parse_fields(path, fieldtypes={
                'source': 'string',
                'username': '******',
            }),
            'uniqueKeyField': 'id',
            'df': 'text'
        },
        path
    )
    LOG.info("Twitter collection successfully installed")

  if collection == 'yelp_demo':
    LOG.info("Installing yelp collection")
    path = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            '../../../../../../../apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv'
        )
    )
    self._setup_collection_from_csv(
        {
            'name': 'yelp_demo',
            'fields': self._parse_fields(path, fieldtypes={
                'name': 'string',
            }),
            'uniqueKeyField': 'id',
            'df': 'text'
        },
        path
    )
    LOG.info("Yelp collection successfully installed")

  if collection == 'log_analytics_demo':
    LOG.info("Installing logs collection")
    path = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            '../../../../../../../apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv'
        )
    )
    self._setup_collection_from_csv(
        {
            'name': 'log_analytics_demo',
            'fields': self._parse_fields(path, fieldtypes={
                'region_code': 'string',
                'referer': 'string',
                'user_agent': 'string'
            }),
            'uniqueKeyField': 'id',
            'df': 'record'
        },
        path
    )
    LOG.info("Logs collection successfully installed")
def run_morphline(self, request, collection_name, morphline, input_path, query=None, start_time=None, lib_path=None):
  workspace_path = self._upload_workspace(morphline)

  task = make_notebook(
      name=_('Indexing into %s') % collection_name,
      editor_type='notebook',
      on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
      pub_sub_url='assist.collections.refresh',
      is_task=True,
      is_notebook=True,
      last_executed=start_time
  )

  if query:
    q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
    notebook_data = q.get_data()
    snippet = notebook_data['snippets'][0]

    api = get_api(request, snippet)

    destination = '__hue_%s' % notebook_data['uuid'][:4]
    location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
    sql, _success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
    input_path = '${nameNode}%s' % location

    task.add_hive_snippet(snippet['database'], sql)

  client = SolrClient(self.user)

  extra_args = ['-Dmapreduce.job.user.classpath.first=true'] if client.is_solr_six_or_more() else []

  task.add_java_snippet(
      clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
      app_jar=lib_path if lib_path is not None else CONFIG_INDEXER_LIBS_PATH.get(),
      arguments=extra_args + [
          u'--morphline-file',
          u'morphline.conf',
          u'--output-dir',
          u'${nameNode}/user/%s/indexer' % self.username,
          u'--log4j',
          u'log4j.properties',
          u'--go-live',
          u'--zk-host',
          client.get_zookeeper_host(),
          u'--collection',
          collection_name,
          input_path,
      ],
      files=[
          {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
          {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
      ]
  )

  return task.execute(request, batch=True)
def collections(request):
  client = SolrClient(user=request.user)
  response = {'status': 0, 'collections': client.get_indexes()}
  return JsonResponse(response)
def datasets(self, show_all=False):
  client = SolrClient(user=self.user)
  return [index['name'] for index in client.get_indexes(include_cores=show_all)]
def is_solr_cloud_mode(self):
  client = SolrClient(self.user)
  return client.is_solr_cloud_mode()
def importer_submit(request):
  source = json.loads(request.POST.get('source', '{}'))
  outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if source['inputFormat'] == 'file':
    if source['path']:
      path = urllib.unquote(source['path'])
      source['path'] = request.fs.netnormpath(path)
      parent_path = request.fs.parent_path(path)
      stats = request.fs.stats(parent_path)
      split = urlparse(path)
      # Only for HDFS, import data and non-external table
      if split.scheme in ('', 'hdfs') and destination['importData'] and destination['useDefaultLocation'] and oct(stats["mode"])[-1] != '7':
        user_scratch_dir = request.fs.get_home_dir() + '/.scratchdir'
        request.fs.do_as_user(request.user, request.fs.mkdir, user_scratch_dir, 00777)
        request.fs.do_as_user(request.user, request.fs.rename, source['path'], user_scratch_dir)
        source['path'] = user_scratch_dir + '/' + source['path'].split('/')[-1]

  if destination['ouputFormat'] in ('database', 'table'):
    destination['nonDefaultLocation'] = request.fs.netnormpath(destination['nonDefaultLocation']) \
        if destination['nonDefaultLocation'] else destination['nonDefaultLocation']

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
      _convert_format(source["format"], inverse=True)
      job_handle = _large_indexing(
          request,
          source,
          index_name,
          start_time=start_time,
          lib_path=destination['indexerJobLibPath'],
          destination=destination
      )
    else:
      client = SolrClient(request.user)
      job_handle = _small_indexing(request.user, request.fs, client, source, destination, index_name)
  elif source['inputFormat'] in ('stream', 'sfdc') or destination['ouputFormat'] == 'stream':
    job_handle = _envelope_job(request, source, destination, start_time=start_time, lib_path=destination['indexerJobLibPath'])
  elif source['inputFormat'] == 'altus':
    # BDR copy or DistCP + DDL + Sentry DDL copy
    pass
  elif source['inputFormat'] == 'rdbms':
    if destination['outputFormat'] in ('database', 'file', 'table', 'hbase'):
      job_handle = run_sqoop(request, source, destination, start_time)
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  request.audit = {
      'operation': 'EXPORT',
      'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
          'username': request.user.username,
          'inputFormat': source['inputFormat'],
          'ouputFormat': destination['ouputFormat'],
          'name': destination['name'],
      },
      'allowed': True
  }

  return JsonResponse(job_handle)
def __init__(self, username, fs=None, jt=None, solr_client=None):
  self.fs = fs
  self.jt = jt
  self.username = username
  self.user = User.objects.get(username=username)  # To clean

  self.solr_client = solr_client if solr_client is not None else SolrClient(self.user)