Example 1
def _large_indexing(request,
                    file_format,
                    collection_name,
                    query=None,
                    start_time=None,
                    lib_path=None,
                    destination=None):
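    # Batch indexing into Solr: create the target index when missing, resolve the
    # input path from the source type, then run a Morphline MapReduce job
    # (Flume and other stream inputs are delegated to dedicated indexers instead).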
    indexer = MorphlineIndexer(request.user, request.fs)

    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        schema_fields += [{"name": unique_field, "type": "string"}]

    client = SolrClient(user=request.user)

    if not client.exists(collection_name) and not request.POST.get('show_command'):  # if destination['isTargetExisting']:
        client.create_index(
            name=collection_name,
            fields=request.POST.get('fields', schema_fields),
            unique_key_field=unique_field
            # No df currently
        )
    else:
        # TODO: check if format matches
        pass

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
        indexer = FlumeIndexer(user=request.user)
        if request.POST.get('show_command'):
            configs = indexer.generate_config(file_format, destination)
            return {'status': 0, 'commands': configs[-1]}
        else:
            return indexer.start(collection_name, file_format, destination)
    elif file_format['inputFormat'] == 'stream':
        return _envelope_job(request,
                             file_format,
                             destination,
                             start_time=start_time,
                             lib_path=lib_path)
    elif file_format['inputFormat'] == 'file':
        input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
    else:
        # Any other input format reaches run_morphline with input_path=None
        input_path = None

    morphline = indexer.generate_morphline_config(collection_name,
                                                  file_format,
                                                  unique_field,
                                                  lib_path=lib_path)

    return indexer.run_morphline(request,
                                 collection_name,
                                 morphline,
                                 input_path,
                                 query,
                                 start_time=start_time,
                                 lib_path=lib_path)
Example 2
    def datasets(self, show_all=False):
        client = SolrClient(user=self.user)
        # Outside Solr Cloud mode, cores are all there is, so list everything
        show_all = show_all or not client.is_solr_cloud_mode()
        return [index['name'] for index in client.get_indexes(include_cores=show_all)]
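
A minimal usage sketch (hedged: DashboardApi is a placeholder name for whatever class hosts this method; only the call shape comes from the snippet above):

    api = DashboardApi(user=request.user)  # hypothetical host class
    api.datasets()               # index names; cores included outside Solr Cloud mode
    api.datasets(show_all=True)  # always include cores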
Example 3
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug; the misspelled 'ouputFormat' key is intentional and used throughout
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            source['path'] = request.fs.netnormpath(path)

    if destination['ouputFormat'] in ('database', 'table'):
        if destination['nonDefaultLocation']:
            destination['nonDefaultLocation'] = request.fs.netnormpath(
                destination['nonDefaultLocation'])

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif source['inputFormat'] in ('stream', 'connector') or destination['ouputFormat'] == 'stream':
        job_handle = _envelope_job(request,
                                   source,
                                   destination,
                                   start_time=start_time,
                                   lib_path=destination['indexerJobLibPath'])
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
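        # Note: job_handle is never assigned on this branch, so the
        # JsonResponse(job_handle) call below would raise NameError if reached.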
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
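
For reference, a sketch of the POST payload this view parses. The top-level field names and the keys read from them come from the code above; the concrete values are illustrative only:

    payload = {
        'source': json.dumps({'inputFormat': 'file', 'path': '/user/demo/data.csv'}),
        'destination': json.dumps({'outputFormat': 'index', 'name': 'demo_index'}),
        'start_time': '-1',
    }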
Example 4
File: api3.py Project: ranade1/hue
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    file_encoding = None
    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib_unquote(source['path'])
            if path.endswith(('xls', 'xlsx')):
                path = excel_to_csv_file_name_change(path)
            source['path'] = request.fs.netnormpath(path)
            stream = request.fs.open(path)
            file_encoding = check_encoding(stream.read(10000))

    if destination['ouputFormat'] in ('database', 'table') and request.fs is not None:
        if destination['nonDefaultLocation']:
            destination['nonDefaultLocation'] = request.fs.netnormpath(
                destination['nonDefaultLocation'])

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif destination['ouputFormat'] == 'stream-table':
        args = {
            'source': source,
            'destination': destination,
            'start_time': start_time,
            'dry_run': request.POST.get('show_command')
        }
        api = FlinkIndexer(request.user, request.fs)

        job_nb = api.create_table_from_kafka(**args)

        if request.POST.get('show_command'):
            job_handle = {'status': 0, 'commands': job_nb}
        else:
            job_handle = job_nb.execute(request, batch=False)
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'big-table':
        args = {
            'request': request,
            'source': source,
            'destination': destination,
            'start_time': start_time,
            'dry_run': request.POST.get('show_command')
        }
        api = PhoenixIndexer(request.user, request.fs)

        job_nb = api.create_table_from_file(**args)

        if request.POST.get('show_command'):
            job_handle = {'status': 0, 'commands': job_nb}
        else:
            job_handle = job_nb.execute(request, batch=False)
    else:
        if source['inputFormat'] == 'localfile':
            job_handle = _create_table_from_local(request, source, destination,
                                                  start_time)
        else:
            # TODO: if inputFormat is 'stream' and tableFormat is 'kudu' --> create Table only
            job_handle = _create_table(request, source, destination,
                                       start_time, file_encoding)

    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
Example 5
def _envelope_job(request,
                  file_format,
                  destination,
                  start_time=None,
                  lib_path=None):
    collection_name = destination['name']
    indexer = EnvelopeIndexer(request.user, request.fs)

    lib_path = '/tmp/envelope-0.5.0.jar'  # hardcoded here, overriding the lib_path argument
    input_path = None

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'file':
        input_path = '${nameNode}%s' % file_format["path"]
        properties = {'format': 'json'}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'sfdc':
            properties = {
                'streamSelection': file_format['streamSelection'],
                'streamUsername': file_format['streamUsername'],
                'streamPassword': file_format['streamPassword'],
                'streamToken': file_format['streamToken'],
                'streamEndpointUrl': file_format['streamEndpointUrl'],
                'streamObject': file_format['streamObject'],
            }
        elif file_format['streamSelection'] == 'kafka':
            manager = ManagerApi()
            properties = {
                "brokers": manager.get_kafka_brokers(),
                "output_table": "impala::%s" % collection_name,
                "topics": file_format['kafkaSelectedTopics'],
                "kafkaFieldType": file_format['kafkaFieldType'],
                "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
                "kafkaFieldNames": file_format['kafkaFieldNames'],
                "kafkaFieldTypes": file_format['kafkaFieldTypes']
            }

        if destination['outputFormat'] == 'table':
            if destination['isTargetExisting']:
                # Todo: check if format matches
                pass
            else:
                sql = SQLIndexer(user=request.user, fs=request.fs) \
                    .create_table_from_a_file(file_format, destination).get_str()
                print(sql)
            if destination['tableFormat'] == 'kudu':
                manager = ManagerApi()
                properties["output_table"] = "impala::%s" % collection_name
                properties["kudu_master"] = manager.get_kudu_master()
            else:
                properties['output_table'] = collection_name
        elif destination['outputFormat'] == 'file':
            properties['path'] = file_format["path"]
            properties['format'] = file_format['tableFormat']  # or csv
        elif destination['outputFormat'] == 'index':
            properties['collectionName'] = collection_name
            properties['connection'] = SOLR_URL.get()
            if destination['isTargetExisting']:
                # Todo: check if format matches
                pass
            else:
                client = SolrClient(request.user)
                kwargs = {}
                _create_solr_collection(request.user, request.fs, client,
                                        destination, collection_name, kwargs)

    # Note: properties is only bound on the 'file' and 'stream' branches above,
    # so a 'table' input would reach this line with properties undefined.
    properties["app_name"] = 'Data Ingest'
    properties["inputFormat"] = file_format['inputFormat']
    properties["ouputFormat"] = destination['ouputFormat']
    properties["streamSelection"] = file_format["streamSelection"]

    envelope = indexer.generate_config(properties)

    return indexer.run(request,
                       collection_name,
                       envelope,
                       input_path,
                       start_time=start_time,
                       lib_path=lib_path)
Example 6
    def handle(self, *args, **options):
        self.user = install_sample_user()
        self.client = SolrClient(self.user)

        collection = options['data']

        if collection == 'twitter_demo':
            LOG.info("Installing twitter collection")
            path = os.path.abspath(
                os.path.join(
                    os.path.dirname(__file__),
                    '../../../../../../../apps/search/examples/collections/solr_configs_twitter_demo/index_data.csv'
                ))
            self._setup_collection_from_csv(
                {
                    'name': 'twitter_demo',
                    'fields': self._parse_fields(path, fieldtypes={
                        'source': 'string',
                        'username': '******',
                    }),
                    'uniqueKeyField': 'id',
                    'df': 'text'
                }, path)
            LOG.info("Twitter collection successfully installed")

        if collection == 'yelp_demo':
            LOG.info("Installing yelp collection")
            path = os.path.abspath(
                os.path.join(
                    os.path.dirname(__file__),
                    '../../../../../../../apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv'
                ))
            self._setup_collection_from_csv(
                {
                    'name': 'yelp_demo',
                    'fields': self._parse_fields(path, fieldtypes={
                        'name': 'string',
                    }),
                    'uniqueKeyField': 'id',
                    'df': 'text'
                }, path)
            LOG.info("Yelp collection successfully installed")

        if collection == 'log_analytics_demo':
            LOG.info("Installing logs collection")
            path = os.path.abspath(
                os.path.join(
                    os.path.dirname(__file__),
                    '../../../../../../../apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv'
                ))
            self._setup_collection_from_csv(
                {
                    'name': 'log_analytics_demo',
                    'fields': self._parse_fields(path, fieldtypes={
                        'region_code': 'string',
                        'referer': 'string',
                        'user_agent': 'string'
                    }),
                    'uniqueKeyField': 'id',
                    'df': 'record'
                }, path)
            LOG.info("Logs collection successfully installed")
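
A hedged note on the field shape: judging from Example 1, where schema fields are dicts like {"name": ..., "type": "string"}, _parse_fields presumably returns a list of the same form, with the fieldtypes mapping overriding the type guessed for a column:

    fields = self._parse_fields(path, fieldtypes={'source': 'string'})
    # e.g. [{'name': 'id', 'type': 'long'}, {'name': 'source', 'type': 'string'}, ...]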
Example 7
    def run_morphline(self,
                      request,
                      collection_name,
                      morphline,
                      input_path,
                      query=None,
                      start_time=None,
                      lib_path=None):
        workspace_path = self._upload_workspace(morphline)

        task = make_notebook(name=_('Indexing into %s') % collection_name,
                             editor_type='notebook',
                             on_success_url=reverse(
                                 'search:browse',
                                 kwargs={'name': collection_name}),
                             pub_sub_url='assist.collections.refresh',
                             is_task=True,
                             is_notebook=True,
                             last_executed=start_time)

        if query:
            q = Notebook(document=Document2.objects.get_by_uuid(user=self.user,
                                                                uuid=query))
            notebook_data = q.get_data()
            snippet = notebook_data['snippets'][0]

            api = get_api(request, snippet)

            destination = '__hue_%s' % notebook_data['uuid'][:4]
            location = '/user/%s/__hue-%s' % (request.user,
                                              notebook_data['uuid'][:4])
            sql, _success_url = api.export_data_as_table(notebook_data,
                                                         snippet,
                                                         destination,
                                                         is_temporary=True,
                                                         location=location)
            input_path = '${nameNode}%s' % location

            task.add_hive_snippet(snippet['database'], sql)

        client = SolrClient(self.user)

        extra_args = ['-Dmapreduce.job.user.classpath.first=true'] if client.is_solr_six_or_more() else []

        task.add_java_snippet(
            clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
            app_jar=lib_path if lib_path is not None else CONFIG_INDEXER_LIBS_PATH.get(),
            arguments=extra_args + [
                u'--morphline-file',
                u'morphline.conf',
                u'--output-dir',
                u'${nameNode}/user/%s/indexer' % self.username,
                u'--log4j',
                u'log4j.properties',
                u'--go-live',
                u'--zk-host',
                client.get_zookeeper_host(),
                u'--collection',
                collection_name,
                input_path,
            ],
            files=[{
                u'path': u'%s/log4j.properties' % workspace_path,
                u'type': u'file'
            }, {
                u'path': u'%s/morphline.conf' % workspace_path,
                u'type': u'file'
            }])

        return task.execute(request, batch=True)
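
The java snippet above amounts to a MapReduceIndexerTool invocation along these lines (a hedged sketch: jar name, ZooKeeper host, collection and input path are illustrative; the flags come from the arguments list, and the -D option is only added on Solr 6+):

    hadoop jar indexer-libs.jar org.apache.solr.hadoop.MapReduceIndexerTool \
        -Dmapreduce.job.user.classpath.first=true \
        --morphline-file morphline.conf \
        --output-dir ${nameNode}/user/hue/indexer \
        --log4j log4j.properties \
        --go-live \
        --zk-host zk1:2181/solr \
        --collection demo_index \
        ${nameNode}/user/demo/data.csv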
Example 8
File: api.py Project: ziq211/hue
def collections(request):
    client = SolrClient(user=request.user)

    response = {'status': 0, 'collections': client.get_indexes()}

    return JsonResponse(response)
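
The response shape, inferred directly from the view above (each entry is whatever dict SolrClient.get_indexes() returns; 'name' is the only key the other examples rely on):

    {'status': 0, 'collections': [{'name': 'twitter_demo'}, {'name': 'yelp_demo'}]}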
Example 9
    def datasets(self, show_all=False):
        client = SolrClient(user=self.user)
        return [index['name'] for index in client.get_indexes(include_cores=show_all)]
Example 10
    def is_solr_cloud_mode(self):
        client = SolrClient(self.user)
        return client.is_solr_cloud_mode()
Example 11
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination', '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if source['inputFormat'] == 'file':
        if source['path']:
            path = urllib.unquote(source['path'])
            source['path'] = request.fs.netnormpath(path)
            parent_path = request.fs.parent_path(path)
            stats = request.fs.stats(parent_path)
            split = urlparse(path)
            # Only for HDFS, import data and non-external table
            if (split.scheme in ('', 'hdfs') and destination['importData']
                    and destination['useDefaultLocation']
                    and oct(stats["mode"])[-1] != '7'):
                user_scratch_dir = request.fs.get_home_dir() + '/.scratchdir'
                request.fs.do_as_user(request.user, request.fs.mkdir, user_scratch_dir, 00777)
                request.fs.do_as_user(request.user, request.fs.rename, source['path'], user_scratch_dir)
                source['path'] = user_scratch_dir + '/' + source['path'].split('/')[-1]

    if destination['ouputFormat'] in ('database', 'table'):
        if destination['nonDefaultLocation']:
            destination['nonDefaultLocation'] = request.fs.netnormpath(
                destination['nonDefaultLocation'])

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif source['inputFormat'] in ('stream', 'sfdc') or destination['ouputFormat'] == 'stream':
        job_handle = _envelope_job(request,
                                   source,
                                   destination,
                                   start_time=start_time,
                                   lib_path=destination['indexerJobLibPath'])
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy
        pass
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s' % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
Example 12
    def __init__(self, username, fs=None, jt=None, solr_client=None):
        self.fs = fs
        self.jt = jt
        self.username = username
        self.user = User.objects.get(username=username)  # To clean
        self.solr_client = solr_client if solr_client is not None else SolrClient(self.user)
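
A hedged construction sketch (the owning class is not named in this snippet, so SomeIndexer is a placeholder; note that unlike Example 1, this constructor takes a username string rather than a User object):

    indexer = SomeIndexer('demo_user', fs=request.fs)  # hypothetical class name
    solr = indexer.solr_client  # defaults to SolrClient(indexer.user) when not injected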