Code example #1
0
def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Index the described source into the Solr collection `collection_name`.

  Creates the target index when it does not exist yet, resolves the input
  path from the source description, then submits the Morphline run.
  For a live HiveServer2 handle ('hs2_handle') the data is pushed directly
  instead and that call's result is returned.
  """
  morphline_indexer = MorphlineIndexer(request.user, request.fs)

  uniq_field = morphline_indexer.get_unique_field(file_format)

  # Fields to create the index with; append the generated id field when the
  # unique key does not exist in the source itself.
  fields = morphline_indexer.get_kept_field_list(file_format['columns'])
  if morphline_indexer.is_unique_generated(file_format):
    fields.append({"name": uniq_field, "type": "string"})

  solr = SolrClient(user=request.user)
  if not solr.exists(collection_name):
    solr.create_index(
      name=collection_name,
      fields=request.POST.get('fields', fields),
      unique_key_field=uniq_field
    )

  input_format = file_format['inputFormat']

  if input_format == 'hs2_handle':
    # Stream rows straight from an open HiveServer2 result handle.
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])

  if input_format == 'table':
    table_metadata = dbms.get(request.user).get_table(
        database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  morphline = morphline_indexer.generate_morphline_config(collection_name, file_format, uniq_field, lib_path=lib_path)

  return morphline_indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
Code example #2
0
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Run a large-scale (batch) Morphline indexing job into Solr.

  Ensures the target collection exists, resolves the input path from the
  source description (Hive table location or HDFS file) and submits the
  Morphline workflow, returning its job handle.
  """
  morphline_indexer = MorphlineIndexer(request.user, request.fs)

  uniq_field = morphline_indexer.get_unique_field(file_format)

  fields = morphline_indexer.get_kept_field_list(file_format['columns'])
  if morphline_indexer.is_unique_generated(file_format):
    # The unique key is synthesized during indexing, so declare it in the schema.
    fields.append({"name": uniq_field, "type": "string"})

  solr = SolrClient(user=request.user)
  if not solr.exists(collection_name):
    solr.create_index(
      name=collection_name,
      fields=request.POST.get('fields', fields),
      unique_key_field=uniq_field
      # No df currently
    )

  input_format = file_format['inputFormat']
  if input_format == 'table':
    table_metadata = dbms.get(request.user).get_table(
        database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    # The path arrives URL-encoded from the frontend; decode before building the URI.
    input_path = '${nameNode}%s' % urllib.unquote(file_format["path"])
  else:
    input_path = None

  morphline = morphline_indexer.generate_morphline_config(collection_name, file_format, uniq_field, lib_path=lib_path)

  return morphline_indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
Code example #3
0
def importer_submit(request):
    """Importer wizard entry point: dispatch the submitted source to an
    index, database or table creation job and return its handle as JSON.

    Fix: the 'destination' POST payload was previously json.loads-ed twice
    (once just to read 'outputFormat'); it is now parsed a single time.
    """
    source = json.loads(request.POST.get('source', '{}'))
    destination = json.loads(request.POST.get('destination', '{}'))
    # Workaround a very weird bug: the format is re-stored under this
    # (deliberately misspelled) key, which the dispatch below reads.
    destination['ouputFormat'] = destination['outputFormat']
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob']:
            # Heavy-weight path: run a Morphline indexing job.
            _convert_format(source["format"], inverse=True)
            job_handle = _index(request,
                                source,
                                index_name,
                                start_time=start_time,
                                lib_path=destination['indexerJobLibPath'])
        else:
            # Light-weight path: read the file and index it directly via Solr.
            client = SolrClient(request.user)
            unique_key_field = (destination['indexerDefaultField'] and
                                destination['indexerDefaultField'][0]) or None
            df = (destination['indexerPrimaryKey'] and
                  destination['indexerPrimaryKey'][0]) or None
            kwargs = {}

            stats = request.fs.stats(source["path"])
            if stats.size > MAX_UPLOAD_SIZE:
                raise PopupException(_('File size is too large to handle!'))

            indexer = MorphlineIndexer(request.user, request.fs)
            fields = indexer.get_kept_field_list(source['columns'])
            if not unique_key_field:
                # No user-provided key: generate a row id column.
                unique_key_field = 'hue_id'
                fields += [{"name": unique_key_field, "type": "string"}]
                kwargs['rowid'] = unique_key_field

            if not client.exists(index_name):
                client.create_index(name=index_name,
                                    fields=fields,
                                    unique_key_field=unique_key_field,
                                    df=df)

            data = request.fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
            client.index(name=index_name, data=data, **kwargs)

            job_handle = {
                'status': 0,
                'on_success_url': reverse('search:browse',
                                          kwargs={'name': index_name})
            }
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    return JsonResponse(job_handle)
Code example #4
0
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  """Submit a large-scale indexing job into the Solr collection `collection_name`.

  Depending on file_format['inputFormat'] this either:
    - runs a Morphline batch job over an HDFS file or a Hive table's location,
    - starts a Flume pipeline (inputFormat 'stream' with streamSelection 'flume'),
    - or submits an Envelope job for other 'stream' inputs.

  NOTE(review): argument semantics below are inferred from usage here —
  confirm against callers:
    request: carries .user, .fs and POST data ('show_command', 'fields').
    file_format: dict describing the input source and its columns.
    collection_name: target Solr collection name.
    query, start_time, lib_path: forwarded to the Morphline runner.
    destination: destination descriptor used by the streaming branches.
  """
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The unique key is not present in the source; add it to the schema.
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  # Only create the index for a real submission, not for a 'show_command' dry run.
  if not client.exists(collection_name) and not request.POST.get('show_command'): # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    # NOTE: `indexer` is rebound here; both sub-branches return, so the
    # Morphline code below never runs for the Flume case.
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    # The path is URL-decoded before being embedded in the nameNode URI.
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
      request,
      collection_name,
      morphline,
      input_path,
      query,
      start_time=start_time,
      lib_path=lib_path
  )
Code example #5
0
File: indexer_setup.py  Project: ziq211/hue
class Command(BaseCommand):
    """
    Install the Solr example collections, but do not overwrite existing ones.
    """

    def handle(self, *args, **options):
        """Install the demo collection selected by options['data']."""
        self.user = install_sample_user()
        self.client = SolrClient(self.user)

        collection = options['data']

        if collection == 'twitter_demo':
            # NOTE(review): the 'username' fieldtype was masked ('******') in
            # this copy of the file; restored to 'string' to match the other
            # string-typed override fields — confirm against upstream.
            self._install('twitter_demo', 'Twitter',
                          'solr_configs_twitter_demo/index_data.csv',
                          fieldtypes={
                              'source': 'string',
                              'username': 'string',
                          },
                          df='text')

        if collection == 'yelp_demo':
            self._install('yelp_demo', 'Yelp',
                          'solr_configs_yelp_demo/index_data.csv',
                          fieldtypes={
                              'name': 'string',
                          },
                          df='text')

        if collection == 'log_analytics_demo':
            self._install('log_analytics_demo', 'Logs',
                          'solr_configs_log_analytics_demo/index_data.csv',
                          fieldtypes={
                              'region_code': 'string',
                              'referer': 'string',
                              'user_agent': 'string'
                          },
                          df='record')

    def _install(self, name, label, csv_subpath, fieldtypes, df):
        """Create and load one demo collection from its packaged CSV file.

        `label` is only used for log messages; `fieldtypes` overrides the
        guessed type for the named fields; `df` is the default search field.
        """
        LOG.info("Installing %s collection" % label.lower())
        path = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__),
                '../../../../../../../apps/search/examples/collections/' + csv_subpath
            ))
        self._setup_collection_from_csv(
            {
                'name': name,
                'fields': self._parse_fields(path, fieldtypes=fieldtypes),
                'uniqueKeyField': 'id',
                'df': df
            }, path)
        LOG.info("%s collection successfully installed" % label)

    def _setup_collection_from_csv(self, collection, path):
        """Create the index described by `collection` and load the CSV at
        `path` into it. Existing collections are left untouched."""
        if not self.client.exists(collection['name']):
            self.client.create_index(
                name=collection['name'],
                fields=collection['fields'],
                unique_key_field=collection['uniqueKeyField'],
                df=collection['df'])

            with open(path) as fh:
                self.client.index(collection['name'], fh.read())

    def _parse_fields(self,
                      path,
                      separator=',',
                      quote_character='"',
                      fieldtypes=None):
        """Infer [{'name': ..., 'type': ...}] field definitions from a
        separated-value file, sampling up to 51 rows for type guessing.

        `fieldtypes` maps a field name to a type that overrides the guess.
        (Fix: was a mutable default argument `fieldtypes={}`.)
        """
        fieldtypes = fieldtypes or {}
        with open(path) as fh:
            field_generator = utils.field_values_from_separated_file(
                fh, separator, quote_character)
            # First row yields the field names; chain it back in so the type
            # guesser still sees every row.
            row = next(field_generator)
            field_names = list(row.keys())
            field_types = utils.get_field_types(
                (list(row.values())
                 for row in itertools.chain([row], field_generator)),
                iterations=51)
            return [{
                'name': name,
                # A truthy override wins, otherwise keep the guessed type.
                'type': fieldtypes.get(name) or guessed
            } for name, guessed in zip(field_names, field_types)]