Code example #1
  def update_data_from_hive(self, db, collection_or_core_name, database, table, columns, indexing_strategy='upload'):
    """
    Add hdfs path contents to index
    """
    # Run a custom hive query and post data to collection
    from beeswax.server import dbms
    import tablib

    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())
    if indexing_strategy == 'upload':
      table = db.get_table(database, table)
      hql = "SELECT %s FROM `%s.%s` %s" % (','.join(columns), database, table.name, db._get_browse_limit_clause(table))
      query = dbms.hql_query(hql)

      try:
        handle = db.execute_and_wait(query)

        if handle:
          result = db.fetch(handle, rows=100)
          db.close(handle)

          dataset = tablib.Dataset()
          dataset.append(columns)
          for row in result.rows():
            dataset.append(row)

          if not api.update(collection_or_core_name, dataset.csv, content_type='csv'):
            raise PopupException(_('Could not update index. Check error logs for more info.'))
        else:
          raise PopupException(_('Could not update index. Could not fetch any data from Hive.'))
      except Exception as e:
        raise PopupException(_('Could not update index.'), detail=e)
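
A minimal usage sketch, assuming Hue's usual request context; the collection, database, table, and column names are hypothetical, and obtaining the Hive client via dbms.get is an assumption about the caller's setup, not part of this method:

# Hedged sketch: the names below are placeholders, not Hue's actual wiring.
from beeswax.server import dbms

db = dbms.get(request.user)  # assumed way to obtain a Hive client
controller.update_data_from_hive(
    db,
    'tweets_demo',             # target Solr collection (hypothetical)
    'default',                 # Hive database (hypothetical)
    'tweets',                  # Hive table (hypothetical)
    ['id', 'user', 'message']  # columns to index (hypothetical)
)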
Code example #2
  def update_data_from_hdfs(self, fs, collection_or_core_name, fields, path, data_type='separated', indexing_strategy='upload', **kwargs):
    """
    Add hdfs path contents to index
    """
    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

    if indexing_strategy == 'upload':
      stats = fs.stats(path)
      if stats.size > MAX_UPLOAD_SIZE:
        raise PopupException(_('File size is too large to handle!'))
      else:
        # Look up the collection's schema fields; note this overwrites the
        # fields argument passed in, which is otherwise unused
        unique_key, fields = self.get_fields(collection_or_core_name)
        fields = [{'name': field, 'type': fields[field]['type']} for field in fields]

        fh = fs.open(path)
        if data_type == 'log':
          # Transform to JSON then update
          data = json.dumps([value for value in field_values_from_log(fh, fields)])
          content_type = 'json'
        elif data_type == 'separated':
          data = json.dumps([value for value in field_values_from_separated_file(fh, kwargs.get('separator', ','), kwargs.get('quote_character', '"'), fields)], indent=2)
          content_type = 'json'
        else:
          raise PopupException(_('Could not update index. Unknown type %s') % data_type)
        fh.close()
      if not api.update(collection_or_core_name, data, content_type=content_type):
        raise PopupException(_('Could not update index. Check error logs for more info.'))
    else:
      raise PopupException(_('Could not update index. Indexing strategy %s not supported.') % indexing_strategy)
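
A hedged usage sketch for the separated (delimited) path; the HDFS path and collection name are hypothetical, and request.fs is assumed to be the HDFS client Hue views typically expose:

# Hedged sketch: path and collection name are placeholders.
fs = request.fs  # assumed HDFS client from the request context
controller.update_data_from_hdfs(
    fs,
    'logs_demo',                 # target Solr collection (hypothetical)
    fields=[],                   # ignored: the method re-reads the schema
    path='/user/demo/logs.csv',  # hypothetical HDFS path
    data_type='separated',
    separator=',',
    quote_character='"'
)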
Code example #3
  def update_data_from_hive(self, db, collection_or_core_name, database, table, columns, indexing_strategy='upload'):
    """
    Add hdfs path contents to index
    """
    # Run a custom hive query and post data to collection
    from beeswax.server import dbms
    import tablib

    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())
    if indexing_strategy == 'upload':
      table = db.get_table(database, table)
      hql = "SELECT %s FROM `%s.%s` %s" % (','.join(columns), database, table.name, db._get_browse_limit_clause(table))
      query = dbms.hql_query(hql)
      handle = db.execute_and_wait(query)

      if handle:
        result = db.fetch(handle, rows=100)
        db.close(handle)

        dataset = tablib.Dataset()
        dataset.append(columns)
        for row in result.rows():
          dataset.append(row)

        if not api.update(collection_or_core_name, dataset.csv, content_type='csv'):
          raise PopupException(_('Could not update index. Check error logs for more info.'))
      else:
        raise PopupException(_('Could not update index. Could not fetch any data from Hive.'))
    else:
      raise PopupException(_('Could not update index. Indexing strategy %s not supported.') % indexing_strategy)
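
Unlike example #1, this variant drops the try/except around the query but rejects unknown strategies explicitly. A hedged sketch of that error path, with hypothetical names:

# Hedged sketch: an unsupported strategy raises PopupException.
try:
    controller.update_data_from_hive(db, 'tweets_demo', 'default', 'tweets',
                                     ['id'], indexing_strategy='mapreduce')
except PopupException as e:
    pass  # message: "Could not update index. Indexing strategy mapreduce not supported."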
Code example #4
    def update_data_from_hive(self, collection_or_core_name, columns,
                              fetch_handle):
        import numbers
        import tablib

        MAX_ROWS = 10000    # stop once this many rows have been indexed
        ROW_COUNT = 0
        FETCH_BATCH = 1000  # rows requested from fetch_handle per batch
        has_more = True

        api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

        try:
            while ROW_COUNT < MAX_ROWS and has_more:
                result = fetch_handle(FETCH_BATCH, ROW_COUNT == 0)
                has_more = result['has_more']

                if result['data']:
                    dataset = tablib.Dataset()
                    dataset.append(columns)
                    for i, row in enumerate(result['data']):
                        # Prefix a running row id and replace falsy cells
                        # with 0 for numeric types, '' otherwise
                        dataset.append([ROW_COUNT + i] + [
                            cell if cell else
                            (0 if isinstance(cell, numbers.Number) else '')
                            for cell in row
                        ])

                    if not api.update(collection_or_core_name,
                                      dataset.csv,
                                      content_type='csv'):
                        raise PopupException(
                            _('Could not update index. Check error logs for more info.'))

                # Count the data rows directly; len(dataset) raised a
                # NameError on an empty first batch and counted the header row
                ROW_COUNT += len(result['data'])
        except Exception as e:
            raise PopupException(_('Could not update index.'), detail=e)
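
From the call above, fetch_handle is a callable taking a batch size and a start-over flag and returning a dict with 'data' and 'has_more' keys. A minimal sketch of a compatible stub backed by an in-memory list, for exercising the method without a live Hive connection (all names are assumptions):

# Hedged sketch: a fetch_handle-compatible callable over a list of rows.
def make_fetch_handle(rows):
    state = {'offset': 0}

    def fetch_handle(rows_requested, start_over):
        if start_over:
            state['offset'] = 0
        start = state['offset']
        batch = rows[start:start + rows_requested]
        state['offset'] = start + len(batch)
        return {'data': batch, 'has_more': state['offset'] < len(rows)}

    return fetch_handle

# Usage: the method prepends a sequence number to each row, so the column
# list carries one extra leading name.
handle = make_fetch_handle([['1', 'alpha'], ['2', 'beta'], ['3', 'gamma']])
controller.update_data_from_hive('demo_collection', ['seq', 'id', 'msg'], handle)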