Example 1
    def _get_loader(self, app):
        if app in self.loaders:
            return self.loaders[app]

        # Lazily find a temp dir for module_dir.
        # This laziness is important because at initialization time
        # we might still be running as root during desktop startup
        # and thus the temp dir would be owned as root, not the
        # unprivileged user!
        if self.module_dir is None:
            self.module_dir = tempfile.mkdtemp()  # TODO(todd) configurable?
        app_module = __import__(app)
        app_dir = os.path.dirname(app_module.__file__)
        app_template_dir = os.path.join(app_dir, 'templates')

        loader = TemplateLookup(
            directories=[app_template_dir, self.desktop_template_dir],
            module_directory=os.path.join(self.module_dir, app),
            output_encoding=i18n.get_site_encoding(),
            input_encoding=i18n.get_site_encoding(),
            encoding_errors=ENCODING_ERRORS,
            default_filters=['unicode', 'escape'],
            imports=IMPORTS)
        # TODO(philip): Make a django_aware default filter, that understands
        # django safe strings.  See http://www.makotemplates.org/docs/filtering.html.
        self.loaders[app] = loader
        return loader
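For context, a minimal sketch of how a caller might use the cached lookup returned above. The render method name and data argument are assumptions; get_template and render_unicode are standard Mako TemplateLookup/Template methods:

    def render(self, app, template_name, data):
        # Hypothetical caller: fetch (or lazily build) the per-app TemplateLookup, then render.
        lookup = self._get_loader(app)
        template = lookup.get_template(template_name)
        return template.render_unicode(**data)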
Example 2
  def _get_loader(self, app):
    if app in self.loaders:
      return self.loaders[app]

    # Lazily find a temp dir for module_dir.
    # This laziness is important because at initialization time
    # we might still be running as root during desktop startup
    # and thus the temp dir would be owned as root, not the
    # unprivileged user!
    if self.module_dir is None:
      self.module_dir = tempfile.mkdtemp() # TODO(todd) configurable?
    app_module = __import__(app)
    app_dir = os.path.dirname(app_module.__file__)
    app_template_dir = os.path.join(app_dir, 'templates')

    loader = TemplateLookup(directories=[app_template_dir, self.desktop_template_dir],
                            module_directory=os.path.join(self.module_dir, app),
                            output_encoding=i18n.get_site_encoding(),
                            input_encoding=i18n.get_site_encoding(),
                            encoding_errors=ENCODING_ERRORS,
                            default_filters=['unicode', 'escape'], 
                            imports=IMPORTS)
    # TODO(philip): Make a django_aware default filter, that understands
    # django safe strings.  See http://www.makotemplates.org/docs/filtering.html.
    self.loaders[app] = loader
    return loader
Example 3
 def __init__(self, encoding=None):
     super(XLSformatter, self).__init__()
     self._encoding = encoding or i18n.get_site_encoding()
     self._book = xl.Workbook()
     self._sheet = self._book.add_sheet("Sheet 1")
     self._row = 0
     self._size = 0
Example 4
 def __init__(self, encoding=None):
   super(XLSformatter, self).__init__()
   self._encoding = encoding or i18n.get_site_encoding()
   self._book = xl.Workbook()
   self._sheet = self._book.add_sheet("Sheet 1")
   self._row = 0
   self._size = 0
Example 5
 def __init__(self, encoding=None):
   super(XLSformatter, self).__init__()
   self._encoding = encoding or i18n.get_site_encoding()
   self._book = Workbook(optimized_write=True, encoding='utf-8')
   self._sheet = self._book.create_sheet(title='RESULT')
   self._row = 0
   self._size = 0
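Examples 3-4 build the workbook with xlwt-style xl.Workbook().add_sheet(...), while Example 5 uses openpyxl's optimized (write-only) workbook. A sketch of how a row-writing method might differ between the two backends; the write_row names are hypothetical, but sheet.write(row, col, value) and sheet.append(row) are the real xlwt and openpyxl calls:

 def write_row(self, row):
   # xlwt backend (Examples 3-4): cells are addressed individually by (row, col).
   for col, cell in enumerate(row):
     self._sheet.write(self._row, col, cell)
   self._row += 1

 def write_row_optimized(self, row):
   # openpyxl optimized_write backend (Example 5): rows can only be appended whole.
   self._sheet.append(row)
   self._row += 1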
Example 6
 def __init__(self, encoding=None):
   super(CSVformatter, self).__init__()
   dialect = csv.excel()
   dialect.quoting = csv.QUOTE_ALL
   self._encoding = encoding or i18n.get_site_encoding()
   self._csv_writer = csv.writer(self, dialect=dialect)
   self._line = None
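Note the csv.writer(self, dialect=dialect) call: csv.writer accepts any object with a write method, so the formatter registers itself as the writer's output stream. A minimal sketch of the plumbing this implies; the write and format method bodies are assumptions, not shown in the snippet:

 def write(self, data):
   # Called back by csv.writer with the serialized CSV line.
   self._line = data

 def format(self, row):
   # Writing the row lands the encoded text in self._line via write() above.
   self._csv_writer.writerow(row)
   return self._line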
Example 7
def format(row, encoding=None):
    return [
        smart_str(nullify(cell),
                  encoding or i18n.get_site_encoding(),
                  strings_only=True,
                  errors='replace') for cell in row
    ]
Example 8
def encode_row(row, encoding=None, is_xls=False):
  encoded_row = []
  for cell in row:
    if is_xls and isinstance(cell, str):
      cell = re.sub(XLS_ILLEGAL_CHARS, '?', cell)
    cell = smart_str(nullify(cell), encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
    encoded_row.append(cell)
  return encoded_row
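A usage sketch for this helper. The XLS_ILLEGAL_CHARS pattern below is illustrative (control characters Excel cannot store), not the project's actual definition, and the output assumes nullify() maps None to a placeholder string:

import re

# Illustrative stand-in for the module-level constant:
XLS_ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

row = ['ok', 'bad\x00value', None]
# encode_row(row, is_xls=True) would then produce something like:
#   ['ok', 'bad?value', 'NULL']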
Example 9
    def _get_sample(cls, file_stream):
        encoding = i18n.get_site_encoding()

        for reader in [TextFileReader, GzipFileReader]:
            file_stream.seek(0)
            sample_data, sample_lines = reader.readlines(file_stream, encoding)
            file_stream.seek(0)

            if sample_data is not None:
                yield sample_data, sample_lines
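A usage sketch for this generator. The enclosing class name is hypothetical, and TextFileReader/GzipFileReader are assumed to return (None, None) from readlines() when they cannot parse the stream:

with open('/tmp/sample.txt', 'rb') as stream:
    # Each reader gets a rewound stream; only readers that understood it yield.
    for sample_data, sample_lines in SomeIndexer._get_sample(stream):
        print('%d sample lines' % len(sample_lines))
        break  # typically only the first successful sample is needed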
Example 10
def encode_row(row, encoding=None):
  encoded_row = []
  for cell in row:
    if isinstance(cell, six.string_types):
      cell = re.sub(ILLEGAL_CHARS, '?', cell)
    cell = nullify(cell)
    if not isinstance(cell, numbers.Number):
      cell = smart_str(cell, encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
    encoded_row.append(cell)
  return encoded_row
Example 11
    def _get_sample(cls, file_stream):
        encoding = i18n.get_site_encoding()

        for reader in [TextFileReader, GzipFileReader]:
            file_stream.seek(0)
            lines = reader.readlines(file_stream, encoding)
            file_stream.seek(0)

            if lines is not None:
                yield '\n'.join(lines)
Example 12
def encode_row(row, encoding=None, make_excel_links=False):
  encoded_row = []
  for cell in row:
    if isinstance(cell, six.string_types):
      cell = re.sub(ILLEGAL_CHARS, '?', cell)
      if make_excel_links:
        cell = re.compile('(https?://.+)', re.IGNORECASE).sub(r'=HYPERLINK("\1")', cell)
    cell = nullify(cell)
    if not isinstance(cell, numbers.Number):
      cell = smart_str(cell, encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
    encoded_row.append(cell)
  return encoded_row
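The make_excel_links branch rewrites URL cells into Excel HYPERLINK formulas before the usual encoding, so Excel renders them as clickable links. A usage illustration:

# 'https://gethue.com/docs'  ->  '=HYPERLINK("https://gethue.com/docs")'
encoded = encode_row(['https://gethue.com/docs'], make_excel_links=True)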
Example 13
  def test_remove_header(self):
    fs = self.cluster.fs

    path = "/tmp/test_remove_header.txt"
    data_header = "destination\trank"
    data_body = """thailand\t10
costarica\t?
curacao\t?"""
    data = data_header + '\n' + data_body

    f = fs.open(path, "w")
    f.write("hello")
    f.close()

    encoding = i18n.get_site_encoding()
    do_overwrite_save(fs, path, data.encode(encoding))

    assert_not_equal(data_body, fs.open(path).read())

    remove_header(fs, path)

    assert_equals(data_body, fs.open(path).read())
Example 14
    def test_remove_header(self):
        fs = self.cluster.fs

        path = "/tmp/test_remove_header.txt"
        data_header = "destination\trank"
        data_body = """thailand\t10
costarica\t?
curacao\t?"""
        data = data_header + '\n' + data_body

        f = fs.open(path, "w")
        f.write("hello")
        f.close()

        encoding = i18n.get_site_encoding()
        do_overwrite_save(fs, path, data.encode(encoding))

        assert_not_equal(data_body, fs.open(path).read())

        remove_header(fs, path)

        assert_equals(data_body, fs.open(path).read())
Example 15
def import_wizard(request, database='default'):
  """
  Help users define a table based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
  encoding = i18n.get_site_encoding()
  app_name = get_app_name(request)

  db = dbms.get(request.user)
  dbs = db.get_databases()
  databases = [{'name':db, 'url':reverse('beeswax:import_wizard', kwargs={'database': db})} for db in dbs]

  if request.method == 'POST':
    #
    # General processing logic:
    # - We have 3 steps. Each requires the previous.
    #   * Step 1      : Table name and file location
    #   * Step 2a     : Display sample with auto chosen delim
    #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
    #   * Step 3      : Display sample, and define columns
    # - Each step is represented by a different form. The form of an earlier step
    #   should be present when submitting to a later step.
    # - To preserve the data from the earlier steps, we send the forms back as
    #   hidden fields. This way, when users revisit a previous step, the data would
    #   be there as well.
    #
    delim_is_auto = False
    fields_list, n_cols = [[]], 0
    s3_col_formset = None
    s1_file_form = CreateByImportFileForm(request.POST, db=db)

    if s1_file_form.is_valid():
      do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
      do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
      do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
      do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

      cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
      cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

      # Exactly one of these should be True
      if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create, cancel_s2_user_delim, cancel_s3_column_def))) != 1:
        raise PopupException(_('Invalid form submission'))

      if not do_s2_auto_delim:
        # We should have a valid delim form
        s2_delim_form = CreateByImportDelimForm(request.POST)
        if not s2_delim_form.is_valid():
          # Go back to picking delimiter
          do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
      if do_hive_create:
        # We should have a valid columns formset
        s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
        if not s3_col_formset.is_valid():
          # Go back to define columns
          do_s3_column_def, do_hive_create = True, False

      #
      # Go to step 2: We've just picked the file. Preview it.
      #
      if do_s2_auto_delim:
        delim_is_auto = True
        fields_list, n_cols, s2_delim_form = _delim_preview(request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

      if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
        # Delimit based on input
        fields_list, n_cols, s2_delim_form = _delim_preview(request.fs, s1_file_form, encoding, (s2_delim_form.cleaned_data['file_type'],),
                                                            (s2_delim_form.cleaned_data['delimiter'],))

      if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
        return render('choose_delimiter.mako', request, {
          'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
          'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
          'initial': delim_is_auto,
          'file_form': s1_file_form,
          'delim_form': s2_delim_form,
          'fields_list': fields_list,
          'delimiter_choices': TERMINATOR_CHOICES,
          'n_cols': n_cols,
          'database': database,
          'databases': databases
        })

      #
      # Go to step 3: Define column.
      #
      if do_s3_column_def:
        if s3_col_formset is None:
          columns = []
          for i in range(n_cols):
            columns.append({
                'column_name': 'col_%s' % (i,),
                'column_type': 'string',
            })
          s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
        try:
          fields_list_for_json = list(fields_list)
          if fields_list_for_json:
            fields_list_for_json[0] = map(lambda a: re.sub('[^\w]', '', a), fields_list_for_json[0]) # Cleaning headers

          return render('define_columns.mako', request, {
            'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
            'file_form': s1_file_form,
            'delim_form': s2_delim_form,
            'column_formset': s3_col_formset,
            'fields_list': fields_list,
            'fields_list_json': json.dumps(fields_list_for_json),
            'n_cols': n_cols,
            'database': database,
            'databases': databases
          })
        except Exception, e:
          raise PopupException(_("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."), detail=e)

      #
      # Final: Execute
      #
      if do_hive_create:
        delim = s2_delim_form.cleaned_data['delimiter']
        table_name = s1_file_form.cleaned_data['name']
        proposed_query = django_mako.render_to_string("create_table_statement.mako", {
            'table': {
                'name': table_name,
                'comment': s1_file_form.cleaned_data['comment'],
                'row_format': 'Delimited',
                'field_terminator': delim
             },
            'columns': [ f.cleaned_data for f in s3_col_formset.forms ],
            'partition_columns': [],
            'database': database,
            'databases': databases
          }
        )

        do_load_data = s1_file_form.cleaned_data.get('do_import')
        path = s1_file_form.cleaned_data['path']
        try:
          return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data, database=database)
        except QueryServerException, e:
          raise PopupException(_('The table could not be created.'), detail=e.message)
Example 16
def format(row, encoding=None):
  return [smart_str(nullify(cell), encoding or i18n.get_site_encoding(), strings_only=True, errors='replace') for cell in row]
Example 17
 def clean_encoding(self):
     encoding = self.cleaned_data.get("encoding", "").strip()
     if not encoding:
         return i18n.get_site_encoding()
     return encoding
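For context, clean_<field> is Django's per-field validation hook, so this method belongs on a form with an encoding field. A minimal sketch of such a form; the form name and field layout are assumptions:

from django import forms
from desktop.lib import i18n  # import path assumed from the Hue codebase

class UploadFileForm(forms.Form):
    # Hypothetical form carrying the hook above.
    encoding = forms.CharField(required=False)

    def clean_encoding(self):
        encoding = self.cleaned_data.get("encoding", "").strip()
        return encoding or i18n.get_site_encoding()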
Example 18
def import_wizard(request, database=None):
  """
  Help users define a table based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
  database = _get_last_database(request, database)
  encoding = i18n.get_site_encoding()
  app_name = get_app_name(request)

  if request.method == 'POST':
    # Single-iteration loop so we have an easy way to break out early
    for _ in range(1):
      #
      # General processing logic:
      # - We have 3 steps. Each requires the previous.
      #   * Step 1      : Table name and file location
      #   * Step 2a     : Display sample with auto chosen delim
      #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
      #   * Step 3      : Display sample, and define columns
      # - Each step is represented by a different form. The form of an earlier step
      #   should be present when submitting to a later step.
      # - To preserve the data from the earlier steps, we send the forms back as
      #   hidden fields. This way, when users revisit a previous step, the data would
      #   be there as well.
      #
      delim_is_auto = False
      fields_list, n_cols = [ [] ], 0
      s3_col_formset = None

      # Everything requires a valid file form
      db = dbms.get(request.user)
      s1_file_form = CreateByImportFileForm(request.POST, db=db)
      if not s1_file_form.is_valid():
        break

      do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
      do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
      do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
      do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

      cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
      cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

      # Exactly one of these should be True
      assert len(filter(None, (do_s2_auto_delim,
                               do_s2_user_delim,
                               do_s3_column_def,
                               do_hive_create,
                               cancel_s2_user_delim,
                               cancel_s3_column_def))) == 1, 'Invalid form submission'

      #
      # Fix up what we should do in case any form is invalid
      #
      if not do_s2_auto_delim:
        # We should have a valid delim form
        s2_delim_form = CreateByImportDelimForm(request.POST)
        if not s2_delim_form.is_valid():
          # Go back to picking delimiter
          do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

      if do_hive_create:
        # We should have a valid columns formset
        s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
        if not s3_col_formset.is_valid():
          # Go back to define columns
          do_s3_column_def, do_hive_create = True, False

      #
      # Go to step 2: We've just picked the file. Preview it.
      #
      if do_s2_auto_delim:
        delim_is_auto = True
        fields_list, n_cols, s2_delim_form = _delim_preview(
                                              request.fs,
                                              s1_file_form,
                                              encoding,
                                              [ reader.TYPE for reader in FILE_READERS ],
                                              DELIMITERS,
                                              False)

      if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
        # Delimit based on input
        fields_list, n_cols, s2_delim_form = _delim_preview(
                                              request.fs,
                                              s1_file_form,
                                              encoding,
                                              (s2_delim_form.cleaned_data['file_type'],),
                                              (s2_delim_form.cleaned_data['delimiter'],),
                                              s2_delim_form.cleaned_data.get('read_column_headers'))

      if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
        return render('choose_delimiter.mako', request, {
          'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
          'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
          'initial': delim_is_auto,
          'file_form': s1_file_form,
          'delim_form': s2_delim_form,
          'fields_list': fields_list,
          'delimiter_choices': TERMINATOR_CHOICES,
          'n_cols': n_cols,
          'database': database,
        })

      #
      # Go to step 3: Define column.
      #
      if do_s3_column_def:
        read_column_headers = s2_delim_form.cleaned_data.get('read_column_headers')
        if s3_col_formset is None or not read_column_headers:
          columns = []
          if read_column_headers and fields_list:
            first_row = fields_list[0]
            for i in range(n_cols):
              columns.append(dict(
                  column_name=first_row[i] if i < len(first_row) else 'col_%s' % (i + 1,),
                  column_type='string',
              ))
            fields_list = fields_list[1:]
          else:
            for i in range(n_cols):
              columns.append(dict(
                  column_name='col_%s' % (i + 1,),
                  column_type='string',
              ))
          s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
        return render('define_columns.mako', request, {
          'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
          'file_form': s1_file_form,
          'delim_form': s2_delim_form,
          'column_formset': s3_col_formset,
          'fields_list': fields_list,
          'n_cols': n_cols,
          'database': database,
        })

      #
      # Finale: Execute
      #
      if do_hive_create:
        delim = s2_delim_form.cleaned_data['delimiter']
        table_name = s1_file_form.cleaned_data['name']
        proposed_query = django_mako.render_to_string("create_table_statement.mako",
          {
            'table': dict(name=table_name,
                          comment=s1_file_form.cleaned_data['comment'],
                          row_format='Delimited',
                          field_terminator=delim),
            'columns': [ f.cleaned_data for f in s3_col_formset.forms ],
            'partition_columns': [],
            'database': database,
          }
        )

        do_load_data = s1_file_form.cleaned_data.get('do_import')
        path = s1_file_form.cleaned_data.get('path')
        read_column_headers = s2_delim_form.cleaned_data.get('read_column_headers')
        if read_column_headers and do_load_data:
          file_type = s2_delim_form.cleaned_data.get('file_type')
          file_readers = [ reader for reader in FILE_READERS if reader.TYPE == file_type ]
          if len(file_readers) == 1:
            try:
              file_obj = request.fs.open(path)
              _skip_first_line_in_file(file_readers[0],
                                       request.fs,
                                       path,
                                       file_readers[0].find(file_obj, '\n') + 1)
            except Exception, ex:
              msg = _('Cannot process file: %s' % (ex,))
              LOG.error(msg)
              raise PopupException(msg)
        return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data, database=database)
Example 19
File: views.py Project: abayer/hue
def display(request, path):
  """
  Implements displaying part of a file.

  GET arguments are length, offset, mode, compression and encoding
  with reasonable defaults chosen.

  Note that display by length and offset are on bytes, not on characters.

  TODO(philip): Could easily build in file type detection
  (perhaps using something similar to file(1)), as well
  as more advanced binary-file viewing capability (de-serialize
  sequence files, decompress gzipped text files, etc.).
  There exists a python-magic package to interface with libmagic.
  """
  path = _unquote_path(path)
  if not request.fs.isfile(path):
    raise PopupException("Not a file: '%s'" % (path,))

  stats = request.fs.stats(path)
  encoding = request.GET.get('encoding') or i18n.get_site_encoding()

  # I'm mixing URL-based parameters and traditional
  # HTTP GET parameters, since URL-based parameters
  # can't naturally be optional.

  # Need to deal with possibility that length is not present
  # because the offset came in via the toolbar manual byte entry.
  end = request.GET.get("end")
  if end:
    end = int(end)
  begin = request.GET.get("begin", 1)
  if begin:
    # Subtract one to zero index for file read
    begin = int(begin) - 1
  if end:
    offset = begin
    length = end - begin
    if begin >= end:
      raise PopupException("First byte to display must be before last byte to display.")
  else:
    length = int(request.GET.get("length", DEFAULT_CHUNK_SIZE_BYTES))
    # Display first block by default.
    offset = int(request.GET.get("offset", 0))

  mode = request.GET.get("mode")
  compression = request.GET.get("compression")

  if mode and mode not in ["binary", "text"]:
    raise PopupException("Mode must be one of 'binary' or 'text'.")
  if offset < 0:
    raise PopupException("Offset may not be less than zero.")
  if length < 0:
    raise PopupException("Length may not be less than zero.")
  if length > MAX_CHUNK_SIZE_BYTES:
    raise PopupException("Cannot request chunks greater than %d bytes" % MAX_CHUNK_SIZE_BYTES)

  # Auto gzip detection, unless we are explicitly told to view binary
  if not compression and mode != 'binary':
    if path.endswith('.gz') and detect_gzip(request.fs.open(path).read(2)):
      compression = 'gzip'
      offset = 0
    else:
      compression = 'none'

  f = request.fs.open(path)

  if compression == 'gzip':
    if offset and offset != 0:
      raise PopupException("We don't support offset and gzip Compression")
    try:
      try:
        contents = GzipFile('', 'r', 0, StringIO(f.read())).read(length)
      except:
        logging.warn("Could not decompress file at %s" % path, exc_info=True)
        contents = ''
        raise PopupException("Failed to decompress file")
    finally:
      f.close()
  else:
    try:
      f.seek(offset)
      contents = f.read(length)
    finally:
      f.close()

  # Get contents as string for text mode, or at least try
  uni_contents = None
  if not mode or mode == 'text':
    uni_contents = unicode(contents, encoding, errors='replace')
    is_binary = uni_contents.find(i18n.REPLACEMENT_CHAR) != -1
    # Auto-detect mode
    if not mode:
      mode = is_binary and 'binary' or 'text'

  # Get contents as bytes
  if mode == "binary":
    xxd_out = list(xxd.xxd(offset, contents, BYTES_PER_LINE, BYTES_PER_SENTENCE))

  dirname = posixpath.dirname(path)
  # Start with index-like data:
  data = _massage_stats(request, request.fs.stats(path))
  # And add a view structure:
  data["success"] = True
  data["view"] = {
    'offset': offset,
    'length': length,
    'end': offset + len(contents),
    'dirname': dirname,
    'mode': mode,
    'compression': compression,
    'size': stats['size']
  }
  data["filename"] = os.path.basename(path)
  data["editable"] = stats['size'] < MAX_FILEEDITOR_SIZE
  if mode == "binary":
    # This might be the wrong thing for ?format=json; doing the
    # xxd'ing in javascript might be more compact, or sending a less
    # intermediate representation...
    logger.debug("xxd: " + str(xxd_out))
    data['view']['xxd'] = xxd_out
    data['view']['masked_binary_data'] =  False
  else:
    data['view']['contents'] = uni_contents
    data['view']['masked_binary_data'] = is_binary

  return render_with_toolbars("display.mako", request, data)
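A sketch of the GET parameters the display view above accepts; the URL mount point is an assumption, since no urls.py is shown:

# Hypothetical requests against a /filebrowser/view/<path> mount:
#   ?begin=1&end=4096        first 4 KiB (begin is 1-indexed, converted to a 0-indexed offset)
#   ?offset=0&length=4096    the same chunk, expressed as offset/length
#   ?mode=binary             force the xxd-style hex dump
#   ?compression=gzip        decompress before display (offset must be 0)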
Example 20
def import_wizard(request):
  """
  Help users define a table based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
  encoding = i18n.get_site_encoding()

  if request.method == 'POST':
    # Single-iteration loop so we have an easy way to break out early
    for _ in range(1):
      #
      # General processing logic:
      # - We have 3 steps. Each requires the previous.
      #   * Step 1      : Table name and file location
      #   * Step 2a     : Display sample with auto chosen delim
      #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
      #   * Step 3      : Display sample, and define columns
      # - Each step is represented by a different form. The form of an earlier step
      #   should be present when submitting to a later step.
      # - To preserve the data from the earlier steps, we send the forms back as
      #   hidden fields. This way, when users revisit a previous step, the data would
      #   be there as well.
      #
      delim_is_auto = False
      fields_list, n_cols = [ [] ], 0
      s3_col_formset = None

      # Everything requires a valid file form
      s1_file_form = beeswax.forms.CreateByImportFileForm(request.POST)
      if not s1_file_form.is_valid():
        break

      do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
      do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
      do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
      do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

      cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
      cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

      # Exactly one of these should be True
      assert len(filter(None, (do_s2_auto_delim,
                               do_s2_user_delim,
                               do_s3_column_def,
                               do_hive_create,
                               cancel_s2_user_delim,
                               cancel_s3_column_def))) == 1, 'Invalid form submission'

      #
      # Fix up what we should do in case any form is invalid
      #
      if not do_s2_auto_delim:
        # We should have a valid delim form
        s2_delim_form = beeswax.forms.CreateByImportDelimForm(request.POST)
        if not s2_delim_form.is_valid():
          # Go back to picking delimiter
          do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

      if do_hive_create:
        # We should have a valid columns formset
        s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', data=request.POST)
        if not s3_col_formset.is_valid():
          # Go back to define columns
          do_s3_column_def, do_hive_create = True, False

      #
      # Go to step 2: We've just picked the file. Preview it.
      #
      if do_s2_auto_delim:
        delim_is_auto = True
        fields_list, n_cols, s2_delim_form = _delim_preview(
                                              request.fs,
                                              s1_file_form,
                                              encoding,
                                              [ reader.TYPE for reader in FILE_READERS ],
                                              DELIMITERS)

      if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
        # Delimit based on input
        fields_list, n_cols, s2_delim_form = _delim_preview(
                                              request.fs,
                                              s1_file_form,
                                              encoding,
                                              (s2_delim_form.cleaned_data['file_type'],),
                                              (s2_delim_form.cleaned_data['delimiter'],))

      if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
        return render('choose_delimiter.mako', request, dict(
          action=urlresolvers.reverse(import_wizard),
          delim_readable=DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
          initial=delim_is_auto,
          file_form=s1_file_form,
          delim_form=s2_delim_form,
          fields_list=fields_list,
          delimiter_choices=beeswax.forms.TERMINATOR_CHOICES,
          n_cols=n_cols,
        ))

      #
      # Go to step 3: Define column.
      #
      if do_s3_column_def:
        if s3_col_formset is None:
          columns = []
          for i in range(n_cols):
            columns.append(dict(
                column_name='col_%s' % (i,),
                column_type='string',
            ))
          s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', initial=columns)
        return render('define_columns.mako', request, dict(
          action=urlresolvers.reverse(import_wizard),
          file_form=s1_file_form,
          delim_form=s2_delim_form,
          column_formset=s3_col_formset,
          fields_list=fields_list,
          n_cols=n_cols,
        ))

      #
      # Finale: Execute
      #
      if do_hive_create:
        delim = s2_delim_form.cleaned_data['delimiter']
        table_name = s1_file_form.cleaned_data['name']
        proposed_query = django_mako.render_to_string("create_table_statement.mako",
          {
            'table': dict(name=table_name,
                          comment=s1_file_form.cleaned_data['comment'],
                          row_format='Delimited',
                          field_terminator=delim),
            'columns': [ f.cleaned_data for f in s3_col_formset.forms ],
            'partition_columns': []
          }
        )

        do_load_data = s1_file_form.cleaned_data.get('do_import')
        path = s1_file_form.cleaned_data['path']
        return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data)
  else:
    s1_file_form = beeswax.forms.CreateByImportFileForm()

  return render('choose_file.mako', request, dict(
    action=urlresolvers.reverse(import_wizard),
    file_form=s1_file_form,
  ))
Example 21
def display(request, path):
    """
  Implements displaying part of a file.

  GET arguments are length, offset, mode, compression and encoding
  with reasonable defaults chosen.

  Note that display by length and offset are on bytes, not on characters.

  TODO(philip): Could easily build in file type detection
  (perhaps using something similar to file(1)), as well
  as more advanced binary-file viewing capability (de-serialize
  sequence files, decompress gzipped text files, etc.).
  There exists a python-magic package to interface with libmagic.
  """
    path = _unquote_path(path)
    if not request.fs.isfile(path):
        raise PopupException("Not a file: '%s'" % (path, ))

    stats = request.fs.stats(path)
    encoding = request.GET.get('encoding') or i18n.get_site_encoding()

    # I'm mixing URL-based parameters and traditional
    # HTTP GET parameters, since URL-based parameters
    # can't naturally be optional.

    # Need to deal with possibility that length is not present
    # because the offset came in via the toolbar manual byte entry.
    end = request.GET.get("end")
    if end:
        end = int(end)
    begin = request.GET.get("begin", 1)
    if begin:
        # Subtract one to zero index for file read
        begin = int(begin) - 1
    if end:
        offset = begin
        length = end - begin
        if begin >= end:
            raise PopupException(
                "First byte to display must be before last byte to display.")
    else:
        length = int(request.GET.get("length", DEFAULT_CHUNK_SIZE_BYTES))
        # Display first block by default.
        offset = int(request.GET.get("offset", 0))

    mode = request.GET.get("mode")
    compression = request.GET.get("compression")

    if mode and mode not in ["binary", "text"]:
        raise PopupException("Mode must be one of 'binary' or 'text'.")
    if offset < 0:
        raise PopupException("Offset may not be less than zero.")
    if length < 0:
        raise PopupException("Length may not be less than zero.")
    if length > MAX_CHUNK_SIZE_BYTES:
        raise PopupException("Cannot request chunks greater than %d bytes" %
                             MAX_CHUNK_SIZE_BYTES)

    # Auto gzip detection, unless we are explicitly told to view binary
    if not compression and mode != 'binary':
        if path.endswith('.gz') and detect_gzip(request.fs.open(path).read(2)):
            compression = 'gzip'
            offset = 0
        else:
            compression = 'none'

    f = request.fs.open(path)

    if compression == 'gzip':
        if offset and offset != 0:
            raise PopupException(
                "We don't support offsets with gzip compression")
        try:
            try:
                contents = GzipFile('', 'r', 0,
                                    StringIO(f.read())).read(length)
            except:
                logging.warn("Could not decompress file at %s" % path,
                             exc_info=True)
                contents = ''
                raise PopupException("Failed to decompress file")
        finally:
            f.close()
    else:
        try:
            f.seek(offset)
            contents = f.read(length)
        finally:
            f.close()

    # Get contents as string for text mode, or at least try
    uni_contents = None
    if not mode or mode == 'text':
        uni_contents = unicode(contents, encoding, errors='replace')
        is_binary = uni_contents.find(i18n.REPLACEMENT_CHAR) != -1
        # Auto-detect mode
        if not mode:
            mode = is_binary and 'binary' or 'text'

    # Get contents as bytes
    if mode == "binary":
        xxd_out = list(
            xxd.xxd(offset, contents, BYTES_PER_LINE, BYTES_PER_SENTENCE))

    dirname = posixpath.dirname(path)
    # Start with index-like data:
    data = _massage_stats(request, request.fs.stats(path))
    # And add a view structure:
    data["success"] = True
    data["view"] = {
        'offset': offset,
        'length': length,
        'end': offset + len(contents),
        'dirname': dirname,
        'mode': mode,
        'compression': compression,
        'size': stats['size']
    }
    data["filename"] = os.path.basename(path)
    data["editable"] = stats['size'] < MAX_FILEEDITOR_SIZE
    if mode == "binary":
        # This might be the wrong thing for ?format=json; doing the
        # xxd'ing in javascript might be more compact, or sending a less
        # intermediate representation...
        logger.debug("xxd: " + str(xxd_out))
        data['view']['xxd'] = xxd_out
        data['view']['masked_binary_data'] = False
    else:
        data['view']['contents'] = uni_contents
        data['view']['masked_binary_data'] = is_binary

    return render_with_toolbars("display.mako", request, data)
Example 22
        # A file not found is OK, otherwise re-raise
        if ioe.errno == errno.ENOENT:
            stats = None
        else:
            raise

    # Can't edit a directory
    if stats and stats["mode"] & stat_module.S_IFDIR:
        raise PopupException(_("Cannot edit a directory: %(path)s") % {"path": path})

    # Maximum size of edit
    if stats and stats["size"] > MAX_FILEEDITOR_SIZE:
        raise PopupException(_("File too big to edit: %(path)s") % {"path": path})

    if not form:
        encoding = request.REQUEST.get("encoding") or i18n.get_site_encoding()
        if stats:
            f = request.fs.open(path)
            try:
                try:
                    current_contents = unicode(f.read(), encoding)
                except UnicodeDecodeError:
                    raise PopupException(
                        _("File is not encoded in %(encoding)s; cannot be edited: %(path)s")
                        % {"encoding": encoding, "path": path}
                    )
            finally:
                f.close()
        else:
            current_contents = u""
Example 23
def import_wizard(request, database='default'):
    """
  Help users define a table based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    databases = [{'name': db, 'url': reverse('beeswax:import_wizard', kwargs={'database': db})}
                 for db in dbs]

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            if len([_f for _f in (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def,
                                  do_hive_create, cancel_s2_user_delim, cancel_s3_column_def) if _f]) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            load_data = s1_file_form.cleaned_data.get('load_data', 'IMPORT').upper()
            path = s1_file_form.cleaned_data['path']

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                try:
                    if load_data == 'IMPORT':
                        if not request.fs.isfile(path):
                            raise PopupException(_('Path location must refer to a file if "Import Data" is selected.'))
                    elif load_data == 'EXTERNAL':
                        if not request.fs.isdir(path):
                            raise PopupException(_('Path location must refer to a directory if "Create External Table" is selected.'))
                except (IOError, S3FileSystemException) as e:
                    raise PopupException(_('Path location "%s" is invalid: %s') % (path, e))

                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                apps_list = _get_apps(request.user, '')
                return render(
                    'import_wizard_choose_delimiter.mako', request, {
                        'apps': apps_list,
                        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                        'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0],
                                                                 s2_delim_form['delimiter'].data[1]),
                        'initial': delim_is_auto,
                        'file_form': s1_file_form,
                        'delim_form': s2_delim_form,
                        'fields_list': fields_list,
                        'delimiter_choices': TERMINATOR_CHOICES,
                        'n_cols': n_cols,
                        'database': database,
                        'databases': databases
                    })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append({
                            'column_name': 'col_%s' % (i,),
                            'column_type': 'string',
                        })
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        fields_list_for_json[0] = [re.sub('[^\w]', '', a)
                                                   for a in fields_list_for_json[0]]  # Cleaning headers
                    apps_list = _get_apps(request.user, '')
                    return render(
                        'import_wizard_define_columns.mako', request, {
                            'apps': apps_list,
                            'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                            'file_form': s1_file_form,
                            'delim_form': s2_delim_form,
                            'column_formset': s3_col_formset,
                            'fields_list': fields_list,
                            'fields_list_json': json.dumps(fields_list_for_json),
                            'n_cols': n_cols,
                            'database': database,
                            'databases': databases
                        })
                except Exception as e:
                    raise PopupException(
                        _("The selected delimiter is creating an uneven number of columns. Please make sure you don't have empty columns."),
                        detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']

                proposed_query = django_mako.render_to_string(
                    "create_table_statement.mako", {
                        'table': {
                            'name': table_name,
                            'comment': s1_file_form.cleaned_data['comment'],
                            'row_format': 'Delimited',
                            'field_terminator': delim,
                            'file_format': 'TextFile',
                            'load_data': load_data,
                            'path': path,
                            'skip_header': request.GET.get('removeHeader', 'off').lower() == 'on'
                        },
                        'columns': [f.cleaned_data for f in s3_col_formset.forms],
                        'partition_columns': [],
                        'database': database,
                        'databases': databases
                    })
                try:
                    return _submit_create_and_load(request, proposed_query, table_name, path,
                                                   load_data, database=database)
                except QueryServerException as e:
                    raise PopupException(_('The table could not be created.'), detail=e.message)
    else:
        s1_file_form = CreateByImportFileForm()

    return render(
        'import_wizard_choose_file.mako', request, {
            'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
            'file_form': s1_file_form,
            'database': database,
            'databases': databases
        })
Example 24
def import_wizard(request, database="default"):
    """
  Help users define a table based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    databases = [{"name": db, "url": reverse("beeswax:import_wizard", kwargs={"database": db})} for db in dbs]

    if request.method == "POST":
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get("submit_file")  # Step 1 -> 2
            do_s2_user_delim = request.POST.get("submit_preview")  # Step 2 -> 2
            do_s3_column_def = request.POST.get("submit_delim")  # Step 2 -> 3
            do_hive_create = request.POST.get("submit_create")  # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get("cancel_delim")  # Step 2 -> 1
            cancel_s3_column_def = request.POST.get("cancel_create")  # Step 3 -> 2

            # Exactly one of these should be True
            if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def,
                                 do_hive_create, cancel_s2_user_delim, cancel_s3_column_def))) != 1:
                raise PopupException(_("Invalid form submission"))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix="cols", data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS
                )

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data["file_type"],),
                    (s2_delim_form.cleaned_data["delimiter"],),
                )

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render(
                    "choose_delimiter.mako",
                    request,
                    {
                        "action": reverse(app_name + ":import_wizard", kwargs={"database": database}),
                        "delim_readable": DELIMITER_READABLE.get(
                            s2_delim_form["delimiter"].data[0], s2_delim_form["delimiter"].data[1]
                        ),
                        "initial": delim_is_auto,
                        "file_form": s1_file_form,
                        "delim_form": s2_delim_form,
                        "fields_list": fields_list,
                        "delimiter_choices": TERMINATOR_CHOICES,
                        "n_cols": n_cols,
                        "database": database,
                        "databases": databases,
                    },
                )

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append({"column_name": "col_%s" % (i,), "column_type": "string"})
                    s3_col_formset = ColumnTypeFormSet(prefix="cols", initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        fields_list_for_json[0] = map(
                            lambda a: re.sub("[^\w]", "", a), fields_list_for_json[0]
                        )  # Cleaning headers

                    return render(
                        "define_columns.mako",
                        request,
                        {
                            "action": reverse(app_name + ":import_wizard", kwargs={"database": database}),
                            "file_form": s1_file_form,
                            "delim_form": s2_delim_form,
                            "column_formset": s3_col_formset,
                            "fields_list": fields_list,
                            "fields_list_json": json.dumps(fields_list_for_json),
                            "n_cols": n_cols,
                            "database": database,
                            "databases": databases,
                        },
                    )
                except Exception as e:
                    raise PopupException(
                        _(
                            "The selected delimiter is creating an uneven number of columns. Please make sure you don't have empty columns."
                        ),
                        detail=e,
                    )

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data["delimiter"]
                table_name = s1_file_form.cleaned_data["name"]
                proposed_query = django_mako.render_to_string(
                    "create_table_statement.mako",
                    {
                        "table": {
                            "name": table_name,
                            "comment": s1_file_form.cleaned_data["comment"],
                            "row_format": "Delimited",
                            "field_terminator": delim,
                        },
                        "columns": [f.cleaned_data for f in s3_col_formset.forms],
                        "partition_columns": [],
                        "database": database,
                        "databases": databases,
                    },
                )

                do_load_data = s1_file_form.cleaned_data.get("do_import")
                path = s1_file_form.cleaned_data["path"]
                return _submit_create_and_load(
                    request, proposed_query, table_name, path, do_load_data, database=database
                )
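The wizard above funnels every POST through a set of mutually exclusive submit_*/cancel_* flags and demotes the flow to an earlier step whenever a form fails validation. A minimal, framework-free sketch of that dispatch idiom (names hypothetical; the real code uses Django forms):

def dispatch(post, validate_step2):
    do_step2 = bool(post.get('submit_step1'))   # Step 1 -> 2
    do_finish = bool(post.get('submit_step2'))  # Step 2 -> execute
    assert do_step2 != do_finish, 'Invalid form submission'

    if do_finish and not validate_step2(post):
        # Validation failed: go back to step 2 so the user can fix the input.
        do_step2, do_finish = True, False

    if do_step2:
        return 'render_step_2'
    return 'execute'

print(dispatch({'submit_step1': '1'}, lambda p: True))   # render_step_2
print(dispatch({'submit_step2': '1'}, lambda p: False))  # render_step_2
print(dispatch({'submit_step2': '1'}, lambda p: True))   # execute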
Example no. 27
        # A file not found is OK, otherwise re-raise
        if ioe.errno == errno.ENOENT:
            stats = None
        else:
            raise

    # Can't edit a directory
    if stats and stats['mode'] & stat_module.S_IFDIR:
        raise PopupException(_("Cannot edit a directory: %(path)s") % {'path': path})

    # Maximum size of edit
    if stats and stats['size'] > MAX_FILEEDITOR_SIZE:
        raise PopupException(_("File too big to edit: %(path)s") % {'path': path})

    if not form:
        encoding = request.REQUEST.get('encoding') or i18n.get_site_encoding()
        if stats:
            f = request.fs.open(path)
            try:
                try:
                    current_contents = unicode(f.read(), encoding)
                except UnicodeDecodeError:
                    raise PopupException(_("File is not encoded in %(encoding)s; cannot be edited: %(path)s.") % {'encoding': encoding, 'path': path})
            finally:
                f.close()
        else:
            current_contents = u""

        form = EditorForm(dict(path=path, contents=current_contents, encoding=encoding))

    data = dict(
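The snippet above is cut off mid-statement, but its core move is visible: read the raw bytes, decode them in the configured encoding, and fail with a clear message rather than silently mangling the contents. A minimal sketch of that decode-or-fail step (hypothetical helper; the real code reads via request.fs and raises PopupException):

def read_text(raw_bytes, encoding='utf-8'):
    # Decode up front so a bad encoding fails loudly, not silently.
    try:
        return raw_bytes.decode(encoding)
    except UnicodeDecodeError:
        raise ValueError("File is not encoded in %s; cannot be edited." % encoding)

print(read_text(b'plain ascii'))   # plain ascii
# read_text(b'\xff\xfe\x00')      # would raise ValueError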
Example no. 28
  def run_bin_hadoop_step(self, step):
    """
    user.name is used by FileSystem.getHomeDirectory().
    The environment variables for _USER and _GROUPS are used
    by the aspectj aspect to overwrite Hadoop's notion of 
    users and groups.
    """
    java_properties = {}
    java_properties["hue.suffix"] = "-via-hue"
    java_properties["user.name"] = self.plan.user
    java_prop_str = " ".join("-D%s=%s" % (k,v) for k, v in java_properties.iteritems())
    env = {      
      'HADOOP_HOME': hadoop.conf.HADOOP_HOME.get(), 
      'HADOOP_OPTS': "-javaagent:%s %s" % (jobsub.conf.ASPECTJWEAVER.get(), java_prop_str),
      'HADOOP_CLASSPATH': ':'.join([jobsub.conf.ASPECTPATH.get(),
                                    hadoop.conf.HADOOP_EXTRA_CLASSPATH_STRING.get()]),
      'HUE_JOBTRACE_LOG': self.internal_file_name("jobs"),
      'HUE_JOBSUB_USER': self.plan.user,
      'HUE_JOBSUB_GROUPS': ",".join(self.plan.groups),
      'LANG': os.getenv('LANG', i18n.get_site_encoding()),
    }

    all_clusters = []
    all_clusters += all_mrclusters().values()
    all_clusters += get_all_hdfs().values()
    delegation_token_files = []
    merged_token_file = tempfile.NamedTemporaryFile()
    try:
      LOG.debug("all_clusters: %s" % (repr(all_clusters),))
      for cluster in all_clusters:
        if cluster.security_enabled:
          cluster.setuser(self.plan.user)
          token = cluster.get_delegation_token()
          token_file = tempfile.NamedTemporaryFile()
          token_file.write(token.delegationTokenBytes)
          token_file.flush()
          delegation_token_files.append(token_file)
  
      java_home = os.getenv('JAVA_HOME')
      if java_home:
        env["JAVA_HOME"] = java_home
      for k, v in env.iteritems():
        assert v is not None, "Environment key %s missing value." % k
  
      base_args = [ hadoop.conf.HADOOP_BIN.get() ]
      if hadoop.conf.HADOOP_CONF_DIR.get():
        base_args.append("--config")
        base_args.append(hadoop.conf.HADOOP_CONF_DIR.get())
  
      if delegation_token_files:
        args = list(base_args) # Make a copy of the base args.
        args += ['jar', hadoop.conf.CREDENTIALS_MERGER_JAR.get(), merged_token_file.name]
        args += [token_file.name for token_file in delegation_token_files]
        LOG.debug("merging credentials files with comand: '%s'" % (' '.join(args),))
        merge_pipe = subprocess.Popen(args, shell=False, close_fds=True)
        retcode = merge_pipe.wait()
        if 0 != retcode:
          raise Exception("bin/hadoop returned non-zero %d while trying to merge credentials" % (retcode,))
        env['HADOOP_TOKEN_FILE_LOCATION'] = merged_token_file.name
  
      args = list(base_args) # Make a copy of the base args.
      args += step.arguments
      LOG.info("Starting %s.  (Env: %s)", repr(args), repr(env))
      LOG.info("Running: %s" % " ".join(args))
      self.pipe = subprocess.Popen(
        args,
        stdin=None,
        cwd=self.work_dir,
        stdout=self.stdout,
        stderr=self.stderr,
        shell=False,
        close_fds=True,
        env=env)
      retcode = self.pipe.wait()
      if 0 != retcode:
        raise Exception("bin/hadoop returned non-zero %d" % retcode)
      LOG.info("bin/hadoop returned %d" % retcode)
    finally:
      for token_file in delegation_token_files + [merged_token_file]:
        token_file.close()
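A detail worth isolating from the step above: NamedTemporaryFile unlinks its backing file when closed, so the code keeps every token file handle alive in a list and closes them all in the finally block, even if bin/hadoop fails. A minimal sketch of that lifecycle (the token bytes here are placeholders):

import tempfile

token_files = []
merged = tempfile.NamedTemporaryFile()
try:
    for token in [b'token-a', b'token-b']:
        f = tempfile.NamedTemporaryFile()
        f.write(token)
        f.flush()  # make the bytes visible to a child process reading f.name
        token_files.append(f)
    # ... pass [f.name for f in token_files] and merged.name to the subprocess ...
finally:
    for f in token_files + [merged]:
        f.close()  # close() also deletes the file from disk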
Example no. 29
def import_wizard(request):
    """
  Help users define table and based on a file they want to import to Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
    encoding = i18n.get_site_encoding()

    if request.method == 'POST':
        # Single-iteration loop so we can easily break out of POST handling
        for _ in range(1):
            #
            # General processing logic:
            # - We have 3 steps. Each requires the previous.
            #   * Step 1      : Table name and file location
            #   * Step 2a     : Display sample with auto chosen delim
            #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
            #   * Step 3      : Display sample, and define columns
            # - Each step is represented by a different form. The form of an earlier step
            #   should be present when submitting to a later step.
            # - To preserve the data from the earlier steps, we send the forms back as
            #   hidden fields. This way, when users revisit a previous step, the data would
            #   be there as well.
            #
            delim_is_auto = False
            fields_list, n_cols = [[]], 0
            s3_col_formset = None

            # Everything requires a valid file form
            s1_file_form = beeswax.forms.CreateByImportFileForm(request.POST)
            if not s1_file_form.is_valid():
                break

            do_s2_auto_delim = request.POST.get('submit_file')     # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')  # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')    # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')     # Step 3 -> execute

            # Exactly one of these should be True
            assert len(filter(None, (do_s2_auto_delim, do_s2_user_delim,
                                     do_s3_column_def, do_hive_create))) == 1, \
                'Invalid form submission'

            #
            # Fix up what we should do in case any form is invalid
            #
            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = beeswax.forms.CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'], ),
                    (s2_delim_form.cleaned_data['delimiter'], ))

            if do_s2_auto_delim or do_s2_user_delim:
                return render(
                    'choose_delimiter.mako', request,
                    dict(
                        action=urlresolvers.reverse(import_wizard),
                        delim_readable=DELIMITER_READABLE[
                            s2_delim_form['delimiter'].data[0]],
                        initial=delim_is_auto,
                        file_form=s1_file_form,
                        delim_form=s2_delim_form,
                        fields_list=fields_list,
                        delimiter_choices=beeswax.forms.TERMINATOR_CHOICES,
                        n_cols=n_cols,
                    ))

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append(
                            dict(
                                column_name='col_%s' % (i, ),
                                column_type='string',
                            ))
                    s3_col_formset = beeswax.forms.ColumnTypeFormSet(
                        prefix='cols', initial=columns)
                return render(
                    'define_columns.mako', request,
                    dict(
                        action=urlresolvers.reverse(import_wizard),
                        file_form=s1_file_form,
                        delim_form=s2_delim_form,
                        column_formset=s3_col_formset,
                        fields_list=fields_list,
                        n_cols=n_cols,
                    ))

            #
            # Finale: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string(
                    "create_table_statement.mako", {
                        'table':
                        dict(name=table_name,
                             comment=s1_file_form.cleaned_data['comment'],
                             row_format='Delimited',
                             field_terminator=delim),
                        'columns':
                        [f.cleaned_data for f in s3_col_formset.forms],
                        'partition_columns': []
                    })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                return _submit_create_and_load(request, proposed_query,
                                               table_name, path, do_load_data)
    else:
        s1_file_form = beeswax.forms.CreateByImportFileForm()

    return render(
        'choose_file.mako', request,
        dict(
            action=urlresolvers.reverse(import_wizard),
            file_form=s1_file_form,
        ))
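The `for _ in range(1):` wrapper above exists only so a `break` can abandon the POST path early and fall through to re-rendering the first form, a poor man's labeled goto. The control flow in isolation:

def handle(form_is_valid):
    for _ in range(1):          # single pass; `break` exits the block
        if not form_is_valid:
            break               # abandon POST handling
        return 'handled'
    return 'fall through: re-render the form'

print(handle(True))    # handled
print(handle(False))   # fall through: re-render the form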
Example no. 30
def clean_encoding(self):
    encoding = self.cleaned_data.get('encoding', '').strip()
    if not encoding:
        return i18n.get_site_encoding()
    return encoding
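The clean method above silently substitutes the site encoding for a blank field. One safeguard it does not include (an assumption added here, not part of the original) is checking that a non-blank value names a codec Python actually knows:

import codecs

def clean_encoding(raw, default='utf-8'):
    encoding = (raw or '').strip()
    if not encoding:
        return default            # fall back to the site-wide default
    try:
        codecs.lookup(encoding)   # raises LookupError for unknown codecs
    except LookupError:
        raise ValueError('Unknown encoding: %s' % encoding)
    return encoding

print(clean_encoding(''))           # utf-8 (stands in for the site encoding)
print(clean_encoding(' latin-1 '))  # latin-1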
Example no. 31
def import_wizard(request, database='default'):
  """
  Help users define a table based on a file they want to import into Hive.
  Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of a column header in the first row.
    - No partition table.
    - Does not work with binary data.
  """
  encoding = i18n.get_site_encoding()
  app_name = get_app_name(request)

  if request.method == 'POST':
    #
    # General processing logic:
    # - We have 3 steps. Each requires the previous.
    #   * Step 1      : Table name and file location
    #   * Step 2a     : Display sample with auto chosen delim
    #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
    #   * Step 3      : Display sample, and define columns
    # - Each step is represented by a different form. The form of an earlier step
    #   should be present when submitting to a later step.
    # - To preserve the data from the earlier steps, we send the forms back as
    #   hidden fields. This way, when users revisit a previous step, the data would
    #   be there as well.
    #
    delim_is_auto = False
    fields_list, n_cols = [[]], 0
    s3_col_formset = None

    db = dbms.get(request.user)
    s1_file_form = CreateByImportFileForm(request.POST, db=db)

    if s1_file_form.is_valid():
      do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
      do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
      do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
      do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

      cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
      cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

      # Exactly one of these should be True
      if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create, cancel_s2_user_delim, cancel_s3_column_def))) != 1:
        raise PopupException(_('Invalid form submission'))

      if not do_s2_auto_delim:
        # We should have a valid delim form
        s2_delim_form = CreateByImportDelimForm(request.POST)
        if not s2_delim_form.is_valid():
          # Go back to picking delimiter
          do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
      if do_hive_create:
        # We should have a valid columns formset
        s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
        if not s3_col_formset.is_valid():
          # Go back to define columns
          do_s3_column_def, do_hive_create = True, False

      #
      # Go to step 2: We've just picked the file. Preview it.
      #
      if do_s2_auto_delim:
        delim_is_auto = True
        fields_list, n_cols, s2_delim_form = _delim_preview(request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

      if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
        # Delimit based on input
        fields_list, n_cols, s2_delim_form = _delim_preview(request.fs, s1_file_form, encoding, (s2_delim_form.cleaned_data['file_type'],),
                                                            (s2_delim_form.cleaned_data['delimiter'],))

      if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
        return render('choose_delimiter.mako', request, {
          'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
          'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
          'initial': delim_is_auto,
          'file_form': s1_file_form,
          'delim_form': s2_delim_form,
          'fields_list': fields_list,
          'delimiter_choices': TERMINATOR_CHOICES,
          'n_cols': n_cols,
          'database': database,
        })

      #
      # Go to step 3: Define column.
      #
      if do_s3_column_def:
        if s3_col_formset is None:
          columns = []
          for i in range(n_cols):
            columns.append(dict(
                column_name='col_%s' % (i,),
                column_type='string',
            ))
          s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
        return render('define_columns.mako', request, {
          'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
          'file_form': s1_file_form,
          'delim_form': s2_delim_form,
          'column_formset': s3_col_formset,
          'fields_list': fields_list,
          'n_cols': n_cols,
          'database': database,
        })

      #
      # Final: Execute
      #
      if do_hive_create:
        delim = s2_delim_form.cleaned_data['delimiter']
        table_name = s1_file_form.cleaned_data['name']
        proposed_query = django_mako.render_to_string("create_table_statement.mako", {
            'table': dict(name=table_name,
                          comment=s1_file_form.cleaned_data['comment'],
                          row_format='Delimited',
                          field_terminator=delim),
            'columns': [ f.cleaned_data for f in s3_col_formset.forms ],
            'partition_columns': [],
            'database': database,
          }
        )

        do_load_data = s1_file_form.cleaned_data.get('do_import')
        path = s1_file_form.cleaned_data['path']
        return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data, database=database)
  else:
    s1_file_form = CreateByImportFileForm()

  return render('choose_file.mako', request, {
    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
    'file_form': s1_file_form,
    'database': database,
  })
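The exactly-one-action guard in this revision relies on `len(filter(None, flags))`, which counts truthy flags because Python 2's filter() returns a list. On Python 3, filter() is lazy and has no len(), so the portable spelling is a sum of booleans, sketched here:

def exactly_one(*flags):
    # Truthy count == 1 means exactly one submit/cancel button fired.
    return sum(bool(f) for f in flags) == 1

print(exactly_one('submit_create', None, None))  # True
print(exactly_one(None, None, None))             # False
print(exactly_one('a', 'b', None))               # False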
Example no. 32
def display(request, path):
    """
    Implements displaying part of a file.

    GET arguments are length, offset, mode, compression and encoding
    with reasonable defaults chosen.

    Note that display by length and offset are on bytes, not on characters.

    TODO(philip): Could easily build in file type detection
    (perhaps using something similar to file(1)), as well
    as more advanced binary-file viewing capability (de-serialize
    sequence files, decompress gzipped text files, etc.).
    There exists a python-magic package to interface with libmagic.
    """
    if not request.fs.isfile(path):
        raise PopupException(_("Not a file: '%(path)s'") % {'path': path})

    mimetype = mimetypes.guess_type(path)[0]

    if mimetype is not None and INLINE_DISPLAY_MIMETYPE.search(mimetype):
        path_enc = urlencode(path)
        return redirect(reverse('filebrowser.views.download', args=[path_enc]) + '?disposition=inline')

    stats = request.fs.stats(path)
    encoding = request.GET.get('encoding') or i18n.get_site_encoding()

    # I'm mixing URL-based parameters and traditional
    # HTTP GET parameters, since URL-based parameters
    # can't naturally be optional.

    # Need to deal with possibility that length is not present
    # because the offset came in via the toolbar manual byte entry.
    end = request.GET.get("end")
    if end:
        end = int(end)
    begin = request.GET.get("begin", 1)
    if begin:
        # Subtract one to zero index for file read
        begin = int(begin) - 1
    if end:
        offset = begin
        length = end - begin
        if begin >= end:
            raise PopupException(_("First byte to display must be before last byte to display."))
    else:
        length = int(request.GET.get("length", DEFAULT_CHUNK_SIZE_BYTES))
        # Display first block by default.
        offset = int(request.GET.get("offset", 0))

    mode = request.GET.get("mode")
    compression = request.GET.get("compression")

    if mode and mode not in ["binary", "text"]:
        raise PopupException(_("Mode must be one of 'binary' or 'text'."))
    if offset < 0:
        raise PopupException(_("Offset may not be less than zero."))
    if length < 0:
        raise PopupException(_("Length may not be less than zero."))
    if length > MAX_CHUNK_SIZE_BYTES:
        raise PopupException(_("Cannot request chunks greater than %(bytes)d bytes.") % {'bytes': MAX_CHUNK_SIZE_BYTES})

    # Do not decompress in binary mode.
    if mode == 'binary':
        compression = 'none'

    # Read out based on meta.
    compression, offset, length, contents = \
        read_contents(compression, path, request.fs, offset, length)

    # Get contents as string for text mode, or at least try
    uni_contents = None
    if not mode or mode == 'text':
        uni_contents = unicode(contents, encoding, errors='replace')
        is_binary = uni_contents.find(i18n.REPLACEMENT_CHAR) != -1
        # Auto-detect mode
        if not mode:
            mode = 'binary' if is_binary else 'text'

    # Get contents as bytes
    if mode == "binary":
        xxd_out = list(xxd.xxd(offset, contents, BYTES_PER_LINE, BYTES_PER_SENTENCE))

    dirname = posixpath.dirname(path)
    # Start with index-like data:
    data = _massage_stats(request, request.fs.stats(path))
    # And add a view structure:
    data["success"] = True
    data["view"] = {
        'offset': offset,
        'length': length,
        'end': offset + len(contents),
        'dirname': dirname,
        'mode': mode,
        'compression': compression,
        'size': stats['size'],
        'max_chunk_size': str(MAX_CHUNK_SIZE_BYTES)
    }
    data["filename"] = os.path.basename(path)
    data["editable"] = stats['size'] < MAX_FILEEDITOR_SIZE
    if mode == "binary":
        # This might be the wrong thing for ?format=json; doing the
        # xxd'ing in javascript might be more compact, or sending a less
        # intermediate representation...
        logger.debug("xxd: " + str(xxd_out))
        data['view']['xxd'] = xxd_out
        data['view']['masked_binary_data'] = False
    else:
        data['view']['contents'] = uni_contents
        data['view']['masked_binary_data'] = is_binary

    data['breadcrumbs'] = parse_breadcrumbs(path)

    return render("display.mako", request, data)
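The mode auto-detection above leans on a simple heuristic: decode the chunk with errors='replace' and treat any resulting U+FFFD replacement character as evidence of binary data. Isolated below (the real code takes the character from i18n.REPLACEMENT_CHAR):

REPLACEMENT_CHAR = u'\ufffd'

def guess_mode(contents, encoding='utf-8'):
    text = contents.decode(encoding, 'replace')
    is_binary = REPLACEMENT_CHAR in text
    return 'binary' if is_binary else 'text'

print(guess_mode(b'hello world'))       # text
print(guess_mode(b'\xff\xfe\x00\x01'))  # binary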
Example no. 33
    def run_bin_hadoop_step(self, step):
        """
    user.name is used by FileSystem.getHomeDirectory().
    The environment variables for _USER and _GROUPS are used
    by the aspectj aspect to overwrite Hadoop's notion of 
    users and groups.
    """
        java_properties = {}
        java_properties["hue.suffix"] = "-via-hue"
        java_properties["user.name"] = self.plan.user
        java_prop_str = " ".join("-D%s=%s" % (k, v)
                                 for k, v in java_properties.iteritems())
        env = {
            'HADOOP_HOME': hadoop.conf.HADOOP_HOME.get(),
            'HADOOP_OPTS': "-javaagent:%s %s" % (jobsub.conf.ASPECTJWEAVER.get(), java_prop_str),
            'HADOOP_CLASSPATH': ':'.join([jobsub.conf.ASPECTPATH.get(),
                                          hadoop.conf.HADOOP_EXTRA_CLASSPATH_STRING.get()]),
            'HUE_JOBTRACE_LOG': self.internal_file_name("jobs"),
            'HUE_JOBSUB_USER': self.plan.user,
            'HUE_JOBSUB_GROUPS': ",".join(self.plan.groups),
            'LANG': os.getenv('LANG', i18n.get_site_encoding()),
        }

        delegation_token_files = []
        all_clusters = []
        all_clusters += all_mrclusters().values()
        all_clusters += get_all_hdfs().values()
        LOG.info("all_clusters: %s" % (repr(all_clusters), ))
        for cluster in all_clusters:
            if cluster.security_enabled:
                cluster.setuser(self.plan.user)
                token = cluster.get_delegation_token()
                token_file = tempfile.NamedTemporaryFile()
                token_file.write(token.delegationTokenBytes)
                token_file.flush()
                delegation_token_files.append(token_file)

        if delegation_token_files:
            env['HADOOP_TOKEN_FILE_LOCATION'] = ','.join(
                [token_file.name for token_file in delegation_token_files])

        java_home = os.getenv('JAVA_HOME')
        if java_home:
            env["JAVA_HOME"] = java_home
        for k, v in env.iteritems():
            assert v is not None, "Environment key %s missing value." % k

        args = [hadoop.conf.HADOOP_BIN.get()]
        if hadoop.conf.HADOOP_CONF_DIR.get():
            args.append("--config")
            args.append(hadoop.conf.HADOOP_CONF_DIR.get())

        args += step.arguments
        LOG.info("Starting %s.  (Env: %s)", repr(args), repr(env))
        LOG.info("Running: %s" % " ".join(args))
        self.pipe = subprocess.Popen(args,
                                     stdin=None,
                                     cwd=self.work_dir,
                                     stdout=self.stdout,
                                     stderr=self.stderr,
                                     shell=False,
                                     close_fds=True,
                                     env=env)
        retcode = self.pipe.wait()
        if 0 != retcode:
            raise Exception("bin/hadoop returned non-zero %d" % retcode)
        LOG.info("bin/hadoop returned %d" % retcode)
        for token_file in delegation_token_files:
            token_file.close()