def _delim_preview(fs, file_form, encoding, file_types, delimiters):
    """
    _delim_preview(fs, file_form, encoding, file_types, delimiters)
                                -> (fields_list, n_cols, delim_form)

    Look at the beginning of the file and parse it according to the list of
    available file_types and delimiters.

    ``file_form`` must already be valid; its cleaned ``path`` is the file to
    sample. If ``path`` is a directory, the first entry returned by
    ``fs.listdir`` is sampled instead.

    Returns the parsed rows (``fields_list``), the length of the longest row
    (``n_cols``) and a bound, validated ``CreateByImportDelimForm`` built from
    the delimiter and file type that ``_parse_fields`` picked.

    Raises PopupException when an IOError occurs while opening or reading the
    file, and AssertionError when ``file_form`` is invalid or the delimiter
    form cannot be constructed.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not file_form.is_valid():
        raise AssertionError('file_form must be valid')

    path = file_form.cleaned_data['path']
    try:
        # If path is a directory, find first file object
        if fs.isdir(path):
            children = fs.listdir(path)
            if children:
                path = '%s/%s' % (path, children[0])

        file_obj = fs.open(path)
        try:
            delim, file_type, fields_list = _parse_fields(path, file_obj, encoding, file_types, delimiters)
        finally:
            # Release the handle even when parsing fails.
            file_obj.close()
    except IOError as ex:
        msg = "Failed to open file '%s': %s" % (path, ex)
        LOG.exception(msg)
        raise PopupException(msg)

    n_cols = max(len(row) for row in fields_list)

    # ``delimiter`` is a MultiValueField. delimiter_0 and delimiter_1 are the sub-fields.
    delimiter_0 = delim
    delimiter_1 = ''
    # If custom delimiter (not one of the predefined terminator choices)
    if not any(choice[0] == delim for choice in TERMINATOR_CHOICES):
        delimiter_0 = '__other__'
        delimiter_1 = delim

    delim_form = CreateByImportDelimForm(dict(
        delimiter_0=delimiter_0,
        delimiter_1=delimiter_1,
        file_type=file_type,
        n_cols=n_cols,
    ))
    if not delim_form.is_valid():
        # Same exception type as the former ``assert False``, but not stripped by -O.
        raise AssertionError(
            _('Internal error when constructing the delimiter form: %(error)s.') % {'error': delim_form.errors}
        )
    return fields_list, n_cols, delim_form
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    The wizard is driven by POST requests; exactly one of the following
    button names must be present in ``request.POST``:

    - ``submit_file``    : step 1 -> 2, preview with an auto-detected delimiter
    - ``submit_preview`` : step 2 -> 2, preview with the user-chosen delimiter
    - ``submit_delim``   : step 2 -> 3, define the columns
    - ``submit_create``  : step 3 -> render the CREATE TABLE statement and submit it
    - ``cancel_delim``   : step 2 -> 1
    - ``cancel_create``  : step 3 -> 2

    Returns the value of ``render(...)`` for the step to display, or of
    ``_submit_create_and_load(...)`` for the final step.

    Raises PopupException when zero or several actions are submitted, when
    rendering the column-definition step fails, or when the query server
    rejects the table creation.

    Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    # The loop variable must not be called ``db``: on Python 2 a list
    # comprehension leaks its variable, which would replace the ``db`` handle
    # given to CreateByImportFileForm below with the last database name.
    databases = [
        {'name': db_name, 'url': reverse('beeswax:import_wizard', kwargs={'database': db_name})}
        for db_name in dbs
    ]

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')         # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')      # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')        # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')         # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')    # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')   # Step 3 -> 2

            # Exactly one of these should be True
            submitted = [
                flag for flag in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                ) if flag
            ]
            if len(submitted) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    [reader.TYPE for reader in FILE_READERS],
                    DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(
                        s2_delim_form['delimiter'].data[0],
                        s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                    'databases': databases
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was posted: propose one string column per detected field.
                    columns = [
                        {'column_name': 'col_%s' % (i,), 'column_type': 'string'}
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        # Cleaning headers: strip every non-word character from the first row.
                        # A real list (not ``map``) so json.dumps also works on Python 3.
                        fields_list_for_json[0] = [
                            re.sub(r'[^\w]', '', header) for header in fields_list_for_json[0]
                        ]

                    return render('define_columns.mako', request, {
                        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                        'file_form': s1_file_form,
                        'delim_form': s2_delim_form,
                        'column_formset': s3_col_formset,
                        'fields_list': fields_list,
                        'fields_list_json': json.dumps(fields_list_for_json),
                        'n_cols': n_cols,
                        'database': database,
                        'databases': databases
                    })
                except Exception as e:
                    raise PopupException(
                        _("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."),
                        detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': {
                        'name': table_name,
                        'comment': s1_file_form.cleaned_data['comment'],
                        'row_format': 'Delimited',
                        'field_terminator': delim
                    },
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                    'databases': databases
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                try:
                    return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data,
                                                   database=database)
                except QueryServerException as e:
                    raise PopupException(_('The table could not be created.'), detail=e.message)
    # NOTE(review): as visible here, non-POST requests, an invalid file form and
    # a 'cancel_delim' submission fall through and return None -- confirm that the
    # step-1 rendering was not lost from this revision.
raise PopupException(msg) n_cols = max([ len(row) for row in fields_list ]) # ``delimiter`` is a MultiValueField. delimiter_0 and delimiter_1 are the sub-fields. delimiter_0 = delim delimiter_1 = '' # If custom delimiter if not filter(lambda val: val[0] == delim, TERMINATOR_CHOICES): delimiter_0 = '__other__' delimiter_1 = delim delim_form = CreateByImportDelimForm(dict(delimiter_0=delimiter_0, delimiter_1=delimiter_1, file_type=file_type, n_cols=n_cols)) if not delim_form.is_valid(): assert False, _('Internal error when constructing the delimiter form: %(error)s.') % {'error': delim_form.errors} return fields_list, n_cols, delim_form def _parse_fields(path, file_obj, encoding, filetypes, delimiters): """ _parse_fields(path, file_obj, encoding, filetypes, delimiters) -> (delimiter, filetype, fields_list) Go through the list of ``filetypes`` (gzip, text) and stop at the first one that works for the data. Then apply the list of ``delimiters`` and pick the most appropriate one. ``path`` is used for debugging only. Return the best delimiter, filetype and the data broken down into rows of fields.
def import_wizard(request, database="default"):
    """
    Help users define a table based on a file they want to import to Hive.

    The wizard is driven by POST requests; exactly one of the following
    button names must be present in ``request.POST``:

    - ``submit_file``    : step 1 -> 2, preview with an auto-detected delimiter
    - ``submit_preview`` : step 2 -> 2, preview with the user-chosen delimiter
    - ``submit_delim``   : step 2 -> 3, define the columns
    - ``submit_create``  : step 3 -> render the CREATE TABLE statement and submit it
    - ``cancel_delim``   : step 2 -> 1
    - ``cancel_create``  : step 3 -> 2

    Returns the value of ``render(...)`` for the step to display, or of
    ``_submit_create_and_load(...)`` for the final step.

    Raises PopupException when zero or several actions are submitted, or when
    rendering the column-definition step fails.

    Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    # The loop variable must not be called ``db``: on Python 2 a list
    # comprehension leaks its variable, which would replace the ``db`` handle
    # given to CreateByImportFileForm below with the last database name.
    databases = [
        {"name": db_name, "url": reverse("beeswax:import_wizard", kwargs={"database": db_name})}
        for db_name in dbs
    ]

    if request.method == "POST":
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get("submit_file")  # Step 1 -> 2
            do_s2_user_delim = request.POST.get("submit_preview")  # Step 2 -> 2
            do_s3_column_def = request.POST.get("submit_delim")  # Step 2 -> 3
            do_hive_create = request.POST.get("submit_create")  # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get("cancel_delim")  # Step 2 -> 1
            cancel_s3_column_def = request.POST.get("cancel_create")  # Step 3 -> 2

            # Exactly one of these should be True
            submitted = [
                flag
                for flag in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                )
                if flag
            ]
            if len(submitted) != 1:
                raise PopupException(_("Invalid form submission"))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix="cols", data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS
                )

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data["file_type"],),
                    (s2_delim_form.cleaned_data["delimiter"],),
                )

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render(
                    "choose_delimiter.mako",
                    request,
                    {
                        "action": reverse(app_name + ":import_wizard", kwargs={"database": database}),
                        "delim_readable": DELIMITER_READABLE.get(
                            s2_delim_form["delimiter"].data[0], s2_delim_form["delimiter"].data[1]
                        ),
                        "initial": delim_is_auto,
                        "file_form": s1_file_form,
                        "delim_form": s2_delim_form,
                        "fields_list": fields_list,
                        "delimiter_choices": TERMINATOR_CHOICES,
                        "n_cols": n_cols,
                        "database": database,
                        "databases": databases,
                    },
                )

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was posted: propose one string column per detected field.
                    columns = [
                        {"column_name": "col_%s" % (i,), "column_type": "string"}
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix="cols", initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        # Cleaning headers: strip every non-word character from the first row.
                        # A real list (not ``map``) so json.dumps also works on Python 3.
                        fields_list_for_json[0] = [
                            re.sub(r"[^\w]", "", header) for header in fields_list_for_json[0]
                        ]

                    return render(
                        "define_columns.mako",
                        request,
                        {
                            "action": reverse(app_name + ":import_wizard", kwargs={"database": database}),
                            "file_form": s1_file_form,
                            "delim_form": s2_delim_form,
                            "column_formset": s3_col_formset,
                            "fields_list": fields_list,
                            "fields_list_json": json.dumps(fields_list_for_json),
                            "n_cols": n_cols,
                            "database": database,
                            "databases": databases,
                        },
                    )
                except Exception as e:
                    raise PopupException(
                        _(
                            "The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."
                        ),
                        detail=e,
                    )

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data["delimiter"]
                table_name = s1_file_form.cleaned_data["name"]
                proposed_query = django_mako.render_to_string(
                    "create_table_statement.mako",
                    {
                        "table": {
                            "name": table_name,
                            "comment": s1_file_form.cleaned_data["comment"],
                            "row_format": "Delimited",
                            "field_terminator": delim,
                        },
                        "columns": [f.cleaned_data for f in s3_col_formset.forms],
                        "partition_columns": [],
                        "database": database,
                        "databases": databases,
                    },
                )

                do_load_data = s1_file_form.cleaned_data.get("do_import")
                path = s1_file_form.cleaned_data["path"]
                return _submit_create_and_load(
                    request, proposed_query, table_name, path, do_load_data, database=database
                )
    # NOTE(review): as visible here, non-POST requests, an invalid file form and
    # a "cancel_delim" submission fall through and return None -- confirm that the
    # step-1 rendering was not lost from this revision.
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    A GET request renders the file-selection step. A POST request must carry
    exactly one of the following button names:

    - ``submit_file``    : step 1 -> 2, preview with an auto-detected delimiter
    - ``submit_preview`` : step 2 -> 2, preview with the user-chosen delimiter
    - ``submit_delim``   : step 2 -> 3, define the columns
    - ``submit_create``  : step 3 -> render the CREATE TABLE statement and submit it
    - ``cancel_delim``   : step 2 -> 1 (falls through to the file-selection step)
    - ``cancel_create``  : step 3 -> 2

    Returns the value of ``render(...)`` for the step to display, or of
    ``_submit_create_and_load(...)`` for the final step.

    Raises PopupException when zero or several actions are submitted.

    Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None

        db = dbms.get(request.user)
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')         # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')      # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')        # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')         # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')    # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')   # Step 3 -> 2

            # Exactly one of these should be True.
            # Counted with a list comprehension: ``len(filter(...))`` raises
            # TypeError on Python 3, where ``filter`` returns an iterator.
            submitted = [
                flag for flag in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                ) if flag
            ]
            if len(submitted) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    [reader.TYPE for reader in FILE_READERS],
                    DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(
                        s2_delim_form['delimiter'].data[0],
                        s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was posted: propose one string column per detected field.
                    columns = [
                        dict(column_name='col_%s' % (i,), column_type='string')
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'column_formset': s3_col_formset,
                    'fields_list': fields_list,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': dict(name=table_name,
                                  comment=s1_file_form.cleaned_data['comment'],
                                  row_format='Delimited',
                                  field_terminator=delim),
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data,
                                               database=database)
    else:
        s1_file_form = CreateByImportFileForm()

    # Step 1 (also reached with an invalid file form or after 'cancel_delim').
    return render('choose_file.mako', request, {
        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
        'file_form': s1_file_form,
        'database': database,
    })
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    A GET request renders the file-selection step. A POST request must carry
    exactly one of the following button names:

    - ``submit_file``    : step 1 -> 2, preview with an auto-detected delimiter
    - ``submit_preview`` : step 2 -> 2, preview with the user-chosen delimiter
    - ``submit_delim``   : step 2 -> 3, define the columns
    - ``submit_create``  : step 3 -> render the CREATE TABLE statement and submit it
    - ``cancel_delim``   : step 2 -> 1 (falls through to the file-selection step)
    - ``cancel_create``  : step 3 -> 2

    The file form's ``load_data`` value (upper-cased, default ``IMPORT``)
    constrains the path on the first step: ``IMPORT`` requires a file and
    ``EXTERNAL`` requires a directory.

    Returns the value of ``render(...)`` for the step to display, or of
    ``_submit_create_and_load(...)`` for the final step.

    Raises PopupException when zero or several actions are submitted, when the
    path does not match the ``load_data`` mode or cannot be checked, when
    rendering the column-definition step fails, or when the query server
    rejects the table creation.

    Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    # Use a distinct loop variable so the ``db`` handle passed to
    # CreateByImportFileForm below is never shadowed (and, on Python 2,
    # never overwritten by the leaking comprehension variable).
    databases = [{
        'name': db_name,
        'url': reverse('beeswax:import_wizard', kwargs={'database': db_name})
    } for db_name in dbs]

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')         # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')      # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')        # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')         # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')    # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')   # Step 3 -> 2

            # Exactly one of these should be True
            submitted = [
                flag for flag in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                ) if flag
            ]
            if len(submitted) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            load_data = s1_file_form.cleaned_data.get('load_data', 'IMPORT').upper()
            path = s1_file_form.cleaned_data['path']

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                try:
                    if load_data == 'IMPORT':
                        if not request.fs.isfile(path):
                            raise PopupException(
                                _('Path location must refer to a file if "Import Data" is selected.'))
                    elif load_data == 'EXTERNAL':
                        if not request.fs.isdir(path):
                            raise PopupException(
                                _('Path location must refer to a directory if "Create External Table" is selected.'))
                except (IOError, S3FileSystemException) as e:
                    raise PopupException(_('Path location "%s" is invalid: %s') % (path, e))

                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    [reader.TYPE for reader in FILE_READERS],
                    DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                apps_list = _get_apps(request.user, '')
                return render('import_wizard_choose_delimiter.mako', request, {
                    'apps': apps_list,
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(
                        s2_delim_form['delimiter'].data[0],
                        s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                    'databases': databases
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was posted: propose one string column per detected field.
                    columns = [
                        {'column_name': 'col_%s' % (i,), 'column_type': 'string'}
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        # Cleaning headers: strip every non-word character from the first row.
                        # Raw string: '\w' in a plain literal is an invalid escape sequence.
                        fields_list_for_json[0] = [
                            re.sub(r'[^\w]', '', header) for header in fields_list_for_json[0]
                        ]

                    apps_list = _get_apps(request.user, '')
                    return render('import_wizard_define_columns.mako', request, {
                        'apps': apps_list,
                        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                        'file_form': s1_file_form,
                        'delim_form': s2_delim_form,
                        'column_formset': s3_col_formset,
                        'fields_list': fields_list,
                        'fields_list_json': json.dumps(fields_list_for_json),
                        'n_cols': n_cols,
                        'database': database,
                        'databases': databases
                    })
                except Exception as e:
                    raise PopupException(
                        _("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."),
                        detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': {
                        'name': table_name,
                        'comment': s1_file_form.cleaned_data['comment'],
                        'row_format': 'Delimited',
                        'field_terminator': delim,
                        'file_format': 'TextFile',
                        'load_data': load_data,
                        'path': path,
                        # NOTE(review): read from the query string although this is the
                        # POST branch -- confirm the client sends ``removeHeader`` in the URL.
                        'skip_header': request.GET.get('removeHeader', 'off').lower() == 'on'
                    },
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                    'databases': databases
                })
                try:
                    return _submit_create_and_load(request, proposed_query, table_name, path, load_data,
                                                   database=database)
                except QueryServerException as e:
                    raise PopupException(_('The table could not be created.'), detail=e.message)
    else:
        s1_file_form = CreateByImportFileForm()

    # Step 1 (also reached with an invalid file form or after 'cancel_delim').
    return render('import_wizard_choose_file.mako', request, {
        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
        'file_form': s1_file_form,
        'database': database,
        'databases': databases
    })
def import_wizard(request, database=None):
    """
    Help users define a table based on a file they want to import to Hive.

    ``database`` is resolved through ``_get_last_database(request, database)``.
    A POST request must carry exactly one of the following button names:

    - ``submit_file``    : step 1 -> 2, preview with an auto-detected delimiter
    - ``submit_preview`` : step 2 -> 2, preview with the user-chosen delimiter
    - ``submit_delim``   : step 2 -> 3, define the columns (optionally named
                           after the first row when ``read_column_headers`` is set)
    - ``submit_create``  : step 3 -> render the CREATE TABLE statement and submit it
    - ``cancel_delim``   : step 2 -> 1
    - ``cancel_create``  : step 3 -> 2

    Returns the value of ``render(...)`` for the step to display, or of
    ``_submit_create_and_load(...)`` for the final step.

    Raises AssertionError when zero or several actions are submitted, and
    PopupException when the header line of the file cannot be skipped.

    Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
    """
    database = _get_last_database(request, database)
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    if request.method == 'POST':
        # Have a single-pass loop to allow an easy way to break.
        # The loop variable must not be ``_``: assigning it anywhere in this
        # function makes ``_`` a local int and breaks the gettext call below.
        for _once in range(1):
            #
            # General processing logic:
            # - We have 3 steps. Each requires the previous.
            #   * Step 1      : Table name and file location
            #   * Step 2a     : Display sample with auto chosen delim
            #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
            #   * Step 3      : Display sample, and define columns
            # - Each step is represented by a different form. The form of an earlier step
            #   should be present when submitting to a later step.
            # - To preserve the data from the earlier steps, we send the forms back as
            #   hidden fields. This way, when users revisit a previous step, the data would
            #   be there as well.
            #
            delim_is_auto = False
            fields_list, n_cols = [[]], 0
            s3_col_formset = None

            # Everything requires a valid file form
            db = dbms.get(request.user)
            s1_file_form = CreateByImportFileForm(request.POST, db=db)
            if not s1_file_form.is_valid():
                break

            do_s2_auto_delim = request.POST.get('submit_file')         # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')      # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')        # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')         # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')    # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')   # Step 3 -> 2

            # Exactly one of these should be True.
            # Explicit raise (same exception type as the former ``assert``) so the
            # check is not stripped under ``python -O``.
            submitted = [
                flag for flag in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                ) if flag
            ]
            if len(submitted) != 1:
                raise AssertionError('Invalid form submission')

            #
            # Fix up what we should do in case any form is invalid
            #
            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            # NOTE(review): ``_delim_preview`` receives a sixth positional argument
            # here (whether to read column headers) -- confirm it matches the
            # signature of the helper this revision is deployed with.
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    [reader.TYPE for reader in FILE_READERS],
                    DELIMITERS,
                    False)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],),
                    s2_delim_form.cleaned_data.get('read_column_headers'))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(
                        s2_delim_form['delimiter'].data[0],
                        s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                read_column_headers = s2_delim_form.cleaned_data.get('read_column_headers')
                if s3_col_formset is None or not read_column_headers:
                    if read_column_headers and fields_list:
                        # Name the columns after the first row, falling back to a
                        # generated name when that row is shorter than n_cols.
                        first_row = fields_list[0]
                        columns = [
                            dict(
                                column_name=first_row[i] if i < len(first_row) else 'col_%s' % (i + 1,),
                                column_type='string',
                            )
                            for i in range(n_cols)
                        ]
                        # The header row is not part of the data sample.
                        fields_list = fields_list[1:]
                    else:
                        columns = [
                            dict(column_name='col_%s' % (i + 1,), column_type='string')
                            for i in range(n_cols)
                        ]
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'column_formset': s3_col_formset,
                    'fields_list': fields_list,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Finale: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': dict(name=table_name,
                                  comment=s1_file_form.cleaned_data['comment'],
                                  row_format='Delimited',
                                  field_terminator=delim),
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data.get('path')
                read_column_headers = s2_delim_form.cleaned_data.get('read_column_headers')
                if read_column_headers and do_load_data:
                    file_type = s2_delim_form.cleaned_data.get('file_type')
                    file_readers = [reader for reader in FILE_READERS if reader.TYPE == file_type]
                    if len(file_readers) == 1:
                        try:
                            file_obj = request.fs.open(path)
                            try:
                                # Skip everything up to and including the first newline.
                                _skip_first_line_in_file(
                                    file_readers[0],
                                    request.fs,
                                    path,
                                    file_readers[0].find(file_obj, '\n') + 1)
                            finally:
                                # The handle was previously never closed.
                                file_obj.close()
                        except Exception as ex:
                            # Interpolate after the translation lookup so the
                            # message id can be found in the catalog.
                            msg = _('Cannot process file: %s') % (ex,)
                            LOG.error(msg)
                            raise PopupException(msg)
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data,
                                               database=database)
    # NOTE(review): as visible here, non-POST requests, an invalid file form and
    # a 'cancel_delim' submission fall through and return None -- confirm that the
    # step-1 rendering was not lost from this revision.
def import_wizard(request):
    """
    Help users define a table based on a file they want to import to Hive.

    A GET request renders the file-selection step. A POST request must carry
    exactly one of the following button names:

    - ``submit_file``    : step 1 -> 2, preview with an auto-detected delimiter
    - ``submit_preview`` : step 2 -> 2, preview with the user-chosen delimiter
    - ``submit_delim``   : step 2 -> 3, define the columns
    - ``submit_create``  : step 3 -> render the CREATE TABLE statement and submit it
    - ``cancel_delim``   : step 2 -> 1 (falls through to the file-selection step)
    - ``cancel_create``  : step 3 -> 2

    Returns the value of ``render(...)`` for the step to display, or of
    ``_submit_create_and_load(...)`` for the final step.

    Raises AssertionError when zero or several actions are submitted.

    Limitations:
    - Rows are delimited (no serde).
    - No detection for map and array types.
    - No detection for the presence of column header in the first row.
    - No partition table.
    - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()

    if request.method == "POST":
        # Have a single-pass loop to allow an easy way to break.
        # The loop variable is not ``_`` so it cannot shadow a gettext alias.
        for _once in range(1):
            #
            # General processing logic:
            # - We have 3 steps. Each requires the previous.
            #   * Step 1      : Table name and file location
            #   * Step 2a     : Display sample with auto chosen delim
            #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
            #   * Step 3      : Display sample, and define columns
            # - Each step is represented by a different form. The form of an earlier step
            #   should be present when submitting to a later step.
            # - To preserve the data from the earlier steps, we send the forms back as
            #   hidden fields. This way, when users revisit a previous step, the data would
            #   be there as well.
            #
            delim_is_auto = False
            fields_list, n_cols = [[]], 0
            s3_col_formset = None

            # Everything requires a valid file form
            db = dbms.get(request.user)
            s1_file_form = CreateByImportFileForm(request.POST, db=db)
            if not s1_file_form.is_valid():
                break

            do_s2_auto_delim = request.POST.get("submit_file")  # Step 1 -> 2
            do_s2_user_delim = request.POST.get("submit_preview")  # Step 2 -> 2
            do_s3_column_def = request.POST.get("submit_delim")  # Step 2 -> 3
            do_hive_create = request.POST.get("submit_create")  # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get("cancel_delim")  # Step 2 -> 1
            cancel_s3_column_def = request.POST.get("cancel_create")  # Step 3 -> 2

            # Exactly one of these should be True.
            # Explicit raise (same exception type as the former ``assert``) so the
            # check is not stripped under ``python -O``; counted with a list
            # because ``len(filter(...))`` fails on Python 3.
            submitted = [
                flag
                for flag in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                )
                if flag
            ]
            if len(submitted) != 1:
                raise AssertionError("Invalid form submission")

            #
            # Fix up what we should do in case any form is invalid
            #
            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix="cols", data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS
                )

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data["file_type"],),
                    (s2_delim_form.cleaned_data["delimiter"],),
                )

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render(
                    "choose_delimiter.mako",
                    request,
                    dict(
                        action=urlresolvers.reverse(import_wizard),
                        delim_readable=DELIMITER_READABLE.get(
                            s2_delim_form["delimiter"].data[0], s2_delim_form["delimiter"].data[1]
                        ),
                        initial=delim_is_auto,
                        file_form=s1_file_form,
                        delim_form=s2_delim_form,
                        fields_list=fields_list,
                        delimiter_choices=TERMINATOR_CHOICES,
                        n_cols=n_cols,
                    ),
                )

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was posted: propose one string column per detected field.
                    columns = [
                        dict(column_name="col_%s" % (i,), column_type="string")
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix="cols", initial=columns)
                return render(
                    "define_columns.mako",
                    request,
                    dict(
                        action=urlresolvers.reverse(import_wizard),
                        file_form=s1_file_form,
                        delim_form=s2_delim_form,
                        column_formset=s3_col_formset,
                        fields_list=fields_list,
                        n_cols=n_cols,
                    ),
                )

            #
            # Finale: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data["delimiter"]
                table_name = s1_file_form.cleaned_data["name"]
                proposed_query = django_mako.render_to_string(
                    "create_table_statement.mako",
                    {
                        "table": dict(
                            name=table_name,
                            comment=s1_file_form.cleaned_data["comment"],
                            row_format="Delimited",
                            field_terminator=delim,
                        ),
                        "columns": [f.cleaned_data for f in s3_col_formset.forms],
                        "partition_columns": [],
                    },
                )

                do_load_data = s1_file_form.cleaned_data.get("do_import")
                path = s1_file_form.cleaned_data["path"]
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data)
    else:
        s1_file_form = CreateByImportFileForm()

    # Step 1 (also reached with an invalid file form or after "cancel_delim").
    return render(
        "choose_file.mako",
        request,
        dict(action=urlresolvers.reverse(import_wizard), file_form=s1_file_form),
    )