def parse(self, html, cols):
    """Convert an HTML document to text by running the external parser.

    The HTML is written, UTF-8 encoded, to a temporary file.  The command
    line is built from ``self.commandline_list``, in which the ``['cols']``
    and ``['filename']`` markers are replaced by ``str(cols)`` and the
    temporary file name.  The command's standard output is decoded from
    UTF-8 and returned; its standard error is captured and discarded so it
    never reaches the client's console.

    html -- the document to parse; must be a unicode object.
    cols -- value substituted for the ``['cols']`` marker.

    Returns the parser output as a unicode object.

    Raises UnicodeInputRequired if ``html`` is not unicode, and
    HTMLParsingFailed if the command exits with a non-zero status or its
    output cannot be decoded as UTF-8.
    """
    if not isinstance(html, unicode):
        raise UnicodeInputRequired

    # Local import so this method does not depend on the module's import
    # block.  subprocess replaces the deprecated os.popen3, whose pipes do
    # not report the child's exit status.
    import subprocess

    tf_name = _write_to_and_return_tempfile_name(html.encode('utf8'))
    try:
        # Marker substitution.  Conditional expressions are used instead of
        # the "cond and a or b" idiom, which falls through to the marker
        # itself whenever the replacement value is falsy.
        replace_cols = lambda part: str(cols) if part == ['cols'] else part
        replace_filename = lambda part: tf_name if part == ['filename'] else part
        # NOTE(review): _mapmany presumably applies each function in turn
        # to every element of the list -- confirm against its definition.
        commandline_list = _mapmany([replace_cols, replace_filename],
                                    self.commandline_list)
        commandline = ''.join(commandline_list)

        # The command is a single string, so it has to go through the
        # shell (as it did with os.popen3).  Only the temp file name and
        # str(cols) are interpolated into it.
        process = subprocess.Popen(commandline,
                                   shell=True,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() drains stdout and stderr together (no deadlock on
        # a chatty stderr) and closes all three pipes.
        utf8output, _unused_stderr = process.communicate()
        exit_status = process.returncode
    finally:
        # Remove the temp file whether or not the parser ran successfully.
        _remove_tempfile(tf_name)

    # Check the return code:
    if exit_status != 0:
        raise HTMLParsingFailed

    # Convert back to a unicode object, in case the parser emitted bogus
    # UTF-8:
    try:
        return unicode(utf8output, 'utf8')
    except (LookupError, UnicodeError):
        raise HTMLParsingFailed
def _file_mode_to_os_flags(mode):
    """Translate an ``open()``-style mode string into ``os.open()`` flags.

    Integer values are assumed to already be ``os.O_*`` flags and are
    returned unchanged.  Raises ValueError for a mode string that does not
    start with 'r', 'w' or 'a'.
    """
    if isinstance(mode, int):
        return mode
    kind = mode[:1]
    if kind not in ('r', 'w', 'a'):
        raise ValueError("invalid file_handle_mode: %r" % (mode,))
    if '+' in mode:
        flags = os.O_RDWR
    elif kind == 'r':
        flags = os.O_RDONLY
    else:
        flags = os.O_WRONLY
    if kind == 'w':
        flags |= os.O_CREAT | os.O_TRUNC
    elif kind == 'a':
        flags |= os.O_CREAT | os.O_APPEND
    if 'b' in mode:
        # O_BINARY only exists on platforms that distinguish text/binary.
        flags |= getattr(os, 'O_BINARY', 0)
    return flags


def extract(input,                               # byte string or file location
            input_disposition='byte_string',     # ['byte_string', 'file_location']
            compression_hint=None,               # [None, 'gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip']
            extract_to='byte_strings',           # ['byte_strings', 'my_directory', 'temp_directory']
            my_directory=None,                   # directory path
            backup_extension=None,               # extension including dot, for backup of my_directory
            directory_structure='retain',        # ['retain', 'flatten']
            file_handle=None,                    # [None, 'py', 'os']
            file_handle_mode='rb',
            force_file_permissions=None,         # file permission bits. eg 0o777.
            force_dir_permissions=None,          # file permission bits. eg 0o777.
            umask=None,                          # file permission bits. eg. 0o777 (assuming standard umask interpretation).
            allow_file_types=_valid_file_types,  # list containing any of ['regular, dir, symlink, hardlink, char_dev, block_dev, fifo']
            on_find_invalid_file_type='throw_error',  # ['throw_error', 'skip']
            filename_collision='rename_similar',  # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip']
            rename_from_set='abcdefghijklmnopqrstuvwxyz',  # characters to use if filename_collision='rename_random'
            num_random_bits=8,                   # number of random bits to use in the random filename.
            allow_clobber=False,                 # [True, False]
            on_find_dotdot_path='throw_error',   # ['throw_error', 'skip', 'allow']
            on_find_absolute_path='throw_error'  # ['throw_error', 'skip', 'allow']
            # Shelved options:
            # file_name_regexp, non_matches='rename_safely', etc.
            # Hopefully to be implemented in the future.
            ):
    """Extract the members of an archive to byte strings or to a directory.

    input -- the archive itself (input_disposition='byte_string') or the
        path of an archive file (input_disposition='file_location').
    compression_hint -- archive type; when None it is derived from the
        file name extension of the archive.
    extract_to -- 'byte_strings' loads every regular file into memory via
        a temporary directory that is always deleted; 'temp_directory'
        extracts into a fresh temporary directory that is kept on success;
        'my_directory' extracts into my_directory, which must exist.
    backup_extension -- if given, my_directory is first backed up to
        my_directory + backup_extension.
    directory_structure -- 'flatten' drops directory entries and extracts
        every member directly into the extraction root.
    file_handle, file_handle_mode -- when extracting to a directory, also
        open each regular file with open() ('py') or os.open() ('os') and
        return the handle under the 'fh' key.  For 'os' the mode may be an
        open()-style string or ready-made os.O_* integer flags.
    umask -- umask handed to the archive tool; when None the current
        process umask is used.  NOTE: a given umask is also installed as
        the process umask and is not restored afterwards.
    allow_file_types, on_find_invalid_file_type -- member types that may
        be extracted, and whether other types are skipped or raise.
    filename_collision, rename_from_set, num_random_bits -- passed to
        _verify_filename to resolve members sharing the same path.
    on_find_dotdot_path, on_find_absolute_path -- policy for members whose
        directory contains '..' or starts with '/'.

    Returns a list with one dict per extracted member holding 'basename',
    'tar_dir' and 'file_type', plus 'file' (contents of regular files) for
    extract_to='byte_strings', or 'fullname', 'dirname' and optionally
    'fh' otherwise.  For extract_to='temp_directory' the return value is
    the tuple (extraction_root, that_list).

    Raises ValueError for unusable input/my_directory arguments and
    EZArchiveError for unknown archive types, failed backups and
    disallowed members.  On any error, files recorded for removal on
    error are deleted before the exception propagates.
    """
    # Clean out the written files list:
    global _remove_on_error
    global _remove_always
    _remove_on_error = []
    _remove_always = []

    # Validate arguments.
    _validate_args('input_disposition', input_disposition,
                   ['byte_string', 'file_location'])
    # list(...) so the concatenation also works when keys() is a view.
    _validate_args('compression_hint', compression_hint,
                   [None] + list(available_tools.keys()))
    _validate_args('extract_to', extract_to,
                   ['byte_strings', 'my_directory', 'temp_directory'])
    # Explicit loop: map() is lazy on Python 3 and would validate nothing.
    for allowed_type in allow_file_types:
        _validate_args('allow_file_types', allowed_type, _valid_file_types)

    if not input:
        raise ValueError(
            'argument input must specify a filename or a byte string')

    # From here on, we start writing things out to disk, so everything is
    # wrapped in a try block.  This allows us to clean up the disk if we
    # didn't succeed with the whole of the extraction.
    try:
        # Write input to a temp file if we are given a byte string.
        if input_disposition == 'byte_string':
            input_file_loc = _write_to_and_return_tempfile_name(input)
            _remember_write(input_file_loc)
        else:  # input_disposition == 'file_location'
            # Check that the input file location we've been given exists;
            # stat will throw the right error for us:
            os.stat(input)

            # Check it is a file:
            if not os.path.isfile(input):
                raise ValueError(
                    "argument input must be a path to an archive file if "
                    "input_disposition='file_location': %s" % (input))

            input_file_loc = input

        # Make sure we know what type of file we're dealing with:
        if compression_hint is None:
            compression_ext = _calculate_filename_extension(
                filename=input_file_loc)
            compression_ext = _pick_compression_type(compression_ext)
        else:
            compression_ext = compression_hint

        # Select appropriate archive/compression tool:
        try:
            tool_class = available_tools[compression_ext]
        except KeyError:
            raise EZArchiveError('Unrecognized archive type: %s'
                                 % (compression_ext))

        # Instantiate the tool:
        archive = tool_class(input_file_loc, mode='r',
                             allow_clobber=allow_clobber)

        if extract_to == 'byte_strings':
            # If extract_to == byte_strings, permissions mean nothing.
            # However, because we use a temp directory to load the files
            # into byte strings, we force the permissions to be nice and
            # liberal inside the temp dir:
            force_file_permissions = 0o700
            force_dir_permissions = 0o700

        # Get extraction_root:
        if extract_to in ('byte_strings', 'temp_directory'):
            # Need a temp directory to work in.
            extraction_root = tempfile.mkdtemp()

            if extract_to == 'byte_strings':
                # Scratch space only: always removed.
                _remember_write(extraction_root, error_only=False)
            else:  # extract_to == 'temp_directory'
                # Handed back to the caller: removed only on error.
                _remember_write(extraction_root, error_only=True)
        else:  # extract_to == 'my_directory'
            if my_directory is None:
                raise ValueError(
                    "my_directory must be specified if "
                    "extract_to='my_directory'")

            # Make given directory into a nice sane one.
            my_directory = os.path.abspath(
                os.path.expanduser(os.path.normpath(my_directory)))

            # Check it exists, and we can stat it:
            # stat will throw the right error for us:
            os.stat(my_directory)

            # Check it is a dir.
            if not os.path.isdir(my_directory):
                raise ValueError(
                    "argument my_directory must be a directory: %s"
                    % (my_directory))

            # If we've been asked to back it up, do so:
            if backup_extension is not None:
                backup_dir = my_directory + backup_extension
                if _backup_directory(my_directory, backup_dir) is not None:
                    raise EZArchiveError(
                        'creation of backup directory using GNU mirrordir '
                        'failed: %s' % (backup_dir))

            # Finally set the extraction root:
            extraction_root = my_directory

            # Logically we would also check we have write permissions
            # here.  But this is actually better served by letting
            # builtin/other functions raise EnvironmentErrors when we fail
            # to write: checking for write permissions is actually quite
            # complex: e.g. you'd have to check group membership to see if
            # the group bits allow write.

        # If we haven't been given a umask, take the system umask as a
        # default.  If we have been given a umask, set the system umask to
        # it, so all calls to builtin open/file apply the given umask:
        if umask is None:
            # It doesn't seem possible to read the umask without also
            # setting it.  Hence this fudge:
            umask = os.umask(0o777)
        os.umask(umask)

        # Used in the extraction for loop to check for filename collisions
        # when flattening directory structure:
        seen_filenames = {}

        # Collect the returned file information here:
        return_data = []

        for mem in archive.list_all_members():
            name = mem['name']
            member_dir = mem['dir']
            file_type = mem['file_type']
            identity_object = mem['identity_object']

            # Check it is an allowed file type:
            if file_type not in allow_file_types:
                if on_find_invalid_file_type == 'skip':
                    continue
                # on_find_invalid_file_type == 'throw_error':
                raise EZArchiveError(
                    "found disallowed file type '%s': %s"
                    % (file_type, os.path.join(member_dir, name)))

            # Deal with dotdot paths:
            if on_find_dotdot_path != 'allow':
                if '..' in member_dir.split(os.sep) or name == '..':
                    if on_find_dotdot_path == 'throw_error':
                        raise EZArchiveError(
                            "tar entry's path contains '..' (*cautiously* "
                            "consider on_find_dotdot_path='allow'): "
                            + os.path.join(member_dir, name))
                    # on_find_dotdot_path == 'skip': next file please.
                    continue

            # Deal with absolute paths in a similar way:
            if on_find_absolute_path != 'allow':
                if member_dir.startswith('/'):
                    if on_find_absolute_path == 'throw_error':
                        raise EZArchiveError(
                            "tar entry's path is absolute (*cautiously* "
                            "consider on_find_absolute_path='allow'): "
                            + os.path.join(member_dir, name))
                    # on_find_absolute_path == 'skip': next file please.
                    continue

            # Deal with flattening of directories:
            if directory_structure == 'flatten':
                member_dir = ''
                if file_type == 'dir':
                    continue

            # tars allow multiple entries for same path/file:
            # extracting such tarballs with GNU/tar will just
            # cause the second entry to overwrite the first.  We
            # try to be more graceful:
            verified_fullname = _verify_filename(
                name=os.path.join(member_dir, name),
                seen_filenames=seen_filenames,
                filename_collision=filename_collision,
                num_random_bits=num_random_bits,
                rename_from_set=rename_from_set)

            if verified_fullname == ['skip']:
                continue
            name = os.path.basename(verified_fullname)

            archive.extract_member(
                identity_object=identity_object,
                root_dir=extraction_root,
                dir=member_dir,
                new_filename=name,
                umask=umask,
                force_file_permissions=force_file_permissions,
                force_dir_permissions=force_dir_permissions,
                allow_clobber=allow_clobber)

            fullname = os.path.join(extraction_root, member_dir, name)

            file_info = {
                'basename': name,
                'tar_dir': member_dir,
                'file_type': file_type,
            }

            if extract_to == 'byte_strings':
                if file_type == 'regular':
                    # Context manager so the handle is closed promptly.
                    with open(fullname, 'rb') as extracted:
                        file_info['file'] = extracted.read()
            else:  # extract_to in ['my_directory', 'temp_directory']
                file_info['fullname'] = fullname
                file_info['dirname'] = os.path.join(extraction_root,
                                                    member_dir)

                if file_type == 'regular':
                    if file_handle == 'py':
                        file_info['fh'] = open(fullname, file_handle_mode)
                    elif file_handle == 'os':
                        # os.open() takes integer flags, not a mode string.
                        file_info['fh'] = os.open(
                            fullname,
                            _file_mode_to_os_flags(file_handle_mode))

            return_data.append(file_info)

        if extract_to == 'temp_directory':
            return (extraction_root, return_data)
        return return_data

    except:
        # Deliberately bare: whatever went wrong (including interrupts),
        # clean up the non-temporary files and re-raise unchanged.
        _delete_files(_remove_on_error)
        raise

    finally:
        # Always clean up temporary files, error or not:
        _delete_files(_remove_always)
def _file_mode_to_os_flags(mode):
    """Translate an ``open()``-style mode string into ``os.open()`` flags.

    Integer values are assumed to already be ``os.O_*`` flags and are
    returned unchanged.  Raises ValueError for a mode string that does not
    start with 'r', 'w' or 'a'.
    """
    if isinstance(mode, int):
        return mode
    kind = mode[:1]
    if kind not in ('r', 'w', 'a'):
        raise ValueError("invalid file_handle_mode: %r" % (mode,))
    if '+' in mode:
        flags = os.O_RDWR
    elif kind == 'r':
        flags = os.O_RDONLY
    else:
        flags = os.O_WRONLY
    if kind == 'w':
        flags |= os.O_CREAT | os.O_TRUNC
    elif kind == 'a':
        flags |= os.O_CREAT | os.O_APPEND
    if 'b' in mode:
        # O_BINARY only exists on platforms that distinguish text/binary.
        flags |= getattr(os, 'O_BINARY', 0)
    return flags


def extract(input,                               # byte string or file location
            input_disposition='byte_string',     # ['byte_string', 'file_location']
            compression_hint=None,               # [None, 'gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip']
            extract_to='byte_strings',           # ['byte_strings', 'my_directory', 'temp_directory']
            my_directory=None,                   # directory path
            backup_extension=None,               # extension including dot, for backup of my_directory
            directory_structure='retain',        # ['retain', 'flatten']
            file_handle=None,                    # [None, 'py', 'os']
            file_handle_mode='rb',
            force_file_permissions=None,         # file permission bits. eg 0o777.
            force_dir_permissions=None,          # file permission bits. eg 0o777.
            umask=None,                          # file permission bits. eg. 0o777 (assuming standard umask interpretation).
            allow_file_types=_valid_file_types,  # list containing any of ['regular, dir, symlink, hardlink, char_dev, block_dev, fifo']
            on_find_invalid_file_type='throw_error',  # ['throw_error', 'skip']
            filename_collision='rename_similar',  # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip']
            rename_from_set='abcdefghijklmnopqrstuvwxyz',  # characters to use if filename_collision='rename_random'
            num_random_bits=8,                   # number of random bits to use in the random filename.
            allow_clobber=False,                 # [True, False]
            on_find_dotdot_path='throw_error',   # ['throw_error', 'skip', 'allow']
            on_find_absolute_path='throw_error'  # ['throw_error', 'skip', 'allow']
            # Shelved options:
            # file_name_regexp, non_matches='rename_safely', etc.
            # Hopefully to be implemented in the future.
            ):
    """Extract the members of an archive to byte strings or to a directory.

    input -- the archive itself (input_disposition='byte_string') or the
        path of an archive file (input_disposition='file_location').
    compression_hint -- archive type; when None it is derived from the
        file name extension of the archive.
    extract_to -- 'byte_strings' loads every regular file into memory via
        a temporary directory that is always deleted; 'temp_directory'
        extracts into a fresh temporary directory that is kept on success;
        'my_directory' extracts into my_directory, which must exist.
    backup_extension -- if given, my_directory is first backed up to
        my_directory + backup_extension.
    directory_structure -- 'flatten' drops directory entries and extracts
        every member directly into the extraction root.
    file_handle, file_handle_mode -- when extracting to a directory, also
        open each regular file with open() ('py') or os.open() ('os') and
        return the handle under the 'fh' key.  For 'os' the mode may be an
        open()-style string or ready-made os.O_* integer flags.
    umask -- umask handed to the archive tool; when None the current
        process umask is used.  NOTE: a given umask is also installed as
        the process umask and is not restored afterwards.
    allow_file_types, on_find_invalid_file_type -- member types that may
        be extracted, and whether other types are skipped or raise.
    filename_collision, rename_from_set, num_random_bits -- passed to
        _verify_filename to resolve members sharing the same path.
    on_find_dotdot_path, on_find_absolute_path -- policy for members whose
        directory contains '..' or starts with '/'.

    Returns a list with one dict per extracted member holding 'basename',
    'tar_dir' and 'file_type', plus 'file' (contents of regular files) for
    extract_to='byte_strings', or 'fullname', 'dirname' and optionally
    'fh' otherwise.  For extract_to='temp_directory' the return value is
    the tuple (extraction_root, that_list).

    Raises ValueError for unusable input/my_directory arguments and
    EZArchiveError for unknown archive types, failed backups and
    disallowed members.  On any error, files recorded for removal on
    error are deleted before the exception propagates.
    """
    # Clean out the written files list:
    global _remove_on_error
    global _remove_always
    _remove_on_error = []
    _remove_always = []

    # Validate arguments.
    _validate_args('input_disposition', input_disposition,
                   ['byte_string', 'file_location'])
    # list(...) so the concatenation also works when keys() is a view.
    _validate_args('compression_hint', compression_hint,
                   [None] + list(available_tools.keys()))
    _validate_args('extract_to', extract_to,
                   ['byte_strings', 'my_directory', 'temp_directory'])
    # Explicit loop: map() is lazy on Python 3 and would validate nothing.
    for allowed_type in allow_file_types:
        _validate_args('allow_file_types', allowed_type, _valid_file_types)

    if not input:
        raise ValueError(
            'argument input must specify a filename or a byte string')

    # From here on, we start writing things out to disk, so everything is
    # wrapped in a try block.  This allows us to clean up the disk if we
    # didn't succeed with the whole of the extraction.
    try:
        # Write input to a temp file if we are given a byte string.
        if input_disposition == 'byte_string':
            input_file_loc = _write_to_and_return_tempfile_name(input)
            _remember_write(input_file_loc)
        else:  # input_disposition == 'file_location'
            # Check that the input file location we've been given exists;
            # stat will throw the right error for us:
            os.stat(input)

            # Check it is a file:
            if not os.path.isfile(input):
                raise ValueError(
                    "argument input must be a path to an archive file if "
                    "input_disposition='file_location': %s" % (input))

            input_file_loc = input

        # Make sure we know what type of file we're dealing with:
        if compression_hint is None:
            compression_ext = _calculate_filename_extension(
                filename=input_file_loc)
            compression_ext = _pick_compression_type(compression_ext)
        else:
            compression_ext = compression_hint

        # Select appropriate archive/compression tool:
        try:
            tool_class = available_tools[compression_ext]
        except KeyError:
            raise EZArchiveError('Unrecognized archive type: %s'
                                 % (compression_ext))

        # Instantiate the tool:
        archive = tool_class(input_file_loc, mode='r',
                             allow_clobber=allow_clobber)

        if extract_to == 'byte_strings':
            # If extract_to == byte_strings, permissions mean nothing.
            # However, because we use a temp directory to load the files
            # into byte strings, we force the permissions to be nice and
            # liberal inside the temp dir:
            force_file_permissions = 0o700
            force_dir_permissions = 0o700

        # Get extraction_root:
        if extract_to in ('byte_strings', 'temp_directory'):
            # Need a temp directory to work in.
            extraction_root = tempfile.mkdtemp()

            if extract_to == 'byte_strings':
                # Scratch space only: always removed.
                _remember_write(extraction_root, error_only=False)
            else:  # extract_to == 'temp_directory'
                # Handed back to the caller: removed only on error.
                _remember_write(extraction_root, error_only=True)
        else:  # extract_to == 'my_directory'
            if my_directory is None:
                raise ValueError(
                    "my_directory must be specified if "
                    "extract_to='my_directory'")

            # Make given directory into a nice sane one.
            my_directory = os.path.abspath(
                os.path.expanduser(os.path.normpath(my_directory)))

            # Check it exists, and we can stat it:
            # stat will throw the right error for us:
            os.stat(my_directory)

            # Check it is a dir.
            if not os.path.isdir(my_directory):
                raise ValueError(
                    "argument my_directory must be a directory: %s"
                    % (my_directory))

            # If we've been asked to back it up, do so:
            if backup_extension is not None:
                backup_dir = my_directory + backup_extension
                if _backup_directory(my_directory, backup_dir) is not None:
                    raise EZArchiveError(
                        'creation of backup directory using GNU mirrordir '
                        'failed: %s' % (backup_dir))

            # Finally set the extraction root:
            extraction_root = my_directory

            # Logically we would also check we have write permissions
            # here.  But this is actually better served by letting
            # builtin/other functions raise EnvironmentErrors when we fail
            # to write: checking for write permissions is actually quite
            # complex: e.g. you'd have to check group membership to see if
            # the group bits allow write.

        # If we haven't been given a umask, take the system umask as a
        # default.  If we have been given a umask, set the system umask to
        # it, so all calls to builtin open/file apply the given umask:
        if umask is None:
            # It doesn't seem possible to read the umask without also
            # setting it.  Hence this fudge:
            umask = os.umask(0o777)
        os.umask(umask)

        # Used in the extraction for loop to check for filename collisions
        # when flattening directory structure:
        seen_filenames = {}

        # Collect the returned file information here:
        return_data = []

        for mem in archive.list_all_members():
            name = mem['name']
            member_dir = mem['dir']
            file_type = mem['file_type']
            identity_object = mem['identity_object']

            # Check it is an allowed file type:
            if file_type not in allow_file_types:
                if on_find_invalid_file_type == 'skip':
                    continue
                # on_find_invalid_file_type == 'throw_error':
                raise EZArchiveError(
                    "found disallowed file type '%s': %s"
                    % (file_type, os.path.join(member_dir, name)))

            # Deal with dotdot paths:
            if on_find_dotdot_path != 'allow':
                if '..' in member_dir.split(os.sep) or name == '..':
                    if on_find_dotdot_path == 'throw_error':
                        raise EZArchiveError(
                            "tar entry's path contains '..' (*cautiously* "
                            "consider on_find_dotdot_path='allow'): "
                            + os.path.join(member_dir, name))
                    # on_find_dotdot_path == 'skip': next file please.
                    continue

            # Deal with absolute paths in a similar way:
            if on_find_absolute_path != 'allow':
                if member_dir.startswith('/'):
                    if on_find_absolute_path == 'throw_error':
                        raise EZArchiveError(
                            "tar entry's path is absolute (*cautiously* "
                            "consider on_find_absolute_path='allow'): "
                            + os.path.join(member_dir, name))
                    # on_find_absolute_path == 'skip': next file please.
                    continue

            # Deal with flattening of directories:
            if directory_structure == 'flatten':
                member_dir = ''
                if file_type == 'dir':
                    continue

            # tars allow multiple entries for same path/file:
            # extracting such tarballs with GNU/tar will just
            # cause the second entry to overwrite the first.  We
            # try to be more graceful:
            verified_fullname = _verify_filename(
                name=os.path.join(member_dir, name),
                seen_filenames=seen_filenames,
                filename_collision=filename_collision,
                num_random_bits=num_random_bits,
                rename_from_set=rename_from_set)

            if verified_fullname == ['skip']:
                continue
            name = os.path.basename(verified_fullname)

            archive.extract_member(
                identity_object=identity_object,
                root_dir=extraction_root,
                dir=member_dir,
                new_filename=name,
                umask=umask,
                force_file_permissions=force_file_permissions,
                force_dir_permissions=force_dir_permissions,
                allow_clobber=allow_clobber)

            fullname = os.path.join(extraction_root, member_dir, name)

            file_info = {
                'basename': name,
                'tar_dir': member_dir,
                'file_type': file_type,
            }

            if extract_to == 'byte_strings':
                if file_type == 'regular':
                    # Context manager so the handle is closed promptly.
                    with open(fullname, 'rb') as extracted:
                        file_info['file'] = extracted.read()
            else:  # extract_to in ['my_directory', 'temp_directory']
                file_info['fullname'] = fullname
                file_info['dirname'] = os.path.join(extraction_root,
                                                    member_dir)

                if file_type == 'regular':
                    if file_handle == 'py':
                        file_info['fh'] = open(fullname, file_handle_mode)
                    elif file_handle == 'os':
                        # os.open() takes integer flags, not a mode string.
                        file_info['fh'] = os.open(
                            fullname,
                            _file_mode_to_os_flags(file_handle_mode))

            return_data.append(file_info)

        if extract_to == 'temp_directory':
            return (extraction_root, return_data)
        return return_data

    except:
        # Deliberately bare: whatever went wrong (including interrupts),
        # clean up the non-temporary files and re-raise unchanged.
        _delete_files(_remove_on_error)
        raise

    finally:
        # Always clean up temporary files, error or not:
        _delete_files(_remove_always)