コード例 #1
0
    def parse(self, html, cols):
        """Run the external command in self.commandline_list over html.

        The HTML is written, UTF-8 encoded, to a temporary file.  The
        ['cols'] and ['filename'] markers found in self.commandline_list
        are replaced by str(cols) and the temporary file name, the pieces
        are joined into one command line and the command is run.  Whatever
        the command writes to stdout is decoded from UTF-8 and returned.

        Parameters:
            html -- unicode object holding the document to convert.
            cols -- value substituted (via str()) for the ['cols'] marker.

        Returns:
            unicode object decoded from the command's stdout.

        Raises:
            UnicodeInputRequired -- html is not a unicode object.
            HTMLParsingFailed -- closing stdout reported a status, or the
                output could not be decoded as UTF-8.
        """

        if not isinstance(html, unicode):
            raise UnicodeInputRequired

        utf8html = html.encode('utf8')
        tf_name = _write_to_and_return_tempfile_name(utf8html)

        # The temporary file exists from here on, so make sure it is
        # removed however we leave this method.
        try:
            # Replace cols marker:
            def replace_cols(token):
                if token == ['cols']:
                    return str(cols)
                return token

            # Replace filename marker:
            def replace_filename(token):
                if token == ['filename']:
                    return tf_name
                return token

            commandline_list = _mapmany([replace_cols, replace_filename],
                                        self.commandline_list)
            commandline = ''.join(commandline_list)

            # Run the process using popen3; possibly dodgy on Windows!
            # Need popen3 rather than another popen function because we
            # want to grab stderr and hide it from the client's console.
            #
            # NOTE(review): stderr is captured but never read, so a child
            # that writes a lot to stderr could block on a full pipe while
            # we wait on stdout -- confirm the configured commands are
            # quiet on stderr.
            (stdin, stdout, stderr) = os.popen3(commandline, 'r')
            try:
                utf8output = stdout.read()
                # NOTE(review): the Python 2 library docs say the popen2/3/4
                # family cannot report the child's exit status, in which
                # case this is always None -- confirm on target platforms.
                exit_status = stdout.close()
            finally:
                # popen3 hands back three pipe ends; close all of them
                # instead of leaving it to garbage collection.  Closing an
                # already closed file object is harmless, and a failure
                # while tidying up must not mask the real error.
                for pipe in (stdin, stderr, stdout):
                    try:
                        pipe.close()
                    except (IOError, OSError):
                        pass
        finally:
            _remove_tempfile(tf_name)

        # Check the return code:
        if exit_status is not None:
            raise HTMLParsingFailed

        # Convert back to unicode object and return.  The conversion is
        # guarded just in case the parser outputs bogus utf8:
        try:
            output = unicode(utf8output, 'utf8')
            return output
        except (LookupError, UnicodeError):
            raise HTMLParsingFailed
コード例 #2
0
    def parse(self, html, cols):
        """Run the external command in self.commandline_list over html.

        The HTML is written, UTF-8 encoded, to a temporary file.  The
        ['cols'] and ['filename'] markers found in self.commandline_list
        are replaced by str(cols) and the temporary file name, the pieces
        are joined into one command line and the command is run.  Whatever
        the command writes to stdout is decoded from UTF-8 and returned.

        Parameters:
            html -- unicode object holding the document to convert.
            cols -- value substituted (via str()) for the ['cols'] marker.

        Returns:
            unicode object decoded from the command's stdout.

        Raises:
            UnicodeInputRequired -- html is not a unicode object.
            HTMLParsingFailed -- closing stdout reported a status, or the
                output could not be decoded as UTF-8.
        """

        if not isinstance(html, unicode):
            raise UnicodeInputRequired

        utf8html = html.encode('utf8')
        tf_name = _write_to_and_return_tempfile_name(utf8html)

        # The temporary file exists from here on, so make sure it is
        # removed however we leave this method.
        try:
            # Replace cols marker:
            def replace_cols(token):
                if token == ['cols']:
                    return str(cols)
                return token

            # Replace filename marker:
            def replace_filename(token):
                if token == ['filename']:
                    return tf_name
                return token

            commandline_list = _mapmany([replace_cols, replace_filename],
                                        self.commandline_list)
            commandline = ''.join(commandline_list)

            # Run the process using popen3; possibly dodgy on Windows!
            # Need popen3 rather than another popen function because we
            # want to grab stderr and hide it from the client's console.
            #
            # NOTE(review): stderr is captured but never read, so a child
            # that writes a lot to stderr could block on a full pipe while
            # we wait on stdout -- confirm the configured commands are
            # quiet on stderr.
            (stdin, stdout, stderr) = os.popen3(commandline, 'r')
            try:
                utf8output = stdout.read()
                # NOTE(review): the Python 2 library docs say the popen2/3/4
                # family cannot report the child's exit status, in which
                # case this is always None -- confirm on target platforms.
                exit_status = stdout.close()
            finally:
                # popen3 hands back three pipe ends; close all of them
                # instead of leaving it to garbage collection.  Closing an
                # already closed file object is harmless, and a failure
                # while tidying up must not mask the real error.
                for pipe in (stdin, stderr, stdout):
                    try:
                        pipe.close()
                    except (IOError, OSError):
                        pass
        finally:
            _remove_tempfile(tf_name)

        # Check the return code:
        if exit_status is not None:
            raise HTMLParsingFailed

        # Convert back to unicode object and return.  The conversion is
        # guarded just in case the parser outputs bogus utf8:
        try:
            output = unicode(utf8output, 'utf8')
            return output
        except (LookupError, UnicodeError):
            raise HTMLParsingFailed
コード例 #3
0
def extract(input, # byte string of file location
            input_disposition='byte_string', # ['byte_string', 'file_location']
            compression_hint=None, # [None, 'gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip']
            extract_to='byte_strings', # ['byte_strings', 'my_directory', 'temp_directory']

            my_directory=None, # directory path
            backup_extension=None, # extension including dot, for backup of my_directory
            directory_structure='retain', # ['retain', 'flatten']

            file_handle = None, # [None, 'py', 'os']
            file_handle_mode = 'rb',

            force_file_permissions=None, # file permission bits. eg 0777.
            force_dir_permissions=None, # file permission bits. eg 0777.
            umask=None, # file permission bits. eg. 0777 (assuming standard umask interpretation).

            allow_file_types=_valid_file_types, # list containing any of ['regular, dir, symlink, hardlink, char_dev, block_dev, fifo']
            on_find_invalid_file_type='throw_error', # ['throw_error', 'skip']

            filename_collision='rename_similar', # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip']
            rename_from_set='abcdefghijklmnopqrstuvwxyz', # characters to use if filename_collision='rename_random'
            num_random_bits=8, # number of random bits to use in the random filename.

            allow_clobber=False, # [True, False]

            on_find_dotdot_path='throw_error', # ['throw_error', 'skip', 'allow']
            on_find_absolute_path='throw_error' # ['throw_error', 'skip', 'allow']

            # Shelved options:
            # file_name_regexp, non_matches='rename_safely', etc.
            # Hopefully to be implemented in the future.
            ):

    # Clean out the written files list:
    global _remove_on_error
    global _remove_always
    _remove_on_error = []
    _remove_always = []

    # Validate arguments.
    _validate_args('input_disposition', input_disposition, ['byte_string', 'file_location'])
    _validate_args('compression_hint', compression_hint, [None] + available_tools.keys())
    _validate_args('extract_to', extract_to, ['byte_strings', 'my_directory', 'temp_directory'])
    # _validate_args('extract_to', return_objects, [None, 'file_location', 'open_c_filehandles', 'open_py_file_handles'])
    f = lambda type: _validate_args('allow_file_types', type, _valid_file_types)
    map(f, allow_file_types)
    if not input: raise ValueError('argument input must specify a filename or a byte string')

    # From here on, we start writing things out to disk, so we wrap it
    # in a try loop and catch all exceptions. This allows us to clean
    # up the disk if we didn't succeed with the whole of the
    # extraction.

    try:
        # try/except/finally cannot be combined, so we have to nest:
        try:
            # Write input to a temp file if we are given a byte string.
            if input_disposition == 'byte_string':
                input_file_loc = _write_to_and_return_tempfile_name(input)
                _remember_write(input_file_loc)
            else:
                # input_disposition == 'file_location'
                # Check that the input file location we've been given exists;
                # stat will throw the right error for us:
                os.stat(input)

                # Check it is a file:
                if not os.path.isfile(input):
                    raise ValueError("argument input must be a path to an archive file if input_disposition='file_location': %s"
                                     % (input))
                input_file_loc = input

            # Make sure we know what type of file we're dealing with:
            if compression_hint is None:
                compression_ext = _calculate_filename_extension(filename=input_file_loc)
                compression_ext = _pick_compression_type(compression_ext)
            else:
                compression_ext = compression_hint

            # Select approriate archive/compression tool:
            try:
                tool_class = available_tools[compression_ext]
            except KeyError:
                raise EZArchiveError('Unrecognized archive type: %s' % (compression_ext))

            # Instantiate the tool:
            archive = tool_class(input_file_loc, mode='r', allow_clobber=allow_clobber)

            if extract_to == 'byte_strings':
                # If extract_to == byte_strings, permissions mean nothing.
                # However, because we use a temp directory to load the files
                # into byte strings, we force the permissions to be nice and
                # liberal inside the temp dir:
                force_file_permissions = 0700
                force_dir_permissions = 0700

            # Get extraction_root:
            if extract_to == 'byte_strings' or extract_to == 'temp_directory':
                # Need a temp directory to work in.
                extraction_root = tempfile.mkdtemp()

                if extract_to == 'byte_strings':
                    _remember_write(extraction_root, error_only=False)
                else:
                    # extract_to == 'temp_directory':
                    _remember_write(extraction_root, error_only=True)
            else:
                # extract_to == 'my_directory':

                if my_directory is None:
                    raise ValueError("my_directory must be specified if extract_to='my_directory'")

                # Make given directory into a nice sane one.
                my_directory = os.path.abspath(os.path.expanduser(os.path.normpath(my_directory)))

                # Check it exists, and we can stat it:
                # stat will throw the right error for us:
                os.stat(my_directory)

                # Check it is a dir.
                if not os.path.isdir(my_directory):
                    raise ValueError("argument my_directory must be a directory: %s" % (my_directory))

                # If we've been asked to back it up, do so:
                if backup_extension is not None:
                    backup_dir = my_directory + backup_extension
                    if _backup_directory(my_directory, backup_dir) is not None:
                        raise EZArchiveError('creation of backup directory using GNU mirrordir failed: %s' % (backup_dir))

                # Finally set the extraction root:
                extraction_root = my_directory

                # Logically we would also check we have write permissions
                # here.  But this is acutally better served by letting
                # builtin/other functions raise EnvironmentErrors when we fail
                # to write: Checking for write permissions is actually quite
                # complex: e.g. you'd have to check group membership to see if
                # the group bits allow write.

            # If we haven't been given a umask, use take the system umask as a
            # default. If we have been given a umask, set the system umask to
            # it, so all calls to builtin open/file apply the given umask:
            if umask is None:
                # It doesn't seem possible to read the umask without also
                # setting it. Hence this fudge:
                umask = os.umask(0777)
                os.umask(umask)

            # Used in the extraction for loop to check for filename collisions
            # when flattening directory structure:
            seen_filenames = {}

            # Collect the returned file information here:
            return_data = []

            for mem in archive.list_all_members():
                name = mem['name']
                dir = mem['dir']
                file_type = mem['file_type']
                identity_object = mem['identity_object']

                # Check it is an allowed file type:
                if file_type not in allow_file_types:
                    if on_find_invalid_file_type=='skip':
                        continue
                    else:
                        # on_find_invalid_file_type='throw_error':
                        raise EZArchiveError("found disallowed file type '%s': %s" % (file_type, os.path.join(dir, name)))

                # Deal with dotdot paths:
                if on_find_dotdot_path == 'allow':
                    pass
                else:
                    # check if path contains '..'
                    dir_parts = dir.split(os.sep)
                    if '..' in dir_parts or name == '..':
                        if on_find_dotdot_path == 'throw_error':
                            raise EZArchiveError("tar entry's path contains '..' (*cautiously* consider on_find_dotdot_path='allow'): "
                                                 + os.path.join(dir, name))
                        else:
                            # on_find_dotdot_path == 'skip'
                            # next file please:
                            continue

                # Deal with absolute paths in a similar way:
                if on_find_absolute_path == 'allow':
                    pass
                else:
                    # check if path begins with '/'
                    if dir != '' and dir[0] == '/':
                        if on_find_absolute_path == 'throw_error':
                            raise EZArchiveError("tar entry's path is absolute (*cautiously* consider on_find_absolute_path='allow'): "
                                                 + os.path.join(dir, name))
                        else:
                            # on_find_absolute_path == 'skip'
                            # next file please:
                            continue

                # Deal with flattening of directories:
                if directory_structure == 'flatten':
                    dir = ''

                    if file_type == 'dir':
                        continue

                # tars allow multiple entries for same path/file:
                # extracting such tarballs with GNU/tar will just
                # cause the second entry to overwrite the first.  We
                # try to be more graceful:

                verified_fullname = _verify_filename(name=os.path.join(dir, name), seen_filenames=seen_filenames,
                                                     filename_collision=filename_collision, num_random_bits=num_random_bits,
                                                     rename_from_set=rename_from_set)

                if verified_fullname == ['skip']: continue
                name = os.path.basename(verified_fullname)

                archive.extract_member(identity_object=identity_object, root_dir=extraction_root, dir=dir, new_filename=name,
                                       umask=umask, force_file_permissions=force_file_permissions, force_dir_permissions=force_dir_permissions,
                                       allow_clobber=allow_clobber)

                fullname = os.path.join(extraction_root, dir, name)

                file_info = {}
                file_info['basename'] = name
                file_info['tar_dir'] = dir
                file_info['file_type'] = file_type

                if extract_to == 'byte_strings':
                    if file_type == 'regular':
                        file_info['file'] = open(fullname, 'rb').read()
                else:
                    # extract_to in ['my_directory', 'temp_directory']
                    file_info['fullname'] = fullname
                    file_info['dirname'] = os.path.join(extraction_root, dir)

                    if file_type == 'regular':
                        if file_handle == 'py':
                            file_info['fh'] = open(fullname, file_handle_mode)
                        elif file_handle == 'os':
                            file_info['fh'] = os.open(fullname, file_handle_mode)

                return_data.append(file_info)

            if extract_to == 'temp_directory':
                return (extraction_root, return_data)
            else:
                return return_data

        except:
            # Clean up non-temporary file if we get an error:
            _delete_files(_remove_on_error)
            raise
    finally:
        # Always clean up temporary files, error or not:
        _delete_files(_remove_always)
コード例 #4
0
def extract(input, # byte string of file location
            input_disposition='byte_string', # ['byte_string', 'file_location']
            compression_hint=None, # [None, 'gz', 'bz2', 'tar', 'tar.gz', 'tar.bz2', 'zip']
            extract_to='byte_strings', # ['byte_strings', 'my_directory', 'temp_directory']

            my_directory=None, # directory path
            backup_extension=None, # extension including dot, for backup of my_directory
            directory_structure='retain', # ['retain', 'flatten']

            file_handle = None, # [None, 'py', 'os']
            file_handle_mode = 'rb',

            force_file_permissions=None, # file permission bits. eg 0777.
            force_dir_permissions=None, # file permission bits. eg 0777.
            umask=None, # file permission bits. eg. 0777 (assuming standard umask interpretation).

            allow_file_types=_valid_file_types, # list containing any of ['regular, dir, symlink, hardlink, char_dev, block_dev, fifo']
            on_find_invalid_file_type='throw_error', # ['throw_error', 'skip']

            filename_collision='rename_similar', # ['throw_error', 'rename_similar', 'rename_random', 'overwrite', 'skip']
            rename_from_set='abcdefghijklmnopqrstuvwxyz', # characters to use if filename_collision='rename_random'
            num_random_bits=8, # number of random bits to use in the random filename.

            allow_clobber=False, # [True, False]

            on_find_dotdot_path='throw_error', # ['throw_error', 'skip', 'allow']
            on_find_absolute_path='throw_error' # ['throw_error', 'skip', 'allow']

            # Shelved options:
            # file_name_regexp, non_matches='rename_safely', etc.
            # Hopefully to be implemented in the future.
            ):

    # Clean out the written files list:
    global _remove_on_error
    global _remove_always
    _remove_on_error = []
    _remove_always = []

    # Validate arguments.
    _validate_args('input_disposition', input_disposition, ['byte_string', 'file_location'])
    _validate_args('compression_hint', compression_hint, [None] + available_tools.keys())
    _validate_args('extract_to', extract_to, ['byte_strings', 'my_directory', 'temp_directory'])
    # _validate_args('extract_to', return_objects, [None, 'file_location', 'open_c_filehandles', 'open_py_file_handles'])
    f = lambda type: _validate_args('allow_file_types', type, _valid_file_types)
    map(f, allow_file_types)
    if not input: raise ValueError('argument input must specify a filename or a byte string')

    # From here on, we start writing things out to disk, so we wrap it
    # in a try loop and catch all exceptions. This allows us to clean
    # up the disk if we didn't succeed with the whole of the
    # extraction.

    try:
        # try/except/finally cannot be combined, so we have to nest:
        try:
            # Write input to a temp file if we are given a byte string.
            if input_disposition == 'byte_string':
                input_file_loc = _write_to_and_return_tempfile_name(input)
                _remember_write(input_file_loc)
            else:
                # input_disposition == 'file_location'
                # Check that the input file location we've been given exists;
                # stat will throw the right error for us:
                os.stat(input)

                # Check it is a file:
                if not os.path.isfile(input):
                    raise ValueError("argument input must be a path to an archive file if input_disposition='file_location': %s"
                                     % (input))
                input_file_loc = input

            # Make sure we know what type of file we're dealing with:
            if compression_hint is None:
                compression_ext = _calculate_filename_extension(filename=input_file_loc)
                compression_ext = _pick_compression_type(compression_ext)
            else:
                compression_ext = compression_hint

            # Select approriate archive/compression tool:
            try:
                tool_class = available_tools[compression_ext]
            except KeyError:
                raise EZArchiveError('Unrecognized archive type: %s' % (compression_ext))

            # Instantiate the tool:
            archive = tool_class(input_file_loc, mode='r', allow_clobber=allow_clobber)

            if extract_to == 'byte_strings':
                # If extract_to == byte_strings, permissions mean nothing.
                # However, because we use a temp directory to load the files
                # into byte strings, we force the permissions to be nice and
                # liberal inside the temp dir:
                force_file_permissions = 0700
                force_dir_permissions = 0700

            # Get extraction_root:
            if extract_to == 'byte_strings' or extract_to == 'temp_directory':
                # Need a temp directory to work in.
                extraction_root = tempfile.mkdtemp()

                if extract_to == 'byte_strings':
                    _remember_write(extraction_root, error_only=False)
                else:
                    # extract_to == 'temp_directory':
                    _remember_write(extraction_root, error_only=True)
            else:
                # extract_to == 'my_directory':

                if my_directory is None:
                    raise ValueError("my_directory must be specified if extract_to='my_directory'")

                # Make given directory into a nice sane one.
                my_directory = os.path.abspath(os.path.expanduser(os.path.normpath(my_directory)))

                # Check it exists, and we can stat it:
                # stat will throw the right error for us:
                os.stat(my_directory)

                # Check it is a dir.
                if not os.path.isdir(my_directory):
                    raise ValueError("argument my_directory must be a directory: %s" % (my_directory))

                # If we've been asked to back it up, do so:
                if backup_extension is not None:
                    backup_dir = my_directory + backup_extension
                    if _backup_directory(my_directory, backup_dir) is not None:
                        raise EZArchiveError('creation of backup directory using GNU mirrordir failed: %s' % (backup_dir))

                # Finally set the extraction root:
                extraction_root = my_directory

                # Logically we would also check we have write permissions
                # here.  But this is acutally better served by letting
                # builtin/other functions raise EnvironmentErrors when we fail
                # to write: Checking for write permissions is actually quite
                # complex: e.g. you'd have to check group membership to see if
                # the group bits allow write.

            # If we haven't been given a umask, use take the system umask as a
            # default. If we have been given a umask, set the system umask to
            # it, so all calls to builtin open/file apply the given umask:
            if umask is None:
                # It doesn't seem possible to read the umask without also
                # setting it. Hence this fudge:
                umask = os.umask(0777)
                os.umask(umask)

            # Used in the extraction for loop to check for filename collisions
            # when flattening directory structure:
            seen_filenames = {}

            # Collect the returned file information here:
            return_data = []

            for mem in archive.list_all_members():
                name = mem['name']
                dir = mem['dir']
                file_type = mem['file_type']
                identity_object = mem['identity_object']

                # Check it is an allowed file type:
                if file_type not in allow_file_types:
                    if on_find_invalid_file_type=='skip':
                        continue
                    else:
                        # on_find_invalid_file_type='throw_error':
                        raise EZArchiveError("found disallowed file type '%s': %s" % (file_type, os.path.join(dir, name)))

                # Deal with dotdot paths:
                if on_find_dotdot_path == 'allow':
                    pass
                else:
                    # check if path contains '..'
                    dir_parts = dir.split(os.sep)
                    if '..' in dir_parts or name == '..':
                        if on_find_dotdot_path == 'throw_error':
                            raise EZArchiveError("tar entry's path contains '..' (*cautiously* consider on_find_dotdot_path='allow'): "
                                                 + os.path.join(dir, name))
                        else:
                            # on_find_dotdot_path == 'skip'
                            # next file please:
                            continue

                # Deal with absolute paths in a similar way:
                if on_find_absolute_path == 'allow':
                    pass
                else:
                    # check if path begins with '/'
                    if dir != '' and dir[0] == '/':
                        if on_find_absolute_path == 'throw_error':
                            raise EZArchiveError("tar entry's path is absolute (*cautiously* consider on_find_absolute_path='allow'): "
                                                 + os.path.join(dir, name))
                        else:
                            # on_find_absolute_path == 'skip'
                            # next file please:
                            continue

                # Deal with flattening of directories:
                if directory_structure == 'flatten':
                    dir = ''

                    if file_type == 'dir':
                        continue

                # tars allow multiple entries for same path/file:
                # extracting such tarballs with GNU/tar will just
                # cause the second entry to overwrite the first.  We
                # try to be more graceful:

                verified_fullname = _verify_filename(name=os.path.join(dir, name), seen_filenames=seen_filenames,
                                                     filename_collision=filename_collision, num_random_bits=num_random_bits,
                                                     rename_from_set=rename_from_set)

                if verified_fullname == ['skip']: continue
                name = os.path.basename(verified_fullname)

                archive.extract_member(identity_object=identity_object, root_dir=extraction_root, dir=dir, new_filename=name,
                                       umask=umask, force_file_permissions=force_file_permissions, force_dir_permissions=force_dir_permissions,
                                       allow_clobber=allow_clobber)

                fullname = os.path.join(extraction_root, dir, name)

                file_info = {}
                file_info['basename'] = name
                file_info['tar_dir'] = dir
                file_info['file_type'] = file_type

                if extract_to == 'byte_strings':
                    if file_type == 'regular':
                        file_info['file'] = open(fullname, 'rb').read()
                else:
                    # extract_to in ['my_directory', 'temp_directory']
                    file_info['fullname'] = fullname
                    file_info['dirname'] = os.path.join(extraction_root, dir)

                    if file_type == 'regular':
                        if file_handle == 'py':
                            file_info['fh'] = open(fullname, file_handle_mode)
                        elif file_handle == 'os':
                            file_info['fh'] = os.open(fullname, file_handle_mode)

                return_data.append(file_info)

            if extract_to == 'temp_directory':
                return (extraction_root, return_data)
            else:
                return return_data

        except:
            # Clean up non-temporary file if we get an error:
            _delete_files(_remove_on_error)
            raise
    finally:
        # Always clean up temporary files, error or not:
        _delete_files(_remove_always)