Exemple #1
0
    def _make_file(self, name, filename, mimetype, body, default_language):
        from webpage import WebPage

        if type(name) is not str:
            raise TypeError, 'expected string, got %s' % repr(name)

        # Web Pages are first class citizens
        if mimetype == 'text/html':
            body = tidy_html(body)
            class_id = 'webpage'
        elif mimetype == 'application/xhtml+xml':
            class_id = 'webpage'
        else:
            class_id = mimetype
        cls = self.database.get_resource_class(class_id)

        # Special case: web pages
        kw = {'filename': filename, 'data': body}
        if issubclass(cls, WebPage):
            kk, kk, language = FileName.decode(filename)
            if language is None:
                text = XHTMLFile(string=body).to_text()
                language = guess_language(text) or default_language
            kw['data'] = {language: body}

        return self.make_resource(name, cls, **kw)
Exemple #2
0
    def _make_file(self, name, filename, mimetype, body, default_language):
        from webpage import WebPage

        if type(name) is not str:
            raise TypeError, 'expected string, got %s' % repr(name)

        # Web Pages are first class citizens
        if mimetype == 'text/html':
            body = tidy_html(body)
            class_id = 'webpage'
        elif mimetype == 'application/xhtml+xml':
            class_id = 'webpage'
        else:
            class_id = mimetype
        cls = self.database.get_resource_class(class_id)

        # Special case: web pages
        kw = {'filename': filename, 'data': body}
        if issubclass(cls, WebPage):
            kk, kk, language = FileName.decode(filename)
            if language is None:
                text = XHTMLFile(string=body).to_text()
                language = guess_language(text) or default_language
            kw['data'] = {language: body}

        return self.make_resource(name, cls, **kw)
Exemple #3
0
    def extract_archive(self, handler, default_language, filter=None,
                        postproc=None, update=False):
        """Import the files of an archive as resources below this folder.

        handler -- archive handler; must provide 'get_contents()' (the
            paths stored in the archive) and 'get_file(path)' (the body of
            one of them).
        default_language -- language used for the titles of created folders
            and for files whose name carries no language.
        filter -- optional callable '(path, mimetype, body)' returning the
            body to store, or None to skip the file.
        postproc -- optional callable invoked with every created or updated
            resource.
        update -- when False (the default) an already existing resource
            raises RuntimeError; otherwise its handler is reloaded from the
            archive body.

        Raises RuntimeError when a parent folder name is already used by a
        resource that is not a Folder, or when the resource exists and
        'update' is False.
        """
        change_resource = self.database.change_resource
        for path_str in handler.get_contents():
            # 1. Skip folders
            path = Path(path_str)
            if path.endswith_slash:
                continue

            # Skip the owner file (garbage produced by microsoft)
            filename = path[-1]
            if filename.startswith('~$'):
                continue

            # 2. Create parent folders if needed
            folder = self
            for name in path[:-1]:
                name, title = process_name(name)
                subfolder = folder.get_resource(name, soft=True)
                if subfolder is None:
                    folder = folder.make_resource(name, Folder)
                    folder.set_value('title', title, default_language)
                elif not isinstance(subfolder, Folder):
                    # Call form of 'raise': valid on Python 2 and Python 3
                    raise RuntimeError(MSG_NAME_CLASH)
                else:
                    folder = subfolder

            # 3. Find out the resource name and title, the file mimetype and
            # language
            mimetype = guess_mimetype(filename, 'application/octet-stream')
            name, extension, language = FileName.decode(filename)
            name, title = process_name(name)
            language = language or default_language
            # Keep the filename extension (except in webpages)
            if mimetype not in ('application/xhtml+xml', 'text/html'):
                name = FileName.encode((name, extension, None))

            # 4. The body
            body = handler.get_file(path_str)
            if filter:
                body = filter(path_str, mimetype, body)
                if body is None:
                    # The filter rejected this file
                    continue

            # 5. Update or make file ('resource' rather than 'file', so the
            # Python 2 builtin is not shadowed)
            resource = folder.get_resource(name, soft=True)
            if resource:
                if update is False:
                    msg = 'unexpected resource at {path}'
                    raise RuntimeError(msg.format(path=path_str))
                if mimetype == 'text/html':
                    body = tidy_html(body)
                    file_handler = resource.get_handler(language)
                else:
                    file_handler = resource.get_handler()
                # NOTE(review): the comparison below reads 'resource.handler'
                # while the new state is loaded into 'file_handler'; confirm
                # both are the same handler in the 'text/html' case.
                old_body = resource.handler.to_str()
                file_handler.load_state_from_string(body)
                if postproc:
                    postproc(resource)
                # FIXME Comparing the bytes does not work for XML, so we use
                # this weak heuristic
                if len(old_body) != len(resource.handler.to_str()):
                    change_resource(resource)
            else:
                # Case 1: the resource does not exist
                resource = folder._make_file(name, filename, mimetype, body,
                                             language)
                resource.set_value('title', title, language=language)
                if postproc:
                    postproc(resource)
Exemple #4
0
    def extract_archive(self, handler, default_language, filter=None,
                        postproc=None, update=False):
        """Import the files of an archive as resources below this folder.

        Every path segment is cleaned with 'checkid' (falling back to the
        literal 'file' when the result is empty) before it is used as a
        resource name.

        handler -- archive handler; must provide 'get_contents()' (the
            paths stored in the archive) and 'get_file(path)' (the body of
            one of them).
        default_language -- language used for the titles of created folders
            and for files whose name carries no language.
        filter -- optional callable '(path, mimetype, body)' returning the
            body to store, or None to skip the file.  It receives the raw
            archive path, not the cleaned one.
        postproc -- optional callable invoked with every created or updated
            resource.
        update -- when False (the default) an already existing resource
            raises RuntimeError; otherwise its handler is reloaded from the
            archive body.

        Raises RuntimeError when a parent folder name is already used by a
        resource that is not a Folder, or when the resource exists and
        'update' is False.
        """
        change_resource = self.database.change_resource
        for path_str in handler.get_contents():
            # 1. Skip folders.  This has to be tested on the raw path: the
            # clean-up below replaces empty segments by 'file', so the
            # trailing slash of a folder entry would not survive it and the
            # folder would be handled as a file.
            if path_str.endswith('/'):
                continue
            clean_path = "/".join([
                checkid(x) or 'file' if x else 'file'
                for x in path_str.split("/")])
            path = Path(clean_path)
            # Kept from the original code, in case Path still reports one
            if path.endswith_slash:
                continue

            # Skip the owner file (garbage produced by microsoft)
            filename = path[-1]
            if filename.startswith('~$'):
                continue

            # 2. Create parent folders if needed
            folder = self
            for name in path[:-1]:
                name, title = process_name(name)
                subfolder = folder.get_resource(name, soft=True)
                if subfolder is None:
                    folder = folder.make_resource(name, Folder)
                    folder.set_value('title', title, default_language)
                elif not isinstance(subfolder, Folder):
                    # Call form of 'raise': valid on Python 2 and Python 3
                    raise RuntimeError(MSG_NAME_CLASH)
                else:
                    folder = subfolder

            # 3. Find out the resource name and title, the file mimetype and
            # language
            mimetype = guess_mimetype(filename, 'application/octet-stream')
            name, extension, language = FileName.decode(filename)
            name, title = process_name(name)
            language = language or default_language
            # Keep the filename extension (except in webpages)
            if mimetype not in ('application/xhtml+xml', 'text/html'):
                name = FileName.encode((name, extension, None))

            # 4. The body (read with the raw archive path)
            body = handler.get_file(path_str)
            if filter:
                body = filter(path_str, mimetype, body)
                if body is None:
                    # The filter rejected this file
                    continue

            # 5. Update or make file ('resource' rather than 'file', so the
            # Python 2 builtin is not shadowed)
            resource = folder.get_resource(name, soft=True)
            if resource:
                if update is False:
                    msg = 'unexpected resource at {path}'
                    raise RuntimeError(msg.format(path=path_str))
                if mimetype == 'text/html':
                    body = tidy_html(body)
                    file_handler = resource.get_handler(language)
                else:
                    file_handler = resource.get_handler()
                # NOTE(review): the comparison below reads 'resource.handler'
                # while the new state is loaded into 'file_handler'; confirm
                # both are the same handler in the 'text/html' case.
                old_body = resource.handler.to_str()
                file_handler.load_state_from_string(body)
                if postproc:
                    postproc(resource)
                # FIXME Comparing the bytes does not work for XML, so we use
                # this weak heuristic
                if len(old_body) != len(resource.handler.to_str()):
                    change_resource(resource)
            else:
                # Case 1: the resource does not exist
                resource = folder._make_file(name, filename, mimetype, body,
                                             language)
                resource.set_value('title', title, language=language)
                if postproc:
                    postproc(resource)