def _make_file(self, name, filename, mimetype, body, default_language):
    """Build a resource from raw file content through ``make_resource``.

    The resource class is looked up in ``self.database`` from the mimetype;
    both HTML mimetypes are mapped to the ``'webpage'`` class id.

    :param name: name of the new resource; must be exactly of type ``str``.
    :param filename: original file name, also used to detect the language.
    :param mimetype: mimetype of ``body``.
    :param body: raw content of the file.
    :param default_language: language used for web pages when neither the
        file name nor ``guess_language`` yields one.
    :returns: whatever ``self.make_resource`` returns.
    :raises TypeError: if ``type(name)`` is not ``str``.
    """
    # Function-scope import, as in the original (presumably to avoid an
    # import cycle -- TODO confirm).
    from webpage import WebPage

    if type(name) is not str:
        raise TypeError('expected string, got %s' % repr(name))

    # Only 'text/html' goes through tidy_html; XHTML is stored untouched.
    if mimetype == 'text/html':
        body = tidy_html(body)
    is_web_page = mimetype in ('text/html', 'application/xhtml+xml')
    class_id = 'webpage' if is_web_page else mimetype
    resource_class = self.database.get_resource_class(class_id)

    # Common case: the body is stored as is.
    if not issubclass(resource_class, WebPage):
        return self.make_resource(name, resource_class, filename=filename,
                                  data=body)

    # Web pages keep their data per language.
    _, _, language = FileName.decode(filename)
    if language is None:
        plain_text = XHTMLFile(string=body).to_text()
        language = guess_language(plain_text) or default_language
    return self.make_resource(name, resource_class, filename=filename,
                              data={language: body})
def extract_archive(self, handler, default_language, filter=None,
                    postproc=None, update=False):
    """Import every file listed by ``handler`` below this resource.

    :param handler: archive-like object providing ``get_contents()`` (the
        entry paths) and ``get_file(path)`` (the entry body).
    :param default_language: language used for folder titles and for files
        whose name does not encode a language.
    :param filter: optional callable ``filter(path, mimetype, body)``; its
        return value replaces the body, and ``None`` skips the entry.
    :param postproc: optional callable invoked with each created or updated
        resource.
    :param update: when it is ``False``, finding an existing resource at the
        target name raises; otherwise the existing resource is reloaded.
    :raises RuntimeError: when a path segment exists but is not a ``Folder``,
        or when a resource already exists and ``update`` is ``False``.
    """
    mark_changed = self.database.change_resource
    web_mimetypes = ('application/xhtml+xml', 'text/html')

    for entry in handler.get_contents():
        entry_path = Path(entry)

        # Folder entries are skipped.
        if entry_path.endswith_slash:
            continue

        # So are the owner files (garbage produced by microsoft).
        basename = entry_path[-1]
        if basename.startswith('~$'):
            continue

        # Walk down the parent segments, creating missing folders.
        container = self
        for segment in entry_path[:-1]:
            segment_name, segment_title = process_name(segment)
            child = container.get_resource(segment_name, soft=True)
            if child is None:
                container = container.make_resource(segment_name, Folder)
                container.set_value('title', segment_title, default_language)
                continue
            if not isinstance(child, Folder):
                raise RuntimeError(MSG_NAME_CLASH)
            container = child

        # Resource name, title, mimetype and language.
        mimetype = guess_mimetype(basename, 'application/octet-stream')
        stem, extension, language = FileName.decode(basename)
        name, title = process_name(stem)
        if not language:
            language = default_language
        # Web pages lose their extension, everything else keeps it.
        if mimetype not in web_mimetypes:
            name = FileName.encode((name, extension, None))

        # Body, optionally rewritten or vetoed by the filter.
        body = handler.get_file(entry)
        if filter:
            body = filter(entry, mimetype, body)
            if body is None:
                continue

        resource = container.get_resource(name, soft=True)

        # The resource does not exist yet: create it.
        if not resource:
            resource = container._make_file(name, basename, mimetype, body,
                                            language)
            resource.set_value('title', title, language=language)
            if postproc:
                postproc(resource)
            continue

        # The resource exists: only allowed when updating.
        if update is False:
            raise RuntimeError(
                'unexpected resource at {path}'.format(path=entry))

        if mimetype == 'text/html':
            body = tidy_html(body)
            target_handler = resource.get_handler(language)
        else:
            target_handler = resource.get_handler()
        previous = resource.handler.to_str()
        target_handler.load_state_from_string(body)
        if postproc:
            postproc(resource)
        # FIXME Comparing the bytes does not work for XML, so only the
        # lengths are compared (weak heuristic).
        if len(previous) != len(resource.handler.to_str()):
            mark_changed(resource)
def extract_archive(self, handler, default_language, filter=None,
                    postproc=None, update=False):
    """Import every file listed by ``handler`` below this resource.

    Each path segment is sanitised with ``checkid`` before use; a segment
    that is empty, or for which ``checkid`` returns a false value, is
    replaced by ``'file'``.

    :param handler: archive-like object providing ``get_contents()`` (the
        entry paths) and ``get_file(path)`` (the entry body).
    :param default_language: language used for folder titles and for files
        whose name does not encode a language.
    :param filter: optional callable ``filter(path, mimetype, body)``; it
        receives the raw entry path, its return value replaces the body,
        and ``None`` skips the entry.
    :param postproc: optional callable invoked with each created or updated
        resource.
    :param update: when it is ``False``, finding an existing resource at the
        target name raises; otherwise the existing resource is reloaded.
    :raises RuntimeError: when a path segment exists but is not a ``Folder``,
        or when a resource already exists and ``update`` is ``False``.
    """
    change_resource = self.database.change_resource
    for path_str in handler.get_contents():
        # 1. Skip folders
        # This must be tested on the raw entry: the cleaning below turns the
        # empty segment left by a trailing slash into 'file', so a folder
        # entry such as 'docs/' would otherwise be imported as 'docs/file'.
        if path_str.endswith('/'):
            continue
        clean_path = "/".join([
            (checkid(x) or 'file') if x else 'file'
            for x in path_str.split("/")])
        path = Path(clean_path)
        # Kept as a safety net; not expected to trigger after cleaning.
        if path.endswith_slash:
            continue

        # Skip the owner file (garbage produced by microsoft)
        filename = path[-1]
        if filename.startswith('~$'):
            continue

        # 2. Create parent folders if needed
        folder = self
        for name in path[:-1]:
            name, title = process_name(name)
            subfolder = folder.get_resource(name, soft=True)
            if subfolder is None:
                folder = folder.make_resource(name, Folder)
                folder.set_value('title', title, default_language)
            elif not isinstance(subfolder, Folder):
                raise RuntimeError(MSG_NAME_CLASH)
            else:
                folder = subfolder

        # 3. Find out the resource name and title, the file mimetype and
        # language
        mimetype = guess_mimetype(filename, 'application/octet-stream')
        name, extension, language = FileName.decode(filename)
        name, title = process_name(name)
        language = language or default_language
        # Keep the filename extension (except in webpages)
        if mimetype not in ('application/xhtml+xml', 'text/html'):
            name = FileName.encode((name, extension, None))

        # 4. The body (read with the raw path, which is what the handler
        # knows about)
        body = handler.get_file(path_str)
        if filter:
            body = filter(path_str, mimetype, body)
            if body is None:
                continue

        # 5. Update or make file
        file = folder.get_resource(name, soft=True)
        if file:
            # Case 2: the resource exists, only allowed when updating
            if update is False:
                msg = 'unexpected resource at {path}'
                raise RuntimeError(msg.format(path=path_str))
            if mimetype == 'text/html':
                body = tidy_html(body)
                file_handler = file.get_handler(language)
            else:
                file_handler = file.get_handler()
            old_body = file.handler.to_str()
            file_handler.load_state_from_string(body)
            if postproc:
                postproc(file)
            # FIXME Comparing the bytes does not work for XML, so we use
            # this weak heuristic
            if len(old_body) != len(file.handler.to_str()):
                change_resource(file)
        else:
            # Case 1: the resource does not exist
            file = folder._make_file(name, filename, mimetype, body,
                                     language)
            file.set_value('title', title, language=language)
            if postproc:
                postproc(file)