class Handler(object):
    """ImageMagick handler used to convert images and read their metadata.

    The data is wrapped in a ``File`` created under ``base_folder_url`` and
    processed with the ``convert`` and ``identify`` command line tools.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Wrap the image data in a ``File``.

        Keyword arguments:
        base_folder_url -- folder handed to ``File`` and used for the output
        data -- content handed to ``File``
        source_format -- format of ``data``
        env -- optional environment dict for the subprocesses (default {})
        """
        self.base_folder_url = base_folder_url
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Convert the image to ``destination_format`` and return the content.

        The working file is trashed afterwards, whether or not the
        conversion succeeded.
        """
        logger.debug("ImageMagickConvert: %s > %s" % (self.file.source_format,
                                                      destination_format))
        # mktemp only reserves a name: ``convert`` creates the file itself.
        output_url = mktemp(suffix='.%s' % destination_format,
                            dir=self.base_folder_url)
        command = ["convert", self.file.getUrl(), output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.file.reload(output_url)
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        """Return a dictionary built from the ``identify -verbose`` output.

        Only lines starting with an ASCII letter are considered. Each one is
        split at its first ": " (or at its first ":" when no ": " is
        present); lines without any colon are ignored.
        """
        command = ["identify", "-verbose", self.file.getUrl()]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
        finally:
            self.file.trash()
        metadata_dict = {}
        for line in stdout.split("\n"):
            line = line.strip()
            if not re.match(r"[a-zA-Z]", line):
                continue
            # Prefer ": " so that keys which themselves contain colons
            # (e.g. "date:create: ...") are kept whole.
            key, separator, value = line.partition(": ")
            if not separator:
                key, separator, value = line.partition(":")
            if not separator:
                # Not a "key: value" line.
                continue
            metadata_dict[key] = value.strip()
        return metadata_dict

    def setMetadata(self, metadata=None):
        """Not supported by this handler.

        Keyword arguments:
        metadata -- expected an dictionary with metadata.

        Raises NotImplementedError.
        """
        raise NotImplementedError
class TestUnoConverter(HandlerTestCase):
    """Test case to test all features of the unoconverter script"""

    file_msg_list = ["Microsoft Office Document",
                     "CDF V2 Document, Little Endian, Os: Windows, Version 1.0,"]

    def afterSetUp(self):
        """Load the ODT fixture, then acquire openoffice and its address."""
        # Read the fixture first (binary mode, handle closed) so that a
        # missing file does not leave openoffice acquired.
        with open("data/test.odt", 'rb') as odt_file:
            data = odt_file.read()
        openoffice.acquire()
        self.hostname, self.port = openoffice.getAddress()
        self.document = File(self.tmp_url, data, 'odt')

    def tearDown(self):
        """Called to unlock the openoffice"""
        openoffice.release()

    def testUnoConverterOdtToDoc(self):
        """Run the unoconverter script to convert the ODT fixture to DOC."""
        mimemapper = dict(
            filter_list=[('doc',
                          'com.sun.star.text.TextDocument',
                          'MS Word 97')],
            doc_type_list_by_extension=dict(
                doc=['com.sun.star.text.TextDocument']))
        mimemapper_pickled = json.dumps(mimemapper)
        # Use the python found in the office binary path when there is one.
        python = join(self.office_binary_path, "python")
        interpreter = python if exists(python) else "python"
        command = [
            interpreter,
            pkg_resources.resource_filename("cloudooo.handler.ooo",
                                            "/helper/unoconverter.py"),
            "--convert",
            "--uno_path=%s" % self.uno_path,
            "--office_binary_path=%s" % self.office_binary_path,
            "--hostname=%s" % self.hostname,
            "--port=%s" % self.port,
            "--document_url=%s" % self.document.getUrl(),
            "--destination_format=%s" % "doc",
            "--source_format=%s" % "odt",
            "--mimemapper=%s" % mimemapper_pickled,
        ]
        stdout, stderr = Popen(command,
                               stdout=PIPE,
                               stderr=PIPE).communicate()
        self.assertEqual(stderr, '')
        # The script prints the path of the converted document on stdout.
        output_url = stdout.replace('\n', '')
        self.assertTrue(exists(output_url), stdout)
        mime = magic.Magic(mime=True)
        self.assertEqual(mime.from_file(output_url), 'application/msword')
        self.document.trash()
        self.assertFalse(exists(output_url))
class Handler(object):
    """PDF handler: text extraction and metadata access for PDF documents.

    Relies on the ``pdftotext``, ``pdfinfo`` and ``pdftk`` command line tools.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Wrap the document data in a ``File``.

        Keyword arguments:
        base_folder_url -- folder handed to ``File``
        data -- content handed to ``File``
        source_format -- format of ``data``
        env -- optional environment dict for the subprocesses (default {})
        """
        self.base_folder_url = base_folder_url
        self.document = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Run ``pdftotext`` on the document and return the output content.

        The working document is trashed afterwards, even on failure.
        """
        logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                              destination_format))
        # mktemp only reserves a name: ``pdftotext`` creates the file itself.
        output_url = mktemp(suffix=".%s" % destination_format,
                            dir=self.document.directory_name)
        command = ["pdftotext", self.document.getUrl(), output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()

    def getMetadata(self, base_document=False):
        """Return a dictionary built from the ``pdfinfo`` output.

        Each non-empty line is split at its first colon; the lower-cased
        left part is the key and the stripped right part is the value, so
        values that contain colons or spaces (dates, titles) are kept whole.
        Lines without a colon are ignored.
        """
        command = ["pdfinfo", self.document.getUrl()]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
        finally:
            self.document.trash()
        metadata = {}
        for info in stdout.split("\n"):
            info_name, separator, info_value = info.partition(":")
            if not separator:
                continue
            metadata[info_name.lower()] = info_value.strip()
        return metadata

    def setMetadata(self, metadata):
        """Return the document content after ``pdftk update_info``.

        Keyword arguments:
        metadata -- expected an dictionary with metadata.

        Every item is written as an InfoKey/InfoValue pair, with the key
        passed through ``str.capitalize()``.
        """
        text_template = "InfoKey: %s\nInfoValue: %s\n"
        text_list = [text_template % (key.capitalize(), value)
                     for key, value in metadata.items()]
        metadata_file = File(self.document.directory_name,
                             "".join(text_list),
                             "txt")
        output_url = mktemp(suffix=".pdf",
                            dir=self.document.directory_name)
        command = ["pdftk",
                   self.document.getUrl(),
                   "update_info",
                   metadata_file.getUrl(),
                   "output",
                   output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()
class Handler(object):
    """FFMPEG Handler is used to handler inputed audio and video files"""

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """
        base_folder_url(string)
          The requested url for data base folder
        data(string)
          The opened and readed file into a string
        source_format(string)
          The source format of the inputed file
        env(dict, optional keyword)
          Environment passed to the subprocesses (default {})
        """
        self.base_folder_url = base_folder_url
        self.input = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format):
        """Convert the input to ``destination_format`` and return the content.

        When the produced content is empty, the last non-blank line that
        ffmpeg wrote on stderr is logged as an error. The working file is
        trashed afterwards, even on failure.
        """
        # XXX This implementation could use ffmpeg -i pipe:0, but
        # XXX seems super unreliable currently and it generates currupted
        # files in the end
        logger.debug("FfmpegConvert: %s > %s" % (self.input.source_format,
                                                 destination_format))
        output_url = mktemp(suffix=".%s" % destination_format,
                            dir=self.input.directory_name)
        command = ["ffmpeg", "-i", self.input.getUrl()]
        # XXX ffmpeg has a bug that needs this options to work with webm
        # format
        if destination_format == "webm":
            command += ["-ab", "32k"]
        command += ["-y", output_url]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
            self.input.reload(output_url)
            content = self.input.getContent()
            if len(content) == 0:
                # stderr may be empty or a single line: never index blindly.
                error_lines = [line for line in stderr.split("\n")
                               if line.strip()]
                if error_lines:
                    logger.error(error_lines[-1])
                else:
                    logger.error("ffmpeg produced an empty output")
            return content
        finally:
            self.input.trash()

    def getMetadata(self, base_document=False):
        """Return a dictionary built from the ``ffprobe`` stderr output.

        Only the text between the first and the second "Metadata:" marker
        (or the end of the output) is parsed. Each line is split at its
        first colon; the key is stripped and capitalized, the value is
        stripped. Lines without a colon are ignored, and an empty
        dictionary is returned when there is no "Metadata:" marker.
        """
        command = ["ffprobe", self.input.getUrl()]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
        finally:
            self.input.trash()
        metadata_dict = {}
        section_list = stderr.split('Metadata:')
        if len(section_list) < 2:
            return metadata_dict
        for data in section_list[1].split('\n'):
            key, separator, value = data.partition(':')
            if not separator:
                continue
            metadata_dict[key.strip().capitalize()] = value.strip()
        return metadata_dict

    def setMetadata(self, metadata_dict=None):
        """Return the file content re-encoded with new metadata.

        Keyword arguments:
        metadata_dict -- expected an dictionary with metadata; every item is
                         passed to ffmpeg as ``-metadata key=value``.
        """
        if metadata_dict is None:
            metadata_dict = {}
        output_url = mktemp(suffix=".%s" % self.input.source_format,
                            dir=self.input.directory_name)
        command = ["ffmpeg", "-i", self.input.getUrl()]
        for metadata in metadata_dict:
            command += ["-metadata",
                        "%s=%s" % (metadata, metadata_dict[metadata])]
        command += ["-y", output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.input.reload(output_url)
            return self.input.getContent()
        finally:
            self.input.trash()

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('audio/ogg;codecs=opus', 'Opus Audio File Format'),
         ('video/webm', 'Webm Video File Format'),
         ...
        ]
        """
        # XXX NotImplemented
        return []
class Handler(object):
    """PDF handler for PDF and PostScript documents.

    PostScript input is converted with ``ps2pdf``; any other input is run
    through ``pdftotext``. Metadata is read with ``pdfinfo`` and written
    with ``pdftk``.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Wrap the document data in a ``File``.

        Keyword arguments:
        base_folder_url -- folder handed to ``File``
        data -- content handed to ``File``
        source_format -- format of ``data``
        env -- optional environment dict for the subprocesses (default {})
        """
        self.base_folder_url = base_folder_url
        self.document = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Convert the document and return the content of the output file.

        A 'ps' source is processed with ``ps2pdf``, anything else with
        ``pdftotext``. The working document is trashed afterwards, even on
        failure.
        """
        logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                              destination_format))
        # Only the generated name is kept; the external tool writes the file.
        output_url = NamedTemporaryFile(
            suffix=".%s" % destination_format,
            dir=self.document.directory_name).name
        if self.document.source_format == 'ps':
            command = ["ps2pdf",
                       "-dASCII85EncodePages=false",
                       "-dLanguageLevel=1",
                       self.document.getUrl(),
                       output_url]
        else:
            command = ["pdftotext", self.document.getUrl(), output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()

    def getMetadata(self, base_document=False):
        """Return a dictionary built from the ``pdfinfo`` output.

        Each non-empty line is split at its first colon; the lower-cased
        left part is the key and the stripped right part is the value, so
        values that contain colons or spaces (dates, titles) are kept whole.
        Lines without a colon are ignored.
        """
        command = ["pdfinfo", self.document.getUrl()]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
        finally:
            self.document.trash()
        metadata = {}
        for info in stdout.split("\n"):
            info_name, separator, info_value = info.partition(":")
            if not separator:
                continue
            metadata[info_name.lower()] = info_value.strip()
        return metadata

    def setMetadata(self, metadata):
        """Return the document content after ``pdftk update_info``.

        Keyword arguments:
        metadata -- expected an dictionary with metadata.

        Every item is written as an InfoKey/InfoValue pair, with the key
        passed through ``str.capitalize()``.
        """
        text_template = "InfoKey: %s\nInfoValue: %s\n"
        text_list = [text_template % (key.capitalize(), value)
                     for key, value in metadata.items()]
        metadata_file = File(self.document.directory_name,
                             "".join(text_list),
                             "txt")
        output_url = NamedTemporaryFile(
            suffix=".pdf",
            dir=self.document.directory_name).name
        command = ["pdftk",
                   self.document.getUrl(),
                   "update_info",
                   metadata_file.getUrl(),
                   "output",
                   output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()
class Handler(object):
    """wkhtmltopdf handler used to convert HTML documents to PDF.

    The conversion keyword arguments are translated into a ``wkhtmltopdf``
    command line by the ``make*OptionList`` helpers below.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Wrap the document data in a ``File``.

        Keyword arguments:
        base_folder_url -- folder handed to ``File``
        data -- content handed to ``File``
        source_format -- format of ``data``
        env -- optional environment dict for the subprocess (default {})
        """
        self.base_folder_url = base_folder_url
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def makeTempFile(self, destination_format=None):
        """Return a fresh file path inside the working directory.

        Only the name is generated (``mktemp``); nothing is created on disk.
        """
        path = mktemp(
            suffix='.%s' % destination_format,
            dir=self.file.directory_name,
        )
        return path

    def makeTempDir(self, *args, **kw):
        """Create a temporary directory inside the working directory."""
        return mkdtemp(*args, dir=self.file.directory_name, **kw)

    def convertPathToUrl(self, path):
        """Return the ``file://`` url of an absolute path.

        Raises ValueError when ``path`` does not start with "/".
        """
        if path.startswith("/"):
            return "file://" + path
        raise ValueError("path %r is not absolute" % path)

    def convert(self, destination_format=None, **kw):
        """Run wkhtmltopdf on the document and return the output content.

        The keyword arguments are the conversion options understood by
        ``makeWkhtmltopdfCommandList``. The working file is trashed
        afterwards, even on failure.
        """
        logger.debug("wkhtmltopdf convert: %s > %s" % (self.file.source_format,
                                                       destination_format))
        try:
            output_path = self.makeTempFile(destination_format)
            command = self.makeWkhtmltopdfCommandList(
                self.convertPathToUrl(self.file.getUrl()),
                output_path,
                conversion_kw=kw,
            )
            Popen(
                command,
                stdout=PIPE,
                stderr=PIPE,
                close_fds=True,
                env=self.environment,
                # "file" typed data options are passed as bare file names
                cwd=self.file.directory_name,
            ).communicate()
            self.file.reload(output_path)
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        """Not supported by this handler.

        Raises NotImplementedError.
        """
        raise NotImplementedError

    def setMetadata(self, metadata=None):
        """Not supported by this handler.

        Keyword arguments:
        metadata -- expected an dictionary with metadata.

        Raises NotImplementedError.
        """
        raise NotImplementedError

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('application/pdf', 'PDF - Portable Document Format'),
         ...
        ]
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        if source_mimetype in ("text/html", "htm", "html"):
            return [("application/pdf", "PDF - Portable Document Format")]
        return []

    def makeSwitchOptionList(self, allowed_option_list, option_dict):
        """
          A switch option is enable if it exists.

          Ex: for         : --grayscale
              option_dict : {"grayscale": True}
              result      : ["--grayscale"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value:
                option_list.append(keyNameToOption(option_name))
        return option_list

    def makeNoPrefixedOptionList(self, allowed_option_list, option_dict):
        """
          A "no" prefixed option is an option that if disable contains a
          "no" prefix.

          Ex: for         : --images (and --no-images)
              option_dict : {"images": False}
              result      : ["--no-images"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                option_list.append(
                    keyNameToOption(option_name,
                                    prefix="" if value else "no-"))
        return option_list

    def makeEnablePrefixedOptionList(self, allowed_option_list, option_dict):
        """
          An "enable" prefixed option is an option that if enable contains a
          "enable" prefix else contains a "disable" prefix.

          Ex: for         : --enable-external-links
                            (and --disable-external-links)
              option_dict : {"enable_external_links": False}
              result      : ["--disable-external-links"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                if value:
                    option_list.append(keyNameToOption(option_name))
                else:
                    # [7:] drops the leading "enable_"
                    option_list.append(
                        keyNameToOption(option_name[7:], prefix="disable-"))
        return option_list

    def makeIncludeInPrefixedOptionList(self, allowed_option_list,
                                        option_dict):
        """
          An "include-in" prefixed option is an option that if enable
          contains a "include-in" prefix else contains a "exclude-from"
          prefix.

          Ex: for         : --include-in-outline
                            (and --exclude-from-outline)
              option_dict : {"include_in_outline": False}
              result      : ["--exclude-from-outline"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                if value:
                    option_list.append(keyNameToOption(option_name))
                else:
                    # [11:] drops the leading "include_in_"
                    option_list.append(
                        keyNameToOption(option_name[11:],
                                        prefix="exclude-from-"))
        return option_list

    def makeOneStringArgumentOptionList(self, allowed_option_list,
                                        option_dict):
        """
          A one-string-argument option is a option that require an argument
          which is a string.

          Ex: for         : --title <text>
              option_dict : {"title": "Hello World!"}
              result      : ["--title", "Hello World!"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                option_list += [keyNameToOption(option_name), str(value)]
        return option_list

    def makeRepeatableOneStringArgumentOptionList(self, allowed_option_list,
                                                  option_dict):
        """
          A repeatable one-string-argument option is a option that require
          one string argument, this option can be set several times.

          Ex: for         : --allow <path>
              option_dict : {"allow_list": ["a", "b"]}
              result      : ["--allow", "a", "--allow", "b"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value_list = option_dict.get(option_name)
            if value_list:
                for value in value_list:
                    # [:-5] drops the trailing "_list"
                    option_list += [
                        keyNameToOption(option_name[:-5]),
                        str(value),
                    ]
        return option_list

    def makeRepeatableTwoStringArgumentOptionList(self, allowed_option_list,
                                                  option_dict):
        """
          A repeatable two-string-argument option is a option that require
          two string arguments, this option can be set several times.

          Ex: for         : --cookie <name> <value>
              option_dict : {"cookie_list": [("a", "b"), ("c", "d")]}
              result      : ["--cookie", "a", "b", "--cookie", "c", "d"]
        """
        option_list = []
        for option_name in allowed_option_list:
            tuple_list = option_dict.get(option_name)
            if tuple_list:
                for name, value in tuple_list:
                    # [:-5] drops the trailing "_list"
                    option_list += [
                        keyNameToOption(option_name[:-5]),
                        str(name),
                        str(value),
                    ]
        return option_list

    def makeDataUrlArgumentOptionList(self, allowed_option_list, option_dict,
                                      url_type="url", destination_format=None,
                                      use_switch=True):
        """
          A data-file-argument option is a option that require an url
          argument. Here, we don't want option value to be an url but data,
          so that we can put the data to a temp file an use it's url as
          option value.

          The data must be base64 encoded. With use_switch=False only the
          url/path/file name is emitted, without the option switch.

          Ex: for         : --user-style-sheet <url> (and url_type="url")
              option_dict : {"user_style_sheet_data":
                             b64encode("body { background-color: black; }")}
              result      : ["--user-style-sheet", "file:///tmp/tmp.XYZ.css"]

          Ex: for         : --checkbox-svg <path> (and url_type="path")
              option_dict : {"checkbox_svg_data":
                             b64encode("<svg>....</svg>")}
              result      : ["--checkbox-svg", "/tmp/tmp.XYZ.svg"]

          Ex: for         : --xsl-style-sheet <file> (and url_type="file")
              option_dict : {"xsl_style_sheet_data":
                             b64encode("table { border: none; }")}
              result      : ["--xsl-style-sheet", "tmp.XYZ.css"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                # creates a tmp file in the directory which will be trashed
                path = self.makeTempFile(destination_format=destination_format)
                # Close the file explicitly so the data is on disk before
                # wkhtmltopdf reads it.
                with open(path, "wb") as data_file:
                    data_file.write(b64decode(value))
                if url_type == "url":
                    path = self.convertPathToUrl(path)
                elif url_type == "file":
                    path = basename(path)
                if use_switch:
                    # [:-5] drops the trailing "_data"
                    option_list += [keyNameToOption(option_name[:-5]), path]
                else:
                    option_list.append(path)
        return option_list

    def makeDataPathArgumentOptionList(self, *args, **kw):
        """Same as makeDataUrlArgumentOptionList with url_type="path"."""
        return self.makeDataUrlArgumentOptionList(*args, url_type="path", **kw)

    def makeDataFileArgumentOptionList(self, *args, **kw):
        """Same as makeDataUrlArgumentOptionList with url_type="file"."""
        return self.makeDataUrlArgumentOptionList(*args, url_type="file", **kw)

    def makeRepeatableDataUrlArgumentOptionList(self, allowed_option_list,
                                                option_dict, **kw):
        """
          A repeatable data-url-argument option is a "<name>_data_list"
          option whose value is a list of base64 encoded data; each item is
          handled by makeDataUrlArgumentOptionList as a "<name>_data"
          option. Extra keyword arguments are forwarded to it.
        """
        option_list = []
        for option_name in allowed_option_list:
            data_list = option_dict.get(option_name)
            if not data_list:
                continue
            # Drop the trailing "_list" once per option, not once per item,
            # otherwise the name keeps shrinking with every data item.
            data_option_name = option_name[:-5]
            for data in data_list:
                option_list += self.makeDataUrlArgumentOptionList(
                    [data_option_name], {data_option_name: data}, **kw)
        return option_list

    def makeWkhtmltopdfCommandList(self, *args, **kw):
        """Return the wkhtmltopdf command line as a list of strings.

        Positional arguments are the input url(s) followed by the output
        path (last one). The ``conversion_kw`` keyword argument is the
        dictionary of conversion options; only the option names listed
        below are taken into account.
        """
        # http://wkhtmltopdf.org/usage/wkhtmltopdf.txt
        conversion_kw = kw.get("conversion_kw", {})
        command = ["wkhtmltopdf"]

        # Global Options
        command += self.makeNoPrefixedOptionList(["collate"], conversion_kw)
        command += self.makeSwitchOptionList(
            [
                #"extended-help",
                "grayscale",
                #"help",
                #"htmldoc",
                #"licence",
                "lowquality",
                #"manpage",
                "no_pdf_compression",
                #"quiet",  # we decide
                #"read_args_from_stdin",  # only for several command line at a time
                #"readme",
                #"version",
            ], conversion_kw)
        command += self.makeOneStringArgumentOptionList(
            [
                #"cookie_jar",  # no cookie jar
                "copies",
                "dpi",
                "image_dpi",
                "image_quality",
                "margin_bottom",
                "margin_left",
                "margin_right",
                "margin_top",
                "orientation",
                "page_height",
                "page_size",
                "page_width",
                "title",
            ], conversion_kw)

        # Outline Options
        command += self.makeNoPrefixedOptionList(["outline"], conversion_kw)
        #"dump_default_toc_xsl",
        command += self.makeOneStringArgumentOptionList(
            [
                #"dump_outline",
                "outline_depth",
            ], conversion_kw)

        # Page Options
        command += self.makeNoPrefixedOptionList(
            [
                "background",
                "custom_header_propagation",
                "images",
                "print_media_type",
                #"debug_javascript",  # we decide
                #"stop_slow_scripts",  # we decide
            ], conversion_kw)
        command += self.makeEnablePrefixedOptionList(
            [
                "enable_external_links",
                "enable_forms",
                "enable_internal_links",
                "enable_javascript",
                #"enable_local_file_access",  # we decide
                #"enable_plugins",
                "enable_smart_shrinking",
                "enable_toc_back_links",
            ], conversion_kw)
        command += ["--disable-local-file-access"]
        command += self.makeIncludeInPrefixedOptionList([
            "include_in_outline",
        ], conversion_kw)
        command += self.makeSwitchOptionList(["default_header"], conversion_kw)
        # put cache in the temp dir - to disable cache
        command += ["--cache-dir", self.makeTempDir(prefix="cache")]
        command += self.makeOneStringArgumentOptionList(
            [
                #"cache_dir",  # we decide
                "encoding",
                "javascript_delay",
                "load_error_handling",
                "load_media_error_handling",
                "minimum_font_size",
                "page_offset",
                #"password",  # too dangerous
                #"proxy",  # we decide
                #"username",  # too dangerous
                "viewport_size",
                "window_status",
                "zoom",
            ], conversion_kw)
        #"allow",  # we decide
        command += self.makeDataPathArgumentOptionList(
            [
                # <option_name>_data
                "checkbox_checked_svg_data",
                "checkbox_svg_data",
                "radiobutton_checked_svg_data",
                "radiobutton_svg_data",
            ], conversion_kw, destination_format="svg")
        command += self.makeDataUrlArgumentOptionList([
            "user_style_sheet_data",
        ], conversion_kw, destination_format="css")
        #"run_script_list",  # too dangerous, fills --run-script
        command += self.makeRepeatableTwoStringArgumentOptionList(
            [
                # <option_name>_list
                "cookie_list",
                "custom_header_list",
                #"post_list",
                #"post_file_list",
            ], conversion_kw)

        # Headers and Footer Options
        command += self.makeNoPrefixedOptionList([
            "footer_line",
            "header_line",
        ], conversion_kw)
        command += self.makeOneStringArgumentOptionList(
            [
                "footer_center",
                "footer_font_name",
                "footer_font_size",
                "footer_left",
                "footer_right",
                "footer_spacing",
                "header_center",
                "header_font_name",
                "header_font_size",
                "header_left",
                "header_right",  # there's a --top option (not documented)
                # may be we can do header_right_top option
                "header_spacing",
            ], conversion_kw)
        command += self.makeDataUrlArgumentOptionList(
            [
                # <option_name>_data
                "footer_html_data",
                "header_html_data",
            ], conversion_kw, destination_format="html")
        # NOTE(review): "replace" has no "_list" suffix, so the [:-5] slice
        # done by the helper turns it into "re" - confirm the intended key.
        command += self.makeRepeatableTwoStringArgumentOptionList([
            "replace",
        ], conversion_kw)

        # Custom Options
        command += self.makeRepeatableDataUrlArgumentOptionList(
            [
                "before_toc_data_list",
            ], conversion_kw, destination_format="html", use_switch=False)

        # TOC Options
        value = conversion_kw.get("toc")
        if value:
            command += ["toc"]
            command += self.makeEnablePrefixedOptionList([
                "enable_dotted_lines",
                "enable_toc_links",
            ], conversion_kw)
            command += self.makeOneStringArgumentOptionList([
                "toc_header_text",
                "toc_level_indentation",
                "toc_text_size_shrink",
            ], conversion_kw)
            command += self.makeDataFileArgumentOptionList(
                [
                    "xsl_style_sheet_data",
                ], conversion_kw, destination_format="xsl")

        # Custom Options
        command += self.makeRepeatableDataUrlArgumentOptionList(
            [
                "after_toc_data_list",
                "before_body_data_list",
            ], conversion_kw, destination_format="html", use_switch=False)
        command += args[:-1]  # input_url
        command += self.makeRepeatableDataUrlArgumentOptionList(
            [
                "after_body_data_list",
            ], conversion_kw, destination_format="html", use_switch=False)
        command += args[-1:]  # output_path

        return command
class Handler(object):
    """PDF handler: text extraction and metadata access for PDF documents.

    Text is extracted with ``pdftotext``, metadata is read with ``pdfinfo``
    and written with pyPdf.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Wrap the document data in a ``File``.

        Keyword arguments:
        base_folder_url -- folder handed to ``File``
        data -- content handed to ``File``
        source_format -- format of ``data``
        env -- optional environment dict for the subprocesses (default {})
        """
        self.base_folder_url = base_folder_url
        self.document = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Run ``pdftotext`` on the document and return the output content.

        The working document is trashed afterwards, even on failure.
        """
        # TODO: use pyPdf
        logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                              destination_format))
        # mktemp only reserves a name: ``pdftotext`` creates the file itself.
        output_url = mktemp(suffix=".%s" % destination_format,
                            dir=self.document.directory_name)
        command = ["pdftotext", self.document.getUrl(), output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()

    def getMetadata(self, base_document=False):
        """Return a dictionary built from the ``pdfinfo`` output.

        Each non-empty line is split at its first colon; the lower-cased
        left part is the key and the stripped right part is the value. A
        line without a colon is stored whole as a key with an empty value.
        """
        # TODO: use pyPdf and not use lower()
        command = ["pdfinfo", self.document.getUrl()]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
        finally:
            self.document.trash()
        metadata = {}
        for info in stdout.split("\n"):
            if not info:
                continue
            info_name, _, info_value = info.partition(":")
            metadata[info_name.lower()] = info_value.strip()
        return metadata

    def setMetadata(self, metadata):
        """Return the content of a copy of the document with new metadata.

        Keyword arguments:
        metadata -- expected an dictionary with metadata. A
                    "ModificationDate" entry is stored as "ModDate" and a
                    list of "Keywords" is joined with spaces. The given
                    dictionary is left untouched.
        """
        # TODO: date as "D:20090401124817-04'00'" ASN.1 for ModDate and
        # CreationDate
        metadata = dict(metadata)  # work on a copy, not on the caller's dict
        modification_date = metadata.pop("ModificationDate", None)
        if modification_date:
            metadata['ModDate'] = modification_date
        keywords = metadata.get('Keywords')
        if isinstance(keywords, list):
            metadata['Keywords'] = ' '.join(keywords)
        info = {}
        for key, value in metadata.items():
            # NOTE(review): capitalize() lower-cases everything after the
            # first character, so "ModDate" is written as "/Moddate".
            info[NameObject('/' + key.capitalize())] = createStringObject(value)
        output_pdf = PdfFileWriter()
        output_pdf._info.getObject().update(info)
        output_stream = io.BytesIO()
        # Keep the input open until the output has been written.
        with open(self.document.getUrl(), "rb") as input_file:
            input_pdf = PdfFileReader(input_file)
            for page_num in range(input_pdf.getNumPages()):
                output_pdf.addPage(input_pdf.getPage(page_num))
            output_pdf.write(output_stream)
        return output_stream.getvalue()

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('text/plain', 'Plain Text'),
         ...
        ]
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        if source_mimetype in ("application/pdf", "pdf"):
            return [("text/plain", "Plain Text")]
        return []
class Handler(object): """ImageMagic Handler is used to handler images.""" implements(IHandler) def __init__(self, base_folder_url, data, source_format, **kw): """ Load pdf document """ self.base_folder_url = base_folder_url self.file = File(base_folder_url, data, source_format) self.environment = kw.get("env", {}) def makeTempFile(self, destination_format=None): path = mktemp( suffix='.%s' % destination_format, dir=self.file.directory_name, ) return path def convertPathToUrl(self, path): if path.startswith("/"): return "file://" + path raise ValueError("path %r is not absolute" % path) def convert(self, destination_format=None, **kw): """Convert a image""" logger.debug("wkhtmltopdf convert: %s > %s" % (self.file.source_format, destination_format)) output_path = self.makeTempFile(destination_format) command = self.makeWkhtmltopdfCommandList( self.convertPathToUrl(self.file.getUrl()), output_path, conversion_kw=kw, ) stdout, stderr = Popen( command, stdout=PIPE, stderr=PIPE, close_fds=True, env=self.environment, cwd=self.file.directory_name, ).communicate() self.file.reload(output_path) try: return self.file.getContent() finally: self.file.trash() def getMetadata(self, base_document=False): """Returns a dictionary with all metadata of document. along with the metadata. """ return NotImplementedError def setMetadata(self, metadata={}): """Returns image with new metadata. Keyword arguments: metadata -- expected an dictionary with metadata. """ raise NotImplementedError @staticmethod def getAllowedConversionFormatList(source_mimetype): """Returns a list content_type and their titles which are supported by enabled handlers. [('application/pdf', 'PDF - Portable Document Format'), ... 
] """ source_mimetype = parseContentType(source_mimetype).gettype() if source_mimetype in ("text/html", "htm", "html"): return [("application/pdf", "PDF - Portable Document Format")] return [] def makeSwitchOptionList(self, allowed_option_list, option_dict): """ A switch option is enable if it exists. Ex: for : --grayscale option_dict : {"grayscale": True} result : ["--grayscale"] """ option_list = [] for option_name in allowed_option_list: value = option_dict.get(option_name) if value: option_list.append(keyNameToOption(option_name)) return option_list def makeNoPrefixedOptionList(self, allowed_option_list, option_dict): """ A "no" prefixed option is an option that if disable contains a "no" prefix. Ex: for : --images (and --no-images) option_dict : {"images": False} result : ["--no-images"] """ option_list = [] for option_name in allowed_option_list: value = option_dict.get(option_name) if value is not None: option_list.append(keyNameToOption(option_name, prefix="" if value else "no-")) return option_list def makeEnablePrefixedOptionList(self, allowed_option_list, option_dict): """ An "enable" prefixed option is an option that if enable contains a "enable" prefix else contains a "disable" prefix. Ex: for : --enable-external-links (and --disable-external-links) option_dict : {"enable_external_links": False} result : ["--disable-external-links"] """ option_list = [] for option_name in allowed_option_list: value = option_dict.get(option_name) if value is not None: if value: option_list.append(keyNameToOption(option_name)) else: option_list.append(keyNameToOption(option_name[7:], prefix="disable-")) return option_list def makeIncludeInPrefixedOptionList(self, allowed_option_list, option_dict): """ An "include-in" prefixed option is an option that if enable contains a "include-in" prefix else contains a "exclude-from" prefix. 
Ex: for : --include-in-outline (and --exclude-from-outline) option_dict : {"include_in_outline": False} result : ["--exclude-from-outline"] """ option_list = [] for option_name in allowed_option_list: value = option_dict.get(option_name) if value is not None: if value: option_list.append(keyNameToOption(option_name)) else: option_list.append(keyNameToOption(option_name[11:], prefix="exclude-from-")) return option_list def makeOneStringArgumentOptionList(self, allowed_option_list, option_dict): """ A one-string-argument option is a option that require an argument which is a string. Ex: for : --title <text> option_dict : {"title": "Hello World!"} result : ["--title", "Hello World!"] """ option_list = [] for option_name in allowed_option_list: value = option_dict.get(option_name) if value is not None: option_list += [keyNameToOption(option_name), str(value)] return option_list def makeRepeatableOneStringArgumentOptionList(self, allowed_option_list, option_dict): """ A repeatable one-string-argument option is a option that require one string argument, this option can be set several times. Ex: for : --allow <path> option_dict : {"allow_list": ["a", "b"]} result : ["--allow", "a", "--allow", "b"] """ option_list = [] for option_name in allowed_option_list: value_list = option_dict.get(option_name) if value_list: for value in value_list: option_list += [keyNameToOption(option_name[:-5]), str(value)] return option_list def makeRepeatableTwoStringArgumentOptionList(self, allowed_option_list, option_dict): """ A repeatable two-string-argument option is a option that require two string arguments, this option can be set several times. 
Ex: for : --cookie <name> <value> option_dict : {"cookie_list": [("a", "b"), ("c", "d")]} result : ["--cookie", "a", "b", "--cookie", "c", "d"] """ option_list = [] for option_name in allowed_option_list: tuple_list = option_dict.get(option_name) if tuple_list: for name, value in tuple_list: option_list += [keyNameToOption(option_name[:-5]), str(name), str(value)] return option_list def makeDataUrlArgumentOptionList(self, allowed_option_list, option_dict, url_type="url", destination_format=None, use_switch=True): """ A data-file-argument option is a option that require an url argument. Here, we don't want option value to be an url but data, so that we can put the data to a temp file an use it's url as option value. Ex: for : --user-style-sheet <url> (and url_type="url") option_dict : {"user_style_sheet_data": b64encode("body { background-color: black; }")} result : ["--user-style-sheet", "file:///tmp/tmp.XYZ.css"] Ex: for : --checkbox-svg <path> (and url_type="path") option_dict : {"checkbox_svg_data": b64encode("<svg>....</svg>")} result : ["--checkbox-svg", "/tmp/tmp.XYZ.svg"] Ex: for : --xsl-style-sheet <file> (and url_type="file") option_dict : {"xsl_style_sheet_data": b64encode("table { border: none; }")} result : ["--xsl-style-sheet", "tmp.XYZ.css"] """ option_list = [] for option_name in allowed_option_list: value = option_dict.get(option_name) if value is not None: # creates a tmp file in the directory which will be trashed path = self.makeTempFile(destination_format=destination_format) open(path, "wb").write(b64decode(value)) if url_type == "url": path = self.convertPathToUrl(path) elif url_type == "file": path = basename(path) if use_switch: option_list += [keyNameToOption(option_name[:-5]), path] else: option_list.append(path) return option_list def makeDataPathArgumentOptionList(self, *args, **kw): return self.makeDataUrlArgumentOptionList(*args, url_type="path", **kw) def makeDataFileArgumentOptionList(self, *args, **kw): return 
self.makeDataUrlArgumentOptionList(*args, url_type="file", **kw) def makeRepeatableDataUrlArgumentOptionList(self, allowed_option_list, option_dict, **kw): option_list = [] for option_name in allowed_option_list: data_list = option_dict.get(option_name) if data_list: for data in data_list: option_name = option_name[:-5] option_list += self.makeDataUrlArgumentOptionList([ option_name, ], {option_name: data}, **kw) return option_list def makeWkhtmltopdfCommandList(self, *args, **kw): # http://wkhtmltopdf.org/usage/wkhtmltopdf.txt conversion_kw = kw.get("conversion_kw", {}) command = ["wkhtmltopdf"] # Global Options command += self.makeNoPrefixedOptionList(["collate"], conversion_kw) command += self.makeSwitchOptionList([ #"extended-help", "grayscale", #"help", #"htmldoc", #"licence", "lowquality", #"manpage", "no_pdf_compression", #"quiet", # we decide #"read_args_from_stdin", # only for several command line at a time #"readme", #"version", ], conversion_kw) command += self.makeOneStringArgumentOptionList([ #"cookie_jar", # no cookie jar "copies", "dpi", "image_dpi", "image_quality", "margin_bottom", "margin_left", "margin_right", "margin_top", "orientation", "page_height", "page_size", "page_width", "title", ], conversion_kw) # Outline Options command += self.makeNoPrefixedOptionList(["outline"], conversion_kw) #"dump_default_toc_xsl", command += self.makeOneStringArgumentOptionList([ #"dump_outline", "outline_depth", ], conversion_kw) # Page Options command += self.makeNoPrefixedOptionList([ "background", "custom_header_propagation", "images", "print_media_type", #"debug_javascript", # we decide #"stop_slow_scripts", # we decide ], conversion_kw) command += self.makeEnablePrefixedOptionList([ "enable_external_links", "enable_forms", "enable_internal_links", "enable_javascript", #"enable_local_file_access", # we decide #"enable_plugins", "enable_smart_shrinking", "enable_toc_back_links", ], conversion_kw) command += ["--disable-local-file-access"] command += 
self.makeIncludeInPrefixedOptionList([ "include_in_outline", ], conversion_kw) command += self.makeSwitchOptionList(["default_header"], conversion_kw) command += self.makeOneStringArgumentOptionList([ #"cache_dir", # we decide "encoding", "javascript_delay", "load_error_handling", "load_media_error_handling", "minimum_font_size", "page_offset", #"password", # too dangerous #"proxy", # we decide #"username", # too dangerous "viewport_size", "window_status", "zoom", ], conversion_kw) #"allow", # we decide command += self.makeDataPathArgumentOptionList([ # <option_name>_data "checkbox_checked_svg_data", "checkbox_svg_data", "radiobutton_checked_svg_data", "radiobutton_svg_data", ], conversion_kw, destination_format="svg") command += self.makeDataUrlArgumentOptionList([ "user_style_sheet_data", ], conversion_kw, destination_format="css") #"run_script_list", # too dangerous, fills --run-script command += self.makeRepeatableTwoStringArgumentOptionList([ # <option_name>_list "cookie_list", "custom_header_list", #"post_list", #"post_file_list", ], conversion_kw) # Headers and Footer Options command += self.makeNoPrefixedOptionList([ "footer_line", "header_line", ], conversion_kw) command += self.makeOneStringArgumentOptionList([ "footer_center", "footer_font_name", "footer_font_size", "footer_left", "footer_right", "footer_spacing", "header_center", "header_font_name", "header_font_size", "header_left", "header_right", # there's a --top option (not documented) # may be we can do header_right_top option "header_spacing", ], conversion_kw) command += self.makeDataUrlArgumentOptionList([ # <option_name>_data "footer_html_data", "header_html_data", ], conversion_kw, destination_format="html") command += self.makeRepeatableTwoStringArgumentOptionList([ "replace", ], conversion_kw) # Custom Options command += self.makeRepeatableDataUrlArgumentOptionList([ "before_toc_data_list", ], conversion_kw, destination_format="html", use_switch=False) # TOC Options value = 
conversion_kw.get("toc") if value: command += ["toc"] command += self.makeEnablePrefixedOptionList([ "enable_dotted_lines", "enable_toc_links", ], conversion_kw) command += self.makeOneStringArgumentOptionList([ "toc_header_text", "toc_level_indentation", "toc_text_size_shrink", ], conversion_kw) command += self.makeDataFileArgumentOptionList([ "xsl_style_sheet_data", ], conversion_kw, destination_format="xsl") # Custom Options command += self.makeRepeatableDataUrlArgumentOptionList([ "after_toc_data_list", "before_body_data_list", ], conversion_kw, destination_format="html", use_switch=False) command += args[:-1] # input_url command += self.makeRepeatableDataUrlArgumentOptionList([ "after_body_data_list", ], conversion_kw, destination_format="html", use_switch=False) command += args[-1:] # output_path return command
class Handler(object):
    """
      X2T Handler is used to convert Microsoft Office 2007 documents to
      OnlyOffice documents.
    """

    implements(IHandler)

    # Source formats whose metadata operations are forwarded to the ooo
    # handler (see the XXX notes in getMetadata / setMetadata).
    _ooo_metadata_format_tuple = (
        "docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def __init__(self, base_folder_url, data, source_format, **kw):
        """
          base_folder_url(string)
            The requested url for data base folder
          data(string)
            The opened and readed file into a string
          source_format(string)
            The source format of the inputed file

          Extra keyword arguments are kept to build an ooo handler later;
          the "env" one is used as environment of the x2t process.
        """
        self.base_folder_url = base_folder_url
        self._data = data
        self._source_format = source_format
        self._init_kw = kw
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """
          Convert the inputed file to output as format that were informed

          Returns the content of the converted document. Raises RuntimeError
          when the x2t process exits with a non zero code.
        """
        source_format = self.file.source_format
        logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

        # init vars and xml configuration file
        in_format = format_code_map[source_format]
        out_format = format_code_map[destination_format]
        root_dir = self.file.directory_name
        input_dir = os.path.join(root_dir, "input")
        output_dir = os.path.join(root_dir, "output")
        final_file_name = os.path.join(root_dir, "document.%s" % destination_format)
        input_file_name = self.file.getUrl()
        output_file_name = final_file_name
        config_file_name = os.path.join(root_dir, "config.xml")

        if source_format in yformat_tuple:
            # "PK\x03\x04" is the zip local file header signature
            if self._data.startswith("PK\x03\x04"):
                os.mkdir(input_dir)
                unzip(self.file.getUrl(), input_dir)
                for _, _, files in os.walk(input_dir):
                    # exactly one file is expected at the top of the archive
                    input_file_name, = files
                    break
                input_file_name = os.path.join(input_dir, input_file_name)
        if destination_format in yformat_tuple:
            os.mkdir(output_dir)
            output_file_name = os.path.join(output_dir, "body.txt")

        config = {
            # 'm_sKey': 'from',
            'm_sFileFrom': input_file_name,
            'm_nFormatFrom': in_format,
            'm_sFileTo': output_file_name,
            'm_nFormatTo': out_format,
            # 'm_bPaid': 'true',
            # 'm_bEmbeddedFonts': 'false',
            # 'm_bFromChanges': 'false',
            # 'm_sFontDir': '/usr/share/fonts',
            # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
        }
        root = ElementTree.Element('root')
        for key, value in config.items():
            ElementTree.SubElement(root, key).text = value
        # The context manager closes (and flushes) the configuration file
        # even if the serialisation fails.
        with open(config_file_name, "w") as config_file:
            ElementTree.ElementTree(root).write(config_file, encoding='utf-8',
                                                xml_declaration=True,
                                                default_namespace=None,
                                                method="xml")

        # run convertion binary
        p = Popen(
            ["x2t", config_file_name],
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment,
        )
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            with open(config_file_name) as config_file:
                config_xml = config_file.read()
            raise RuntimeError(
                "x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s"
                % (p.returncode, " ".join(["x2t", config_file_name]),
                   stdout, stderr, " " + config_xml.replace("\n", "\n ")))

        if destination_format in yformat_tuple:
            zipTree(
                final_file_name,
                (output_file_name, ""),
                (os.path.join(os.path.dirname(output_file_name), "media"), ""),
            )

        self.file.reload(final_file_name)
        try:
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        r"""Returns a dictionary with all metadata of document.
        /!\ Not Implemented: no format are handled correctly.
        """
        # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
        #     However, docx documents metadata can only be "handled" by the ooo handler.
        #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
        #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
        #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
        if self._source_format in self._ooo_metadata_format_tuple:
            return OOoHandler(self.base_folder_url, self._data,
                              self._source_format,
                              **self._init_kw).getMetadata(base_document)
        return {}

    def setMetadata(self, metadata=None):
        r"""Returns document with new metadata.
        /!\ Not Implemented: no format are handled correctly.
        Keyword arguments:
        metadata -- expected an dictionary with metadata.
        """
        # A fresh dictionary per call instead of a shared mutable default.
        if metadata is None:
            metadata = {}
        # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
        #     However, docx documents metadata can only be "handled" by the ooo handler.
        #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
        #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
        #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
        if self._source_format in self._ooo_metadata_format_tuple:
            return OOoHandler(self.base_folder_url, self._data,
                              self._source_format,
                              **self._init_kw).setMetadata(metadata)
        return self.file.getContent()

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('application/x-asc-text', 'OnlyOffice Text Document'),
         ...
        ]
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        # (accepted source names, single allowed destination)
        conversion_table = (
            (("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
             ("application/x-asc-text", "OnlyOffice Text Document")),
            (("docy", "application/x-asc-text"),
             ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document")),
            (("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
             ("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet")),
            (("xlsy", "application/x-asc-spreadsheet"),
             ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet")),
            (("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
             ("application/x-asc-presentation", "OnlyOffice Presentation")),
            (("ppty", "application/x-asc-presentation"),
             ("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation")),
        )
        for source_tuple, destination in conversion_table:
            if source_mimetype in source_tuple:
                return [destination]
        return []
class Handler(object):
    """
      X2T Handler is used to convert Microsoft Office 2007 documents to
      OnlyOffice documents.
    """

    implements(IHandler)

    # Source formats whose metadata operations are forwarded to the ooo
    # handler (see the XXX notes in getMetadata / setMetadata).
    _ooo_metadata_format_tuple = (
        "docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def __init__(self, base_folder_url, data, source_format, **kw):
        """
          base_folder_url(string)
            The requested url for data base folder
          data(string)
            The opened and readed file into a string
          source_format(string)
            The source format of the inputed file

          Extra keyword arguments are kept to build an ooo handler later;
          the "env" one is used as environment of the x2t process.
        """
        self.base_folder_url = base_folder_url
        self._data = data
        self._source_format = source_format
        self._init_kw = kw
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """
          Convert the inputed file to output as format that were informed

          Returns the content of the converted document. Raises RuntimeError
          when the x2t process exits with a non zero code.
        """
        source_format = self.file.source_format
        logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

        # init vars and xml configuration file
        in_format = format_code_map[source_format]
        out_format = format_code_map[destination_format]
        root_dir = self.file.directory_name
        input_dir = os.path.join(root_dir, "input")
        output_dir = os.path.join(root_dir, "output")
        final_file_name = os.path.join(root_dir, "document.%s" % destination_format)
        input_file_name = self.file.getUrl()
        output_file_name = final_file_name
        config_file_name = os.path.join(root_dir, "config.xml")

        if source_format in yformat_tuple:
            os.mkdir(input_dir)
            unzip(self.file.getUrl(), input_dir)
            for _, _, files in os.walk(input_dir):
                # exactly one file is expected at the top of the archive
                input_file_name, = files
                break
            input_file_name = os.path.join(input_dir, input_file_name)
        if destination_format in yformat_tuple:
            os.mkdir(output_dir)
            output_file_name = os.path.join(output_dir, "body.txt")

        config = {
            # 'm_sKey': 'from',
            'm_sFileFrom': input_file_name,
            'm_nFormatFrom': in_format,
            'm_sFileTo': output_file_name,
            'm_nFormatTo': out_format,
            # 'm_bPaid': 'true',
            # 'm_bEmbeddedFonts': 'false',
            # 'm_bFromChanges': 'false',
            # 'm_sFontDir': '/usr/share/fonts',
            # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
        }
        root = ElementTree.Element('root')
        for key, value in config.items():
            ElementTree.SubElement(root, key).text = value
        # The context manager closes (and flushes) the configuration file
        # even if the serialisation fails.
        with open(config_file_name, "w") as config_file:
            ElementTree.ElementTree(root).write(config_file, encoding='utf-8',
                                                xml_declaration=True,
                                                default_namespace=None,
                                                method="xml")

        # run convertion binary
        p = Popen(
            ["x2t", config_file_name],
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment,
        )
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            with open(config_file_name) as config_file:
                config_xml = config_file.read()
            raise RuntimeError(
                "x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s"
                % (p.returncode, " ".join(["x2t", config_file_name]),
                   stdout, stderr, " " + config_xml.replace("\n", "\n ")))

        if destination_format in yformat_tuple:
            zipTree(
                final_file_name,
                (output_file_name, ""),
                (os.path.join(os.path.dirname(output_file_name), "media"), ""),
            )

        self.file.reload(final_file_name)
        try:
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        r"""Returns a dictionary with all metadata of document.
        /!\ Not Implemented: no format are handled correctly.
        """
        # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
        #     However, docx documents metadata can only be "handled" by the ooo handler.
        #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
        #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
        #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
        if self._source_format in self._ooo_metadata_format_tuple:
            return OOoHandler(self.base_folder_url, self._data,
                              self._source_format,
                              **self._init_kw).getMetadata(base_document)
        return {}

    def setMetadata(self, metadata=None):
        r"""Returns document with new metadata.
        /!\ Not Implemented: no format are handled correctly.
        Keyword arguments:
        metadata -- expected an dictionary with metadata.
        """
        # A fresh dictionary per call instead of a shared mutable default.
        if metadata is None:
            metadata = {}
        # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
        #     However, docx documents metadata can only be "handled" by the ooo handler.
        #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
        #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
        #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
        if self._source_format in self._ooo_metadata_format_tuple:
            return OOoHandler(self.base_folder_url, self._data,
                              self._source_format,
                              **self._init_kw).setMetadata(metadata)
        return self.file.getContent()

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('application/x-asc-text', 'OnlyOffice Text Document'),
         ...
        ]
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        # (accepted source names, single allowed destination)
        conversion_table = (
            (("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
             ("application/x-asc-text", "OnlyOffice Text Document")),
            (("docy", "application/x-asc-text"),
             ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document")),
            (("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
             ("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet")),
            (("xlsy", "application/x-asc-spreadsheet"),
             ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet")),
            (("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
             ("application/x-asc-presentation", "OnlyOffice Presentation")),
            (("ppty", "application/x-asc-presentation"),
             ("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation")),
        )
        for source_tuple, destination in conversion_table:
            if source_mimetype in source_tuple:
                return [destination]
        return []
class TestFile(unittest.TestCase):
    """Test to class File"""

    def setUp(self):
        """Create data to tests and instantiated a File"""
        self.tmp_url = '/tmp'
        self.data = decodestring("cloudooo Test")
        self.fsdocument = File(self.tmp_url, self.data, 'txt')

    def tearDown(self):
        """Remove the file in system"""
        if self.fsdocument.getUrl() is not None:
            self.fsdocument.trash()

    def _writeInDocumentDirectory(self, filename, data, mode='wb'):
        """Write data in a file next to the document and return its path.

        The file is closed before returning, so the content is on disk when
        the document is reloaded or zipped afterwards.
        """
        url = path.join(self.fsdocument.directory_name, filename)
        with open(url, mode) as new_file:
            new_file.write(data)
        return url

    def testRestoreOriginal(self):
        """Test if changing the document and call remake, the document back to
        original state"""
        old_document_url = self.fsdocument.getUrl()
        document_test_url = self._writeInDocumentDirectory(
            "document", decodestring("Test Document"))
        self.fsdocument.reload(document_test_url)
        self.assertFalse(path.exists(old_document_url))
        self.assertNotEqual(self.fsdocument.original_data,
                            self.fsdocument.getContent())
        old_document_url = self.fsdocument.getUrl()
        self.fsdocument.restoreOriginal()
        self.assertFalse(path.exists(old_document_url))
        self.assertNotEqual(old_document_url, self.fsdocument.getUrl())
        self.assertTrue(path.exists(self.fsdocument.getUrl()))
        self.assertEqual(self.fsdocument.getContent(), self.data)

    def testgetContent(self):
        """Test if returns the data correctly"""
        self.assertEqual(self.fsdocument.getContent(), self.data)

    def testgetUrl(self):
        """Check if the url is correct"""
        url = self.fsdocument.getUrl()
        self.assertTrue(path.exists(url))

    def testLoadDocumentFile(self):
        """Test if the document is created correctly"""
        url = self.fsdocument.getUrl()
        with open(url, 'r') as document_file:
            tmp_document = document_file.read()
        self.assertEqual(self.data, tmp_document)
        self.fsdocument.trash()
        self.assertFalse(path.exists(url))

    def testReload(self):
        """Change url and check if occurs correctly"""
        old_document_url = self.fsdocument.getUrl()
        document_test_url = self._writeInDocumentDirectory("document",
                                                           self.data)
        self.fsdocument.reload(document_test_url)
        url = self.fsdocument.getUrl()
        self.assertFalse(path.exists(old_document_url))
        self.assertEqual(self.fsdocument.getContent(), self.data)
        self.fsdocument.trash()
        self.assertFalse(path.exists(url))

    def testZipDocumentList(self):
        """Tests if the zip file is returned correctly"""
        self._writeInDocumentDirectory('document2', 'test', mode='w')
        zip_file = self.fsdocument.getContent(True)
        mime = magic.Magic(mime=True)
        mimetype = mime.from_buffer(zip_file)
        self.assertEqual(mimetype, 'application/zip')
        ziptest = ZipFile(StringIO(zip_file), 'r')
        self.assertEqual(len(ziptest.filelist), 2)
        for zip_info in ziptest.filelist:
            if zip_info.filename.endswith("document2"):
                # 'test' written above
                self.assertEqual(zip_info.file_size, 4)
            else:
                # the document created in setUp
                self.assertEqual(zip_info.file_size, 9)

    def testSendZipFile(self):
        """Tests if the htm is extrated from zipfile"""
        zip_input_url = 'data/test.zip'
        # a zip archive is binary data
        with open(zip_input_url, 'rb') as zip_input:
            data = zip_input.read()
        zipdocument = File(self.tmp_url, data, 'zip')
        mime = magic.Magic(mime=True)
        mimetype = mime.from_buffer(zipdocument.getContent(True))
        self.assertEqual(mimetype, "application/zip")
        mimetype = mime.from_buffer(zipdocument.getContent())
        self.assertEqual(mimetype, "text/html")
        zipfile = ZipFile(StringIO(zipdocument.getContent(True)))
        self.assertEqual(sorted(zipfile.namelist()),
                         sorted(['logo.gif', 'test.htm']))
class Handler(object):
    """PDF Handler is used to handler inputed pdf document."""

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """ Load pdf document

        The "env" keyword argument is used as environment of the external
        commands.
        """
        self.base_folder_url = base_folder_url
        self.document = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """ Convert a pdf document

        The conversion is always done by pdftotext; destination_format is
        only used as extension of the output file.
        """
        logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                              destination_format))
        output_url = mktemp(suffix=".%s" % destination_format,
                            dir=self.document.directory_name)
        command = ["pdftotext", self.document.getUrl(), output_url]
        stdout, stderr = Popen(command,
                               stdout=PIPE,
                               stderr=PIPE,
                               close_fds=True,
                               env=self.environment).communicate()
        self.document.reload(output_url)
        try:
            return self.document.getContent()
        finally:
            self.document.trash()

    def getMetadata(self, base_document=False):
        """Returns a dictionary with all metadata of document.
        along with the metadata.

        Keys are the lower-cased field names printed by pdfinfo, values are
        the stripped field values. The document is trashed afterwards.
        """
        command = ["pdfinfo", self.document.getUrl()]
        try:
            stdout, stderr = Popen(command,
                                   stdout=PIPE,
                                   stderr=PIPE,
                                   close_fds=True,
                                   env=self.environment).communicate()
            metadata = {}
            for info in stdout.split("\n"):
                # Cut on the first colon only: values such as dates contain
                # colons themselves and must stay in one piece.
                info_name, separator, info_value = info.partition(":")
                if not separator:
                    # empty line, or a line which is not "name: value"
                    continue
                metadata[info_name.lower()] = info_value.strip()
            return metadata
        finally:
            self.document.trash()

    def setMetadata(self, metadata):
        """Returns a document with new metadata.
        Keyword arguments:
        metadata -- expected an dictionary with metadata.
        """
        text_template = "InfoKey: %s\nInfoValue: %s\n"
        # items() instead of the python2-only iteritems()
        text_list = [text_template % (key.capitalize(), value)
                     for key, value in metadata.items()]
        metadata_file = File(self.document.directory_name,
                             "".join(text_list),
                             "txt")
        output_url = mktemp(suffix=".pdf",
                            dir=self.document.directory_name)
        command = ["pdftk",
                   self.document.getUrl(),
                   "update_info",
                   metadata_file.getUrl(),
                   "output",
                   output_url
                   ]
        stdout, stderr = Popen(command,
                               stdout=PIPE,
                               stderr=PIPE,
                               close_fds=True,
                               env=self.environment).communicate()
        self.document.reload(output_url)
        try:
            return self.document.getContent()
        finally:
            self.document.trash()

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('text/plain', 'Plain Text'),
         ...
        ]
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        if source_mimetype in ("application/pdf", "pdf"):
            return [("text/plain", "Plain Text")]
        return []
class Handler(object):
    """
      X2T Handler is used to convert Microsoft Office 2007 documents to
      OnlyOffice documents.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """
          base_folder_url(string)
            The requested url for data base folder
          data(string)
            The opened and readed file into a string
          source_format(string)
            The source format of the inputed file

          Extra keyword arguments are kept to build ooo handlers later; the
          "env" one is used as environment of the x2t process.
        """
        self.base_folder_url = base_folder_url
        self._data = data
        self._source_format = source_format
        self._init_kw = kw
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """
          Convert the inputed file to output as format that were informed

          Metadata travels with y-formats as a "metadata.json" archive
          member: it is applied on the result when converting from a
          y-format, and stored in the result when converting to one.
          Raises RuntimeError when x2t exits with a non zero code or when a
          zipped y-format has neither body.txt nor Editor.bin.
        """
        source_format = self.file.source_format
        logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

        # init vars and xml configuration file
        in_format = format_code_map[source_format]
        # Look in the generic map only when there is no output specific
        # code, so a format present only in format_code_map_output does not
        # raise KeyError.
        if destination_format in format_code_map_output:
            out_format = format_code_map_output[destination_format]
        else:
            out_format = format_code_map[destination_format]
        root_dir = self.file.directory_name
        input_dir = os.path.join(root_dir, "input")
        input_file_name = self.file.getUrl()
        output_file_name = os.path.join(root_dir, "document.%s" % destination_format)
        config_file_name = os.path.join(root_dir, "config.xml")
        metadata = None
        output_data = None

        if source_format in yformat_tuple:
            # "PK\x03\x04" is the zip local file header signature
            if self._data.startswith("PK\x03\x04"):
                os.mkdir(input_dir)
                unzip(self.file.getUrl(), input_dir)
                input_file_name = os.path.join(input_dir, "body.txt")
                if not os.path.isfile(input_file_name):
                    input_file_name = os.path.join(input_dir, "Editor.bin")
                    if not os.path.isfile(input_file_name):
                        raise RuntimeError("input format incorrect: Editor.bin absent in zip archive")
                metadata_file_name = os.path.join(input_dir, "metadata.json")
                if os.path.isfile(metadata_file_name):
                    with open(metadata_file_name) as metadata_file:
                        metadata = json.loads(metadata_file.read())

        config = {
            # 'm_sKey': 'from',
            'm_sFileFrom': input_file_name,
            'm_nFormatFrom': str(in_format),
            'm_sFileTo': output_file_name,
            'm_nFormatTo': str(out_format),
            # 'm_bPaid': 'true',
            # 'm_bEmbeddedFonts': 'false',
            # 'm_bFromChanges': 'false',
            # 'm_sFontDir': '/usr/share/fonts',
            # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
        }
        root = ElementTree.Element('root')
        for key, value in config.items():
            ElementTree.SubElement(root, key).text = value
        with open(config_file_name, "w") as config_file:
            ElementTree.ElementTree(root).write(config_file, encoding='utf-8',
                                                xml_declaration=True,
                                                default_namespace=None,
                                                method="xml")

        # run convertion binary
        p = Popen(
            ["x2t", config_file_name],
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment,
        )
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            with open(config_file_name) as config_file:
                config_xml = config_file.read()
            raise RuntimeError(
                "x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s"
                % (p.returncode, " ".join(["x2t", config_file_name]),
                   stdout, stderr, " " + config_xml.replace("\n", "\n ")))

        self.file.reload(output_file_name)
        try:
            if source_format in yformat_tuple:
                if metadata:
                    output_data = OOoHandler(self.base_folder_url,
                                             self.file.getContent(),
                                             source_format,
                                             **self._init_kw).setMetadata(metadata)
                else:
                    output_data = self.file.getContent()
            elif destination_format in yformat_tuple:
                # The source is not a y-format here, so no metadata.json was
                # read above: ask the ooo handler for the source metadata.
                metadata = OOoHandler(self.base_folder_url, self._data,
                                      source_format,
                                      **self._init_kw).getMetadata()
                if not metadata:
                    metadata = {}
                for key in ('MIMEType', 'Generator', 'AppVersion',
                            'ImplementationName'):
                    metadata.pop(key, None)
                with ZipFile(output_file_name, mode="a") as zipfile:
                    zipfile.writestr("metadata.json", json.dumps(metadata))
                output_data = self.file.getContent()
            else:
                # Neither side is a y-format: hand the converted document
                # back instead of None.
                output_data = self.file.getContent()
        finally:
            self.file.trash()
        return output_data

    def _getContentType(self):
        """Return the source format as a mimetype when it can be guessed.

        A source format without "/" is treated as a file extension and
        passed to guess_type; the source format itself is returned when
        nothing is guessed.
        """
        mimetype_type = None
        if "/" not in self._source_format:
            mimetype_type = guess_type('a.' + self._source_format)[0]
        if mimetype_type is None:
            mimetype_type = self._source_format
        return mimetype_type

    def getMetadata(self, base_document=False):
        r"""Returns a dictionary with all metadata of document.

        For a zipped y-format the metadata come from the "metadata.json"
        archive member (an empty dictionary when absent), completed with
        'MIMEType'; with base_document the document is first converted and
        the ooo handler is asked. Other documents go to the ooo handler.
        """
        if self._source_format in yformat_tuple and self._data.startswith("PK\x03\x04"):
            if base_document:
                openxml_format = yformat_map[self._source_format]
                data = self.convert(openxml_format)
                return OOoHandler(self.base_folder_url, data, openxml_format,
                                  **self._init_kw).getMetadata(base_document)
            with io.BytesIO(self._data) as memfile, ZipFile(memfile) as zipfile:
                try:
                    metadata = zipfile.read("metadata.json")
                except KeyError:
                    # no such member in the archive
                    metadata = '{}'
            metadata = json.loads(metadata)
            metadata['MIMEType'] = self._getContentType()
            return metadata
        return OOoHandler(self.base_folder_url, self._data,
                          self._source_format,
                          **self._init_kw).getMetadata(base_document)

    def setMetadata(self, metadata=None):
        r"""Returns document with new metadata.
        Keyword arguments:
        metadata -- expected an dictionary with metadata.

        For a zipped y-format the archive is rebuilt with a new
        "metadata.json" member; other documents go to the ooo handler.
        """
        if metadata is None:
            metadata = {}
        if self._source_format in yformat_tuple and self._data.startswith("PK\x03\x04"):
            root_dir = self.file.directory_name
            input_dir = os.path.join(root_dir, "input")
            output_file_name = os.path.join(root_dir, "tmp")
            try:
                os.mkdir(input_dir)
                unzip(self.file.getUrl(), input_dir)
                with open(os.path.join(input_dir, "metadata.json"), "w") as metadata_file:
                    metadata_file.write(json.dumps(metadata))
                with ZipFile(output_file_name, "w") as zipfile:
                    for root, _, files in os.walk(input_dir):
                        for file_name in files:
                            absolute_path = os.path.join(root, file_name)
                            # member name relative to the unzipped tree
                            zipfile.write(absolute_path,
                                          os.path.relpath(absolute_path, input_dir))
                # the archive is binary data
                with open(output_file_name, "rb") as output_file:
                    output_data = output_file.read()
            finally:
                # The archive does not exist when an earlier step failed;
                # unlinking blindly would hide the original error.
                if os.path.exists(output_file_name):
                    os.unlink(output_file_name)
            return output_data
        return OOoHandler(self.base_folder_url, self._data,
                          self._source_format,
                          **self._init_kw).setMetadata(metadata)

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('application/x-asc-text', 'OnlyOffice Text Document'),
         ...
        ]

        For sources which are not y-formats, the list given by the ooo
        handler is completed (in place) with the y-format matching the
        first OpenXML format found in it.
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        if source_mimetype in ("docy", "application/x-asc-text"):
            return [
                ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document"),
                ("application/vnd.oasis.opendocument.text", "ODF Text Document"),
            ]
        if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"):
            return [
                ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet"),
                ("application/vnd.oasis.opendocument.spreadsheet", "ODF Spreadsheet Document"),
            ]
        if source_mimetype in ("ppty", "application/x-asc-presentation"):
            return [
                ("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation"),
                ("application/vnd.oasis.opendocument.presentation", "ODF Presentation Document"),
            ]

        # OpenXML mimetype -> y-format entry to add
        yformat_by_openxml = {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                ("application/x-asc-text", "OnlyOffice Text Document"),
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
                ("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet"),
            "application/vnd.openxmlformats-officedocument.presentationml.presentation":
                ("application/x-asc-presentation", "OnlyOffice Presentation"),
        }
        format_list = OOoHandler.getAllowedConversionFormatList(source_mimetype)
        for f_type, _ in format_list:
            if f_type in yformat_by_openxml:
                # only one entry is added: stop at the first match
                format_list.append(yformat_by_openxml[f_type])
                break
        return format_list
class PDFGranulator(object):
    """Extracts images and tables from a PDF by running ``pdftohtml``."""

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Wrap the data in a File and create a directory for the grains.

        Keyword arguments:
        env -- environment mapping passed to the pdftohtml subprocess
               (defaults to an empty dict)
        """
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})
        # Extracted grains are written below the File's own directory.
        self.grain_directory = mkdtemp(dir=self.file.directory_name)

    # XXX - It should have another name for returning all images
    def getImageItemList(self):
        """Run pdftohtml into the grain directory and return its images.

        Returns False when pdftohtml reports an error on stderr, otherwise
        whatever getImages() builds from the files found in the grain
        directory.
        """
        logger.debug("PDFImageGrainExtract")
        command = ["pdftohtml", self.file.getUrl(), "%s/" % self.grain_directory]
        stdout, stderr = Popen(command,
                               stdout=PIPE,
                               stderr=PIPE,
                               close_fds=True,
                               env=self.environment).communicate()
        # XXX - PDF can be protect
        if "Erro" in stderr:
            return False
        removeEqualImages(self.grain_directory)
        images = glob("%s/*.*" % self.grain_directory)
        return getImages(images)

    def getTableItemList(self):
        """Returns the list of table titles, or an error message string."""
        tables = self.getTablesMatrix()
        if tables is False:
            return "PDF Protect or have no Table Item List"
        return list(tables.keys())

    def getTable(self, id, format="html"):
        """Returns the table named ``id`` rendered as an html document.

        Each matrix row becomes a <tr>; a cell that is itself a list is
        rendered as one <td> holding its elements separated by line breaks.
        Any failure (extraction failed, unknown id, ...) yields an error
        message string instead of raising. ``format`` is accepted but not
        used.
        """
        try:
            table_matrix = self.getTablesMatrix()[id]
            part_list = ["<html><body><h1> %s </h1><table>" % id]
            for line in table_matrix:
                part_list.append("<tr>")
                for column in line:
                    if isinstance(column, list):
                        part_list.append("<td>")
                        part_list.extend("%s </br>" % element
                                         for element in column)
                        part_list.append("</td>")
                    else:
                        part_list.append("<td> %s </td>" % column)
                part_list.append("</tr>")
            part_list.append("</table></body></html>")
            return "".join(part_list)
        except Exception:
            # Best effort is kept on purpose, but KeyboardInterrupt and
            # SystemExit are no longer swallowed as a bare except did.
            return "PDF Protect or have no table with this id"

    def getTablesMatrix(self):
        """Returns the tables as a dict mapping table name to a matrix.

        pdftohtml is run in -xml mode and the <text> nodes of its output are
        grouped into rows (same "top") and multi-line cells (same "left").
        Returns False when pdftohtml reports an error on stderr.
        """
        logger.debug("PDFTableGrainExtract")
        # Only the generated name is used; pdftohtml writes the file itself.
        output_url = NamedTemporaryFile(suffix=".xml",
                                        dir=self.file.directory_name).name
        command = ["pdftohtml", "-xml", self.file.getUrl(), output_url]
        stdout, stderr = Popen(command,
                               stdout=PIPE,
                               stderr=PIPE,
                               close_fds=True,
                               env=self.environment).communicate()
        # XXX - PDF can be protect
        if "Erro" in stderr:
            return False

        # Close the xml file deterministically instead of leaking the handle.
        with open(output_url) as xml_file:
            output = etree.fromstring(xml_file.read())
        row_list = output.xpath("//text")
        name, previous, next = "", "", ""
        tables = {}
        element = []
        line = []
        matrix = []
        i, j, l, m = 0, 0, 0, 0
        old_x_left = 600
        # NOTE(review): this scan removes items from row_list while walking
        # it and is order sensitive - confirm against real documents before
        # restructuring it.
        for x in row_list:
            base_line = x.attrib["top"]
            base_column = x.attrib["left"]
            i += 1
            for y in row_list[i:]:
                if base_line == y.attrib["top"]:
                    # Same row: y is the next cell of the current line.
                    l += 1
                    line.append(get_text(y))
                    base_column = y.attrib["left"]
                    row_list.remove(y)
                elif base_column == y.attrib["left"]:
                    # Same column: y continues a multi-line cell.
                    m = l
                    if len(element) > 0:
                        element.append(get_text(y))
                    # In case name of the table is after table
                    if len(line) == 0:
                        next = get_text(x)
                        if next is not None and len(next.split(":")) == 2:
                            name = next
                            next = ""
                    elif len(line) > 0:
                        element.append(line.pop())
                        element.append(get_text(y))
                else:
                    if len(element) > 0:
                        line.insert(m - 1, element)
                    l = 0
                    element = []
                    base_column = 0
                    break

            if len(line) > 0:
                # In case name of the table is before table
                previous = get_text(x.getprevious())
                if previous is not None and len(previous.split(":")) == 2:
                    name = previous
                    previous = ""
                line.insert(0, get_text(x))
                if len(line) > 1:
                    matrix.append(line)
            line = []
            # NOTE(review): "left" is compared as the raw attribute value
            # against the previous one (initially the int 600), not as a
            # number - verify this is intended before porting or changing.
            if x.attrib["left"] < old_x_left and len(matrix) > 0:
                j += 1
                if name == "":
                    name = "Tabela %d" % j
                name += " - pag %s" % x.getparent().attrib["number"]
                tables[name] = matrix
                name = ""
                matrix = []
            old_x_left = x.attrib["left"]
        return tables

    def trash(self):
        """Remove file from memory"""
        self.file.trash()
class Handler(object):
    """OOO Handler is used to access one Document through OpenOffice.

    One instance of this class is created for each input Document in order
    to manipulate it. The Document must be able to create and remove a
    temporary document on the file system, and to load and export it.
    """
    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Creates document in file system and loads it in OOo.

        Keyword arguments:
        zip -- passed to getContent() of the converted document
               (default False)
        uno_path -- falls back to environ["uno_path"] when not given
        office_binary_path -- falls back to environ["office_binary_path"]
        timeout -- value handed to MonitorTimeout (default 600)
        refresh -- forwarded, json encoded, to the converter script
                   (default False)
        """
        self.document = File(base_folder_url, data, source_format)
        self.zip = kw.get('zip', False)
        self.uno_path = kw.get("uno_path", None)
        self.office_binary_path = kw.get("office_binary_path", None)
        self.timeout = kw.get("timeout", 600)
        self.refresh = kw.get('refresh', False)
        self.source_format = source_format
        if not self.uno_path:
            self.uno_path = environ.get("uno_path")
        if not self.office_binary_path:
            self.office_binary_path = environ.get("office_binary_path")

    def _getCommand(self, *args, **kw):
        """Transforms all parameters passed in a command

        Positional arguments become bare ``--name`` flags, each inserted at
        index 3 (so they end up in reverse order, just before
        --office_binary_path). Keyword arguments, plus the hostname and port
        of the running OpenOffice, are appended as ``--key=value``.
        """
        hostname, port = openoffice.getAddress()
        kw['hostname'] = hostname
        kw['port'] = port
        # Prefer the python shipped with the office binaries when it exists.
        python = path.join(self.office_binary_path, "python")
        command_list = [
            python if path.exists(python) else "python",
            pkg_resources.resource_filename(
                __name__, path.join("helper", "unoconverter.py")),
            "--uno_path=%s" % self.uno_path,
            "--office_binary_path=%s" % self.office_binary_path,
            '--document_url=%s' % self.document.getUrl(),
        ]
        for arg in args:
            command_list.insert(3, "--%s" % arg)
        for k, v in kw.items():
            command_list.append("--%s=%s" % (k, v))
        return command_list

    def _startTimeout(self):
        """start the Monitor"""
        self.monitor = MonitorTimeout(openoffice, self.timeout)
        self.monitor.start()

    def _stopTimeout(self):
        """stop the Monitor"""
        self.monitor.terminate()

    def _subprocess(self, command_list):
        """Run one procedure

        Runs command_list under the timeout monitor and returns the
        (stdout, stderr) pair of the finished process.
        """
        if monitor_sleeping_time is not None:
            monitor_sleeping_time.touch()
        # Bound before the try block so that the cleanup below cannot raise
        # NameError (hiding the real error) when Popen itself fails.
        process = None
        try:
            self._startTimeout()
            process = Popen(command_list,
                            stdout=PIPE,
                            stderr=PIPE,
                            close_fds=True,
                            env=openoffice.environment_dict.copy())
            stdout, stderr = process.communicate()
        finally:
            self._stopTimeout()
            if process is not None and pid_exists(process.pid):
                process.terminate()
        return stdout, stderr

    def _callUnoConverter(self, *feature_list, **kw):
        """Call the unoconverter script, retrying once on failure.

        OpenOffice is started first when it is not running. When the script
        prints nothing on stdout and stderr mentions an Exception or Error,
        the original document is restored, OpenOffice is restarted and the
        command is run a second time; if that second run still writes to
        stderr, an Exception carrying stderr is raised.
        """
        if not openoffice.status():
            openoffice.start()
        command_list = self._getCommand(*feature_list, **kw)
        stdout, stderr = self._subprocess(command_list)
        if not stdout and re.search(r"\w*Exception|\w*Error", stderr):
            logger.debug(stderr)
            self.document.restoreOriginal()
            openoffice.restart()
            kw['document_url'] = self.document.getUrl()
            command = self._getCommand(*feature_list, **kw)
            stdout, stderr = self._subprocess(command)
            if stderr != "":
                raise Exception(stderr)
        return stdout, stderr

    def _serializeMimemapper(self,
                             source_extension=None,
                             destination_extension=None):
        """Serialize parts of mimemapper

        Without a destination extension only the mimetype table is dumped;
        otherwise the filter list for that destination and the document type
        table are included as well.
        """
        if destination_extension is None:
            return json.dumps(dict(
                mimetype_by_filter_type=mimemapper._mimetype_by_filter_type))

        service_type_list = mimemapper._doc_type_list_by_extension.get(
            source_extension, mimemapper.document_service_list)
        filter_list = [
            (destination_extension,
             service_type,
             mimemapper.getFilterName(destination_extension, service_type))
            for service_type in service_type_list
        ]
        logger.debug("Filter List: %r" % filter_list)
        return json.dumps(dict(
            doc_type_list_by_extension=mimemapper._doc_type_list_by_extension,
            filter_list=filter_list,
            mimetype_by_filter_type=mimemapper._mimetype_by_filter_type))

    def convert(self, destination_format=None, **kw):
        """Convert a document to another format supported by the OpenOffice

        Keyword Arguments:
        destination_format -- extension of document as String

        Returns the content of the converted document. The temporary
        document is trashed even when reading that content fails.
        """
        logger.debug("OooConvert: %s > %s" % (self.source_format,
                                               destination_format))
        kw['source_format'] = self.source_format
        if destination_format:
            kw['destination_format'] = destination_format
        kw['mimemapper'] = self._serializeMimemapper(self.source_format,
                                                     destination_format)
        kw['refresh'] = json.dumps(self.refresh)
        openoffice.acquire()
        try:
            stdout, stderr = self._callUnoConverter('convert', **kw)
        finally:
            openoffice.release()
        # The script prints the url of the converted document on stdout.
        url = stdout.replace('\n', '')
        self.document.reload(url)
        try:
            return self.document.getContent(self.zip)
        finally:
            self.document.trash()

    def getMetadata(self, base_document=False):
        """Returns a dictionary with all metadata of document.

        Keywords Arguments:
        base_document -- Boolean variable. if true, the document is also
        returned along with the metadata (under the 'Data' key).
        """
        logger.debug("getMetadata")
        kw = dict(mimemapper=self._serializeMimemapper())
        if base_document:
            feature_list = ['getmetadata', 'convert']
        else:
            feature_list = ['getmetadata']
        openoffice.acquire()
        try:
            stdout, stderr = self._callUnoConverter(*feature_list, **kw)
        finally:
            openoffice.release()
        try:
            metadata = json.loads(decodestring(stdout))
            if 'document_url' in metadata:
                self.document.reload(metadata['document_url'])
                metadata['Data'] = self.document.getContent()
                del metadata['document_url']
        finally:
            # Always drop the temporary document, also on a decoding error.
            self.document.trash()
        return metadata

    def setMetadata(self, metadata):
        """Returns a document with new metadata.

        Keyword arguments:
        metadata -- expected an dictionary with metadata.
        """
        metadata_pickled = json.dumps(metadata)
        logger.debug("setMetadata")
        kw = dict(metadata=encodestring(metadata_pickled))
        openoffice.acquire()
        try:
            stdout, stderr = self._callUnoConverter('setmetadata', **kw)
        finally:
            openoffice.release()
        try:
            return self.document.getContent()
        finally:
            self.document.trash()