def convert(self, destination_format):
    """Convert the input file to ``destination_format`` using ffmpeg.

    ffmpeg writes its result to a temporary file created in the input's
    directory; that file is then loaded back into ``self.input``.

    Keyword arguments:
    destination_format -- extension of the wanted output format as String

    Returns the content of the converted file. When the result is empty the
    last non-empty line of ffmpeg's stderr is logged as an error and the
    empty content is still returned. ``self.input`` is trashed in every case.
    """
    # XXX This implementation could use ffmpeg -i pipe:0, but it seems
    # XXX super unreliable currently and generates corrupted files in the end
    logger.debug("FfmpegConvert: %s > %s" % (self.input.source_format,
                                              destination_format))
    output_url = mktemp(suffix=".%s" % destination_format,
                        dir=self.input.directory_name)
    command = ["ffmpeg", "-i", self.input.getUrl(), "-y", output_url]
    # XXX ffmpeg has a bug that needs these options to work with webm format
    if destination_format == "webm":
        # Results in: ffmpeg -i <input> -ab 32k -y <output>
        command[3:3] = ["-ab", "32k"]
    try:
        stdout, stderr = Popen(command,
                               stdout=PIPE,
                               stderr=PIPE,
                               close_fds=True,
                               env=self.environment).communicate()
        self.input.reload(output_url)
        content = self.input.getContent()
        if not content:
            # Indexing stderr lines blindly ([-2]) raised IndexError when
            # ffmpeg printed less than two lines; pick the last line that
            # actually carries text instead.
            error_lines = [line for line in stderr.split("\n")
                           if line.strip()]
            if error_lines:
                logger.error(error_lines[-1])
            else:
                logger.error("ffmpeg produced an empty output")
        return content
    finally:
        self.input.trash()
def run(self):
    """Watch the OpenOffice instance for one interval.

    Sets ``status_flag``, sleeps ``self.interval`` seconds and, if the
    instance is still locked afterwards, stops it.
    """
    port = self.openoffice.getAddress()[-1]
    pid = self.openoffice.pid()
    logger.debug("Monitoring OpenOffice: Port %s, Pid: %s" % (port, pid))
    self.status_flag = True
    sleep(self.interval)
    if not self.openoffice.isLocked():
        # Lock was released in time: nothing to do.
        return
    # port and pid were read before sleeping, so the message describes the
    # instance that was being monitored.
    logger.debug("Stop OpenOffice - Port %s - Pid %s" % (port, pid))
    self.openoffice.stop()
def getImageItemList(self):
    """Extract the images of the PDF with pdftohtml.

    Returns False when the stderr of pdftohtml contains "Erro", otherwise
    the result of getImages() over the files found in the grain directory.
    """
    logger.debug("PDFImageGrainExtract")
    process = Popen(["pdftohtml",
                     self.file.getUrl(),
                     "%s/" % self.grain_directory],
                    stdout=PIPE,
                    stderr=PIPE,
                    close_fds=True,
                    env=self.environment)
    _, error_output = process.communicate()
    # XXX - PDF can be protected
    if "Erro" in error_output:
        return False
    removeEqualImages(self.grain_directory)
    image_path_list = glob("%s/*.*" % self.grain_directory)
    return getImages(image_path_list)
def stop(self):
    """Stop the process.

    Asks the process to terminate and waits for it; whatever happens, the
    process is killed if it is still alive and the ``process`` attribute is
    removed.
    """
    if not (hasattr(self, 'process') and self.status()):
        # Nothing is running.
        return
    pid = self.process.pid
    logger.debug("Stop Pid - %s" % pid)
    try:
        self.process.terminate()
        waitStopDaemon(self, self.timeout)
    finally:
        still_alive = pid_exists(pid) or self.status()
        if still_alive:
            Process(pid).kill()
        # NOTE(review): the source layout was flattened; delattr is assumed
        # to run for every exit of the try block, not only after a kill.
        delattr(self, "process")
def _releaseOpenOfficePort(self):
    """Terminate soffice processes that are listening on ``self.port``.

    Every running process whose executable equals the configured soffice
    binary is inspected; if one of its connections is in LISTEN state on
    ``self.port`` the process is terminated.

    Processes that cannot be inspected (AccessDenied) are skipped.
    """
    # NOTE(review): process.exe / get_connections() look like the pre-2.0
    # psutil API - confirm against the installed psutil version.
    for process in psutil.process_iter():
        try:
            if process.exe != join(self.office_binary_path,
                                   self._bin_soffice):
                continue
            for connection in process.get_connections():
                if connection.status == "LISTEN" and \
                        connection.local_address[1] == self.port:
                    process.terminate()
                    # One signal is enough; do not terminate again for
                    # another matching connection of the same process.
                    break
        except AccessDenied:
            # Not allowed to inspect this process: ignore it on purpose.
            pass
        except TypeError as e:
            # exception to prevent one psutil issue with zombie processes
            logger.debug(e)
def get_memory_usage(self):
    """Return the resident memory of the OpenOffice process.

    The rss value is divided by 1024 * 1024. The tracked process object is
    (re)created when missing or when its pid differs from the pid reported
    by ``self.openoffice``. Returns 0 when a TypeError or
    psutil.NoSuchProcess is raised.
    """
    try:
        tracking_current_pid = hasattr(self, 'process') and \
            self.process.pid == int(self.openoffice.pid())
        if not tracking_current_pid:
            self.create_process()
        memory_info = self.process.get_memory_info()
        return memory_info.rss / (1024 * 1024)
    except TypeError:
        # presumably int() received no pid because nothing is running
        logger.debug("OpenOffice is stopped")
        return 0
    except psutil.NoSuchProcess:
        # Exception raised when a process with a certain PID doesn't or no
        # longer exists (zombie).
        return 0
def _releaseOpenOfficePort(self):
    """Terminate soffice processes that are listening on ``self.port``.

    Every running process whose executable equals the configured soffice
    binary is inspected; if one of its connections is in LISTEN state on
    ``self.port`` the process is terminated.

    Processes that cannot be inspected (AccessDenied) are skipped.
    """
    # NOTE(review): process.exe / get_connections() look like the pre-2.0
    # psutil API - confirm against the installed psutil version.
    for process in psutil.process_iter():
        try:
            if process.exe != join(self.office_binary_path,
                                   self._bin_soffice):
                continue
            for connection in process.get_connections():
                if connection.status == "LISTEN" and \
                        connection.local_address[1] == self.port:
                    process.terminate()
                    # One signal is enough; do not terminate again for
                    # another matching connection of the same process.
                    break
        except psutil.AccessDenied:
            # Not allowed to inspect this process: ignore it on purpose.
            # The exception is taken from the psutil top-level namespace,
            # like psutil.NoSuchProcess elsewhere in this code.
            pass
        except TypeError as e:
            # exception to prevent one psutil issue with zombie processes
            logger.debug(e)
def convert(self, destination_format=None, **kw):
    """Convert the image with the ``convert`` command.

    Keyword arguments:
    destination_format -- extension given to the temporary output file

    Returns the content of the output file; ``self.file`` is trashed once
    the content has been read.
    """
    logger.debug("ImageMagickConvert: %s > %s" % (self.file.source_format,
                                                   destination_format))
    output_suffix = '.%s' % destination_format
    output_url = mktemp(suffix=output_suffix, dir=self.base_folder_url)
    process = Popen(["convert", self.file.getUrl(), output_url],
                    stdout=PIPE,
                    stderr=PIPE,
                    close_fds=True,
                    env=self.environment)
    # The outputs of the command are read but not inspected.
    process.communicate()
    self.file.reload(output_url)
    try:
        return self.file.getContent()
    finally:
        self.file.trash()
def setMetadata(self, metadata):
    """Return the document content after the 'setmetadata' feature ran.

    Keyword arguments:
    metadata -- expected a dictionary with metadata; it is serialized to
                JSON and encoded before being handed to the converter.
    """
    serialized_metadata = json.dumps(metadata)
    logger.debug("setMetadata")
    encoded_metadata = encodestring(serialized_metadata)
    openoffice.acquire()
    try:
        # The converter outputs are not needed here.
        self._callUnoConverter('setmetadata', metadata=encoded_metadata)
    finally:
        openoffice.release()
    content = self.document.getContent()
    self.document.trash()
    return content
def convert(self, destination_format=None, **kw):
    """Convert the PDF document with pdftotext.

    Keyword arguments:
    destination_format -- extension given to the temporary output file

    Returns the content of the output file; ``self.document`` is trashed
    once the content has been read.
    """
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                           destination_format))
    output_suffix = ".%s" % destination_format
    output_url = mktemp(suffix=output_suffix,
                        dir=self.document.directory_name)
    process = Popen(["pdftotext", self.document.getUrl(), output_url],
                    stdout=PIPE,
                    stderr=PIPE,
                    close_fds=True,
                    env=self.environment)
    # The outputs of the command are read but not inspected.
    process.communicate()
    self.document.reload(output_url)
    try:
        return self.document.getContent()
    finally:
        self.document.trash()
def _callUnoConverter(self, *feature_list, **kw):
    """Run the UNO converter command and return its ``(stdout, stderr)``.

    OpenOffice is started first when it is not running. When the command
    prints nothing on stdout and its stderr mentions an Exception or an
    Error, the original document is restored, OpenOffice is restarted and
    the command is run a second time against the restored document.

    Keyword arguments:
    feature_list -- features forwarded to ``self._getCommand``
    kw -- options forwarded to ``self._getCommand``

    Raises Exception carrying stderr when the second attempt still writes
    to stderr.
    """
    if not openoffice.status():
        openoffice.start()
    command_list = self._getCommand(*feature_list, **kw)
    stdout, stderr = self._subprocess(command_list)
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    if not stdout and re.search(r"\w*Exception|\w*Error", stderr):
        logger.debug(stderr)
        self.document.restoreOriginal()
        openoffice.restart()
        # Retry on the restored document.
        kw['document_url'] = self.document.getUrl()
        command = self._getCommand(*feature_list, **kw)
        stdout, stderr = self._subprocess(command)
        # NOTE(review): the source layout was flattened; this check is
        # assumed to apply to the retry only.
        if stderr != "":
            raise Exception(stderr)
    return stdout, stderr
def _serializeMimemapper(self,
                         source_extension=None,
                         destination_extension=None):
    """Serialize parts of mimemapper as a JSON string.

    Without ``destination_extension`` only the mimetype-by-filter-type
    mapping is serialized. Otherwise the document type mapping and one
    ``(destination_extension, service_type, filter_name)`` entry per service
    type of ``source_extension`` are included as well.
    """
    mimetype_by_filter_type = mimemapper._mimetype_by_filter_type
    if destination_extension is None:
        return json.dumps(
            {'mimetype_by_filter_type': mimetype_by_filter_type})

    doc_type_list_by_extension = mimemapper._doc_type_list_by_extension
    # Unknown source extensions fall back to every document service.
    service_type_list = doc_type_list_by_extension.get(
        source_extension, mimemapper.document_service_list)
    filter_list = [
        (destination_extension,
         service_type,
         mimemapper.getFilterName(destination_extension, service_type))
        for service_type in service_type_list
    ]
    logger.debug("Filter List: %r" % filter_list)
    return json.dumps({
        'doc_type_list_by_extension': doc_type_list_by_extension,
        'filter_list': filter_list,
        'mimetype_by_filter_type': mimetype_by_filter_type,
    })
def convert(self, destination_format=None, **kw):
    """Convert the document to another format through the UNO converter.

    Keyword arguments:
    destination_format -- extension of document as String

    Returns the content of the converted document; ``self.document`` is
    trashed afterwards.
    """
    logger.debug("OooConvert: %s > %s" % (self.source_format,
                                           destination_format))
    kw['source_format'] = self.source_format
    if destination_format:
        kw['destination_format'] = destination_format
    kw.update(
        mimemapper=self._serializeMimemapper(self.source_format,
                                             destination_format),
        refresh=json.dumps(self.refresh),
    )
    openoffice.acquire()
    try:
        stdout, _ = self._callUnoConverter('convert', **kw)
    finally:
        openoffice.release()
    # stdout is used as the url of the converted file once its newlines
    # are removed.
    converted_url = stdout.replace('\n', '')
    self.document.reload(converted_url)
    content = self.document.getContent(self.zip)
    self.document.trash()
    return content
def convert(self, destination_format=None, **kw):
    """Convert the file with the wkhtmltopdf command line.

    Keyword arguments:
    destination_format -- format handed to ``self.makeTempFile``
    kw -- forwarded as ``conversion_kw`` when building the command line

    Returns the content of the output file; ``self.file`` is trashed once
    the content has been read.
    """
    logger.debug("wkhtmltopdf convert: %s > %s" % (self.file.source_format,
                                                    destination_format))
    output_path = self.makeTempFile(destination_format)
    source_url = self.convertPathToUrl(self.file.getUrl())
    command = self.makeWkhtmltopdfCommandList(source_url,
                                              output_path,
                                              conversion_kw=kw)
    process = Popen(command,
                    stdout=PIPE,
                    stderr=PIPE,
                    close_fds=True,
                    env=self.environment,
                    cwd=self.file.directory_name)
    # The outputs of the command are read but not inspected.
    process.communicate()
    self.file.reload(output_path)
    try:
        return self.file.getContent()
    finally:
        self.file.trash()
def getMetadata(self, base_document=False):
    """Return a dictionary with all metadata of the document.

    Keyword arguments:
    base_document -- Boolean variable. If true, the 'convert' feature is
                     requested together with 'getmetadata'.

    When the decoded metadata carries a 'document_url' entry, that document
    is loaded, its content is stored under 'Data' and the 'document_url'
    entry is removed.
    """
    logger.debug("getMetadata")
    kw = {'mimemapper': self._serializeMimemapper()}
    feature_list = ['getmetadata']
    if base_document:
        feature_list.append('convert')
    openoffice.acquire()
    try:
        stdout, _ = self._callUnoConverter(*feature_list, **kw)
    finally:
        openoffice.release()
    metadata = json.loads(decodestring(stdout))
    if 'document_url' in metadata:
        document_url = metadata.pop('document_url')
        self.document.reload(document_url)
        metadata['Data'] = self.document.getContent()
    self.document.trash()
    return metadata
def convert(self, destination_format=None, **kw):
    """Convert the document with ps2pdf (source format 'ps') or pdftotext.

    Keyword arguments:
    destination_format -- extension given to the temporary output file

    Returns the content of the output file; ``self.document`` is trashed
    once the content has been read.
    """
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                           destination_format))
    # Only the name is kept: the temporary file object itself is not
    # referenced anywhere else.
    output_url = NamedTemporaryFile(
        suffix=".%s" % destination_format,
        dir=self.document.directory_name).name
    if self.document.source_format == 'ps':
        converter_args = ["ps2pdf",
                          "-dASCII85EncodePages=false",
                          "-dLanguageLevel=1"]
    else:
        converter_args = ["pdftotext"]
    command = converter_args + [self.document.getUrl(), output_url]
    process = Popen(command,
                    stdout=PIPE,
                    stderr=PIPE,
                    close_fds=True,
                    env=self.environment)
    # The outputs of the command are read but not inspected.
    process.communicate()
    self.document.reload(output_url)
    try:
        return self.document.getContent()
    finally:
        self.document.trash()
def run(self):
    """Is called by start function.

    While ``status_flag`` is set, checks every ``self.interval`` seconds
    whether the request counter of OpenOffice went over
    ``self.request_limit`` and, if so, restarts OpenOffice while holding its
    lock.
    """
    logger.debug("Start MonitorRequest")
    while self.status_flag:
        if self.openoffice.request > self.request_limit:
            self.openoffice.acquire()
            try:
                logger.debug("Openoffice: %s, %s will be restarted" %
                             self.openoffice.getAddress())
                self.openoffice.restart()
            finally:
                # Always give the lock back, even if the restart failed,
                # otherwise every later acquire() would block forever.
                self.openoffice.release()
        sleep(self.interval)
    logger.debug("Stop MonitorRequest ")
def run(self):
    """Is called by start function.

    Controls the amount of memory used: every ``self.interval`` seconds the
    memory usage is compared with ``self.limit`` and OpenOffice is stopped
    when the limit is exceeded. Loops while ``status_flag`` is set.
    """
    self.status_flag = True
    logger.debug("Start MonitorMemory")
    while self.status_flag:
        memory_usage = self.get_memory_usage()
        if memory_usage > self.limit:
            logger.debug("Stopping OpenOffice")
            self.openoffice.stop()
        sleep(self.interval)
    logger.debug("Stop MonitorMemory")
def _testOpenOffice(self, host, port):
    """Test if OpenOffice was started correctly.

    Runs the ``openoffice_tester.py`` helper against ``host``:``port`` with
    the python found next to the office binaries (or plain "python").

    Returns False when the helper result is true while stderr is not empty,
    True otherwise.
    """
    logger.debug("Test OpenOffice %s - Pid %s" % (self.getAddress()[-1],
                                                  self.pid()))
    bundled_python = join(self.office_binary_path, "python")
    interpreter = bundled_python if exists(bundled_python) else "python"
    tester_script = pkg_resources.resource_filename(
        "cloudooo",
        join('handler', 'ooo', "helper", "openoffice_tester.py"))
    args = [interpreter,
            tester_script,
            "--hostname=%s" % host,
            "--port=%s" % port,
            "--uno_path=%s" % self.uno_path]
    logger.debug("Testing Openoffice Instance %s" % port)
    stdout, stderr = Popen(args,
                           stdout=PIPE,
                           stderr=PIPE,
                           close_fds=True).communicate()
    stdout_bool = convertStringToBool(stdout.replace("\n", ""))
    # NOTE(review): a falsy tester result is reported as "works" here; this
    # mirrors the existing condition - confirm it is intended.
    if not stdout_bool or stderr == "":
        logger.debug("Instance %s works" % port)
        return True
    logger.debug("%s\n%s" % (stderr, stdout))
    return False
def run(self):
    """Start monitoring process.

    While ``status_flag`` is set, checks every ``self.interval`` seconds
    whether OpenOffice is running and was not touched for at least
    ``self.sleeping_time`` seconds; if so it is stopped while holding its
    lock.
    """
    logger.debug("Start MonitorSpleepingTime")
    while self.status_flag:
        current_time = time()
        if self.openoffice.status() and \
                (self._touched_at + self.sleeping_time) <= current_time:
            logger.debug("Stopping OpenOffice after sleeping time of %is" %
                         self.sleeping_time)
            self.openoffice.acquire()
            try:
                self.openoffice.stop()
            finally:
                # Always give the lock back, even if stop() failed,
                # otherwise every later acquire() would block forever.
                self.openoffice.release()
        sleep(self.interval)
    logger.debug("Stop MonitorSpleepingTime")
def release(self):
    """Unlock Instance: log the address, then release the internal lock."""
    address = self.getAddress()
    message = "OpenOffice %s, %s unlocked" % address
    logger.debug(message)
    self._lock.release()
def start(self, init=True):
    """Start Application.

    Only logs the name, port and pid of the process; ``init`` is not used
    by this implementation.
    """
    name = self.name
    port = self.getAddress()[-1]
    pid = self.pid()
    logger.debug("Process Started %s, Port %s. Pid %s" % (name, port, pid))
def convert(self, destination_format=None, **kw):
    """Convert the input file to ``destination_format`` using x2t.

    An XML configuration file describing the conversion is written in the
    directory of the file, then the ``x2t`` binary is run on it.

    Keyword arguments:
    destination_format -- extension of the wanted output format as String

    Returns the content of the converted file; ``self.file`` is trashed once
    the content has been read.

    Raises RuntimeError when x2t exits with a non-zero code; the message
    carries the command, its outputs and the configuration file.
    """
    source_format = self.file.source_format
    logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

    # init vars and xml configuration file
    in_format = format_code_map[source_format]
    out_format = format_code_map[destination_format]
    root_dir = self.file.directory_name
    input_dir = os.path.join(root_dir, "input")
    output_dir = os.path.join(root_dir, "output")
    final_file_name = os.path.join(root_dir,
                                   "document.%s" % destination_format)
    input_file_name = self.file.getUrl()
    output_file_name = final_file_name
    config_file_name = os.path.join(root_dir, "config.xml")

    if source_format in yformat_tuple:
        # "PK\x03\x04" is the signature a zip archive starts with: the
        # input is unpacked and its single top-level file is converted.
        if self._data.startswith("PK\x03\x04"):
            os.mkdir(input_dir)
            unzip(self.file.getUrl(), input_dir)
            for _, _, files in os.walk(input_dir):
                # Exactly one file is expected (ValueError otherwise).
                input_file_name, = files
                break
            input_file_name = os.path.join(input_dir, input_file_name)
    if destination_format in yformat_tuple:
        # x2t writes into a dedicated directory that is zipped afterwards.
        os.mkdir(output_dir)
        output_file_name = os.path.join(output_dir, "body.txt")

    config = {
        # 'm_sKey': 'from',
        'm_sFileFrom': input_file_name,
        # Element text must be a string, whatever type the code map holds.
        'm_nFormatFrom': str(in_format),
        'm_sFileTo': output_file_name,
        'm_nFormatTo': str(out_format),
        # 'm_bPaid': 'true',
        # 'm_bEmbeddedFonts': 'false',
        # 'm_bFromChanges': 'false',
        # 'm_sFontDir': '/usr/share/fonts',
        # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
    }
    root = ElementTree.Element('root')
    for key, value in config.items():
        ElementTree.SubElement(root, key).text = value
    # "with" closes the file even if writing the XML fails.
    with open(config_file_name, "w") as config_file:
        ElementTree.ElementTree(root).write(config_file,
                                            encoding='utf-8',
                                            xml_declaration=True,
                                            default_namespace=None,
                                            method="xml")

    # run conversion binary
    command = ["x2t", config_file_name]
    p = Popen(
        command,
        stdout=PIPE,
        stderr=PIPE,
        close_fds=True,
        env=self.environment,
    )
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        with open(config_file_name) as config_file:
            config_xml = config_file.read()
        raise RuntimeError(
            "x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s" % (
                p.returncode,
                " ".join(command),
                stdout,
                stderr,
                " " + config_xml.replace("\n", "\n "),
            ))

    if destination_format in yformat_tuple:
        zipTree(
            final_file_name,
            (output_file_name, ""),
            (os.path.join(os.path.dirname(output_file_name), "media"), ""),
        )

    self.file.reload(final_file_name)
    try:
        return self.file.getContent()
    finally:
        self.file.trash()
def convert(self, destination_format=None, **kw):
    """Convert the input file to ``destination_format`` using x2t.

    An XML configuration file describing the conversion is written in the
    directory of the file, then the ``x2t`` binary is run on it. Metadata is
    carried over through OOoHandler: read from ``metadata.json`` of a zipped
    input, or appended as ``metadata.json`` to the output when the
    destination format is in ``yformat_tuple``.

    Keyword arguments:
    destination_format -- extension of the wanted output format as String

    Returns the converted content, or None when neither the source nor the
    destination format is in ``yformat_tuple``. ``self.file`` is trashed
    once the conversion ran.

    Raises RuntimeError when the zipped input has neither body.txt nor
    Editor.bin, or when x2t exits with a non-zero code.

    NOTE(review): the source layout was flattened; the nesting below is a
    reconstruction and should be confirmed against the repository.
    """
    source_format = self.file.source_format
    logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

    # init vars and xml configuration file
    in_format = format_code_map[source_format]
    # An output-specific code takes precedence over the generic one.
    out_format = format_code_map_output.get(destination_format,
                                            format_code_map[destination_format])
    root_dir = self.file.directory_name
    input_dir = os.path.join(root_dir, "input");
    input_file_name = self.file.getUrl()
    output_file_name = os.path.join(root_dir, "document.%s" % destination_format)
    config_file_name = os.path.join(root_dir, "config.xml")
    metadata = None
    output_data = None

    if source_format in yformat_tuple:
        # "PK\x03\x04" is the signature a zip archive starts with.
        if self._data.startswith("PK\x03\x04"):
            os.mkdir(input_dir)
            unzip(self.file.getUrl(), input_dir)
            # The payload is body.txt, or Editor.bin as a fallback.
            input_file_name = os.path.join(input_dir, "body.txt")
            if not os.path.isfile(input_file_name):
                input_file_name = os.path.join(input_dir, "Editor.bin")
                if not os.path.isfile(input_file_name):
                    raise RuntimeError("input format incorrect: Editor.bin absent in zip archive")
            # Optional metadata shipped inside the archive.
            metadata_file_name = os.path.join(input_dir, "metadata.json")
            if os.path.isfile(metadata_file_name):
                with open(metadata_file_name) as metadata_file:
                    metadata = json.loads(metadata_file.read())

    with open(config_file_name, "w") as config_file:
        config = {
            # 'm_sKey': 'from',
            'm_sFileFrom': input_file_name,
            'm_nFormatFrom': str(in_format),
            'm_sFileTo': output_file_name,
            'm_nFormatTo': str(out_format),
            # 'm_bPaid': 'true',
            # 'm_bEmbeddedFonts': 'false',
            # 'm_bFromChanges': 'false',
            # 'm_sFontDir': '/usr/share/fonts',
            # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
        }
        root = ElementTree.Element('root')
        for key, value in config.items():
            ElementTree.SubElement(root, key).text = value
        ElementTree.ElementTree(root).write(config_file, encoding='utf-8', xml_declaration=True,
                                            default_namespace=None, method="xml")

    # run conversion binary
    p = Popen(
        ["x2t", config_file.name],
        stdout=PIPE,
        stderr=PIPE,
        close_fds=True,
        env=self.environment,
    )
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        # The message embeds the command, both outputs and the config file.
        raise RuntimeError("x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s"
                           % (p.returncode, " ".join(["x2t", config_file.name]), stdout, stderr,
                              " " + open(config_file.name).read().replace("\n", "\n ")))

    self.file.reload(output_file_name)
    try:
        if source_format in yformat_tuple:
            if metadata:
                # Apply the metadata found in the archive to the output.
                output_data = OOoHandler(self.base_folder_url, self.file.getContent(), source_format, **self._init_kw)\
                    .setMetadata(metadata)
            else:
                output_data = self.file.getContent()
        elif destination_format in yformat_tuple:
            if not metadata:
                # This branch is only reached when the source format is not
                # in yformat_tuple, so metadata is read from the input data.
                if source_format not in yformat_tuple:
                    metadata = OOoHandler(self.base_folder_url, self._data, source_format, **self._init_kw).getMetadata()
                if not metadata:
                    metadata = {}
                # These entries are dropped before the metadata is stored.
                metadata.pop('MIMEType', None)
                metadata.pop('Generator', None)
                metadata.pop('AppVersion', None)
                metadata.pop('ImplementationName', None)
            # Append the metadata to the produced archive.
            with ZipFile(output_file_name, mode="a") as zipfile:
                zipfile.writestr("metadata.json", json.dumps(metadata))
            output_data = self.file.getContent()
    finally:
        self.file.trash()
    return output_data
def touch(self):
    """Restart countdown by recording the current time as last touch."""
    logger.debug("Touch MonitorSpleepingTime")
    now = time()
    self._touched_at = now
def convert(self, destination_format=None, **kw):
    """Convert the input file to ``destination_format`` using x2t.

    An XML configuration file describing the conversion is written in the
    directory of the file, then the ``x2t`` binary is run on it.

    Keyword arguments:
    destination_format -- extension of the wanted output format as String

    Returns the content of the converted file; ``self.file`` is trashed once
    the content has been read.

    Raises RuntimeError when x2t exits with a non-zero code; the message
    carries the command, its outputs and the configuration file.
    """
    source_format = self.file.source_format
    logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

    # init vars and xml configuration file
    in_format = format_code_map[source_format]
    out_format = format_code_map[destination_format]
    root_dir = self.file.directory_name
    input_dir = os.path.join(root_dir, "input")
    output_dir = os.path.join(root_dir, "output")
    final_file_name = os.path.join(root_dir,
                                   "document.%s" % destination_format)
    input_file_name = self.file.getUrl()
    output_file_name = final_file_name
    config_file_name = os.path.join(root_dir, "config.xml")

    if source_format in yformat_tuple:
        # The input is unpacked and its single top-level file is converted.
        os.mkdir(input_dir)
        unzip(self.file.getUrl(), input_dir)
        for _, _, files in os.walk(input_dir):
            # Exactly one file is expected (ValueError otherwise).
            input_file_name, = files
            break
        input_file_name = os.path.join(input_dir, input_file_name)
    if destination_format in yformat_tuple:
        # x2t writes into a dedicated directory that is zipped afterwards.
        os.mkdir(output_dir)
        output_file_name = os.path.join(output_dir, "body.txt")

    config = {
        # 'm_sKey': 'from',
        'm_sFileFrom': input_file_name,
        # Element text must be a string, whatever type the code map holds.
        'm_nFormatFrom': str(in_format),
        'm_sFileTo': output_file_name,
        'm_nFormatTo': str(out_format),
        # 'm_bPaid': 'true',
        # 'm_bEmbeddedFonts': 'false',
        # 'm_bFromChanges': 'false',
        # 'm_sFontDir': '/usr/share/fonts',
        # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
    }
    root = ElementTree.Element('root')
    for key, value in config.items():
        ElementTree.SubElement(root, key).text = value
    # "with" closes the file even if writing the XML fails.
    with open(config_file_name, "w") as config_file:
        ElementTree.ElementTree(root).write(config_file,
                                            encoding='utf-8',
                                            xml_declaration=True,
                                            default_namespace=None,
                                            method="xml")

    # run conversion binary
    command = ["x2t", config_file_name]
    p = Popen(
        command,
        stdout=PIPE,
        stderr=PIPE,
        close_fds=True,
        env=self.environment,
    )
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        with open(config_file_name) as config_file:
            config_xml = config_file.read()
        raise RuntimeError(
            "x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s" % (
                p.returncode,
                " ".join(command),
                stdout,
                stderr,
                " " + config_xml.replace("\n", "\n "),
            ))

    if destination_format in yformat_tuple:
        zipTree(
            final_file_name,
            (output_file_name, ""),
            (os.path.join(os.path.dirname(output_file_name), "media"), ""),
        )

    self.file.reload(final_file_name)
    try:
        return self.file.getContent()
    finally:
        self.file.trash()
def getTablesMatrix(self):
    """Returns the tables of the PDF as matrices.

    The PDF is converted to XML with ``pdftohtml -xml`` and the ``text``
    elements are grouped by their "top" (line) and "left" (column)
    attributes.

    Returns False when the stderr of pdftohtml contains "Erro", otherwise a
    dictionary mapping a table name to its matrix (list of lines). A table
    without a detected name is called "Tabela <n>"; " - pag <number>" is
    appended to every name.

    NOTE(review): the source layout was flattened; the nesting below is a
    reconstruction and should be confirmed against the repository.
    """
    logger.debug("PDFTableGrainExtract")
    # Only the name of the temporary file is kept.
    output_url = NamedTemporaryFile(suffix=".xml",
                                    dir=self.file.directory_name).name
    command = ["pdftohtml", "-xml", self.file.getUrl(), output_url]
    stdout, stderr = Popen(command,
                           stdout=PIPE,
                           stderr=PIPE,
                           close_fds=True,
                           env=self.environment).communicate()
    # XXX - PDF can be protected
    if "Erro" in stderr:
        return False
    else:
        output = etree.fromstring(open(output_url).read())
        row_list = output.xpath("//text")
        # "next" shadows the builtin of the same name inside this method.
        name, previous, next = "", "", ""
        tables = {}
        element = []  # texts stacked in the same column
        line = []     # cells of the line being built
        matrix = []   # lines of the table being built
        # i: position after x in row_list; j: number of tables found;
        # l: cells added to the current line; m: value of l when a
        # same-column text was met.
        i, j, l, m = 0, 0, 0, 0
        # presumably larger than any "left" value - TODO confirm
        old_x_left = 600
        for x in row_list:
            base_line = x.attrib["top"]
            base_column = x.attrib["left"]
            i += 1
            # Look at the elements following x.
            for y in row_list[i:]:
                if base_line == y.attrib["top"]:
                    # Same line as x: y is another cell of the line.
                    l += 1
                    line.append(get_text(y))
                    base_column = y.attrib["left"]
                    # y is consumed: the outer loop will not visit it.
                    row_list.remove(y)
                elif base_column == y.attrib["left"]:
                    # Same column as the last cell, on another line.
                    m = l
                    if len(element) > 0:
                        element.append(get_text(y))
                    # In case name of the table is after table
                    if len(line) == 0:
                        next = get_text(x)
                        if next != None and len(next.split(":")) == 2:
                            name = next
                            next = ""
                    elif len(line) > 0:
                        element.append(line.pop())
                        element.append(get_text(y))
                else:
                    # Neither same line nor same column: stop scanning.
                    if len(element) > 0:
                        line.insert(m - 1, element)
                    l = 0
                    element = []
                    base_column = 0
                    break

            if len(line) > 0:
                # In case name of the table is before table
                previous = get_text(x.getprevious())
                if previous != None and len(previous.split(":")) == 2:
                    name = previous
                    previous = ""
                line.insert(0, get_text(x))
                if len(line) > 1:
                    matrix.append(line)
            line = []
            # NOTE(review): attribute values look like strings while the
            # initial old_x_left is an int - confirm this comparison.
            if x.attrib["left"] < old_x_left and len(matrix) > 0:
                if len(matrix) > 0:
                    j += 1
                    if name == "":
                        name = "Tabela %d" % j
                    name += " - pag %s" % x.getparent().attrib["number"]
                    tables[name] = matrix
                name = ""
                matrix = []
            old_x_left = x.attrib["left"]
        return tables