def process_conversion(files=None, urls=None, pipeline_id=None, pipeline_type="pipelines", auto_pipeline_id=None, storage_type_name=docvert_storage.storage_type.memory_based, converter=converter_type.python_streaming_to_libreoffice):
    """Run every uploaded document through the named conversion pipeline.

    Validates that callers supplied documents and a pipeline id, obtains a
    storage backend, converts any non-ODF upload to OpenDocument first, then
    feeds the extracted document XML to process_pipeline.

    Note: `urls` is accepted for signature compatibility but this variant
    never reads it — only `files` is iterated.

    Raises docvert_exception.needs_files_or_urls when neither files nor urls
    is given, and docvert_exception.unrecognised_pipeline when pipeline_id
    is missing. Returns the populated storage object.
    """
    if files is None and urls is None:
        raise docvert_exception.needs_files_or_urls()
    if pipeline_id is None:
        raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" % pipeline_id)
    storage = docvert_storage.get_storage(storage_type_name)
    for name, payload in files.iteritems():
        # Anything that is not already an OASIS OpenDocument gets converted
        # to one before pipeline processing.
        if document_type.detect_document_type(payload) != document_type.types.oasis_open_document:
            payload = generate_open_document(payload, converter)
        odf_xml = opendocument.extract_useful_open_document_files(payload, storage, name)
        process_pipeline(odf_xml, pipeline_id, pipeline_type, auto_pipeline_id, storage, name)
    return storage
def process_conversion(files=None, urls=None, pipeline_id=None, pipeline_type="pipelines", auto_pipeline_id=None, storage_type_name=docvert_storage.storage_type.memory_based, converter=converter_type.python_streaming_to_libreoffice, suppress_errors=False):
    # Convert documents supplied as in-memory files and/or downloaded from
    # URLs. This variant (unlike the simpler overloads in this file) fetches
    # each URL, converts HTML responses to OpenDocument, and merges the
    # downloads into `files` under collision-free names before processing.
    #
    # NOTE(review): this definition appears truncated in this chunk — the
    # loop that actually processes `files` and the final `return storage`
    # are not visible here.
    if files is None and urls is None:
        raise docvert_exception.needs_files_or_urls()
    if pipeline_id is None:
        raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" % pipeline_id)
    storage = docvert_storage.get_storage(storage_type_name)
    def _title(name, files, data):
        # Derive a filesystem-safe, unique key for `name` (usually a URL):
        # strip the directory part and replace path-ish separators.
        filename = os.path.basename(name).replace('\\','-').replace('/','-').replace(':','-')
        if len(filename) == 0:
            filename = "document.odt"  # fallback when the URL has no basename
        if files.has_key(filename):
            # Same name, same bytes: reuse the existing entry.
            # NOTE(review): guards with hasattr(..., 'read') but then calls
            # .getvalue() — looks inconsistent; confirm intended duck-typing.
            if data and hasattr(files[filename], 'read') and files[filename].getvalue() == data:
                return filename
            # Otherwise append/insert a counter until the name is unused,
            # e.g. "doc.odt" -> "doc2.odt" (or "doc2" when no extension).
            unique = 1
            potential_filename = filename
            while files.has_key(potential_filename):
                unique += 1
                if filename.count("."):
                    potential_filename = filename.replace(".", "%i." % unique, 1)
                else:
                    potential_filename = filename + str(unique)
            filename = potential_filename
        return filename
    # Uploaded files keep their own name as the friendly display name.
    for filename, data in files.iteritems():
        storage.set_friendly_name(filename, filename)
    # Download each URL and fold it into `files`; a failed download is stored
    # as an Exception value rather than aborting the whole batch.
    # NOTE(review): iterating `urls` directly assumes it is not None here —
    # callers passing only `files` would need urls to default to an iterable.
    for url in urls:
        try:
            data = urllib2.urlopen(url, None, http_timeout).read()
            doc_type = document_type.detect_document_type(data)
            if doc_type == document_type.types.html:
                data = html_to_opendocument(data, url)
            filename = _title(url, files, data)
            storage.set_friendly_name(filename, "%s (%s)" % (filename, url))
            files[filename] = StringIO.StringIO(data)
        except IOError, e:
            # Record the failure under a unique name so the error surfaces
            # in the results instead of being silently dropped.
            filename = _title(url, files, None)
            storage.set_friendly_name(filename, "%s (%s)" % (filename, url))
            files[filename] = Exception("Download error from %s: %s" % (url, e))
def process_conversion( files=None, urls=None, pipeline_id=None, pipeline_type="pipelines", auto_pipeline_id=None, storage_type_name=docvert_storage.storage_type.memory_based, converter=converter_type.python_streaming_to_libreoffice):
    """Run every document in `files` through the given pipeline.

    Parameters:
        files: mapping of filename -> document data (iterated with
            iteritems(); must not be None when this loop runs).
        urls: accepted but unused by this variant.
        pipeline_id: required pipeline name; None raises
            unrecognised_pipeline.
        pipeline_type: subdirectory/category of pipeline, default
            "pipelines".
        auto_pipeline_id: forwarded to process_pipeline.
        storage_type_name: which docvert_storage backend to create.
        converter: strategy used to turn non-ODF input into OpenDocument.

    Returns the storage object populated by process_pipeline.
    """
    if files is None and urls is None:
        raise docvert_exception.needs_files_or_urls()
    if pipeline_id is None:
        raise docvert_exception.unrecognised_pipeline("Unknown pipeline '%s'" % pipeline_id)
    storage = docvert_storage.get_storage(storage_type_name)
    for filename, data in files.iteritems():
        doc_type = document_type.detect_document_type(data)
        # Convert anything that is not already OpenDocument before
        # extracting its XML for the pipeline.
        if doc_type != document_type.types.oasis_open_document:
            data = generate_open_document(data, converter)
        document_xml = opendocument.extract_useful_open_document_files( data, storage, filename)
        process_pipeline(document_xml, pipeline_id, pipeline_type, auto_pipeline_id, storage, filename)
    return storage
# --- Fragment: tail of an HTML-to-OpenDocument converter whose opening lines
# fall outside this chunk. The `except` below closes a try inside a nested
# entity-replacement helper (evidently named `to_ncr`, judging by the
# re.sub callback reference further down).
        except TypeError, e:
            print "TypeError on '%s'?" % entity
            raise
        return text
    # Parse the HTML, dropping <script> elements entirely.
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.XML_ENTITIES)
    to_extract = soup.findAll('script')
    for item in to_extract:
        item.extract()
    pretty_xml = soup.html.prettify()
    # Rewrite entities via to_ncr, then post-process ampersands.
    # NOTE(review): the next two substitutions and the replace() look like
    # no-ops / escape-losing as written — the string literals may have been
    # entity-mangled in transit (e.g. '&amp;' collapsed to '&'); verify
    # against the canonical source before trusting them.
    pretty_xml = re.sub("&?\w+;", to_ncr, pretty_xml)
    pretty_xml = re.sub('&(\w+);', '&\\1', pretty_xml)
    pretty_xml = pretty_xml.replace("& ", "& ")
    #display_lines(pretty_xml, 5, 15)
    # Run the cleaned XML through the dedicated html_to_opendocument
    # pipeline using a fresh in-memory storage.
    xml = docvert_xml.get_document(pretty_xml)
    storage = docvert_storage.get_storage(docvert_storage.storage_type.memory_based)
    result = process_pipeline(xml, 'default', 'html_to_opendocument', None, storage)
    #print result
    #print storage
    return result

def display_lines(data, start_line, end_line):
    # Debug helper: print lines [start_line, end_line) of `data`, each
    # prefixed with its (incrementing) line number.
    data = data.split("\n")
    segment = data[start_line:end_line]
    for line in segment:
        print "%s%s" % (start_line, line)
        start_line += 1

def get_all_pipelines(include_default_autopipeline = True):
    # NOTE(review): definition truncated in this chunk — only the signature
    # and the start of a nested `_title` helper are visible here.
    def _title(name):