def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from one format to another using pandoc.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')

    content: the text to convert.
    from_format: format name of ``content`` (e.g. 'html', 'markdown').
    to_format: format name of the wanted output (e.g. 'html', 'pdf').
    full: forwarded unchanged to ``pandoc_pandoc``.
    raw: accepted for interface compatibility but never read here; the
        value handed to ``pandoc_pandoc`` is computed from the formats
        (true only for an html -> html conversion).

    Returns whatever ``pandoc_markdown2pdf`` (pdf output when
    ``MARKDOWN2PDF_BIN`` is set) or ``pandoc_pandoc`` returns.
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (only real errors are caught, so KeyboardInterrupt and
            # SystemExit still propagate)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            # markdown2pdf is fed markdown, so convert to markdown first
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    use_raw = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, use_raw)
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from one format to another using pandoc.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')

    content: the text to convert.
    from_format: format name of ``content`` (e.g. 'html', 'markdown').
    to_format: format name of the wanted output (e.g. 'html', 'pdf').
    full: forwarded unchanged to ``pandoc_pandoc``.
    raw: accepted for interface compatibility but never read here; the
        value handed to ``pandoc_pandoc`` is computed from the formats
        (true only for an html -> html conversion).

    Returns whatever ``pandoc_markdown2pdf`` (pdf output when
    ``MARKDOWN2PDF_BIN`` is set) or ``pandoc_pandoc`` returns.
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (only real errors are caught, so KeyboardInterrupt and
            # SystemExit still propagate)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            # markdown2pdf is fed markdown, so convert to markdown first
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    use_raw = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, use_raw)
def do_tidy(content=None, file_name=None):
    """
    Run (html) content through tidy and return the cleaned markup.

    >>> res = do_tidy('<span>sdd')

    The input is resolved by ``content_or_file_name(content, file_name)``,
    encoded as utf8 and handed to ``tidylib.tidy_document``; the tidied
    document is returned decoded from utf8.

    Raises ``Exception`` when the input is non-empty but tidy produced
    only whitespace.
    """
    content = content_or_file_name(content, file_name)

    # tidylib works on utf8 bytes here, both in and out
    utf8_source = to_unicode(content).encode('utf8')
    tidied, _errors = tidylib.tidy_document(
        utf8_source,
        options=dict(
            output_xhtml=1,
            add_xml_decl=0,
            indent=0,
            tidy_mark=0,
            logical_emphasis=1,
            wrap=0,
            input_encoding='utf8',
            output_encoding='utf8',
        ),
    )
    tidied = str(tidied)

    # a blank result for non-blank input means tidy gave up
    if content:
        if not tidied.strip():
            raise Exception('Content could not be tidyfied')

    return tidied.decode('utf8')
def do_tidy(content=None, file_name=None):
    """
    Run (html) content through tidy and return the cleaned markup.

    >>> res = do_tidy('<span>sdd')

    The input is resolved by ``content_or_file_name(content, file_name)``,
    encoded as utf8 and handed to ``tidylib.tidy_document``; the tidied
    document is returned decoded from utf8.

    Raises ``Exception`` when the input is non-empty but tidy produced
    only whitespace.
    """
    content = content_or_file_name(content, file_name)

    # tidylib works on utf8 bytes here, both in and out
    utf8_source = to_unicode(content).encode('utf8')
    tidied, _errors = tidylib.tidy_document(
        utf8_source,
        options=dict(
            output_xhtml=1,
            add_xml_decl=0,
            indent=0,
            tidy_mark=0,
            logical_emphasis=1,
            wrap=0,
            input_encoding='utf8',
            output_encoding='utf8',
        ),
    )
    tidied = str(tidied)

    # a blank result for non-blank input means tidy gave up
    if content:
        if not tidied.strip():
            raise Exception('Content could not be tidyfied')

    return tidied.decode('utf8')
def _convert_office_via_libreoffice(input, format):
    """
    Convert an office document through ``convert_oo_to_html_and_xhtml``.

    Returns a ``(converted_input, attachs)`` tuple.
    """
    html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
    if format == 'html':
        # NOTE(review): the extracted body has always been overwritten by
        # the pandoc conversion below, so its result is unused. The call is
        # kept only so that any error it raises still surfaces as before;
        # confirm whether the xhtml body was meant to be returned instead.
        _not_used_css, _not_used_body = extract_css_body(xhtml_input)
    return pandoc_convert(html_input, 'html', format), attachs


def _convert_from_mimetype(input, mime_type, format):
    """
    Convert ``input`` to ``format``, choosing the converter from ``mime_type``.

    input: the document content to convert.
    mime_type: mime type string of ``input``.
    format: target format name passed on to ``pandoc_convert``.

    Returns a ``(converted_input, attachs)`` tuple. ``attachs`` is an empty
    list unless the office-document path produced attachments.
    """
    attachs = []

    ##############################
    # OO/MS-Word
    if mime_type in ['application/vnd.oasis.opendocument.text',
                     'application/msword',
                     'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                     'application/rtf',
                     'text/rtf',
                     ]:
        from cm.cm_settings import USE_ABI
        if USE_ABI:
            from abi_converters import AbiFileConverter
            converter = AbiFileConverter()
            try:
                html_input, attachs = converter.convert_to_html(input)
                html_input = re.sub(r' awml:style="[^"]*"', '', html_input)
                converted_input = pandoc_convert(html_input, 'html', format)
            except Exception:
                # If Abiword fails for any reason, try libreoffice.
                # Only real errors trigger the fallback; KeyboardInterrupt
                # and SystemExit propagate.
                converted_input, attachs = _convert_office_via_libreoffice(
                    input, format)
        else:
            converted_input, attachs = _convert_office_via_libreoffice(
                input, format)

    ##############################
    # latex
    # (checked before the generic 'text/x-' test below, which would
    # otherwise also match 'text/x-tex')
    elif mime_type in ['application/x-latex', 'text/x-tex', ]:
        converted_input = pandoc_convert(to_unicode(input), 'latex', format)

    ##############################
    # anything looks like code: put them into markdown citation
    elif mime_type.startswith('text/x-') or mime_type in ['application/x-ruby', ]:
        converted_input = markdown_from_code(input)

    ##############################
    # html
    elif mime_type in ['text/html', 'application/xhtml+xml']:
        # html input always goes through pandoc_convert, also for
        # html -> html
        converted_input = pandoc_convert(input, 'html', format)

    ##############################
    # anything that looks like text ('text/plain', 'text/english',
    # 'text/enriched') and the default case: assume it's text
    else:
        converted_input = to_unicode(input)

    return converted_input, attachs