Python to_unicode Examples, cm.utils.string_utils.to_unicode Python Examples

Example #1

0

Show file

File: pandoc_converters.py Project: co-ment/comt

def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from `from_format` to `to_format` using pandoc.

    - html input is first cleaned with tidy; if tidy fails the content is
      only converted to unicode and handed to pandoc as is.
    - pdf output goes through markdown2pdf when MARKDOWN2PDF_BIN is set
      (non-markdown input is converted to markdown first).
    - every other conversion is delegated to pandoc_pandoc.

    `full` is forwarded to pandoc_pandoc.
    NOTE(review): `raw` is accepted but never read; the raw flag given to
    pandoc_pandoc is derived from the formats only (html -> html). Confirm
    whether callers expect `raw=True` to be honoured.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (Exception, not a bare except, so that KeyboardInterrupt and
            # SystemExit are not swallowed here)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    use_raw = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, use_raw)

Example #2

0

Show file

File: pandoc_converters.py Project: ovnicraft/comt

def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content between two pandoc formats.

    Steps, in order:
      1. When the source is html, run it through tidy first. A tidy
         failure is tolerated: the content is then just converted to
         unicode and pandoc is tried anyway.
      2. When the target is pdf and MARKDOWN2PDF_BIN is set, convert to
         markdown (unless the source already is markdown) and render the
         result with markdown2pdf.
      3. Otherwise delegate to pandoc_pandoc, forwarding `full`.

    NOTE(review): the `raw` argument is never used; pandoc_pandoc receives
    a raw flag that is true only for html -> html. Verify against callers
    before relying on `raw`.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # Narrowed from a bare except so interpreter-exit signals
            # (KeyboardInterrupt, SystemExit) still propagate.
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    html_to_html = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, html_to_html)

Example #3

0

Show file

File: pandoc_converters.py Project: co-ment/comt

def do_tidy(content=None, file_name=None):
    """
    Tidy (html) content

    The input comes from `content` or `file_name` (resolved by
    content_or_file_name), is encoded to UTF-8 and passed to
    tidylib.tidy_document with options requesting XHTML output.

    Returns the tidied document as a unicode string.
    Raises Exception when non-empty input yields an empty document.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    tidy_options = dict(output_xhtml=1,
                        add_xml_decl=0,
                        indent=0,
                        tidy_mark=0,
                        logical_emphasis=1,
                        wrap=0,
                        input_encoding='utf8',
                        output_encoding='utf8',
                        )
    src = to_unicode(content).encode('utf8')
    # tidy's error report is intentionally ignored; only the document is used
    tidied_content, _errors = tidylib.tidy_document(src, options=tidy_options)
    if content and not tidied_content.strip():
        raise Exception('Content could not be tidyfied')
    # Decode only when tidy handed back bytes: forcing the value through
    # str() first would fail on non-ASCII unicode and is a no-op on bytes.
    if isinstance(tidied_content, bytes):
        tidied_content = tidied_content.decode('utf8')
    return tidied_content

Example #4

0

Show file

File: pandoc_converters.py Project: ovnicraft/comt

def do_tidy(content=None, file_name=None):
    """
    Tidy (html) content

    `content` / `file_name` are resolved to the actual markup by
    content_or_file_name. The markup is sent to tidylib.tidy_document as
    UTF-8 bytes, with options asking for XHTML output without XML
    declaration, indentation, wrapping or tidy meta mark.

    Returns the cleaned document as unicode text.
    Raises Exception if the input was non-empty but tidy produced nothing.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    tidy_options = dict(
        output_xhtml=1,
        add_xml_decl=0,
        indent=0,
        tidy_mark=0,
        logical_emphasis=1,
        wrap=0,
        input_encoding='utf8',
        output_encoding='utf8',
    )
    src = to_unicode(content).encode('utf8')
    # The second tuple member is tidy's error report; it is not used here.
    tidied_content, _errors = tidylib.tidy_document(src, options=tidy_options)
    if content and not tidied_content.strip():
        raise Exception('Content could not be tidyfied')
    # Bytes are decoded as UTF-8 (the requested output encoding); text is
    # returned untouched instead of being round-tripped through str().
    if isinstance(tidied_content, bytes):
        return tidied_content.decode('utf8')
    return tidied_content

Example #5

0

Show file

File: __init__.py Project: co-ment/comt

def _convert_office_via_oo(input, format):
    """Convert an office document with convert_oo_to_html_and_xhtml, then pandoc.

    Returns a (converted_input, attachs) tuple. Every target format,
    including 'html', goes through pandoc_convert on the HTML rendition;
    the XHTML rendition is not used.
    """
    html_input, _xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
    return pandoc_convert(html_input, 'html', format), attachs


def _convert_from_mimetype(input, mime_type, format):
    """
    Convert `input` to `format`, choosing the converter from `mime_type`.

    - office documents (ODT, MS-Word, RTF): Abiword when USE_ABI is set,
      with a fallback to the LibreOffice-based converter if Abiword fails
      for any reason; the LibreOffice-based converter otherwise.
    - latex: pandoc.
    - other 'text/x-*' types and ruby: wrapped with markdown_from_code.
    - html / xhtml: pandoc (html -> html included).
    - anything else, plain text included: returned as unicode, unconverted.

    Returns a (converted_input, attachs) tuple; `attachs` is an empty list
    unless an office converter produced attachments.
    """
    attachs = []

    ##############################
    # OO/MS-Word
    if mime_type in ['application/vnd.oasis.opendocument.text',
                     'application/msword',
                     'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                     'application/rtf',
                     'text/rtf',
                     ]:
        from cm.cm_settings import USE_ABI
        if USE_ABI:
            from abi_converters import AbiFileConverter
            converter = AbiFileConverter()
            try:
                html_input, attachs = converter.convert_to_html(input)
                # strip the Abiword-specific style attributes
                html_input = re.sub(r' awml:style="[^"]*"', '', html_input)
                converted_input = pandoc_convert(html_input, 'html', format)
            except Exception:
                # If Abiword fails for any reason, try libreoffice.
                # (Exception rather than a bare except, so that
                # KeyboardInterrupt / SystemExit are not swallowed.)
                converted_input, attachs = _convert_office_via_oo(input, format)
        else:
            converted_input, attachs = _convert_office_via_oo(input, format)

    ##############################
    # latex
    elif mime_type in ['application/x-latex', 'text/x-tex']:
        converted_input = pandoc_convert(to_unicode(input), 'latex', format)

    ##############################
    # anything looks like code: put them into markdown citation
    elif mime_type.startswith('text/x-') or mime_type in ['application/x-ruby']:
        converted_input = markdown_from_code(input)

    ##############################
    # html: always goes through pandoc, even for html -> html
    elif mime_type in ['text/html', 'application/xhtml+xml']:
        converted_input = pandoc_convert(input, 'html', format)

    ##############################
    # anything that looks like text ('text/plain', 'text/english',
    # 'text/enriched') and the default case: assume it's text
    else:
        converted_input = to_unicode(input)

    return converted_input, attachs