def _decode_zip_filename(self, name):
    """
    Decode a raw zip entry name, guessing its encoding.

    The candidate encodings are tried in order and the first one that
    decodes without ``UnicodeDecodeError`` wins.  If none of them works,
    ``name`` is returned unchanged.

    :param name: entry name as stored in the archive
    :return: the decoded name, or ``name`` itself when nothing matched
    """
    # UTF-8 has to be tried first: it is the only self-validating codec in
    # the list.  cp866 maps every possible byte value to a character, so
    # when it was first in the list the remaining candidates could never be
    # reached and UTF-8 encoded names were turned into mojibake.
    # NOTE(review): for the same reason cp1251 is still effectively
    # unreachable after cp866; it is kept only to preserve the original
    # candidate list.  Assumes to_unicode() raises UnicodeDecodeError on
    # undecodable input - confirm against its implementation.
    for enc in ('utf-8', 'cp866', 'cp1251'):
        try:
            return to_unicode(name, enc)
        except UnicodeDecodeError:
            continue
    return name
def get_source(self, name):
    """
    Return ``(data, name)`` for a file stored in the zip archive.

    ``name`` is joined with ``self.base_path`` when one is set, and ``str``
    names are passed through ``to_unicode(..., 'utf-8')``.  ``self.mapping``
    is consulted first; otherwise the entry is read from ``self.zipfile``
    using the original name recorded in ``self._filenames``.

    :raises FileNotFound: when the name is in neither lookup table
    """
    logging.debug('ZipLoader.get_source %s', name)

    if self.base_path:
        name = path.join(self.base_path, name)
        logging.debug('ZipLoader.get_source has base_path, result name is %s', name)

    self._unpack_zip()

    if isinstance(name, str):
        name = to_unicode(name, 'utf-8')

    # Fast path: content that is already available in the mapping.
    cached = self.mapping.get(name, None)
    if cached is not None:
        return cached, name

    # Slow path: translate to the name used inside the archive and read it.
    original_name = self._filenames.get(name)
    logging.debug('ZipLoader.get_source original_name=%s', original_name)
    if original_name is None:
        raise FileNotFound(name)

    content = self.zipfile.read(original_name)
    logging.debug('ZipLoader.get_source returns %s bytes', len(content))
    return content, name
def encode_header(self, value):
    """
    Encode a header value using ``self.charset``.

    The value is first passed through ``to_unicode``.  String results are
    stripped of trailing whitespace, wrapped in ``Header`` and returned as
    ``str``; anything else is returned as it came back from ``to_unicode``.
    """
    text = to_unicode(value, charset=self.charset)
    if not isinstance(text, string_types):
        return text
    header = Header(text.rstrip(), self.charset)
    return str(header)
def test_after_build():
    """
    Check that a callback assigned to ``Message.after_build`` can add a
    header that shows up in ``Message.as_string()``.
    """
    header_name = 'X-After-Build'

    def mark_built(original_message, built_message):
        # Tag the built message so the assertion below can find the header.
        built_message[header_name] = '1'

    message = emails.Message(**common_email_data())
    message.after_build = mark_built

    rendered = message.as_string()
    print("type of message.as_string() is {0}".format(type(rendered)))
    assert header_name in to_unicode(rendered, 'utf-8')
def parse_name_and_email(obj, encoding='utf-8'):
    """
    Split an address specification into ``(realname, email)``.

    In:  '*****@*****.**' or '"John Smith" <*****@*****.**>'
         or ('John Smith', '*****@*****.**')
    Out: (u'John Smith', u'*****@*****.**')

    A list/tuple is used only when it has exactly two items; any other
    length yields ``(None, None)``.  Strings go through ``parseaddr``.
    ``bytes`` parts are decoded with ``encoding``.  Empty parts are
    returned as ``None``.

    :raises ValueError: when ``obj`` is neither a sequence nor a string
    """
    if isinstance(obj, (list, tuple)):
        pair = tuple(obj) if len(obj) == 2 else ('', '')
    elif isinstance(obj, string_types):
        pair = parseaddr(obj)
    else:
        raise ValueError("Can not parse_name_and_email from %s" % obj)

    realname, email = (
        to_unicode(part, encoding) if isinstance(part, bytes) else part
        for part in pair
    )
    return realname or None, email or None
def start_load_file(self, html, encoding="utf-8"):
    """
    Initialise the loader from in-memory HTML instead of a URL.

    ``html`` may be a file-like object (anything with ``read``) or the
    content itself.  Content that is not already ``text_type`` is converted
    with ``to_unicode`` using ``encoding``.  The URL-related attributes are
    reset to ``None``.
    """
    content = html.read() if hasattr(html, 'read') else html
    if not isinstance(content, text_type):
        content = to_unicode(content, encoding)

    # Collapse CRLF line endings so no stray carriage returns remain.
    self.html_content = content.replace('\r\n', '\n')
    self.html_encoding = encoding  # ?
    self.start_url = self.base_url = self.headers = None
def patch_message(self, message):
    """
    Adjust ``message`` for sending through this test account.

    Some SMTP servers require specific from and to emails, so the sender
    address and the recipient are overridden when ``self.from_email`` /
    ``self.to_email`` are set.  The subject gets ``self.subject_prefix``
    and a ``pyX.Y`` suffix unless it already starts with the prefix, and
    two diagnostic headers (``X-Test-Date``, ``X-Python-Version``) are
    added.

    :param message: message object to modify in place
    :return: the same ``message`` object
    """
    if self.from_email:
        # Keep the original display name, replace only the address.
        message.mail_from = (message.mail_from[0], self.from_email)
    if self.to_email:
        message.mail_to = self.to_email

    # TODO: this code breaks template in subject; fix it
    if not to_unicode(message.subject).startswith(self.subject_prefix):
        # sys.version[:3] truncates "3.10.x" to "3.1"; build the tag from
        # version_info so two-digit minor versions are reported correctly.
        python_tag = 'py%d.%d' % sys.version_info[:2]
        message.subject = " ".join([self.subject_prefix, message.subject, python_tag])

    message._headers['X-Test-Date'] = datetime.datetime.utcnow().isoformat()
    message._headers['X-Python-Version'] = "%s/%s" % (platform.python_version(), platform.platform())
    return message
def sanitize_address(addr, encoding):
    """
    Format an address for use in a message header.

    ``addr`` is either a string (parsed with ``parseaddr``) or a
    ``(name, address)`` pair.  The name is encoded with ``Header``; a
    non-ASCII address gets its local part header-encoded and its domain
    IDNA-encoded.  The result is produced by ``formataddr``.
    """
    if isinstance(addr, string_types):
        addr = parseaddr(to_unicode(addr))
    display_name, address = addr

    # This try-except clause is needed on Python 3 < 3.2.4
    # http://bugs.python.org/issue14291
    try:
        display_name = Header(display_name, encoding).encode()
    except UnicodeEncodeError:
        display_name = Header(display_name, 'utf-8').encode()

    try:
        address.encode('ascii')
    except UnicodeEncodeError:  # IDN
        local, at_sign, domain = address.partition('@')
        if at_sign:
            encoded_local = str(Header(local, encoding))
            ascii_domain = domain.encode('idna').decode('ascii')
            address = '@'.join([encoded_local, ascii_domain])
        else:
            address = Header(address, encoding).encode()

    return formataddr((display_name, address))
def guess_charset(headers, html):
    """
    Guess the character set of an HTML document.

    Three strategies are tried in order:

    1. the ``charset`` parameter of the ``content-type`` entry in
       ``headers``;
    2. the first charset found inside a meta tag, using the module-level
       ``RE_META`` / ``RE_INSIDE_META`` / ``RE_CHARSET`` patterns;
    3. ``chardet`` detection on ``html``.

    :param headers: mapping of response headers, may be empty or ``None``
    :param html: document content
    :return: charset name (whatever ``chardet`` reports in the last case)
    """
    # guess by http headers
    if headers:
        # .get() instead of indexing: a response without a Content-Type
        # header must fall through to the other strategies, not raise
        # KeyError.
        content_type = headers.get('content-type')
        if content_type:
            _, params = cgi.parse_header(content_type)
            header_charset = params.get('charset')
            if header_charset:
                return header_charset

    # guess by html meta
    for meta_tag in RE_META.findall(html):
        for inner in RE_INSIDE_META.findall(meta_tag):
            for charset in RE_CHARSET.findall(inner):
                return to_unicode(charset)

    # guess by chardet
    return chardet.detect(html)['encoding']
def log(self, level, *msg):
    """
    Print a debug line when ``self.DEBUG`` is truthy.

    The line is indented by ``level`` repetitions of a tab plus a space,
    followed by ``- `` and the space-joined messages.  Falsy messages are
    replaced with an empty string before conversion.
    """
    if not self.DEBUG:
        return
    indent = level * '\t '
    text = ' '.join(to_unicode(m or '') for m in msg)
    print('%s- %s' % (indent, text))
def normalize_html(s):
    """
    Return ``s`` converted with ``to_unicode`` and with all whitespace
    removed, by splitting on whitespace and re-joining the pieces.
    """
    text = to_unicode(s)
    pieces = text.split()
    return "".join(pieces)
def get_link(self):
    """
    Return the ``uri`` of the wrapped element, converted with
    ``to_unicode`` when ``self.encoding`` is set.
    """
    link = self.el.uri
    if not self.encoding:
        return link
    return to_unicode(link, self.encoding)
def get_link(self):
    """
    Return the value of the ``LINK_ATTR_NAME`` attribute of the wrapped
    element, converted with ``to_unicode`` when ``self.encoding`` is set.
    """
    link = self.el.get(self.LINK_ATTR_NAME)
    if not self.encoding:
        return link
    return to_unicode(link, self.encoding)
def update_tag(self):
    """
    Write the concatenated stylesheet text into ``self.element``.

    Does nothing when there is no element.  Otherwise the sheets are
    concatenated, the cached stylesheet's ``cssText`` is converted with
    ``to_unicode`` (when non-empty) and stored as the element text; an
    empty result is stored as ``''``.
    """
    if self.element is None:
        return

    self._concatenate_sheets()
    css = self._cached_stylesheet.cssText
    if css:
        css = to_unicode(css, 'utf-8')
    # Any falsy value (before or after conversion) becomes an empty string.
    self.element.text = css or ''
def getView(self, document, sheet, media='all', name=None, styleCallback=None):
    """
    Compute the effective style of every element matched by ``sheet``.

    document
        a DOM document, currently an lxml HTML document
    sheet
        a CSS StyleSheet object, currently cssutils sheet
    media: optional
        TODO: view for which media it should be (currently unused)
    name: optional
        TODO: names of sheets only (currently unused)
    styleCallback: optional
        should return css.CSSStyleDeclaration of inline styles, for html
        a style declaration for ``element@style``. Gets one parameter
        ``element`` which is the relevant DOMElement. Defaults to
        ``self.styleattribute``.

    Rules whose selector cannot be compiled by ``CSSSelector`` are not
    merged into the view; they are collected and inserted as a ``<style>``
    element at the start of the document body (or of the document itself
    when there is no body).

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html
    """
    styleCallback = styleCallback or self.styleattribute

    _unmergable_rules = CSSStyleSheet()

    view = {}
    specificities = {}  # needed temporarily

    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            self.log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            try:
                cssselector = CSSSelector(selector.selectorText)
            except (ExpressionError, NotImplementedError):
                # Selector cannot be evaluated against the document: keep
                # the rule aside so it is emitted verbatim later.
                _unmergable_rules.add(CSSStyleRule(selectorText=selector.selectorText,
                                                   style=rule.style))
                continue

            matching = cssselector.evaluate(document)

            for element in matching:

                if element.tag in self.NONVISUAL_TAGS:
                    continue

                # add styles for all matching DOM elements
                self.log(1, 'ELEMENT', id(element), element.text)

                if element not in view:
                    # add initial empty style declaration
                    view[element] = CSSStyleDeclaration()
                    specificities[element] = {}

                    # and add inline @style if present
                    inlinestyle = styleCallback(element)
                    if inlinestyle:
                        for p in inlinestyle:
                            # set inline style specificity
                            view[element].setProperty(p)
                            specificities[element][p.name] = (1, 0, 0, 0)

                for p in rule.style:
                    # update style declaration
                    if p not in view[element]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[element].setProperty(p.name, p.value, p.priority)
                        specificities[element][p.name] = selector.specificity
                        self.log(2, view[element].getProperty('color'))

                    else:
                        self.log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)

    _unmergable_css = _unmergable_rules.cssText
    if _unmergable_css:
        e = etree.Element('style')
        e.text = to_unicode(_unmergable_css, 'utf-8')
        # Compare with None explicitly: an element without children is
        # falsy, so ``document.find('body') or document`` would skip an
        # existing but empty <body>.
        body = document.find('body')
        if body is None:
            body = document
        body.insert(0, e)  # add <style> right into body

    return view
def update(self):
    """
    Copy ``self.style.cssText`` into the ``style`` attribute of ``self.el``.

    A ``str`` value is passed through ``to_unicode(..., 'utf-8')`` first;
    any other type is stored unchanged.
    """
    css = self.style.cssText
    style_value = to_unicode(css, 'utf-8') if isinstance(css, str) else css
    self.el.set('style', style_value)