def wrapper_func(*args, **kwargs):
    exc_info = kwargs.pop('exc_info', None)
    output = self.prefix + text.decode(text.format(*args, **kwargs))
    wrapped_func(output)
    if exc_info is not None:
        for line in traceback.format_exception(*exc_info):
            wrapped_func(self.prefix + text.chomp(text.decode(line)))
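# For orientation: wrapper_func reads as the closure returned by a
# prefix-logging decorator -- it prefixes each formatted message and, when
# the caller passes exc_info, emits a prefixed traceback line by line.
# Below is a minimal self-contained sketch of the same pattern; the
# make_prefixed factory and plain %-formatting stand in for the project's
# text helpers and are assumptions, not part of the original.
import sys
import traceback

def make_prefixed(prefix, write=sys.stdout.write):
    def wrapper(fmt, *args, **kwargs):
        exc_info = kwargs.pop('exc_info', None)
        write(prefix + (fmt % args) + '\n')
        if exc_info is not None:
            # expand the traceback with the same prefix on every line
            for line in traceback.format_exception(*exc_info):
                write(prefix + line.rstrip('\n') + '\n')
    return wrapper

log = make_prefixed('[memebot] ')
try:
    1 / 0
except ZeroDivisionError:
    log('division failed in %s', 'demo', exc_info=sys.exc_info())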
def get_domain_from_url(url):
    """Return normalized domain portion of the URL"""
    from gruntle.memebot.utils import text
    items = text.decode(urlparse.urlparse(url).netloc).lower().rsplit(u':', 1)[0].split(u'.')[-2:]
    return text.encode(u'.'.join(item for item in (item.strip() for item in items) if item))
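# A quick illustration of what the normalization above does, assuming
# text.decode/text.encode are plain bytes/unicode round-trips: lowercase
# the netloc, drop any :port suffix, keep only the last two dot-separated
# labels, and discard empty ones. Stdlib-only restatement for clarity:
import urlparse

netloc = urlparse.urlparse('http://News.Example.COM:8080/a/b').netloc
labels = netloc.lower().rsplit(u':', 1)[0].split(u'.')[-2:]
print u'.'.join(label for label in labels if label.strip())   # -> example.com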
def get_db_prep_value(self, value):
    """Serialize data before storing to database"""
    if value is not None:
        if not isinstance(value, str):
            value = text.encode(value, settings.TEXT_ENCODING)
        compressed_data = self.engine.compress(value, self.level)
        encoded_data = base64.encodestring(compressed_data)
        parts = self.HEADER, self.engine.id, encoded_data
        joined = ''.join(parts)
        value = text.decode(joined, settings.TEXT_ENCODING)
    return value
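# The stored value above is HEADER + engine.id + base64(compressed). The
# matching deserializer is not part of this excerpt; here is a sketch of
# what it presumably looks like. HEADER, ENGINE_ID, and the choice of zlib
# as the compression engine are all assumptions for illustration.
import base64
import zlib

HEADER = 'Z'      # hypothetical marker, standing in for self.HEADER
ENGINE_ID = 'z'   # hypothetical id for a zlib-backed engine

def db_prep(value, level=6):
    # forward direction, mirroring get_db_prep_value with zlib
    return HEADER + ENGINE_ID + base64.encodestring(zlib.compress(value, level))

def db_restore(stored):
    # inverse direction: strip header and engine id, un-base64, decompress
    assert stored.startswith(HEADER + ENGINE_ID)
    return zlib.decompress(base64.decodestring(stored[len(HEADER + ENGINE_ID):]))

assert db_restore(db_prep('hello world')) == 'hello world'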
def __str__(self):
    from gruntle.memebot.utils.text import encode, decode, format
    return encode(format("%s%s matched blacklist rule %r for host %r",
                         self.blacklist.host,
                         "" if self.url is None else u" (%s)" % decode(self.url),
                         self.blacklist.rule,
                         self.blacklist.match))
def render_node(node):
    """Try to turn a soup node into something resembling plain text"""
    if isinstance(node, (str, unicode)):
        html = node
    else:
        html = node.renderContents()
    html = text.decode(html)
    html = html_tag_re.sub(u' ', html)
    html = decode_entities(html)
    html = html.replace(u'\u00a0', ' ')
    html = whitespace_re.sub(u' ', html)
    html = html.strip()
    return html
def prettify_node(node):
    """Try to turn a soup node into something resembling readable html"""
    if isinstance(node, (str, unicode)):
        html = node
    else:
        html = node.prettify()
    html = text.decode(html)
    html = lang_tag_re.sub(u' ', html)
    html = comment_re.sub(u' ', html)
    html = html.strip()
    lines = html.splitlines()
    lines = (line.rstrip() for line in lines)
    lines = (line for line in lines if line)
    html = u'\n'.join(lines) + u'\n'
    return text.encode(html)
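# Both renderers above lean on module-level regexes (html_tag_re,
# whitespace_re, lang_tag_re, comment_re) and a decode_entities helper
# that this excerpt does not show. Plausible definitions follow, offered
# purely as assumptions consistent with how they are used; in particular,
# reading lang_tag_re as "strip embedded script/style content" is a guess.
import re

html_tag_re = re.compile(r'<[^>]+>')    # strip any tag (render_node)
whitespace_re = re.compile(r'\s+')      # collapse runs of whitespace

lang_tag_re = re.compile(r'<(script|style)[^>]*>.*?</\1>',
                         re.DOTALL | re.IGNORECASE)
comment_re = re.compile(r'<!--.*?-->', re.DOTALL)

def decode_entities(html):
    # minimal stand-in: the real helper presumably covers the full entity
    # table; a few common entities suffice for illustration. &amp; goes
    # last so already-escaped sequences are not double-decoded.
    for entity, char in ((u'&lt;', u'<'), (u'&gt;', u'>'),
                         (u'&quot;', u'"'), (u'&amp;', u'&')):
        html = html.replace(entity, char)
    return html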
def summarize_soup(self, soup):
    """
    Experimental: Try to guess where the main content is and return summary
    text. In its current form, picking news sites & articles at random, it
    seems to have just slightly better than a 50% chance of working right.
    The rest of the time it finds total garbage. Better than nothing for a
    first attempt, I guess ...
    """
    # first, clean up the html a little bit and then remove every tag that
    # isn't a <div> or <p> tag. theory being that the latter two are mostly
    # the ones that define the structure of the document, which is what we
    # are most interested in. well, its structure relative to whatever text
    # nodes are left over.
    html = text.decode(browser.prettify_node(soup.body))
    for orig, name in tag_re.findall(html):
        if name not in ('div', 'p'):
            html = html.replace(orig, u' ')

    # put it back into soup form and perform the main logic here. the idea
    # is to walk each remaining node in the branch and look at the text
    # contents of each of its *siblings*, which would be the paragraphs of
    # the article. this falls apart spectacularly on some sites, and in a
    # very specific way: I think certain menus or sidebars are laid out this
    # way too. since we do a basic word count, if they are large enough,
    # they'll overtake the article. perhaps we can correct for this in
    # another way .. requires investigation.
    soup = BeautifulSoup(html)
    blocks = []
    for node in soup.findAll(True):
        size = 0
        for p in node:
            for el in p:
                if isinstance(el, NavigableString):
                    size += len(el)
        if size:
            blocks.append((size, node))

    # now we have a list of nodes & how much text in "paragraph" form they
    # contain. whatever is the largest we will assume is the intended
    # content, and grab a cleaned up snippet from the front of it.
    if blocks:
        article = browser.render_node(max(blocks)[1])
        words = article[:self.summary_size].split()
        if len(article) > self.summary_size:
            words[-1] = self.summary_cont
        return u' '.join(words)
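# To make the scoring step concrete, here is the same sibling-text
# heuristic run over a toy document. The BeautifulSoup 3 import matches
# the findAll/NavigableString API used above, but which soup version the
# project pins is an assumption.
from BeautifulSoup import BeautifulSoup, NavigableString

toy = u'''<div><p>short menu item</p></div>
<div><p>a much longer paragraph of article text that easily wins
the character count</p><p>plus a second paragraph beside it</p></div>'''

soup = BeautifulSoup(toy)
blocks = []
for node in soup.findAll(True):
    size = 0
    for p in node:
        for el in p:
            if isinstance(el, NavigableString):
                size += len(el)
    if size:
        blocks.append((size, node))

# the block whose children carry the most text is assumed to be the article
print max(blocks)[1]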
def normalize(self, host):
    """Clean up hostname"""
    items = text.decode(host).lower().rsplit(u':', 1)[0].split(u'.')
    return text.encode(u'.'.join(item for item in (item.strip() for item in items) if item))
def prefix(self):
    if self.name:
        return text.decode(text.format('[%s] ', self.name))
    return u''
if length is not None:
    length = int(length)
    complete = read >= int(length)
else:
    complete = (self.max_read == -1) or (read < self.max_read)

content_encoding = response.headers.get('content-encoding')
if content_encoding == 'gzip':
    data = gzip.GzipFile(fileobj=stringio.StringIO(data), mode='r').read()
elif content_encoding == 'deflate':
    data = inflate(data)

raw = data
if response.headers.maintype == 'text':
    data = text.decode(data, response.headers.getparam('charset'))
    data_type = 'text'

if response.headers.subtype == 'html' and BeautifulSoup is not None:
    try:
        with TrapErrors():
            data = BeautifulSoup(data)
            data_type = 'soup'
    except TrapError:
        data_type = 'broken_html'
elif response.headers.subtype == 'xml' and etree is not None:
    try:
        with TrapErrors():
            data = etree.ElementTree(etree.fromstring(data, parser=self.xml_parser))
            data_type = 'etree'
    except TrapError:
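# inflate is not shown in this excerpt. Servers that claim
# Content-Encoding: deflate send either a zlib-wrapped or a raw deflate
# stream, so a robust helper tries both; a sketch under that assumption:
import zlib

def inflate(data):
    try:
        # zlib-wrapped deflate stream first
        return zlib.decompress(data)
    except zlib.error:
        # fall back to a raw deflate stream (negative wbits)
        return zlib.decompress(data, -zlib.MAX_WBITS)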