Code Example #1
File: __init__.py Project: Havvy/madcow
 def wrapper_func(*args, **kwargs):
     exc_info = kwargs.pop('exc_info', None)
     output = self.prefix + text.decode(text.format(*args, **kwargs))
     wrapped_func(output)
     if exc_info is not None:
         for line in traceback.format_exception(*exc_info):
             wrapped_func(self.prefix + text.chomp(text.decode(line)))
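
For reference, a minimal self-contained sketch of the same pattern: make_wrapper, sink, and the prefix value are hypothetical stand-ins for the surrounding object's prefix and the text helpers (text.format, text.decode, text.chomp), which are not shown here.

import sys
import traceback

def make_wrapper(wrapped_func, prefix=u'[log] '):
    """Return a function that formats its arguments, prepends a prefix,
    and appends a formatted traceback when exc_info is passed."""
    def wrapper_func(*args, **kwargs):
        exc_info = kwargs.pop('exc_info', None)
        output = prefix + (args[0] % args[1:] if args[1:] else args[0])
        wrapped_func(output)
        if exc_info is not None:
            for line in traceback.format_exception(*exc_info):
                wrapped_func(prefix + line.rstrip())
    return wrapper_func

def sink(line):
    sys.stdout.write(line + u'\n')

log = make_wrapper(sink)
try:
    1 / 0
except ZeroDivisionError:
    log(u'division failed: %s', u'zero denominator', exc_info=sys.exc_info())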
Code Example #2
File: __init__.py Project: seunboi4u/madcow
 def wrapper_func(*args, **kwargs):
     exc_info = kwargs.pop('exc_info', None)
     output = self.prefix + text.decode(
         text.format(*args, **kwargs))
     wrapped_func(output)
     if exc_info is not None:
         for line in traceback.format_exception(*exc_info):
             wrapped_func(self.prefix +
                          text.chomp(text.decode(line)))
Code Example #3
File: __init__.py Project: seunboi4u/madcow
def get_domain_from_url(url):
    """Return normalized domain portion of the URL"""
    from gruntle.memebot.utils import text
    items = text.decode(urlparse.urlparse(url).netloc).lower().rsplit(
        u':', 1)[0].split(u'.')[-2:]
    return text.encode(u'.'.join(item for item in (item.strip()
                                                   for item in items) if item))
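
To illustrate what get_domain_from_url computes (lowercase the netloc, drop any port, keep the last two labels), here is a small self-contained sketch using only the standard library; the project's text.decode/text.encode round-trip is omitted for brevity.

import urlparse  # urllib.parse on Python 3

def domain_of(url):
    # lowercase the host, strip a trailing ":port", keep the last two dot-separated labels
    host = urlparse.urlparse(url).netloc.lower().rsplit(u':', 1)[0]
    items = host.split(u'.')[-2:]
    return u'.'.join(item.strip() for item in items if item.strip())

print(domain_of(u'http://WWW.Example.COM:8080/some/path'))   # example.com
print(domain_of(u'https://news.bbc.co.uk/article'))          # co.uk, not bbc.co.uk

Note the second case: because only the last two labels are kept, multi-label public suffixes such as co.uk collapse to the suffix itself.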
Code Example #4
File: fields.py Project: Havvy/madcow
 def get_db_prep_value(self, value):
     """Serialize data before storing to database"""
     if value is not None:
         if not isinstance(value, str):
             value = text.encode(value, settings.TEXT_ENCODING)
         compressed_data = self.engine.compress(value, self.level)
         encoded_data = base64.encodestring(compressed_data)
         parts = self.HEADER, self.engine.id, encoded_data
         joined = ''.join(parts)
         value = text.decode(joined, settings.TEXT_ENCODING)
         return value
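
What ends up in the database is therefore a text value made of a short header, the compression engine's id, and the base64-encoded compressed payload. A rough standalone equivalent, assuming zlib as the engine; the HEADER and ENGINE_ID values here are made up, and the real ones live on the field and engine classes, which are not shown.

import base64
import zlib

HEADER = 'Z:'        # hypothetical stand-in for self.HEADER
ENGINE_ID = 'zlib:'  # hypothetical stand-in for self.engine.id

def serialize(value, level=9, encoding='utf-8'):
    """Compress, base64-encode, and frame a value the way the field does (Python 2: str means bytes)."""
    if not isinstance(value, str):
        value = value.encode(encoding)
    compressed = zlib.compress(value, level)
    encoded = base64.encodestring(compressed)   # encodebytes on Python 3
    return (HEADER + ENGINE_ID + encoded).decode(encoding)

print(serialize(u'hello world'))   # something like Z:zlib:eJzL...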
Code Example #5
File: exceptions.py Project: Jzarecta/madcow
    def __str__(self):
        from gruntle.memebot.utils.text import encode, decode, format

        return encode(
            format(
                "%s%s matched blacklist rule %r for host %r",
                self.blacklist.host,
                ("" if (self.url is None) else (u" (%s)" % decode(self.url))),
                self.blacklist.rule,
                self.blacklist.match,
            )
        )
Code Example #6
def render_node(node):
    """Try to turn a soup node into something resembling plain text"""
    if isinstance(node, (str, unicode)):
        html = node
    else:
        html = node.renderContents()
    html = text.decode(html)
    html = html_tag_re.sub(u' ', html)
    html = decode_entities(html)
    html = html.replace(u'\u00a0', ' ')
    html = whitespace_re.sub(u' ', html)
    html = html.strip()
    return html
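
render_node leans on helpers defined elsewhere in the module (html_tag_re, whitespace_re, decode_entities). A self-contained approximation of the same pipeline, with guessed regexes and a two-entity stand-in for decode_entities, so the behaviour is only illustrative:

import re

html_tag_re = re.compile(r'<[^>]*>')            # assumption: anything that looks like a tag
whitespace_re = re.compile(r'\s+', re.UNICODE)

def render_text(html):
    """Rough plain-text rendering of an HTML fragment."""
    html = html_tag_re.sub(u' ', html)
    html = html.replace(u'&nbsp;', u' ').replace(u'&amp;', u'&')   # stand-in for decode_entities
    html = html.replace(u'\u00a0', u' ')
    html = whitespace_re.sub(u' ', html)
    return html.strip()

print(render_text(u'<p>Hello&nbsp;<b>world</b> &amp; friends</p>'))   # Hello world & friends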
Code Example #7
def prettify_node(node):
    """Try to turn a soup node into something resembling readable html"""
    if isinstance(node, (str, unicode)):
        html = node
    else:
        html = node.prettify()
    html = text.decode(html)
    html = lang_tag_re.sub(u' ', html)
    html = comment_re.sub(u' ', html)
    html = html.strip()
    lines = html.splitlines()
    lines = (line.rstrip() for line in lines)
    lines = (line for line in lines if line)
    html = u'\n'.join(lines) + u'\n'
    return text.encode(html)
Code Example #8
File: html.py Project: Havvy/madcow
    def summarize_soup(self, soup):
        """
        Experimental: Try to guess where the main content is and return summary text.
        In its current form, picking news sites & articles at random, it seems to have
        just slightly better than a 50% chance of working right. The rest of the time it
        finds total garbage. Better than nothing for a first attempt, I guess ...
        """

        # first, clean up the html a little bit and then remove every tag
        # that isn't a <div> or <p> tag. theory being that these two are mostly
        # the ones that define the structure of the document, which is what we
        # are most interested in. well, its structure relative to whatever text nodes
        # are left over.
        html = text.decode(browser.prettify_node(soup.body))
        for orig, name in tag_re.findall(html):
            if name not in ('div', 'p'):
                html = html.replace(orig, u' ')

        # put it back into soup form and perform the main logic here. the idea
        # is to walk each remaining node in the branch and look at the text contents of
        # each of its *siblings*, the assumption being that these are the paragraphs of the
        # article. this falls apart spectacularly on some sites, and in a very specific way: I
        # think certain menus or sidebars are laid out this way too. since we do a basic word
        # count, if they are large enough, they'll overtake the article. perhaps we can correct
        # for this in another way... requires investigation.
        soup = BeautifulSoup(html)
        blocks = []
        for node in soup.findAll(True):
            size = 0
            for p in node:
                for el in p:
                    if isinstance(el, NavigableString):
                        size += len(el)
            if size:
                blocks.append((size, node))

        # now we have a list of nodes & how much text in "paragraph" form they contain. whatever is
        # the largest we will assume is the intended content, and grab a cleaned up snippet from
        # the front of it.
        if blocks:
            article = browser.render_node(max(blocks)[1])
            words = article[:self.summary_size].split()
            if len(article) > self.summary_size:
                words[-1] = self.summary_cont
            return u' '.join(words)
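
The core of the heuristic, reduced to a standalone snippet: for every tag, add up the raw text carried by its child elements and keep the tag with the largest total. This sketch uses bs4 (the project targets the older BeautifulSoup 3 API) and simplifies the traversal, so treat it as an illustration rather than the module's exact behaviour.

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

def biggest_text_block(html):
    """Return the tag whose child elements carry the most plain text."""
    soup = BeautifulSoup(html, 'html.parser')
    blocks = []
    for node in soup.find_all(True):
        size = 0
        for p in node.children:
            if isinstance(p, Tag):
                size += sum(len(el) for el in p.children if isinstance(el, NavigableString))
        if size:
            blocks.append((size, node))
    return max(blocks, key=lambda block: block[0])[1] if blocks else None

html = u'<div><p>nav</p></div><div><p>a much longer paragraph of article text</p></div>'
print(biggest_text_block(html).get_text(u' ', strip=True))   # a much longer paragraph of article text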
Code Example #9
File: html.py Project: seunboi4u/madcow
    def summarize_soup(self, soup):
        """
        Experimental: Try to guess where the main content is and return summary text.
        In its current form, picking news sites & articles at random, it seems to have
        just slightly better than a 50% chance of working right. The rest of the time it
        finds total garbage. Better than nothing for a first attempt, I guess ...
        """

        # first, clean up the html a little bit and then remove every tag
        # that isn't a <div> or <p> tag. theory being that these two are mostly
        # the ones that define the structure of the document, which is what we
        # are most interested in. well, its structure relative to whatever text nodes
        # are left over.
        html = text.decode(browser.prettify_node(soup.body))
        for orig, name in tag_re.findall(html):
            if name not in ('div', 'p'):
                html = html.replace(orig, u' ')

        # put it back into soup form and perform the main logic here. the idea
        # is to walk each remaining node in the branch and look at the text contents of
        # each of its *siblings*, the assumption being that these are the paragraphs of the
        # article. this falls apart spectacularly on some sites, and in a very specific way: I
        # think certain menus or sidebars are laid out this way too. since we do a basic word
        # count, if they are large enough, they'll overtake the article. perhaps we can correct
        # for this in another way... requires investigation.
        soup = BeautifulSoup(html)
        blocks = []
        for node in soup.findAll(True):
            size = 0
            for p in node:
                for el in p:
                    if isinstance(el, NavigableString):
                        size += len(el)
            if size:
                blocks.append((size, node))

        # now we have a list of nodes & how much text in "paragraph" form they contain. whatever is
        # the largest we will assume is the intended content, and grab a cleaned up snippet from
        # the front of it.
        if blocks:
            article = browser.render_node(max(blocks)[1])
            words = article[:self.summary_size].split()
            if len(article) > self.summary_size:
                words[-1] = self.summary_cont
            return u' '.join(words)
Code Example #10
File: blacklist.py Project: Havvy/madcow
 def normalize(self, host):
     """Clean up hostname"""
     items = text.decode(host).lower().rsplit(u':', 1)[0].split(u'.')
     return text.encode(u'.'.join(item for item in (item.strip() for item in items) if item))
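
Unlike get_domain_from_url above, normalize keeps every label of the hostname; only case and an optional port are stripped. The same steps on a literal host string, for illustration:

host = u'WWW.Example.COM:8080'
items = host.lower().rsplit(u':', 1)[0].split(u'.')
print(u'.'.join(item.strip() for item in items if item.strip()))   # www.example.com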
Code Example #11
File: __init__.py Project: Havvy/madcow
def get_domain_from_url(url):
    """Return normalized domain portion of the URL"""
    from gruntle.memebot.utils import text
    items = text.decode(urlparse.urlparse(url).netloc).lower().rsplit(u':', 1)[0].split(u'.')[-2:]
    return text.encode(u'.'.join(item for item in (item.strip() for item in items) if item))
Code Example #12
File: __init__.py Project: Havvy/madcow
 def prefix(self):
     if self.name:
         return text.decode(text.format('[%s] ', self.name))
     return u''
Code Example #13
        if length is not None:
            length = int(length)
            complete = read >= int(length)
        else:
            complete = (self.max_read == -1) or (read < self.max_read)

        content_encoding = response.headers.get('content-encoding')
        if content_encoding == 'gzip':
            data = gzip.GzipFile(fileobj=stringio.StringIO(data), mode='r').read()
        elif content_encoding == 'deflate':
            data = inflate(data)

        raw = data

        if response.headers.maintype == 'text':
            data = text.decode(data, response.headers.getparam('charset'))
            data_type = 'text'

        if response.headers.subtype == 'html' and BeautifulSoup is not None:
            try:
                with TrapErrors():
                    data = BeautifulSoup(data)
                    data_type = 'soup'
            except TrapError:
                data_type = 'broken_html'
        elif response.headers.subtype == 'xml' and etree is not None:
            try:
                with TrapErrors():
                    data = etree.ElementTree(etree.fromstring(data, parser=self.xml_parser))
                    data_type = 'etree'
            except TrapError:
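
The fragment calls an inflate helper that is not shown. A typical implementation for deflate-encoded HTTP bodies, offered here as an assumption about what the project's helper does rather than a copy of it:

import zlib

def inflate(data):
    """Decompress a deflate-encoded response body.

    Some servers send a raw deflate stream without the zlib header, so fall
    back to a negative window size when the standard decompress fails.
    """
    try:
        return zlib.decompress(data)
    except zlib.error:
        return zlib.decompress(data, -zlib.MAX_WBITS)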
Code Example #14
File: blacklist.py Project: seunboi4u/madcow
 def normalize(self, host):
     """Clean up hostname"""
     items = text.decode(host).lower().rsplit(u':', 1)[0].split(u'.')
     return text.encode(u'.'.join(item for item in (item.strip()
                                                    for item in items)
                                  if item))
Code Example #15
File: __init__.py Project: seunboi4u/madcow
 def prefix(self):
     if self.name:
         return text.decode(text.format('[%s] ', self.name))
     return u''
Code Example #16
File: exceptions.py Project: seunboi4u/madcow
 def __str__(self):
     from gruntle.memebot.utils.text import encode, decode, format
     return encode(format('%s%s matched blacklist rule %r for host %r',
                          self.blacklist.host, ('' if (self.url is None) else (u' (%s)' % decode(self.url))),
                          self.blacklist.rule, self.blacklist.match))