Example 1
def remove_empty_lines(html):
    key = '%s:remove_empty_lines' % hash(html)
    out = cache.get(key, namespace="filters")
    if out:
        return out

    if '</' in html:
        html = html.strip().replace('\n', '')
        soup = BeautifulSoup(html)
        lines = []
        for element in soup.contents:
            if isinstance(element, Tag):
                if element.text:
                    lines.append(str(element).strip())
                elif 'br' in str(element):
                    lines.append('\n')
            elif isinstance(element, NavigableString):
                lines.append(str(element).strip())
        out = ''.join(lines).strip()
        while '\n\n' in out:
            out = out.replace('\n\n', '\n')
    else:
        out = '\n'.join([line for line in html.split('\n') if line.strip()])
    cache.set(key, out, namespace="filters")
    return out
Example 2
def get_text(msg):
    text = ""
    if msg.is_multipart():
        html = None
        for part in msg.get_payload():
            if part.get_content_charset() is None:
                # chardet expects bytes, so detect on the decoded payload
                charset = chardet.detect(part.get_payload(decode=True) or b'')['encoding']
            else:
                charset = part.get_content_charset()
            if part.get_content_type() == 'text/plain':
                text = decode_str(
                    ' '.join(part.get_payload().split('\n')) + "\n")
            if part.get_content_type() == 'text/html':
                html = str(part.get_payload(decode=True), str(charset),
                           "ignore")
        if html is None:
            return text.strip()
        else:
            msg_data = lxml.html.document_fromstring(html.strip())
            return str("\n".join(etree.XPath("//text()")(msg_data)))
    elif msg.get_content_type() == 'text/plain':
        text = decode_str(' '.join(msg.get_payload().split('\n')) + "\n")
        ret = "\n".join(
            [ll.rstrip() for ll in text.splitlines() if ll.strip()])
        # text = str(msg.get_payload(decode=True),msg.get_content_charset(),'ignore')
        return ret.strip()
Example 3
def remove_empty_lines(html):
  key = '%s:remove_empty_lines' % hash(html)
  out = cache.get(key, namespace="filters")
  if out:
    return out
  
  if '</' in html:
    html = html.strip().replace('\n', '')
    soup = BeautifulSoup(html)
    lines = []
    for element in soup.contents:
      if isinstance(element, Tag):
        if element.text:
          lines.append(str(element).strip())
        elif 'br' in str(element):
          lines.append('\n')
      elif isinstance(element, NavigableString):
        lines.append(str(element).strip())
    out = ''.join(lines).strip()
    while '\n\n' in out:
      out = out.replace('\n\n', '\n')
  else:
    out = '\n'.join([line for line in html.split('\n') if line.strip()])
  cache.set(key, out, namespace="filters")
  return out
Example 4
def strip_wrapping(html):
    """
    Removes the wrapping that might have resulted when using get_html_tree().
    """
    if html.startswith('<div>') and html.endswith('</div>'):
        html = html[5:-6]
    return html.strip()
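A quick usage sketch for the snippet above (the get_html_tree() helper it mentions is not part of this example):

# Hypothetical calls illustrating strip_wrapping():
assert strip_wrapping('<div><p>Hi</p></div>') == '<p>Hi</p>'
assert strip_wrapping('  plain text  ') == 'plain text'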
Example 5
def markdown(url):
  """ http://mikelev.in/2014/01/stripping-html-text-markdown-readability/ """
  html = url
  if checkurl(url):
    html = gethtml(url)
    if not html:
      return None
  html = barebones(html)
  html = html.replace('<p>', "\n")
  html = html.replace('</p>', "")
  html = html.replace('<hr>', "\n---\n")
  html = html.replace('<blockquote>', "\n> ")
  html = html.replace('</blockquote>', "")
  html = html.replace('<h1>', "\n# ")
  html = html.replace('<h2>', "\n## ")
  html = html.replace('<h3>', "\n### ")
  html = html.replace('<h4>', "\n#### ")
  html = html.replace('<h5>', "\n##### ")
  html = html.replace('<h6>', "\n###### ")
  html = html.replace('</h1>', "")
  html = html.replace('</h2>', "")
  html = html.replace('</h3>', "")
  html = html.replace('</h4>', "")
  html = html.replace('</h5>', "")
  html = html.replace('</h6>', "")
  html = html.replace('<li>', "")
  html = html.replace('</li>', "")
  html = html.replace('<ul>', "")
  html = html.replace('<ol>', "")
  html = html.replace('</ul>', "")
  html = html.replace('</ol>', "")
  html = lesslines(html)
  html = html.strip()
  return html
Example 6
def barebones(url):
  html = url
  if checkurl(url):
    html = gethtml(url)
    if not html:
      return None
  # This chops out the following tags AND all the presumably extraneous content in-between.
  for nuketagblock in ['title', 'head']:
    html = deletenode(html, nuketagblock)
  html = bodycopy(html)
  html = stripcomments(html)
  # Same as above, but a second-pass on the usual code-bloating suspects in between body tags.
  for nuketagblock in ['header', 'footer', 'nav', 'script', 'style', 'noscript', 'form', 'object', 'embed', 'select']:
    html = deletenode(html, nuketagblock)
  html = stripparams(html)
  html = lowercasetags(html)
  # html = striplists(html)
  html = stripemptyhtml(html)
  html = stripbr(html)
  # This strips out the following tags, but leaves the in-between content in place.
  for nuketag in ['label', 'section', 'article', 'div', 'span', 'img', 'a', 'b', 'i', 'param', 'table',
    'td', 'tr', 'font', 'title', 'head', 'meta', 'strong', 'em', 'iframe']:
    html = deletetag(html, nuketag)
  html = stripwhitespace(html)
  html = stripcrlf(html)
  html = onetagoneline(html)
  html = convert_html_entities(html)
  html = lesslines(html)
  html = html.replace('\n', ' ')
  html = html.replace('  ', ' ')
  html = html.strip()
  return html
Example 7
def strip_wrapping(html):
    """
    Removes the wrapping that might have resulted when using get_html_tree().
    """
    if html.startswith('<div>') and html.endswith('</div>'):
        html = html[5:-6]
    return html.strip()
Example 8
def extractHtml(html, selector, type='css', dump=False):
    items = []

    if html.strip() != '':
        try:
            try:
                soup = lxml.html.fromstring(html)
            except ValueError:
                soup = lxml.html.fromstring(html.encode('utf-8'))

            if type == 'css':
                for item in soup.cssselect(selector):
                    item = lxml.etree.tostring(item).decode('utf-8').strip()
                    items.append(item)
            elif type == 'xpath':
                result = soup.xpath(selector)
                result = result if isinstance(result, list) else [result]
                for item in result:
                    if isinstance(item, lxml.etree._Element):
                        item = lxml.etree.tostring(item).decode('utf-8')
                    items.append(str(item).strip())

        except Exception as e:
            items.append('ERROR: ' + str(e))

    return items
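A minimal usage sketch, assuming lxml (and its cssselect dependency) is installed; the fragment and selectors below are made up for illustration:

snippet = '<div><p>Hello</p><p class="x">World</p></div>'
print(extractHtml(snippet, 'p.x'))                       # ['<p class="x">World</p>']
print(extractHtml(snippet, '//p/text()', type='xpath'))  # ['Hello', 'World']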
Example 9
def get_decoded_email_body(message_body):
    """ Decode email body.
    Detect character set if the header is not set.
    We try to get text/plain, but if there is not one then fallback to text/html.
    :param message_body: Raw 7-bit message body input e.g. from imaplib. Double encoded in quoted-printable and latin-1
    :return: Message body as unicode string
    """
    msg = message_from_string(message_body)

    text = ""
    if msg.is_multipart():
        html = None
        for part in msg.get_payload():
            #logger.debug( "%s, %s" % (part.get_content_type(), part.get_content_charset()))
            if part.get_content_charset() is None:
                # We cannot know the character set, so return decoded
                # "something"
                text = part.get_payload(decode=True)
                continue

            charset = part.get_content_charset()

            if part.get_content_type() == 'text/plain':
                text = str(part.get_payload(decode=True), str(charset), "ignore")

            if part.get_content_type() == 'text/html':
                html = str(part.get_payload(decode=True), str(charset), "ignore")

        # text defaults to "", so test truthiness to get the documented
        # fallback to the text/html part
        if text:
            return text.strip()
        else:
            return html.strip()
    else:
        text = str(msg.get_payload(decode=True), msg.get_content_charset(), 'ignore')
        return text.strip()
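A minimal usage sketch for the non-multipart path (message_from_string comes from the standard library's email package, as the snippet assumes):

raw = (
    "Content-Type: text/plain; charset=utf-8\n"
    "Content-Transfer-Encoding: 7bit\n"
    "Subject: test\n"
    "\n"
    "Hello, world\n"
)
print(get_decoded_email_body(raw))  # Hello, world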
Example 10
def FindRSSFeeds():

#    blacklist = ( 'Pictures', 'Coffee Break', 'Live mag', 'You mag' )
    blacklist = ()
    feeds = []

    # page to read the list of rss feeds from
    rss_feed_page = "http://www.dailymail.co.uk/home/rssMenu.html"
    html = ukmedia.FetchURL( rss_feed_page )
    assert html.strip() != ''
    soup = BeautifulSoup( html )

    # look for rss icons, step back to find the links.

    for btn in soup.findAll( 'span', {'class':"rss-btn rss"} ):
        a = btn.find('a')
        if a:
            feed_url = urlparse.urljoin( rss_feed_page, a['href'] )
            # could get a more human-readable name, but relative url is good enough
            feed_name = a['href']
            feeds.append( (feed_name,feed_url) )

    assert len(feeds) > 120         # 168 feeds at time of writing

    return feeds
Example 11
def markdown(url):
    """ http://mikelev.in/2014/01/stripping-html-text-markdown-readability/ """
    html = url
    if checkurl(url):
        html = gethtml(url)
        if not html:
            return None
    html = barebones(html)
    html = html.replace('<p>', "\n")
    html = html.replace('</p>', "")
    html = html.replace('<hr>', "\n---\n")
    html = html.replace('<blockquote>', "\n> ")
    html = html.replace('</blockquote>', "")
    html = html.replace('<h1>', "\n# ")
    html = html.replace('<h2>', "\n## ")
    html = html.replace('<h3>', "\n### ")
    html = html.replace('<h4>', "\n#### ")
    html = html.replace('<h5>', "\n##### ")
    html = html.replace('<h6>', "\n###### ")
    html = html.replace('</h1>', "")
    html = html.replace('</h2>', "")
    html = html.replace('</h3>', "")
    html = html.replace('</h4>', "")
    html = html.replace('</h5>', "")
    html = html.replace('</h6>', "")
    html = html.replace('<li>', "")
    html = html.replace('</li>', "")
    html = html.replace('<ul>', "")
    html = html.replace('<ol>', "")
    html = html.replace('</ul>', "")
    html = html.replace('</ol>', "")
    html = lesslines(html)
    html = html.strip()
    return html
Example 12
def format_linebreaks(html=''):
    paragraphs = [
        '<p>%s</p>' % p if not tags_re.match(p) else p
        for p in linebreaks_re.split(html.strip())
        if not whitespace_re.match(p)
    ]
    return ''.join(paragraphs)
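The three regexes come from elsewhere in the source module; a plausible reconstruction (these exact patterns are an assumption, not the original code) would be:

import re

# Assumed definitions -- the original module defines these elsewhere:
tags_re = re.compile(r'^\s*<')          # chunk already starts with a tag
linebreaks_re = re.compile(r'\n{2,}')   # split paragraphs on blank lines
whitespace_re = re.compile(r'^\s*$')    # drop whitespace-only chunks

print(format_linebreaks('First\n\nSecond'))
# -> <p>First</p><p>Second</p>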
Example 13
    def mentions_in_html(self, html: str) -> List[Tuple[str, str]]:
        if not html.strip():
            return []

        return [(a_tag.text, href)
                for a_tag, _, href, _ in lxml.html.iterlinks(html)
                if a_tag.text
                and self.link_is_matrix_to_regex.match(unquote(href.strip()))]
Example 14
def extract(url):
    doc = readability.Document(urllib2.urlopen(url).read())
    title = doc.title()
    html = doc.summary()
    html = lstrip(html, '<html>')
    html = lstrip(html, '<body/>')
    html = rstrip(html, '</html>')
    return '<h1>%s</h1>%s' % (title, html.strip())
Example 15
def shrink_style(cls, style_str, filtered_css_properties, changed_css_properties):
    if not style_str:
        return None
    properties = {}
    for p in style_str.split(";"):
        if p.strip():
            token = p.split(":")
            if len(token) > 1:
                properties[token[0].strip()] = token[1].strip()
    return Utils._shrink_properties(properties, filtered_css_properties, changed_css_properties)
Example 16
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = html_decode(html)
    return html.strip()
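A short usage sketch; html_decode is not shown in the snippet, so it is assumed here to be a thin wrapper around the standard library's html.unescape:

import re
from html import unescape as html_decode  # assumption about the missing helper

print(clean_html('<p>Ben &amp; Jerry</p><p>ice<br/>cream</p>'))
# -> Ben & Jerry
#    ice
#    cream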
Example 17
def shift_headings(html, header_shift=1):
    """ """
    if not html.strip():
        return ""
    dom = lxml.html.fromstring(html)
    for header in dom.cssselect("h1, h2, h3, h4, h5, h6"):
        header.tag = "h{}".format(int(header.tag[1]) + header_shift)
    output = lxml.html.tostring(dom).decode("utf-8")
    if output.startswith("<div>"):
        output = output[5:-6]
    return output
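A usage sketch (only lxml is required); note how the <div> wrapper that lxml adds around multi-element fragments is stripped on the way out:

print(shift_headings('<h1>Title</h1><p>Body</p>'))
# -> <h2>Title</h2><p>Body</p>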
Example 18
    def strip_tags_parser(self, html):
        """
        去除文本中的HTML标签.用到了HTMLParser
        使用示例:
        str_text=strip_tags("<font color=red>hello</font>")

        :return: String
        """
        from HTMLParser import HTMLParser
        html = html.strip('\n')
        html = html.strip('\t')
        html = html.strip(' ')
        html = html.strip()

        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        return '$'.join(result)
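The snippet above is Python 2 only (the HTMLParser module was renamed html.parser in Python 3). A rough Python 3 equivalent of the same idea, as a sketch:

from html.parser import HTMLParser

def strip_tags_py3(html):
    # Collect every text chunk the parser sees between tags.
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append  # override the no-op data handler
    parser.feed(html.strip())
    parser.close()
    return '$'.join(result)

print(strip_tags_py3('<font color=red>hello</font>'))  # hello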
Example 19
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = html_decode(html)
    return html.strip()
Example 20
    def __call__(self,
                 html,
                 input_encoding=None,
                 output_encoding=unicode,
                 return_body=False):

        if not isinstance(html, unicode):
            if not input_encoding:
                raise TypeError('Input data must be unicode')
            html = unicode(html, input_encoding)

        html = html.strip()
        if not html:
            return u''

        root = lxml.html.fromstring(html)

        for name in self.transformation_names:
            method = TRANSFORMATIONS.get(name)
            params = dict(
                context=self.context,
                request=getattr(self.context, 'REQUEST', None),
                destdir=self.destdir,
            )
            if method is None:
                raise ValueError('No transformation "%s" registered' % name)

            ts = time.time()
            argspec = inspect.getargspec(method)
            if isinstance(argspec, tuple):
                args = argspec[0]  # Python 2.4
            else:
                args = argspec.args
            if 'params' in args:
                method(root, params)
            else:
                method(root)

            LOG.info('Transformation %-30s: %3.6f seconds' %
                     (name, time.time() - ts))

        if return_body:
            body = root.xpath('//body')[0]
            html_new = body.text + u''.join([
                lxml.html.tostring(b, encoding=output_encoding) for b in body
            ])

        else:
            html_new = lxml.html.tostring(root, encoding=output_encoding)
            if html_new.startswith('<div>') and html_new.endswith('</div>'):
                html_new = html_new[5:-6].strip()

        return html_new.strip()
Example 21
def termSearch(userinput):
    ## URL Modification for user term search
    url = "http://tlwi.w3-969.ibm.com/standards/terminology/cgi-bin/lookup?ug=corporate&term=" + userinput + "&submit=Search&source=en&target_switch=none&template=simple&db_name=LOGOS&11=main+form&11=acronym~abbreviation&11=prohibited"
    ##Set the URL
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "lxml")
    ## Get Results
    soup = soup.find_all('ol')
    ## Delete Reference Links and Convert
    clean = bleach.clean(soup,
                         tags=['ol', 'br', 'li', 'p'],
                         strip=True,
                         strip_comments=True)
    test = str(clean)
    html = html2text.html2text(test)
    html = html.replace("_", "\n")
    html = html.strip("[")
    html = html.strip("]\n")
    ## Return String
    if html == "":
        html = "No results have been found. Please try a different query."
    return html
Example 22
File: base.py Project: feihong/shu
def write_links_page(self):
    output_file = self._cache_dir / 'links.html'
    with output_file.open('w') as fp:
        fp.write('<meta charset="utf-8"><ul>\n')
        for url, filename in self._files.items():
            path = str(self._cache_dir / filename)
            tree = html5lib.parse(open(path, 'rb'), namespaceHTMLElements=False)
            title = tree.find('.//title').text
            html = """<li>
                <a href="%s">%s</a> - %s - %s
            </li>""" % (filename, url, title, filename)
            fp.write(html.strip() + '\n')
        fp.write('</ul>')
Example 23
def get_innerhtml(self, xpath):
    htmls = []
    for element in self.html.xpath(xpath):
        html = ''
        if element.text:
            html += element.text
        for child in element.getchildren():
            if hasattr(self, 'encoding'):
                html += lxml.html.tostring(
                    child, encoding=self.encoding).decode()
            else:
                html += lxml.html.tostring(child).decode()
        htmls.append(html.strip())
    return htmls
Example 24
    async def read(self) -> str:
        """Read and convert to HTML the document located at :attr:`path`.

        :raise OSError:
            if the reader's :attr:`~program` cannot convert the document.
        :raise UnicodeDecodeError:
            when the conversion's result is invalid.
        """
        assert self.path is not None, "Open a file before trying to read it"

        cmdline = self.arguments.format(path=self.path)
        # Can raise OSError or UnicodeDecodeError.
        html = await self.run(cmdline)

        return html.strip()
Example 25
def clean_html(html):
    """
    Remove HTML markup from the given string. Borrowed from nltk.
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()
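One detail worth checking: the two re.sub(r"  ", " ") passes only halve each run of spaces, so runs longer than four spaces are not fully collapsed. A quick demonstration:

print(repr(clean_html('a        b')))  # 8 spaces in
# -> 'a  b'                            # 2 spaces out, not 1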
Example 26
def clean_html(html):
    """
    Remove HTML markup from the given string. Borrowed from nltk.
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()
Example 27
def content_strip_tags(self, html):
    """
    Strips the HTML markup from the publication content and returns plain text.
    """
    # Collapse runs of two or more spaces into one
    html = re.sub(r' {2,}', ' ', html)
    # Remove newline and tab characters
    html = re.sub(r'[\r\n\t]+', '', html)
    # Remove whitespace between tags
    html = re.sub(r'(</\w+>)\s+(<\w+[^>]*>)', r'\1\2', html)
    # Rewrite links as: text [url]
    html = re.sub(r'''<a[^>]*href=['"]([^'"]+)['"][^>]*>([^<]+)</a>''', r'\2 [\1]', html)
    html = re.sub(r'''<a[^>]*>([^<]*)</a>''', r'\1', html)
    # Append a line break after headings and paragraphs
    html = re.sub(r'''<(h1|h2|h3|h4|h5|h6|p)[^>]*>([^<]*)</\1>''', r'\2' + os.linesep * 2, html)
    html = HTMLParser().unescape(html)
    return html.strip()
Example 28
def get_static_urls(url):
    html = urlopen(url).read()
    parser = etree.HTMLParser()
    tree = etree.fromstring(html.strip(), parser).getroottree()
    page = tree.getroot()
    for link in CSSSelector('link')(page):
        yield wrap(url, link.attrib['href'])
        
    for link in CSSSelector('script')(page):
        
        try:
            src = wrap(url, link.attrib['src'])
            if not src.count('googleapis'):
                yield wrap(url, link.attrib['src'])
        except KeyError:
            # block
            pass
Example 29
def get_static_urls(url):
    html = urlopen(url).read()
    parser = etree.HTMLParser()
    tree = etree.fromstring(html.strip(), parser).getroottree()
    page = tree.getroot()
    for link in CSSSelector('link')(page):
        yield wrap(url, link.attrib['href'])

    for link in CSSSelector('script')(page):

        try:
            src = wrap(url, link.attrib['src'])
            if not src.count('googleapis'):
                yield wrap(url, link.attrib['src'])
        except KeyError:
            # block
            pass
Example 30
    def filter(
        self,
        html: str,
        inline: bool = False,
        outgoing: bool = False,
        display_name_mentions: Optional[Dict[str, str]] = None,
    ) -> str:
        """Filter and return HTML."""

        mentions = display_name_mentions

        sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
        html = sanit.sanitize(html).rstrip("\n")

        if not html.strip():
            return html

        tree = etree.fromstring(
            html,
            parser=etree.HTMLParser(encoding="utf-8"),
        )

        for a_tag in tree.iterdescendants("a"):
            self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

            if not outgoing:
                self._matrix_to_links_add_classes(a_tag)

        html = etree.tostring(tree, encoding="utf-8", method="html").decode()
        html = sanit.sanitize(html).rstrip("\n")

        if outgoing:
            return html

        # Client-side modifications

        html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

        if not inline:
            return html

        return self.inline_quote_regex.sub(
            r'\1<span class="quote">\2</span>',
            html,
        )
Example 31
    def __call__(self, html, input_encoding=None, output_encoding=unicode, return_body=False):

        if not isinstance(html, unicode):
            if not input_encoding:
                raise TypeError('Input data must be unicode')
            html = unicode(html, input_encoding)

        html = html.strip()
        if not html:
            return u''

        root = lxml.html.fromstring(html)

        for name in self.transformation_names:
            method = TRANSFORMATIONS.get(name)
            params = dict(context=self.context,
                          request=getattr(self.context, 'REQUEST', None),
                          destdir=self.destdir,
                          )
            if method is None:
                raise ValueError('No transformation "%s" registered' % name)

            ts = time.time()
            argspec = inspect.getargspec(method)
            if isinstance(argspec, tuple):
                args = argspec[0] # Python 2.4
            else:
                args = argspec.args
            if 'params' in args:
                method(root, params)
            else:
                method(root)

            LOG.info('Transformation %-30s: %3.6f seconds' % (name, time.time()-ts))

        if return_body:
            body = root.xpath('//body')[0]
            html_new = body.text + u''.join([lxml.html.tostring(b, encoding=output_encoding) for b in body])

        else:
            html_new = lxml.html.tostring(root, encoding=output_encoding)
            if html_new.startswith('<div>') and html_new.endswith('</div>'):
                html_new = html_new[5:-6].strip()

        return html_new.strip()
Example 32
    def __del_html_tag(self, html):
        '''
        @summary:
        ---------
        @param html:
        @param save_useful_tag: keep useful tags, such as img and p
        ---------
        @result:
        '''
        html = self.__replace_str(html, u'(?i)<script(.|\n)*?</script>')
        html = self.__replace_str(html, u'(?i)<style(.|\n)*?</style>')
        html = self.__replace_str(html, u'(?i)<\/?(span|section|font|em)[^<>]*?>')
        html = self.__replace_str(html, u'(?i)<div[^<>]+?(display:.?none|comment|measure).*?>([\s\S]*?)<\/div>')
        html = self.__replace_str(html, u'<!--(.|\n)*?-->')
        html = self.__replace_str(html, u'(?!&[a-z]+=)&[a-z]+;?', ' ')

        html = self.__replace_str(html, '[\f\r\t\v]')  # remove whitespace other than spaces and newlines
        html = html.strip()
        return html
Example 33
def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    tree = lxml.html.fromstring(html)
    res = tree.xpath("//input[@name='q' and @type='text']")
    if not res:
        linked_data = None
    else:
        linked_data = {"title": res[0].value}
    CACHE[path] = linked_data
    return linked_data
Example 34
def clean_html(html):
	"""
	Copied from NLTK package.
	Remove HTML markup from the given string.
	:param html: the HTML string to be cleaned
	:type html: str
	:rtype: str
	"""
	# see http://stackoverflow.com/questions/26002076/python-nltk-clean-html-not-implemented

	# First we remove inline JavaScript/CSS:
	cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
	# Then we remove html comments. This has to be done before removing regular
	# tags since comments can contain '>' characters.
	cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
	# Next we can remove the remaining tags:
	cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
	# Finally, we deal with whitespace
	cleaned = re.sub(r"&nbsp;", " ", cleaned)
	cleaned = re.sub(r"  ", " ", cleaned)
	cleaned = re.sub(r"  ", " ", cleaned)
	return cleaned.strip()
Example 35
def clean_html(html):
    """
    Copied from NLTK package.
    Remove HTML markup from the given string.
    :param html: the HTML string to be cleaned
    :type html: str
    :rtype: str
    """
    # see http://stackoverflow.com/questions/26002076/python-nltk-clean-html-not-implemented

    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()
Example 36
    def get_text(self, msg):

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove script tags
        cleaner.style = True  # Remove css

        body = ""
        for part in msg.walk():
            # Check mime type for text
            if part.get_content_maintype() == "text":
                body += part.get_payload()

        # Get the transfer encoding used
        transfer_encoding = msg.get("Content-transfer-encoding")
        if transfer_encoding:
            # Decode transfer encoding header
            transfer_encoding = self.decode_header(transfer_encoding)
            if transfer_encoding.lower() == "base64":
                # Base64 transfer decoding
                body = base64.b64decode(body)

        # Quoted-printable transfer decoding
        html = quopri.decodestring(body).decode("utf-8", "ignore")

        # Check if body is empty
        if html.strip() != "":
            # Create document from html
            document = lxml.html.document_fromstring(html)

            # Clean html and get text
            text = "\n".join(
                etree.XPath("//text()")(cleaner.clean_html(document)))
            # Remove blank lines from text
            text = "\n".join(
                filter(lambda x: not re.match(r'^\s*$', x), text.splitlines()))

            return text

        return ""
Example 37
def barebones(url):
    html = url
    if checkurl(url):
        html = gethtml(url)
        if not html:
            return None
    # This chops out the following tags AND all the presumably extraneous content in-between.
    for nuketagblock in ['title', 'head']:
        html = deletenode(html, nuketagblock)
    html = bodycopy(html)
    html = stripcomments(html)
    # Same as above, but a second-pass on the usual code-bloating suspects in between body tags.
    for nuketagblock in [
            'header', 'footer', 'nav', 'script', 'style', 'noscript', 'form',
            'object', 'embed', 'select'
    ]:
        html = deletenode(html, nuketagblock)
    html = stripparams(html)
    html = lowercasetags(html)
    # html = striplists(html)
    html = stripemptyhtml(html)
    html = stripbr(html)
    # This strips out the following tags, but leaves the in-between content in place.
    for nuketag in [
            'label', 'section', 'article', 'div', 'span', 'img', 'a', 'b', 'i',
            'param', 'table', 'td', 'tr', 'font', 'title', 'head', 'meta',
            'strong', 'em', 'iframe'
    ]:
        html = deletetag(html, nuketag)
    html = stripwhitespace(html)
    html = stripcrlf(html)
    html = onetagoneline(html)
    html = convert_html_entities(html)
    html = lesslines(html)
    html = html.replace('\n', ' ')
    html = html.replace('  ', ' ')
    html = html.strip()
    return html
Example 38
    html = html.replace("_", "\n")
    html = html.strip("[")
    html = html.strip("]\n")
    ## Return String
    if html == "":
        html = "No results have been found. Please try a different query."
    return html


if __name__ == "__main__":
    print("Please enter the user input")
    userinput = input()
    url = "http://tlwi.w3-969.ibm.com/standards/terminology/cgi-bin/lookup?ug=corporate&term=" + userinput + "&submit=Search&source=en&target_switch=none&template=simple&db_name=LOGOS&11=main+form&11=acronym~abbreviation&11=prohibited"
    ##Set the URL
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, "lxml")
    soup = soup.find_all('ol')
    #Delete Reference Links
    clean = bleach.clean(soup,
                         tags=['ol', 'br', 'li', 'p'],
                         strip=True,
                         strip_comments=True)
    test = str(clean)
    html = html2text.html2text(test)
    html = html.replace("_", "\n")
    html = html.strip("[")
    html = html.strip("]\n")
    print(html)
    if html == "":
        print("EMPTY")
Example 39
def format_linebreaks(html=''):
    paragraphs = ['<p>%s</p>' % p if not tags_re.match(p) else p
                  for p in linebreaks_re.split(html.strip())
                  if not whitespace_re.match(p)]
    return ''.join(paragraphs)
Example 40
def clean_broken_html(self, html):
    return html.strip().replace("&nbsp", "")