Example 1
from lxml.html.clean import Cleaner


def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # lxml raises "ValueError: Unicode strings with encoding declaration
        # are not supported. Please use bytes input or XML fragments without
        # declaration." -- fall back to an empty result
        content = u""
    return content
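A minimal usage sketch; the sample markup below is invented for illustration:

html = u'<html><body><script>alert(1);</script><p>Hello, world.</p></body></html>'
print(cleanpage(html))  # the <script> element is killed; the paragraph survives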
Example 2
    def get_text(self, html_content: str):
        # assumes: from bs4 import BeautifulSoup
        #          from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.inline_style = True

        cleaned = cleaner.clean_html(html_content)

        soup = BeautifulSoup(cleaned, 'lxml')
        text_lines = soup.find_all(string=True)

        # merge consecutive non-blank fragments into single lines
        text_lines_merged = []
        merge_str = ''

        text_lines_merged.append(text_lines[0])
        for line in text_lines[1:]:
            if line in ('\n', '', ' '):
                if merge_str != '':
                    text_lines_merged.append(merge_str)
                merge_str = ''
            else:
                merge_str += (' ' + line)
        if merge_str != '':
            text_lines_merged.append(merge_str)  # flush the final run

        # keep only substantial lines (over 128 characters once stripped)
        text_lines_merged = [
            self.strip(line) for line in text_lines_merged
            if len(self.strip(line)) > 128
        ]
        print(' '.join(text_lines_merged))
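A hedged usage sketch: `get_text` calls `self.strip`, so it must live on a class that provides that helper. The `TextExtractor` name and its `strip` implementation here are assumptions for illustration only:

from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner


class TextExtractor:
    def strip(self, line):
        # stand-in for the strip helper the snippet assumes; plain str.strip()
        return line.strip()

    # ... the get_text method shown above goes here ...


# TextExtractor().get_text(html_document)  # prints merged lines over 128 chars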
Example 3
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
import re
import os
import lxml
from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner
from lxml.etree import XMLSyntaxError
from store_helper import StoreHelper
from text_helper import TextHelper

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True  # activate the styles & stylesheet filter
cleaner.inline_style = True
cleaner.whitelist_tags = set()
# remove_tags unwraps an element but keeps its content; kill_tags drops the
# whole subtree (see the sketch after this example)
cleaner.remove_tags = [
    'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'h1', 'h2',
    'h3', 'h4', 'h5', 'span'
]
cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label']


class HTMLHelper(object):
    @staticmethod
    def remove_tag(web_source):
        # strip anything that looks like a tag
        text = re.sub(r'<[^>]+>', '', web_source)
        return text

    @staticmethod
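The `remove_tags` / `kill_tags` distinction above is easy to trip over: `remove_tags` unwraps an element but keeps its text and children, while `kill_tags` deletes the element together with everything inside it. A minimal sketch (the sample markup is invented):

from lxml.html.clean import Cleaner

demo = Cleaner(remove_tags=['b'], kill_tags=['a'])
print(demo.clean_html('<div><b>bold</b> <a href="#">link</a></div>'))
# the text "bold" survives without its <b> wrapper; the <a> and its text are gone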
Example 4
import re

from lxml.html.clean import Cleaner
from parsel import Selector  # scrapy.selector.Selector works the same way


def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    # invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except ValueError:
        # lxml rejects unicode strings that carry an encoding declaration
        # ("Please use bytes input or XML fragments without declaration")
        content = u""
        return content

    page8 = page
    page8 = re.sub(r'\n', ' ', page8)  # remove NL
    # page8 = re.sub(r'\s', '', page8, flags=re.UNICODE)  # blanks -> space
    page8 = re.sub(r'&#13;', ' ', page8)  # remove CR
    page8 = re.sub(r'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(r' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(r' id=".*?"', ' ', page8)
    page8 = re.sub(r' rel=".*?"', ' ', page8)
    page8 = re.sub(r'\[an error occurred while processing this directive\]',
                   ' ', page8)
    page8 = re.sub(r'>\s*?<', '><', page8)  # remove blanks between tags

    # repeatedly drop short inter-tag runs and the empty wrappers left behind
    for _ in range(1, 20):
        page8 = re.sub(r'>.{0,10}<', '><', page8)  # drop runs under 10 chars between tags
        page8 = re.sub(r'<div></div>', ' ', page8)
        page8 = re.sub(r'<p></p>', ' ', page8)
        page8 = re.sub(r'<span></span>', ' ', page8)

    page8 = re.sub(r'\s+', ' ', page8)  # collapse repeated whitespace

    # XPath: pull long text nodes from under content-bearing elements; a
    # stricter variant is kept commented for reference
    # xpath = '//*[((p) or (a) or (b) or (div) or (span))]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'

    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()
    content = u""
    if text:
        for s in text:
            # squash duplicate whitespace (also trims leading/trailing blanks)
            s = ' '.join(s.split())
            # skip short lines: empirically, sentences under 40 chars are rarely
            # article text (mostly repeated titles, authors, dates, etc.)
            if len(s) < 40:
                continue
            content += s
            content += "\n"
    return content
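A quick usage sketch; `lineHashDB` and `encoding` are unused by the body above, so placeholder values suffice, and the sample markup is invented:

sample = u'<html><body><div><p>' + u'A sentence long enough to keep. ' * 10 + u'</p></div></body></html>'
print(scrape({}, sample, 'utf-8'))  # should print the paragraph text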