Esempio n. 1
0
def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
Esempio n. 2
0
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
Esempio n. 3
0
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
Esempio n. 5
0
def f_parse(args):
    def isAlphabet(word):

        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)

                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()

                if (len(parsed_text) > MINSIZE_CHARSDOC):
                    parsed_text = parsed_text.lower()

                    tokenizer = RegexpTokenizer(r'\w+')

                    # create English stop words list
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    #p_stemmer = PorterStemmer()

                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if not i in en_stop]
                    stopped_tokens2 = [
                        i for i in stopped_tokens1 if not i in it_stop
                    ]
                    stopped_tokens3 = [
                        i for i in stopped_tokens2 if not i in sp_stop
                    ]
                    stopped_tokens4 = [
                        i for i in stopped_tokens3 if not i in ge_stop
                    ]
                    stopped_tokens5 = [
                        i for i in stopped_tokens4 if not i in fr_stop
                    ]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                #check if the word has the alphabet character
                                if isAlphabet(word):
                                    ret.append(word)
            except:
                print('Exception : Document empty')
    return [loc, ret]
Esempio n. 6
0
LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
        cleaning_list.append('table')
    'DATE_ORDER': 'DMY'
}

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try: