Example 1
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # ValueError: Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration.
        content = u""
    return content
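A minimal usage sketch for cleanpage (the sample markup is made up for illustration); passing bytes rather than a decoded unicode string avoids the encoding-declaration ValueError handled in the except branch:

sample = b"<html><head><script>alert(1)</script></head><body><p>Some text</p><img src='x.png'></body></html>"
print(cleanpage(sample))
# the <img> and <script> elements should be stripped from the output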
Example 2
def html2content(html, allowed_tags=["a", "abbr", "article", "aside",
                                     "b", "base", "blockquote", "body",
                                     "br", "caption", "cite", "code", "col", "colgroup",
                                     "dd", "del", "dfn", "dl", "dt",
                                     "em", "embed", "figcaption", "figure", "footer",
                                     "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
                                     "i", "img",
                                     "li",
                                     "map", "mark", "math", "meta", "meter",
                                     "nav", "noscript",
                                     "object", "ol", "optgroup", "option", "output",
                                     "p", "param", "pre", "progress",
                                     "q", "rp", "rt", "ruby",
                                     "s", "samp", "section", "small", "source", "span", "strong", "sub", "sup", "svg",
                                     "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr", "track",
                                     "u", "ul",
                                     "var", "video",
                                     "wbr"]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False
    return cleaner.clean_html(html)
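A brief usage sketch for html2content (the snippet below is illustrative): <script> content is removed by the Cleaner defaults, while a tag missing from allowed_tags, such as <font> here, is unwrapped and only its text survives:

snippet = '<p>Hello <script>evil()</script><font>world</font></p>'
print(html2content(snippet))  # roughly: <p>Hello world</p>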
Example 3
    def get_clean_html(self, html_text, text_only=True):
        try:
            etree = lxml.html.document_fromstring(html_text)

            self._is_etree(etree)
            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(etree)
            if text_only:
                return ' '.join(html.text_content().split())
                # return html.text_content()

            res = lxml.html.tostring(html)
        except Exception as e:
            logger.error(f"While parsing email in get_clean_html {e}")
            res = "junk"

        return res
Example 4
def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
Example 5
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
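A minimal sketch of calling the helper above (the sample HTML is illustrative); note that lxml.html.tostring returns bytes by default, so the result is a bytes string:

raw = "<html><head><style>p {color: red}</style></head><body><p onclick='track()'>Text</p></body></html>"
print(clean_html(raw))
# the <style> block and the onclick handler should be gone; pass e.g. style=False to keep styles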
Example 6
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error

    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
Example 8
 def _get_cleaner(self, print_style, print_js, remove_tags):
     c = Cleaner()
     c.scripts = not print_js
     c.javascript = not print_js
     c.style = not print_style
     c.remove_tags = remove_tags
     c.page_structure = False
     return c
Example 9
def remove_scripts_and_style(body):
    """
    All of the Html files have fairly large blocks dedicated to style and script

    This function cleans out the large useless blocks out of the files and typically drops the size down significantly
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.page_structure = False
    return cleaner.clean_html(body)
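A short sketch of using remove_scripts_and_style on a parsed document (the markup is illustrative); clean_html also accepts a plain string, but here a parsed tree is passed in and a tree comes back:

import lxml.html
body = lxml.html.fromstring("<html><head><style>p {color: red}</style></head><body><p>kept</p><script>dropped()</script></body></html>")
print(lxml.html.tostring(remove_scripts_and_style(body)))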
Example 10
 def remove_scripts(self):
     if not self.clean_js:
         logger.debug('Scripts will not be removed')
         self.parser_modified_content = False
         return
     cleaner = Cleaner()
     # don't modify original page structure, eg, <head>, <html>, <body> ...
     cleaner.page_structure = False
     # don't remove inline javascript
     cleaner.javascript = False
     # remove <script> tags
     cleaner.scripts = True
     self.modified_doc = cleaner.clean_html(self.doc)
     self.parser_modified_content = True
     logger.debug('Scripts were successfully removed')
Example 11
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
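A short usage sketch (illustrative markup); get_clean_html expects an already-parsed lxml tree, hence the document_fromstring call, and _is_etree is assumed to be the module's own type check:

import lxml.html
tree = lxml.html.document_fromstring("<html><body><p style='color: red'>Hi</p><script>x()</script></body></html>")
print(get_clean_html(tree, text_only=True))  # roughly: 'Hi'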
Example 13
 def gettextonly(self, html, url):
     cleaner = Cleaner()
     cleaner.scripts = True
     cleaner.style = True
     cleaner.links = True
     cleaner.meta = False
     cleaner.page_structure = False
     cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'div', 'span', 'img', 'area', 'map', 'noscript', 'td', 'tr',
                    'table', 'a', 'p', 'br', 'li', 'ul']
     doc = lxml.html.fromstring(html)
     path = '/html/body'
     try:
         body = doc.xpath(path)[0]
     except Exception as detail:
         print(detail)
         return False
     return cleaner.clean_html(body).text_content().split()
Example 14
    def clean(self: T) -> str:
        cleaner = Cleaner()
        cleaner.style = self.__style
        cleaner.links = self.__links
        cleaner.page_structure = self.__page_structure
        cleaner.safe_attrs_only = self.__safe_attrs_only

        # allow_tags and remove_unknown_tags can't work together
        if self.__allow_tags is not None:
            cleaner.remove_unknown_tags = False
            cleaner.allow_tags = self.__allow_tags
        if self.__kill_tags is not None: cleaner.kill_tags = self.__kill_tags
        if self.__remove_tags is not None:
            cleaner.remove_tags = self.__remove_tags
        if self.__safe_attrs is not None:
            cleaner.safe_attrs = self.__safe_attrs

        self.__input = cleaner.clean_html(self.__input)
        return self.__input
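The comment above reflects an actual lxml constraint: when allow_tags is set, remove_unknown_tags must be disabled, otherwise clean_html rejects the combination. A standalone sketch with made-up markup:

cleaner = Cleaner()
cleaner.remove_unknown_tags = False  # required whenever allow_tags is used
cleaner.allow_tags = ['p', 'a']
print(cleaner.clean_html('<p>keep <a href="#">link</a> <blink>tag dropped, text kept</blink></p>'))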
Example 15
def get_clean_text(filename):
    utf8_parser = html.HTMLParser(encoding='utf-8')
    htmltxt = lxml.html.parse(filename, parser=utf8_parser)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    htmltxt = cleaner.clean_html(htmltxt)

    txt = etree.tostring(htmltxt, encoding='unicode')
    txtresub = re.sub(r'<.+?>', ' ', txt)
    txtresub = re.sub(r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ',
                      txtresub)

    return txtresub
Example 16
    def get_content(self, pathName):
        try:
            file = open(pathName, "r")
            html_text = file.read()
            file.close()
        except:
            print("Fail to open the file located in {}".format(pathName))
            return None
        try:
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
########    Add
            cleaner.page_structure = False 
            htmlData = cleaner.clean_html(html_text)
        except:
            print("Could not remove style and js code from the file located in {}".format(pathName))
            return None
########    Add
        soup = BeautifulSoup(htmlData, "lxml")
########    Change return tuple (raw_content, soup) instead of raw_content
        return soup
Example 17
def html2content(
    html,
    allowed_tags=[
        "a", "abbr", "article", "aside", "b", "base", "blockquote", "body",
        "br", "caption", "cite", "code", "col", "colgroup", "dd", "del", "dfn",
        "dl", "dt", "em", "embed", "figcaption", "figure", "footer", "h1",
        "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
        "i", "img", "li", "map", "mark", "math", "meta", "meter", "nav",
        "noscript", "object", "ol", "optgroup", "option", "output", "p",
        "param", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp",
        "section", "small", "source", "span", "strong", "sub", "sup", "svg",
        "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr",
        "track", "u", "ul", "var", "video", "wbr"
    ]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False
    return cleaner.clean_html(html)
Example 18
try:
    readline.read_history_file(histfile)
except IOError:
    pass

try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests
    resp = requests.get('http://en.wikipedia.org/')
    tree = document_fromstring(resp.text)
    raw = resp.text
    # enable filters to remove Javascript and CSS from HTML document                                                                     
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(tree)
    text_content = html.text_content()
except ImportError:
    pass

atexit.register(readline.write_history_file, histfile)
del os, histfile
Example 19
def f_parse(args):
    def isAlphabet(word):

        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)

                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()

                if (len(parsed_text) > MINSIZE_CHARSDOC):
                    parsed_text = parsed_text.lower()

                    tokenizer = RegexpTokenizer(r'\w+')

                    # create English stop words list
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    #p_stemmer = PorterStemmer()

                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if not i in en_stop]
                    stopped_tokens2 = [
                        i for i in stopped_tokens1 if not i in it_stop
                    ]
                    stopped_tokens3 = [
                        i for i in stopped_tokens2 if not i in sp_stop
                    ]
                    stopped_tokens4 = [
                        i for i in stopped_tokens3 if not i in ge_stop
                    ]
                    stopped_tokens5 = [
                        i for i in stopped_tokens4 if not i in fr_stop
                    ]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                #check if the word has the alphabet character
                                if isAlphabet(word):
                                    ret.append(word)
            except:
                print('Exception : Document empty')
    return [loc, ret]
Example 20
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                print("Detected new tasks ".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join([title for title in tree.xpath("//title/text()")])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            lxml.html.tostring(html)
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )

            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
Example 21
LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
Example 22
    "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]",
    "//footer",
    "//*[@class='post-footer' or @class='footer' or @id='footer']",
    "//small",
    "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]",
]

CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'

## REGEX cache
JSON_PATTERN = re.compile(
    r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})')
# use of regex module for speed
GERMAN_PATTERN = regex.compile(
Example 23
from django.http import HttpResponse
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process

import re
from lxml.html.clean import Cleaner
cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception as e:
        return "Couldn't load url"
    return text


def index(request):
Example 24
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                print("Detected new tasks ".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc['content']
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = ' '.join([title for title in tree.xpath('//title/text()')])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            lxml.html.tostring(html)
            description = ' '.join(
                tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc['url'],
                desc=description,
                rank=doc['rank'],
                content='\n'.join([title, doc['url'], text_content]),
                raw=raw,
            )

            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
Example 25
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import lxml
from lxml.html.clean import Cleaner
#import json
#import codecs
from peewee import *
import datetime

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.comments = True
cleaner.allow_tags = False
cleaner.links = False
cleaner.page_structure = False

db = MySQLDatabase('scrap', user='******', passwd='')

class BaseModel(Model):
    """
    Base peewee DB model
    """

    class Meta:
        database = db


class Texts(BaseModel):
    title = CharField()
    link = CharField()
Example 26
def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  #  activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    #invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except:
        #error: ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fr
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)  # remove NL
    #	page8 = re.sub(u'\s','',page8,re.UNICODE) # blanks -> space
    page8 = re.sub(u'&#13;', ' ', page8)  # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]',
                   ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)  # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><',
                       page8)  # remove words under 10 chars between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)

    page8 = re.sub(u'\s+', ' ', page8)  # remove repeated blanks

    #XPATHs
    xpath = '//*[((p) or (a) or (b) or (div) or (span)) ]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'

    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()
    content = u""
    if text:
        for s in text:
            # squash duplicate whitespace
            s = ' '.join(s.split())
            # remove short lines
            # empirically, sentences under 40 chars are rarely relevant parts of the article text; they are mostly repetitions of the title, authors, dates, etc.
            if len(s) < 40:
                continue
    # remove leading whitespace
    #if s.endswith(" "): s = s[:-1]
            if s.startswith(" "): s = s[1:]
            content += s
            content += "\n"
    return content