def create_word_frequencies(self):
    # find_doc_content_pattern is assumed to be defined elsewhere in the module.
    document = re.sub(find_doc_content_pattern, "", self.content)

    # Strip <script> tags, inline JavaScript and styles so only visible text remains.
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True

    try:
        document_visible_text = cleaner.clean_html(document)
    except UnicodeDecodeError:
        document_visible_text = ""
        print("Unicode Error")

    # Count occurrences of each lower-cased word.
    for word in document_visible_text.split():
        word_lower = word.lower()
        self.word_frequencies[word_lower] = self.word_frequencies.get(word_lower, 0) + 1
        self.total_word_count += 1
Example #2
def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except Exception:
        # Parsing or cleaning failed; treat the document as unusable.
        return None
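
A quick way to exercise html2text above (hypothetical sample markup; the module-level imports lxml.html, BeautifulSoup and Cleaner, plus the MINSIZE_CHARSDOC constant, are assumed to exist as in the surrounding project):

sample = "<html><body><script>track()</script><p>" + "lorem ipsum " * 30 + "</p></body></html>"
text = html2text(sample)
if text is not None:
    # lower-cased visible text, with scripts, styles and comments stripped
    print(text[:60])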
Example #3
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    # remove_tags unwraps the listed tags but keeps their text; kill_tags drops tag and content.
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
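
A small, hypothetical usage sketch for cleaner_parameters(); the sample markup and variable names are illustrative only:

import lxml.html

html_cleaner = cleaner_parameters()
page = "<html><body><nav>menu</nav><div><p>Main <b>content</b></p></div></body></html>"
doc = lxml.html.document_fromstring(page)
# Tags in reject_list (e.g. <nav>) are killed with their content; tags in
# accept_list (e.g. <div>, <p>, <b>) are unwrapped but their text is kept.
print(lxml.html.tostring(html_cleaner.clean_html(doc)))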
Example #4
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
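
A minimal usage sketch for the clean_html helper above, using made-up markup; the exact wrapper tags in the output vary with the lxml version:

raw = '<p onclick="run()">Hello <b>world</b></p><script>evil()</script>'
# Prints something like b'<div><p>Hello <b>world</b></p></div>': the <script> block is
# gone and unsafe attributes such as onclick are stripped by safe_attrs_only.
print(clean_html(raw))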
Example #5
def trim_html(html):
    """Takes a html string as input and returns the html without any styles nor javascript"""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # Get rid of the javascript and the style
    cleaner.style = True

    return cleaner.clean_html(html)
Example #6
def _get_cleaner(self, print_style, print_js, remove_tags):
    c = Cleaner()
    c.scripts = not print_js
    c.javascript = not print_js
    c.style = not print_style
    c.remove_tags = remove_tags
    c.page_structure = False
    return c
Example #7
def cleaned_html(self):
    # Try to parse the provided HTML string using lxml
    # strip all unnecessary information to save space
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.comments = True
    cleaner.style = True
    self.dom = cleaner.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
Example #8
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0
        return data
Example #10
def trim_html(html):
    """Takes a html string as input and returns the html without any styles nor javascript"""
    cleaner = Cleaner()

    cleaner.scripts         = True
    cleaner.javascript      = True  # Get rid of the javascript and the style
    cleaner.style           = True

    cleaner.meta            = False # Keeping the meta tags is important for page redirection purposes
    cleaner.safe_attrs_only = False

    return cleaner.clean_html(html)
Example #11
def cleaned_html(htmlString):

    # Try to parse the provided HTML string using lxml
    # strip all unnecessary information to save space
    cleaner = Cleaner()
    cleaner.comments = True
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.style = True

    htmlString = cleaner.clean_html(htmlString)

    return htmlString
Example #12
def clean(self):
    cleaner = Cleaner(page_structure=False)
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.allow_tags = []
    cleaner.remove_tags = ['p', 'div', 'a']
    self.name = (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()
    self.price = (lxml.html.document_fromstring(cleaner.clean_html(self.price))).text_content()
    self.discountcode = (lxml.html.document_fromstring(cleaner.clean_html(self.discountcode))).text_content()
    self.categorycode = (lxml.html.document_fromstring(cleaner.clean_html(self.categorycode))).text_content()
    self.orderdate = (lxml.html.document_fromstring(cleaner.clean_html(self.orderdate))).text_content()
    self.selldate = (lxml.html.document_fromstring(cleaner.clean_html(self.selldate))).text_content()
    self.page = (lxml.html.document_fromstring(cleaner.clean_html(self.page))).text_content()
Example #13
def clean_cachefiles():
    """Clean silly html from all cachefiles in the cachdir"""
    if input('Do you really want to strip all cache files from bloating tags such as <script> and <style>? ').startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.scripts = True
        cleaner.javascript = True
        for file in _get_all_cache_files():
            cfile = CompressedFile(file)
            data = cfile.read()
            cleaned = lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(data)))
            cfile.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(file, len(data), len(cleaned)))
Example #14
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["username"] = (lxml.html.document_fromstring(cleaner.clean_html(data["username"]))).text_content()
        data["storename"] = (lxml.html.document_fromstring(cleaner.clean_html(data["storename"]))).text_content()
        data["email"] = (lxml.html.document_fromstring(cleaner.clean_html(data["email"]))).text_content()

        # data['username'] = cleaner.clean_html(data['username'])
        # data['storename'] = cleaner.clean_html(data['storename'])
        # data['email'] = cleaner.clean_html(data['email'])

        return data
Example #15
def remove_scripts(self):
    if not self.clean_js:
        logger.debug('Scripts will not be removed')
        self.parser_modified_content = False
        return
    cleaner = Cleaner()
    # don't modify original page structure, eg, <head>, <html>, <body> ...
    cleaner.page_structure = False
    # don't remove inline javascript
    cleaner.javascript = False
    # remove <script> tags
    cleaner.scripts = True
    self.modified_doc = cleaner.clean_html(self.doc)
    self.parser_modified_content = True
    logger.debug('Scripts were successfully removed')
Example #16
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    stuff = lxml.html.tostring(cleaner.clean_html(data))

    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val, \
                               map(lambda x: x.strip(), soup.findAll(text=True))))

    return all_text
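
A hedged usage sketch for clean_text above: the function expects an already-parsed lxml element (it calls lxml.html.tostring on the cleaned result), and the sample markup is made up:

root = lxml.html.fromstring("<div><script>x()</script><p>Visible text.</p><!-- note --></div>")
# Prints roughly 'Visible text.'; BeautifulSoup may warn that no parser was specified.
print(clean_text(root))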
Example #18
    def clean_html(self):
        """
            Cleaner removes HTML tags prior to processing. Note: cleaning removes
            the Title tags from HTML. Do not clean before grabbing titles!
        """
        if len(self.response.content):
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.scripts = True
            cleaner.style = True
            cleaner.comments = True

            try:
                return html.fromstring(cleaner.clean_html(self.response.content))
            except Exception as e:
                logging.error(e)

            return None
Example #19
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                           'div', 'span', 'img', 'area', 'map', 'noscript', 'td', 'tr',
                           'table', 'a', 'p', 'br', 'li', 'ul']
    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print(detail)
        return False
    return cleaner.clean_html(body).text_content().split()
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    stuff = lxml.html.tostring(cleaner.clean_html(root))

    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val, \
                               map(lambda x: x.strip(), soup.findAll(text=True))))

    return all_text.encode('ascii', 'ignore')
Example #21
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
Example #22
    def get_url(self):
        """Get the relevant part of a web page."""

        get_url = requests.get(self.data_path)
        page_data = get_url.content

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = cleaner.clean_html(page_data)

        # Strip tags from final results.
        strip_tags = TagStripper()  # Instantiate the HTML Tag Stripper.
        strip_tags.feed(page_html)  # Strip all HTML tags.

        return strip_tags.get_html_data()
Example #23
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]

        # (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()
        data["price"] = (lxml.html.document_fromstring(cleaner.clean_html(data["price"]))).text_content()
        data["itemid"] = (lxml.html.document_fromstring(cleaner.clean_html(data["itemid"]))).text_content()
        data["discountcode"] = (lxml.html.document_fromstring(cleaner.clean_html(data["discountcode"]))).text_content()
        data["orderdate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["orderdate"]))).text_content()
        data["selldate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["selldate"]))).text_content()
        data["page"] = (lxml.html.document_fromstring(cleaner.clean_html(data["page"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0

        #           self.name= cleaner.clean_html(self.name)
        return data
Example #24
    def get_url(self):
        """Get the HTML body of a web page."""

        # Create file-like object.
        outfile = StringIO.StringIO()

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.parse(self.data_path)
            )
        )

        outfile.write(page_html)  # Write the results to this file in memory.

        return outfile
Example #25
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
        cleaning_list.append('table')
    if include_images is True:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
        cleaning_list = [
Example #26
    "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]",
]

CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
]  # 'embed', 'figure', 'img', 'table'

## REGEX cache
JSON_PATTERN = re.compile(
    r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})')
# use of regex module for speed
GERMAN_PATTERN = regex.compile(
    r'(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})')
TIMESTAMP_PATTERN = regex.compile(
    r'([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}'
)
Example #27
from django.core.files.storage import FileSystemStorage
from testApp.models import *
from django.template.loader import get_template
from django.http import HttpResponse
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process

import re
from lxml.html.clean import Cleaner
cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception as e:
        return "Couldn't load url"
    return text
Example #28
import subprocess
import seleniumclient
import xml.etree.ElementTree as ET
import re, lxml
from lxml.html.clean import Cleaner

WORKERS = 1

siteBase = "https://bed-search.nextprot.org/"
sitemapUrl = siteBase + "sitemap.xml"
#Where to save static site
dirlocation = "/work/tmp/static-site/"

cleaner = Cleaner()
cleaner.scripts = True  # strip <script> tags


def saveToFile(content, filename):
    text_file = open(filename, "w")
    text_file.write(content.encode('UTF-8'))
    text_file.close()
    print str(incrementCounter()) + " creating file " + filename + " "
    sys.stdout.flush()


def createDirectoryStructureIfNeeded(URLS):
    for url in URLS:
        filename = getFilename(url)
        if (filename):
            directoryname = os.path.dirname(filename)
Example #29
		response = urllib2.urlopen(req)
		the_page = json.loads(response.read())
		#print("cap response is " + str(the_page));
		#print(the_page['success']);
		#print("what");
		if ( the_page['success'] != True):
			return jsonify(success=False);#return empty object		
	except urllib2.URLError, e:
		return jsonify(success=False); #return empty object
		

	#so the captcha is valid. Now clean the user data
	cleaner = Cleaner()
	cleaner.javascript = True # This is True because we want to activate the javascript filter
	cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
	cleaner.scripts = True
	cleaner.links = True
	cleaner.allow_tags = None
	
	
	name = cleaner.clean_html(name)
	phone = cleaner.clean_html(phone)
	email = cleaner.clean_html(email)
	message = cleaner.clean_html(message)
	
	#build the email
	newMess = mail.EmailMessage();
	newMess.sender ="pizzaoptimization <*****@*****.**>"
	newMess.subject = escape(strip_tags("Website Contact for tutoring:  "+ name))
	newMess.to = "pizzaoptimization <*****@*****.**>"
	newMess.body = escape(strip_tags("Name: " + name + "\nemail: " + email + "\nphone: " + phone + "\nmessage: " + message))
Example #30
    def _parse(self):
        """Internal parse the dom according to the provided css selectors.
        
        Raises: InvalidSearchTypeExcpetion if no css selectors for the searchtype could be found.
        """

        # Try to parse the provided HTML string using lxml
        # strip all unnecessary information to save space
        cleaner = Cleaner()
        cleaner.scripts = True
        cleaner.javascript = True
        cleaner.style = True

        try:
            parser = lxml.html.HTMLParser(encoding='utf-8')
            self.dom = lxml.html.document_fromstring(self.html, parser=parser)
            self.dom = cleaner.clean_html(self.dom)
            self.dom.resolve_base_href()
        except Exception as e:
            # maybe wrong encoding
            logger.error(e)

        # try to parse the number of results.
        attr_name = self.searchtype + '_search_selectors'
        selector_dict = getattr(self, attr_name, None)

        # short alias because we use it so extensively
        css_to_xpath = HTMLTranslator().css_to_xpath

        # get the appropriate css selectors for the num_results for the keyword
        num_results_selector = getattr(self, 'num_results_search_selectors',
                                       None)
        self.search_results['num_results'] = ''

        if isinstance(num_results_selector, list) and num_results_selector:
            for selector in num_results_selector:
                try:
                    self.search_results['num_results'] = self.dom.xpath(
                        css_to_xpath(selector))[0].text_content()
                except IndexError as e:
                    logger.warning(
                        'Cannot parse num_results from serp page with selector {}'
                        .format(selector))
                else:  # leave when first selector grabbed something
                    break

        if not selector_dict and not isinstance(selector_dict, dict):
            raise InvalidSearchTypeException(
                'There is no such attribute: {}. No selectors found'.format(
                    attr_name))

        for result_type, selector_class in selector_dict.items():

            self.search_results[result_type] = []

            for selector_specific, selectors in selector_class.items():

                results = self.dom.xpath(
                    css_to_xpath(
                        '{container} {result_container}'.format(**selectors)))
                to_extract = set(
                    selectors.keys()) - {'container', 'result_container'}
                selectors_to_use = {
                    key: selectors[key]
                    for key in to_extract if key in selectors.keys()
                }

                for index, result in enumerate(results):
                    # Let's add primitive support for CSS3 pseudo selectors
                    # We just need two of them
                    # ::text
                    # ::attr(attribute)

                    # You say we should use xpath expressions instead?
                    # Maybe you're right, but they are complicated when it comes to classes,
                    # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                    serp_result = {}
                    # key are for example 'link', 'snippet', 'snippet', ...
                    # selector is the selector to grab these items
                    for key, selector in selectors_to_use.items():
                        value = None
                        if selector.endswith('::text'):
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector.split('::')
                                                 [0]))[0].text_content()
                            except IndexError as e:
                                pass
                        else:
                            attr = re.search(r'::attr\((?P<attr>.*)\)$',
                                             selector).group('attr')
                            if attr:
                                try:
                                    value = result.xpath(
                                        css_to_xpath(selector.split('::')
                                                     [0]))[0].get(attr)
                                except IndexError as e:
                                    pass
                            else:
                                try:
                                    value = result.xpath(css_to_xpath(
                                        selector))[0].text_content()
                                except IndexError as e:
                                    pass

                        serp_result[key] = value
                    # only add items that have not None links.
                    # Avoid duplicates. Detect them by the link.
                    # If statement below: Lazy evaluation. The more probable case first.
                    if 'link' in serp_result and serp_result['link'] and \
                            not [e for e in self.search_results[result_type] if e['link'] == serp_result['link']]:
                        self.search_results[result_type].append(serp_result)
Example #31
# HTML_CLEANER config # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  #  'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',
    'iframe', 'img', 'label', 'link', 'map', 'math', 'nav', 'noscript',
    'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table' # 'header'

# validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])
Example #32
def f_parse(args):
    def isAlphabet(word):

        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)

                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()

                if (len(parsed_text) > MINSIZE_CHARSDOC):
                    parsed_text = parsed_text.lower()

                    tokenizer = RegexpTokenizer(r'\w+')

                    # create English stop words list
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    #p_stemmer = PorterStemmer()

                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if not i in en_stop]
                    stopped_tokens2 = [
                        i for i in stopped_tokens1 if not i in it_stop
                    ]
                    stopped_tokens3 = [
                        i for i in stopped_tokens2 if not i in sp_stop
                    ]
                    stopped_tokens4 = [
                        i for i in stopped_tokens3 if not i in ge_stop
                    ]
                    stopped_tokens5 = [
                        i for i in stopped_tokens4 if not i in fr_stop
                    ]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                #check if the word has the alphabet character
                                if isAlphabet(word):
                                    ret.append(word)
            except Exception:
                print('Exception : Document empty')
    return [loc, ret]
Example #33
    def _parse(self):
        """Internal parse the dom according to the provided css selectors.
        
        Raises: InvalidSearchTypeExcpetion if no css selectors for the searchtype could be found.
        """
        
        # Try to parse the provided HTML string using lxml
        # strip all unnecessary information to save space
        cleaner = Cleaner()
        cleaner.scripts = True
        cleaner.javascript = True
        cleaner.style = True

        try:
            parser = lxml.html.HTMLParser(encoding='utf-8')
            self.dom = lxml.html.document_fromstring(self.html, parser=parser)
            self.dom = cleaner.clean_html(self.dom)
            self.dom.resolve_base_href()
        except Exception as e:
            # maybe wrong encoding
            logger.error(e)
        
        # try to parse the number of results.
        attr_name = self.searchtype + '_search_selectors'
        selector_dict = getattr(self, attr_name, None)

        # short alias because we use it so extensively
        css_to_xpath = HTMLTranslator().css_to_xpath

        # get the appropriate css selectors for the num_results for the keyword
        num_results_selector = getattr(self, 'num_results_search_selectors', None)
        self.search_results['num_results'] = ''

        if isinstance(num_results_selector, list) and num_results_selector:
            for selector in num_results_selector:
                try:
                    self.search_results['num_results'] = self.dom.xpath(css_to_xpath(selector))[0].text_content()
                except IndexError as e:
                    logger.warning('Cannot parse num_results from serp page with selector {}'.format(selector))
                else: # leave when first selector grabbed something
                    break

        if not selector_dict and not isinstance(selector_dict, dict):
            raise InvalidSearchTypeException('There is no such attribute: {}. No selectors found'.format(attr_name))

        for result_type, selector_class in selector_dict.items():

            self.search_results[result_type] = []

            for selector_specific, selectors in selector_class.items():

                results = self.dom.xpath(
                    css_to_xpath('{container} {result_container}'.format(**selectors))
                )

                to_extract = set(selectors.keys()) - {'container', 'result_container'}
                selectors_to_use = {key: selectors[key] for key in to_extract if key in selectors.keys()}

                for index, result in enumerate(results):
                    # Let's add primitive support for CSS3 pseudo selectors
                    # We just need two of them
                    # ::text
                    # ::attr(someattribute)

                    # You say we should use xpath expressions instead?
                    # Maybe you're right, but they are complicated when it comes to classes,
                    # have a look here: http://doc.scrapy.org/en/latest/topics/selectors.html
                    serp_result = {}
                    for key, selector in selectors_to_use.items():
                        value = None
                        if selector.endswith('::text'):
                            try:
                                value = result.xpath(css_to_xpath(selector.split('::')[0]))[0].text_content()
                            except IndexError as e:
                                pass
                        else:
                            attr = re.search(r'::attr\((?P<attr>.*)\)$', selector).group('attr')
                            if attr:
                                try:
                                    value = result.xpath(css_to_xpath(selector.split('::')[0]))[0].get(attr)
                                except IndexError as e:
                                    pass
                            else:
                                try:
                                    value = result.xpath(css_to_xpath(selector))[0].text_content()
                                except IndexError as e:
                                    pass
                        serp_result[key] = value
                    if serp_result:
                        self.search_results[result_type].append(serp_result)
Example #34
def clean(self):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    self.name = (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()  # lxml.html.fromstring(self.name)

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
    except ValueError:
        return False