def create_word_frequencies(self):
    document = re.sub(find_doc_content_pattern, "", self.content)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True
    # cleaner.allow_tags = ['']
    # cleaner.remove_unknown_tags = False
    try:
        document_visible_text = cleaner.clean_html(document)
    except UnicodeDecodeError:
        document_visible_text = ""
        print("Unicode Error")
    # document_visible_text = document
    word_list = document_visible_text.split()
    for word in word_list:
        word_stemmed = word.lower()
        try:
            self.word_frequencies[word_stemmed] += 1
        except KeyError:
            self.word_frequencies[word_stemmed] = 1
        self.total_word_count += 1
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # True activates the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True
    # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if len(parsed_text) > MINSIZE_CHARSDOC:
            return parsed_text.lower()
        return None
    except Exception:
        return None
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list  # strip these tags but keep their text
    html_cleaner.kill_tags = reject_list    # drop these tags and their content
    return html_cleaner
def clean_html(html_text, javascript=True, scripts=True, style=True,
               embedded=True, links=True, forms=True, frames=True,
               comments=True, annoying_tags=True, meta=True,
               safe_attrs_only=True, remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML, returning the
    string with only the html content."""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # True activates the javascript filter
    cleaner.scripts = scripts        # True activates the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
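# A minimal stand-alone usage sketch for the clean_html() helper above (not
# from the original source); the sample markup is invented for illustration
# and assumes lxml is installed.
sample = '<html><body><script>alert(1)</script><p onclick="go()">Hello</p></body></html>'
print(clean_html(sample))
# With the defaults above (javascript, scripts and safe_attrs_only all True),
# the <script> element and the onclick attribute are stripped, leaving
# roughly b'<html><body><p>Hello</p></body></html>'.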
def trim_html(html):
    """Take an HTML string and return it without any styles or JavaScript."""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # get rid of the JavaScript and the style
    cleaner.style = True
    return cleaner.clean_html(html)
def _get_cleaner(self, print_style, print_js, remove_tags):
    c = Cleaner()
    c.scripts = not print_js
    c.javascript = not print_js
    c.style = not print_style
    c.remove_tags = remove_tags
    c.page_structure = False
    return c
def cleaned_html(self):
    # Try to parse the provided HTML string using lxml and
    # strip all unnecessary information to save space.
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.comments = True
    cleaner.style = True
    self.dom = cleaner.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
def validate(self, data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    data["name"] = lxml.html.document_fromstring(
        cleaner.clean_html(data["name"])).text_content()
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def trim_html(html):
    """Take an HTML string and return it without any styles or JavaScript."""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # get rid of the JavaScript and the style
    cleaner.style = True
    cleaner.meta = False  # keeping the meta tags matters for page redirection
    cleaner.safe_attrs_only = False
    return cleaner.clean_html(html)
def cleaned_html(htmlString):
    # Try to parse the provided HTML string using lxml and
    # strip all unnecessary information to save space.
    cleaner = Cleaner()
    cleaner.comments = True
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.style = True
    htmlString = cleaner.clean_html(htmlString)
    return htmlString
def clean(self):
    cleaner = Cleaner(page_structure=False)
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.allow_tags = []
    cleaner.remove_tags = ['p', 'div', 'a']
    for field in ('name', 'price', 'discountcode', 'categorycode',
                  'orderdate', 'selldate', 'page'):
        cleaned = cleaner.clean_html(getattr(self, field))
        setattr(self, field,
                lxml.html.document_fromstring(cleaned).text_content())
def clean_cachefiles():
    """Clean silly html from all cachefiles in the cachedir"""
    if input('Do you really want to strip all cache files from bloating tags '
             'such as <script> and <style>? ').startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.scripts = True
        cleaner.javascript = True
        for file in _get_all_cache_files():
            cfile = CompressedFile(file)
            data = cfile.read()
            cleaned = lxml.html.tostring(
                cleaner.clean_html(lxml.html.fromstring(data)))
            cfile.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(
                file, len(data), len(cleaned)))
def validate(self, data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    for field in ("username", "storename", "email"):
        data[field] = lxml.html.document_fromstring(
            cleaner.clean_html(data[field])).text_content()
    return data
def remove_scripts(self):
    if not self.clean_js:
        logger.debug('Scripts will not be removed')
        self.parser_modified_content = False
        return
    cleaner = Cleaner()
    # Don't modify the original page structure, e.g. <head>, <html>, <body>.
    cleaner.page_structure = False
    # Don't remove inline javascript.
    cleaner.javascript = False
    # Remove <script> tags.
    cleaner.scripts = True
    self.modified_doc = cleaner.clean_html(self.doc)
    self.parser_modified_content = True
    logger.debug('Scripts were successfully removed')
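# A small stand-alone demonstration (not part of the source above) of the
# scripts/javascript split that remove_scripts() relies on: `scripts` controls
# <script> elements, while `javascript` controls script-bearing attributes
# such as onclick. safe_attrs_only must be False for the difference to show,
# since the default attribute whitelist drops on* attributes anyway.
import lxml.html
from lxml.html.clean import Cleaner

fragment = lxml.html.fromstring('<div onclick="go()"><script>x()</script>text</div>')
only_tags = Cleaner(scripts=True, javascript=False,
                    safe_attrs_only=False, page_structure=False)
print(lxml.html.tostring(only_tags.clean_html(fragment)))
# roughly b'<div onclick="go()">text</div>' -- the element is gone,
# the attribute stays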
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    stuff = lxml.html.tostring(cleaner.clean_html(data))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val,
                               map(lambda x: x.strip(), soup.findAll(text=True))))
    return all_text
def clean_html(self):
    """
    Cleaner removes HTML tags prior to processing.

    Note: cleaning removes the title tags from HTML.
    Do not clean before grabbing titles!
    """
    if len(self.response.content):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.style = True
        cleaner.comments = True
        try:
            return html.fromstring(cleaner.clean_html(self.response.content))
        except Exception as e:
            logging.error(e)
            return None
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                           'img', 'area', 'map', 'noscript', 'td', 'tr',
                           'table', 'a', 'p', 'br', 'li', 'ul']
    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print(detail)
        return False
    return cleaner.clean_html(body).text_content().split()
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    stuff = lxml.html.tostring(cleaner.clean_html(root))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val,
                               map(lambda x: x.strip(), soup.findAll(text=True))))
    return all_text.encode('ascii', 'ignore')
def extract_content(bytehtml, doc):
    """Extracts blog post content from html."""
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    # cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
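# Side note on the cleaner(lxmldoc) call above: a Cleaner instance is
# callable and cleans the parsed tree *in place*, whereas clean_html() works
# on a copy and returns it. A minimal sketch:
import lxml.html
from lxml.html.clean import Cleaner

doc = lxml.html.document_fromstring(
    '<html><body><script>x()</script><p>Hi</p></body></html>')
Cleaner(scripts=True)(doc)      # mutates doc, returns None
print(lxml.html.tostring(doc))  # the <script> element has been removed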
def get_url(self):
    """Get the relevant part of a web page."""
    get_url = requests.get(self.data_path)
    page_data = get_url.content
    cleaner = Cleaner()
    cleaner.javascript = True         # remove JavaScript code from HTML
    cleaner.scripts = True            # remove other code from HTML
    cleaner.style = True              # remove CSS and styles from HTML
    cleaner.links = True              # remove links from HTML
    cleaner.kill_tags = ['a', 'img']  # remove these tags
    # Store the cleaned-up HTML.
    page_html = cleaner.clean_html(page_data)
    # Strip tags from the final results.
    strip_tags = TagStripper()  # instantiate the HTML tag stripper
    strip_tags.feed(page_html)  # strip all HTML tags
    return strip_tags.get_html_data()
def validate(self, data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    for field in ("name", "price", "itemid", "discountcode",
                  "orderdate", "selldate", "page"):
        data[field] = lxml.html.document_fromstring(
            cleaner.clean_html(data[field])).text_content()
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def get_url(self):
    """Get the HTML body of a web page."""
    # Create a file-like object.
    outfile = StringIO.StringIO()
    cleaner = Cleaner()
    cleaner.javascript = True               # remove JavaScript code from HTML
    cleaner.scripts = True                  # remove other code from HTML
    cleaner.style = True                    # remove CSS and styles from HTML
    cleaner.links = True                    # remove links from HTML
    cleaner.kill_tags = ['a', 'img', 'li']  # remove these tags
    # Store the cleaned-up HTML.
    page_html = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(self.data_path)))
    outfile.write(page_html)  # write the results to this file in memory
    return outfile
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
# HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # Determine the cleaning strategy.
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
        cleaning_list.append('table')
    if include_images is True:
        # Many websites have <img> inside a <figure>, <picture> or <source> tag.
        cleaning_list = [
"//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]", ] CLEANER = Cleaner() CLEANER.comments = False CLEANER.embedded = True CLEANER.forms = False CLEANER.frames = True CLEANER.javascript = True CLEANER.links = False CLEANER.meta = False CLEANER.page_structure = True CLEANER.processing_instructions = True CLEANER.remove_unknown_tags = False CLEANER.safe_attrs_only = False CLEANER.scripts = False CLEANER.style = True CLEANER.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video' ] # 'embed', 'figure', 'img', 'table' ## REGEX cache JSON_PATTERN = re.compile( r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})') # use of regex module for speed GERMAN_PATTERN = regex.compile( r'(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})') TIMESTAMP_PATTERN = regex.compile( r'([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}' )
from django.core.files.storage import FileSystemStorage
from testApp.models import *
from django.template.loader import get_template
from django.http import HttpResponse
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process
import re
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True  # True activates the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception:
        return "Couldn't load url"
    return text
import os
import subprocess
import sys

import seleniumclient
import xml.etree.ElementTree as ET
import re
import lxml
from lxml.html.clean import Cleaner

WORKERS = 1

siteBase = "https://bed-search.nextprot.org/"
sitemapUrl = siteBase + "sitemap.xml"

# Where to save the static site.
dirlocation = "/work/tmp/static-site/"

cleaner = Cleaner()
cleaner.scripts = True  # True activates the javascript filter


def saveToFile(content, filename):
    with open(filename, "wb") as text_file:
        text_file.write(content.encode('UTF-8'))
    print(str(incrementCounter()) + " creating file " + filename)
    sys.stdout.flush()


def createDirectoryStructureIfNeeded(URLS):
    for url in URLS:
        filename = getFilename(url)
        if filename:
            directoryname = os.path.dirname(filename)
    try:
        response = urllib2.urlopen(req)
        the_page = json.loads(response.read())
        # print("cap response is " + str(the_page))
        # print(the_page['success'])
        if the_page['success'] != True:
            return jsonify(success=False)  # return an empty object
    except urllib2.URLError:
        return jsonify(success=False)  # return an empty object

    # The captcha is valid, so now clean the user data.
    cleaner = Cleaner()
    cleaner.javascript = True  # True activates the javascript filter
    cleaner.style = True       # True activates the styles & stylesheet filter
    cleaner.scripts = True
    cleaner.links = True
    cleaner.allow_tags = None
    name = cleaner.clean_html(name)
    phone = cleaner.clean_html(phone)
    email = cleaner.clean_html(email)
    message = cleaner.clean_html(message)

    # Build the email.
    newMess = mail.EmailMessage()
    newMess.sender = "pizzaoptimization <*****@*****.**>"
    newMess.subject = escape(strip_tags("Website Contact for tutoring: " + name))
    newMess.to = "pizzaoptimization <*****@*****.**>"
    newMess.body = escape(strip_tags("Name: " + name + "\nemail: " + email +
                                     "\nphone: " + phone + "\nmessage: " + message))
def _parse(self):
    """Internally parse the dom according to the provided css selectors.

    Raises: InvalidSearchTypeException if no css selectors for the
    searchtype could be found.
    """
    # Try to parse the provided HTML string using lxml and
    # strip all unnecessary information to save space.
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True
    try:
        parser = lxml.html.HTMLParser(encoding='utf-8')
        self.dom = lxml.html.document_fromstring(self.html, parser=parser)
        self.dom = cleaner.clean_html(self.dom)
        self.dom.resolve_base_href()
    except Exception as e:
        # Maybe wrong encoding.
        logger.error(e)

    # Try to parse the number of results.
    attr_name = self.searchtype + '_search_selectors'
    selector_dict = getattr(self, attr_name, None)

    # Short alias because we use it so extensively.
    css_to_xpath = HTMLTranslator().css_to_xpath

    # Get the appropriate css selectors for the num_results for the keyword.
    num_results_selector = getattr(self, 'num_results_search_selectors', None)
    self.search_results['num_results'] = ''

    if isinstance(num_results_selector, list) and num_results_selector:
        for selector in num_results_selector:
            try:
                self.search_results['num_results'] = self.dom.xpath(
                    css_to_xpath(selector))[0].text_content()
            except IndexError:
                logger.warning(
                    'Cannot parse num_results from serp page with selector {}'
                    .format(selector))
            else:
                # Leave when the first selector grabbed something.
                break

    if not selector_dict and not isinstance(selector_dict, dict):
        raise InvalidSearchTypeException(
            'There is no such attribute: {}. No selectors found'.format(attr_name))

    for result_type, selector_class in selector_dict.items():
        self.search_results[result_type] = []
        for selector_specific, selectors in selector_class.items():
            results = self.dom.xpath(
                css_to_xpath('{container} {result_container}'.format(**selectors)))
            to_extract = set(selectors.keys()) - {'container', 'result_container'}
            selectors_to_use = {key: selectors[key]
                                for key in to_extract if key in selectors.keys()}

            for index, result in enumerate(results):
                # Let's add primitive support for CSS3 pseudo selectors.
                # We just need two of them:
                #   ::text
                #   ::attr(attribute)
                # You say we should use xpath expressions instead?
                # Maybe you're right, but they are complicated when it comes
                # to classes; have a look here:
                # http://doc.scrapy.org/en/latest/topics/selectors.html
                serp_result = {}
                # Keys are for example 'link', 'snippet', ...
                # selector is the selector to grab these items.
                for key, selector in selectors_to_use.items():
                    value = None
                    if selector.endswith('::text'):
                        try:
                            value = result.xpath(
                                css_to_xpath(selector.split('::')[0]))[0].text_content()
                        except IndexError:
                            pass
                    else:
                        # Plain selectors without ::attr() fall through to
                        # text_content().
                        match = re.search(r'::attr\((?P<attr>.*)\)$', selector)
                        attr = match.group('attr') if match else None
                        if attr:
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector.split('::')[0]))[0].get(attr)
                            except IndexError:
                                pass
                        else:
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector))[0].text_content()
                            except IndexError:
                                pass
                    serp_result[key] = value

                # Only add items that have non-None links.
                # Avoid duplicates; detect them by the link.
                # If statement below: lazy evaluation, the more probable case first.
                if 'link' in serp_result and serp_result['link'] and \
                        not [e for e in self.search_results[result_type]
                             if e['link'] == serp_result['link']]:
                    self.search_results[result_type].append(serp_result)
# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  # 'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',
    'iframe', 'img', 'label', 'link', 'map', 'math', 'nav', 'noscript',
    'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table'  # 'header'

# Validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])
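# The remove_tags / kill_tags split configured above, shown stand-alone (the
# sample markup is invented): remove_tags strips the tags but keeps their
# text, kill_tags drops the tags together with all of their content.
import lxml.html
from lxml.html.clean import Cleaner

cleaner = Cleaner(page_structure=False, remove_tags=['b'], kill_tags=['table'])
html = '<div><b>bold</b> and <table><tr><td>cell</td></tr></table>end</div>'
print(lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html))))
# roughly b'<div>bold and end</div>'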
def f_parse(args):

    def isAlphabet(word):
        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # True activates the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []
    for document in corpuses:
        # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)
                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()
                if len(parsed_text) > MINSIZE_CHARSDOC:
                    parsed_text = parsed_text.lower()
                    tokenizer = RegexpTokenizer(r'\w+')

                    # Create the stop-word lists.
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    # p_stemmer = PorterStemmer()

                    # Clean and tokenize the document string.
                    tokens = tokenizer.tokenize(parsed_text)

                    # Remove stop words from the tokens.
                    stopped_tokens1 = [i for i in tokens if i not in en_stop]
                    stopped_tokens2 = [i for i in stopped_tokens1 if i not in it_stop]
                    stopped_tokens3 = [i for i in stopped_tokens2 if i not in sp_stop]
                    stopped_tokens4 = [i for i in stopped_tokens3 if i not in ge_stop]
                    stopped_tokens5 = [i for i in stopped_tokens4 if i not in fr_stop]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                # Check that the word has only alphabet characters.
                                if isAlphabet(word):
                                    ret.append(word)
            except Exception:
                print('Exception : Document empty')
    return [loc, ret]
def _parse(self):
    """Internally parse the dom according to the provided css selectors.

    Raises: InvalidSearchTypeException if no css selectors for the
    searchtype could be found.
    """
    # Try to parse the provided HTML string using lxml and
    # strip all unnecessary information to save space.
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True
    try:
        parser = lxml.html.HTMLParser(encoding='utf-8')
        self.dom = lxml.html.document_fromstring(self.html, parser=parser)
        self.dom = cleaner.clean_html(self.dom)
        self.dom.resolve_base_href()
    except Exception as e:
        # Maybe wrong encoding.
        logger.error(e)

    # Try to parse the number of results.
    attr_name = self.searchtype + '_search_selectors'
    selector_dict = getattr(self, attr_name, None)

    # Short alias because we use it so extensively.
    css_to_xpath = HTMLTranslator().css_to_xpath

    # Get the appropriate css selectors for the num_results for the keyword.
    num_results_selector = getattr(self, 'num_results_search_selectors', None)
    self.search_results['num_results'] = ''

    if isinstance(num_results_selector, list) and num_results_selector:
        for selector in num_results_selector:
            try:
                self.search_results['num_results'] = self.dom.xpath(
                    css_to_xpath(selector))[0].text_content()
            except IndexError:
                logger.warning(
                    'Cannot parse num_results from serp page with selector {}'
                    .format(selector))
            else:
                # Leave when the first selector grabbed something.
                break

    if not selector_dict and not isinstance(selector_dict, dict):
        raise InvalidSearchTypeException(
            'There is no such attribute: {}. No selectors found'.format(attr_name))

    for result_type, selector_class in selector_dict.items():
        self.search_results[result_type] = []
        for selector_specific, selectors in selector_class.items():
            results = self.dom.xpath(
                css_to_xpath('{container} {result_container}'.format(**selectors)))
            to_extract = set(selectors.keys()) - {'container', 'result_container'}
            selectors_to_use = {key: selectors[key]
                                for key in to_extract if key in selectors.keys()}

            for index, result in enumerate(results):
                # Let's add primitive support for CSS3 pseudo selectors.
                # We just need two of them:
                #   ::text
                #   ::attr(someattribute)
                # You say we should use xpath expressions instead?
                # Maybe you're right, but they are complicated when it comes
                # to classes; have a look here:
                # http://doc.scrapy.org/en/latest/topics/selectors.html
                serp_result = {}
                for key, selector in selectors_to_use.items():
                    value = None
                    if selector.endswith('::text'):
                        try:
                            value = result.xpath(
                                css_to_xpath(selector.split('::')[0]))[0].text_content()
                        except IndexError:
                            pass
                    else:
                        # Plain selectors without ::attr() fall through to
                        # text_content().
                        match = re.search(r'::attr\((?P<attr>.*)\)$', selector)
                        attr = match.group('attr') if match else None
                        if attr:
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector.split('::')[0]))[0].get(attr)
                            except IndexError:
                                pass
                        else:
                            try:
                                value = result.xpath(
                                    css_to_xpath(selector))[0].text_content()
                            except IndexError:
                                pass
                    serp_result[key] = value

                if serp_result:
                    self.search_results[result_type].append(serp_result)
def clean(self):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    self.name = lxml.html.document_fromstring(
        cleaner.clean_html(self.name)).text_content()
    # lxml.html.fromstring(self.name)
logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]  # 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic
    heuristics."""
    # Try to parse the date using the chosen outputformat.
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
    except ValueError:
        return False