def load(request):
    with urlopen(
            'https://secure.toronto.ca/cc_sr_v1/data/swm_waste_wizard_APR?limit=1000'
    ) as response:
        data = json.loads(response.read().decode())
    cleaner = Cleaner()
    cleaner.remove_tags = ['span']
    item: dict
    for item in data:
        to_be_stored_body = html.unescape(item['body'])
        if '<ul' not in to_be_stored_body:
            to_be_stored_body = '<ul><li>' + to_be_stored_body + '</li></ul>'
        to_be_stored_body = cleaner.clean_html(to_be_stored_body)
        # Only load if body is unique
        if not Item.objects.filter(body=to_be_stored_body).exists():
            i = Item(body=to_be_stored_body,
                     category=item['category'],
                     title=item['title'],
                     keywords=item['keywords'])
            if 'id' in item:  # Some items have an ID, load it if present
                i.opt_id = item['id']
            i.save()
    return HttpResponse("Loaded items from JSON. Current item count: " +
                        str(Item.objects.count()))
def crawl(self):
    # count starts at first page
    crawling = True
    count = 0
    time.sleep(5)
    while crawling:
        searchterm = self.searchterm
        city = self.city
        prov = self.province
        url = "http://ca.indeed.com/jobs?q={0}&l=+{1}+%2C{2}&start={3}".format(
            searchterm, city, prov, count)
        print(url, 'current URL')
        page = requests.get(url)
        tree = html.fromstring(page.text)
        # Clean the html by removing <b></b> tags in the description.
        # These tags caused a bug where descriptions were fragmented on multiple rows.
        cleaner = Cleaner()
        cleaner.remove_tags = ['b']
        tree = cleaner.clean_html(tree)
        jobtitles = tree.xpath('//h2[@class="jobtitle"]/a/text()')
        joblinks = tree.xpath('//h2[@class="jobtitle"]/a/@href')
        job_descriptions = tree.xpath('//span[@class="summary"]/text()')
        jobtitles = (job.lstrip() for job in jobtitles)
        joblinks = (job.lstrip() for job in joblinks)
        Database.add_entry(zip(jobtitles, joblinks, job_descriptions))
        link_pages = tree.xpath('//div[@class="pagination"]/a/@href')
        print(link_pages, 'link_pages')
        # Look for the Next button;
        # once it is no longer present we have reached the last page.
        next_button = tree.xpath('//*[@id="resultsCol"]/div/a/span/span/text()')
        next_button_str = ''.join(next_button)
        print(next_button)
        if 'Next' in next_button_str:
            print('found next, will continue scraping...')
        else:
            print('Hit last page, crawler will stop...')
            crawling = False
        for page in link_pages:
            # Take the digits from the end of the url; the last 6 characters
            # are enough, as the offset is unlikely to be any bigger.
            p = page[-6:]
            digits_url = ''.join(d for d in p if d.isdigit())
            try:
                print(digits_url, 'digits url')
                # int() raises ValueError on the first page, whose url carries no offset
                if int(digits_url) > count:
                    print(page, 'page')
                    count = int(digits_url)
                    print(count, 'count')
                else:
                    print(digits_url, 'current count {}'.format(count))
            except ValueError:
                print('This failed', digits_url)
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    # remove_tags strips the markup but keeps the text; kill_tags drops content too
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
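# A minimal usage sketch of the factory above (the sample markup is an
# assumption for illustration, not from the original project): remove_tags
# strips the listed tags but keeps their text, while kill_tags drops the
# elements together with their content.
html_cleaner = cleaner_parameters()
sample = '<div><h1>Title</h1><script>alert(1)</script><p>Keep <b>this</b> text.</p></div>'
# The <script> element disappears entirely; the <h1>/<p>/<b> markup is
# stripped but its text survives inside the root <div>.
print(html_cleaner.clean_html(sample))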
def clearTag_old(self, text: str) -> str:
    import lxml.html
    from lxml.html.clean import Cleaner
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong", "div", "body", "br", "a", "p", "blockquote", "h3", "ol",
        "li", "font",
    ]
    # clean_html() returns an element here, so serialize it before decoding
    return lxml.html.tostring(
        cleaner.clean_html(lxml.html.document_fromstring(text))).decode("utf-8")
def _get_cleaner(self, print_style, print_js, remove_tags):
    c = Cleaner()
    c.scripts = not print_js
    c.javascript = not print_js
    c.style = not print_style
    c.remove_tags = remove_tags
    c.page_structure = False
    return c
def create_html_cleaner(self):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # void elements: strip the tags themselves, they carry no text content
    cleaner.remove_tags = [
        'br', 'hr', 'img', 'basefont', 'area', 'base', 'col', 'embed',
        'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
    ]
    return cleaner
def cleaner_li(self):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.safe_attrs_only = True
    cleaner.remove_tags = ['i', 'span', 'b', 'li']
    cleaner.safe_attrs = ['href']
    return cleaner
def validate(self, data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    data["name"] = lxml.html.document_fromstring(
        cleaner.clean_html(data["name"])).text_content()
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = [
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body',
        'div', 'span', 'p'
    ]
    cleaner.kill_tags = [
        'table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta',
        'label', 'li', 'ul', 'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub'
    ]
    return cleaner
def filter_html(html):
    cleaner = Cleaner(javascript=True, scripts=True, style=True,
                      embedded=False, remove_unknown_tags=True)
    cleaner.remove_tags = ['div', 'font', 'strong', 'u', 'em', 'b']
    html = cleaner.clean_html(html)
    tree = lxml.html.fromstring(html)
    # mark every link as untrusted and make it open in a new tab
    for a in tree.xpath('//a'):
        a.set('rel', 'nofollow')
        a.set('target', '_blank')
    return lxml.html.tostring(tree)
def clean(self):
    cleaner = Cleaner(page_structure=False)
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.allow_tags = []
    cleaner.remove_tags = ['p', 'div', 'a']

    def to_text(value):
        return lxml.html.document_fromstring(
            cleaner.clean_html(value)).text_content()

    self.name = to_text(self.name)
    self.price = to_text(self.price)
    self.discountcode = to_text(self.discountcode)
    self.categorycode = to_text(self.categorycode)
    self.orderdate = to_text(self.orderdate)
    self.selldate = to_text(self.selldate)
    self.page = to_text(self.page)
def validate(self, data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]

    def to_text(value):
        return lxml.html.document_fromstring(
            cleaner.clean_html(value)).text_content()

    data["username"] = to_text(data["username"])
    data["storename"] = to_text(data["storename"])
    data["email"] = to_text(data["email"])
    return data
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = [
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'img', 'area',
        'map', 'noscript', 'td', 'tr', 'table', 'a', 'p', 'br', 'li', 'ul'
    ]
    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print(detail)
        return False
    return cleaner.clean_html(body).text_content().split()
def get_current_players(self, info_tree):
    table_head_pat = '//table[@id="players"]//thead//tr//th'
    # Some but not all headers have an <a> for sorting columns
    # that needs to be removed
    cleaner = Cleaner()
    cleaner.remove_tags = ['a']
    headings = []
    required_headings = {
        'Name', 'Perk', 'Dosh', 'Health', 'Kills', 'Ping', 'Admin'
    }
    for heading in info_tree.xpath(table_head_pat):
        heading = cleaner.clean_html(heading)
        headings += heading.xpath('//th/text()')
    if not required_headings.issubset(set(headings)):
        logger.error("Player is missing columns ({}) on {}".format(
            required_headings - set(headings), self.server.name))

    player_rows_pat = '//table[@id="players"]//tbody//tr'
    player_rows_tree = info_tree.xpath(player_rows_pat)
    players_table = []
    for player_row in player_rows_tree:
        values = []
        for value in player_row:
            if not value.text_content():
                values += [None]
            else:
                values += [value.text_content()]
        if values[0] == "There are no players":
            logger.debug("No players on server {}".format(self.server.name))
        elif len(values) != len(headings):
            logger.warning("Player row ({}) length did not "
                           "match the table length on {}".format(
                               player_row[headings.index("Name")],
                               self.server.name))
        else:
            players_table += [values]
    return (headings, players_table)
def clean(self: T) -> str:
    cleaner = Cleaner()
    cleaner.style = self.__style
    cleaner.links = self.__links
    cleaner.page_structure = self.__page_structure
    cleaner.safe_attrs_only = self.__safe_attrs_only
    # allow_tags and remove_unknown_tags can't work together
    if self.__allow_tags is not None:
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = self.__allow_tags
    if self.__kill_tags is not None:
        cleaner.kill_tags = self.__kill_tags
    if self.__remove_tags is not None:
        cleaner.remove_tags = self.__remove_tags
    if self.__safe_attrs is not None:
        cleaner.safe_attrs = self.__safe_attrs
    self.__input = cleaner.clean_html(self.__input)
    return self.__input
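# The comment above points at a real lxml constraint: allow_tags and
# remove_unknown_tags cannot both be active, so remove_unknown_tags must be
# switched off before allow_tags is assigned. A standalone sketch of that
# interplay (sample markup assumed for illustration):
from lxml.html.clean import Cleaner

whitelist_cleaner = Cleaner()
whitelist_cleaner.remove_unknown_tags = False  # required before using allow_tags
whitelist_cleaner.allow_tags = ['p', 'a']      # every other tag is stripped, its text kept
print(whitelist_cleaner.clean_html(
    '<div><p>Hi <em>there</em>, <a href="/x">a link</a></p></div>'))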
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)  # a Cleaner can be applied in place by calling it on the document
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
def validate(self, data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]

    def to_text(value):
        return lxml.html.document_fromstring(
            cleaner.clean_html(value)).text_content()

    data["name"] = to_text(data["name"])
    data["price"] = to_text(data["price"])
    data["itemid"] = to_text(data["itemid"])
    data["discountcode"] = to_text(data["discountcode"])
    data["orderdate"] = to_text(data["orderdate"])
    data["selldate"] = to_text(data["selldate"])
    data["page"] = to_text(data["page"])
    if data["qty"] < 0:
        data["qty"] = 0
    return data
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
    'small', 'sub', 'sup', 'wbr'
]  # 'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',
    'iframe', 'img', 'label', 'link', 'map', 'math', 'nav', 'noscript',
    'object', 'picture', 'style', 'svg', 'time', 'video'
]  # 'area', 'table' # 'header'

# validation
TEI_VALID_TAGS = set(
    ['code', 'del', 'div', 'head', 'hi', 'item', 'lb', 'list', 'p', 'quote'])
TEI_VALID_ATTRS = set(['rendition'])

# counters
tokens_posts = 0
tokens_comments = 0
from functools import wraps

from flask import Flask, request, session, render_template, url_for
from flask import abort, redirect, Markup, make_response
from flask_common import Common
from names import get_full_name
from raven.contrib.flask import Sentry
from flask_qrcode import QRcode

from . import storage
from urllib.parse import quote
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.remove_tags = ['script', 'style', 'link']
# lxml's Cleaner has no `allow_attributes`/`remove_attributes` options; the
# supported equivalent is the safe_attrs whitelist, which strips everything
# else, including presentational attributes and all `on*` event handlers.
cleaner.safe_attrs_only = True
cleaner.safe_attrs = frozenset(['alt', 'href'])
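# To illustrate the safe_attrs whitelist configured above (sample markup
# assumed): every attribute outside the whitelist is dropped, including the
# on* event handlers that the original attribute list enumerated one by one.
dirty = '<p class="x" onclick="evil()">Hello <a href="/b" target="_blank">world</a></p>'
print(cleaner.clean_html(dirty))
# class, onclick and target are stripped; href survives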
           stopwords.words('english')]
exclude_list = [item for sublist in exclude for item in sublist]
exclude_list.append('')

word_list = []
with open(args.infile, 'rb') as infile:
    soup = bs(infile)
    if len(soup) > 0:
        if soup.find('title') is not None:
            title = soup.find('title').contents[0]
            print(title)
        body = soup.findAll('p')
        print(body)
        cleaner = Cleaner()
        cleaner.remove_tags = ['p']
        for x in body:
            document = lxml.html.document_fromstring(str(x))
            word_list.append(document.text_content())
        word_list = [re.sub("\\n", '', word) for word in word_list]
        word_list = [word.split(' ') for word in word_list]
        word_list = [item for sublist in word_list for item in sublist]
        wordslist2 = []
        for word in word_list:
            try:
                # strip punctuation (keeping double quotes) and lowercase
                word = word.translate(
                    str.maketrans('', '', string.punctuation.replace('"', ''))
                ).lower()
                wordslist2.append(word)
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False  # True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False  # True
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = ['a', 'abbr', 'acronym', 'address', 'big', 'cite',
#     'dd', 'font', 'ins', 'meta', 'span', 'small', 'sub', 'sup', 'wbr']
# 'center', 'table', 'tbody', 'td', 'th', 'tr',
HTML_CLEANER.remove_tags = ['img']
HTML_CLEANER.kill_tags = ['aside', 'del']  # 'area', 'table' # 'header'

CUT_EMPTY_ELEMS = {
    'article', 'b', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'li',
    'main', 'p', 'section', 'span', 'strong', 'td'
}  # 'meta',

MANUALLY_CLEANED = [
    'audio', 'blink', 'button', 'canvas', 'embed', 'figure', 'footer', 'form',
    'head', 'iframe', 'input', 'link', 'map', 'marquee', 'math', 'nav',
    'noscript', 'object', 'picture', 'script', 'style', 'svg', 'time', 'video'
]  # 'frame' 'frameset' 'source', 'img',
def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']
    # invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except ValueError:
        # lxml rejects unicode strings that carry an encoding declaration:
        # "Please use bytes input or XML fragments without declaration."
        return u""
    page8 = page
    page8 = re.sub(u'\n', ' ', page8)  # remove NL
    # page8 = re.sub(u'\s','',page8,re.UNICODE) # blanks -> space
    page8 = re.sub(u'\r', ' ', page8)  # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(r'\[an error occurred while processing this directive\]',
                   ' ', page8)
    page8 = re.sub(r'>\s*?<', '><', page8)  # remove blanks between tags
    # cycle to remove spurious divs
    for count in range(1, 20):
        # remove words under 10 chars between tags
        page8 = re.sub(u'>.{0,10}<', '><', page8)
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)
    page8 = re.sub(r'\s+', ' ', page8)  # remove repeated blanks

    # XPATHs (the second expression supersedes the first)
    xpath = '//*[((p) or (a) or (b) or (div) or (span))]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'
    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()
    content = u""
    if text:
        for s in text:
            # squash duplicate whitespaces
            s = ' '.join(s.split())
            # drop short lines: on empirical analysis, no unfrequent sentence
            # under 40 chars is a relevant part of the article text (they are
            # repetitions of title, authors, dates, etc.)
            if len(s) < 40:
                continue
            # remove leading whitespace
            if s.startswith(" "):
                s = s[1:]
            content += s
            content += "\n"
    return content
"articles of unnecessary stuff.") parser.add_argument("-f","--file", help="metalink article name",required=True) args = parser.parse_args() filename = args.file # file check if not os.path.isfile(filename) and not os.access(sys.argv[1], os.R_OK): print "WARNING - Couldn't find specified file!" sys.exit(1) elif not os.path.exists('original'): print 'Creating original directory for backups...' os.makedirs('original') # cleaner cleaner = Cleaner(page_structure=False) cleaner.remove_tags = ["span"] cleaner.kill_tags = ["script","img","style"] # original file conversion original = codecs.open(filename,"r","cp866") for line in original: line = re.sub(r"[^\x00-\x7F]+","",line) #if " " in line: #line = re.sub(r" ", "", line) if "®" in line: line = line.replace("®","") number = re.search(r"<span style=\"display:none\">\d+</span>", line) if number: line = re.sub(r"<span style=\"display:none\">\d+</span>", "", line) footer = re.search(r"Didn't find what you are looking for\?", line)
import re
import os

import lxml.html
from bs4 import BeautifulSoup
from lxml.html.clean import Cleaner
from lxml.etree import XMLSyntaxError

from store_helper import StoreHelper
from text_helper import TextHelper

cleaner = Cleaner()
cleaner.javascript = True  # True to activate the javascript filter
cleaner.style = True  # True to activate the styles & stylesheet filter
cleaner.inline_style = True
cleaner.whitelist_tags = set([])
cleaner.remove_tags = [
    'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'h1', 'h2', 'h3',
    'h4', 'h5', 'span'
]
cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label']


class HTMLHelper(object):
    @staticmethod
    def remove_tag(web_source):
        text = re.sub(r'<[^>]+>', '', web_source)
        return text

    @staticmethod
    def get_text(web_source):
        try:
            _html = lxml.html.document_fromstring(web_source)
        except XMLSyntaxError:
def parse_html(self, filename):
    print(filename)
    parser = etree.HTMLParser()
    try:
        tree = html.parse(filename)
    except:
        return False, ''
    content = {}
    # page title, with the site-name suffix (TAAZE, the bookstore) stripped
    title = tree.xpath("//meta[@name='keywords']/@content")[0].replace(
        '- TAAZE 讀冊生活', '')
    content['title'] = title
    prod_info = tree.xpath("//li//span")
    for p in prod_info:
        if p.text != None and '作者' in p.text:  # author
            for info in p.iter('a'):
                # strip role suffixes such as "/author", "/editor", "/illustrator"
                author = info.text.replace('/著', '').replace('/編著', '').replace(
                    '/編', '').replace('/撰文', '').replace('/總編輯', '').replace('/繪', '')
                author = author.replace('/譯', '').replace('/小說改編', '').replace(
                    '/原著劇本', '').replace('/資料提供', '').replace('/企劃主編', '')
                author = author.replace('/改編', '').replace('/原著', '').replace(
                    '/口述', '').replace('/作', '').replace('/繪,文', '').replace(' ', '')
                content['author'] = author
        elif p.text != None and '譯者' in p.text:  # translator
            for info in p.iter('a'):
                content['translator'] = info.text
        elif p.text != None and '出版社' in p.text:  # publisher
            for info in p.iter('a'):
                content['publisher'] = info.text
        elif p.text != None and '出版日期' in p.text:  # publish date
            for info in p.iter('span'):
                content['publish_date'] = info.text
        elif p.text != None and 'ISBN' in p.text:
            for info in p.iter('span'):
                content['ISBN_no'] = info.text
        elif p.text != None and '類別' in p.text:  # genre
            content['genre'] = []
            for info in tree.xpath('//li//span/following-sibling::span/a'):
                if info.attrib.get('class') == 'linkStyle02':
                    content['genre'].append(info.text)
    tag_info = tree.xpath("//a[@class='tag']")
    content['tag'] = []
    for p in tag_info:
        content['tag'].append(p.text)
    brief_info = tree.xpath("//div[@id='prodPfDiv']")
    desc = ''
    # promotional phrases that mark the end of the useful description
    ad_word = ['關鍵特色', '好評推薦', '作者簡介', '佳評如潮', '暢銷書', '本書特色']
    if len(brief_info):
        for child in brief_info[0]:
            if child.text != None and '作者簡介' in child.text:  # "about the author"
                break
            cleaner = Cleaner()
            cleaner.remove_tags = [
                'p', 'br', 'span', 'font', 'b', 'center', 'u', 'strong'
            ]
            innertext = etree.tostring(
                child, encoding='unicode',
                method='html').replace("<div>", "").replace("</div>", "")
            cleaned = cleaner.clean_html(innertext)
            if len(cleaned):
                cleaned = cleaned.replace("<div>", "").replace("</div>", "")
                ad_exist = False
                for word in ad_word:
                    if word in cleaned:
                        ad_exist = True
                if ad_exist == True:
                    break
                desc += cleaned
    content['description'] = desc
    head, tail = os.path.split(filename)
    tazze_link = 'http://www.taaze.tw/sing.html?pid=' + tail[:-5]
    content['link'] = [tazze_link]
    if 'ISBN_no' not in content.keys():
        content['ISBN_no'] = tail[:-5]
    filename = "{}.json".format(content['ISBN_no'])
    file_path = os.path.join(self.json_folder, filename)
    with open(file_path, "w") as output:
        output.write(json.dumps(content, ensure_ascii=False))
    return True, content['ISBN_no']
import os
import re
import codecs
import sys

from bs4 import BeautifulSoup
import lxml.html
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.scripts = True  # True to activate the javascript filter
cleaner.style = True
# note: 'href' is an attribute, not a tag, so that entry never matches
cleaner.kill_tags = ['a', 'img', 'href']
cleaner.remove_tags = ['div', 'span', 'li']

directory1 = "C:\\Users\\Satanu\\html_test\\"
directory2 = "C:\\Users\\Satanu\\text\\"

for filename in os.listdir(directory1):
    to_write = []
    html = codecs.open(directory1 + filename, 'r', 'utf-8')
    raw = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(directory1 + filename)),
        encoding='unicode')
    name = filename.strip('html')
    text = codecs.open(directory2 + filename, 'w', 'utf-8')
    text.write(raw)
    soup = BeautifulSoup(raw, 'html')
def parse_books_html(self, filename):
    parser = etree.HTMLParser()
    try:
        tree = html.parse(filename)
    except:
        return False, '', False
    try:
        content = {}
        title = tree.xpath("//title")[0]
        if title == None:
            return False, '', False
        # strip the site-name prefix (books.com.tw)
        content['title'] = title.text.replace('博客來-', '')
        property_info = tree.xpath(
            "//meta[@name='description']")[0].attrib.get('content')
        property_list = property_info.split(",")
        for item in property_list:
            if 'ISBN' in item:
                content['ISBN_no'] = item[5:]
            elif '出版社' in item:  # publisher
                content['publisher'] = item[4:]
            elif '作者' in item:  # author
                content['author'] = item[3:]
            elif '譯者' in item:  # translator
                content['translator'] = item[3:]
            elif '出版日期' in item:  # publish date
                content['publish_date'] = item[5:].replace('/', '-')
        genre_info = tree.xpath(
            "//div[@class='mod_b type02_m058 clearfix']//ul[@class='sort']")
        for p in genre_info:
            content['genre'] = []
            for item in p.iter('a'):
                content['genre'].append(item.text)
        brief_info = tree.xpath("//div[@itemprop='description']")
        desc = ''
        # promotional phrases that mark the end of the useful description
        ad_word = ['關鍵特色', '好評推薦', '作者簡介', '佳評如潮', '暢銷書', '本書特色']
        if len(brief_info):
            for child in brief_info[0]:
                if child.text != None and '作者簡介' in child.text:
                    break
                cleaner = Cleaner()
                cleaner.remove_tags = [
                    'p', 'br', 'span', 'font', 'b', 'center', 'u', 'strong'
                ]
                innertext = etree.tostring(
                    child, encoding='unicode', method='html').replace(
                        "<div>", "").replace("</div>", "").replace(
                            "\u3000", '').replace('\n', '').replace('\r', '')
                cleaned = cleaner.clean_html(innertext)
                if len(cleaned):
                    cleaned = cleaned.replace("<div>", "").replace("</div>", "")
                    desc += cleaned
        content['description'] = desc
        head, tail = os.path.split(filename)
        loc_idx = tail.find('loc=')
        pid = tail[:loc_idx - 1]
        content['link'] = ['http://www.books.com.tw/products/' + tail[:-5]]
        if 'ISBN_no' not in content.keys():
            content['ISBN_no'] = pid
        # download the cover image
        img_link = tree.xpath(
            "//meta[@property='og:image']")[0].attrib.get('content')
        r = requests.get(img_link, headers=self.header)
        image_status = False
        if r.status_code == 200:
            filename = '{}.jpg'.format(content['ISBN_no'])
            output_file = os.path.join(self.books_img_folder, filename)
            if not os.path.exists(output_file):
                with open(output_file, "wb") as output:
                    output.write(r.content)
                image_status = True
        filename = "{}.json".format(content['ISBN_no'])
        file_path = os.path.join(self.books_json_folder, filename)
        with open(file_path, "w") as output:
            output.write(json.dumps(content, ensure_ascii=False))
        return True, content['ISBN_no'], image_status
    except Exception as e:
        print(filename)
        print(traceback.format_exc())
        return False, '', False
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    if include_tables is False:
        MANUALLY_CLEANED.append('table')
    if include_images is False:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
        MANUALLY_CLEANED.extend(['figure', 'picture', 'source'])
        MANUALLY_STRIPPED.append('img')
    for expression in MANUALLY_CLEANED:
        for element in tree.getiterator(expression):
            try:
                element.drop_tree()
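# tree_cleaning() relies on drop_tree(), which removes an element with its
# children but keeps its tail text, whereas drop_tag() (what Cleaner's
# remove_tags uses) also keeps the children. A standalone sketch of the
# difference (sample markup assumed for illustration):
import lxml.html

doc = lxml.html.fromstring('<div><nav>menu</nav>tail <p>body <b>bold</b></p></div>')
doc.find('nav').drop_tree()   # <nav> and 'menu' disappear, 'tail ' survives
doc.find('.//b').drop_tag()   # only the <b> markup disappears, 'bold' survives
print(lxml.html.tostring(doc))  # b'<div>tail <p>body bold</p></div>'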
           stopwords.words('english')]
exclude_list = [item for sublist in exclude for item in sublist]
exclude_list.append('')

for iff in infiles:
    wordslist = []
    with open("all_data/" + iff, 'rb') as temp:
        soup = bs(temp)
        if len(soup) > 0:
            if soup.find('title') is not None:
                title = soup.find('title').contents[0]
            body = soup.findAll('p')
            cleaner = Cleaner()
            cleaner.remove_tags = ['p']
            for x in body:
                document = lxml.html.document_fromstring(str(x))
                wordslist.append(document.text_content())
            wordslist = [re.sub("\\n", '', word) for word in wordslist]
            wordslist = [word.split(' ') for word in wordslist]
            wordslist = [item for sublist in wordslist for item in sublist]
            wordslist2 = []
            for word in wordslist:
                try:
                    # strip punctuation (keeping double quotes) and lowercase
                    word = word.translate(
                        str.maketrans('', '', string.punctuation.replace('"', ''))
                    ).lower()
                    wordslist2.append(word)
                except TypeError:
                    pass
            wordslist = [word for word in wordslist2
                         if word not in set(exclude_list)]
def google_news_cut(link):
    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True  # True to activate the styles & stylesheet filter
    page = get_web_page(link)
    soup = BeautifulSoup(page, 'html.parser')
    # all_news = soup.find_all('a', 'nuEeue hzdq5d ME7ew')
    all_news = soup.find_all('a', 'ipQwMb Q7tWef')
    key_str = ""
    titles_link = []
    word_t_list = []
    documents = []
    for news in all_news:
        # resolve relative Google News links against the site root
        if re.match(r'\./', news['href']) is None:
            link = news['href']
        else:
            link = 'https://news.google.com/' + re.sub(r'\./', "", news['href'])
        titles_link.append({'title': news.string, 'link': link})
        key_str = key_str + news.string + "\n"
    # outlet names and other noise tokens to exclude from the segmented words
    remove_words = [
        'mlb', 'nba', '新聞網', '中央社', '報紙', '聯合', '時報', '全網', '自己',
        '中時', '年月日', '直播', '三立', '聞網', '使用者', '中國時報', '自由時報',
        '關鍵字', '網站', '發表', '留言', '發言', '網小時', '自由'
    ]
    jieba.load_userdict("my_dict.txt")
    jieba.load_userdict("news_dict.txt")
    jieba.analyse.set_stop_words("stop_words.txt")
    jieba.analyse.set_stop_words("stop_words_sport.txt")
    for t_link in titles_link:
        print('get_web_page: ', t_link['title'], " ", t_link['link'])
        try:
            page = get_web_page_html(t_link['link'])
        except requests.exceptions.SSLError:
            continue
        except lxml.etree.ParserError:
            continue
        if page is None:
            continue
        cleaner.kill_tags = ['a', 'img']
        cleaner.remove_tags = ['div', 'p']
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = ['p']
        result = html.tostring(cleaner.clean_html(page),
                               encoding="utf-8",
                               pretty_print=True,
                               method="html")
        article_content = re.sub(' ', "", result.decode('utf-8'))
        # article_content = re.sub(u'[^\u4E00-\u9FA5]', " ", article_content)
        article_content = re.sub(r'[\n\xa0\W你妳我他她它們]', "", article_content)
        article_content = re.sub('自己', "", article_content)
        words_t = jieba.cut(article_content, cut_all=False)
        word_t_list = [word for word in words_t if word not in remove_words]
        print(word_t_list)
        documents.append(word_t_list)
    return documents
import re
import lxml.html
from lxml.html.clean import Cleaner
from urllib.request import urlopen


def separatewords(text):
    splitter = re.compile(r'\W*')
    return [s.lower() for s in splitter.split(text) if s != '']


cleaner = Cleaner()
cleaner.scripts = True
cleaner.style = True
cleaner.links = True
cleaner.meta = False
cleaner.page_structure = False
cleaner.remove_tags = [
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'img', 'area', 'map',
    'noscript', 'td', 'tr', 'table', 'a', 'p', 'br', 'li', 'ul'
]

url = 'http://www.news.mail.ru/'
c = urlopen(url)
html = c.read()
doc = lxml.html.fromstring(html)
path = '/html/body'
body = doc.xpath(path)[0]
words = cleaner.clean_html(body).text_content()
print(words)
print(words.split())
print([s for s in words.split()])