def tokenize(n, tagsDict):
    """Build a per-document word-count DataFrame for spam files 0..n-1.

    Row i counts, for every word in tagsDict, how often it occurs in
    spam/<i>.txt (lower-cased, symbols stripped), plus an 'isSpam' label.

    :param n: number of file indices to scan
    :param tagsDict: iterable of vocabulary words (one column per word)
    :return: the populated pandas DataFrame
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # BUG FIX: original passed columns=[list(tagsDict)] (a nested list).
    df = pandas.DataFrame(columns=list(tagsDict))
    for i in range(n):
        path = "spam/%d.txt" % i
        if not os.path.isfile(path):
            continue
        try:
            # Start the row with a zero count for every known word.
            row = {word: 0 for word in tagsDict}
            with open(path) as handle:  # original leaked the file handle
                content = handle.read()
            # Keep only letters and hyphens; everything else becomes a space.
            words = re.sub('[^A-Za-z-]+', ' ', content.lower()).split()
            for word in words:
                if word in row:
                    row[word] += 1
            row['isSpam'] = 'spam'
            # BUG FIX: original wrote df.ix[i[tag]], which indexes the int i
            # (TypeError); .ix is also long removed from pandas.
            df.loc[i] = row
        except Exception:
            print(traceback.format_exc())
            print(sys.exc_info()[0])
    # BUG FIX: original built the frame but never returned it.
    return df
def extract_text(self, url):
    """Fetch url.value (or treat it as literal text) and extract keywords.

    When the value looks like an http(s) URL, the page is downloaded,
    stripped of scripts/styles/header/footer, and the visible text is
    stored in self.text; otherwise the raw value is used directly.
    A FloatProgress widget tracks download/clean progress.
    """
    try:
        if url.value.startswith('http') and '://' in url.value:
            prog = FloatProgress(min=0, max=100, description='Progress')
            display(widgets.HTML('<br/>'), prog)
            tr0 = time()
            site = self.browser.get(url.value, timeout=10)
            if site.ok:
                prog.value += 50
                tr1 = time() - tr0
                t0 = time()
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                cleaner.kill_tags = ['header', 'footer']
                source_tree = etree.HTML(cleaner.clean_html(site.content))
                text = source_tree.itertext()
                t1 = time() - t0
                # keep only non-blank text nodes
                self.text = '\n'.join([n.strip() for n in text if n.strip()])
                prog.value += 50
                self.keywords_and_display(prog)
            else:
                display(
                    widgets.HTML(
                        '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'
                    ))
        else:
            # not a URL: treat the input as the text itself
            self.text = url.value
            self.keywords_and_display(False)
    except Exception as e:
        # BUG FIX: original used a Python-2 print statement; this file
        # elsewhere uses Python-3-only f-strings.
        print('Error extracting text: %s' % (e))
def init_cleaner():
    """Return a Cleaner that keeps JS/CSS but removes <pre>/<code> blocks."""
    from lxml.html.clean import Cleaner
    html_cleaner = Cleaner()
    html_cleaner.javascript = False  # keep inline javascript
    html_cleaner.style = False       # keep styles and stylesheets
    html_cleaner.kill_tags = ["pre", "code"]
    return html_cleaner
def create_word_frequencies(self):
    """Tokenise the visible text of self.content and update word counts.

    Strips the doc-content pattern and all script/style markup, then
    increments self.word_frequencies[word] (lower-cased) for every token
    and bumps self.total_word_count once per token.
    """
    document = re.sub(find_doc_content_pattern, "", self.content)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True
    try:
        document_visible_text = cleaner.clean_html(document)
    except UnicodeDecodeError:
        document_visible_text = ""
        # BUG FIX: Python-2 print statement -> print() function
        print("Unicode Error")
    for word in document_visible_text.split():
        word_stemmed = word.lower()
        # dict.get replaces the original's try/except KeyError per word
        self.word_frequencies[word_stemmed] = \
            self.word_frequencies.get(word_stemmed, 0) + 1
        self.total_word_count += 1
def getArticles(keyword):
    """Query Google News for *keyword* and return the extracted result URLs."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'chrome')]
    query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + keyword.replace(" ", "+")
    page = browser.open(query).read()
    # isolate the <div id="search"> block, then walk its <li> result items
    search_div = BeautifulSoup(page).findAll('div', attrs={'id': 'search'})[0]
    items = BeautifulSoup(str(search_div)).findAll('li')
    link_pattern = re.compile("q=.*?&")
    urls = []
    for item in items:
        anchors = BeautifulSoup(str(item)).findAll('a')
        matches = re.findall(link_pattern, str(anchors[0]))
        if len(matches) > 0:
            # strip the "q=" prefix and the trailing "&"
            urls.append(str(matches[0].replace("q=", "").replace("&", "")))
    return urls
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javacript tags and then
        create a plain-text-message by parsing the html and attaching
        links as endnotes
    """
    # NOTE(review): cStringIO and formatter are Python-2-only modules;
    # this function cannot run unchanged on Python 3.
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    # `message` is expected to be utf-8 bytes; undecodable bytes are dropped
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        # normalise https links to http in the endnotes
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def clense(text, space_replacer=' ', to_lower=True, remove_punc=True):
    """Clean an HTML string down to plain ASCII text.

    :param text: HTML/text input
    :param space_replacer: what runs of spaces collapse into
    :param to_lower: lower-case the result
    :param remove_punc: drop punctuation / non-word characters
    :return: the cleaned string
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    text = cleaner.clean_html(text.encode("utf-8")).decode("utf-8")
    # remove HTML comments, per
    # https://stackoverflow.com/questions/28208186
    text = re.sub("(<!--.*?-->)", "", text, flags=re.DOTALL)
    text = remove_tags(text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ascii characters
    # BUG FIX: original called replace("&", "and") twice; once suffices.
    text = text.replace("&", "and")
    # BUG FIX: original discarded the results of strip()/rstrip()
    # (strings are immutable); actually apply the strip.
    text = text.strip()
    text = text.replace("\r\n", "")
    text = text.replace("\n", "")
    text = text.replace("\"", "")
    if to_lower:
        text = text.lower()
    if remove_punc:
        # from https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        text = re.sub(r'[^\w\s]', '', text)  # remove punctuation marks and non-word
        text = text.replace(",", "")
    text = re.sub(' +', space_replacer, text)
    # (the original's trailing ''.join(...) discarded its result; removed)
    return text
def clean_and_update_html(html, images):
    """Strip javascript from an email HTML body and inline image attachments.

    `images` maps a Content-ID to a message part; `cid:` image sources are
    replaced with base64 data URIs, other non-http image sources are removed.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    html = lxml.html.tostring(
        cleaner.clean_html(lxml.html.fromstring(html)),
        method='html',
        encoding='unicode',
        doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ' "http://www.w3.org/TR/html4/strict.dtd">')
    # forwarding
    # NOTE(review): presumably a forwarded message is separated from its
    # wrapper by a run of blank lines; keep only the part after it -- confirm.
    if "\n\n\n\n" in html:
        html = re.split("\n\n\n\n", html, 1, re.I)[1]
    block = re.findall("<img[^<]*src[^<]*>", html)
    for answer in block:
        start_quote = answer.index("src=")
        indices = []  #start/end indices of the link
        for i in range(start_quote, len(answer)):
            if (answer[i] == '"'):
                indices.append(i)
        # the src URL sits between the first two quotes after src=
        website = answer[indices[0] + 1:indices[1]]
        if "cid:" in website:
            # Attachment!
            cid = website.split(":")[1]
            if (cid in images):
                # inline the attachment as a base64 data URI
                html = html.replace(
                    website,
                    f"data:{images[cid].get_content_type()};base64,{images[cid].get_payload()}"
                )
            else:
                html = html.replace(website, '')
        elif 'http' not in website:
            # drop the whole <img> tag for unresolvable local sources
            html = html.replace(answer, '')
    return html
def _statistica_(url_string):
    """Extract the document and metadata from rivista-statistica.

    Returns a JSON string with 'title', 'content' and 'doi'.
    """
    # NOTE(review): urlparse/httplib are Python-2 module names.
    url = urlparse.urlparse(url_string)
    conn = httplib.HTTPConnection(url.hostname)
    conn.request("GET", url.path)
    res = conn.getresponse()
    body = res.read()
    my_page = html.fromstring(body)
    # Remove the site's cookie banner
    for el in my_page.xpath('//*[@id="cookiesAlert"]'):
        el.getparent().remove(el)
    # Remove all script tags and their content
    cleaner = Cleaner()
    cleaner.javascript = True
    my_page = cleaner.clean_html(my_page)
    title = my_page.xpath('//*[@id="articleTitle"]/h3')
    full_content = my_page.xpath('//*[@id="content"]')
    doi = my_page.xpath('//*[@id="pub-id::doi"]')
    # rewrite relative links against the original URL, then serialize
    full_content = ''.join(
        [etree.tostring(fix_links(el, url_string)) for el in full_content])
    result = {
        'title': title[0].text_content(),
        'content': full_content,
        'doi': doi[0].text_content()
    }
    return json.JSONEncoder().encode(result)
def get_clean_html(self, html_text, text_only=True):
    """Strip scripts/CSS from html_text; return text or serialized HTML.

    On any failure the literal string "junk" is returned.
    """
    try:
        tree = lxml.html.document_fromstring(html_text)
        self._is_etree(tree)
        # enable filters to remove Javascript and CSS, keep page structure
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.html = True
        cleaner.page_structure = False
        cleaner.meta = False
        cleaner.safe_attrs_only = False
        cleaner.links = False
        cleaned = cleaner.clean_html(tree)
        if text_only:
            return ' '.join(cleaned.text_content().split())
        res = lxml.html.tostring(cleaned)
    except Exception as e:
        logger.error(f"While parsing email in get_clean_html {e}")
        res = "junk"
    return res
def strip_tags(web_content):
    """Return the visible text of an HTML document, JS and CSS removed."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    tree = lxml.html.fromstring(web_content)
    serialized = lxml.html.tostring(cleaner.clean_html(tree))
    return BS(serialized, features="lxml").getText()
def clean_html(html_text, javascript=True, scripts=True, style=True, embedded=True, links=True, forms=True, frames=True, comments=True, annoying_tags=True, meta=True, safe_attrs_only=True, remove_unknown_tags=True, processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # For every flag below: True = remove that content | False = keep it
    cleaner = Cleaner()
    for option, enabled in (
            ('javascript', javascript), ('scripts', scripts),
            ('style', style), ('embedded', embedded), ('links', links),
            ('forms', forms), ('frames', frames), ('comments', comments),
            ('annoying_tags', annoying_tags), ('meta', meta),
            ('safe_attrs_only', safe_attrs_only),
            ('remove_unknown_tags', remove_unknown_tags),
            ('processing_instructions', processing_instructions)):
        setattr(cleaner, option, enabled)
    cleaner.page_structure = False  # always keep the page structure
    cleaned = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(cleaned)
def cleaner_parameters():
    """Return a Cleaner configured for boilerplate removal.

    Tags in `reject_list` are killed with their content; tags in
    `accept_list` are removed but their text is kept.
    """
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    # BUG FIX: the original was missing a comma between 'h6' and 'span',
    # so implicit string concatenation produced the bogus tag 'h6span'
    # (and neither 'h6' nor 'span' was actually listed).
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span',
        'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
def html2text(html):
    """Clean an HTML document and return its lower-cased text, or None.

    Returns None when parsing fails or the extracted text is not longer
    than MINSIZE_CHARSDOC.
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True
    try:
        document = lxml.html.document_fromstring(html)
        cleaned = cleaner.clean_html(document)
        soup = BeautifulSoup(lxml.html.tostring(cleaned), 'lxml')
        parsed_text = soup.get_text()
        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except Exception:
        # BUG FIX: was a bare `except:` which would also swallow
        # SystemExit/KeyboardInterrupt.
        return None
def clean_article_html(cls, node):
    """Sanitise *node*: drop JS/CSS, keep only a small inline-tag whitelist."""
    sanitizer = Cleaner()
    sanitizer.javascript = True
    sanitizer.style = True
    sanitizer.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
    # unknown tags are unwrapped rather than removed
    sanitizer.remove_unknown_tags = False
    return sanitizer.clean_html(node)
def remove_script_and_style(html_content):
    """Strip <script> tags, inline JS, and styles from an HTML fragment."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    scrubber.kill_tags = ['script']
    return scrubber.clean_html(html_content)
def get_context(web_link, answer):
    """Load *web_link* in Firefox, flatten it to text, and return ~2000
    characters of context surrounding the longest match with *answer*.
    """
    browser = webdriver.Firefox(
        executable_path='geckodriver-v0.26.0-win64/geckodriver')
    browser.get(web_link)
    html_source = browser.page_source
    get_context = BeautifulSoup(html_source, "lxml")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    web_page_text = ''
    for element in get_context:
        # clean each top-level element, then strip any residual tags
        element_string = lxml.html.document_fromstring(str(element))
        page_text = lxml.html.tostring(cleaner.clean_html(element_string))
        page_text = re.sub("<.*?>", " ", str(page_text))
        web_page_text = web_page_text + " " + page_text
    browser.close()
    # find the longest common block between the page text and the answer
    matcher = difflib.SequenceMatcher(None, web_page_text, answer)
    match = matcher.find_longest_match(0, len(web_page_text), 0, len(answer))
    # window: up to ~1000 chars before the match, ~2000 chars total
    if match.a > 1000:
        start_context = match.a - 999
    else:
        start_context = 0
    if len(web_page_text) > start_context + 2000:
        end_context = start_context + 2000
    else:
        end_context = len(web_page_text) - 1
    context = web_page_text[start_context:end_context]
    return context
def lxml_extractor(html, url):
    '''LXML PARSER

    Clean *html* with lxml (killing NEGATIVE_K tags, allowing POSITIVE_K)
    and hand the result to soup_extractor; falls back to soup_extractor on
    the raw input when lxml rejects it.
    '''
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    # BUG FIX: the attribute is `allow_tags`; the original assigned a
    # non-existent `allow_tag`, which lxml silently ignored.  allow_tags
    # additionally requires remove_unknown_tags to be off, otherwise
    # Cleaner raises at clean time.
    cleaner.allow_tags = POSITIVE_K
    cleaner.remove_unknown_tags = False
    cleaner.safe_attrs_only = True
    try:
        parsed = lxml.html.fromstring(html, base_url="url")
        tree = cleaner.clean_html(parsed)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        # e.g. "Unicode strings with encoding declaration are not supported"
        doc = soup_extractor(html, url)
    return doc
def cleanpage(html):
    """Aggressively clean an HTML document; return u'' when cleaning fails."""
    # cleaner setup -- duplicate assignments to `links` and
    # `page_structure` in the original have been removed.
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False  # keep <html>/<head>/<body>
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b','img','h']
    cleaner.kill_tags = ['img', 'script']
    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except Exception:
        # BUG FIX: bare `except:` narrowed to Exception.  Typical failure:
        # ValueError: Unicode strings with encoding declaration are not
        # supported. Please use bytes input or XML fragments.
        content = u""
    return content
def getArticles(keyword):
    """Search Google News for *keyword*; return the result URLs found."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    agent = mechanize.Browser()
    agent.set_handle_robots(False)
    agent.addheaders = [('User-agent', 'chrome')]
    url = "http://www.google.ca/search?&tbm=nws&num=10&q=" + keyword.replace(" ", "+")
    markup = agent.open(url).read()
    # the results live inside <div id="search">, one per <li>
    results_div = BeautifulSoup(markup).findAll('div', attrs={'id': 'search'})[0]
    result_items = BeautifulSoup(str(results_div)).findAll('li')
    href_re = re.compile("q=.*?&")
    found = []
    for entry in result_items:
        first_link = BeautifulSoup(str(entry)).findAll('a')[0]
        hits = re.findall(href_re, str(first_link))
        if len(hits) > 0:
            found.append(str(hits[0].replace("q=", "").replace("&", "")))
    return found
def buildDicts(n):
    """Scan spam/ and notspam/ messages 0..n-1 and return the vocabulary.

    Each file is lower-cased, stripped of everything but letters/hyphens,
    and split into words; the union of all words is returned as a set.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    tagsDict = set()
    for i in range(n):
        # the original duplicated this whole block for spam/ and notspam/
        for folder in ("spam", "notspam"):
            path = "%s/%d.txt" % (folder, i)
            if not os.path.isfile(path):
                continue
            try:
                with open(path) as handle:  # original leaked the handle
                    content = handle.read()
                # strip symbols, keep letters and hyphens only
                words = re.sub('[^A-Za-z-]+', ' ', content.lower()).split()
                tagsDict.update(words)
            except Exception:
                print(traceback.format_exc())
                print(sys.exc_info()[0])
    # BUG FIX: the original never returned the accumulated vocabulary.
    return tagsDict
def html2text(html):
    """Flatten an HTML string to its text content, scripts/styles removed."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    tree = scrubber.clean_html(lxml.html.fromstring(html))
    # unwrap every remaining tag so only the text survives
    el.strip_tags(tree, '*')
    return tree.text
def clearTag_old(self, text: str) -> str:
    """Strip scripts, styles, images and common structural tags from *text*
    and return the cleaned markup as a string.
    """
    import lxml
    from lxml.html.clean import Cleaner
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong",
        "div",
        "body",
        "br",
        "a",
        "p",
        "blockquote",
        "h3",
        "ol",
        "li",
        "font",
    ]
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(text))
    # BUG FIX: clean_html() returns an HtmlElement, which has no .decode();
    # serialize to bytes first, then decode.
    return lxml.html.tostring(cleaned).decode("utf-8")
def cleanMe(text):
    """Return *text* with javascript and style content removed."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    return scrubber.clean_html(text)
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javacript tags and then
        create a plain-text-message by parsing the html and attaching
        links as endnotes
    """
    # NOTE(review): cStringIO and formatter are Python-2-only modules;
    # this function cannot run unchanged on Python 3.
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    # `message` is expected to be utf-8 bytes; undecodable bytes are dropped
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(formatter.DumbWriter(
        textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        # normalise https links to http in the endnotes
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def raw_scraper(url, memoize):
    """Download *url* via newspaper, strip JS/CSS, return (html, metadata).

    Returns (None, metadata) when the URL is excluded, the download/parse
    fails, or the article has no text.
    """
    started = time.time()
    failure = None, {
        "url": url,
        "scraper": "raw",
    }
    if should_exclude(url):
        # heuristic to make downloading faster
        return failure
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False,
                                    memoize_articles=memoize)
        article.download()
        html = cleaner.clean_html(minify(article.html))
        article.parse()
    except:
        # best-effort scraper: any failure just yields no result
        return failure
    if article.text == "":
        return failure
    metadata = {"url": url, "elapsed": time.time() - started, "scraper": "raw"}
    return html, metadata
def convertHtmlToDicts(url, content): """ given a url and content, create file and article dictionaries content has to include normal newlines, no \a or #N# replacers returns None, None on error """ # lxml does not like unicode if the document has an explicit encoding if " encoding=" not in content: content = pubGeneric.forceToUnicode(content) logging.debug("Converting to text: %s " % (repr(url))) artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url) if not "<html" in content: return None, None try: logging.debug("Parsing html with lxml, html size %d" % len(content)) tree = lxml.html.document_fromstring(content) logging.debug("end parse html") except lxml.etree.XMLSyntaxError: return None, None titleEl = tree.find("head/title") if titleEl!=None: title = titleEl.text else: logging.debug("No title found?") title = "" metaTags = tree.findall("head/meta") artDict = parseMetaData(metaTags, artDict) logging.debug("Cleaning html tree") cleaner = Cleaner() cleaner.javascript = True cleaner.style = True cleaner.meta = True cleaner.embedded = True cleaner.page_structure=True #cleaner.remove_tags = ["a", "li", "td"] cleanTree = cleaner.clean_html(tree) logging.debug("Cleaning done, now converting to ASCII") #text = cleanTree.text_content() newlineTags = ["p", "br"] asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags) logging.debug("ASCII conversion done") logging.debug("title: %s" % title) if "title" not in artDict or artDict["title"]=="": artDict["title"] = title if artDict["abstract"]=="": abstract = unidecode.unidecode(asciiText[0:1500]).strip() artDict["abstract"] = abstract logging.debug("abstract: %s" % artDict["abstract"]) fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html") logging.debug("meta data extract success: %s" % artDict) return artDict, fileDict
def _get_cleaner(self, print_style, print_js, remove_tags):
    """Build a Cleaner: strip JS unless print_js, styles unless print_style.

    *remove_tags* are unwrapped (text kept); page structure is preserved.
    """
    cleaner = Cleaner()
    cleaner.scripts = not print_js
    cleaner.javascript = not print_js
    cleaner.style = not print_style
    cleaner.remove_tags = remove_tags
    cleaner.page_structure = False
    return cleaner
def Content(content):
    """Return the newline-joined text nodes of *content*, JS/CSS removed."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    cleaned = scrubber.clean_html(html.document_fromstring(content))
    text_nodes = etree.XPath("//text()")(cleaned)
    return "\n".join(text_nodes)
def __cleanhtml(raw_html):
    """Strip javascript and styles from *raw_html*; return the visible text."""
    scrubber = Cleaner()
    scrubber.javascript = True  # activate the javascript filter
    scrubber.style = True
    stripped = scrubber.clean_html(raw_html)
    return BeautifulSoup(stripped, "lxml").text
def trim_html(html):
    """Takes a html string as input and returns the html without any styles nor javascript"""
    stripper = Cleaner()
    # drop <script> tags, inline javascript, and styles
    stripper.scripts = True
    stripper.javascript = True
    stripper.style = True
    return stripper.clean_html(html)
def cleanInputString(self, htmlString):
    """Serialize *htmlString* with javascript and styles filtered out."""
    scrubber = Cleaner()
    scrubber.javascript = True  # activate the javascript filter
    scrubber.style = True       # activate the styles & stylesheet filter
    return lxml.html.tostring(scrubber.clean_html(htmlString))
def html_strict_cleaning(html, allow_tags=None):
    """Clean *html*, keeping only *allow_tags*; return a unicode string.

    :param html: HTML input string
    :param allow_tags: tags to keep (defaults to p/br/a/img/div)
    """
    # Default moved out of the signature: a mutable list default is shared
    # across calls (same observable behavior, safer).
    if allow_tags is None:
        allow_tags = ['p', 'br', 'a', 'img', 'div']
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.allow_tags = allow_tags
    cleaner.remove_unknown_tags = False
    return lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html)),
                              encoding='unicode')
def clean_article_html(cls, node):
    """Sanitise *node*, keeping only common article/formatting tags."""
    sanitizer = Cleaner()
    sanitizer.javascript = True
    sanitizer.style = True
    sanitizer.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'ul',
                            'ol', 'li', 'em', 'i', 'code', 'pre',
                            'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    # unlisted tags are unwrapped, not deleted
    sanitizer.remove_unknown_tags = False
    return sanitizer.clean_html(node)
def create_html_cleaner(self):
    """Return a Cleaner that drops JS/CSS and all void (content-less) tags."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # void elements carry no text content, so unwrapping them loses nothing
    cleaner.remove_tags = [
        'br', 'hr', 'img', 'basefont', 'area', 'base', 'col', 'embed',
        'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
    ]
    return cleaner
def clean_article_html(cls, node):
    """Sanitise *node*, keeping inline formatting, images and headings."""
    sanitizer = Cleaner()
    sanitizer.javascript = True
    sanitizer.style = True
    sanitizer.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em',
                            'i', 'tt', 'code', 'pre', 'blockquote', 'img',
                            'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    # unlisted tags are unwrapped, not deleted
    sanitizer.remove_unknown_tags = False
    return sanitizer.clean_html(node)
def _parseAndCleanHtml(rawHtml):
    """Parse *rawHtml* with lxml and strip javascript and styles."""
    tree = lh.fromstring(rawHtml)
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    return scrubber.clean_html(tree)
def flipkart_extract_details(self, msg_fields, order_xpath, msg_params, acc_params):
    """Parse a Flipkart order-confirmation email into message/item dicts.

    NOTE(review): Python-2 print statements throughout; presumably the
    masked sender address and the trailing triple-quote (which opens a
    commented-out section) are intentional -- confirm against the full file.
    """
    messages_dict = dict()
    items_dict = dict()
    messages_dict = {'store': 'Flipkart'}
    messages_dict.update(msg_fields)
    messages_dict.update(acc_params)
    # rename the context.io message id to our own key
    messages_dict["user_contextio_uuid"] = messages_dict.pop("id")
    items_dict["store"] = messages_dict["store"]
    items_dict["user_contextio_uuid"] = messages_dict["user_contextio_uuid"]
    items_dict["user_email"] = messages_dict["user_email"]
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    tree = html.fromstring(msg_fields["body"])
    #print order_xpath
    if msg_params["sender"] == "*****@*****.**" and "confirmation" in msg_fields["subject"].strip().lower():
        #tree = html.fromstring(msg_fields["body"])
        #address = tree.xpath(order_xpath["address"])
        # print order_xpath["address"]
        print msg_fields["subject"]
        #print address
        address = list()
        # the delivery address follows the "DELIVERY ADDRESS" label
        bsObj = BeautifulSoup(msg_fields["body"])
        add = bsObj.find(text="DELIVERY ADDRESS")
        if add:
            addr = add.findNext().get_text().encode("utf-8")
            #addr = addr.replace("\xa0", " ")
            address.append(addr)
            address.append(add.findNext().findNext().get_text().encode("utf-8"))
            print type(address)
            #print address
            items_dict["delivery_address"] = str_process(address)
            print items_dict["delivery_address"]
        else:
            items_dict["delivery_address"] = ""
        """for x in range(len(order_xpath["order_id"])): order_id = tree.xpath(order_xpath["order_id"][x]) if order_id: break"""
        # the order id is the bracketed token in the subject line
        order_id = re.search(r"\[(\w+)\]", msg_fields["subject"])
        items_dict["order_id"] = order_id.group(1)
        print items_dict["order_id"]
        items_dict["item_title"] = str_process(tree.xpath(order_xpath["item_title"]))
        print items_dict["item_title"]
        item_price = tree.xpath(order_xpath["item_price"])
        items_dict["item_price"] = amount_process(item_price)
        print items_dict["item_price"]
        items_dict["item_status"] = "confirmed"
        messages_dict["order_id"] = items_dict["order_id"]
        #insert_to_items_table(**items_dict)
        #insert_to_messages_table(**messages_dict)
        """
def validate(self, data):
    """Strip markup from data["name"] and clamp data["qty"] to >= 0."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.scripts = True
    scrubber.frames = True
    scrubber.remove_tags = ["p", "div", "a"]
    cleaned = scrubber.clean_html(data["name"])
    data["name"] = lxml.html.document_fromstring(cleaned).text_content()
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def cleaned_html(self):
    """Strip scripts, javascript, comments and styles from self.dom
    (in place, to save space) and return the serialized HTML."""
    scrubber = Cleaner()
    scrubber.scripts = True
    scrubber.javascript = True
    scrubber.comments = True
    scrubber.style = True
    self.dom = scrubber.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
def url2count(title):
    """Fetch the page for *title* and return its word count (words that
    contain at least one alphanumeric character)."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    scrubber.html = True
    response = requests.get(makeurl(title), timeout=5)
    # collapse tabs/newlines to spaces before cleaning
    flattened = response.text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = nltk.clean_html(scrubber.clean_html(flattened))
    collapsed = re.sub(r'\s{2,}', ' ', text)
    has_alnum = re.compile('.*[A-Za-z0-9].*')
    words = [w for w in collapsed.split(' ') if has_alnum.match(w)]
    return (len(words))
def clean(self):
    """Sanitise every HTML-bearing field of this object in place,
    replacing each with its plain text content."""
    scrubber = Cleaner(page_structure=False)
    scrubber.javascript = True
    scrubber.scripts = True
    scrubber.frames = True
    scrubber.allow_tags = []
    scrubber.remove_tags = ['p', 'div', 'a']

    def to_text(raw):
        # clean, re-parse, and keep only the text content
        return (lxml.html.document_fromstring(scrubber.clean_html(raw))).text_content()

    self.name = to_text(self.name)
    self.price = to_text(self.price)
    self.discountcode = to_text(self.discountcode)
    self.categorycode = to_text(self.categorycode)
    self.orderdate = to_text(self.orderdate)
    self.selldate = to_text(self.selldate)
    self.page = to_text(self.page)
def GetTextData(self, htmlData, forUrl='<Mising URL info>'):
    '''Function to clean up html raw data and get the text from it. Keep it small.
    Not thread safe, returns an object that will go into the parsedData["text"]
    field for HandleData function above'''
    from lxml import html
    if self.RemoveJavaScriptAndCSS:
        # best effort: fall back to the raw html if cleaning fails
        try:
            from lxml.html.clean import Cleaner
            stripper = Cleaner()
            stripper.javascript = True
            stripper.style = True
            htmlData = stripper.clean_html(htmlData)
        except:
            print("Could not remove style and js code for url :" + forUrl)
    return html.fromstring(htmlData).text_content()
def clean_cachefiles():
    """Clean silly html from all cachefiles in the cachdir"""
    answer = input('Do you really want to strip all cache files from bloating tags such as <script> and <style>? ')
    if answer.startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        scrubber = Cleaner()
        scrubber.style = True
        scrubber.scripts = True
        scrubber.javascript = True
        for path in _get_all_cache_files():
            cache = CompressedFile(path)
            raw = cache.read()
            tree = lxml.html.fromstring(raw)
            cleaned = lxml.html.tostring(scrubber.clean_html(tree))
            cache.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(path, len(raw), len(cleaned)))
def validate(self, value):
    """Sanitise the username/storename/email fields of *value* in place.

    Each field is cleaned of scripts/frames, stripped of p/div/a markup,
    and reduced to its text content.  Returns the mutated mapping.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    # BUG FIX: the original body referenced an undefined name `data`
    # (NameError at runtime); the parameter is `value`.
    for field in ("username", "storename", "email"):
        cleaned = cleaner.clean_html(value[field])
        value[field] = lxml.html.document_fromstring(cleaned).text_content()
    return value
def remove_scripts(self):
    """Remove <script> tags from self.doc into self.modified_doc.

    Honours self.clean_js; inline javascript and the page structure
    (<html>, <head>, <body>, ...) are left untouched.
    """
    if not self.clean_js:
        logger.debug('Scripts will not be removed')
        self.parser_modified_content = False
        return
    scrubber = Cleaner()
    scrubber.page_structure = False  # keep <head>, <html>, <body> ...
    scrubber.javascript = False      # keep inline javascript
    scrubber.scripts = True          # remove <script> tags
    self.modified_doc = scrubber.clean_html(self.doc)
    self.parser_modified_content = True
    logger.debug('Scripts were successfully removed')
def get_content(url):
    """Download *url* and return its HTML with scripts and styles removed."""
    if url is None:
        return 'Body is not found!'
    tree = html.fromstring(requests.get(url).content)
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    # scrubber.remove_tags = ['br']
    return html.tostring(scrubber.clean_html(tree))
def clean_text(data):
    """Strip scripts/styles/comments/meta from *data*, return joined text."""
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    scrubber.scripts = True
    scrubber.comments = True
    scrubber.meta = True
    scrubber.annoying_tags = True
    serialized = lxml.html.tostring(scrubber.clean_html(data))
    soup = BeautifulSoup(serialized.decode('utf-8', 'ignore'))
    # strip each text node and drop the empty ones
    pieces = map(lambda x: x.strip(), soup.findAll(text=True))
    all_text = ' '.join(filter(lambda val: val, pieces))
    return all_text
def get_clean_html(etree, text_only=False):
    """Strip JS/CSS from *etree*; return its text or the serialized HTML."""
    _is_etree(etree)
    # enable filters to remove Javascript and CSS, keeping page structure
    scrubber = Cleaner()
    scrubber.javascript = True
    scrubber.style = True
    scrubber.html = True
    scrubber.page_structure = False
    scrubber.meta = False
    scrubber.safe_attrs_only = False
    scrubber.links = False
    cleaned = scrubber.clean_html(etree)
    if text_only:
        return cleaned.text_content()
    return lxml.html.tostring(cleaned)
def tokenize(n):
    """Vectorise up to *n* spam and *n* ham files and evaluate a classifier.

    Builds a bag-of-words CountVectorizer over the file paths, trains a
    LogisticRegression on an 80/20 split, and prints accuracy, a
    classification report and the confusion matrix.
    """
    # (Python-2-only reload(sys)/sys.setdefaultencoding removed; the
    # vectorizer's decode_error='ignore' already handles bad bytes.)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    def _collect(folder, limit):
        # First `limit` file paths from `folder`, in directory order.
        paths = []
        for name in os.listdir(folder):
            if len(paths) == limit:
                # BUG FIX: original said `Break` (NameError at runtime)
                break
            paths.append(os.path.join(folder, name))
        return paths

    spam_files = _collect("./spam/", n)
    ham_files = _collect("./notspam/", n)
    y = ['0'] * len(spam_files) + ['1'] * len(ham_files)
    corpus = spam_files + ham_files
    vectorizer = CountVectorizer(analyzer='word', input='filename',
                                 min_df=3, decode_error='ignore')
    spamFeatures = vectorizer.fit_transform(corpus)
    print(spamFeatures.shape, type(spamFeatures))
    X_train, X_test, y_train, y_test = train_test_split(spamFeatures, y,
                                                        test_size=0.2)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    from sklearn import metrics
    print('Accuracy:', metrics.accuracy_score(y_test, y_predicted))
    print()
    print(metrics.classification_report(y_test, y_predicted))
    print()
    print('confusion matrix')
    print()
    print(pd.DataFrame(metrics.confusion_matrix(y_test, y_predicted)))
def processDir(data_dir, output_dir):
    """Clean every HTML file in *data_dir*, segment its visible text
    with jieba, write the space-joined words per file into *output_dir*,
    and dump a global word-frequency table to ``words.dict``.

    Fix: the ``words.dict`` handle was never closed; all output files
    now use ``with`` so they are flushed and closed deterministically.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_list = os.listdir(data_dir)
    html_cleaner = Cleaner()
    html_cleaner.javascript = True
    html_cleaner.style = True
    word_dict = dict()

    def updateWordDict(word):
        # Count one occurrence of *word* in the corpus-wide table.
        if word in word_dict:
            word_dict[word] = word_dict[word] + 1
        else:
            word_dict[word] = 1

    for file_name in file_list:
        if file_name[0] == '.':
            continue  # skip hidden entries such as .DS_Store
        # Strip scripts/styles, then pull the visible text out of the tree.
        parsetree = lxml.html.parse(data_dir + '/' + file_name)
        parsetree = html_cleaner.clean_html(parsetree)
        content = parsetree.getroot().text_content()
        # Word extraction: keep CJK words as-is, lowercase Latin words.
        words = list()
        for word in jieba.cut(content):
            uchar = word[0]
            if uchar >= u'\u4e00' and uchar <= u'\u9fa5':  # CJK ideograph
                words.append(word)
                updateWordDict(word)
            if (uchar >= u'\u0041' and uchar <= u'\u005a') or \
               (uchar >= u'\u0061' and uchar <= u'\u007a'):  # ASCII letter
                word = word.lower()
                words.append(word)
                updateWordDict(word)
        text = ' '.join(words)
        # NOTE(review): path concatenation assumes output_dir carries a
        # trailing separator — matches the words.dict write below; confirm
        # against callers.
        with open(output_dir + file_name, 'w') as output:
            output.write(text.encode('utf-8'))
    with open(output_dir + 'words.dict', 'w') as output:
        for word in word_dict.keys():
            output.write(word.encode('utf-8') + ' ' +
                         str(word_dict[word]) + '\n')
def clean_text(data):
    """Parse *data* using its detected character encoding, strip
    scripts/styles/comments/meta, and return the visible text as an
    ASCII byte string (unmappable characters dropped)."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    # Sniff the document's encoding so lxml decodes it correctly.
    detected = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=detected.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    cleaned_markup = lxml.html.tostring(cleaner.clean_html(root))
    soup = BeautifulSoup(cleaned_markup.decode('utf-8', 'ignore'))
    stripped = [piece.strip() for piece in soup.findAll(text=True)]
    all_text = ' '.join(piece for piece in stripped if piece)
    return all_text.encode('ascii', 'ignore')
def validate(self, data):
    """Sanitize every user-supplied field in *data* by stripping
    scripts, frames and the listed tags down to plain text, then clamp
    the quantity to be non-negative. Returns the mutated dict.

    Bug fix: the original read ``data[qty]`` with an undefined name
    ``qty`` (NameError at runtime); it now uses the string key "qty".
    The seven copy-pasted cleaning statements are collapsed into a loop.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    # Fields scrubbed to text content, in the original order.
    for field in ("name", "price", "itemid", "discountcode",
                  "orderdate", "selldate", "page"):
        cleaned = cleaner.clean_html(data[field])
        data[field] = lxml.html.document_fromstring(cleaned).text_content()
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def get_url(self):
    """Get the relevant part of a web page."""
    response = requests.get(self.data_path)
    raw_html = response.content
    # Drop executable and presentational content before stripping tags.
    cleaner = Cleaner()
    cleaner.javascript = True   # Remove JavaScript code from HTML.
    cleaner.scripts = True      # Remove other code from HTML.
    cleaner.style = True        # Remove CSS and styles from HTML.
    cleaner.links = True        # Remove Links from HTML.
    cleaner.kill_tags = ['a', 'img']  # Remove these tags entirely.
    cleaned_html = cleaner.clean_html(raw_html)
    # Strip any remaining tags from the cleaned markup.
    stripper = TagStripper()
    stripper.feed(cleaned_html)
    return stripper.get_html_data()
def get_content(self, pathName):
    """Read the HTML file at *pathName*, strip JS/CSS from it, and
    return a BeautifulSoup tree of the cleaned markup.

    Returns None (after printing a diagnostic) when the file cannot be
    read or cleaned. Fixes: the file handle was leaked if ``read()``
    raised (now ``with``), the local shadowed builtin ``file``, and the
    bare ``except`` clauses are narrowed to ``except Exception`` so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        with open(pathName, "r") as source:
            html_text = source.read()
    except Exception:
        print("Fail to open the file located in {}".format(pathName))
        return None
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        # Keep <html>/<head>/<body> so downstream parsing sees a full page.
        cleaner.page_structure = False
        htmlData = cleaner.clean_html(html_text)
    except Exception:
        print("Could not remove style and js code from the file located in {}".format(pathName))
        return None
    soup = BeautifulSoup(htmlData, "lxml")
    return soup
def crawNews(self, url):
    """Scrape one TechCrunch article page: extract title, body, publish
    date and author via regexes against the raw markup, persist the news
    item, and link it to its (possibly newly created) author.

    NOTE(review): every re.search result is dereferenced without a None
    check — a change in the page layout will raise AttributeError here.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.comments = True
    # Fetch and serialize the whole page so the regexes below can run on it.
    tech_content = lxml.html.parse(url)
    tech_content = (lxml.html.tostring(tech_content))
    # Patterns keyed to TechCrunch's Wordpress markup.
    re_title = re.compile(r'<h1.*>(.*)</h1', re.S)
    re_content = re.compile(r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->', re.S)
    re_published = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
    re_author = re.compile(r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')
    match_title = re.search(re_title, tech_content)
    match_content = re.search(re_content, tech_content)
    match_date = re.search(re_published, tech_content)
    match_author = re.search(re_author, tech_content)
    # The author link in the page is site-relative; groups are
    # (relative URL, display name, twitter handle).
    author_url = "http://techcrunch.com" + match_author.group(1)
    author_name = match_author.group(2)
    author_twitter = match_author.group(3)
    # Title: clean, drop residual tags, collapse whitespace, decode.
    title = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_title.group(1)))
    title = re.sub(r'\s+', ' ', title)
    title = title.decode('utf-8').strip()
    # Same treatment for the article body.
    content = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_content.group(1)))
    content = re.sub(r'\s+', ' ', content)
    content = content.decode('utf-8').strip()
    content = content.strip('\n')
    published_on = datetime.datetime.strptime(match_date.group(1), '%Y-%m-%d %H:%M:%S')
    news = self.save_news(url, title, content, published_on)
    # Reuse an existing author record when the URL is already known;
    # otherwise create one with an empty bio field.
    author = self.findAuthorByUrl(author_url)
    if (isinstance(author, Author) == False):
        author = self.save_author(author_url, author_name, author_twitter, '')
    self.newsAuthor(news, author)
def get_content(self, pathName):
    """Open the HTML file at *pathName*, strip JS/CSS from its markup,
    and return the tuple (visible_text, title); returns None (after
    printing a diagnostic) when reading or cleaning fails."""
    try:
        # Fetch the document once more via urllib purely to read <title>.
        get_title = urllib.urlopen(pathName)
        soup = Soup(get_title)
        handle = open(pathName, "r")
        raw_markup = handle.read()
        handle.close()
    except:
        print("Fail to open the file located in {}".format(pathName))
        return None
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaned_markup = cleaner.clean_html(raw_markup)
    except:
        print("Could not remove style and js code from the file located in {}".format(pathName))
        return None
    try:
        title = soup.title.string.encode("utf-8")
    except:
        # Pages without a <title> fall back to an empty string.
        title = ""
    return html.fromstring(cleaned_markup).text_content(), title
def get_url(self):
    """Get the HTML body of a web page."""
    # Configure the cleaner: drop code, styling, links and listed tags.
    cleaner = Cleaner()
    cleaner.javascript = True   # Remove JavaScript code from HTML.
    cleaner.scripts = True      # Remove other code from HTML.
    cleaner.style = True        # Remove CSS and styles from HTML.
    cleaner.links = True        # Remove Links from HTML.
    cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.
    parsed = lxml.html.parse(self.data_path)
    cleaned_page = lxml.html.tostring(cleaner.clean_html(parsed))
    # Hand the result back as an in-memory file-like object.
    outfile = StringIO.StringIO()
    outfile.write(cleaned_page)
    return outfile