def html2content(html, allowed_tags=["a", "abbr", "article", "aside", "b", "base", "blockquote", "body", "br",
                                     "caption", "cite", "code", "col", "colgroup", "dd", "del", "dfn", "dl", "dt",
                                     "em", "embed", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5",
                                     "h6", "head", "header", "hgroup", "hr", "html", "i", "img", "li", "map", "mark",
                                     "math", "meta", "meter", "nav", "noscript", "object", "ol", "optgroup",
                                     "option", "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby",
                                     "s", "samp", "section", "small", "source", "span", "strong", "sub", "sup",
                                     "svg", "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr",
                                     "track", "u", "ul", "var", "video", "wbr"]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False  # was misspelled "embeded", which silently set an unused attribute
    return cleaner.clean_html(html)
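# A minimal usage sketch for html2content above (added for illustration; it
# assumes `from lxml.html.clean import Cleaner` as used throughout this page).
sample = '<p>Hello <script>alert(1)</script><b>world</b></p>'
print(html2content(sample))  # <script> is not in allowed_tags, so it is dropped; <p> and <b> survive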
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    html_tree = cleaner.clean_html(lxml.html.fromstring(html))
    lxml.etree.strip_tags(html_tree, '*')  # the original called undefined `el`; strip_tags lives in lxml.etree
    return html_tree.text
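# Hedged demo of html2text above (not part of the original source; assumes
# `import lxml.html`). After strip_tags(tree, '*') only the merged text of
# the root element remains, so all markup is gone.
demo = "<div>Some <b>bold</b> text<script>var x = 1;</script></div>"
print(html2text(demo))  # -> "Some bold text" (whitespace may differ)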
def clense(text, space_replacer=' ', to_lower=True, remove_punc=True):
    # remove HTML comments first, as suggested in
    # https://stackoverflow.com/questions/28208186/how-to-remove-html-comments-using-regex-in-python
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    text = cleaner.clean_html(text.encode("utf-8")).decode("utf-8")
    text = re.sub("(<!--.*?-->)", "", text, flags=re.DOTALL)
    text = remove_tags(text)  # helper imported elsewhere in the original module
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ASCII characters
    text = text.replace("&", "and")  # the original repeated this line twice
    text = text.strip()  # the original called strip()/rstrip() without assigning the result
    text = text.replace("\r\n", "")
    text = text.replace("\n", "")
    text = text.replace("\"", "")
    if to_lower:
        text = text.lower()
    if remove_punc:
        # from https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        text = re.sub(r'[^\w\s]', '', text)  # remove punctuation marks and non-word characters
        text = text.replace(",", "")
    text = re.sub(' +', space_replacer, text)
    return text
def create_plaintext_message(message):
    """Create a clean plain-text version of an email message.

    Parse the HTML and remove style and javascript tags, then build a
    plain-text message with the links attached as endnotes.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchor list at the bottom of the message
    # to keep the body readable
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def visit(url):
    if not url.startswith(base_url):
        return
    try:
        resp = urlopen(url)
    except URLError:
        return
    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True  # the original misspelled this as "javasript", which silently did nothing
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE
    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    #     if link.has_attr('href'):
    #         if link.has_attr('class') and 'history' in link['class']:
    #             continue
    #         next_link = urljoin(url, link['href'])
    #         next_link = urldefrag(next_link)[0]
    #         if next_link not in visited_pages:
    #             visited_pages.append(next_link)
    #             pages_to_visit.append(next_link)
    f = open("testing.txt", 'w')
    f.write(page)
    clean_page = cleaner.clean_html(page)
    f.write("\n\n\nVS\n\n\n")
    f.write(clean_page)
    f.close()
    soup = BeautifulSoup(clean_page, "lxml")
    return extract(soup, url)
def create_word_frequencies(self):
    document = re.sub(find_doc_content_pattern, "", self.content)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.style = True
    # cleaner.allow_tags = ['']
    # cleaner.remove_unknown_tags = False
    try:
        document_visible_text = cleaner.clean_html(document)
    except UnicodeDecodeError:
        document_visible_text = ""
        print "Unicode Error"
        # document_visible_text = document
    word_list = document_visible_text.split()
    for word in word_list:
        word_stemmed = word.lower()
        try:
            self.word_frequencies[word_stemmed] = self.word_frequencies[word_stemmed] + 1
        except KeyError:  # the original used a bare except for the missing-key case
            self.word_frequencies[word_stemmed] = 1
        self.total_word_count = self.total_word_count + 1
def visit(url):
    if not url.startswith(base_url):
        return
    try:
        resp = urlopen(url)
    except URLError:
        return
    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True  # was misspelled "javasript" in the original
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE
    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    #     if link.has_attr('href'):
    #         if link.has_attr('class') and 'history' in link['class']:
    #             continue
    #         next_link = urljoin(url, link['href'])
    #         next_link = urldefrag(next_link)[0]
    #         if next_link not in visited_pages:
    #             visited_pages.append(next_link)
    #             pages_to_visit.append(next_link)
    clean_page = cleaner.clean_html(page)
    soup = BeautifulSoup(clean_page, "lxml")
    extract(soup, url)
def clearTag_old(self, text: str) -> str:
    import lxml
    from lxml.html.clean import Cleaner

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong", "div", "body", "br", "a", "p", "blockquote",
        "h3", "ol", "li", "font",
    ]
    # clean_html() returns an element here, so serialize before decoding;
    # the original called .decode() directly on the element, which would raise
    return lxml.html.tostring(
        cleaner.clean_html(lxml.html.document_fromstring(text))).decode("utf-8")
def remove_script_and_style(html_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['script']
    clean_html = cleaner.clean_html(html_content)
    return clean_html
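# Quick check of remove_script_and_style above (added for illustration;
# assumes `from lxml.html.clean import Cleaner`). clean_html() accepts a
# plain string and returns a string, so no explicit parsing step is needed.
dirty = '<html><head><style>p {color: red}</style></head><body><p>kept</p><script>dropped()</script></body></html>'
print(remove_script_and_style(dirty))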
def extract_text(self, url):
    try:
        if url.value.startswith('http') and '://' in url.value:
            prog = FloatProgress(min=0, max=100, description='Progress')
            display(widgets.HTML('<br/>'), prog)
            tr0 = time()
            site = self.browser.get(url.value, timeout=10)
            if site.ok:
                prog.value += 50
                tr1 = time() - tr0
                t0 = time()
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                cleaner.kill_tags = ['header', 'footer']
                source_tree = etree.HTML(cleaner.clean_html(site.content))
                text = source_tree.itertext()
                t1 = time() - t0
                self.text = '\n'.join([n.strip() for n in text if n.strip()])
                prog.value += 50
                self.keywords_and_display(prog)
            else:
                display(widgets.HTML(
                    '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'))
        else:
            self.text = url.value
            self.keywords_and_display(False)
    except Exception as e:
        print 'Error extracting text: %s' % (e)
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True  # no-op: Cleaner has no "html" option
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # the original set links and page_structure twice; the duplicates are dropped here
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # ValueError: Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration.
        content = u""
    return content
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
def get_text(self, html_content: str):
    cleaner = Cleaner()
    cleaner.style = True
    cleaner.inline_style = True
    cleaned = cleaner.clean_html(html_content)
    soup = BeautifulSoup(cleaned, 'lxml')
    text_lines = soup.findAll(text=True)
    text_lines_merged = []
    merge_str = ''
    text_lines_merged.append(text_lines[0])
    for line in text_lines[1:]:
        if '\n' == line or '' == line or ' ' == line:
            if merge_str != '':  # the original used `is not ''`, an identity check that is unreliable
                text_lines_merged.append(merge_str)
                merge_str = ''
        else:
            merge_str += (' ' + line)
    text_lines_merged = [
        self.strip(line) for line in text_lines_merged
        if len(self.strip(line)) > 128
    ]
    print(' '.join(text_lines_merged))
def buildDicts(n):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    i = 0
    tagsDict = set()
    while (i < n):
        if (os.path.isfile("spam/%d.txt" % i)):
            try:
                readInFile = open("spam/%d.txt" % i)
                content = readInFile.read()
                noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
                tags = set(noSymbols.split())  # the set of words without symbols
                tagsDict = tagsDict.union(tags)
            except Exception, err:
                print traceback.format_exc()
                print sys.exc_info()[0]
        if (os.path.isfile("notspam/%d.txt" % i)):
            try:
                readInFile = open("notspam/%d.txt" % i)
                content = readInFile.read()
                noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
                tags = set(noSymbols.split())  # the set of words without symbols
                tagsDict = tagsDict.union(tags)
            except Exception, err:
                print traceback.format_exc()
                print sys.exc_info()[0]
        i = i + 1  # the original never incremented i, so the loop ran forever
    return tagsDict  # the dict is consumed by tokenize(n, tagsDict) below
def tokenize(n, tagsDict):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    i = 0
    df = pandas.DataFrame(columns=list(tagsDict))  # the original wrapped the list in another list
    while (i < n):
        allVector = {}
        if (os.path.isfile("spam/%d.txt" % i)):
            try:
                for word in tagsDict:
                    allVector[word] = 0
                readInFile = open("spam/%d.txt" % i)
                content = readInFile.read()
                noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
                allCopy = noSymbols.split()  # the list of words without symbols
                for tag in allCopy:
                    # the original indexed df.ix[i[tag]], which is invalid; row i, column tag is the likely intent
                    df.ix[i, tag] = df.ix[i, tag] + 1
                df.ix[i, 'isSpam'] = 'spam'
            except Exception, err:
                print traceback.format_exc()
                print sys.exc_info()[0]
        i = i + 1
def raw_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):
        # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "raw",
        }
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, {
            "url": url,
            "scraper": "raw",
        }
    if article.text == "":
        return None, {
            "url": url,
            "scraper": "raw",
        }
    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',  # the original was missing this comma, fusing 'h6' and 'span' into 'h6span'
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
def init_cleaner():
    from lxml.html.clean import Cleaner

    cleaner = Cleaner()
    cleaner.javascript = False
    cleaner.style = False
    cleaner.kill_tags = ["pre", "code"]
    return cleaner
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True
    # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True  # True to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.allow_tags = POSITIVE_K  # the original misspelled this as "allow_tag"
    cleaner.remove_unknown_tags = False  # required once allow_tags is set, or Cleaner raises ValueError
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)  # the original passed the literal string "url"
        tree = cleaner.clean_html(html)
        # tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces, "", self.doc)
    return doc
def getArticles(keyword):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'chrome')]
    term = keyword.replace(" ", "+")
    query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term
    htmltext = br.open(query).read()
    # print htmltext
    soup = BeautifulSoup(htmltext)
    search = soup.findAll('div', attrs={'id': 'search'})
    # print search[0]
    searchtext = str(search[0])
    soup1 = BeautifulSoup(searchtext)
    list_items = soup1.findAll('li')
    regex = "q=.*?&"
    pattern = re.compile(regex)
    results_array = []
    for li in list_items:
        soup2 = BeautifulSoup(str(li))
        links = soup2.findAll('a')
        source_link = links[0]
        # print source_link
        source_url = re.findall(pattern, str(source_link))
        if len(source_url) > 0:
            results_array.append(
                str(source_url[0].replace("q=", "").replace("&", "")))
    return results_array
def get_clean_html(self, html_text, text_only=True):
    try:
        etree = lxml.html.document_fromstring(html_text)
        self._is_etree(etree)
        # enable filters to remove Javascript and CSS from the HTML document
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.html = True  # no-op: Cleaner has no "html" option
        cleaner.page_structure = False
        cleaner.meta = False
        cleaner.safe_attrs_only = False
        cleaner.links = False
        html = cleaner.clean_html(etree)
        if text_only:
            return ' '.join(html.text_content().split())
            # return html.text_content()
        res = lxml.html.tostring(html)
    except Exception as e:
        logger.error(f"While parsing email in get_clean_html {e}")
        res = "junk"
    return res
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML, returning the
    string with only the HTML content."""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript
    cleaner.scripts = scripts
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep the page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
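# Illustrative call of clean_html above (not from the original source). Every
# keyword is a remove-flag, so passing False keeps that class of content.
raw = '<html><body><form><input name="q"/></form><p>text</p></body></html>'
print(clean_html(raw, forms=False))  # keeps the <form> but still strips js/styles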
def cleanMe(text):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # text = unicodedata.normalize("NFKD", text).encode('ascii', 'ignore')
    clean = cleaner.clean_html(text)
    return clean
def strip_tags(web_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    text = BS(lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(web_content))),
              features="lxml")
    return text.getText()
def get_context(web_link, answer):
    browser = webdriver.Firefox(
        executable_path='geckodriver-v0.26.0-win64/geckodriver')
    browser.get(web_link)
    html_source = browser.page_source
    get_context = BeautifulSoup(html_source, "lxml")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    web_page_text = ''
    for element in get_context:
        element_string = lxml.html.document_fromstring(str(element))
        page_text = lxml.html.tostring(cleaner.clean_html(element_string))
        page_text = re.sub("<.*?>", " ", str(page_text))
        web_page_text = web_page_text + " " + page_text
    browser.close()
    matcher = difflib.SequenceMatcher(None, web_page_text, answer)
    match = matcher.find_longest_match(0, len(web_page_text), 0, len(answer))
    if match.a > 1000:
        start_context = match.a - 999
    else:
        start_context = 0
    if len(web_page_text) > start_context + 2000:
        end_context = start_context + 2000
    else:
        end_context = len(web_page_text) - 1
    context = web_page_text[start_context:end_context]
    return context
def convertHtmlToDicts(url, content):
    """Given a url and content, create file and article dictionaries.
    Content has to include normal newlines, no \a or #N# replacers.

    Returns None, None on error.
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    # cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    # text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract
    logging.debug("abstract: %s" % artDict["abstract"])

    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
def Content(content):
    doc = html.document_fromstring(content)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    doc = cleaner.clean_html(doc)
    plaintext = "\n".join(etree.XPath("//text()")(doc))
    return plaintext
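# A short usage sketch for Content above (added for illustration); it assumes
# `from lxml import html, etree` as in the original module.
page = "<html><body><h1>Title</h1><p>Body text</p></body></html>"
print(Content(page))  # each text node lands on its own line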
def trim_html(html):
    """Takes an html string as input and returns the html without any styles or javascript"""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # get rid of the javascript and the style
    cleaner.style = True
    return cleaner.clean_html(html)
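# Hedged example for trim_html above (not in the original). Given a string,
# clean_html() returns a string with <script>/<style> content removed.
print(trim_html('<div><style>b {display: none}</style>Hello <b>there</b></div>'))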
def cleanInputString(self, htmlString):
    # "WITH JAVASCRIPT & STYLES"
    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True  # True to activate the styles & stylesheet filter
    # "WITHOUT JAVASCRIPT & STYLES"
    # clean_html() already returns a string when given a string; the original
    # wrapped the result in lxml.html.tostring(), which raises on strings
    htmlClean = cleaner.clean_html(htmlString)
    return htmlClean
def __cleanhtml(raw_html):
    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True
    cleantext = cleaner.clean_html(raw_html)
    cleantext = BeautifulSoup(cleantext, "lxml").text
    return cleantext
def _get_cleaner(self, print_style, print_js, remove_tags):
    c = Cleaner()
    c.scripts = not print_js
    c.javascript = not print_js
    c.style = not print_style
    c.remove_tags = remove_tags
    c.page_structure = False
    return c
def html_strict_cleaning(html, allow_tags=['p', 'br', 'a', 'img', 'div']):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.allow_tags = allow_tags
    cleaner.remove_unknown_tags = False
    return lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html)),
                              encoding='unicode')
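# Example call of html_strict_cleaning above (illustrative only). Tags outside
# allow_tags, such as <b>, are dropped while their text content is preserved.
snippet = '<div><p>Keep <b>me</b></p><script>nope()</script></div>'
print(html_strict_cleaning(snippet))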
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = [
        'a', 'span', 'p', 'br', 'strong', 'b', 'em', 'i', 'tt', 'code',
        'pre', 'blockquote', 'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
def _parseAndCleanHtml(rawHtml):
    # parse html with the lxml library
    parsedHtml = lh.fromstring(rawHtml)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    return cleaner.clean_html(parsedHtml)
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = [
        'a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li', 'em', 'i',
        'code', 'pre', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
def create_html_cleaner(self):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.remove_tags = [
        'br', 'hr', 'img', 'basefont', 'area', 'base', 'col', 'embed',
        'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
    ]
    return cleaner
def flipkart_extract_details(self, msg_fields, order_xpath, msg_params, acc_params):
    messages_dict = dict()
    items_dict = dict()
    messages_dict = {'store': 'Flipkart'}
    messages_dict.update(msg_fields)
    messages_dict.update(acc_params)
    messages_dict["user_contextio_uuid"] = messages_dict.pop("id")
    items_dict["store"] = messages_dict["store"]
    items_dict["user_contextio_uuid"] = messages_dict["user_contextio_uuid"]
    items_dict["user_email"] = messages_dict["user_email"]
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    tree = html.fromstring(msg_fields["body"])
    # print order_xpath
    if msg_params["sender"] == "*****@*****.**" and "confirmation" in msg_fields["subject"].strip().lower():
        # tree = html.fromstring(msg_fields["body"])
        # address = tree.xpath(order_xpath["address"])
        # print order_xpath["address"]
        print msg_fields["subject"]
        # print address
        address = list()
        bsObj = BeautifulSoup(msg_fields["body"])
        add = bsObj.find(text="DELIVERY ADDRESS")
        if add:
            addr = add.findNext().get_text().encode("utf-8")
            # addr = addr.replace("\xa0", " ")
            address.append(addr)
            address.append(add.findNext().findNext().get_text().encode("utf-8"))
            print type(address)
            # print address
            items_dict["delivery_address"] = str_process(address)
            print items_dict["delivery_address"]
        else:
            items_dict["delivery_address"] = ""
        """for x in range(len(order_xpath["order_id"])):
            order_id = tree.xpath(order_xpath["order_id"][x])
            if order_id:
                break"""
        order_id = re.search(r"\[(\w+)\]", msg_fields["subject"])
        items_dict["order_id"] = order_id.group(1)
        print items_dict["order_id"]
        items_dict["item_title"] = str_process(tree.xpath(order_xpath["item_title"]))
        print items_dict["item_title"]
        item_price = tree.xpath(order_xpath["item_price"])
        items_dict["item_price"] = amount_process(item_price)
        print items_dict["item_price"]
        items_dict["item_status"] = "confirmed"
        messages_dict["order_id"] = items_dict["order_id"]
        # insert_to_items_table(**items_dict)
        # insert_to_messages_table(**messages_dict)
def cleaned_html(self):
    # Try to parse the provided HTML string using lxml and
    # strip all unnecessary information to save space.
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.comments = True
    cleaner.style = True
    self.dom = cleaner.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
def url2count(title):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True  # no-op: Cleaner has no "html" option
    r = requests.get(makeurl(title), timeout=5)
    lxclean = cleaner.clean_html(r.text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' '))
    text = nltk.clean_html(lxclean)  # note: relies on a pre-3.0 NLTK; clean_html was later removed
    collapsewhitespace = re.sub(r'\s{2,}', ' ', text)
    nonPunct = re.compile('.*[A-Za-z0-9].*')
    article_list = [w for w in collapsewhitespace.split(' ') if nonPunct.match(w)]
    article_length = len(article_list)
    return article_length
def GetTextData(self, htmlData, forUrl='<Missing URL info>'):
    '''Function to clean up html raw data and get the text from it. Keep it small.
    Not thread safe. Returns an object that will go into the parsedData["text"]
    field for the HandleData function above.'''
    from lxml import html
    if self.RemoveJavaScriptAndCSS:
        try:
            from lxml.html.clean import Cleaner
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            htmlData = cleaner.clean_html(htmlData)
        except:
            print("Could not remove style and js code for url: " + forUrl)
    return html.fromstring(htmlData).text_content()
def clean_cachefiles():
    """Clean silly html from all cachefiles in the cachedir"""
    if input('Do you really want to strip all cache files from bloating tags such as <script> and <style>? ').startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.scripts = True
        cleaner.javascript = True
        for file in _get_all_cache_files():
            cfile = CompressedFile(file)
            data = cfile.read()
            cleaned = lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(data)))
            cfile.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(file, len(data), len(cleaned)))
def get_content(url):
    if url is None:
        return 'Body is not found!'
    content = requests.get(url).content
    doc = html.fromstring(content)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # cleaner.remove_tags = ['br']
    content = html.tostring(cleaner.clean_html(doc))
    return content
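# Illustrative use of get_content above; "https://example.com" is just a
# placeholder URL, and the function assumes `requests` and `from lxml import html`.
body = get_content("https://example.com")
print(body[:200])  # cleaned markup as bytes, with scripts and styles removed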
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    stuff = lxml.html.tostring(cleaner.clean_html(data))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val,
                               map(lambda x: x.strip(), soup.findAll(text=True))))
    return all_text
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from the HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True  # no-op: Cleaner has no "html" option
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()
    return lxml.html.tostring(html)
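# Minimal usage sketch for get_clean_html above (added for illustration;
# assumes `import lxml.html` and the `_is_etree` helper from the original module).
tree = lxml.html.document_fromstring("<html><body><p>hi</p><script>x()</script></body></html>")
print(get_clean_html(tree, text_only=True))  # roughly "hi", script content gone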
def tokenize(n):
    reload(sys)
    sys.setdefaultencoding('utf8')
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    i = 0
    existingSpam = list()
    existingNotSpam = list()
    for file in os.listdir("./spam/"):
        if (i == n):
            break  # the original had "Break", a NameError at runtime
        else:
            spamPath = os.path.join("./spam", file)
            existingSpam.append(spamPath)
            i = i + 1
    i = 0
    for file in os.listdir("./notspam/"):
        if (i == n):
            break
        else:
            spamPath = os.path.join("./notspam", file)
            existingNotSpam.append(spamPath)
            i = i + 1
    y1 = ['0'] * len(existingSpam)
    y2 = ['1'] * len(existingNotSpam)
    y = y1 + y2
    existingSpam = existingSpam + existingNotSpam
    vectorizer = CountVectorizer(analyzer='word', input='filename', min_df=3,
                                 decode_error='ignore')
    spamFeatures = vectorizer.fit_transform(existingSpam)
    # print vectorizer.get_feature_names()
    print spamFeatures.shape, type(spamFeatures)
    X_train, X_test, y_train, y_test = train_test_split(spamFeatures, y, test_size=0.2)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    from sklearn import metrics
    print 'Accuracy:', metrics.accuracy_score(y_test, y_predicted)
    print
    print metrics.classification_report(y_test, y_predicted)
    print
    print 'confusion matrix'
    print
    print pd.DataFrame(metrics.confusion_matrix(y_test, y_predicted))
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                           'img', 'area', 'map', 'noscript', 'td', 'tr',
                           'table', 'a', 'p', 'br', 'li', 'ul']
    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print detail
        return False
    return cleaner.clean_html(body).text_content().split()
def processDir(data_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # process every html document
    file_list = os.listdir(data_dir)
    html_cleaner = Cleaner()
    html_cleaner.javascript = True
    html_cleaner.style = True
    word_dict = dict()

    def updateWordDict(word):
        if word_dict.has_key(word):
            word_dict[word] = word_dict[word] + 1
        else:
            word_dict[word] = 1

    for file_name in file_list:
        if file_name[0] == '.':
            continue
        # remove html tags
        parsetree = lxml.html.parse(data_dir + '/' + file_name)
        parsetree = html_cleaner.clean_html(parsetree)
        content = parsetree.getroot().text_content()
        # word extraction
        words_raw = list(jieba.cut(content))
        words = list()
        for word in words_raw:
            uchar = word[0]
            if uchar >= u'\u4e00' and uchar <= u'\u9fa5':  # Chinese
                words.append(word)
                updateWordDict(word)
            if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):  # Latin letters
                word = word.lower()
                words.append(word)
                updateWordDict(word)
        # print words
        text = ' '.join(words)
        # print text
        output = open(output_dir + file_name, 'w')
        output.write(text.encode('utf-8'))
        output.close()
    output = open(output_dir + 'words.dict', 'w')
    for word in word_dict.keys():
        output.write(word.encode('utf-8') + ' ' + str(word_dict[word]) + '\n')
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    stuff = lxml.html.tostring(cleaner.clean_html(root))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val,
                               map(lambda x: x.strip(), soup.findAll(text=True))))
    return all_text.encode('ascii', 'ignore')
def extract_content(bytehtml, doc):
    """Extracts blog post content from html."""
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    # cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
def get_url(self):
    """Get the relevant part of a web page."""
    get_url = requests.get(self.data_path)
    page_data = get_url.content
    cleaner = Cleaner()
    cleaner.javascript = True  # remove JavaScript code from HTML
    cleaner.scripts = True  # remove other code from HTML
    cleaner.style = True  # remove CSS and styles from HTML
    cleaner.links = True  # remove links from HTML
    cleaner.kill_tags = ['a', 'img']  # remove these tags entirely
    # Store the cleaned-up HTML.
    page_html = cleaner.clean_html(page_data)
    # Strip tags from the final result.
    strip_tags = TagStripper()  # instantiate the HTML tag stripper
    strip_tags.feed(page_html)  # strip all HTML tags
    return strip_tags.get_html_data()
def crawNews(self, url):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.comments = True
    tech_content = lxml.html.parse(url)
    tech_content = lxml.html.tostring(tech_content)
    re_title = re.compile(r'<h1.*>(.*)</h1', re.S)
    re_content = re.compile(r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->', re.S)
    re_published = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
    re_author = re.compile(r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')
    match_title = re.search(re_title, tech_content)
    match_content = re.search(re_content, tech_content)
    match_date = re.search(re_published, tech_content)
    match_author = re.search(re_author, tech_content)
    author_url = "http://techcrunch.com" + match_author.group(1)
    author_name = match_author.group(2)
    author_twitter = match_author.group(3)
    title = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_title.group(1)))
    title = re.sub(r'\s+', ' ', title)
    title = title.decode('utf-8').strip()
    content = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_content.group(1)))
    content = re.sub(r'\s+', ' ', content)
    content = content.decode('utf-8').strip()
    content = content.strip('\n')
    published_on = datetime.datetime.strptime(match_date.group(1), '%Y-%m-%d %H:%M:%S')
    news = self.save_news(url, title, content, published_on)
    author = self.findAuthorByUrl(author_url)
    if not isinstance(author, Author):
        author = self.save_author(author_url, author_name, author_twitter, '')
    self.newsAuthor(news, author)
def get_content(self, pathName):
    try:
        file = open(pathName, "r")
        html_text = file.read()
        file.close()
    except:
        print("Fail to open the file located in {}".format(pathName))
        return None
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.page_structure = False  # added relative to the earlier variant of this function
        htmlData = cleaner.clean_html(html_text)
    except:
        print("Could not remove style and js code from the file located in {}".format(pathName))
        return None
    # changed: return the parsed soup instead of the raw content
    soup = BeautifulSoup(htmlData, "lxml")
    return soup
def get_content(self, pathName):
    try:
        get_title = urllib.urlopen(pathName)
        soup = Soup(get_title)
        file = open(pathName, "r")
        html_text = file.read()
        file.close()
    except:
        print("Fail to open the file located in {}".format(pathName))
        return None
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        htmlData = cleaner.clean_html(html_text)
    except:
        print("Could not remove style and js code from the file located in {}".format(pathName))
        return None
    try:
        title = soup.title.string.encode("utf-8")
    except:
        title = ""
    return html.fromstring(htmlData).text_content(), title
def get_url(self):
    """Get the HTML body of a web page."""
    # Create a file-like object.
    outfile = StringIO.StringIO()
    cleaner = Cleaner()
    cleaner.javascript = True  # remove JavaScript code from HTML
    cleaner.scripts = True  # remove other code from HTML
    cleaner.style = True  # remove CSS and styles from HTML
    cleaner.links = True  # remove links from HTML
    cleaner.kill_tags = ['a', 'img', 'li']  # remove these tags entirely
    # Store the cleaned-up HTML.
    page_html = lxml.html.tostring(
        cleaner.clean_html(
            lxml.html.parse(self.data_path)
        )
    )
    outfile.write(page_html)  # write the results to this file in memory
    return outfile
def tokenize(n):
    reload(sys)
    sys.setdefaultencoding('utf8')
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    i = 0
    existingSpam = list()
    existingNotSpam = list()
    while (i < n):
        if (os.path.isfile("spam/%d.txt" % i)):
            existingSpam.append("spam/" + str(i) + ".txt")
        if (os.path.isfile("notspam/%d.txt" % i)):
            existingNotSpam.append("notspam/" + str(i) + ".txt")
        i = i + 1
    y1 = ['spam'] * len(existingSpam)
    y2 = ['notSpam'] * len(existingNotSpam)
    y = y1 + y2
    existingSpam = existingSpam + existingNotSpam
    vectorizer = CountVectorizer(analyzer='word', input='filename', min_df=3,
                                 decode_error='ignore')
    spamFeatures = vectorizer.fit_transform(existingSpam)
    # print vectorizer.get_feature_names()
    print spamFeatures.shape, type(spamFeatures)
    X_train, X_test, y_train, y_test = train_test_split(spamFeatures, y, test_size=0.2)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    from sklearn import metrics
    print 'Accuracy:', metrics.accuracy_score(y_test, y_predicted)
    print
    print metrics.classification_report(y_test, y_predicted)
    print
    print 'confusion matrix'
    print
    print pd.DataFrame(metrics.confusion_matrix(y_test, y_predicted))
def handle(self, **options):
    since = get_last_change()
    writer = get_writer()
    last_change = since
    while True:
        doc = {}
        changes = settings.db.changes(since=since)
        since = changes["last_seq"]
        if since != last_change:
            print("Detected new tasks {}".format(len(changes)))  # the original format string had no placeholder
            print("=== changes ===")
            pprint(changes)
        for changeset in changes["results"]:
            try:
                doc = settings.db[changeset["id"]]
            except couchdb.http.ResourceNotFound:
                print("resource not found")
                continue
            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                last_change = since
                continue
            print("indexing", doc["url"])
            #####
            # raw, html, text
            #####################
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join([title for title in tree.xpath("//title/text()")])
            # enable filters to remove Javascript and CSS from the HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True  # no-op: Cleaner has no "html" option
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False
            html = cleaner.clean_html(tree)
            text_content = html.text_content()
            # lxml.html.tostring(html)  # result was unused in the original
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))
            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )
            writer.commit()
            writer = get_writer()
        set_last_change(since)
        last_change = since