Example #1
def tokenize(n, tagsDict):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0
	df = pandas.DataFrame(columns=list(tagsDict))  # one column per known tag/word

	while (i < n):
		allVector = {}
		if (os.path.isfile("spam/%d.txt" % i)):
			try:
				for word in tagsDict:
					allVector[word] = 0
				readInFile = open("spam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				allCopy = noSymbols.split()  # allCopy is the set of words without symbols
				for tag in allCopy:
					if tag in allVector:
						allVector[tag] = allVector[tag] + 1
				allVector['isSpam'] = 'spam'
				df.ix[i] = pandas.Series(allVector)  # write the counts built in allVector as row i
				
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		
		i = i + 1		
Example #2
 def extract_text(self, url):
     try:
         if url.value.startswith('http') and '://' in url.value:
             prog = FloatProgress(min=0, max=100, description='Progress')
             display(widgets.HTML('<br/>'), prog)
             tr0 = time()
             site = self.browser.get(url.value, timeout=10)
             if site.ok:
                 prog.value += 50
                 tr1 = time() - tr0
                 t0 = time()
                 cleaner = Cleaner()
                 cleaner.javascript = True
                 cleaner.style = True
                 cleaner.kill_tags = ['header', 'footer']
                 source_tree = etree.HTML(cleaner.clean_html(site.content))
                 text = source_tree.itertext()
                 t1 = time() - t0
                 self.text = '\n'.join(
                     [n.strip() for n in text if n.strip()])
                 prog.value += 50
                 self.keywords_and_display(prog)
             else:
                 display(
                     widgets.HTML(
                         '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'
                     ))
         else:
             self.text = url.value
             self.keywords_and_display(False)
     except Exception as e:
         print 'Error extracting text: %s' % (e)
Example #3
 def init_cleaner():
     from lxml.html.clean import Cleaner
     cleaner = Cleaner()
     cleaner.javascript = False
     cleaner.style = False
     cleaner.kill_tags = ["pre", "code"]
     return cleaner
    def create_word_frequencies(self):

        document = re.sub(find_doc_content_pattern, "", self.content)

        cleaner = Cleaner()
        cleaner.scripts = True
        cleaner.javascript = True
        cleaner.style = True
        # # cleaner.allow_tags = ['']
        # # cleaner.remove_unknown_tags = False

        try:
            document_visible_text = cleaner.clean_html(document)
        except UnicodeDecodeError:
            document_visible_text = ""
            print "Unicode Error"
        # document_visible_text = document

        word_list = document_visible_text.split()
        for word in word_list:
            word_stemmed = word.lower()
            try:
                self.word_frequencies[
                    word_stemmed] = self.word_frequencies[word_stemmed] + 1
            except:
                self.word_frequencies[word_stemmed] = 1
            self.total_word_count = self.total_word_count + 1
Example #5
File: wapa.py Project: mtamer/wapa
def getArticles(keyword):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'chrome')]

    term = keyword.replace(" ", "+")
    query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term
    htmltext = br.open(query).read()
    #print htmltext

    soup = BeautifulSoup(htmltext)

    search = soup.findAll('div', attrs={'id': 'search'})
    #print search[0]
    searchtext = str(search[0])
    soup1 = BeautifulSoup(searchtext)
    list_items = soup1.findAll('li')

    regex = "q=.*?&amp"
    pattern = re.compile(regex)
    results_array = []
    for li in list_items:
        soup2 = BeautifulSoup(str(li))
        links = soup2.findAll('a')
        source_link = links[0]
        #print source_link
        source_url = re.findall(pattern, str(source_link))
        if len(source_url) > 0:
            results_array.append(
                str(source_url[0].replace("q=", "").replace("&amp", "")))
    return results_array
Example #6
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
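A minimal usage sketch for the function above, matching the Python 2 environment it targets (cStringIO, formatter, htmllib); the sample message bytes are purely illustrative:

# Hypothetical HTML email body (a Python 2 byte string); the content is illustrative.
sample_message = ('<html><body><p>Hello, see '
                  '<a href="https://example.com/docs">the docs</a>.</p>'
                  '<style>p { color: red; }</style></body></html>')
print create_plaintext_message(sample_message)  # plain text followed by a "[1] http://example.com/docs" endnote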
Example #7
def clense(text, space_replacer=' ', to_lower=True, remove_punc=True):
    # remove HTML comments first as suggested in https://stackoverflow.com/questions/28208186/how-to-remove-html-comments-using-regex-in-python

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    text = cleaner.clean_html(text.encode("utf-8")).decode("utf-8")

    text = re.sub("(<!--.*?-->)", "", text, flags=re.DOTALL)
    text = remove_tags(text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  #remove non-ascii characters
    text = text.replace("&amp;", "and")
    text = text.replace("&", "and")
    text = text.strip()
    text = text.replace("\r\n", "")
    text = text.replace("\n", "")
    text = text.replace("\"", "")
    if to_lower:
        text = text.lower()

    if remove_punc:
        # from https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        text = re.sub(r'[^\w\s]', '',
                      text)  #remove punctuation marks and non-word
        text = text.replace(",", "")

    text = re.sub(' +', space_replacer, text)
    #if  all(ord(char) < 128 for char in text) == False:
    #    text = ''
    text = ''.join(i for i in text if ord(i) < 128)  # keep only ASCII characters
    return text
Example #8
def clean_and_update_html(html, images):
    cleaner = Cleaner()
    cleaner.javascript = True
    html = lxml.html.tostring(
        cleaner.clean_html(lxml.html.fromstring(html)),
        method='html',
        encoding='unicode',
        doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ' "http://www.w3.org/TR/html4/strict.dtd">')
    # forwarding
    if "\n\n\n\n" in html:
        html = re.split("\n\n\n\n", html, 1, re.I)[1]

    block = re.findall("<img[^<]*src[^<]*>", html)
    for answer in block:
        start_quote = answer.index("src=")
        indices = []  #start/end indices of the link
        for i in range(start_quote, len(answer)):
            if (answer[i] == '"'):
                indices.append(i)
        website = answer[indices[0] + 1:indices[1]]
        if "cid:" in website:
            # Attachment!
            cid = website.split(":")[1]
            if (cid in images):
                html = html.replace(
                    website,
                    f"data:{images[cid].get_content_type()};base64,{images[cid].get_payload()}"
                )
            else:
                html = html.replace(website, '')
        elif 'http' not in website:
            html = html.replace(answer, '')

    return html
Example #9
def _statistica_(url_string):
    """Implementa la logica per estrarre documento
    e metadati da rivista-statistica
    """
    url = urlparse.urlparse(url_string)
    conn = httplib.HTTPConnection(url.hostname)
    conn.request("GET", url.path)
    res = conn.getresponse()
    body = res.read()

    my_page = html.fromstring(body)

    # Remove the ***** cookie banner
    for el in my_page.xpath('//*[@id="cookiesAlert"]'):
        el.getparent().remove(el)

    # Remove all script tags and their content
    cleaner = Cleaner()
    cleaner.javascript = True
    my_page = cleaner.clean_html(my_page)

    title = my_page.xpath('//*[@id="articleTitle"]/h3')
    full_content = my_page.xpath('//*[@id="content"]')
    doi = my_page.xpath('//*[@id="pub-id::doi"]')

    full_content = ''.join(
        [etree.tostring(fix_links(el, url_string)) for el in full_content])

    result = {
        'title': title[0].text_content(),
        'content': full_content,
        'doi': doi[0].text_content()
        }

    return json.JSONEncoder().encode(result)
    def get_clean_html(self, html_text, text_only=True):
        try:
            etree = lxml.html.document_fromstring(html_text)

            self._is_etree(etree)
            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(etree)
            if text_only:
                return ' '.join(html.text_content().split())
                # return html.text_content()

            res = lxml.html.tostring(html)
        except Exception as e:
            logger.error(f"While parsing email in get_clean_html {e}")
            res = "junk"

        return res
Example #11
def strip_tags(web_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    text = BS(lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(web_content))), features="lxml")
    return text.getText()
Example #12
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
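A short usage sketch for the helper above; the HTML string and flag choice are illustrative:

raw = ('<html><head><script>alert(1)</script><style>b {}</style></head>'
       '<body><b onclick="do_evil()">Hi</b></body></html>')
print(clean_html(raw))               # serialized HTML with scripts, styles and on* attributes removed
print(clean_html(raw, style=False))  # same, but <style> elements are left in place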
Example #13
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
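The configuration above leans on the difference between remove_tags (the tag is dropped but its text is pulled up into the parent) and kill_tags (the tag and its whole subtree are removed). A minimal sketch of applying the returned cleaner, with an illustrative snippet:

import lxml.html

cleaner = cleaner_parameters()
doc = lxml.html.fromstring(
    '<div><p>Keep <b>this</b> text</p><script>drop()</script><ul><li>dropped entirely</li></ul></div>')
print(lxml.html.tostring(cleaner.clean_html(doc)))  # accept_list tags vanish but their text stays; kill_tags subtrees disappear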
Example #14
def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
Example #15
 def clean_article_html(cls, node):
     article_cleaner = Cleaner()
     article_cleaner.javascript = True
     article_cleaner.style = True
     article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
     article_cleaner.remove_unknown_tags = False
     return article_cleaner.clean_html(node)
Example #16
def remove_script_and_style(html_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['script']
    clean_html = cleaner.clean_html(html_content)
    return clean_html
def get_context(web_link, answer):
    browser = webdriver.Firefox(
        executable_path='geckodriver-v0.26.0-win64/geckodriver')
    browser.get(web_link)
    html_source = browser.page_source
    get_context = BeautifulSoup(html_source, "lxml")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    web_page_text = ''

    for element in get_context:
        element_string = lxml.html.document_fromstring(str(element))
        page_text = lxml.html.tostring(cleaner.clean_html(element_string))
        page_text = re.sub("<.*?>", " ", str(page_text))
        web_page_text = web_page_text + " " + page_text
    browser.close()
    matcher = difflib.SequenceMatcher(None, web_page_text, answer)
    match = matcher.find_longest_match(0, len(web_page_text), 0, len(answer))
    if match.a > 1000:
        start_context = match.a - 999
    else:
        start_context = 0

    if len(web_page_text) > start_context + 2000:
        end_context = start_context + 2000
    else:
        end_context = len(web_page_text) - 1

    context = web_page_text[start_context:end_context]
    return context
Example #18
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms= True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K 
    cleaner.allow_tags = POSITIVE_K
    cleaner.remove_unknown_tags = False  # required when allow_tags is set
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)
    
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ print title
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces,"",self.doc)
    return doc
Example #19
def cleanpage(html):
	# cleaner setup
	cleaner = Cleaner()
	cleaner.html = True
	cleaner.page_structure = False
	cleaner.meta = False
	cleaner.safe_attrs_only = False
	cleaner.links = False
	cleaner.javascript = True  # activate the javascript filter
	cleaner.style = True       # activate the styles & stylesheet filter
	cleaner.frames = True
	cleaner.embedded = True
	cleaner.comments = True
	cleaner.annoying_tags = True
	cleaner.inline_style = True
#	cleaner.remove_tags = ['b','img','h']
	cleaner.kill_tags = ['img','script']

	#invoke cleaner
	try:
		content = cleaner.clean_html(html)
	except:
		#error: ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fr
		content = u""
	return content
Example #20
def _statistica_(url_string):
    """Implementa la logica per estrarre documento
    e metadati da rivista-statistica
    """
    url = urlparse.urlparse(url_string)
    conn = httplib.HTTPConnection(url.hostname)
    conn.request("GET", url.path)
    res = conn.getresponse()
    body = res.read()

    my_page = html.fromstring(body)

    # Remove the ***** cookie banner
    for el in my_page.xpath('//*[@id="cookiesAlert"]'):
        el.getparent().remove(el)

    # Remove all script tags and their content
    cleaner = Cleaner()
    cleaner.javascript = True
    my_page = cleaner.clean_html(my_page)

    title = my_page.xpath('//*[@id="articleTitle"]/h3')
    full_content = my_page.xpath('//*[@id="content"]')
    doi = my_page.xpath('//*[@id="pub-id::doi"]')

    full_content = ''.join(
        [etree.tostring(fix_links(el, url_string)) for el in full_content])

    result = {
        'title': title[0].text_content(),
        'content': full_content,
        'doi': doi[0].text_content()
    }

    return json.JSONEncoder().encode(result)
Example #21
File: wapa.py Project: mtamer/wapa
def getArticles(keyword):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True

	br = mechanize.Browser()
	br.set_handle_robots(False)
	br.addheaders=[('User-agent','chrome')]

	term = keyword.replace(" ", "+")
	query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term 
	htmltext = br.open(query).read()
	#print htmltext

	soup = BeautifulSoup(htmltext)

	search = soup.findAll('div', attrs={'id': 'search'})
	#print search[0]
	searchtext= str(search[0])
	soup1=BeautifulSoup(searchtext)
	list_items=soup1.findAll('li')

	regex = "q=.*?&amp"	
	pattern = re.compile(regex)
	results_array = []
	for li in list_items:
		soup2 = BeautifulSoup(str(li))
		links = soup2.findAll('a')
		source_link = links[0]
		#print source_link
		source_url = re.findall(pattern, str(source_link))
		if len(source_url) > 0:
			results_array.append(str(source_url[0].replace("q=", "").replace("&amp", "")))
	return results_array
Example #22
def buildDicts(n):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0 
	tagsDict = set()
	while (i < n):
		if (os.path.isfile("spam/%d.txt" % i)):
			try:
				readInFile = open("spam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				tags = set(noSymbols.split())  # allCopy is the set of words without symbols
				tagsDict = tagsDict.union(tags)
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		if (os.path.isfile("notspam/%d.txt" % i)):
			try:
				readInFile = open("notspam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				tags = set(noSymbols.split())  # allCopy is the set of words without symbols
				tagsDict = tagsDict.union(tags)
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		i = i + 1
	return tagsDict
Example #23
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    html_tree = cleaner.clean_html(lxml.html.fromstring(html))
    el.strip_tags(html_tree, '*')
    return html_tree.text
Example #24
    def clearTag_old(self, text: str) -> str:
        import lxml
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        return lxml.html.tostring(
            cleaner.clean_html(lxml.html.document_fromstring(text))).decode("utf-8")
Example #25
def cleanMe(text):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    #text = unicodedata.normalize("NFKD", text).encode('ascii','ignore')
    clean = cleaner.clean_html(text)
    return clean
Example #26
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javascript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Example #27
def raw_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):
        # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "raw",
        }

    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, {
            "url": url,
            "scraper": "raw",
        }
    if article.text == "":
        return None, {
            "url": url,
            "scraper": "raw",
        }

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
Example #28
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error

    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
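A usage sketch based only on the docstring above; pubStore, pubGeneric and pubXml are project-specific modules, so this only illustrates the calling convention (page_source is a hypothetical full-page HTML string):

url = "http://example.com/article"
artDict, fileDict = convertHtmlToDicts(url, page_source)
if artDict is None:
    # returned on parse errors or when the content contains no "<html" marker
    logging.debug("conversion failed for %s" % url)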
Example #29
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries 
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error
    
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""
        
    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True 
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
Example #30
 def _get_cleaner(self, print_style, print_js, remove_tags):
     c = Cleaner()
     c.scripts = not print_js
     c.javascript = not print_js
     c.style = not print_style
     c.remove_tags = remove_tags
     c.page_structure = False
     return c
Example #31
def Content(content):
    doc = html.document_fromstring(content)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    doc = cleaner.clean_html(doc)
    plaintext = "\n".join(etree.XPath("//text()")(doc))
    return plaintext
Example #32
def __cleanhtml(raw_html):
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True

    cleantext = cleaner.clean_html(raw_html)
    cleantext = BeautifulSoup(cleantext, "lxml").text
    return cleantext
Example #33
def trim_html(html):
    """Takes a html string as input and returns the html without any styles nor javascript"""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # Get rid of the javascript and the style
    cleaner.style = True

    return cleaner.clean_html(html)
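A quick sketch of the helper above on an illustrative string:

dirty = '<p onclick="steal()">text</p><script>evil()</script><style>p {}</style>'
print(trim_html(dirty))  # scripts, styles and the onclick handler are stripped from the returned HTML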
Example #34
 def cleanInputString(self, htmlString):
     # "WITH JAVASCRIPT & STYLES"
     cleaner = Cleaner()
     cleaner.javascript = True  # This is True because we want to activate the javascript filter
     cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
     # "WITHOUT JAVASCRIPT & STYLES"
     htmlClean = lxml.html.tostring(cleaner.clean_html(htmlString))
     return htmlClean
Example #35
def html_strict_cleaning(html, allow_tags=['p', 'br', 'a', 'img', 'div']):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.allow_tags = allow_tags
    cleaner.remove_unknown_tags = False
    return lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html)),
                              encoding='unicode')
Example #36
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li',
                                  'em', 'i', 'code', 'pre', 'blockquote', 'h1',
                                  'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
Example #37
 def create_html_cleaner(self):
     cleaner = Cleaner()
     cleaner.javascript = True
     cleaner.style = True
     cleaner.remove_tags = [
         'br', 'hr', 'img', 'basefont', 'area', 'base', 'col', 'embed',
         'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
     ]
     return cleaner
Example #38
 def clean_article_html(cls, node):
     article_cleaner = Cleaner()
     article_cleaner.javascript = True
     article_cleaner.style = True
     article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b',
             'em', 'i', 'tt', 'code', 'pre', 'blockquote', 'img', 'h1',
             'h2', 'h3', 'h4', 'h5', 'h6']
     article_cleaner.remove_unknown_tags = False
     return article_cleaner.clean_html(node)
Example #39
def _parseAndCleanHtml(rawHtml):
    # Parse html with lxml library
    parsedHtml = lh.fromstring(rawHtml)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    return cleaner.clean_html(parsedHtml)
Example #40
	def flipkart_extract_details(self, msg_fields, order_xpath, msg_params, acc_params):
		messages_dict = dict()
		items_dict = dict()
		messages_dict = {'store': 'Flipkart'}
		messages_dict.update(msg_fields)
		messages_dict.update(acc_params)
		messages_dict["user_contextio_uuid"] = messages_dict.pop("id")
		items_dict["store"] = messages_dict["store"]
		items_dict["user_contextio_uuid"] = messages_dict["user_contextio_uuid"]
		items_dict["user_email"] = messages_dict["user_email"]
		cleaner = Cleaner()
		cleaner.javascript = True
		cleaner.style = True

		tree = html.fromstring(msg_fields["body"])
		#print order_xpath
		if msg_params["sender"] == "*****@*****.**" and "confirmation" in msg_fields["subject"].strip().lower():
			#tree = html.fromstring(msg_fields["body"])
			#address = tree.xpath(order_xpath["address"])
#			print order_xpath["address"]
			print msg_fields["subject"]
			#print address
			address = list()
			bsObj = BeautifulSoup(msg_fields["body"])
			add = bsObj.find(text="DELIVERY ADDRESS")
			
			if add:
				addr = add.findNext().get_text().encode("utf-8")
				#addr = addr.replace("\xa0", " ")
				address.append(addr)
				address.append(add.findNext().findNext().get_text().encode("utf-8"))
				print type(address)
				#print address
				items_dict["delivery_address"] = str_process(address)
				print items_dict["delivery_address"]
			else:
				items_dict["delivery_address"] = ""
			"""for x in range(len(order_xpath["order_id"])):
				order_id = tree.xpath(order_xpath["order_id"][x])
				if order_id:
					break"""
			order_id = re.search(r"\[(\w+)\]", msg_fields["subject"])
			items_dict["order_id"] = order_id.group(1)
			print items_dict["order_id"]
			items_dict["item_title"] = str_process(tree.xpath(order_xpath["item_title"]))
			print items_dict["item_title"]
			item_price = tree.xpath(order_xpath["item_price"])
			items_dict["item_price"] = amount_process(item_price)
			print items_dict["item_price"]
			items_dict["item_status"] = "confirmed"
			messages_dict["order_id"] = items_dict["order_id"]
			#insert_to_items_table(**items_dict)
			#insert_to_messages_table(**messages_dict)
			
			
			"""
Example #41
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0
        return data
Example #42
 def cleaned_html(self):
     # Try to parse the provided HTML string using lxml
     # strip all unnecessary information to save space
     cleaner = Cleaner()
     cleaner.scripts = True
     cleaner.javascript = True
     cleaner.comments = True
     cleaner.style = True
     self.dom = cleaner.clean_html(self.dom)
     assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
     return lxml.html.tostring(self.dom)
Example #43
def url2count(title):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html= True
    
    r = requests.get(makeurl(title), timeout=5) #r.text
    lxclean = cleaner.clean_html(r.text.replace('\t',' ').replace('\n',' ').replace('\r',' '))
    text = nltk.clean_html(lxclean)
    collapsewhitespace = re.sub(r'\s{2,}', ' ', text)
    nonPunct = re.compile('.*[A-Za-z0-9].*') 
    article_list = [w for w in collapsewhitespace.split(' ') if nonPunct.match(w)]
    article_length = len(article_list)
    return(article_length)
Example #44
	def clean(self):
		cleaner= Cleaner(page_structure=False)
		cleaner.javascript = True
		cleaner.scripts = True
		cleaner.frames = True
		cleaner.allow_tags = []
		cleaner.remove_tags = ['p', 'div', 'a']
		self.name= (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()
		self.price = (lxml.html.document_fromstring(cleaner.clean_html(self.price))).text_content()
		self.discountcode = (lxml.html.document_fromstring(cleaner.clean_html(self.discountcode))).text_content()
		self.categorycode= (lxml.html.document_fromstring(cleaner.clean_html(self.categorycode))).text_content()
		self.orderdate= (lxml.html.document_fromstring(cleaner.clean_html(self.orderdate))).text_content()
		self.selldate= (lxml.html.document_fromstring(cleaner.clean_html(self.selldate))).text_content()
		self.page= (lxml.html.document_fromstring(cleaner.clean_html(self.page))).text_content()		
Example #45
 def GetTextData(self, htmlData, forUrl='<Mising URL info>'):
     '''Function to clean up html raw data and get the text from it. Keep it small.
     Not thread safe, returns an object that will go into the parsedData["text"] field for HandleData function above'''
     from lxml import html
     if self.RemoveJavaScriptAndCSS:
       try:
         from lxml.html.clean import Cleaner
         cleaner = Cleaner()
         cleaner.javascript = True
         cleaner.style = True
         htmlData = cleaner.clean_html(htmlData)
       except:
         print("Could not remove style and js code for url :" + forUrl)
     return html.fromstring(htmlData).text_content()
Example #46
def clean_cachefiles():
    """Clean silly html from all cachefiles in the cachdir"""
    if input('Do you really want to strip all cache files from bloating tags such as <script> and <style>? ').startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.scripts = True
        cleaner.javascript = True
        for file in _get_all_cache_files():
            cfile = CompressedFile(file)
            data = cfile.read()
            cleaned = lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(data)))
            cfile.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(file, len(data), len(cleaned)))
Example #47
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["username"] = (lxml.html.document_fromstring(cleaner.clean_html(data["username"]))).text_content()
        data["storename"] = (lxml.html.document_fromstring(cleaner.clean_html(data["storename"]))).text_content()
        data["email"] = (lxml.html.document_fromstring(cleaner.clean_html(data["email"]))).text_content()

        # 		data['username']=  cleaner.clean_html(data['username'])
        #               data['storename']= cleaner.clean_html(data['storename'])
        #              data['email']= cleaner.clean_html(data['email'])

        return data
Example #48
 def remove_scripts(self):
     if not self.clean_js:
         logger.debug('Scripts will not be removed')
         self.parser_modified_content = False
         return
     cleaner = Cleaner()
     # don't modify original page structure, eg, <head>, <html>, <body> ...
     cleaner.page_structure = False
     # don't remove inline javascript
     cleaner.javascript = False
     # remove <script> tags
     cleaner.scripts = True
     self.modified_doc = cleaner.clean_html(self.doc)
     self.parser_modified_content = True
     logger.debug('Scripts were successfully removed')
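For reference, Cleaner.scripts controls removal of <script> elements, while Cleaner.javascript additionally targets script-bearing attributes and stylesheets. A minimal standalone sketch of the same flag combination on illustrative input:

import lxml.html
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.page_structure = False  # keep <html>/<head>/<body>
cleaner.javascript = False      # leave the broader javascript filter off
cleaner.scripts = True          # still drop <script> elements
doc = lxml.html.fromstring('<html><head><script>init()</script></head><body><p>text</p></body></html>')
print(lxml.html.tostring(cleaner.clean_html(doc)))  # <script> is gone, the page skeleton survives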
Example #49
def get_content(url):

    if url is None:
        return 'Body is not found!'

    content = requests.get(url).content
    doc = html.fromstring(content)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # cleaner.remove_tags = ['br']

    content = html.tostring(cleaner.clean_html(doc))

    return content
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    stuff = lxml.html.tostring(cleaner.clean_html(data))

    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val, \
                               map(lambda x: x.strip(), soup.findAll(text=True))))

    return all_text
Example #51
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
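A sketch of how the function above might be called; _is_etree is assumed to be the module's own validator, and the page content is illustrative:

import lxml.html

tree = lxml.html.document_fromstring('<html><body><h1>Title</h1><script>x()</script></body></html>')
print(get_clean_html(tree, text_only=True))  # roughly 'Title'
print(get_clean_html(tree))                  # the cleaned markup, serialized by lxml.html.tostring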
Example #52
def tokenize(n):
	reload(sys)
	sys.setdefaultencoding('utf8')
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0
	existingSpam = list()
	existingNotSpam = list()
	for file in os.listdir("./spam/"):
		if (i == n):
			break
		else:
			spamPath = os.path.join("./spam", file)
			existingSpam.append(spamPath)
			i = i + 1
	i=0
	for file in os.listdir("./notspam/"):
		if (i == n):
			break
		else:
			spamPath = os.path.join("./notspam", file)
			existingNotSpam.append(spamPath) 
			i = i+1
	y1=['0'] * len(existingSpam)
	y2=['1'] * len(existingNotSpam)
	y = y1+y2	
	existingSpam = existingSpam + existingNotSpam
	vectorizer = CountVectorizer(analyzer='word', input='filename', min_df=3, decode_error='ignore')
	spamFeatures = vectorizer.fit_transform(existingSpam)
	#print vectorizer.get_feature_names()
	print spamFeatures.shape, type(spamFeatures)
	#print notSpamFeatures.shape, type(notSpamFeatures)
	X_train, X_test, y_train, y_test = train_test_split(spamFeatures, y, test_size=0.2)  
	clf = LogisticRegression()
	clf.fit(X_train, y_train)
	y_predicted = clf.predict(X_test)
	from sklearn import metrics
	print 'Accuracy:', metrics.accuracy_score(y_test, y_predicted)
	print
	print metrics.classification_report(y_test, y_predicted)
	print
	print 'confusion matrix'
	print
	print pd.DataFrame(metrics.confusion_matrix(y_test, y_predicted))
Example #53
def processDir(data_dir, output_dir):
	if not os.path.exists(output_dir):
	    os.makedirs(output_dir)

	# process every html document.
	file_list = os.listdir(data_dir);
	html_cleaner = Cleaner()
	html_cleaner.javascript = True
	html_cleaner.style = True
	word_dict = dict()
	def updateWordDict(word):
		if word_dict.has_key(word):
			word_dict[word] = word_dict[word]+1  
		else:
			word_dict[word] = 1
	for file_name in file_list:
		if file_name[0] == '.':
			continue
		# remove html tags.
		parsetree = lxml.html.parse(data_dir+'/'+file_name)
		parsetree = html_cleaner.clean_html(parsetree)
		content = parsetree.getroot().text_content()

		# word extraction.
		words_raw = list(jieba.cut(content))
		words = list()
		for word in words_raw:
			uchar = word[0]
			if uchar >= u'\u4e00' and uchar<=u'\u9fa5' : # chinese.
				words.append(word)
				updateWordDict(word)
			if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
				word = word.lower()
				words.append(word)
				updateWordDict(word)
		# print words
		text  = ' '.join(words)
		# print text
		output = open(output_dir+file_name, 'w')
		output.write(text.encode('utf-8'))
		output.close()
	output = open(output_dir+'words.dict', 'w')
	for word in word_dict.keys():
		output.write(word.encode('utf-8')+' '+str(word_dict[word])+'\n')
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    stuff = lxml.html.tostring(cleaner.clean_html(root))

    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val, \
                               map(lambda x: x.strip(), soup.findAll(text=True))))

    return all_text.encode('ascii', 'ignore')
Example #55
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]

        # (lxml.html.document_fromstring(cleaner.clean_html(self.name))).text_content()
        data["name"] = (lxml.html.document_fromstring(cleaner.clean_html(data["name"]))).text_content()
        data["price"] = (lxml.html.document_fromstring(cleaner.clean_html(data["price"]))).text_content()
        data["itemid"] = (lxml.html.document_fromstring(cleaner.clean_html(data["itemid"]))).text_content()
        data["discountcode"] = (lxml.html.document_fromstring(cleaner.clean_html(data["discountcode"]))).text_content()
        data["orderdate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["orderdate"]))).text_content()
        data["selldate"] = (lxml.html.document_fromstring(cleaner.clean_html(data["selldate"]))).text_content()
        data["page"] = (lxml.html.document_fromstring(cleaner.clean_html(data["page"]))).text_content()

        if data["qty"] < 0:
            data["qty"] = 0

        #           self.name= cleaner.clean_html(self.name)
        return data
Example #56
    def get_url(self):
        """Get the relevant part of a web page."""

        get_url = requests.get(self.data_path)
        page_data = get_url.content

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = cleaner.clean_html(page_data)

        # Strip tags from final results.
        strip_tags = TagStripper()  # Instantiate the HTML Tag Stripper.
        strip_tags.feed(page_html)  # Strip all HTML tags.

        return strip_tags.get_html_data()
Example #57
    def get_content(self, pathName):
        try:
            file = open(pathName, "r")
            html_text = file.read()
            file.close()
        except:
            print("Fail to open the file located in {}".format(pathName))
            return None
        try:
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
########    Add
            cleaner.page_structure = False 
            htmlData = cleaner.clean_html(html_text)
        except:
            print("Could not remove style and js code from the file located in {}".format(pathName))
            return None
########    Add
        soup = BeautifulSoup(htmlData, "lxml")
########    Change return tuple (raw_content, soup) instead of raw_content
        return soup
Example #58
    def crawNews(self, url):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.comments = True

        tech_content = lxml.html.parse(url)
        tech_content = (lxml.html.tostring(tech_content))

        re_title = re.compile(r'<h1.*>(.*)</h1', re.S)
        re_content = re.compile(r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->', re.S)
        re_published = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
        re_author = re.compile(r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')

        match_title = re.search(re_title, tech_content)
        match_content = re.search(re_content, tech_content)
        match_date = re.search(re_published, tech_content)
        match_author = re.search(re_author, tech_content)

        author_url = "http://techcrunch.com" + match_author.group(1)
        author_name = match_author.group(2)
        author_twitter = match_author.group(3)

        title = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_title.group(1)))
        title = re.sub(r'\s+', ' ', title)
        title = title.decode('utf-8').strip()
        content = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_content.group(1)))
        content = re.sub(r'\s+', ' ', content)
        content = content.decode('utf-8').strip()
        content = content.strip('\n')
        published_on = datetime.datetime.strptime(match_date.group(1), '%Y-%m-%d %H:%M:%S')

        news = self.save_news(url, title, content, published_on)

        author = self.findAuthorByUrl(author_url)
        if (isinstance(author, Author) == False):
            author = self.save_author(author_url, author_name, author_twitter, '')

        self.newsAuthor(news, author)
Example #59
 def get_content(self, pathName):
     try:
         get_title = urllib.urlopen(pathName)
         soup = Soup(get_title)
         file = open(pathName, "r")
         html_text = file.read()
         file.close()
     except:
         print("Fail to open the file located in {}".format(pathName))
         return None
     try:
         cleaner = Cleaner()
         cleaner.javascript = True
         cleaner.style = True
         htmlData = cleaner.clean_html(html_text)
     except:
         print("Could not remove style and js code from the file located in {}".format(pathName))
         return None
     try:
         title = soup.title.string.encode("utf-8")
     except:
         title = ""
     return html.fromstring(htmlData).text_content() , title
Example #60
    def get_url(self):
        """Get the HTML body of a web page."""

        # Create file-like object.
        outfile = StringIO.StringIO()

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.parse(self.data_path)
            )
        )

        outfile.write(page_html)  # Write the results to this file in memory.

        return outfile