Example 1
def html2content(html, allowed_tags=["a", "abbr", "article", "aside",
                                     "b", "base", "blockquote", "body",
                                     "br", "caption", "cite", "code", "col", "colgroup",
                                     "dd", "del", "dfn", "dl", "dt",
                                     "em", "embed", "figcaption", "figure", "footer",
                                     "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
                                     "i", "img",
                                     "li",
                                     "map", "mark", "math", "meta", "meter",
                                     "nav", "noscript",
                                     "object", "ol", "optgroup", "option", "output",
                                     "p", "param", "pre", "progress",
                                     "q", "rp", "rt", "ruby",
                                     "s", "samp", "section", "small", "source", "span", "strong", "sub", "sup", "svg",
                                     "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr", "track",
                                     "u", "ul",
                                     "var", "video",
                                     "wbr"]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False  # fixed typo: 'embeded' would not set the real option
    return cleaner.clean_html(html)
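A minimal usage sketch for the whitelist approach above. The import and the sample markup are assumptions added for illustration; only lxml is required.

from lxml.html.clean import Cleaner  # import the function above relies on

sample = '<html><body><p>Hello <script>alert(1)</script><b>world</b></p></body></html>'
# The default scripts/javascript filters drop the <script> block; every tag in
# allowed_tags survives, so the paragraph structure is preserved.
print(html2content(sample))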
Example 2
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    html_tree = cleaner.clean_html(lxml.html.fromstring(html))
    el.strip_tags(html_tree, '*')  # 'el' is presumably an alias for lxml.etree, whose strip_tags drops all remaining tags
    return html_tree.text
Example 3
def clense(text, space_replacer=' ', to_lower=True, remove_punc=True):
    # remove HTML comments first as suggested in https://stackoverflow.com/questions/28208186/how-to-remove-html-comments-using-regex-in-python

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    text = cleaner.clean_html(text.encode("utf-8")).decode("utf-8")

    text = re.sub("(<!--.*?-->)", "", text, flags=re.DOTALL)
    text = remove_tags(text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  #remove non-ascii characters
    text = text.replace("&amp;", "and")
    text = text.replace("&", "and")
    text = text.strip()  # strip() returns a new string; the original calls discarded the result
    text = text.replace("\r\n", "")
    text = text.replace("\n", "")
    text = text.replace("\"", "")
    if to_lower:
        text = text.lower()

    if remove_punc:
        # from https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        text = re.sub(r'[^\w\s]', '',
                      text)  #remove punctuation marks and non-word
        text = text.replace(",", "")

    text = re.sub(' +', space_replacer, text)
    #if  all(ord(char) < 128 for char in text) == False:
    #    text = ''
    text = ''.join(i for i in text if ord(i) < 128)  # assign the filtered result; the original discarded it
    return text
Example 4
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javacript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Example 5
def visit(url):
    if url.startswith(base_url) == False:
        return

    try:
        resp = urlopen(url)
    except URLError as e:
        return

    page = resp.read()
    cleaner = Cleaner()
    cleaner.javascript = True  # fixed typo: 'javasript' set an unused attribute
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    # 	if link.has_attr('href'):
    # 		if link.has_attr('class') and 'history' in link['class']:
    # 			continue
    # 		next_link = urljoin(url,link['href'])
    # 		next_link = urldefrag(next_link)[0]
    # 		if next_link not in visited_pages:
    # 			visited_pages.append(next_link)
    # 			pages_to_visit.append(next_link)
    f = open("testing.txt", 'w')
    f.write(page)

    clean_page = cleaner.clean_html(page)
    f.write("\n\n\nVS\n\n\n")
    f.write(clean_page)
    f.close()
    soup = BeautifulSoup(clean_page, "lxml")
    return
    extract(soup, url)  # unreachable: the early return above short-circuits this call
Example 6
    def create_word_frequencies(self):

        document = re.sub(find_doc_content_pattern, "", self.content)

        cleaner = Cleaner()
        cleaner.scripts = True
        cleaner.javascript = True
        cleaner.style = True
        # # cleaner.allow_tags = ['']
        # # cleaner.remove_unknown_tags = False

        try:
            document_visible_text = cleaner.clean_html(document)
        except UnicodeDecodeError:
            document_visible_text = ""
            print "Unicode Error"
        # document_visible_text = document

        word_list = document_visible_text.split()
        for word in word_list:
            word_stemmed = word.lower()
            try:
                self.word_frequencies[
                    word_stemmed] = self.word_frequencies[word_stemmed] + 1
            except:
                self.word_frequencies[word_stemmed] = 1
            self.total_word_count = self.total_word_count + 1
Example 7
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javacript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
Example 8
def visit(url):
	if url.startswith(base_url) == False:
		return

	try:
		resp = urlopen(url)
	except URLError as e:
		return

	page = resp.read()
	cleaner = Cleaner()
	cleaner.javascript = True  # fixed typo: 'javasript' set an unused attribute
	cleaner.style = True
	cleaner.kill_tags = ELEMENTS_TO_IGNORE

	# soup = BeautifulSoup(page, "lxml")
	# for link in soup.findAll('a'):
	# 	if link.has_attr('href'):
	# 		if link.has_attr('class') and 'history' in link['class']:
	# 			continue
	# 		next_link = urljoin(url,link['href'])
	# 		next_link = urldefrag(next_link)[0]
	# 		if next_link not in visited_pages:
	# 			visited_pages.append(next_link)
	# 			pages_to_visit.append(next_link)

	clean_page = cleaner.clean_html(page)
	soup = BeautifulSoup(clean_page, "lxml")
	extract(soup, url)
Example 9
    def clearTag_old(self, text: str) -> str:
        import lxml
        from lxml.html.clean import Cleaner

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.links = True
        cleaner.meta = True
        cleaner.forms = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.remove_unknown_tags = True
        cleaner.kill_tags = ["img"]
        cleaner.remove_tags = [
            "strong",
            "div",
            "body",
            "br",
            "a",
            "p",
            "blockquote",
            "h3",
            "ol",
            "li",
            "font",
        ]
        return cleaner.clean_html(
            lxml.html.document_fromstring(text)).decode("utf-8")
Example 10
def remove_script_and_style(html_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['script']
    clean_html = cleaner.clean_html(html_content)
    return clean_html
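A short usage sketch for the helper above; the sample markup is an assumption, and Cleaner comes from lxml.html.clean as in the surrounding examples.

from lxml.html.clean import Cleaner  # assumed import

page = '<div><style>p {color: red}</style><script>var x = 1;</script><p>Body text</p></div>'
# clean_html accepts a markup string and returns a cleaned string; the
# style/script filters plus kill_tags remove both blocks while <p> survives.
print(remove_script_and_style(page))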
Example 11
 def extract_text(self, url):
     try:
         if url.value.startswith('http') and '://' in url.value:
             prog = FloatProgress(min=0, max=100, description='Progress')
             display(widgets.HTML('<br/>'), prog)
             tr0 = time()
             site = self.browser.get(url.value, timeout=10)
             if site.ok:
                 prog.value += 50
                 tr1 = time() - tr0
                 t0 = time()
                 cleaner = Cleaner()
                 cleaner.javascript = True
                 cleaner.style = True
                 cleaner.kill_tags = ['header', 'footer']
                 source_tree = etree.HTML(cleaner.clean_html(site.content))
                 text = source_tree.itertext()
                 t1 = time() - t0
                 self.text = '\n'.join(
                     [n.strip() for n in text if n.strip()])
                 prog.value += 50
                 self.keywords_and_display(prog)
             else:
                 display(
                     widgets.HTML(
                         '<div style="font-size: 1.5em; margin-top:1em; margin-bottom:1em">404 - bad URL</div>'
                     ))
         else:
             self.text = url.value
             self.keywords_and_display(False)
     except Exception as e:
         print 'Error extracting text: %s' % (e)
Example 12
def cleanpage(html):
	# cleaner setup
	cleaner = Cleaner()
	cleaner.html = True
	cleaner.page_structure = False
	cleaner.meta = False
	cleaner.safe_attrs_only = False
	cleaner.links = False
	cleaner.javascript = True  # activate the javascript filter
	cleaner.style = True       # activate the styles & stylesheet filter
	cleaner.frames = True
	cleaner.embedded = True
	cleaner.comments = True
	cleaner.annoying_tags = True
	cleaner.inline_style = True
#	cleaner.remove_tags = ['b','img','h']
	cleaner.kill_tags = ['img', 'script']

	# invoke cleaner
	try:
		content = cleaner.clean_html(html)
	except:
		# error: ValueError: Unicode strings with encoding declaration are not
		# supported. Please use bytes input or XML fragments without declaration.
		content = u""
	return content
Example 13
 def clean_article_html(cls, node):
     article_cleaner = Cleaner()
     article_cleaner.javascript = True
     article_cleaner.style = True
     article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
     article_cleaner.remove_unknown_tags = False
     return article_cleaner.clean_html(node)
Example 14
    def get_text(self, html_content: str):
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.inline_style = True

        cleaned = cleaner.clean_html(html_content)

        soup = BeautifulSoup(cleaned, 'lxml')
        text_lines = soup.findAll(text=True)

        text_lines_merged = []
        merge_str = ''

        text_lines_merged.append(text_lines[0])
        for line in text_lines[1:]:
            if '\n' == line or '' == line or ' ' == line:
                if merge_str != '':  # compare by value; 'is not' checks identity and is unreliable for strings
                    text_lines_merged.append(merge_str)
                merge_str = ''
            else:
                merge_str += (' ' + line)

        text_lines_merged = [
            self.strip(line) for line in text_lines_merged
            if len(self.strip(line)) > 128
        ]
        print(' '.join(text_lines_merged))
Example 15
def buildDicts(n):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0 
	tagsDict = set()
	while (i < n):
		if (os.path.isfile("spam/%d.txt" % i)):
			try:
				readInFile = open("spam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				tags = set(noSymbols.split())  # allCopy is the set of words without symbols
				tagsDict = tagsDict.union(tags)
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		if (os.path.isfile("notspam/%d.txt" % i)):
			try:
				readInFile = open("notspam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				tags = set(noSymbols.split())  # allCopy is the set of words without symbols
				tagsDict = tagsDict.union(tags)
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
Example 16
def tokenize(n, tagsDict):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0
	df = pandas.DataFrame(columns=list(tagsDict))  # columns expects a flat list, not a nested one

	while (i < n):
		allVector = {}
		if (os.path.isfile("spam/%d.txt" % i)):
			try:
				for word in tagsDict:
					allVector[word] = 0
				readInFile = open("spam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				allCopy = noSymbols.split()  # allCopy is the set of words without symbols
				for tag in allCopy:
					df.ix[i, tag] = df.ix[i, tag] + 1  # index row and column separately; the original 'i[tag]' subscripted an int
				df.ix[i, 'isSpam'] = 'spam'

			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		
		i = i + 1		
Example 17
def raw_scraper(url, memoize):
    t1 = time.time()
    if should_exclude(url):
        # heuristic to make downloading faster
        return None, {
            "url": url,
            "scraper": "raw",
        }

    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, {
            "url": url,
            "scraper": "raw",
        }
    if article.text == "":
        return None, {
            "url": url,
            "scraper": "raw",
        }

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
Example 18
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',  # comma added: without it 'h6' and 'span' concatenate into 'h6span'
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
Example 19
 def init_cleaner():
     from lxml.html.clean import Cleaner
     cleaner = Cleaner()
     cleaner.javascript = False
     cleaner.style = False
     cleaner.kill_tags = ["pre", "code"]
     return cleaner
Example 20
def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
Example 21
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms= True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K 
    cleaner.allow_tags = POSITIVE_K  # fixed attribute name: 'allow_tag' set nothing
    cleaner.remove_unknown_tags = False  # required alongside allow_tags, otherwise clean_html raises ValueError
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)  # use the url variable; the original passed the literal string "url"
    
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ print title
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces,"",self.doc)
    return doc
Example 22
File: wapa.py Project: mtamer/wapa
def getArticles(keyword):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True

	br = mechanize.Browser()
	br.set_handle_robots(False)
	br.addheaders=[('User-agent','chrome')]

	term = keyword.replace(" ", "+")
	query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term 
	htmltext = br.open(query).read()
	#print htmltext

	soup = BeautifulSoup(htmltext)

	search = soup.findAll('div', attrs={'id': 'search'})
	#print search[0]
	searchtext= str(search[0])
	soup1=BeautifulSoup(searchtext)
	list_items=soup1.findAll('li')

	regex = "q=.*?&amp"	
	pattern = re.compile(regex)
	results_array = []
	for li in list_items:
		soup2 = BeautifulSoup(str(li))
		links = soup2.findAll('a')
		source_link = links[0]
		#print source_link
		source_url = re.findall(pattern, str(source_link))
		if len(source_url) > 0:
				results_array.append(str(source_url[0].replace("q=", "").replace("&amp", "")))
	return results_array
Example 23
    def get_clean_html(self, html_text, text_only=True):
        try:
            etree = lxml.html.document_fromstring(html_text)

            self._is_etree(etree)
            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(etree)
            if text_only:
                return ' '.join(html.text_content().split())
                # return html.text_content()

            res = lxml.html.tostring(html)
        except Exception as e:
            logger.error(f"While parsing email in get_clean_html {e}")
            res = "junk"

        return res
Example 24
File: wapa.py Project: mtamer/wapa
def getArticles(keyword):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'chrome')]

    term = keyword.replace(" ", "+")
    query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term
    htmltext = br.open(query).read()
    #print htmltext

    soup = BeautifulSoup(htmltext)

    search = soup.findAll('div', attrs={'id': 'search'})
    #print search[0]
    searchtext = str(search[0])
    soup1 = BeautifulSoup(searchtext)
    list_items = soup1.findAll('li')

    regex = "q=.*?&amp"
    pattern = re.compile(regex)
    results_array = []
    for li in list_items:
        soup2 = BeautifulSoup(str(li))
        links = soup2.findAll('a')
        source_link = links[0]
        #print source_link
        source_url = re.findall(pattern, str(source_link))
        if len(source_url) > 0:
            results_array.append(
                str(source_url[0].replace("q=", "").replace("&amp", "")))
    return results_array
Example 25
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
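A hedged usage sketch for the wrapper above. The imports and the sample markup are assumptions; note that lxml.html.tostring returns bytes by default, so callers typically decode the result.

import lxml.html
from lxml.html.clean import Cleaner  # imports the function above depends on

raw = '<html><head><style>body {}</style></head><body><p onclick="x()">Text</p></body></html>'
cleaned = clean_html(raw)        # every filter enabled by default; safe_attrs_only strips onclick
print(cleaned.decode('utf-8'))   # tostring() returned bytes, decode for display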
Example 26
def cleanMe(text):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    #text = unicodedata.normalize("NFKD", text).encode('ascii','ignore')
    clean = cleaner.clean_html(text)
    return clean
Example 27
def strip_tags(web_content):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    text = BS(lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(web_content))), features="lxml")
    return text.getText()
Example 28
def get_context(web_link, answer):
    browser = webdriver.Firefox(
        executable_path='geckodriver-v0.26.0-win64/geckodriver')
    browser.get(web_link)
    html_source = browser.page_source
    get_context = BeautifulSoup(html_source, "lxml")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    web_page_text = ''

    for element in get_context:
        element_string = lxml.html.document_fromstring(str(element))
        page_text = lxml.html.tostring(cleaner.clean_html(element_string))
        page_text = re.sub("<.*?>", " ", str(page_text))
        web_page_text = web_page_text + " " + page_text
    browser.close()
    matcher = difflib.SequenceMatcher(None, web_page_text, answer)
    match = matcher.find_longest_match(0, len(web_page_text), 0, len(answer))
    if match.a > 1000:
        start_context = match.a - 999
    else:
        start_context = 0

    if len(web_page_text) > start_context + 2000:
        end_context = start_context + 2000
    else:
        end_context = len(web_page_text) - 1

    context = web_page_text[start_context:end_context]
    return context
Example 29
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries 
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error
    
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""
        
    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True 
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
Example 30
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error

    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
Example 31
def Content(content):
    doc = html.document_fromstring(content)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    doc = cleaner.clean_html(doc)
    plaintext = "\n".join(etree.XPath("//text()")(doc))
    return plaintext
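A small usage sketch; the html and etree aliases are assumed to be lxml.html and lxml.etree, matching the names used in the function, and the sample markup is illustrative only.

from lxml import etree, html            # aliases assumed from the names used above
from lxml.html.clean import Cleaner

markup = '<html><body><h1>Title</h1><script>var a;</script><p>First paragraph</p></body></html>'
# The script element is removed before the //text() XPath collects the
# remaining text nodes, which are then newline-joined.
print(Content(markup))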
Example 32
def trim_html(html):
    """Takes a html string as input and returns the html without any styles nor javascript"""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # Get rid of the javascript and the style
    cleaner.style = True

    return cleaner.clean_html(html)
Example 33
 def cleanInputString(self, htmlString):
     # "WITH JAVASCRIPT & STYLES"
     cleaner = Cleaner()
     cleaner.javascript = True  # This is True because we want to activate the javascript filter
     cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter
     # "WITHOUT JAVASCRIPT & STYLES"
     htmlClean = lxml.html.tostring(cleaner.clean_html(htmlString))
     return htmlClean
Example 34
def __cleanhtml(raw_html):
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True

    cleantext = cleaner.clean_html(raw_html)
    cleantext = BeautifulSoup(cleantext, "lxml").text
    return cleantext
Example 35
 def _get_cleaner(self, print_style, print_js, remove_tags):
     c = Cleaner()
     c.scripts = not print_js
     c.javascript = not print_js
     c.style = not print_style
     c.remove_tags = remove_tags
     c.page_structure = False
     return c
Example 36
def html_strict_cleaning(html, allow_tags=['p', 'br', 'a', 'img', 'div']):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.allow_tags = allow_tags
    cleaner.remove_unknown_tags = False
    return lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(html)),
                              encoding='unicode')
Example 37
 def clean_article_html(cls, node):
     article_cleaner = Cleaner()
     article_cleaner.javascript = True
     article_cleaner.style = True
     article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b',
             'em', 'i', 'tt', 'code', 'pre', 'blockquote', 'img', 'h1',
             'h2', 'h3', 'h4', 'h5', 'h6']
     article_cleaner.remove_unknown_tags = False
     return article_cleaner.clean_html(node)
Example 38
def _parseAndCleanHtml(rawHtml):
    # Parse html with lxml library
    parsedHtml = lh.fromstring(rawHtml)  # 'lh' is presumably an alias for lxml.html

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    return cleaner.clean_html(parsedHtml)
Example 39
def clean_article_html(cls, node):
    article_cleaner = Cleaner()
    article_cleaner.javascript = True
    article_cleaner.style = True
    article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'ul', 'ol', 'li',
                                  'em', 'i', 'code', 'pre', 'blockquote', 'h1',
                                  'h2', 'h3', 'h4', 'h5', 'h6']
    article_cleaner.remove_unknown_tags = False
    return article_cleaner.clean_html(node)
Example 40
 def create_html_cleaner(self):
     cleaner = Cleaner()
     cleaner.javascript = True
     cleaner.style = True
     cleaner.remove_tags = [
         'br', 'hr', 'img', 'basefont', 'area', 'base', 'col', 'embed',
         'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
     ]
     return cleaner
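For contrast with the configuration above: remove_tags drops only the tag markers and keeps their text and children, while kill_tags removes the whole element. A standalone sketch follows (a fresh Cleaner is used because the method above needs its enclosing class; the kill_tags line is an added illustration, not part of the original).

from lxml.html.clean import Cleaner

demo = Cleaner()
demo.remove_tags = ['br']   # the <br> markers vanish, the surrounding text is kept
demo.kill_tags = ['img']    # the <img> element is removed entirely
print(demo.clean_html('<div>line one<br>line two<img src="x.png" alt="gone"></div>'))
# -> roughly '<div>line oneline two</div>'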
Example 41
	def flipkart_extract_details(self, msg_fields, order_xpath, msg_params, acc_params):
		messages_dict = dict()
		items_dict = dict()
		messages_dict = {'store': 'Flipkart'}
		messages_dict.update(msg_fields)
		messages_dict.update(acc_params)
		messages_dict["user_contextio_uuid"] = messages_dict.pop("id")
		items_dict["store"] = messages_dict["store"]
		items_dict["user_contextio_uuid"] = messages_dict["user_contextio_uuid"]
		items_dict["user_email"] = messages_dict["user_email"]
		cleaner = Cleaner()
		cleaner.javascript = True
		cleaner.style = True

		tree = html.fromstring(msg_fields["body"])
		#print order_xpath
		if msg_params["sender"] == "*****@*****.**" and "confirmation" in msg_fields["subject"].strip().lower():
			#tree = html.fromstring(msg_fields["body"])
			#address = tree.xpath(order_xpath["address"])
#			print order_xpath["address"]
			print msg_fields["subject"]
			#print address
			address = list()
			bsObj = BeautifulSoup(msg_fields["body"])
			add = bsObj.find(text="DELIVERY ADDRESS")
			
			if add:
				addr = add.findNext().get_text().encode("utf-8")
				#addr = addr.replace("\xa0", " ")
				address.append(addr)
				address.append(add.findNext().findNext().get_text().encode("utf-8"))
				print type(address)
				#print address
				items_dict["delivery_address"] = str_process(address)
				print items_dict["delivery_address"]
			else:
				items_dict["delivery_address"] = ""
			"""for x in range(len(order_xpath["order_id"])):
				order_id = tree.xpath(order_xpath["order_id"][x])
				if order_id:
					break"""
			order_id = re.search(r"\[(\w+)\]", msg_fields["subject"])
			items_dict["order_id"] = order_id.group(1)
			print items_dict["order_id"]
			items_dict["item_title"] = str_process(tree.xpath(order_xpath["item_title"]))
			print items_dict["item_title"]
			item_price = tree.xpath(order_xpath["item_price"])
			items_dict["item_price"] = amount_process(item_price)
			print items_dict["item_price"]
			items_dict["item_status"] = "confirmed"
			messages_dict["order_id"] = items_dict["order_id"]
			#insert_to_items_table(**items_dict)
			#insert_to_messages_table(**messages_dict)
			
			
			"""
Example 42
 def cleaned_html(self):
     # Try to parse the provided HTML string using lxml
     # strip all unnecessary information to save space
     cleaner = Cleaner()
     cleaner.scripts = True
     cleaner.javascript = True
     cleaner.comments = True
     cleaner.style = True
     self.dom = cleaner.clean_html(self.dom)
     assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
     return lxml.html.tostring(self.dom)
Example 43
def url2count(title):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html= True
    
    r = requests.get(makeurl(title), timeout=5) #r.text
    lxclean = cleaner.clean_html(r.text.replace('\t',' ').replace('\n',' ').replace('\r',' '))
    text = nltk.clean_html(lxclean)
    collapsewhitespace = re.sub(r'\s{2,}', ' ', text)
    nonPunct = re.compile('.*[A-Za-z0-9].*') 
    article_list = [w for w in collapsewhitespace.split(' ') if nonPunct.match(w)]
    article_length = len(article_list)
    return(article_length)
Example 44
 def GetTextData(self, htmlData, forUrl='<Mising URL info>'):
     '''Function to clean up html raw data and get the text from it. Keep it small.
     Not thread safe, returns an object that will go into the parsedData["text"] field for HandleData function above'''
     from lxml import html
     if self.RemoveJavaScriptAndCSS:
       try:
         from lxml.html.clean import Cleaner
         cleaner = Cleaner()
         cleaner.javascript = True
         cleaner.style = True
         htmlData = cleaner.clean_html(htmlData)
       except:
         print("Could not remove style and js code for url :" + forUrl)
     return html.fromstring(htmlData).text_content()
Example 45
def clean_cachefiles():
    """Clean silly html from all cachefiles in the cachdir"""
    if input('Do you really want to strip all cache files from bloating tags such as <script> and <style>? ').startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.scripts = True
        cleaner.javascript = True
        for file in _get_all_cache_files():
            cfile = CompressedFile(file)
            data = cfile.read()
            cleaned = lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(data)))
            cfile.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(file, len(data), len(cleaned)))
Example 46
def get_content(url):

    if url is None:
        return 'Body is not found!'

    content = requests.get(url).content
    doc = html.fromstring(content)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    # cleaner.remove_tags = ['br']

    content = html.tostring(cleaner.clean_html(doc))

    return content
Example 47
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    stuff = lxml.html.tostring(cleaner.clean_html(data))

    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val, \
                               map(lambda x: x.strip(), soup.findAll(text=True))))

    return all_text
Example 48
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
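A usage sketch under the assumption that _is_etree is a validation helper defined elsewhere in the same module and that lxml.html is imported; the sample markup is illustrative only.

import lxml.html

tree = lxml.html.document_fromstring(
    '<html><body><p>Visible text</p><script>hidden()</script></body></html>')
print(get_clean_html(tree, text_only=True))   # roughly 'Visible text'; the script body is stripped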
Example 49
def tokenize(n):
	reload(sys)
	sys.setdefaultencoding('utf8')
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0
	existingSpam = list()
	existingNotSpam = list()
	for file in os.listdir("./spam/"):
		if (i == n):
			break  # lowercase keyword; 'Break' would raise a NameError
		else:
			spamPath = os.path.join("./spam", file)
			existingSpam.append(spamPath)
			i = i + 1
	i=0
	for file in os.listdir("./notspam/"):
		if (i == n):
			break
		else:
			spamPath = os.path.join("./notspam", file)
			existingNotSpam.append(spamPath) 
			i = i+1
	y1=['0'] * len(existingSpam)
	y2=['1'] * len(existingNotSpam)
	y = y1+y2	
	existingSpam = existingSpam + existingNotSpam
	vectorizer = CountVectorizer(analyzer='word', input='filename', min_df=3, decode_error='ignore')
	spamFeatures = vectorizer.fit_transform(existingSpam)
	#print vectorizer.get_feature_names()
	print spamFeatures.shape, type(spamFeatures)
	#print notSpamFeatures.shape, type(notSpamFeatures)
	X_train, X_test, y_train, y_test = train_test_split(spamFeatures, y, test_size=0.2)  
	clf = LogisticRegression()
	clf.fit(X_train, y_train)
	y_predicted = clf.predict(X_test)
	from sklearn import metrics
	print 'Accuracy:', metrics.accuracy_score(y_test, y_predicted)
	print
	print metrics.classification_report(y_test, y_predicted)
	print
	print 'confusion matrix'
	print
	print pd.DataFrame(metrics.confusion_matrix(y_test, y_predicted))
Example 50
 def gettextonly(self, html, url):
     cleaner = Cleaner()
     cleaner.scripts = True
     cleaner.style = True
     cleaner.links = True
     cleaner.meta = False
     cleaner.page_structure = False
     cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'div', 'span', 'img', 'area', 'map', 'noscript', 'td', 'tr',
                    'table', 'a', 'p', 'br', 'li', 'ul']
     doc = lxml.html.fromstring(html)
     path = '/html/body'
     try:
         body = doc.xpath(path)[0]
     except Exception as detail:
         print detail
         return False
     return cleaner.clean_html(body).text_content().split()
Example 51
def processDir(data_dir, output_dir):
	if not os.path.exists(output_dir):
	    os.makedirs(output_dir)

	# process every html document.
	file_list = os.listdir(data_dir);
	html_cleaner = Cleaner()
	html_cleaner.javascript = True
	html_cleaner.style = True
	word_dict = dict()
	def updateWordDict(word):
		if word_dict.has_key(word):
			word_dict[word] = word_dict[word]+1  
		else:
			word_dict[word] = 1
	for file_name in file_list:
		if file_name[0] == '.':
			continue
		# remove html tags.
		parsetree = lxml.html.parse(data_dir+'/'+file_name)
		parsetree = html_cleaner.clean_html(parsetree)
		content = parsetree.getroot().text_content()

		# word extraction.
		words_raw = list(jieba.cut(content))
		words = list()
		for word in words_raw:
			uchar = word[0]
			if uchar >= u'\u4e00' and uchar<=u'\u9fa5' : # chinese.
				words.append(word)
				updateWordDict(word)
			if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
				word = word.lower()
				words.append(word)
				updateWordDict(word)
		# print words
		text  = ' '.join(words)
		# print text
		output = open(output_dir+file_name, 'w')
		output.write(text.encode('utf-8'))
		output.close()
	output = open(output_dir+'words.dict', 'w')
	for word in word_dict.keys():
		output.write(word.encode('utf-8')+' '+str(word_dict[word])+'\n')
Example 52
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    stuff = lxml.html.tostring(cleaner.clean_html(root))

    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val, \
                               map(lambda x: x.strip(), soup.findAll(text=True))))

    return all_text.encode('ascii', 'ignore')
Example 53
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)
    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
Example 54
    def get_url(self):
        """Get the relevant part of a web page."""

        get_url = requests.get(self.data_path)
        page_data = get_url.content

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = cleaner.clean_html(page_data)

        # Strip tags from final results.
        strip_tags = TagStripper()  # Instantiate the HTML Tag Stripper.
        strip_tags.feed(page_html)  # Strip all HTML tags.

        return strip_tags.get_html_data()
Example 55
    def crawNews(self, url):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.comments = True

        tech_content = lxml.html.parse(url)
        tech_content = (lxml.html.tostring(tech_content))

        re_title = re.compile(r'<h1.*>(.*)</h1', re.S)
        re_content = re.compile(r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->', re.S)
        re_published = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
        re_author = re.compile(r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')

        match_title = re.search(re_title, tech_content)
        match_content = re.search(re_content, tech_content)
        match_date = re.search(re_published, tech_content)
        match_author = re.search(re_author, tech_content)

        author_url = "http://techcrunch.com" + match_author.group(1)
        author_name = match_author.group(2)
        author_twitter = match_author.group(3)

        title = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_title.group(1)))
        title = re.sub(r'\s+', ' ', title)
        title = title.decode('utf-8').strip()
        content = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_content.group(1)))
        content = re.sub(r'\s+', ' ', content)
        content = content.decode('utf-8').strip()
        content = content.strip('\n')
        published_on = datetime.datetime.strptime(match_date.group(1), '%Y-%m-%d %H:%M:%S')

        news = self.save_news(url, title, content, published_on)

        author = self.findAuthorByUrl(author_url)
        if (isinstance(author, Author) == False):
            author = self.save_author(author_url, author_name, author_twitter, '')

        self.newsAuthor(news, author)
Example 56
    def get_content(self, pathName):
        try:
            file = open(pathName, "r")
            html_text = file.read()
            file.close()
        except:
            print("Fail to open the file located in {}".format(pathName))
            return None
        try:
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
########    Add
            cleaner.page_structure = False 
            htmlData = cleaner.clean_html(html_text)
        except:
            print("Could not remove style and js code from the file located in {}".format(pathName))
            return None
########    Add
        soup = BeautifulSoup(htmlData, "lxml")
########    Change return tuple (raw_content, soup) instead of raw_content
        return soup
Example 57
 def get_content(self, pathName):
     try:
         get_title = urllib.urlopen(pathName)
         soup = Soup(get_title)
         file = open(pathName, "r")
         html_text = file.read()
         file.close()
     except:
         print("Fail to open the file located in {}".format(pathName))
         return None
     try:
         cleaner = Cleaner()
         cleaner.javascript = True
         cleaner.style = True
         htmlData = cleaner.clean_html(html_text)
     except:
         print("Could not remove style and js code from the file located in {}".format(pathName))
         return None
     try:
         title = soup.title.string.encode("utf-8")
     except:
         title = ""
     return html.fromstring(htmlData).text_content() , title
Example 58
    def get_url(self):
        """Get the HTML body of a web page."""

        # Create file-like object.
        outfile = StringIO.StringIO()

        cleaner = Cleaner()
        cleaner.javascript = True  # Remove JavaScript code from HTML.
        cleaner.scripts = True  # Remove other code from HTML.
        cleaner.style = True  # Remove CSS and styles from HTML.
        cleaner.links = True  # Remove Links from HTML.
        cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags.

        # Store the cleaned up HTML.
        page_html = lxml.html.tostring(
            cleaner.clean_html(
                lxml.html.parse(self.data_path)
            )
        )

        outfile.write(page_html)  # Write the results to this file in memory.

        return outfile
Example 59
def tokenize(n):
	reload(sys)
	sys.setdefaultencoding('utf8')
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0
	existingSpam = list()
	existingNotSpam = list()
	while (i < n):
		if (os.path.isfile("spam/%d.txt" % i)):
			existingSpam.append("spam/" + str(i) + ".txt")
		if (os.path.isfile("notspam/%d.txt" % i)):
			existingNotSpam.append("notspam/" + str(i) + ".txt") 
		i = i + 1
	y1=['spam'] * len(existingSpam)
	y2=['notSpam'] * len(existingNotSpam)
	y = y1+y2	
	existingSpam = existingSpam + existingNotSpam
	vectorizer = CountVectorizer(analyzer='word', input='filename', min_df=3, decode_error='ignore')
	spamFeatures = vectorizer.fit_transform(existingSpam)
	#print vectorizer.get_feature_names()
	print spamFeatures.shape, type(spamFeatures)
	#print notSpamFeatures.shape, type(notSpamFeatures)
	X_train, X_test, y_train, y_test = train_test_split(spamFeatures, y, test_size=0.2)  
	clf = LogisticRegression()
	clf.fit(X_train, y_train)
	y_predicted = clf.predict(X_test)
	from sklearn import metrics
	print 'Accuracy:', metrics.accuracy_score(y_test, y_predicted)
	print
	print metrics.classification_report(y_test, y_predicted)
	print
	print 'confusion matrix'
	print
	print pd.DataFrame(metrics.confusion_matrix(y_test, y_predicted))
Example 60
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                print("Detected new tasks ".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join([title for title in tree.xpath("//title/text()")])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            lxml.html.tostring(html)
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )

            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since