Example no. 1
def sanitize_html(html, bad_tags=['body']):
    """Removes identified malicious HTML content from the given string."""
    if html is None or html == '':
        return html
    cleaner = Cleaner(style=False, page_structure=True, remove_tags=bad_tags,
                      safe_attrs_only=False)
    return cleaner.clean_html(html)
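A minimal usage sketch for the helper above (it assumes the snippet's missing import, from lxml.html.clean import Cleaner, and that sanitize_html is in scope):

from lxml.html.clean import Cleaner  # assumed import, not shown in the snippet above

markup = '<body onload="init()"><p>Hello <script>alert(1)</script>world</p></body>'
print(sanitize_html(markup))
# lxml's default filters drop the <script> element and the onload attribute, while
# remove_tags=['body'] unwraps the body tag but keeps its children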
Example no. 2
File: wapa.py Project: mtamer/wapa
def getArticles(keyword):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True

	br = mechanize.Browser()
	br.set_handle_robots(False)
	br.addheaders=[('User-agent','chrome')]

	term = keyword.replace(" ", "+")
	query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term 
	htmltext = br.open(query).read()
	#print htmltext

	soup = BeautifulSoup(htmltext)

	search = soup.findAll('div', attrs={'id': 'search'})
	#print search[0]
	searchtext= str(search[0])
	soup1=BeautifulSoup(searchtext)
	list_items=soup1.findAll('li')

	regex = "q=.*?&amp"	
	pattern = re.compile(regex)
	results_array = []
	for li in list_items:
		soup2 = BeautifulSoup(str(li))
		links = soup2.findAll('a')
		source_link = links[0]
		#print source_link
		source_url = re.findall(pattern, str(source_link))
		if len(source_url) > 0:
				results_array.append(str(source_url[0].replace("q=", "").replace("&amp", "")))
	return results_array
Example no. 3
def html_cleanup(input):
    cleaner = Cleaner(
        scripts = True,
        javascript = True,
        comments = True,
        style = False,
        links = True,
        meta = True,
        page_structure = True,
        processing_instructions = True,
        embedded = False,
        frames = False,
        forms = True,
        annoying_tags = True,
        allow_tags = ['a', 'img', 'span', 'div', 'p', 'br', 'iframe', # for google cal
                      'strong', 'em', 'b', 'i', 'u', 'strike', 'blockquote', 'sub', 'sup',
                      'ul', 'ol', 'li', 'table', 'tdata', 'tr', 'th', 'td',
                      'h1', 'h2', 'h3', 'h4'],
        remove_unknown_tags = False,
        safe_attrs_only = True,
        host_whitelist = ['youtube.com', 'www.google.com'],
        whitelist_tags = ['iframe', 'embed', 'script', 'img']
        )
    sane = cleaner.clean_html("<div>%s</div>"%input)
    return sane[len('<div>'):-len('</div>')]
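A hedged usage sketch of html_cleanup; the explicit <div> wrapper gives clean_html a single root element, so the final slice strips exactly that wrapper rather than whatever clean_html would otherwise add around a multi-element fragment:

from lxml.html.clean import Cleaner  # assumed import, not shown in the snippet above

fragment = '<p onclick="x()">hi</p><blink>there</blink>'
print(html_cleanup(fragment))
# attributes outside lxml's safe set (such as onclick) are dropped because
# safe_attrs_only=True, and tags missing from allow_tags are unwrapped while
# their text content is kept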
Example no. 4
    def parse(self, content):
        """Clean and parse HTML content."""

        cleaner = Cleaner(style=True, links=False, page_structure=False, meta=True,
            safe_attrs_only=False, remove_unknown_tags=False)
        clean_content = cleaner.clean_html(content)

        html = etree.iterparse(StringIO(clean_content), events=("start", "end"))
        level = -1
        css = ''

        # We do not want to style these elements.
        ignore_tags = ['html', 'body', 'head', 'meta', 'title', 'script']

        if self.options.delimiter == 'spaces':
            delimiter = '  '
        else:
            delimiter = '\t'

        for action, elem in html:
            if (action == 'start'):
                identifier = self.identify_ruleset(elem)

                if elem.tag not in ignore_tags:
                    level += 1
                    css += delimiter * level + identifier + ' {\n'
                    if not self.options.clean_mode:
                        css += delimiter + delimiter * level + '/* enter your CSS here... */\n'
            else:
                if elem.tag not in ignore_tags:
                    css += delimiter * level + '}\n'
                    level -= 1

        return css.strip()
Example no. 5
    def _get_breakdowns(self):
        """
        returns breakdowns from GWDG in given timewindow
        """
        #load feed first, since not working with lxml directly
        r = requests.get(URL)
        
        #load url and parse it with html parser
        root = lxml.etree.fromstring(r.text.encode("utf-8"))
        
        #get items
        items = []
        for x in root.findall("channel/item"):
            pubdate = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(
                    email.utils.parsedate_tz(
                        x.find("pubDate").text[:-6]
                    )
                )
            )
            if pubdate >= OLDEST_NEWS:
                cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
                title = cleaner.clean_html(x.find("title").text)[5:-6]
                content = cleaner.clean_html(x.find("description").text)[5:-6] 
                item = {
                    "title"   : title,
                    "pubdate" : str(pubdate),
                    "content" : content,
                }
                items.append(item)

        return sorted(items, key=lambda x: x["pubdate"], reverse=True)
Example no. 6
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javascript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Example no. 7
def truncate(content, max_length=DEFAULT_TRUNCATE_LENGTH, allowed_tags=ALLOWED_TAGS, full_link=None):
    """ truncate a body of text to the expected 'max_length' and strip
        the body of text of all html tags that are not in 'allowed tags'. You
        can also specify a 'strip' value (True -> strip html tags, False ->
        escape html tags and leave them in text)
    """
    if not content:
        return ''

    cleaner = Cleaner(
        page_structure=False,
        links=True,
        safe_attrs_only=True,
        remove_unknown_tags=False,
        allow_tags=allowed_tags
    )

    content = defaultfilters.truncatechars_html(cleaner.clean_html(content), max_length)
    if full_link:
        try:
            insert_point = content.rindex('</p>')
        except ValueError:
            insert_point = content.rindex('<')

        ending = content[insert_point:]
        content = content[:insert_point]

        content += '&nbsp;<a href="' + full_link + '">(Read More)</a>' + ending
    return content
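A hedged call sketch for truncate; DEFAULT_TRUNCATE_LENGTH and ALLOWED_TAGS are module-level constants that are not shown here, and defaultfilters.truncatechars_html comes from Django, so explicit illustrative values are passed instead:

teaser = truncate(
    '<p>First paragraph.</p><p>A much longer second paragraph that will be cut off.</p>',
    max_length=40,
    allowed_tags=['p', 'a', 'em', 'strong'],  # illustrative list, not the real ALLOWED_TAGS
    full_link='https://example.com/post/42',  # hypothetical URL
)
# the markup is cleaned, truncated to roughly 40 characters of text, and a
# "(Read More)" link is spliced in just before the final closing </p>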
Example no. 8
    def __init__(self, input):
        self.title = input.get('post_title')
        self.content = input.get('post_content')
        self.category = input.get('post_category')
        self.is_public = input.get('post_is_public')

        if self.is_public:
            self.is_public = True
        else:
            self.is_public = False

        if self.category not in config.get('post_categories'):
            raise exceptions.CantValidateForm

        if self.title:
            # strip markup
            html_string = lxml.html.fromstring(self.title)
            self.title = unicode(html_string.text_content())
        else:
            self.title = ''

        if self.content:
            # clean markup
            cleaner = Cleaner(**post_rules)
            self.content = cleaner.clean_html(self.content)
            # replace newlines
            self.content = self.content.replace('\r\n', '<br />')
        else:
            raise exceptions.CantValidateForm
Example no. 9
    def test_allow_tags(self):
        html = """
            <html>
            <head>
            </head>
            <body>
            <p>some text</p>
            <table>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            </table>
            <img>
            </body>
            </html>
            """

        html_root = lxml.html.document_fromstring(html)
        cleaner = Cleaner(
            remove_unknown_tags = False,
            allow_tags = ['table', 'tr', 'td'])
        result = cleaner.clean_html(html_root)

        self.assertEqual(12-5+1, len(list(result.iter())))
Example no. 10
 def get_content(self, site):
     sel = None
     if site.id_type == "css":  # translates csspath into xpath
         s = CSSSelector(site.identifier)
         sel = s.path
     else:
         sel = site.identifier
     try:
         page = requests.get(site.url)
         parser = le.HTMLParser()
         tree = le.parse(StringIO(page.text), parser)
         xp = tree.xpath(sel)
         if len(xp) < 1:
             return None
         html = lxml.html.tostring(xp[0])
         cleaner = Cleaner(style=True, links=False,
                           page_structure=False, embedded=False,
                           frames=False, forms=False)
         cleaned_html = cleaner.clean_html(html)
         self._print("Cleaning html: " + str(len(html)) +
                     " -> " + str(len(cleaned_html)))
         return cleaned_html
     except Exception as e:
         self._print("EXCEPTION! " + str(e.message))
         return None
Example no. 11
def analyze(request):
    url = request.GET['url']
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1')]
    response = opener.open(url)
    raw_html = response.read()

    cleaner = Cleaner(kill_tags = ['style', 'script', 'head'], allow_tags = [''], remove_unknown_tags = False)
    raw_text = cleaner.clean_html(raw_html)
    ptn = re.compile('<div>|</div>')
    raw_text = re.sub(ptn, '', raw_text)
    ptn = re.compile('\s+')
    raw_text = re.sub(ptn, ' ', raw_text)
    raw_text = raw_text.strip().lower()
    prd, score = MLearn.predict(raw_text)
    donut = score * 100
    results = MLearn.predict_other(raw_text)
    related_headline = results[0][2]
    related_verdict = results[0][0]
    related_score = results[0][1] * 100

    context = {
        'url': url,
        'verdict': prd,
        'score': donut,
        'related_headline': related_headline,
        'related_verdict': related_verdict,
        'related_score': related_score,
        'results': results,
    }

    return render(request, 'results.html', context)
Example no. 12
def visit(url):
	if url.startswith(base_url) == False:
		return

	try:
		resp = urlopen(url)
	except URLError as e:
		return

	page = resp.read()
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	cleaner.kill_tags = ELEMENTS_TO_IGNORE

	# soup = BeautifulSoup(page, "lxml")
	# for link in soup.findAll('a'):
	# 	if link.has_attr('href'):
	# 		if link.has_attr('class') and 'history' in link['class']:
	# 			continue
	# 		next_link = urljoin(url,link['href'])
	# 		next_link = urldefrag(next_link)[0]
	# 		if next_link not in visited_pages:
	# 			visited_pages.append(next_link)
	# 			pages_to_visit.append(next_link)

	clean_page = cleaner.clean_html(page)
	soup = BeautifulSoup(clean_page, "lxml")
	extract(soup, url)
Example no. 13
def get_intro_text(text):
    """ Returns only the first <p> tag and preceding nodes
    """

    #cut the text to the first paragraph
    index = text.lower().find('</p>', 1000)
    if index != -1:
        text = text[:index] +'</p>'

    cleaner = Cleaner(
            scripts=False,
            javascript=False,
            comments=False,
            style=False,
            links=False,
            meta=False,
            page_structure=False,
            processing_instructions=False,
            embedded=False,
            forms=False,
            remove_unknown_tags=True,
            )
    text = cleaner.clean_html(text)

    return text
Example no. 14
def sanitize(html):
    if not html:
        return html
    cleaner = Cleaner(allow_tags=_safe_tags, safe_attrs_only=True, safe_attrs=_safe_attrs, remove_unknown_tags=False)
    html = autolink_html(cleaner.clean_html(html))

    parts = re.split('(<.*?>)', html)

    output = ''
    in_a_tag = False
    for part in parts:
        if not len(part):
            continue

        is_tag = part[0] == '<'
        if is_tag or in_a_tag:
            output += part
            if part[0:2].lower() == '<a':
                in_a_tag = True
            elif part[0:3].lower() == '</a':
                in_a_tag = False
            continue

        part = re.sub("([a-zA-Z0-9_\\-+\\.\']*[a-zA-Z0-9]@[0-9a-zA-Z\\-\\.]+\\.[a-zA-Z]{2,})", '<a href="mailto:\\1">\\1</a>', part)

        # After linking up emails, only look for twitter in the remaining parts
        sub_parts = re.split('(<.*?>)', part)
        part = ''
        for sub_part in sub_parts:
            part += re.sub("(?<![a-zA-Z0-9])@([0-9a-zA-Z_]{1,15})", '<a href="https://twitter.com/\\1">@\\1</a>', sub_part)

        output += part

    return output
Example no. 15
    def _remove_tags(self):
        cleaner = Cleaner(
            scripts = True ,
            javascript = True ,
            comments = True ,
            style = False ,
            links = True ,
            meta = False,
            page_structure = None ,
            processing_instructions = True ,
            embedded = True ,
            frames = True ,
            forms = True ,
        #    annoying_tags = True ,
        #    remove_tags = None ,
        #    allow_tags = allowed_tags ,
            remove_unknown_tags = False ,
        #    safe_attrs_only = True ,
        #    add_nofollow = False ,
        )
#        patch to add space in tags
        for el in self.root.iter():
            if el is not None and el.text:
                el.text = el.text+' '
            if el is not None and el.tail:
                el.tail = el.tail+' '
#        remove tags
        self.root = cleaner.clean_html(self.root)
        for el in self.root.iter():
            if el.tag=='a' and el.get('rel')=='nofollow':
                el.text = ''
                el.drop_tag()
Example no. 16
def getFormatHtml(htmlContent):
    try:
        doc = soupparser.fromstring(htmlContent)
    except Exception, e:
        cleaner = Cleaner()
        htmlContent = cleaner.clean_html(htmlContent)
        doc = soupparser.fromstring(htmlContent)
    return doc
Example no. 17
    def _load(self):
        """
        Load the ElementTree from the source
        """
        # Convert directional quotation marks to regular quotes
        double_quotes = ur'[\u201c\u201d]'
        self.source = re.sub(double_quotes, u'"', self.source)
        single_quotes = ur'[\u2019\u2018]'
        self.source = re.sub(single_quotes, u"'", self.source)
        # Convert colons
        self.source = self.source.replace(u'\uff1a', u':')
        # Remove line breaks and tabs
        self.source = self.source.replace(u'\n', u'')
        self.source = self.source.replace(u'\t', u'')
        # There are also some "zero width joiners" in random places in the text
        # Should remove them here, since they make string search unreliable
        # these are the codes: &#8205, &#160 (nbsp), \xa0 (nbsp), \u200d
        zero_width_joiners = u'\u200d'
        self.source = self.source.replace(zero_width_joiners, u'')
        # Also previously had some non breaking spaces in unicode \u00a0, but this
        # may have been fixed by changing the parser below

        # Use the lxml cleaner
        cleaner = Cleaner()
        parser = HTMLParser(encoding='utf-8')
        # Finally, load the cleaned string to an ElementTree
        self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
Example no. 18
    def gettextonly(self, tree):
        cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                          page_structure=False, safe_attrs_only=False)
        try:
            v = tostring(tree,method='text',encoding=unicode)
        except:
            v = None
            
        if v == None:
            c = lxml.html.tostring(tree)
            print 'v== null' 
#            resulttext = ''
#            for t in c:
#                subtext = self.gettextonly(t)
#                resulttext += subtext + '\n'
#            return resulttext
            return c
        else:
            # Clean up the javascript and comment.
            try:
                v = cleaner.clean_html(v)
            except:
                # Ignore clean error
                pass
            return v.strip()
Example no. 19
def _statistica_(url_string):
    """Implementa la logica per estrarre documento
    e metadati da rivista-statistica
    """
    url = urlparse.urlparse(url_string)
    conn = httplib.HTTPConnection(url.hostname)
    conn.request("GET", url.path)
    res = conn.getresponse()
    body = res.read()

    my_page = html.fromstring(body)

    # Remove the ***** cookie banner
    for el in my_page.xpath('//*[@id="cookiesAlert"]'):
        el.getparent().remove(el)

    # Remove all script tags and their content
    cleaner = Cleaner()
    cleaner.javascript = True
    my_page = cleaner.clean_html(my_page)

    title = my_page.xpath('//*[@id="articleTitle"]/h3')
    full_content = my_page.xpath('//*[@id="content"]')
    doi = my_page.xpath('//*[@id="pub-id::doi"]')

    full_content = ''.join(
        [etree.tostring(fix_links(el, url_string)) for el in full_content])

    result = {
        'title': title[0].text_content(),
        'content': full_content,
        'doi': doi[0].text_content()
        }

    return json.JSONEncoder().encode(result)
Example no. 20
def clean_html(html, safe_attrs=('src', 'href'),
               input_encoding='unicode',
               output_encoding='unicode',
               **kwargs):
    """
    Fix HTML structure and remove non-allowed attributes from all tags.
    """

    from lxml.html.clean import Cleaner

    # Convert HTML to Unicode
    html = render_html(parse_html(html, encoding=input_encoding), make_unicode=True)

    # Strip some shit with default lxml tools
    cleaner = Cleaner(page_structure=True, **kwargs)
    html = cleaner.clean_html(html)

    # Keep only allowed attributes
    tree = parse_html(html)
    for elem in tree.xpath('./descendant-or-self::*'):
        for key in elem.attrib.keys():
            if safe_attrs:
                if key not in safe_attrs:
                    del elem.attrib[key]

    return render_html(tree, encoding=output_encoding)
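A usage sketch under stated assumptions: parse_html and render_html are helpers from the same library as this function (they are not shown in the snippet), so only the call shape is illustrated:

cleaned = clean_html(
    '<p class="intro" onclick="x()">Hi <a href="/a" rel="nofollow">link</a></p>',
    safe_attrs=('href',),  # keep only href attributes on the surviving tags
    javascript=True,       # extra keyword arguments are forwarded to Cleaner(**kwargs)
)
# class, onclick and rel are stripped in the attribute pass; href survives because
# it is listed in safe_attrs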
Example no. 21
def createPages():
    items = source.contentItems()
    for item in items:
        doc = parse(item).getroot()
        cleaner = Cleaner(style=True, links=False, page_structure=True, safe_attrs_only=False)
        cleaned = cleaner.clean_html(doc)
    # get the pagetitle

        titles = cleaned.find_class('Pagetitle')
    # snag the page title - method returns list. . there's really only one
        title = titles[0].text_content()

    # get the description
        descrips = cleaned.find_class('Summarytext')
        descrip = descrips[0].text_content()
    #Need to have temporary id
        uid = str(random.randint(0, 99999999))

        target.invokeFactory("Document", id=uid)
        obj = target[uid]
        obj.setTitle(title)
        obj.setDescription(descrip)
        obj.setText.getBodyText()


# Will finish Archetypes content item creation process,
# rename-after-creation and such
        obj.processForm()

        return obj
Example no. 22
def sanitize_payload(payload):
    "Sanitize HTML"
    if not payload:
        return '', ''
    styles = []
    payload = clean_payload(payload)
    body_style, body_class = get_body_style(payload)
    if body_style:
        styles.append(body_style)
    safe_attrs = set(defs.safe_attrs)
    safe_attrs.add('style')
    cleaner = Cleaner(remove_tags=UNCLEANTAGS,
                    safe_attrs_only=True,
                    safe_attrs=safe_attrs)
    payload = HTMLTITLE_RE.sub('', payload)
    try:
        html = cleaner.clean_html(payload)
    except ValueError:
        payload = bytes(bytearray(payload, encoding='utf-8'))
        html = cleaner.clean_html(payload)
    except XMLSyntaxError:
        html = ''
    mainstyle = sanitize_css(get_style(html))
    if mainstyle:
        styles.append(decode(mainstyle))
    style = u'\n'.join(styles)
    html = clean_styles(CSS_COMMENT_RE.sub('', html))
    html = set_body_class(html, body_class)
    return html.strip(), style.strip()
Example no. 23
def handle_item(path):
  # url="http://news.39.net/"+path.split("/root/39_data/news.39.net/")[1]
  flag,title,text=False,"",""
  try:
    # request=requests.get(url,proxies=get_proxy(),timeout=5)
    # if request.status_code!=200: raise
    with open(path,"r") as file:
      content=file.read()
    html=lxml.html.fromstring(content.decode("gbk"))
    try:
      if re.search("utf",html.xpath("//meta/@charset")[0]): 
       html=lxml.html.fromstring(r.content.decode("utf-8"))
    except: pass
    try:
      if len(html.xpath("//div[@class='art_box']/h1/text()"))>0:
        title=html.xpath("//div[@class='art_box']/h1/text()")[0]
      else:
        title=html.xpath("//div[@class='artbox']/h1/text()")[0]
    except:
      title=""
    print("title:%s"%title)
    if len(html.xpath("//div[@id='contentText']"))>0: div1=html.xpath("//div[@id='contentText']")[0]
    elif len(html.xpath("//div[@class='article']"))>0: div1=html.xpath("//div[@class='article']")[0]
    else: raise
    cleaner = Cleaner(scripts = True)
    for p in div1.xpath("./p"):
      p=cleaner.clean_html(p)
      try:
        text+=p.text_content().strip()+"\n"
      except: pass
    print("text:%s"%text)
    flag=True
  except Exception,e:
    print(e)
Example no. 24
    def parse(self, response):
        item = JournalItem()

        base_url = "http://journals.ametsoc.org"

        journalTitle = response.xpath('//*[@id="journalBlurbPanel"]/div[2]/h3/text()').extract_first()
        item['title'] = journalTitle

        journalIssue = response.xpath('//*[@id="articleToolsHeading"]/text()').extract_first().strip()  # remove whitespace at start and end
        item['issue'] = journalIssue

        # setup html cleaner to strip html tags from string (journal titles often use sub/superscript and splits article title)
        html_cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)

        journalDescription = response.xpath('//*[@id="journalBlurbPanel"]/div[4]').extract()
        journalDescription = "".join(journalDescription)
        journalDescription = html_cleaner.clean_html(journalDescription)[5:-6]  # remove any html tags and then trim the <div> tags that the cleaner inserts
        journalDescription = removeNewlines(journalDescription)  # remove any \n\r\t characters
        journalDescription = journalDescription.strip()
        item['description'] = journalDescription

        coverImage = response.xpath('//*[@id="smallIssueCover"]/img/@src').extract_first().strip()
        print(coverImage)
        item['coverURL'] = base_url + coverImage

        yield item
Example no. 25
    def learn_stopwords(self):
        req = urllib2.Request(self.html_url, headers={
            'Host': 'github.com',
            'Referer': 'https://github.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36'})
        r = urllib2.urlopen(req)
        page = r.read()
        tree = html.fromstring(page)

        # get readme part
        readme_tree = tree.xpath('//*[@id="readme"]/article')
        if len(readme_tree) < 1:
            return

        readme_tree = readme_tree[0]
        self.origin_readme = readme_tree.text_content()
        cleaner = Cleaner(allow_tags=['p','h1','h2','h3','h4','h5','pre'], remove_unknown_tags=False)
        readme_tree = cleaner.clean_html(readme_tree)

        header = ""
        # iterate each header and paragraph
        for sub in readme_tree.iterchildren():
            if sub is None:
                break

            if sub.tag == 'pre' and header:
                self.add_stopwords(self.filter_all(header))
                header = ""
            elif sub.tag in ['h1','h2','h3','h4'] and sub.text is not None:
                header = sub.text.strip().lower()
Example no. 26
def strip_comments__lxml(html_string=""):
    if not html_string: return html_string
    
    params = {
        'comments': True,
        'scripts': False,
        'javascript': False,
        'style': False,
        'links': False,
        'meta': False,
        'page_structure': False,
        'processing_instructions': False,
        'embedded': False,
        'frames': False,
        'forms': False,
        'annoying_tags': False,
        'remove_tags': None,
        'allow_tags': None,
        'remove_unknown_tags': True,
        'safe_attrs_only': False,
    }
    try:
        cleaner = Cleaner(**params)
        html = lxml.html.fromstring(html_string)
        clean_html = cleaner.clean_html(html)

        return lxml.etree.tostring(clean_html)
    except (XMLSyntaxError, ParserError):
        return html_string
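The snippet above is close to self-contained; a minimal sketch of the imports it appears to expect, plus one call (the exception names are assumed to come from lxml.etree):

import lxml.html
import lxml.etree
from lxml.etree import XMLSyntaxError, ParserError  # assumed import style
from lxml.html.clean import Cleaner

print(strip_comments__lxml('<p>visible<!-- hidden editorial note --></p>'))
# only the comments filter is enabled, so the comment node is removed and the rest
# of the markup is returned re-serialized by lxml.etree.tostring()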
Example no. 27
    def parse(self, response):
        sel = Selector(response)

        # urls = sel.xpath('//@href').extract()
        urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()

        item = ZiwuItem()
        item['url'] = response.url
        item['title'] = ''.join(sel.xpath('//div[@id="article_details"]/div[@class="article_title"]/h1/span/a/text()').extract())

        itemcontent = ''.join(sel.xpath('//div[@id="article_details"]/div[@id="article_content"]/node()').extract())

        cleaner = Cleaner(page_structure=False, links=False, safe_attrs_only=True, safe_attrs = frozenset([]))
        cleansed = cleaner.clean_html(itemcontent)

        item['content'] = cleansed

        yield item

        for url in urls:
            utf8_url = url.encode('utf-8')
            base_url = get_base_url(response)

            """The following regex to match the prefix and postfix of urls"""
            postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
            prefix = re.compile(r'^((javascript:)|(openapi)).+')

            if postfix.match(utf8_url):
                continue
            if prefix.match(utf8_url):
                continue
            if not utf8_url.startswith('http://'):
                weburl = urljoin_rfc(base_url, utf8_url)
            else:
                weburl = utf8_url

            yield Request(weburl, callback=self.parse)
Example no. 28
def buildDicts(n):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0 
	tagsDict = set()
	while (i < n):
		if (os.path.isfile("spam/%d.txt" % i)):
			try:
				readInFile = open("spam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				tags = set(noSymbols.split())  # allCopy is the set of words without symbols
				tagsDict = tagsDict.union(tags)
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		if (os.path.isfile("notspam/%d.txt" % i)):
			try:
				readInFile = open("notspam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				tags = set(noSymbols.split())  # allCopy is the set of words without symbols
				tagsDict = tagsDict.union(tags)
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		i = i + 1
Example no. 29
 def get_hidden_comments(post_id_with_minus):
     """ Loads hidden comments and returns a list of CommentPostInfo objects """
     params = urllib.urlencode({
         'act': 'get_replies', 
         'al': 1,
         'count': 'false',
         'post': post_id_with_minus
     })
     request = urllib2.Request('http://vkontakte.ru/al_wall.php')
     request.add_header("X-Requested-With", "XMLHttpRequest")
     request.add_header("Origin", "http://vkontakte.ru")
     
     data = urllib2.urlopen(request).read()
     
     with open('b:/1.html', 'w') as f:
         f.write(data)
     data = data.decode('cp1251')
     # returns an element, not a tree
     html = lxml.html.document_fromstring( data )
     cleaner = Cleaner( style=True, page_structure=False )
     cleaned_html = cleaner.clean_html( html )
     hidden_comments = list()
     for reply_element in cleaned_html.cssselect('div.reply.clear'):
         hidden_comments.append( VkontakteGroupNewsReader.get_reply_from_response_part( reply_element ) )
     return hidden_comments
Example no. 30
def tokenize(n, tagsDict):
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	i = 0
	df = pandas.DataFrame(columns=list(tagsDict))

	while (i < n):
		allVector = {}
		if (os.path.isfile("spam/%d.txt" % i)):
			try:
				for word in tagsDict:
					allVector[word] = 0
				readInFile = open("spam/%d.txt" % i)
				content = readInFile.read()
				noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
				allCopy = noSymbols.split()  # allCopy is the set of words without symbols
				for tag in allCopy:
					df.ix[i, tag] = df.ix[i, tag] + 1
				df.ix[i, 'isSpam'] = 'spam'
				
			except Exception, err:
				print traceback.format_exc()
				print sys.exc_info()[0]
		
		i = i + 1		
Example no. 31
    def clean_cachefiles(self):
        """Clean silly html from all cachefiles in the cachdir"""
        if input(
                'Do you really want to strip all cache files from bloating tags such as <script> and <style>? '
        ).startswith('y'):
            import lxml.html
            from lxml.html.clean import Cleaner

            cleaner = Cleaner()
            cleaner.style = True
            cleaner.scripts = True
            cleaner.javascript = True
            for file in self._get_all_cache_files():
                cfile = CompressedFile(file)
                data = cfile.read()
                cleaned = lxml.html.tostring(
                    cleaner.clean_html(lxml.html.fromstring(data)))
                cfile.write(cleaned)
                logger.info('Cleaned {}. Size before: {}, after {}'.format(
                    file, len(data), len(cleaned)))
Example no. 32
dataset_dir = os.path.dirname(os.path.abspath(__file__)) + '/dataset/'

lines = open(dataset_dir + 'long_list_stop_words.txt', 'r').readlines()
stop_words = [
    word[:-1].lower().encode('utf-8', 'ignore')
    for word in lines if word[-1] == '\n'
] + [
    word.lower().encode('utf-8', 'ignore')
    for word in lines if word[-1] != '\n'
]
dictionary = pickle.load(open(dataset_dir + 'dictionary.pickle', "rb"))
reverse_dictionary = pickle.load(
    open(dataset_dir + 'reverse_dictionary.pickle', "rb"))
#embeddings = pickle.load(open(dataset_dir+'embeddings.pickle', "rb"))
idf_values = pickle.load(open(dataset_dir + 'idf_values.pickle', "rb"))

topics = [
    'Science', 'Space', 'Astronomy', 'Photo', 'Comics', 'Cooking',
    'Neuroscience', 'Politics', 'Technology', 'Videogames', 'IT', 'Devices',
    'Management', 'Marketing', 'Design', 'Food', 'Startup', 'Console',
    'Economics', 'Education', 'YouTube'
]

cleaner = Cleaner(meta=False,
                  page_structure=False,
                  style=False,
                  kill_tags=['style', 'script', 'iframe', 'video'],
                  safe_attrs_only=False,
                  remove_unknown_tags=False)
img_cleaner = Cleaner(kill_tags=['img'], remove_unknown_tags=False)
Example no. 33
def reflect_html(key: int, day: str, digest: str) -> Union[None, bool]:
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    """
    1. Skip if an already-processed output file exists
    """
    out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}"
    if Path(out_filename).exists():
        return True
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1024x1024")
    options.add_argument(
        f"user-data-dir=/tmp/{FILE.replace('.py', '')}_{key:06d}")
    options.binary_location = shutil.which("google-chrome")
    try:
        driver = webdriver.Chrome(executable_path=shutil.which("chromedriver"),
                                  options=options)
        driver.get(f"http://localhost/twitter/input/{day}/{digest}")
        print('debug', f"http://localhost/twitter/input/{day}/{digest}")
        html = driver.page_source
        time.sleep(5)
        html = driver.page_source
        driver.save_screenshot(f"/home/gimpei/{digest}.png")
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
        # elm = driver.find_element_by_xpath("/html")
        time.sleep(1)
        inner_html = driver.page_source
        # print("inner", inner_html)

        # inner_html = driver.page_source
        # print(html)
        """get shadow-root"""
        # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""")
        # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""")
        # inner_html = elm.get_attribute("innerHTML")
        cleaner = Cleaner(style=True,
                          links=True,
                          add_nofollow=True,
                          page_structure=False,
                          safe_attrs_only=False)
        # print(inner_html)
        soup = BeautifulSoup(inner_html, "lxml")
        imported_csses = [
            el for el in soup.find_all("style", {"type": "text/css"})
        ]

        # replace css text to local css
        for css in imported_csses:
            if "@import url" in css.text:
                css_url = re.search(r'url\("(.*?)"\)', css.text).group(1)
                css_digest = GetDigest.get_digest(css_url)
                # print(css_url, css_digest)
                with requests.get(css_url) as r:
                    css_text = r.text
                Path(f"{TOP_DIR}/var/Twitter/css").mkdir(exist_ok=True,
                                                         parents=True)
                with open(f"{TOP_DIR}/var/Twitter/css/{css_digest}",
                          "w") as fp:
                    fp.write(css_text)
                css.string = f'@import url("/twitter/css/{css_digest}")'

        # replace image src
        for img in soup.find_all(attrs={"src": True}):
            url = img.get("src")
            o = urlparse(url)
            if o.scheme == "":
                o = o._replace(scheme="https")
            url = o.geturl()

            url_digest = GetDigest.get_digest(url)
            if "format=jpg" in url or re.search(".jpg$", url) or re.search(
                    ".jpeg$", url) or re.search(".JPG$", url):
                with requests.get(url, timeout=30) as r:
                    binary = r.content
                Path(f"{TOP_DIR}/mnt/twitter_jpgs").mkdir(exist_ok=True,
                                                          parents=True)
                with open(f"{TOP_DIR}/mnt/twitter_jpgs/{url_digest}",
                          "wb") as fp:
                    fp.write(binary)
                # print(f"downloaded! {TOP_DIR}/mnt/twitter_jpgs/{url_digest}")
                img["src"] = f"/twitter/jpgs/{url_digest}"
            elif "format=png" in url or re.search(".png$", url):
                with requests.get(url, timeout=30) as r:
                    binary = r.content
                Path(f"{TOP_DIR}/var/Twitter/pngs").mkdir(exist_ok=True,
                                                          parents=True)
                with open(f"{TOP_DIR}/var/Twitter/pngs/{url_digest}",
                          "wb") as fp:
                    fp.write(binary)
                img["src"] = f"/twitter/pngs/{url_digest}"
            elif "normal" in url or ".js" in url or ".svg" in url:
                continue
            else:
                continue
                # raise Exception(f"unsupported image! url={url}")
        """adhoc style edit"""
        if soup.find(attrs={"class": "EmbeddedTweet"}):
            soup.find(attrs={"class": "EmbeddedTweet"
                             })["style"] = "margin: 0 auto; margin-top: 150px;"

        out_dir = f"{TOP_DIR}/var/Twitter/tweet/{day}"
        Path(out_dir).mkdir(exist_ok=True, parents=True)
        with open(f"{out_dir}/{digest}", "w") as fp:
            fp.write(soup.__str__())
        driver.close()
        # if E.get("DEBUG"):
        print(
            f"[{NAME}] ordinally done, day = {day} digest = {digest}, filename = {out_dir}/{digest}"
        )
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(
            f"[{NAME}] exc = {exc}, tb_lineno = {tb_lineno}, day = {day}, digest = {digest}, filename = {out_filename}",
            file=sys.stderr)
        out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}"
        Path(f"{TOP_DIR}/var/Twitter/tweet/{day}").mkdir(exist_ok=True,
                                                         parents=True)
        # enable this to ignore entries that failed to parse
        # Path(out_filename).touch()
        time.sleep(5)
        return None
    return f"/twitter/tweet/{day}/{digest}"
Example no. 34
def _clean_html_body(request, email, body, charset):
    """Clean up a html part as best we can

    Doesn't catch LXML errors
    """
    html_tree = lxml_html.fromstring(body)

    # if the HTML doc says its a different encoding, use that
    for meta_tag in html_tree.xpath("/html/head/meta"):
        if meta_tag.get("http-equiv", None) == "Content-Type":
            try:
                content = meta_tag.attrib["content"]
                content = content.split(";", 1)[1]
                charset = dict(HEADER_PARAMS.findall(content))["charset"]
                break
            except (KeyError, IndexError):
                pass
        elif "charset" in meta_tag.attrib:
            charset = meta_tag.attrib["charset"]
            break

    try:
        # check there's a body for premailer
        if html_tree.find("body") is not None:
            html_tree = InboxenPremailer(html_tree).transform()
    except Exception as exc:
        # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
        messages.info(request, _("Part of this message could not be parsed - it may not display correctly"))
        _log.warning("Failed to render CSS for %s: %s", email["eid"], exc)

    # Mail Pile uses this, give back if you come up with something better
    cleaner = Cleaner(
        allow_tags=HTML_ALLOW_TAGS,
        kill_tags=["style"],  # remove style tags, not attrs
        remove_unknown_tags=False,
        safe_attrs=HTML_SAFE_ATTRS,
        safe_attrs_only=True,
        style=False,  # keep style attrs
    )

    html_tree = cleaner.clean_html(html_tree)

    # filter images if we need to
    if not email["display_images"]:
        for img in html_tree.xpath("//img"):
            try:
                # try to delete src first - we don't want to add a src where there wasn't one already
                del img.attrib["src"]
                # replace image with 1px png
                img.attrib["src"] = staticfiles_storage.url("imgs/placeholder.svg")
                email["has_images"] = True
            except KeyError:
                pass

    for link in html_tree.xpath("//a"):
        try:
            # proxy link
            url = link.attrib["href"]
            link.attrib["href"] = proxy_url(url)
        except KeyError:
            pass

        # open link in tab
        link.attrib["target"] = "_blank"
        # and prevent window.opener bug (noopener is only supported in newer
        # browsers, plus we already set noreferrer in the head)
        link.attrib["rel"] = "noreferrer"

    # finally, export to unicode
    body = unicode_damnit(etree.tostring(html_tree, method="html"), charset)
    return safestring.mark_safe(body)
Example no. 35
class Topic(models.Model):

    author = models.ForeignKey(Member,
                               related_name='topic_created',
                               verbose_name=u"演讲者")
    in_event = models.ForeignKey(Event,
                                 related_name='topic_shown_in',
                                 blank=True,
                                 null=True,
                                 verbose_name=u"已安排在此活动中")
    description = models.TextField(u"简介", max_length=200, blank=False)
    content = models.TextField(u"内容", blank=True)
    html = models.TextField(u'HTML', blank=True, null=True)
    content_type = models.CharField(blank=False, default='html', max_length=30)
    accepted = models.BooleanField(
        default=False)  # whether the topic has been accepted by an admin; only accepted topics (and their in_event) are shown on the official event page

    name = models.CharField("名称", max_length=255, blank=False)
    created = models.DateTimeField(auto_now_add=True,
                                   auto_now=True,
                                   blank=True,
                                   null=True)
    last_modified = models.DateTimeField(auto_now_add=True,
                                         auto_now=True,
                                         blank=True,
                                         null=True)
    last_modified_by = models.ForeignKey(
        Member, related_name='%(class)s_last_modified')
    # aggregated
    total_votes = models.PositiveIntegerField(default=0)
    total_favourites = models.PositiveIntegerField(default=0, editable=False)

    html_cleaner = Cleaner(style=False, embedded=False, safe_attrs_only=False)

    def set_author(self, user):
        author = user.get_profile()
        self.last_modified_by = author  # is last_modified_by always the author?
        self.author = author
        return self

    @property
    def poll_status(self):
        if self.in_event:
            if self.accepted:
                if self.in_event.is_upcoming:
                    return u'网络投票进行中'
                elif self.in_event.is_off:
                    return u'本话题所属活动已经结束'
            else:
                return u'活动等待管理员审核中,审核完毕后即可开始投票'
        else:
            return u'该话题尚未加入任何活动,无法开始投票'

        return u'我们也不知道怎么了'

    @property
    def rendered_content(self):
        if self.content_type == 'restructuredtext':
            '''restructuredtext handling is disabled for now'''
            #return restructuredtext(self.content)

            # build an lxml html cleaner that keeps object/embed but strips js and iframes
            return self.html_cleaner.clean_html(self.content)  # run the cleaner and return safe html
        elif self.content_type == 'html':
            return self.html
        else:
            return restructuredtext(self.content)

    @property
    def is_shown(self):
        '''whether the event this topic belongs to is running or already over'''
        return self.in_event and (self.in_event.is_off
                                  or self.in_event.is_running)

    @property
    def is_arranged(self):
        '''whether this topic has been added to an event that has not started yet'''
        return self.in_event and (self.in_event.is_upcoming == True)

    @property
    def content_text(self):
        try:
            content = self.content.decode('utf-8')
        except UnicodeEncodeError:
            content = self.content

        content_element = html.fromstring(content)

        return content_element.text_content()

    @property
    def summary(self):
        content = self.content_text

        if len(content) > 60:
            return '%s...' % content[:60]
        else:
            return content

    def style_seed(self, range=4):
        '''used to pick a pseudo-random display style'''
        return self.id % range

    def get_absolute_url(self):
        return reverse('topic', args=[self.id])

    def send_notification_mail(self, type):
        '''send a notification mail when a topic is created or updated'''

        type_dict = {
            'created': u'建立',
            'updated': u'更新',
        }
        subject = u"[Open Party] 话题%(type)s:%(name)s" % {
            'type': type_dict[type.lower()],
            'name': self.name
        }

        ctx = {
            'topic': self,
            'action': type_dict[type.lower()],
            'modification_date': str(datetime.now()),
            'site': settings.SITE_URL
        }

        message = render_to_string('core/topic_notification_email.txt', ctx)

        admin_user_set = User.objects.filter(is_staff=True)  # mail users with admin permissions
        # not using mail_admins(); this is more flexible
        mail_queue = []
        for each_admin in admin_user_set:
            email = EmailMessage(subject,
                                 message,
                                 settings.DEFAULT_FROM_EMAIL,
                                 [each_admin.email],
                                 '',
                                 headers={'Reply-To': each_admin.email})
            email.content_subtype = "plain"
            mail_queue.append(email)

        # send all the mails over a single SMTP connection
        connection = mail.get_connection()  # Use default e-mail connection
        connection.send_messages(mail_queue)

        return True

    def __unicode__(self):
        return self.name

    votes = generic.GenericRelation('Vote')

    #TODO Add a custom manager for most web voted & unshown topics, to add to a upcoming event

    def save(self, *args, **kwargs):
        self.total_votes = self.votes.count()
        if not self.content or self.content.strip() == '':
            self.content = self.description
        super(Topic, self).save(*args, **kwargs)

    class Meta:
        app_label = 'core'
Example no. 36
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
#   cleaner.remove_tags = ['b','img','h']
    cleaner.kill_tags = ['img','script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except:
        # error: ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fr
        content = u""
    return content
Example no. 37
def as_clean_html(value):
    try:
        return Cleaner(style=True, scripts=True).clean_html(value.strip())
    except LxmlError:
        return '<p></p>'
Example no. 38
def f_parse(args):
    def isAlphabet(word):

        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)

                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()

                if (len(parsed_text) > MINSIZE_CHARSDOC):
                    parsed_text = parsed_text.lower()

                    tokenizer = RegexpTokenizer(r'\w+')

                    # create English stop words list
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    #p_stemmer = PorterStemmer()

                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if not i in en_stop]
                    stopped_tokens2 = [
                        i for i in stopped_tokens1 if not i in it_stop
                    ]
                    stopped_tokens3 = [
                        i for i in stopped_tokens2 if not i in sp_stop
                    ]
                    stopped_tokens4 = [
                        i for i in stopped_tokens3 if not i in ge_stop
                    ]
                    stopped_tokens5 = [
                        i for i in stopped_tokens4 if not i in fr_stop
                    ]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                #check if the word has the alphabet character
                                if isAlphabet(word):
                                    ret.append(word)
            except:
                print('Exception : Document empty')
    return [loc, ret]
Example no. 39
                        html = requests.get(url).text
                        doc = fromstring(html)
                        tags = [
                            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                            'img', 'area', 'map'
                        ]
                        args = {
                            'meta': False,
                            'safe_attrs_only': False,
                            'page_structure': False,
                            'scripts': True,
                            'style': True,
                            'links': True,
                            'remove_tags': tags
                        }
                        cleaner = Cleaner(**args)
                        path = '/html/body'
                        body = doc.xpath(path)[0]
                        result = cleaner.clean_html(
                            body).text_content().encode('ascii', 'ignore')

                        dict_result[el] += "\n\n " + " ".join(
                            str(result).split(" ")[:count_of_words])
            except:
                print("error at ", el[:100])
                dict_result[el] = ""
        else:
            dict_result[el] = ""

        idx += 1
        count += 1
Example no. 40
single_blank_pat = re.compile(r'\s')
all_digit_pat = re.compile(r'^\d*$')
title_sep_pat = re.compile(r'[-_|-]')  #—
site_name_end_pat = re.compile(r'(网|在线|门户|频道|栏目|站点?|新闻|政府|办公室)$')
escape_pat = re.compile(r'&(nbsp|lt|gt);')
single_punc_pat = re.compile(r'[^ 0-9A-Za-z\u4E00-\u9FFF]')
article_date_pat = re.compile(
    r'(?:^|[^-+\d])((?:19|20)?\d{2})([\./\-_年]?)(1[0-2]|0?[1-9])([\./\-_月]?)([1-2][0-9]|3[0-1]|0?[1-9])'
    +
    r'(?:[^-+:\d](?:\s*((?:1|0?)[0-9]|2[0-3])[:点时]((?:[1-5]|0?)[0-9])(?:[:分]((?:[1-5]|0?)[0-9]))?(?:[^-+:\d]|$))?|$)'
)
blank_date_pat = re.compile(r'(?<!\d)\s|\s(?!\d)')
time_prefix_pat = re.compile(r'时间|日期|时期|发表|发布|提交|上传|于')
html_body_pat = re.compile(r'<\s*/\s*(html|body)\s*>', re.IGNORECASE)

cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True  # This is True because we want to activate the styles & stylesheet filter


def remove_blanks(text):
    text = blank_pat.sub(' ', text)
    return blank_date_pat.sub('', text)


def strip_site_names_from_title(title):
    title = title_sep_pat.split(title, 1)[0].strip()
    parts = title.split()
    while len(parts) > 1:
        if site_name_end_pat.search(parts[-1]):
            parts.pop()
Example no. 41
import logging
import re

from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
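A minimal sketch of how a module-level cleaner configured like this is typically applied to a parsed tree (only the configuration is shown above):

import lxml.html

tree = lxml.html.fromstring('<body><script>s()</script><p>text<!-- note --></p></body>')
cleaned = HTML_CLEANER.clean_html(tree)  # clean_html returns a cleaned copy of the tree
print(lxml.html.tostring(cleaned))
# with this configuration only comments and processing instructions are stripped;
# the <script> survives because the scripts/javascript filters are switched off here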
Example no. 42
    def get_chapters(self, extract_images=True, images_path=None, as_list=False, clean=True, cleaner_params={}):
        """
        Extracts content of all files from epub into a single string and returns it.
        
        Args:
            extract_images (bool): If it should extract images from epub. Defaults to True.
            images_path (str): A path where all images src's should lead to. 
                If not set, uses self.epub_info["images"], which should be set by self.extract_images() if extract_images = False.
                If self.epub_info["images"] is not set, uses "images/".
            as_list (bool): Return chapters as a list or as an HTML-string. Defaults to False.
            clean (bool): If chapters should be cleaned off of malicious HTML.
            cleaner_params (dict): Dictionary of cleaner params, 
                a full list of which is available in the documentation to lxml.html.clean.Cleaner class.
        Returns:
            chapters (str|list): String or list of strings containing the text of the book formatted as html.
            None: if input file is not found.
        Raises:
            KeyError: if a file is not found in the epub archive.
        """
        if not self.ifile: return
            
        #set paths to images in chapters' markup
        epub_images = self.get_epub_info().get("images")
        if images_path:
            images_path =  images_path
        else:
            if epub_images:
                images_path = epub_images
            else:
                images_path = "images/"
                
        #extract images
        if extract_images:
            self.extract_images()

        files = self.__get_files()
        
        #create a cleaner
        cleaner = Cleaner(**cleaner_params) if clean else None
        
        if as_list:
            chapters = []
            for filename in files:
                if ".htm" in filename or ".xml" in filename:
                    original = find_file(self.ifile,filename)
                    try:
                        with self.ifile.open(original) as f:
                            chapter = build_chapter(f, images_path=images_path, cleaner=cleaner)
                            chapter.attrib["id"]=filename
                            chapters.append(html.tostring(chapter).decode('utf-8'))
                    except KeyError as e:
                        handle_error(e)
        else:
            chapters = etree.Element("div")
            for filename in files:
                if ".htm" in filename or ".xml" in filename:
                    original = find_file(self.ifile,filename)
                    try:
                        with self.ifile.open(original) as f:
                            chapter = build_chapter(f, images_path=images_path, cleaner=cleaner)
                            chapter.attrib["id"]=filename
                            chapters.append(chapter)
                    except KeyError as e:
                        handle_error(e)
            chapters = html.tostring(chapters).decode('utf-8')
        return chapters
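A hypothetical call sketch (the Epub class name and file path are assumptions for illustration, not taken from the snippet):
# Hypothetical usage; "Epub" and the file name are assumed for illustration.
book = Epub("my_book.epub")
chapters = book.get_chapters(
    extract_images=False,
    images_path="static/images/",
    as_list=True,
    clean=True,
    # passed straight through to lxml.html.clean.Cleaner
    cleaner_params={"scripts": True, "javascript": True, "style": True},
)
for chapter_html in chapters or []:
    print(len(chapter_html))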
Esempio n. 43
0

RE_BLANK_LINE = re.compile(r'(\n\s*)(\n\s*)+')

lxml_html_parser = lxml.html.HTMLParser(remove_blank_text=True,
                                        remove_comments=True,
                                        collect_ids=False)

lxml_text_html_cleaner = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    meta=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    remove_tags=set(['body']),
    kill_tags=set(['code', 'pre', 'img', 'video', 'noscript']),
)
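As an aside not present in the original file: remove_tags drops only the tags themselves (keeping their text and children), while kill_tags drops the elements together with their content. A quick check with invented markup:
# Invented sample to illustrate remove_tags vs kill_tags above.
sample = "<html><body><p>intro</p><pre>snippet that will vanish</pre></body></html>"
print(lxml_text_html_cleaner.clean_html(sample))
# The <pre> element and its text are gone (kill_tags), while the <body>
# wrapper is unwrapped but its remaining children survive (remove_tags).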


def story_html_to_text(content, clean=True):
    """
    >>> content = '''<html><body>
    ... <pre>hello world</pre>
    ...
    ...
Esempio n. 44
0
def mainPage(board):
    url = "https://www.backpackers.com.tw/forum/forumdisplay.php?f=60&prefixid={0}".format(
        board)
    #url = "https://www.backpackers.com.tw/forum/forumdisplay.php?f=60&prefixid=voyage"
    useragent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    header = {'User-Agent': useragent}
    #proxies = {"http": "http://spys.one/en/","https": "https://free-proxy-list.net/",}

    for i in range(2, 3):
        res = requests.get(url, headers=header)
        cleaner = Cleaner(style=True,
                          scripts=True,
                          comments=True,
                          javascript=True,
                          page_structure=False,
                          safe_attrs_only=False)
        content = cleaner.clean_html(
            res.content.decode('utf-8')).encode('utf-8')
        #print(content)
        bs = BeautifulSoup(content, 'html.parser')
        body = bs.find("tbody", {"id": "threadbits_forum_60"})
        print(type(body))
        trSec = body.find_all("tr")
        print(trSec)
        articleList = []

        for tr in trSec:
            articleDict = {}
            t = tr.find_all("td",
                            {"id": re.compile(r"^td_threadtitle_[0-9]+")})

            tdSec1 = tr.select("td.alt1")
            tag = tdSec1[1].select("div")
            # print(tag)
            tdSec2 = tr.select('td.alt2[title="回覆"]')
            # print(tdSec2)
            for td in tdSec2:
                date = td.select("div.smallfont")
                if len(date) > 0:
                    #print(date[0].text.strip())
                    articleDict["date"] = date[0].text.strip().split("\n")[0]
                timer = td.select("div.smallfont > span.time")
                if len(timer) > 0:
                    #print(time[0].text.strip())
                    articleDict["time"] = timer[0].text.strip()
                user = td.select("div.smallfont > span.byx")
                if len(user) > 0:
                    #print(user[0].text.strip())
                    articleDict["author"] = user[0].text.strip()

                href = td.select("a")
                # print(href)
                for hr in href:
                    if hr["href"] != "#":
                        print("title:", hr.text)
                        if hr.text == "":
                            continue

                        articleDict["title"] = hr.text
                        articleDict[
                            "url"] = "https://www.backpackers.com.tw/forum/{0}".format(
                                hr["href"])
                        time.sleep(random.randint(1, 5))
                        # crawl each article; returns the body text and image links
                        article, imgList = CrawlArticle(
                            articleDict["url"], header)
                        # save the images & return the storage path
                        ipath = getImage(imgList)
                        articleDict["content"] = article
                        articleDict["imgPath"] = ipath

            if len(articleDict.keys()) > 0:
                articleList.append(articleDict)

        print(articleList)
        #for a in artileList:
        #print(a)
        #    InsertMongo(a, "Backpacker")

        time.sleep(random.randint(1, 5))
        url = "https://www.backpackers.com.tw/forum/forumdisplay.php?f=60&prefixid={0}&order=desc&page={1}".format(
            board, i)
Esempio n. 45
0
    'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
    'nav', 'table', 'tr'
])
DOUBLE_NEWLINE_TAGS = frozenset([
    'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
    'p', 'pre', 'title', 'ul'
])

_clean_html = Cleaner(
    scripts=True,
    javascript=False,  # onclick attributes are fine
    comments=True,
    style=True,
    links=True,
    meta=True,
    page_structure=False,  # <title> may be nice to have
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=False,  # keep forms
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
).clean_html


def _cleaned_html_tree(html):
    if isinstance(html, lxml.html.HtmlElement):
        tree = html
    else:
        tree = parse_html(html)
    return _clean_html(tree)
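A small sketch of calling the helper above (the markup is invented); it only illustrates the flag choices, e.g. that <title> and onclick survive because page_structure=False and javascript=False:
# Illustrative call; the HTML string is a made-up example.
tree = _cleaned_html_tree(
    '<html><head><title>Kept title</title><script>dropped()</script></head>'
    '<body onclick="noted()"><p>text</p></body></html>'
)
# <script> is removed (scripts=True), while <title> and the onclick attribute
# remain (page_structure=False, javascript=False, safe_attrs_only=False).
print(lxml.html.tostring(tree, encoding="unicode"))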
Esempio n. 46
0
def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html


def normalize_spaces(s):
    """Replace any sequence of whitespace
    characters with a single space."""
    if not s:
        return ''
    return ' '.join(s.split())


html_cleaner = Cleaner(scripts=True,
                       javascript=True,
                       comments=True,
                       style=True,
                       links=True,
                       meta=False,
                       add_nofollow=False,
                       page_structure=False,
                       processing_instructions=True,
                       embedded=False,
                       frames=False,
                       forms=False,
                       annoying_tags=False,
                       remove_tags=None,
                       remove_unknown_tags=False,
                       safe_attrs_only=False)
Esempio n. 47
0
class UclCopySpider(scrapy.contrib.spiders.CrawlSpider):

    # Track parsed items
    lock = threading.Lock()
    parsed = {}

    # Setup html cleaner to remove js and css
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.page_structure = False

    # Store a counter of files parsed to save a unique filename
    counter = 0
    name = "ucl"

    # Define the allowed domains for crawling
    allowed_domains = [
        "advancedteaching.cs.ucl.ac.uk", "blogs.ucl.ac.uk",
        "busics.cs.ucl.ac.uk", "ccs.chem.ucl.ac.uk", "crest.cs.ucl.ac.uk",
        "crf.casa.ucl.ac.uk", "discovery.ucl.ac.uk", "geometry.cs.ucl.ac.uk",
        "haig.cs.ucl.ac.uk", "iris.ucl.ac.uk", "is.cs.ucl.ac.uk",
        "mediafutures.cs.ucl.ac.uk", "nrg.cs.ucl.ac.uk",
        "onlinestore.ucl.ac.uk", "pplv.cs.ucl.ac.uk", "readinglists.ucl.ac.uk",
        "reality.cs.ucl.ac.uk", "sec.cs.ucl.ac.uk", "vecg.cs.ucl.ac.uk",
        "vis.cs.ucl.ac.uk", "web4.cs.ucl.ac.uk", "www-mice.cs.ucl.ac.uk",
        "www.bartlett.ucl.ac.uk", "www.cege.ucl.ac.uk", "www.chem.ucl.ac.uk",
        "www.cs.ucl.ac.uk", "www.csml.ucl.ac.uk", "www.ee.ucl.ac.uk",
        "www.engineering.ucl.ac.uk", "www.gatsby.ucl.ac.uk",
        "www.geog.ucl.ac.uk", "www.grad.ucl.ac.uk", "www.homepages.ucl.ac.uk",
        "www.icn.ucl.ac.uk", "www.igp.ucl.ac.uk", "www.laws.ucl.ac.uk",
        "www.london-in-motion.ucl.ac.uk", "www.mailinglists.ucl.ac.uk",
        "www.mecheng.ucl.ac.uk", "www.meng.ucl.ac.uk", "www.phon.ucl.ac.uk",
        "www.silva-sandbox.ucl.ac.uk", "www.star.ucl.ac.uk", "www.ucl.ac.uk",
        "www0.cs.ucl.ac.uk", "zuserver2.star.ucl.ac.uk"
    ]

    # Define the starting pages to crawl
    start_urls = [
        "http://advancedteaching.cs.ucl.ac.uk", "http://blogs.ucl.ac.uk",
        "http://busics.cs.ucl.ac.uk", "http://ccs.chem.ucl.ac.uk",
        "http://crest.cs.ucl.ac.uk", "http://crf.casa.ucl.ac.uk",
        "http://discovery.ucl.ac.uk", "http://geometry.cs.ucl.ac.uk",
        "http://haig.cs.ucl.ac.uk", "http://iris.ucl.ac.uk",
        "http://is.cs.ucl.ac.uk", "http://mediafutures.cs.ucl.ac.uk",
        "http://nrg.cs.ucl.ac.uk", "http://onlinestore.ucl.ac.uk",
        "http://pplv.cs.ucl.ac.uk", "http://readinglists.ucl.ac.uk",
        "http://reality.cs.ucl.ac.uk", "http://sec.cs.ucl.ac.uk",
        "http://vecg.cs.ucl.ac.uk", "http://vis.cs.ucl.ac.uk",
        "http://web4.cs.ucl.ac.uk", "http://www-mice.cs.ucl.ac.uk",
        "http://www.bartlett.ucl.ac.uk", "http://www.cege.ucl.ac.uk",
        "http://www.chem.ucl.ac.uk", "http://www.cs.ucl.ac.uk",
        "http://www.csml.ucl.ac.uk", "http://www.ee.ucl.ac.uk",
        "http://www.engineering.ucl.ac.uk", "http://www.gatsby.ucl.ac.uk",
        "http://www.geog.ucl.ac.uk", "http://www.grad.ucl.ac.uk",
        "http://www.homepages.ucl.ac.uk", "http://www.icn.ucl.ac.uk",
        "http://www.igp.ucl.ac.uk", "http://www.laws.ucl.ac.uk",
        "http://www.london-in-motion.ucl.ac.uk",
        "http://www.mailinglists.ucl.ac.uk", "http://www.mecheng.ucl.ac.uk",
        "http://www.meng.ucl.ac.uk", "http://www.phon.ucl.ac.uk",
        "http://www.silva-sandbox.ucl.ac.uk", "http://www.star.ucl.ac.uk",
        "http://www.ucl.ac.uk", "http://www0.cs.ucl.ac.uk",
        "http://zuserver2.star.ucl.ac.uk"
    ]

    # Define additional rules to limit crawlable_domains within allowed domains
    crawlable_domains = [
        "http://advancedteaching.cs.ucl.ac.uk/.*", "http://blogs.ucl.ac.uk/.*",
        "http://busics.cs.ucl.ac.uk/.*", "http://ccs.chem.ucl.ac.uk/.*",
        "http://crest.cs.ucl.ac.uk/.*", "http://crf.casa.ucl.ac.uk/.*",
        "http://discovery.ucl.ac.uk/.*", "http://geometry.cs.ucl.ac.uk/.*",
        "http://haig.cs.ucl.ac.uk/.*", "http://iris.ucl.ac.uk/.*",
        "http://is.cs.ucl.ac.uk/.*", "http://mediafutures.cs.ucl.ac.uk/.*",
        "http://nrg.cs.ucl.ac.uk/.*", "http://onlinestore.ucl.ac.uk/.*",
        "http://pplv.cs.ucl.ac.uk/.*", "http://readinglists.ucl.ac.uk/.*",
        "http://reality.cs.ucl.ac.uk/.*", "http://sec.cs.ucl.ac.uk/.*",
        "http://vecg.cs.ucl.ac.uk/.*", "http://vis.cs.ucl.ac.uk/.*",
        "http://web4.cs.ucl.ac.uk/.*", "http://www-mice.cs.ucl.ac.uk/.*",
        "http://www.bartlett.ucl.ac.uk/.*", "http://www.cege.ucl.ac.uk/.*",
        "http://www.chem.ucl.ac.uk/.*", "http://www.cs.ucl.ac.uk/.*",
        "http://www.csml.ucl.ac.uk/.*", "http://www.ee.ucl.ac.uk/.*",
        "http://www.engineering.ucl.ac.uk/.*",
        "http://www.gatsby.ucl.ac.uk/.*", "http://www.geog.ucl.ac.uk/.*",
        "http://www.grad.ucl.ac.uk/.*", "http://www.homepages.ucl.ac.uk/.*",
        "http://www.icn.ucl.ac.uk/.*", "http://www.igp.ucl.ac.uk/.*",
        "http://www.laws.ucl.ac.uk/.*",
        "http://www.london-in-motion.ucl.ac.uk/.*",
        "http://www.mailinglists.ucl.ac.uk/.*",
        "http://www.mecheng.ucl.ac.uk/.*", "http://www.meng.ucl.ac.uk/.*",
        "http://www.phon.ucl.ac.uk/.*",
        "http://www.silva-sandbox.ucl.ac.uk/.*",
        "http://www.star.ucl.ac.uk/.*", "http://www.ucl.ac.uk/.*",
        "http://www0.cs.ucl.ac.uk/.*", "http://zuserver2.star.ucl.ac.uk/.*"
    ]

    rules = (Rule(LinkExtractor(allow=crawlable_domains),
                  callback='parse'), )

    # The method called on a document retrieval
    def parse(self, response):

        # Ignore non html responses
        if not isinstance(response, HtmlResponse):
            return

        # Clean the HTML response of javascript and style
        clean_html = self.cleaner.clean_html(response.body)
        soup = BeautifulSoup(clean_html, "lxml")

        # Use a lock whilst tracking document numbers and urls crawled
        self.lock.acquire()
        try:
            with open('sitescrawled.txt', "a") as myfile:
                myfile.write(response.url + "\n")
            with open('sites/url' + str(self.counter), 'wb') as f:
                # Output BeautifulSoup formatted html, with additional <url> header tag
                f.write("<url>" + response.url + "</url>\n" +
                        soup.prettify("utf-8"))
            self.counter += 1
        finally:
            self.lock.release()

        for href in response.css("a::attr('href')"):
            # Ignore php items and hyperlink tags in the url header
            url = response.urljoin(href.extract())
            url = url.split('?')[0].split('#')[0]
            yield scrapy.Request(url)
Esempio n. 48
0
  def get_message_tree(self):
    tree = {
      'id': self.get_msg_info(self.index.MSG_ID),
      'tags': self.get_msg_info(self.index.MSG_TAGS).split(','),
      'summary': self.get_msg_summary(),
      'headers': {},
      'headers_lc': {},
      'attributes': {},
      'text_parts': [],
      'html_parts': [],
      'attachments': [],
      'conversation': [],
    }

    conv_id = self.get_msg_info(self.index.MSG_CONV_MID)
    if conv_id:
      conv = Email(self.index, int(conv_id, 36))
      tree['conversation'] = convs = [conv.get_msg_summary()]
      for rid in conv.get_msg_info(self.index.MSG_REPLIES).split(','):
        if rid:
          convs.append(Email(self.index, int(rid, 36)).get_msg_summary())

    # FIXME: Decide if this is strict enough or too strict...?
    html_cleaner = Cleaner(page_structure=True, meta=True, links=True,
                           javascript=True, scripts=True, frames=True,
                           embedded=True, safe_attrs_only=True)

    msg = self.get_msg()
    for hdr in msg.keys():
      tree['headers'][hdr] = self.index.hdr(msg, hdr)
      tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr)

    # Note: count algorithm must match that used in extract_attachment above
    count = 0
    for part in msg.walk():
      mimetype = part.get_content_type()
      if mimetype.startswith('multipart/'):
        continue

      count += 1
      if (part.get('content-disposition', 'inline') == 'inline'
      and mimetype in ('text/plain', 'text/html')):
        payload, charset, openpgp = self.decode_payload(part)
        if (mimetype == 'text/html' or
            '<html>' in payload or
            '</body>' in payload):
          print "Adding stuff to pgp tree!"
          tree['html_parts'].append({
            'openpgp_status': openpgp and openpgp[0] or '',
            'openpgp_data': openpgp and openpgp[1] or '',
            'charset': charset,
            'type': 'html',
            'data': (payload.strip() and html_cleaner.clean_html(payload)) or ''
          })
          tree['text_parts'][0]["openpgp_status"] = openpgp and openpgp[0] or ''
          tree['text_parts'][0]["openpgp_data"] = openpgp and openpgp[1] or ''
        else:
          tree['text_parts'].extend(self.parse_text_part(payload, charset,
                                                         openpgp))
      else:
        tree['attachments'].append({
          'mimetype': mimetype,
          'count': count,
          'part': part,
          'length': len(part.get_payload(None, True) or ''),
          'content-id': part.get('content-id', ''),
          'filename': part.get_filename() or ''
        })

    if self.is_editable():
      tree['is_editable'] = True
      tree['editing_strings'] = self.get_editing_strings(tree)

    return tree
Esempio n. 49
0
    f = ArchiveIterator(options.input.buffer)
else:
    f = ArchiveIterator(open(options.input, 'rb'))

if options.output == sys.stdout:
    fo = WARCWriter(options.output.buffer, gzip=True)
else:
    fo = WARCWriter(open(options.output, 'wb'), gzip=True)

if options.pdfpass is not None:
    po = WARCWriter(open(options.pdfpass, 'wb'), gzip=True)

if not options.pdfpass and options.pdfextract:
    extractor = ExtrP()

cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

if options.output == sys.stdout:
    filename = options.input
else:
    filename = options.output

fo.write_record(fo.create_warcinfo_record(filename=filename, info={'software': 'bitextor/bitextor-warc2htmlwarc.py', 'format': 'WARC File Format 1.0'}))

for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    if record.rec_headers.get_header('WARC-Target-URI')[0] == '<' and record.rec_headers.get_header('WARC-Target-URI')[-1] == '>':
        url = record.rec_headers.get_header('WARC-Target-URI')[1:-1]
    else:
Esempio n. 50
0
def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
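A hedged usage sketch (MINSIZE_CHARSDOC comes from elsewhere in the original file; here it is only assumed to be a small character threshold):
# Hypothetical call; assumes MINSIZE_CHARSDOC is defined at module level.
page = ("<html><body><script>track()</script>"
        "<p>Some Document Text Long Enough To Keep</p></body></html>")
text = html2text(page)
if text is not None:
    print(text)  # lower-cased plain text, scripts/styles/comments stripped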
Esempio n. 51
0
class HTMLSupport(object):
    """Provides helpers for HTML file context extraction."""
    # this is from lxml/apihelpers.pxi
    RE_XML_ENCODING = re.compile(
        ur'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)',
        re.U)  # noqa

    cleaner = Cleaner(
        page_structure=True,
        scripts=True,
        javascript=True,
        style=True,
        links=True,
        embedded=True,
        forms=True,
        frames=True,
        meta=True,
        # remove_tags=['a'],
        kill_tags=['head'])

    def get_meta(self, doc, field):
        for field_attr in ('property', 'name'):
            for el in doc.findall('.//meta[@%s="%s"]' % (field_attr, field)):
                content = collapse_spaces(el.get('content'))
                if content is not None and len(content):
                    return content

    def extract_html_header(self, doc):
        """Get metadata from the HTML head element."""
        self.update('summary', self.get_meta(doc, 'og:title'))
        self.update('title', doc.findtext('.//title'))
        self.update('summary', self.get_meta(doc, 'og:description'))
        self.update('summary', self.get_meta(doc, 'description'))
        self.update('author', self.get_meta(doc, 'author'))
        self.update('author', self.get_meta(doc, 'og:site_name'))
        self.update('published_at',
                    self.get_meta(doc, 'article:published_time'))  # noqa
        self.update('modified_at', self.get_meta(doc, 'article:modified_time'))

        for field in ['keywords', 'news_keywords']:
            content = self.get_meta(doc, field)
            if content is not None:
                content = [collapse_spaces(c) for c in content.split(',')]
                content = [c for c in content if len(c)]
                self.result.keywords.extend(content)

    def extract_html_text(self, doc):
        """Get all text from a DOM, also used by the XML parser."""
        text = ' '.join(self.extract_html_elements(doc))
        text = collapse_spaces(text)
        if len(text):
            return text

    def extract_html_elements(self, el):
        yield el.text or ' '
        for child in el:
            for text in self.extract_html_elements(child):
                yield text
        yield el.tail or ' '

    def extract_html_content(self, html_body, fix_html=True):
        """Ingestor implementation."""
        if html_body is None:
            return
        try:
            try:
                doc = html.fromstring(html_body)
            except ValueError:
                # Work around encoding declarations.
                # https://stackoverflow.com/questions/3402520
                html_body = self.RE_XML_ENCODING.sub('', html_body, count=1)
                doc = html.fromstring(html_body)
        except (ParserError, ParseError, ValueError):
            raise ProcessingException("HTML could not be parsed.")

        self.extract_html_header(doc)
        self.cleaner(doc)
        text = self.extract_html_text(doc)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text)
Esempio n. 52
0
from pyquery import PyQuery as pq
from lxml import html

import re
from lxml.html.clean import Cleaner
import logging
from urllib.parse import urlsplit, parse_qs


cleaner = Cleaner(javascript=True, scripts=True, style=True)
SinglePost = re.compile(r"http:\/\/www\.news\.uestc\.edu\.cn\/\?n\=UestcNews\.Front\.Document\.ArticlePage\&Id\=(\d+)")
Column = re.compile(r"http:\/\/www\.news\.uestc\.edu\.cn\/\?n\=UestcNews.Front.Category.Page&CatId=42")


logger = logging.getLogger("parser")


def makeParser(content, encoding="utf8"):
    content = content.decode(encoding, "ignore") if isinstance(content, bytes) else str(content)
    return pq(content)


def tostring(node):
    """
    Convert to the html string, and clean the html
    """
    return cleaner.clean_html(html.tostring(node, method="html", encoding="utf8").decode()).strip()


def convertUrl(url, strict=False):
    logger.debug(url)
Esempio n. 53
0
    require(request.authz.can(collection['id'], action))
    return collection


def get_url_path(url):
    try:
        return url_parse(url).replace(netloc='', scheme='').to_url() or '/'
    except Exception:
        return '/'


CLEANER = Cleaner(style=True,
                  meta=True,
                  links=False,
                  remove_tags=['body', 'form'],
                  kill_tags=[
                      'area', 'audio', 'base', 'bgsound', 'embed', 'frame',
                      'frameset', 'head', 'img', 'iframe', 'input', 'link',
                      'map', 'meta', 'nav', 'object', 'plaintext', 'track',
                      'video'
                  ])


def sanitize_html(html_text, base_url, encoding=None):
    """Remove anything from the given HTML that must not show up in the UI."""
    if html_text is None or not len(html_text.strip()):
        return
    try:
        cleaned = CLEANER.clean_html(html_text)
        encoding = encoding or 'utf-8'
        parser = html.HTMLParser(encoding=encoding)
        data = cleaned.encode(encoding, 'replace')
Esempio n. 54
0
plainTextFile = lzma.open(options.outDir + "/" + options.prefix + "plain_text.xz", "w")
# Boilerpipe cleaning is optional
if options.boilerpipe:
    deboilFile = lzma.open(options.outDir + "/" + options.prefix + "deboilerplate_html.xz", "w")

for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url

    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

        tree=""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
            tree = etree.tostring(document)
        except:
            continue

        cleantree = tree.decode("utf8").replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        lang = guess_lang_from_data2(cleantree)
Esempio n. 55
0
    telefono = "".join(links[1].text_content().split())
    fax = "".join(links[2].text_content().split())

    if len(links[3].cssselect("a")[0].attrib['href']) > len('http://'):

        web = links[3].cssselect("a")[0].attrib['href']

    else:

        web = ""

    return direccion, telefono, fax, web


cleaner = Cleaner()
cleaner.kill_tags = ['strong']

for i in range(1, 45):
    base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag=' + str(
        i)

    html = scraperwiki.scrape(base_url)
    root = lxml.html.fromstring(html)
    links = root.cssselect("ul#listado-productos li")

    for link in links:

        record = {}

        name = link.cssselect("a")[0].text_content()
Esempio n. 56
0
from lxml.html.clean import Cleaner

from urlparse import urlparse, parse_qs
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from .functions import parent_tag, block_length, number_pattern, url_edit_distance
from .preprocess import Tagset

_cleaner = Cleaner(style=True,
                   scripts=True,
                   embedded=True,
                   links=True,
                   page_structure=False,
                   remove_unknown_tags=False,
                   meta=False,
                   safe_attrs_only=False)


def tokenize(text):
    return text.split()


def get_text(anchor):
    return anchor.text


def get_attr_text(anchor):
    return anchor.get('class', '') + anchor.get('id', '')
Esempio n. 57
0
from lxml.html.clean import Cleaner
import lxml
from lxml import etree
from lxml.html import HtmlElement
from lxml.html import tostring as lxml_html_tostring
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode

DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True

useragent = None

# Typing.
_Find = Union[List['Element'], 'Element']
_XPath = Union[List[str], List['Element'], str, 'Element']
_Result = Union[List['Result'], 'Result']
_HTML = Union[str, bytes]
_BaseHTML = str
_UserAgent = str
_DefaultEncoding = str
_URL = str
_RawHTML = bytes
Esempio n. 58
0
def clean_html(text):
    cleaner = Cleaner(style=False)
    return cleaner.clean_html(text)
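Worth noting (based on lxml's documented defaults rather than anything stated in the snippet): style=False simply keeps <style> blocks, while scripts, javascript, comments and so on are still stripped by default, so this wrapper behaves much like a plain Cleaner(). A sketch with invented markup:
# Invented sample; style=False keeps the <style> block, scripts still go.
print(clean_html('<div><style>p{color:red}</style><script>x()</script><p>hi</p></div>'))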
Esempio n. 59
0
        #print(content.decode('utf-8'))
        data['text'] = content.decode('utf-8')
    except Exception as e:
        raise e
    return data


if __name__ == '__main__':
    url = 'http://www.lajyzx.cn/Bulletin/BulletinBrowse.aspx?id=9888'
    HEADERS = {#'Accept':"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'User-Agent': 'Mozilla/5.0 (Linux; U; Android 7.1.1; zh-cn; MI 6 Build/NMF26X) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',

    }

    cleaner = Cleaner(page_structure=False,
                      links=False,
                      style=True,
                      scripts=True)
    response = requests.get(url, headers=HEADERS)
    response.encoding = 'utf-8'
    time.sleep(2)
    #print(response.text)
    body = lxml.html.fromstring(response.text)
    #
    #element = element.replace('DownLoad(','').replace(')','')
    # decide which branch applies
    # Hangzhou
    #elements = re.findall('onclick="DownLoad\((.*?)\)"',response.text)
    #del_ele = tree.xpath('//div[@class="MainTitle"]')
    #for ele in del_ele:
    #    ele.clear()
    #node_elems = tree.xpath('//div/ul/li/a[@href="javascript:void(0);"]')
Esempio n. 60
0
        return [item[0].text for item in root]

    @staticmethod
    def _text_cleanup(text):
        global cleaner
        raw_html = cleaner.clean_html(text)
        # TODO: remove links
        # TODO: remove code
        # TODO: remove Latin
        # TODO: remove garbage
        # TODO: replace Latin omographs with Cyrillic
        # TODO: unify qmarks
        if len(raw_html) > 0:
            unwrap_regex = re.compile(r'\A<div>(.*?)</div>\Z',
                                      flags=re.MULTILINE | re.DOTALL)
            cut_html = unwrap_regex.match(raw_html).group(1)
            return cut_html
        else:
            return None

    @staticmethod
    def texts_cleanup(texts):
        return list(
            filter(lambda s: len(s) > 0, map(ScrapyImport._text_cleanup,
                                             texts)))


cleaner = Cleaner(allow_tags=[''],
                  remove_unknown_tags=None,
                  kill_tags=['code', 'blockquote', 's', 'strike'])
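A closing sketch (the input string is invented) of what this allow_tags=[''] configuration does inside _text_cleanup: every tag is stripped while its text is kept, the result comes back wrapped in a single <div>, and the unwrap_regex above then removes that wrapper:
# Illustration only; the sample string is made up.
sample = "<p>Hello <b>world</b>, <code>ignored()</code> and more</p>"
print(ScrapyImport._text_cleanup(sample))
# Roughly "Hello world,  and more": the <code> content is killed outright,
# the other tags are dropped but their text is preserved.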