Example no. 1
def sanitize_payload(payload):
    "Sanitize HTML"
    if not payload:
        return '', ''
    styles = []
    payload = clean_payload(payload)
    body_style, body_class = get_body_style(payload)
    if body_style:
        styles.append(body_style)
    safe_attrs = set(defs.safe_attrs)
    safe_attrs.add('style')
    cleaner = Cleaner(remove_tags=UNCLEANTAGS,
                    safe_attrs_only=True,
                    safe_attrs=safe_attrs)
    payload = HTMLTITLE_RE.sub('', payload)
    try:
        html = cleaner.clean_html(payload)
    except ValueError:
        payload = bytes(bytearray(payload, encoding='utf-8'))
        html = cleaner.clean_html(payload)
    except XMLSyntaxError:
        html = ''
    mainstyle = sanitize_css(get_style(html))
    if mainstyle:
        styles.append(decode(mainstyle))
    style = u'\n'.join(styles)
    html = clean_styles(CSS_COMMENT_RE.sub('', html))
    html = set_body_class(html, body_class)
    return html.strip(), style.strip()
Example no. 2
    def _get_breakdowns(self):
        """
        Returns breakdowns from GWDG within the given time window.
        """
        # fetch the feed with requests first, since it is not loaded with lxml directly
        r = requests.get(URL)
        
        # parse the downloaded feed with lxml
        root = lxml.etree.fromstring(r.text.encode("utf-8"))
        
        #get items
        items = []
        for x in root.findall("channel/item"):
            pubdate = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(
                    email.utils.parsedate_tz(
                        x.find("pubDate").text[:-6]
                    )
                )
            )
            if pubdate >= OLDEST_NEWS:
                cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
                title = cleaner.clean_html(x.find("title").text)[5:-6]
                content = cleaner.clean_html(x.find("description").text)[5:-6] 
                item = {
                    "title"   : title,
                    "pubdate" : str(pubdate),
                    "content" : content,
                }
                items.append(item)

        return sorted(items, key=lambda x: x["pubdate"], reverse=True)
Example no. 3
def merge_docx(docx_list=None, out_htmlpath=None):
    """
    docx_list is a list of strings which contains the (absolute) path of DOC/DOCX files to be merged.
    MERGE_DOCX() will follow the index order of docx_list for appending.
    Returns the HTML file as string. 
    If OUT_HTMLPATH is given, write the HTML file out as well.
    """
    if docx_list is None:
        return None
    
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    html_list = []
    for path in docx_list:
        try:
            tmp_html =  PyDocX.to_html(path)
            html_list.append(cleaner.clean_html(lxml.html.fromstring(tmp_html, parser=parser)))
        except:
            #'MalformedDocxException'
            try:
                # Pretend it is a html
                html_file = '{}.html'.format(path)
                with open(html_file, 'rb') as tmp:
                    tmp_html = tmp.read()
                tmp_html = tmp_html.decode('utf-8')
                html_list.append(cleaner.clean_html(lxml.html.fromstring(tmp_html, parser=parser)))
            except:
                # Cannot convert
                continue
    
    #print html_list
    if len(html_list)>1:
        #Append element at the end of first body
        main_body = html_list[0].xpath('./body')[0]
        for tree in html_list[1:]:
            elem_list = tree.xpath('./body/*')
            for elem in elem_list:
                main_body.append(elem)
    elif len(html_list)==1:
        main_body = html_list[0].xpath('./body')[0]
    else:
        try:
            main_body = html_list[0].xpath('./body')[0]
        except IndexError:
            # no body content. Most likely just an image/appendix
            return None
    
    # Convert ElementTree back to string
    # in this way we will lose the 'style' info in html_list[0][0], which is usually in header,
    # but not sure if it will cause any differences to parser later on. Probably not.
    html_str = lxml.etree.tostring(main_body)
    
    if out_htmlpath is not None:
        with open(out_htmlpath, 'wb') as tmp:
            tmp.write(html_str.encode('utf-8'))
                
    return html_str
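A minimal usage sketch for merge_docx (the .docx paths here are made up; PyDocX, lxml.html and the Cleaner/HTMLParser imports are assumed to be in scope as in the snippet above):

# Hypothetical call: merge two Word documents and also write the combined HTML to disk.
merged_html = merge_docx(
    docx_list=['/tmp/chapter1.docx', '/tmp/chapter2.docx'],
    out_htmlpath='/tmp/merged.html')
if merged_html is not None:
    print(merged_html[:200])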
        
Example no. 4
 def readable(self, html, url=None):
     self.url = url
     html = self.smart_decode(html)
     cleaner = Cleaner(page_structure=False, add_nofollow=True, style=True, safe_attrs_only=True)
     html = cleaner.clean_html(html)
     tree = lxml.html.fromstring(html)
     body = tree.xpath("//body")[0]
     article = self.grab_article(body)
     return cleaner.clean_html(article)
Example no. 5
    def validate(self, data):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.frames = True
        cleaner.remove_tags = ["p", "div", "a"]
        data["username"] = (lxml.html.document_fromstring(cleaner.clean_html(data["username"]))).text_content()
        data["storename"] = (lxml.html.document_fromstring(cleaner.clean_html(data["storename"]))).text_content()
        data["email"] = (lxml.html.document_fromstring(cleaner.clean_html(data["email"]))).text_content()

        # 		data['username']=  cleaner.clean_html(data['username'])
        #               data['storename']= cleaner.clean_html(data['storename'])
        #              data['email']= cleaner.clean_html(data['email'])

        return data
Example no. 6
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javascript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
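A small usage sketch for create_plaintext_message; the HTML body below is made up, and the function expects an encoded byte string plus the Python 2 cStringIO/formatter imports used above:

# Hypothetical call: links in the HTML come back as numbered endnotes.
html_body = b'<html><body><p>Hello, see <a href="https://example.com/docs">the docs</a>.</p></body></html>'
print(create_plaintext_message(html_body))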
Example no. 7
def sanitize_html(html, bad_tags=['body']):
    """Removes identified malicious HTML content from the given string."""
    if html is None or html == '':
        return html
    cleaner = Cleaner(style=False, page_structure=True, remove_tags=bad_tags,
                      safe_attrs_only=False)
    return cleaner.clean_html(html)
Example no. 8
def sanitize(html):
    if not html:
        return html
    cleaner = Cleaner(allow_tags=_safe_tags, safe_attrs_only=True, safe_attrs=_safe_attrs, remove_unknown_tags=False)
    html = autolink_html(cleaner.clean_html(html))

    parts = re.split('(<.*?>)', html)

    output = ''
    in_a_tag = False
    for part in parts:
        if not len(part):
            continue

        is_tag = part[0] == '<'
        if is_tag or in_a_tag:
            output += part
            if part[0:2].lower() == '<a':
                in_a_tag = True
            elif part[0:3].lower() == '</a':
                in_a_tag = False
            continue

        part = re.sub("([a-zA-Z0-9_\\-+\\.\']*[a-zA-Z0-9]@[0-9a-zA-Z\\-\\.]+\\.[a-zA-Z]{2,})", '<a href="mailto:\\1">\\1</a>', part)

        # After linking up emails, only look for twitter in the remaining parts
        sub_parts = re.split('(<.*?>)', part)
        part = ''
        for sub_part in sub_parts:
            part += re.sub("(?<![a-zA-Z0-9])@([0-9a-zA-Z_]{1,15})", '<a href="https://twitter.com/\\1">@\\1</a>', sub_part)

        output += part

    return output
Example no. 9
def get_intro_text(text):
    """ Returns only the first <p> tag and preceding nodes
    """

    #cut the text to the first paragraph
    index = text.lower().find('</p>', 1000)
    if index != -1:
        text = text[:index] +'</p>'

    cleaner = Cleaner(
            scripts=False,
            javascript=False,
            comments=False,
            style=False,
            links=False,
            meta=False,
            page_structure=False,
            processing_instructions=False,
            embedded=False,
            forms=False,
            remove_unknown_tags=True,
            )
    text = cleaner.clean_html(text)

    return text
Example no. 10
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms= True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K 
    cleaner.allow_tag = POSITIVE_K
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        html = lxml.html.fromstring(html, base_url=url)
    
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ print title
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces,"",self.doc)
    return doc
Example no. 11
def strip_comments__lxml(html_string=""):
    if not html_string: return html_string
    
    params = {
        'comments': True,
        'scripts': False,
        'javascript': False,
        'style': False,
        'links': False,
        'meta': False,
        'page_structure': False,
        'processing_instructions': False,
        'embedded': False,
        'frames': False,
        'forms': False,
        'annoying_tags': False,
        'remove_tags': None,
        'allow_tags': None,
        'remove_unknown_tags': True,
        'safe_attrs_only': False,
    }
    try:
        cleaner = Cleaner(**params)
        html = lxml.html.fromstring(html_string)
        clean_html = cleaner.clean_html(html)

        return lxml.etree.tostring(clean_html)
    except (XMLSyntaxError, ParserError):
        return html_string
Example no. 12
    def __init__(self, input):
        self.title = input.get('post_title')
        self.content = input.get('post_content')
        self.category = input.get('post_category')
        self.is_public = input.get('post_is_public')

        if self.is_public:
            self.is_public = True
        else:
            self.is_public = False

        if self.category not in config.get('post_categories'):
            raise exceptions.CantValidateForm

        if self.title:
            # strip markup
            html_string = lxml.html.fromstring(self.title)
            self.title = unicode(html_string.text_content())
        else:
            self.title = ''

        if self.content:
            # clean markup
            cleaner = Cleaner(**post_rules)
            self.content = cleaner.clean_html(self.content)
            # replace newlines
            self.content = self.content.replace('\r\n', '<br />')
        else:
            raise exceptions.CantValidateForm
Example no. 13
def truncate(content, max_length=DEFAULT_TRUNCATE_LENGTH, allowed_tags=ALLOWED_TAGS, full_link=None):
    """ truncate a body of text to the expected 'max_length' and strip
        the body of text of all html tags that are not in 'allowed tags'. You
        can also specify a 'strip' value (True -> strip html tags, False ->
        escape html tags and leave them in text)
    """
    if not content:
        return ''

    cleaner = Cleaner(
        page_structure=False,
        links=True,
        safe_attrs_only=True,
        remove_unknown_tags=False,
        allow_tags=allowed_tags
    )

    content = defaultfilters.truncatechars_html(cleaner.clean_html(content), max_length)
    if full_link:
        try:
            insert_point = content.rindex('</p>')
        except ValueError:
            insert_point = content.rindex('<')

        ending = content[insert_point:]
        content = content[:insert_point]

        content += '&nbsp;<a href="' + full_link + '">(Read More)</a>' + ending
    return content
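A hedged usage sketch; ALLOWED_TAGS, DEFAULT_TRUNCATE_LENGTH and Django's defaultfilters come from the surrounding module, and long_article_html is a hypothetical variable holding the post body:

# Hypothetical call: keep only the allowed markup, cut at 140 characters, and
# splice a "(Read More)" link in front of the final closing tag.
teaser = truncate(long_article_html, max_length=140,
                  full_link='https://example.com/articles/42/')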
Example no. 14
def clean_html(html, safe_attrs=('src', 'href'),
               input_encoding='unicode',
               output_encoding='unicode',
               **kwargs):
    """
    Fix HTML structure and remove non-allowed attributes from all tags.
    """

    from lxml.html.clean import Cleaner

    # Convert HTML to Unicode
    html = render_html(parse_html(html, encoding=input_encoding), make_unicode=True)

    # Strip some shit with default lxml tools
    cleaner = Cleaner(page_structure=True, **kwargs)
    html = cleaner.clean_html(html)

    # Keep only allowed attributes
    tree = parse_html(html)
    for elem in tree.xpath('./descendant-or-self::*'):
        for key in elem.attrib.keys():
            if safe_attrs:
                if key not in safe_attrs:
                    del elem.attrib[key]

    return render_html(tree, encoding=output_encoding)
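A usage sketch, assuming parse_html and render_html behave as in the module this helper was taken from; the markup is only an illustration:

# Hypothetical call: normalise the structure and keep only src/href on every tag.
cleaned = clean_html(
    '<p class="intro"><img src="a.png" width="10"> <a href="/y" onclick="z()">y</a></p>')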
Example no. 15
    def parse(self, response):
        item = JournalItem()

        base_url = "http://journals.ametsoc.org"

        journalTitle = response.xpath('//*[@id="journalBlurbPanel"]/div[2]/h3/text()').extract_first()
        item['title'] = journalTitle

        journalIssue = response.xpath('//*[@id="articleToolsHeading"]/text()').extract_first().strip()  # remove whitespace at start and end
        item['issue'] = journalIssue

        # set up an html cleaner to strip html tags from the string (journal titles often use sub/superscript, which splits the article title)
        html_cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)

        journalDescription = response.xpath('//*[@id="journalBlurbPanel"]/div[4]').extract()
        journalDescription = "".join(journalDescription)
        journalDescription = html_cleaner.clean_html(journalDescription)[5:-6]  # remove any html tags and then trim the <div> tags that the cleaner inserts
        journalDescription = removeNewlines(journalDescription)  # remove any \n\r\t characters
        journalDescription = journalDescription.strip()
        item['description'] = journalDescription

        coverImage = response.xpath('//*[@id="smallIssueCover"]/img/@src').extract_first().strip()
        print(coverImage)
        item['coverURL'] = base_url + coverImage

        yield item
Example no. 16
 def clean_article_html(cls, node):
     article_cleaner = Cleaner()
     article_cleaner.javascript = True
     article_cleaner.style = True
     article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em']
     article_cleaner.remove_unknown_tags = False
     return article_cleaner.clean_html(node)
Example no. 17
    def parse(self, content):
        """Clean and parse HTML content."""

        cleaner = Cleaner(style=True, links=False, page_structure=False, meta=True,
            safe_attrs_only=False, remove_unknown_tags=False)
        clean_content = cleaner.clean_html(content)

        html = etree.iterparse(StringIO(clean_content), events=("start", "end"))
        level = -1
        css = ''

        # We do not want to style these elements.
        ignore_tags = ['html', 'body', 'head', 'meta', 'title', 'script']

        if self.options.delimiter == 'spaces':
            delimiter = '  '
        else:
            delimiter = '\t'

        for action, elem in html:
            if (action == 'start'):
                identifier = self.identify_ruleset(elem)

                if elem.tag not in ignore_tags:
                    level += 1
                    css += delimiter * level + identifier + ' {\n'
                    if not self.options.clean_mode:
                        css += delimiter + delimiter * level + '/* enter your CSS here... */\n'
            else:
                if elem.tag not in ignore_tags:
                    css += delimiter * level + '}\n'
                    level -= 1

        return css.strip()
Example no. 18
 def get_content(self, site):
     sel = None
     if site.id_type == "css":  # translates csspath into xpath
         s = CSSSelector(site.identifier)
         sel = s.path
     else:
         sel = site.identifier
     try:
         page = requests.get(site.url)
         parser = le.HTMLParser()
         tree = le.parse(StringIO(page.text), parser)
         xp = tree.xpath(sel)
         if len(xp) < 1:
             return None
         html = lxml.html.tostring(xp[0])
         cleaner = Cleaner(style=True, links=False,
                           page_structure=False, embedded=False,
                           frames=False, forms=False)
         cleaned_html = cleaner.clean_html(html)
         self._print("Cleaning html: " + str(len(html)) +
                     " -> " + str(len(cleaned_html)))
         return cleaned_html
     except Exception as e:
         self._print("EXCEPTION! " + str(e.message))
         return None
Example no. 19
    def gettextonly(self, tree):
        cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                          page_structure=False, safe_attrs_only=False)
        try:
            v = tostring(tree,method='text',encoding=unicode)
        except:
            v = None
            
        if v == None:
            c = lxml.html.tostring(tree)
            print 'v== null' 
#            resulttext = ''
#            for t in c:
#                subtext = self.gettextonly(t)
#                resulttext += subtext + '\n'
#            return resulttext
            return c
        else:
            # Clean up the javascript and comment.
            try:
                v = cleaner.clean_html(v)
            except:
                # Ignore clean error
                pass
            return v.strip()
Example no. 20
def getFormatHtml(htmlContent):
    try:
        dom = soupparser.fromstring(htmlContent)
    except Exception, e:
        cleaner = Cleaner()
        htmlContent = cleaner.clean_html(htmlContent)
        doc = soupparser.fromstring(htmlContent)
Example no. 21
def handle_item(path):
  # url="http://news.39.net/"+path.split("/root/39_data/news.39.net/")[1]
  flag,title,text=False,"",""
  try:
    # request=requests.get(url,proxies=get_proxy(),timeout=5)
    # if request.status_code!=200: raise
    with open(path,"r") as file:
      content=file.read()
    html=lxml.html.fromstring(content.decode("gbk"))
    try:
      if re.search("utf",html.xpath("//meta/@charset")[0]): 
        html=lxml.html.fromstring(content.decode("utf-8"))
    except: pass
    try:
      if len(html.xpath("//div[@class='art_box']/h1/text()"))>0:
        title=html.xpath("//div[@class='art_box']/h1/text()")[0]
      else:
        title=html.xpath("//div[@class='artbox']/h1/text()")[0]
    except:
      title=""
    print("title:%s"%title)
    if len(html.xpath("//div[@id='contentText']"))>0: div1=html.xpath("//div[@id='contentText']")[0]
    elif len(html.xpath("//div[@class='article']"))>0: div1=html.xpath("//div[@class='article']")[0]
    else: raise
    cleaner = Cleaner(scripts = True)
    for p in div1.xpath("./p"):
      p=cleaner.clean_html(p)
      try:
        text+=p.text_content().strip()+"\n"
      except: pass
    print("text:%s"%text)
    flag=True
  except Exception,e:
    print(e)
Example no. 22
class HTMLSanitiser:
	def __init__(self):
		self.Cleaner = Cleaner(scripts = False, javascript = False, comments = False, links = False, meta = True, page_structure = False, processing_instructions = False, embedded = False, frames = False, forms = False, annoying_tags = False, remove_unknown_tags = False, safe_attrs_only = True, allow_tags=ALLOWED_TAGS)
		#self.Cleaner = Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False)

	def IsValidURL(self, URL):
		ParsedURL = urlparse(URL)
		return (ParsedURL.scheme in ALLOWED_URL_SCHEMES)

	def CleanURLs(self, HTML): 
		# Largely Inspired from: http://stackoverflow.com/questions/5789127/how-to-replace-links-using-lxml-and-iterlinks
		ParsedHTML = lxml.html.document_fromstring(HTML)
		#print dir(ParsedHTML)
		#'iter', 'iterancestors', 'iterchildren'
		#for Element in ParsedHTML.iterchildren():
		#	print dir(Element)
		#	print Element.tag
		for Element, Attribute, Link, Pos in ParsedHTML.iterlinks():
			if not self.IsValidURL(Link):
				Element.set(Attribute, Link.replace(Link, ''))
		return lxml.html.tostring(ParsedHTML)

	def CleanThirdPartyHTML(self, HTML): 
		# 1st clean URLs, 2nd get rid of basics, 3rd apply white list
		return self.Cleaner.clean_html(clean_html(self.CleanURLs(HTML)))

	def TestPrint(self, TestInfo, TestOutput):
		TestInfo += "_" * (60 - len(TestInfo)) # Make info visually easier to compare
		print TestInfo + TestOutput
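A possible driver for the class above, assuming the module-level ALLOWED_TAGS and ALLOWED_URL_SCHEMES whitelists are defined elsewhere (the values in the comment are only an example):

# Hypothetical usage, e.g. with ALLOWED_TAGS = ['a', 'b', 'i', 'p'] and
# ALLOWED_URL_SCHEMES = ['http', 'https'] defined at module level.
Sanitiser = HTMLSanitiser()
Dirty = '<p onmouseover="track()">See <a href="javascript:bad()">this link</a></p>'
Sanitiser.TestPrint("CleanThirdPartyHTML", Sanitiser.CleanThirdPartyHTML(Dirty))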
Example no. 23
def html_cleanup(input):
    cleaner = Cleaner(
        scripts = True,
        javascript = True,
        comments = True,
        style = False,
        links = True,
        meta = True,
        page_structure = True,
        processing_instructions = True,
        embedded = False,
        frames = False,
        forms = True,
        annoying_tags = True,
        allow_tags = ['a', 'img', 'span', 'div', 'p', 'br', 'iframe', # for google cal
                      'strong', 'em', 'b', 'i', 'u', 'strike', 'blockquote', 'sub', 'sup',
                      'ul', 'ol', 'li', 'table', 'tdata', 'tr', 'th', 'td',
                      'h1', 'h2', 'h3', 'h4'],
        remove_unknown_tags = False,
        safe_attrs_only = True,
        host_whitelist = ['youtube.com', 'www.google.com'],
        whitelist_tags = ['iframe', 'embed', 'script', 'img']
        )
    sane = cleaner.clean_html("<div>%s</div>"%input)
    return sane[len('<div>'):-len('</div>')]
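A short illustrative call (the fragment is made up): scripts and unsafe attributes are dropped, the tags listed in allow_tags survive, and embedded content is only kept for the whitelisted hosts:

# Hypothetical input fragment.
dirty = 'Hello <b onclick="x()">world</b> <script>alert(1)</script>'
print(html_cleanup(dirty))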
Example no. 24
    def _load(self):
        """
        Load the ElementTree from the source
        """
        # Convert directional quotation marks to regular quotes
        double_quotes = ur'[\u201c\u201d]'
        self.source = re.sub(double_quotes, u'"', self.source)
        single_quotes = ur'[\u2019\u2018]'
        self.source = re.sub(single_quotes, u"'", self.source)
        # Convert colons
        self.source = self.source.replace(u'\uff1a', u':')
        # Remove line breaks and tabs
        self.source = self.source.replace(u'\n', u'')
        self.source = self.source.replace(u'\t', u'')
        # There are also some "zero width joiners" in random places in the text
        # Should remove them here, since they make string search unreliable
        # these are the codes: &#8205, &#160 (nbsp), \xa0 (nbsp), \u200d
        zero_width_joiners = u'\u200d'
        self.source = self.source.replace(zero_width_joiners, u'')
        # Also previously had some non breaking spaces in unicode \u00a0, but this
        # may have been fixed by changing the parser below

        # Use the lxml cleaner
        cleaner = Cleaner()
        parser = HTMLParser(encoding='utf-8')
        # Finally, load the cleaned string to an ElementTree
        self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
Example no. 25
def _statistica_(url_string):
    """Implements the logic to extract the document
    and its metadata from rivista-statistica
    """
    url = urlparse.urlparse(url_string)
    conn = httplib.HTTPConnection(url.hostname)
    conn.request("GET", url.path)
    res = conn.getresponse()
    body = res.read()

    my_page = html.fromstring(body)

    # Remove the ***** cookie banner
    for el in my_page.xpath('//*[@id="cookiesAlert"]'):
        el.getparent().remove(el)

    # Remove all script tags and their contents
    cleaner = Cleaner()
    cleaner.javascript = True
    my_page = cleaner.clean_html(my_page)

    title = my_page.xpath('//*[@id="articleTitle"]/h3')
    full_content = my_page.xpath('//*[@id="content"]')
    doi = my_page.xpath('//*[@id="pub-id::doi"]')

    full_content = ''.join(
        [etree.tostring(fix_links(el, url_string)) for el in full_content])

    result = {
        'title': title[0].text_content(),
        'content': full_content,
        'doi': doi[0].text_content()
        }

    return json.JSONEncoder().encode(result)
Example no. 26
def visit(url):
	if url.startswith(base_url) == False:
		return

	try:
		resp = urlopen(url)
	except URLError as e:
		return

	page = resp.read()
	cleaner = Cleaner()
	cleaner.javascript = True
	cleaner.style = True
	cleaner.kill_tags = ELEMENTS_TO_IGNORE

	# soup = BeautifulSoup(page, "lxml")
	# for link in soup.findAll('a'):
	# 	if link.has_attr('href'):
	# 		if link.has_attr('class') and 'history' in link['class']:
	# 			continue
	# 		next_link = urljoin(url,link['href'])
	# 		next_link = urldefrag(next_link)[0]
	# 		if next_link not in visited_pages:
	# 			visited_pages.append(next_link)
	# 			pages_to_visit.append(next_link)

	clean_page = cleaner.clean_html(page)
	soup = BeautifulSoup(clean_page, "lxml")
	extract(soup, url)
Example no. 27
    def test_allow_tags(self):
        html = """
            <html>
            <head>
            </head>
            <body>
            <p>some text</p>
            <table>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            </table>
            <img>
            </body>
            </html>
            """

        html_root = lxml.html.document_fromstring(html)
        cleaner = Cleaner(
            remove_unknown_tags = False,
            allow_tags = ['table', 'tr', 'td'])
        result = cleaner.clean_html(html_root)

        self.assertEqual(12-5+1, len(list(result.iter())))
Example no. 28
def createPages():
    items = source.contentItems()
    for item in items:
        doc = parse(item).getroot()
        cleaner = Cleaner(style=True, links=False, page_structure=True, safe_attrs_only=False)
        cleaned = cleaner.clean_html(doc)
    # get the pagetitle

        titles = cleaned.find_class('Pagetitle')
    # snag the page title - method returns list. . there's really only one
        title = titles[0].text_content()

    # get the description
        descrips = cleaned.find_class('Summarytext')
        descrip = descrips[0].text_content()
    #Need to have temporary id
        uid = str(random.randint(0, 99999999))

        target.invokeFactory("Document", id=uid)
        obj = target[uid]
        obj.setTitle(title)
        obj.setDescription(descrip)
        obj.setText.getBodyText()


# Will finish Archetypes content item creation process,
# rename-after-creation and such
        obj.processForm()

        return obj
Example no. 29
def analyze(request):
    url = request.GET['url']
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1')]
    response = opener.open(url)
    raw_html = response.read()

    cleaner = Cleaner(kill_tags = ['style', 'script', 'head'], allow_tags = [''], remove_unknown_tags = False)
    raw_text = cleaner.clean_html(raw_html)
    ptn = re.compile('<div>|</div>')
    raw_text = re.sub(ptn, '', raw_text)
    ptn = re.compile('\s+')
    raw_text = re.sub(ptn, ' ', raw_text)
    raw_text = raw_text.strip().lower()
    prd, score = MLearn.predict(raw_text)
    donut = score * 100
    results = MLearn.predict_other(raw_text)
    related_headline = results[0][2]
    related_verdict = results[0][0]
    related_score = results[0][1] * 100

    context = {
    	'url': url,
    	'verdict': prd,
    	'score': donut,
    	'related_headline': related_headline,
    	'related_verdict': related_verdict,
    	'related_score': related_score,
    	'results': results,
    }

    return render(request, 'results.html', context)
Example no. 30
    def learn_stopwords(self):
        req = urllib2.Request(self.html_url, headers={'Host':'github.com', 'Referer':'https://github.com',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36'})
        r = urllib2.urlopen(req)
        page = r.read()
        tree = html.fromstring(page)

        # get readme part
        readme_tree = tree.xpath('//*[@id="readme"]/article')
        if len(readme_tree) < 1:
            return

        readme_tree = readme_tree[0]
        self.origin_readme = readme_tree.text_content()
        cleaner = Cleaner(allow_tags=['p','h1','h2','h3','h4','h5','pre'], remove_unknown_tags=False)
        readme_tree = cleaner.clean_html(readme_tree)

        header = ""
        # iterate each header and paragraph
        for sub in readme_tree.iterchildren():
            if sub is None:
                break

            if sub.tag == 'pre' and header:
                self.add_stopwords(self.filter_all(header))
                header = ""
            elif sub.tag in ['h1','h2','h3','h4'] and sub.text is not None:
                header = sub.text.strip().lower()
Example no. 31
    def handle(self, **options):
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                print("Detected {} new tasks".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc['content']
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = ' '.join([title for title in tree.xpath('//title/text()')])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            lxml.html.tostring(html)
            description = ' '.join(
                tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc['url'],
                desc=description,
                rank=doc['rank'],
                content='\n'.join([title, doc['url'], text_content]),
                raw=raw,
            )

            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
Example no. 32
import lxml.html
from lxml.html.clean import Cleaner

# Put the file name of the downloaded XHTML file here.
# For reference, 789_14547.html is 《吾輩は猫である》 ("I Am a Cat").
FILE_NAME = 'data/xhtml/789_14547.html'

with open(FILE_NAME, encoding='shift_jis') as f:
    data = f.read().encode('shift_jis')

cleaner = Cleaner(page_structure=False, remove_tags=(
    'ruby', 'br'), kill_tags=('rt', 'rp'))
cln_html = cleaner.clean_html(data).decode('utf-8')

plain_text = lxml.html.fromstring(cln_html).find_class('main_text')[
    0].text_content()
# print(plain_text)

# Save the plain text to a separate file
PLAIN_TEXT = FILE_NAME.replace('xhtml', 'text').replace('.html', '.txt')
print(PLAIN_TEXT)
with open(PLAIN_TEXT, 'w') as f:
    f.write(plain_text)
Example no. 33
def scrape_links(links):
     maincleaner = Cleaner(allow_tags=['div'], remove_unknown_tags=False, remove_tags=['div'])     # cleaner that removes every tag

#    while True:
     for link in links:            # Loop through all the links
        if link == last_link:      # Check if this link has already been scraped (this will eventually be changed to check dates)
            break                  # If we've hit something we've already scraped, break out of the loop
#        try:
        linkhtml = scraperwiki.scrape(link).decode('latin_1')          # scrape the contents of the current link and decode from Windows-1252 encoding
        print link
        root = lxml.html.fromstring(linkhtml)                               # turn scraped content into an HTML object

        # GET TITLE
        title = root.cssselect("h1")[0].text.encode('utf-8')                # grab the page header (title) and return its text as unicode
        title = replace_all(title, subDic)                                  # replace alphanumeric obfuscations with letters

        # GET DATE
        date = root.cssselect("div.adInfo")[0].text                         # get the text of the html entity that contains the date and time of the post
        cleandate = re.sub(r'(\S+\s+\d+,\s+\d\d\d\d)(?:,?) (\d+\:\d+ \w\w)', r'\1 \2', date.strip())  # get date into a standard format
        cleandate = re.search(r'\S+\s+\d+, \d\d\d\d \d+\:\d+ \w\w', cleandate).group(0) # find the date string on the page
        rawdate = datetime.strptime(cleandate,'%B %d, %Y %I:%M %p')                 # encode the date as a date using format Month dd, YYYY
        date = rawdate.strftime('%Y-%m-%d %H:%M')                        # decode that date back into a string of format YYYY-mm-dd

        # GET MAIN BODY TEXT
        mainwithtags = root.cssselect("div.postingBody")[0]                # grabs the body text of the post
        main = maincleaner.clean_html(mainwithtags).text.encode('utf-8')            # gets rid of all HTML tags
        main = replace_all(main, subDic)                                            # replace alphanumeric obfuscations with letters

        # GET PHONE NUMBER(S)
        stripped = replace_all(main.lower(), wordDic)                               # replaces common phone number obfuscations with actual numbers
        phonecomp = re.compile("[\s\-/=\.,{}_\!\@\#\$\%\^\&\*\(\)\~]")      # list of known phone number dividers
        stripped = phonecomp.sub('',stripped)                               # remove phone number dividers
        phone = re.findall(r'(?:1?)[1-9]\d{9}',stripped)                    # search for groups of 10 consecutive numbers (with an optional preceding 1)
        phone = list(set(phone))                                            # gets rid of duplicate numbers by turning list into a set and back
        phone = ", ".join(phone)                                            # formats phone numbers as "phone1, phone2,... phoneN"
        
        # GET LISTED AGE
        if root.cssselect("p.metaInfoDisplay"):                             # does the entry have metainfo?
            listedage = root.cssselect("p.metaInfoDisplay")[0]              # get the the first html metainfo element
            listedage = re.sub("[^\d]","",listedage.text)                   # get rid of all non-numeric text in the text of the element
        else:                                                               # if there's no metainfo
            listedage = ""                                                  # set the listed age to an empty string

        # GET LOCATION
        if re.findall(r'Location\:(.*?)\</div\>',linkhtml, flags=re.DOTALL):  # 
            location = re.findall('Location\:(.*?)\</div\>',linkhtml, flags=re.DOTALL)[0].encode('utf-8')
#            location = removeNonAscii(location)
            #if any(x in NEIGHBORHOODS) in location:
             #   print x, 'x'
              #  area =  x
            area = None
            for neighborhood in NEIGHBORHOODS:
                if neighborhood in location.lower():
                    area = neighborhood

            print repr(area)
            print repr(location)
        else:
            location = ""

        picturelist=[]
        pictures = root.cssselect('ul#viewAdPhotoLayout img')
        for i in range(len(pictures)):
            largepic = re.sub('/medium/','/large/',pictures[i].get('src'))
            picturelist.append(largepic)
        print picturelist 
        picturelist = " ".join(picturelist)
        x = urllib.urlopen(largepic).read()
        piccode = base64.encodestring(x)
        print piccode
        
#        except:
#            print 'FAILED TO LOAD: ' + link
#        continue
#            record = {}
#            record['Title'] = 'LOAD FAILURE'
        # Set up our data record - we'll need it later

        record = {}
        record['Title'] = title #.encode('ascii', 'ignore').strip()
        record['Date'] = date
        record['Main'] = main #.encode('ascii', 'ignore').strip()
        record['Pictures'] = picturelist
        record['Phone'] = phone
        record['Listed Age'] = listedage #.encode('ascii', 'ignore').strip()
        record['Location'] = location
        record['area']= area
        record['PicCode'] = piccode #.encode('ascii', 'ignore').strip()
            # Print out the data we've gathered
           #print record, '------------'
            # Finally, save the record to the datastore - 'Artist' is our unique key
        scraperwiki.sqlite.save(["Title"], record)
        time.sleep(2)
Example no. 34
  def get_message_tree(self):
    tree = {
      'id': self.get_msg_info(self.index.MSG_ID),
      'tags': self.get_msg_info(self.index.MSG_TAGS).split(','),
      'summary': self.get_msg_summary(),
      'headers': {},
      'headers_lc': {},
      'attributes': {},
      'text_parts': [],
      'html_parts': [],
      'attachments': [],
      'conversation': [],
    }

    conv_id = self.get_msg_info(self.index.MSG_CONV_ID)
    if conv_id:
      conv = Email(self.index, int(conv_id, 36))
      tree['conversation'] = convs = [conv.get_msg_summary()]
      for rid in conv.get_msg_info(self.index.MSG_REPLIES).split(','):
        if rid:
          convs.append(Email(self.index, int(rid, 36)).get_msg_summary())

    # FIXME: Decide if this is strict enough or too strict...?
    html_cleaner = Cleaner(page_structure=True, meta=True, links=True,
                           javascript=True, scripts=True, frames=True,
                           embedded=True, safe_attrs_only=True)

    msg = self.get_msg()
    for hdr in msg.keys():
      tree['headers'][hdr] = self.index.hdr(msg, hdr)
      tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr)

    # Note: count algorithm must match that used in extract_attachment above
    count = 0
    for part in msg.walk():
      mimetype = part.get_content_type()
      if mimetype.startswith('multipart/'):
        continue

      count += 1
      if (part.get('content-disposition', 'inline') == 'inline'
      and mimetype in ('text/plain', 'text/html')):
        payload, charset, openpgp = self.decode_payload(part)
        # FIXME: Do something with the openpgp data!
        if (mimetype == 'text/html' or
            '<html>' in payload or
            '</body>' in payload):
          tree['html_parts'].append({
            'openpgp_status': openpgp and openpgp[0] or '',
            'openpgp_data': openpgp and openpgp[1] or '',
            'charset': charset,
            'type': 'html',
            'data': (payload.strip() and html_cleaner.clean_html(payload)) or ''
          })
        else:
          tree['text_parts'].extend(self.parse_text_part(payload, charset,
                                                         openpgp))
      else:
        tree['attachments'].append({
          'mimetype': mimetype,
          'count': count,
          'part': part,
          'length': len(part.get_payload(None, True) or ''),
          'content-id': part.get('content-id', ''),
          'filename': part.get_filename() or ''
        })

    if self.is_editable():
      tree['is_editable'] = True
      tree['editing_string'] = self.get_editing_string(tree)

    return tree
Example no. 35
    deboilFile = lzma.open(options.outDir + "/" + options.prefix + "deboilerplate_html.xz", "w")

for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url
    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

        tree=""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            tree = ftfy.fix_text(cleanhtml, fix_entities=False, fix_character_width=False)
            #document = html5lib.parse(fixedtext, treebuilder="lxml", namespaceHTMLElements=False)
            #tree = etree.tostring(document, encoding="utf-8")
        except Exception as ex:
            sys.stderr.write(str(ex)+"\n")
            continue
        cleantree = tree.replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        #printable_str = ''.join(x for x in cleantree if x in string.printable)
        lang = guess_lang_from_data2(tree)
        if len(languages) > 0 and lang not in languages:
            logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.")
        else:
Example no. 36
def html_clean(str):
    """ Clean up HTML to be safe """
    cleaner = Cleaner(safe_attrs_only=True)
    return cleaner.clean_html(str)
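For illustration only (made-up input): with safe_attrs_only=True the cleaner keeps attributes from lxml's default safe_attrs set, such as href, and drops the rest:

# Hypothetical call; the onclick handler is not in the default safe_attrs set.
print(html_clean('<a href="https://example.com" onclick="steal()">link</a>'))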
Example no. 37
class Tokenizer:
    def __init__(self):
        self.cleaner = Cleaner(scripts = True, javascript = True, style = True, \
            meta = True, annoying_tags = True, embedded = True, page_structure = False, \
            kill_tags = ['img', 'CDATA', 'form'], remove_tags = ['a','div'], remove_unknown_tags = True, comments = True)
        self.cleanerBody = Cleaner(
            page_structure=False,
            kill_tags=['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        self.stopwords = set(stopwords.words('english'))
        self.ps = PS()

    def parseHTML(self, content):

        cleaned = self.cleaner.clean_html(content)

        cleanedBody = self.cleanerBody.clean_html(cleaned)
        #print(cleanedBody)
        soup = BeautifulSoup(cleaned, 'lxml')
        soupBody = BeautifulSoup(cleanedBody, 'lxml')
        #print(soup)
        if soup.html != None:
            title = self.getTitle(soup)
            heading = self.getHeadings(soup)
            body = self.getBody(soupBody)
            #print(title)
            #print(heading)
            #print(body)
            return (title, heading, body)

    def getTitle(self, soup):
        try:
            title = soup.title.getText().lower()
            #print(title)
            tokens = RegexpTokenizer(r'[a-z]+').tokenize(title)
            stemmed = [self.ps.stem(t) for t in tokens]
            filtered = [t for t in stemmed if not t in self.stopwords]
            #print(filtered)
            return filtered
            #print(tokens)
        except AttributeError:
            pass
            #print('No <title> tag in this file')

    def getHeadings(self, soup):
        result = []
        for i in range(1, 3):
            try:
                headerList = soup.find_all("h" + str(i))
                for h in headerList:
                    replaced = re.sub('<[/*a-zA-Z0-9]*>', ' ', str(h).lower())
                    #print(replaced)
                    tokens = RegexpTokenizer(r'[a-z]+').tokenize(replaced)
                    stemmed = [self.ps.stem(t) for t in tokens]
                    filtered = [t for t in stemmed if not t in self.stopwords]
                    result.extend(filtered)
                    #print(filtered)
                    return filtered
            except AttributeError:
                pass

    def getBody(self, soup):
        try:

            replaced = re.sub('<[/*a-zA-Z0-9]*>', ' ', str(soup.body).lower())
            #print(replaced)
            tokens = RegexpTokenizer(r'[a-z]+').tokenize(replaced)
            stemmed = [self.ps.stem(t) for t in tokens]
            filtered = [t for t in stemmed if not t in self.stopwords]
            #print(filtered)
            return filtered
        except AttributeError:
            pass
Example no. 38
def as_clean_html(value):
    cleaner = Cleaner(style=True, scripts=True)
    return cleaner.clean_html(value)
Example no. 39
 def _clean_html(self, html):
     cleaner = Cleaner(style=True, scripts=True)
     return cleaner.clean_html(html)
Example no. 40
# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines)
# -- CLICK THE 'RUN' BUTTON BELOW
# Check the 'Console' tab again, and you'll see how we're extracting
# the HTML that was inside <td></td> tags.
# We use lxml, which is a Python library especially for parsing html.
# -----------------------------------------------------------------------------

html = html.replace('<br>', ' ')
html = re.sub(r'(\&.*?;)|(\n|\t|\r)', ' ', html)
print html
issues = []
root = lxml.html.fromstring(html)  # turn our HTML into an lxml object
cleaner = Cleaner(remove_tags=['font', 'span'],
                  links=False,
                  remove_unknown_tags=False)
root = cleaner.clean_html(root)
newhtml = lxml.html.tostring(root)

record = {}
datestring = re.findall("Updated (.*?)</p>", newhtml)[0]
date = time.strptime(
    datestring,
    '%b %d, %Y')  # encode the date as a date using format Month dd, YYYY
date = time.strftime(
    '%Y-%m-%d',
    date)  # decode that date back into a string of format YYYY-mm-dd

if scraperwiki.sqlite.get_var(
        'last_update'
) == None or scraperwiki.sqlite.get_var('last_update') != date:
    record["Date"] = date
Example no. 41
    def get_message_tree(self, want=None):
        msg = self.get_msg()
        tree = {'id': self.get_msg_info(self.index.MSG_ID)}

        for p in 'text_parts', 'html_parts', 'attachments':
            if want is None or p in want:
                tree[p] = []

        if want is None or 'summary' in want:
            tree['summary'] = self.get_msg_summary()

        if want is None or 'tags' in want:
            tree['tags'] = self.get_msg_info(self.index.MSG_TAGS).split(',')

        if want is None or 'conversation' in want:
            tree['conversation'] = {}
            conv_id = self.get_msg_info(self.index.MSG_THREAD_MID)
            if conv_id:
                conv = Email(self.index, int(conv_id, 36))
                tree['conversation'] = convs = [conv.get_msg_summary()]
                for rid in conv.get_msg_info(
                        self.index.MSG_REPLIES).split(','):
                    if rid:
                        convs.append(
                            Email(self.index, int(rid, 36)).get_msg_summary())

        if (want is None or 'headers' in want or 'editing_string' in want
                or 'editing_strings' in want):
            tree['headers'] = {}
            for hdr in msg.keys():
                tree['headers'][hdr] = self.index.hdr(msg, hdr)

        if want is None or 'headers_lc' in want:
            tree['headers_lc'] = {}
            for hdr in msg.keys():
                tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr)

        if want is None or 'header_list' in want:
            tree['header_list'] = [(k, self.index.hdr(msg, k, value=v))
                                   for k, v in msg.items()]

        # FIXME: Decide if this is strict enough or too strict...?
        html_cleaner = Cleaner(page_structure=True,
                               meta=True,
                               links=True,
                               javascript=True,
                               scripts=True,
                               frames=True,
                               embedded=True,
                               safe_attrs_only=True)

        # Note: count algorithm must match that used in extract_attachment
        #       above
        count = 0
        for part in msg.walk():
            mimetype = part.get_content_type()
            if (mimetype.startswith('multipart/')
                    or mimetype == "application/pgp-encrypted"):
                continue
            try:
                if (mimetype == "application/octet-stream"
                        and part.cryptedcontainer is True):
                    continue
            except:
                pass

            count += 1
            if (part.get('content-disposition', 'inline') == 'inline'
                    and mimetype in ('text/plain', 'text/html')):
                payload, charset = self.decode_payload(part)

                if (mimetype == 'text/html' or '<html>' in payload
                        or '</body>' in payload):
                    if want is None or 'html_parts' in want:
                        tree['html_parts'].append({
                            'charset': charset,
                            'type': 'html',
                            'data': ((payload.strip()
                                      and html_cleaner.clean_html(payload))
                                     or '')
                        })
                elif want is None or 'text_parts' in want:
                    text_parts = self.parse_text_part(payload, charset)
                    if want is None or 'text_parts' in want:
                        tree['text_parts'].extend(text_parts)

            elif want is None or 'attachments' in want:
                tree['attachments'].append({
                    'mimetype': mimetype,
                    'count': count,
                    'part': part,
                    'length': len(part.get_payload(None, True) or ''),
                    'content-id': part.get('content-id', ''),
                    'filename': part.get_filename() or ''
                })

        if self.is_editable():
            if not want or 'editing_strings' in want:
                tree['editing_strings'] = self.get_editing_strings(tree)
            if not want or 'editing_string' in want:
                tree['editing_string'] = self.get_editing_string(tree)

        if want is None or 'crypto' in want:
            if 'crypto' not in tree:
                tree['crypto'] = {
                    'encryption': msg.encryption_info,
                    'signature': msg.signature_info
                }
            else:
                tree['crypto']['encryption'] = msg.encryption_info
                tree['crypto']['signature'] = msg.signature_info

        return tree
Example no. 42
def clean_comment_text(string):
    doc = document_fromstring(string)
    cleaner = Cleaner()
    return cleaner.clean_html(doc).text_content()
Example no. 43
def clean_project_desc(string):
    cleaner = Cleaner(remove_tags=["a"])
    return cleaner.clean_html(string)
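A hypothetical call to show what this does: remove_tags drops the <a> elements themselves but keeps their text content (kill_tags, by contrast, would delete the text as well):

# Hypothetical input: the link markup disappears, the anchor text stays.
print(clean_project_desc('<p>See <a href="https://example.com">the project page</a>.</p>'))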
Example no. 44
class TransformHtmlProceedingsToXml(object):
    """Get proceedings of the European Parliament."""

    @timeit
    def __init__(self):
        self.cli()
        self.infiles = self.get_files(self.indir, self.pattern)
        self.n_proceedings = 0
        self.rm_a = Cleaner(remove_tags=['a'])
        self.main()

    def __str__(self):
        message = "Information for {} MEPs extracted!".format(
            str(self.n_proceedings))
        return message

    def get_files(self, directory, fileclue):
        """Get all files in a directory matching a pattern.

        Keyword arguments:
        directory -- a string for the input folder path
        fileclue -- a string as glob pattern
        """
        matches = []
        for root, dirnames, filenames in os.walk(directory):
            for filename in fnmatch.filter(filenames, fileclue):
                matches.append(os.path.join(root, filename))
        return matches

    def read_html(self, infile):
        """Parse a HTML file."""
        with open(infile, encoding='utf-8', mode='r') as input:
            return html.parse(input)

    def serialize(self, infile, root):
        ofile_name = os.path.splitext(os.path.basename(infile))[0]
        ofile_path = os.path.join(self.outdir, ofile_name+'.xml')
        xml = etree.tostring(
            root,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True).decode('utf-8')
        with open(ofile_path, mode='w', encoding='utf-8') as ofile:
            ofile.write(xml)
        pass

    def get_name(self, tree):
        name = tree.xpath('//li[@class="mep_name"]')[0]
        name = self.rm_a.clean_html(name)
        name = html.tostring(name).decode('utf-8')
        name = re.sub(r'[\t\n]', r'', name)
        name = name.split('<br>')
        name = [html.fromstring(x).text_content() for x in name]
        name = ' '.join(name)
        return name

    def get_nationality(self, tree):
        nationality = tree.find_class('nationality')[0]
        nationality = nationality.text.strip()
        return nationality

    def get_id(self, infile):
        id = os.path.splitext(os.path.basename(infile))[0]
        return id

    def parse_date(self, a_date, a_pattern):
        output = datetime.datetime.strptime(a_date, a_pattern).date()
        return output

    def get_birth(self, tree):
        birth = tree.xpath('.//span[@class="more_info"]')
        birth_date = None
        birth_place = None
        death_date = None
        death_place = None
        for i in birth:
            if i.text is not None:
                birth_text = re.sub(r'[\t\n]', r'', i.text)
                birth_text = birth_text.strip()
                if re.match(r'^Date of birth: (.+?), (.+)$', birth_text):
                    info = re.match(
                        r'^Date of birth: (.+?), (.+)$', birth_text)
                    birth_date = self.parse_date(info.group(1), "%d %B %Y")
                    birth_place = info.group(2)
                elif re.match(r'^Date of birth: (.+?)$', birth_text):
                    info = re.match(r'^Date of birth: (.+?)$', birth_text)
                    birth_date = self.parse_date(info.group(1), "%d %B %Y")
                    birth_place = None
                elif re.match(r'^Date of death: (.+?), (.+)$', birth_text):
                    info = re.match(
                        r'^Date of death: (.+?), (.+)$', birth_text)
                    death_date = self.parse_date(info.group(1), "%d %B %Y")
                    death_place = info.group(2)
                elif re.match(r'^Date of death: (.+?)$', birth_text):
                    info = re.match(r'^Date of death: (.+?)$', birth_text)
                    death_date = self.parse_date(info.group(1), "%d %B %Y")
                    death_place = None
        return birth_date, birth_place, death_date, death_place

    def get_political_groups(self, tree, id):
        political_groups = tree.xpath('.//div[@class="boxcontent nobackground"]/h4[contains(., "Political groups")]/following-sibling::ul[1]//li')
        output = []
        for i in political_groups:
            info = i.text
            info = re.sub(r'\n', r'', info)
            info = re.sub(r'\t+', r'\t', info)
            info = re.sub(r' \t/ ', r'\t', info)
            info = re.sub(r'\t:\t', r'\t', info)
            info = re.sub(r' - ', r'\t', info)
            info = re.sub(r'\t$', r'', info)
            info = info.strip()
            info = info.split('\t')
            info = [x.strip() for x in info]
            m_state = i.attrib['class']
            s_date = self.parse_date(info[0], "%d.%m.%Y")
            if info[1] == '...':
                e_date = self.date
            else:
                e_date = self.parse_date(info[1], "%d.%m.%Y")
            p_group = info[2]
            p_group_role = info[3]
            output.append({
                'id': id,
                'm_state': m_state,
                's_date': s_date,
                'e_date': e_date,
                'p_group': p_group,
                'p_group_role': p_group_role})
        return output

    def get_national_parties(self, tree, id):
        political_groups = tree.xpath('.//div[@class="boxcontent nobackground"]/h4[contains(., "National parties")]/following-sibling::ul[1]//li')
        output = []
        for i in political_groups:
            info = i.text
            info = re.sub(r'\n', r'', info)
            info = re.sub(r'\t+', r'\t', info)
            info = re.sub(r' \t/ ', r'\t', info)
            info = re.sub(r'\t:\t', r'\t', info)
            info = re.sub(r' - ', r'\t', info)
            info = re.sub(r'\t$', r'', info)
            info = info.strip()
            info = info.split('\t')
            info = [x.strip() for x in info]
            s_date = self.parse_date(info[0], "%d.%m.%Y")
            if info[1] == '...':
                e_date = self.date
            else:
                e_date = self.parse_date(info[1], "%d.%m.%Y")
            n_party = info[2]
            output.append({
                'id': id,
                's_date': s_date,
                'e_date': e_date,
                'n_party': n_party})
        return output

    def extract_info(self, infile):
        id = self.get_id(infile)
        tree = self.read_html(infile).getroot()
        name = self.get_name(tree)
        nationality = self.get_nationality(tree)
        birth_date, birth_place, death_date, death_place = self.get_birth(tree)
        self.meps[id] = {
            'name': name,
            'nationality': nationality,
            'birth_date': birth_date,
            'birth_place': birth_place,
            'death_date': death_date,
            'death_place': death_place
            }
        self.political_groups = (
            self.political_groups + self.get_political_groups(tree, id))
        self.national_parties = (
            self.national_parties + self.get_national_parties(tree, id))
        pass

    def serialize_dict_of_dicts(self, dict_of_dicts, ofile_name):
        df = pd.DataFrame.from_dict(dict_of_dicts, orient='index')
        opath = os.path.join(self.outdir, ofile_name)
        df.to_csv(
            opath,
            sep='\t',
            mode='w',
            encoding='utf-8',
            index_label='id')
        pass

    def serialize_list_of_dicts(self, list_of_dicts, ofile_name, col_order):
        df = pd.DataFrame(list_of_dicts)
        df = df[col_order]
        opath = os.path.join(self.outdir, ofile_name)
        df.to_csv(opath, sep='\t', mode='w', encoding='utf-8', index=False)
        pass

    def main(self):
        self.meps = {}
        self.political_groups = []
        self.national_parties = []
        for infile in self.infiles:
            print(infile)
            if self.date is None:
                self.date = datetime.datetime.fromtimestamp(
                    os.path.getmtime(infile)).date()
            self.extract_info(infile)
            self.n_proceedings += 1
        self.serialize_dict_of_dicts(self.meps, 'meps.csv')
        self.serialize_list_of_dicts(
            self.political_groups,
            'political_groups.csv',
            ['id', 'm_state', 's_date', 'e_date', 'p_group', 'p_group_role'])
        self.serialize_list_of_dicts(
            self.national_parties,
            'national_parties.csv',
            ['id', 's_date', 'e_date', 'n_party'])
        pass

    def cli(self):
        """CLI parses command-line arguments"""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "-i", "--input",
            required=True,
            help="path to the input directory.")
        parser.add_argument(
            "-o", "--output",
            required=True,
            help="path to the output directory.")
        parser.add_argument(
            '-p', "--pattern",
            required=False,
            default="*.html",
            help="glob pattern to filter files.")
        parser.add_argument(
            '-d', "--date",
            required=False,
            default=None,
            help="date of download of HTML files.")
        args = parser.parse_args()
        self.indir = args.input
        self.outdir = args.output
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
        self.pattern = args.pattern
        self.date = args.date
        pass
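A minimal driver for the scraper above might look like the sketch below; the class name MepScraper, the glob step that builds infiles, and the n_proceedings initialisation are assumptions, since the constructor is not part of this excerpt.

# hypothetical driver -- MepScraper is an assumed name for the class above
import glob
import os

if __name__ == '__main__':
    scraper = MepScraper()
    scraper.cli()                     # fills indir, outdir, pattern and date
    scraper.infiles = sorted(
        glob.glob(os.path.join(scraper.indir, scraper.pattern)))
    scraper.n_proceedings = 0         # assumed counter, incremented in main()
    scraper.main()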
Esempio n. 45
0
 def __init__(self, html):
     cleaner = Cleaner(style=True, page_structure=False)
     self.html = cleaner.clean_html(html)
Esempio n. 46
0
def sanitize(text):
    if text.strip():
        cleaner = Cleaner(safe_attrs_only=False, style=True)
        return cleaner.clean_html(text)
    else:
        return text
Esempio n. 47
0
def clean_summernote_html(string):
    cleaner = Cleaner()
    return cleaner.clean_html(string)
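A quick, hedged check of the two helpers above (the sample markup is invented):

dirty = '<p style="color:red" onclick="alert(1)">Hi<script>alert(2)</script></p>'
# sanitize(): style=True drops the inline style, the script and the onclick
# handler go too, but safe_attrs_only=False keeps any other attributes
print(sanitize(dirty))
# clean_summernote_html(): the default Cleaner removes the script, the onclick
# handler and the style attribute (style is not in the default safe_attrs)
print(clean_summernote_html(dirty))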
Esempio n. 48
0
                            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                            'img', 'area', 'map'
                        ]
                        args = {
                            'meta': False,
                            'safe_attrs_only': False,
                            'page_structure': False,
                            'scripts': True,
                            'style': True,
                            'links': True,
                            'remove_tags': tags
                        }
                        cleaner = Cleaner(**args)
                        path = '/html/body'
                        body = doc.xpath(path)[0]
                        result = cleaner.clean_html(
                            body).text_content().encode('ascii', 'ignore')

                        dict_result[el] += "\n\n " + " ".join(
                            str(result).split(" ")[:count_of_words])
            except:
                print("error at ", el[:100])
                dict_result[el] = ""
        else:
            dict_result[el] = ""

        idx += 1
        count += 1
        if idx >= block:
            idx = 0
            print("processing item " + str(count) + " of " + str(ntotal))
            print("work with: ", el[:100] + "...")
Esempio n. 49
0
class DataHandler(object):
    def __init__(self):
        # tags to keep
        allow_tags = ['p', 'br', 'img', 'video']
        # attributes to keep
        allow_attrs = ['src', 'controls']
        self.cleaner = Cleaner(style=True,
                               scripts=True,
                               comments=True,
                               javascript=True,
                               page_structure=True,
                               safe_attrs_only=True,
                               remove_unknown_tags=False,
                               safe_attrs=frozenset(allow_attrs),
                               allow_tags=allow_tags)

        self.fdfs_sender = Sender()

    @property
    def current_timestamp(self):
        return datetime.datetime.now()

    @property
    def current_time(self):
        # time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def str_to_string(value):
        """字符串-->标准时间字符串"""
        return parse(re.sub(r'年|月|日', '-',
                            value)).strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def datetime_to_string(dt):
        """把Datetime格式转成字符串"""
        return dt.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def datetime_to_timestamp(date_time):
        """把Datetime类型转成时间戳形式"""
        # return time.mktime(date_time.timetuple())
        return date_time.timestamp()

    @staticmethod
    def string_to_datetime(string):
        """把字符串转化成Datetime格式"""
        return datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S")

    @staticmethod
    def timestamp_to_datetime(stamp):
        """把时间戳转化为Datetime格式"""
        return datetime.datetime.timestamp(stamp)

    @staticmethod
    def timestamp_to_string(stamp):
        """把时间戳转成字符串形式"""
        return time.strftime("%Y-%m-%d-%H", time.localtime(stamp))

    def string_to_timestamp(self, str_time):
        """把字符串转成时间戳形式"""
        return time.mktime(self.string_to_datetime(str_time).timetuple())

    def remove_needless_elements(self, text):
        """去标签化"""
        text = self.cleaner.clean_html(text).replace('<div>',
                                                     '').replace('</div>', '')
        return text

    def handle_images(self, url, text, headers=None):
        has_images = True
        download_images_success = True
        failure_time = 0

        response = HtmlResponse(url=url, body=text, encoding="utf-8")
        image_urls = response.xpath("//img/@src").extract()
        if not image_urls:
            has_images = False
        for old_url in image_urls:
            if not (old_url.startswith("https://")
                    or old_url.startswith("http://")):
                old_url = urljoin(url, old_url)
            new_url = self.fdfs_sender.download_upload_image(image_url=old_url,
                                                             headers=headers)
            if new_url:
                text = text.replace(old_url, new_url)
                print(f'Image downloaded, uploaded and replaced: {old_url} ===> {new_url}')
            else:
                failure_time += 1
                download_images_success = False  # at least one image failed to download
        return has_images, download_images_success, text
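As a rough, standalone illustration of the whitelist DataHandler configures above (the markup is invented and the Sender dependency is not needed here):

demo_cleaner = Cleaner(style=True, scripts=True, comments=True, javascript=True,
                       page_structure=True, safe_attrs_only=True,
                       remove_unknown_tags=False,
                       safe_attrs=frozenset(['src', 'controls']),
                       allow_tags=['p', 'br', 'img', 'video'])
snippet = ('<div class="post"><p style="color:red"><span>text</span></p>'
           '<img src="a.jpg" width="10"><script>track()</script></div>')
print(demo_cleaner.clean_html(snippet))
# the <script> is removed outright, the <span> is unwrapped (its text is kept),
# class/style/width are stripped and only the whitelisted src attribute survives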
Esempio n. 50
0
def _clean_html_body(request, email, body, charset):
    """Clean up a html part as best we can

    Doesn't catch LXML errors
    """
    html_tree = lxml_html.fromstring(body, parser=inboxen_parser)

    # if the HTML doc says its a different encoding, use that
    for meta_tag in html_tree.xpath("/html/head/meta"):
        if meta_tag.get("http-equiv", None) == "Content-Type":
            try:
                content = meta_tag.attrib["content"]
                content = content.split(";", 1)[1]
                charset = dict(HEADER_PARAMS.findall(content))["charset"]
                break
            except (KeyError, IndexError):
                pass
        elif "charset" in meta_tag.attrib:
            charset = meta_tag.attrib["charset"]
            break

    try:
        # check there's a body for premailer
        if html_tree.find("body") is not None:
            html_tree = InboxenPremailer(html_tree).transform()
    except Exception as exc:
        # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything
        messages.info(
            request,
            _("Part of this message could not be parsed - it may not display correctly"
              ))
        _log.warning("Failed to render CSS for %s: %s", email["eid"], exc)

    # Mail Pile uses this, give back if you come up with something better
    cleaner = Cleaner(
        allow_tags=HTML_ALLOW_TAGS,
        kill_tags=["style"],  # remove style tags, not attrs
        remove_unknown_tags=False,
        safe_attrs=HTML_SAFE_ATTRS,
        safe_attrs_only=True,
        style=False,  # keep style attrs
    )

    html_tree = cleaner.clean_html(html_tree)

    # filter images if we need to
    if not email["display_images"]:
        for img in html_tree.xpath("//img"):
            try:
                # try to delete src first - we don't want to add a src where there wasn't one already
                del img.attrib["src"]
                # replace image with 1px png
                img.attrib["src"] = staticfiles_storage.url(
                    "imgs/placeholder.svg")
                email["has_images"] = True
            except KeyError:
                pass

    for link in html_tree.xpath("//a"):
        try:
            # proxy link
            url = link.attrib["href"]
            link.attrib["href"] = proxy_url(url)
        except KeyError:
            pass

        # open link in tab
        link.attrib["target"] = "_blank"
        # and prevent window.opener bug (noopener is only supported in newer
        # browsers, plus we already set noreferrer in the head)
        link.attrib["rel"] = "noreferrer"

    # finally, export to unicode
    body = unicode_damnit(etree.tostring(html_tree, method="html"), charset)
    return safestring.mark_safe(body)
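A standalone sketch of the Cleaner configuration used above; the whitelists here are placeholders standing in for HTML_ALLOW_TAGS and HTML_SAFE_ATTRS, whose real values are not shown in this excerpt:

demo_cleaner = Cleaner(
    allow_tags=['a', 'p', 'br', 'img', 'div', 'span'],   # placeholder whitelist
    kill_tags=['style'],          # <style> elements are dropped entirely
    remove_unknown_tags=False,
    safe_attrs=['href', 'src', 'alt', 'style'],          # placeholder whitelist
    safe_attrs_only=True,
    style=False,                  # keep style *attributes* (if whitelisted above)
)
fragment = lxml_html.fromstring(
    '<div><style>p {color: red}</style><p style="color:red">hi</p></div>')
print(etree.tostring(demo_cleaner.clean_html(fragment), method='html').decode())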
Esempio n. 51
0
def cleaner(content_str):
    # repair/complete unclosed tags
    if content_str is None:
        return None
    try:
        soup = BeautifulSoup(content_str, 'lxml')
        html_str = soup.prettify()
    except:
        html_str = content_str

    # strip style and script elements
    clean = Cleaner(style=True,
                    scripts=True,
                    comments=True,
                    javascript=True,
                    page_structure=False,
                    safe_attrs_only=False)
    tree = html.fromstring(html_str)
    content = html.tostring(clean.clean_html(tree), encoding='UTF-8')

    # drop all other tags, keeping only p and img
    con = remove_tags(content, keep=('img', 'p'))

    # remove whitespace and line breaks
    enter = re.compile('\n')
    con = enter.sub('', con).replace(' ', '')

    # strip other attributes from img tags
    img_attr1 = re.compile(r'<img(.*?)src')
    con = img_attr1.sub('<img src', con)

    img_attr3 = re.compile(r'<img(.*?)data-original', re.S)
    con = img_attr3.sub('<img src', con)

    try:
        img_attr2 = re.findall(r'src=".*?"(.*?)>', con)
        for attr in img_attr2:
            con = con.replace(attr, '')
    except:
        pass

    # clean up p tag attributes
    p_class = re.compile(r'<p(.*?)>')
    con = p_class.sub('<p>', con)

    # remove empty p tags
    con = con.replace(r'<p></p>', '')

    # unwrap p tags that only wrap an img
    imgs = re.findall(r'<img[^>]+>', con, re.S)
    for img in imgs:
        try:
            con = con.replace('<p>' + img + '</p>', img)
        except Exception as e:
            pass

    # clean up the trailing tag line at the end of CRI Online (国际在线) articles
    p_last = re.compile(r'>标签:.*')
    con = p_last.sub('>', con)

    return con
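A hedged usage sketch of the pipeline above (the markup is invented; BeautifulSoup and the remove_tags helper are assumed to be imported in the original module):

raw = ('<div class="article"><p class="lead">Some text</p>'
       '<img data-original="a.jpg" alt="x"><script>track()</script></div>')
print(cleaner(raw))
# expected shape: only bare <p> and <img src="..."> tags remain; note that the
# pipeline also strips every space and newline, which suits the Chinese-language
# pages it was written for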
Esempio n. 52
0
def clean(content, title=None):
    content = content.decode("utf-8")

    # We're parsing the content html twice!
    # TODO: This one can probably be removed
    # LXML parsing is used to get title and meta head info from HTML
    html_doc = html.fromstring(content,
                               parser=html.HTMLParser(encoding="utf-8"))
    head_doc = html_doc.find('head')

    reconstructed_body = "<html><body>" + content + "</body></html>"

    # Get title so it can be added as an H1 tag, but remove it from
    # the html itself - so that Pandoc doesn't use it
    if not title:
        title = html_doc.find('.//title')
        title.getparent().remove(title)
        title = title.text_content()
        title = title[:title.rfind('-')]

    # Add in the title
    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + title + "</h1>")

    # Remove stuff that readability didn't remove
    doc = html.fromstring(reconstructed_body)

    # Use lxml's cleaner to remove all useless tags
    # (currently, this removes styles, even when not asked to)
    cleaner = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        links=True,
        forms=True,
        annoying_tags=True,
        style=True,
        inline_style=False,
    )
    doc = cleaner.clean_html(doc)

    body_doc = doc.find('body')

    bad_body_xpaths = [
        "//nav",
        "//footer",
        "//button",
        "//form[@id='interview_experience_form']",
        "//div[@id='author']",
        "//div[@id='video']",
        "//div[@id='share-buttons']",
        "//div[@id='ide_link']",
        "//div[@id='disqus_thread']",
        "//div[@id='secondary']",
        "//div[@id='personalNoteDiv']",
        "//div[@id='practiceLinkDiv']",
        "//div[@class='leftSideBarParent']",
        "//div[@class='author_info_box']",
        "//div[@class='plugins']",
        "//div[@class='no-p-tag']",
        "//div[@class='comments-main']",
        "//ins[@class='adsbygoogle']",
        "//h3",
        "//h1[@class='entry-title']",
        "//h2[not(@class='tabtitle')]",
        "//hr",

        # This requires XPath 2.0
        # "//a[ends-with(@href, 'sudo-gate')]",
        "//a[contains(@href, 'sudo-gate')]",
        "//p[contains(., '*****@*****.**')]",
        "//p[starts-with(., 'Please write comments if you find')]",
    ]

    bad_parent_xpaths = [
        "//h2[starts-with(text(), 'Recommended')]",
    ]

    # This one has to be removed first, so h2's parent can die!
    remove_xpaths(body_doc, bad_parent_xpaths, parent=True)
    remove_xpaths(body_doc, bad_body_xpaths)

    # Convert all language tags to p tags
    # H1 is used only for post title
    for lang_h1 in body_doc.xpath("//h2[@class='tabtitle']"):
        lang_p = '<p><strong>%s</strong></p>' % lang_h1.text_content()
        lang_h1.addnext(lxml.etree.XML(lang_p))
        lang_h1.getparent().remove(lang_h1)

    # Not too sure if this is needed - but at this point
    # I don't want to remove any code that works
    for pre_tag in body_doc.xpath("//pre"):
        if 'class' in pre_tag.attrib:
            pre_tag.attrib.pop('class')
        if 'title' in pre_tag.attrib:
            pre_tag.attrib.pop('title')

    try:
        # Add Source link to doc - this may fail for various reasons
        src_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
            'content')  # noqa
        src_link = "<p><a href='" + src_url + "' rel='tag'>" + src_url + "</a></p>"  # noqa
        post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
        post_content_doc.append(lxml.etree.XML("<h3>Source</h3>"))
        post_content_doc.append(lxml.etree.XML(src_link))
    except:  # noqa
        pass

    # Code in the HTML is in the form of a table
    # We convert the table into a single pre / code tag
    for code_tag in body_doc.xpath('//div[starts-with(@id,"highlighter")]'):
        code = str(code_tag.text_content()).replace("\n\n", "")
        code = html_escape(code)
        code = "<pre> <code>" + code + "</code> </pre>"
        code_tag.addnext(lxml.etree.XML(code))
        code_tag.getparent().remove(code_tag)

    result = html.tostring(body_doc).decode("utf-8")

    return result
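A hedged sketch of running the cleaner above on a fetched page; the URL is a placeholder, requests is assumed to be imported, and the remove_xpaths and html_escape helpers are assumed to be defined in the same module:

resp = requests.get('https://example.com/some-article')   # placeholder URL
cleaned = clean(resp.content, title='Some Article')        # bytes in, cleaned <body> HTML out
print(cleaned[:500])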
Esempio n. 53
0
    def parse_article(self, response):
        due_date = response.meta['due_date']
        last_update = response.meta['last_update']

        lines = response.css('.center .MsoNormal')

        pattern = re.compile(r'^(\d{6})\s*(\D+)$')
        html = response.css('.xilan_con').extract_first()
        cleaner = Cleaner(page_structure=False, style=True)
        html = cleaner.clean_html(html)
        html = lxml.html.fragment_fromstring(html)
        text = lxml.html.tostring(html, method='text', encoding='unicode')
        text = text.replace('代码', '', 1).replace('名称', '', 1)
        text = ''.join(text.split())
        parts = re.split(r'(\d{6})', text)
        # print('text:', parts[:30])
        i = 0
        count = len(parts)

        provinces = []
        province_count = 0
        city_count = 0
        district_count = 0
        while i < count:
            part = parts[i]
            if not part:
                i += 1
                continue

            if part.isdigit():
                code = part

                i += 1
                area = parts[i]

                if code.endswith('0000'):
                    a = Province(code=code, name=area)
                    provinces.append(a)
                    province_count += 1
                elif code.endswith('00'):
                    a = City(code=code, name=area)
                    province = provinces[-1]
                    cities = province.get('cities') or []
                    cities.append(a)
                    province['cities'] = cities
                    city_count += 1
                else:
                    a = District(code=code, name=area)

                    city = provinces[-1]['cities'][-1]
                    districts = city.get('districts') or []
                    districts.append(a)
                    city['districts'] = districts
                    district_count += 1
            else:
                raise CloseSpider('invalid administrative division code: %s' % (part, ))

            i += 1

        print('(updated %s) county-level and above administrative division '
              'codes as of %s: collected %d provinces, %d cities, %d districts' %
              (last_update.strftime('%Y-%m-%d'), due_date.strftime('%Y-%m-%d'),
               province_count, city_count, district_count))

        yield Areas(due_date=due_date,
                    last_update=last_update,
                    provinces=provinces)
Esempio n. 54
0
class Session(object):
    def __init__(self, encoding='utf8'):
        # Session object reused across requests, so the headers are set once
        # instead of being passed to every request separately.
        self.session = requests.Session()
        self.session.headers.update({
            'Accept'            : 'text/html,application/xhtml+xml,'\
                    'application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding'   : 'gzip, deflate',
            'Accept-Language'   : 'pl,en-US;q=0.7,en;q=0.3',
            'Cache-Control'     : 'max-age=0',
            'Connection'        : 'keep-alive',
            #'Host'              : 'www.krs-online.com.pl',
            'User-Agent'        : 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) '\
                    'Gecko/20100101 Firefox/28.0',
            #'Referer'           : 'http://www.krs-online.com.pl/muzeum-slaska-opolskiego-krs-1260077.html',
            #'Cookie'            : 'krs_fk45=h5mfc4oblmd1e1nokkpu4694e5; krs_cookie_accepted=true',
            #'DNT'               : '1',
            })

        self.parser = HTMLParser(encoding=encoding)
        self.cleaner = Cleaner(
            # remove scripts, styles and comments
            scripts=True,
            javascript=True,
            comments=True,
            style=True,
            # keep head and body
            page_structure=False)

    def get_session(self):
        return self.session

    def clean(self, dirty_text):
        return self.cleaner.clean_html(dirty_text)

    def get(self, address, params={}):
        response = self.session.get(address, params=params)
        response.raise_for_status()

        return response.text

    def parse(self, raw_text):
        return fromstring(raw_text, parser=self.parser)

    def get_site(self, address, params={}):
        text = self.get(address, params)
        text = self.clean(text)
        text = self.parse(text)

        return text

    def post(self, address, params={}):
        response = self.session.post(address, data=params)
        response.raise_for_status()
        return response.text

    def post_to_site(self, address, params={}):
        text = self.post(address, params)
        text = self.clean(text)
        text = self.parse(text)
        return text
Esempio n. 55
0
def cleanup(data, tags):
    cleaner = Cleaner(remove_tags=tags)
    clean = cleaner.clean_html(data)
    root = lxml.html.fromstring(clean)
    return root
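A small, hedged check of the cleanup helper above (markup invented); Cleaner's remove_tags unwraps the listed tags instead of deleting their content:

root = cleanup('<div><span>keep this text</span><footer>and this too</footer></div>',
               tags=['span', 'footer'])
print(lxml.html.tostring(root).decode())
# the span/footer wrappers are gone, but their text stays inside the <div>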
Esempio n. 56
0
def clean_html(text):
    cleaner = Cleaner(style=False)
    return cleaner.clean_html(text)
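A quick check of the helper above (markup invented): with style=False the <style> element is left in place, while scripts are still stripped by the Cleaner defaults.

print(clean_html('<div><style>p {color: red}</style><script>track()</script><p>hi</p></div>'))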
Esempio n. 57
0
    def handle(self, *args, **options):
        individuals = Individual.objects.all()
        for y1, y2 in year_ranges:
            url = url_pattern % (y1, y2, y1, y2)
            r = requests.get(url)
            r.encoding = "utf-8"
            output = r.text
            root = etree.HTML(output)
            dates = [
                d.text for d in root.xpath(
                    "//h2[@class=\"h3_style\"]/a[contains(@href,\"agenda\")]")
            ]
            tables = root.xpath("//table[@class=\"interlaced\"]")
            if len(dates) != len(tables):
                raise Exception("Dates and Questions Mismatch! %d <> %d" %
                                (len(dates), len(tables)))

            for i in range(0, len(dates)):
                date = datetime.strptime(dates[i], '%d.%m.%Y')
                print(date)
                table = tables[i]
                for row in table.xpath(".//tr")[1:]:
                    cells = row.xpath("td")
                    if all_text(cells[3]).strip() == '-':
                        continue
                    legislator_name = cells[1].text
                    if legislator_name.startswith(u"郭偉强"):
                        legislator_name = u"郭偉強"
                    title = all_text(cells[2])
                    question_type_text = all_text(cells[0])
                    individual = None
                    for p in individuals:
                        if legislator_name.startswith(p.name_ch):
                            individual = p
                            break
                    if individual is None:
                        print(legislator_name)
                        raise Exception("Individual not found. ",
                                        legislator_name)
                    link = cells[3].xpath(".//a")[0].attrib['href']
                    key = str(md5.new(link).hexdigest())
                    m = re.match(r"(.*[0-9]+|UQ)[\(]{0,1}(.*)\)",
                                 question_type_text)
                    if m is None:
                        raise Exception("Undefined Question Type", link,
                                        question_type_text)
                    question_type = m.group(2)
                    detail_r = requests.get(link)
                    detail_r.encoding = "big5"
                    output = detail_r.text
                    cleaner = Cleaner(comments=False)
                    output = cleaner.clean_html(output)
                    detail_root = etree.HTML(output)
                    try:
                        press_release = all_text(
                            detail_root.xpath("//div[@id=\"pressrelease\"]")
                            [0])
                    except IndexError:
                        detail_r = requests.get(link)
                        detail_r.encoding = "utf-8"
                        output = detail_r.text
                        output = cleaner.clean_html(output)
                        detail_root = etree.HTML(output)
                        press_release = all_text(
                            detail_root.xpath("//span[@id=\"pressrelease\"]")
                            [0])
                    question_start = press_release.find(u'以下')
                    reply_start = press_release.rfind(u'答覆:')
                    question_text = press_release[question_start:reply_start]
                    answer_text = press_release[reply_start + 3:]
                    #print(question_text)
                    #print(answer_text)
                    #print link
                    #print date
                    #print individual.name_en
                    #print key
                    #print question_type
                    question = Question()
                    question.key = key
                    question.individual = individual
                    question.date = date
                    question.question_type = question_type
                    question.question = question_text
                    question.answer = answer_text
                    question.title = title
                    question.link = link
                    question.title_ch = title
                    try:
                        question.save()
                    except IntegrityError:
                        print("%s %s already exists" % (str(date), title))
Esempio n. 58
0
 try:
     email = kinoroot.cssselect("div.fliesstext a")[0]
 except:
     continue
 try:
     url = kinoroot.cssselect("div.fliesstext a")[1]
 except:
     continue
 # print lxml.html.tostring(anschrift1)
 data = {
     'kinoname':
     kinoname,
     'anschrift1':
     re.sub(
         '<\/{0,1}div>', ' ',
         cleaner.clean_html(
             lxml.html.tostring(anschrift1, encoding=unicode))),
     'anschrift2':
     re.sub(
         '<\/{0,1}div>', ' ',
         cleaner.clean_html(
             lxml.html.tostring(anschrift2, encoding=unicode))),
     'anschrift3':
     re.sub(
         '<\/{0,1}div>', ' ',
         cleaner.clean_html(
             lxml.html.tostring(anschrift3, encoding=unicode))),
     'anschrift4':
     re.sub(
         '<\/{0,1}div>', ' ',
         cleaner.clean_html(
             lxml.html.tostring(anschrift4, encoding=unicode))),
Esempio n. 59
0
        <head>
            <title>Test</title>
        </head>
        <body>
        hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
            <h2>Hello World</h2>
            <p>hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh</p>
            <a><img src="/test.png"></img></a>
            hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
            <img src="/test.png"></img>
        </body>
    </html>
"""
    string = get_document("http://news.sina.com.cn/c/nd/2016-04-06/doc-ifxrcizs6891671.shtml")
    allow_tags = ("b", "blod", "big", "em", "font", "h1", "h2", "h3", "h4",
                  "h5", "h6", "i", "italic", "small", "strike", "sub",
                  "a", "p", "strong", "div", "img", "tt", "u", "html",
                  "meta", "body", "head", "br", "sup", "title", "article")
    cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                      style=True, links=True, meta=False,
                      add_nofollow=False, page_structure=False,
                      processing_instructions=True, embedded=False,
                      frames=False, forms=False, annoying_tags=False,
                      remove_tags=None, remove_unknown_tags=False,
                      safe_attrs_only=False, allow_tags=allow_tags)
    string = cleaner.clean_html(string)
    extract(string=string)



Esempio n. 60
0
#!/usr/bin/python3
import sys

sys.path.append('..')
from app import db
from app.models import Review
from lxml.html.clean import Cleaner

review = Review.query.order_by(Review.id).first()
while review is not None:
    print(review.id)
    cleaner = Cleaner(safe_attrs_only=False, style=False)
    new_content = cleaner.clean_html(review.content)
    if new_content != review.content:
        print('=======')
        print(review.content)
        print('-------')
        print(new_content)
        print('=======')
        review.content = new_content
        db.session.commit()

    review = Review.query.filter(Review.id > review.id).order_by(
        Review.id).first()