Beispiel #1
0
 def test_xhtml_escape(self):
     """Check xhtml_escape and xhtml_unescape against a table of known pairs.

     Each table entry is ``(unescaped, escaped)``. For every entry the test
     asserts that escaping the first item yields the second and that
     unescaping the second item yields the first. Both sides of each
     comparison are passed through ``utf8()`` before being compared.
     """
     cases = (
         ("<foo>", "&lt;foo&gt;"),
         (u"<foo>", u"&lt;foo&gt;"),
         (b("<foo>"), b("&lt;foo&gt;")),
         ("<>&\"", "&lt;&gt;&amp;&quot;"),
         # An already-escaped ampersand must be escaped a second time.
         ("&amp;", "&amp;amp;"),
     )
     for raw, encoded in cases:
         # Forward direction: raw text -> entity-encoded text.
         forward = xhtml_escape(raw)
         self.assertEqual(utf8(forward), utf8(encoded))
         # Reverse direction: entity-encoded text -> raw text.
         backward = xhtml_unescape(encoded)
         self.assertEqual(utf8(raw), utf8(backward))
Beispiel #2
0
    def test_xhtml_escape(self):
        """Verify the escape/unescape round trip for a set of sample strings.

        The samples are ``(plain, entity_encoded)`` pairs. For each pair,
        ``xhtml_escape(plain)`` must equal ``entity_encoded`` and
        ``xhtml_unescape(entity_encoded)`` must equal ``plain``; every value
        is run through ``utf8()`` before the equality check.
        """
        samples = {
            "native": ("<foo>", "&lt;foo&gt;"),
            "unicode": (u"<foo>", u"&lt;foo&gt;"),
            "bytes": (b("<foo>"), b("&lt;foo&gt;")),
            "specials": ("<>&\"", "&lt;&gt;&amp;&quot;"),
            "double": ("&amp;", "&amp;amp;"),
        }
        # Fixed order so the pairs are checked in the same sequence each run.
        order = ("native", "unicode", "bytes", "specials", "double")
        for label in order:
            plain, entity_encoded = samples[label]
            self.assertEqual(utf8(xhtml_escape(plain)), utf8(entity_encoded))
            self.assertEqual(utf8(plain), utf8(xhtml_unescape(entity_encoded)))
Beispiel #3
0
def get_part_of_page(url, xpath="//div[contains(@class,'entrybody')]", charset='utf-8'):
	"""Return the first element of a page that matches an XPath expression.

	The page is loaded with ``get_page(url, charset)`` and parsed with
	``etree.HTMLParser``. The first XPath match is serialized with
	``etree.tostring(..., pretty_print=True)``, stripped of surrounding
	whitespace and passed through ``xhtml_unescape``.

	Args:
		url: Address handed to ``get_page``.
		xpath: XPath expression evaluated against the parsed document.
		charset: Character set handed to ``get_page``.

	Returns:
		The unescaped serialization of the first match, or ``None`` when
		``get_page`` returns a falsy value or the expression matches
		nothing (an error is logged in the latter case).
	"""
	page_contents = get_page(url, charset)
	if not page_contents:
		return None

	document = etree.parse(StringIO(page_contents), etree.HTMLParser())
	matches = etree.XPath(xpath)(document)

	if len(matches) == 0:
		logging.error('xpath expression "%s" returned nothing on "%s" - modify it', xpath, url)
		return None

	serialized = etree.tostring(matches[0], pretty_print=True)
	return xhtml_unescape(serialized.strip())
Beispiel #4
0
def get_part_of_page(url,
                     xpath="//div[contains(@class,'entrybody')]",
                     charset='utf-8'):
    """Extract one fragment of a fetched page, selected by XPath.

    Fetches the page through ``get_page(url, charset)``, parses it with
    ``etree.HTMLParser`` and evaluates ``xpath`` on the resulting tree.

    :param url: address passed on to ``get_page``
    :param xpath: XPath expression used to locate the wanted fragment
    :param charset: character set passed on to ``get_page``
    :return: the first match, serialized with ``pretty_print=True``,
        stripped and run through ``xhtml_unescape``; ``None`` if
        ``get_page`` returned a falsy value or nothing matched
    """
    html_source = get_page(url, charset)
    if not html_source:
        # Nothing was fetched, so there is nothing to parse.
        return None

    parsed = etree.parse(StringIO(html_source), etree.HTMLParser())
    selector = etree.XPath(xpath)
    found = selector(parsed)

    if not len(found) > 0:
        logging.error(
            'xpath expression "%s" returned nothing on "%s" - modify it',
            xpath, url)
        return None

    # Only the first match is used; any further matches are ignored.
    first_match = found[0]
    markup = etree.tostring(first_match, pretty_print=True).strip()
    return xhtml_unescape(markup)
Beispiel #5
0
			      c.comment_author as 'author', c.comment_author_email as 'email', c.comment_author_url as 'author_url',
			      c.comment_date as 'date', c.comment_content as 'content',
			      c.user_id > 0 as 'is_user',
			      CASE c.comment_type WHEN 'pingback' THEN 'pingback' ELSE 'comment' END as 'type',
			      p.post_name, p.post_date, p.guid as 'old_path'
			FROM {0}comments c
			     JOIN {0}posts p ON (c.comment_post_ID=p.ID)
			WHERE c.comment_approved='1'
			      AND p.post_type='post' AND p.post_status='publish'
			ORDER BY p.ID ASC, c.comment_date ASC""".format(wp_prefix))
    # """
    comments_by_id = dict()
    threads = OrderedDict()
    for row in cur:
        comment = dict(list(zip([c[0] for c in cur.description], row)))
        comment['content'] = xhtml_unescape(comment['content']).replace(
            '\r', '')
        comment['postfile_path'] = path.join(
            outdir, str(comment['post_date'].year),
            "%02d" % comment['post_date'].month,
            comment['post_name'] + '.comments')

        # author
        if comment['type'] == 'pingback':
            comment['title'] = xhtml_unescape(comment['author'])
            comment['source'] = comment['author_url']
            del comment['author_url']
            del comment['author']
            del comment['email']
            # pingback verification
            if check_pingbacks:
                logging.debug(
Beispiel #6
0
			      c.comment_author as 'author', c.comment_author_email as 'email', c.comment_author_url as 'author_url',
			      c.comment_date as 'date', c.comment_content as 'content',
			      c.user_id > 0 as 'is_user',
			      CASE c.comment_type WHEN 'pingback' THEN 'pingback' ELSE 'comment' END as 'type',
			      p.post_name, p.post_date, p.guid as 'old_path'
			FROM {0}comments c
			     JOIN {0}posts p ON (c.comment_post_ID=p.ID)
			WHERE c.comment_approved='1'
			      AND p.post_type='post' AND p.post_status='publish'
			ORDER BY p.ID ASC, c.comment_date ASC""".format(wp_prefix))
	# """
	comments_by_id = dict()
	threads = OrderedDict()
	for row in cur:
		comment = dict(zip([c[0] for c in cur.description], row))
		comment['content'] = xhtml_unescape(comment['content']).replace('\r', '')
		comment['postfile_path'] = path.join(outdir, str(comment['post_date'].year), "%02d" % comment['post_date'].month, comment['post_name']+'.comments')

		# author
		if comment['type'] == 'pingback':
			comment['title'] = xhtml_unescape(comment['author'])
			comment['source'] = comment['author_url']
			del comment['author_url']
			del comment['author']
			del comment['email']
			# pingback verification
			if check_pingbacks:
				logging.debug('about to load page "%s", which has been the source of a pingback', comment['source'])
				contents = get_page(comment['source'])
				old_path = '/'.join(['/', comment['old_path'].split('/', 3)[2], str(comment['post_date'].year), "%02d" % comment['post_date'].month, comment['post_name']])
				if contents and ('href="http:'+old_path in contents or 'href="https:'+old_path in contents):