def set_auto_slug(self):
    """Compute a slug for this post and store it in ``self.slug``.

    The base slug is generated from the title.  When that yields nothing
    usable, the publication date (converted with ``to_blog_timezone``)
    formatted as hour and minute is used instead.  The base is expanded
    with ``gen_timestamped_slug`` and, unless it already equals the
    current slug, incremented until no stored post uses it.
    """
    base = (gen_slug(self.title)
            or to_blog_timezone(self.pub_date).strftime('%H%M'))
    candidate = gen_timestamped_slug(base, self.content_type, self.pub_date)

    # The slug is already the one we would pick: leave it untouched.
    if candidate == self.slug:
        return

    def is_taken(value):
        # Autoflush is switched off for this lookup query.
        lookup = Post.query.autoflush(False).filter_by(slug=value)
        return lookup.limit(1).count()

    while is_taken(candidate):
        candidate = increment_string(candidate)
    self.slug = candidate
def parse_feed(fd):
    """Parse an extended WordPress RSS feed into a structure the general
    importer system can handle.

    :param fd: the feed source; handed unchanged to ``parse_broken_wxr``.
    :return: a `Blog` object built from the feed's title, link,
             description (``""`` if missing), language (``"en"`` if
             missing), tags, categories, posts and authors.

    Items that have neither encoded content nor a description are
    skipped.  Comments whose approval flag is ``"spam"`` are dropped.
    """
    tree = parse_broken_wxr(fd)

    authors = {}

    def get_author(name):
        # Authors are created on first use and shared between posts; a
        # missing or empty creator name yields no author.
        if not name:
            return None
        author = authors.get(name)
        if author is None:
            author = authors[name] = Author(name, None)
        return author

    tags = {}
    for item in tree.findall(WORDPRESS.tag):
        tag = Tag(item.findtext(WORDPRESS.tag_slug),
                  item.findtext(WORDPRESS.tag_name))
        tags[tag.name] = tag

    categories = {}
    for item in tree.findall(WORDPRESS.category):
        category = Category(item.findtext(WORDPRESS.category_nicename),
                            item.findtext(WORDPRESS.cat_name))
        categories[category.name] = category

    posts = []

    # Raw string: the pattern is unchanged, but no longer relies on
    # invalid string escapes (``\<``, ``\w``, ``\>``).
    clean_empty_tags = re.compile(r"\<(?P<tag>\w+?)\>[\r\n]?\</(?P=tag)\>")
    # Compiled once instead of on every item.
    more_marker = re.compile("<!--more ?.*?-->")

    for item in tree.findall("item"):
        if item.findtext(WORDPRESS.status) == "draft":
            status = STATUS_DRAFT
        else:
            status = STATUS_PUBLISHED
        post_name = item.findtext(WORDPRESS.post_name)
        pub_date = parse_wordpress_date(
            item.findtext(WORDPRESS.post_date_gmt))
        content_type = {"post": "entry", "page": "page"}.get(
            item.findtext(WORDPRESS.post_type), "entry")

        # The status only decides whether a slug is generated; it is not
        # passed on to the Post itself.  Without a date or a post name
        # the item is treated as a draft and gets no slug.
        slug = None
        if pub_date is None or post_name is None:
            status = STATUS_DRAFT
        if status == STATUS_PUBLISHED:
            slug = gen_timestamped_slug(post_name, content_type, pub_date)

        # Store WordPress comment ids mapped to Comment objects.  A
        # parent is only resolved if it appeared earlier in the item.
        comments = {}
        for x in item.findall(WORDPRESS.comment):
            approved = x.findtext(WORDPRESS.comment_approved)
            if approved == "spam":
                continue
            parent = comments.get(x.findtext(WORDPRESS.comment_parent))
            commentobj = Comment(
                x.findtext(WORDPRESS.comment_author),
                x.findtext(WORDPRESS.comment_content),
                x.findtext(WORDPRESS.comment_author_email),
                x.findtext(WORDPRESS.comment_author_url),
                parent,
                parse_wordpress_date(
                    x.findtext(WORDPRESS.comment_date_gmt)),
                x.findtext(WORDPRESS.comment_author_ip),
                "html",
                # WordPress names these types "pingback" and "trackback".
                x.findtext(WORDPRESS.comment_type)
                in ("pingback", "trackback"),
                (COMMENT_MODERATED if approved == "1"
                 else COMMENT_UNMODERATED),
            )
            comments[x.findtext(WORDPRESS.comment_id)] = commentobj

        post_body = item.findtext(CONTENT.encoded)
        post_intro = item.findtext("description")
        if post_intro and not post_body:
            # Only a description is available: use it as the body.
            post_body = post_intro
            post_intro = None
        elif post_body:
            # Split at the first "more" marker only, so that text after
            # any further marker stays part of the body.
            parts = more_marker.split(post_body, maxsplit=1)
            if len(parts) > 1:
                post_intro = clean_empty_tags.sub(
                    "", _wordpress_to_html(parts[0]))
                post_body = parts[1]
        else:
            # hmm. nothing to process. skip that entry
            continue

        post_body = clean_empty_tags.sub("", _wordpress_to_html(post_body))

        # NOTE(review): "comment_status" and "ping_status" are looked up
        # by plain name, unlike the WORDPRESS.* fields above -- confirm
        # the parsed tree exposes them under these names.
        post = Post(
            slug,
            item.findtext("title"),
            item.findtext("link"),
            pub_date,
            get_author(item.findtext(DC_METADATA.creator)),
            post_intro,
            post_body,
            [tags[x.text] for x in item.findall("tag")
             if x.text in tags],
            [categories[x.text] for x in item.findall("category")
             if x.text in categories],
            comments.values(),
            item.findtext("comment_status") != "closed",
            item.findtext("ping_status") != "closed",
            parser="html",
            content_type=content_type,
        )
        posts.append(post)

    return Blog(
        tree.findtext("title"),
        tree.findtext("link"),
        tree.findtext("description") or "",
        tree.findtext("language") or "en",
        tags.values(),
        categories.values(),
        posts,
        authors.values(),
    )
def parse_feed(fd):
    """Parse an extended WordPress RSS feed into a structure the general
    importer system can handle.

    :param fd: the feed source; handed unchanged to ``parse_broken_wxr``.
    :return: a `Blog` object built from the feed's title, link,
             description (``''`` if missing), language (``'en'`` if
             missing), tags, categories, posts and authors.

    Items that have neither encoded content nor a description are
    skipped.  Comments whose approval flag is ``'spam'`` are dropped.
    """
    tree = parse_broken_wxr(fd)

    authors = {}

    def get_author(name):
        # Authors are created on first use and shared between posts; a
        # missing or empty creator name yields no author.
        if not name:
            return None
        author = authors.get(name)
        if author is None:
            author = authors[name] = Author(name, None)
        return author

    tags = {}
    for item in tree.findall(WORDPRESS.tag):
        tag = Tag(item.findtext(WORDPRESS.tag_slug),
                  item.findtext(WORDPRESS.tag_name))
        tags[tag.name] = tag

    categories = {}
    for item in tree.findall(WORDPRESS.category):
        category = Category(item.findtext(WORDPRESS.category_nicename),
                            item.findtext(WORDPRESS.cat_name))
        categories[category.name] = category

    posts = []

    # Raw string: the pattern is unchanged, but no longer relies on
    # invalid string escapes (``\<``, ``\w``, ``\>``).
    clean_empty_tags = re.compile(r'\<(?P<tag>\w+?)\>[\r\n]?\</(?P=tag)\>')
    # Compiled once instead of on every item.
    more_marker = re.compile('<!--more ?.*?-->')

    for item in tree.findall('item'):
        if item.findtext(WORDPRESS.status) == 'draft':
            status = STATUS_DRAFT
        else:
            status = STATUS_PUBLISHED
        post_name = item.findtext(WORDPRESS.post_name)
        pub_date = parse_wordpress_date(
            item.findtext(WORDPRESS.post_date_gmt))
        content_type = {'post': 'entry', 'page': 'page'}.get(
            item.findtext(WORDPRESS.post_type), 'entry')

        # The status only decides whether a slug is generated; it is not
        # passed on to the Post itself.  Without a date or a post name
        # the item is treated as a draft and gets no slug.
        slug = None
        if pub_date is None or post_name is None:
            status = STATUS_DRAFT
        if status == STATUS_PUBLISHED:
            slug = gen_timestamped_slug(post_name, content_type, pub_date)

        # Store WordPress comment ids mapped to Comment objects.  A
        # parent is only resolved if it appeared earlier in the item.
        comments = {}
        for x in item.findall(WORDPRESS.comment):
            approved = x.findtext(WORDPRESS.comment_approved)
            if approved == 'spam':
                continue
            parent = comments.get(x.findtext(WORDPRESS.comment_parent))
            commentobj = Comment(
                x.findtext(WORDPRESS.comment_author),
                x.findtext(WORDPRESS.comment_content),
                x.findtext(WORDPRESS.comment_author_email),
                x.findtext(WORDPRESS.comment_author_url),
                parent,
                parse_wordpress_date(
                    x.findtext(WORDPRESS.comment_date_gmt)),
                x.findtext(WORDPRESS.comment_author_ip),
                'html',
                # WordPress names these types 'pingback' and 'trackback'.
                x.findtext(WORDPRESS.comment_type)
                in ('pingback', 'trackback'),
                (COMMENT_MODERATED if approved == '1'
                 else COMMENT_UNMODERATED)
            )
            comments[x.findtext(WORDPRESS.comment_id)] = commentobj

        post_body = item.findtext(CONTENT.encoded)
        post_intro = item.findtext('description')
        if post_intro and not post_body:
            # Only a description is available: use it as the body.
            post_body = post_intro
            post_intro = None
        elif post_body:
            # Split at the first 'more' marker only, so that text after
            # any further marker stays part of the body.
            parts = more_marker.split(post_body, maxsplit=1)
            if len(parts) > 1:
                post_intro = clean_empty_tags.sub(
                    '', _wordpress_to_html(parts[0]))
                post_body = parts[1]
        else:
            # hmm. nothing to process. skip that entry
            continue

        post_body = clean_empty_tags.sub('', _wordpress_to_html(post_body))

        # NOTE(review): 'comment_status' and 'ping_status' are looked up
        # by plain name, unlike the WORDPRESS.* fields above -- confirm
        # the parsed tree exposes them under these names.
        post = Post(
            slug,
            item.findtext('title'),
            item.findtext('link'),
            pub_date,
            get_author(item.findtext(DC_METADATA.creator)),
            post_intro,
            post_body,
            [tags[x.text] for x in item.findall('tag')
             if x.text in tags],
            [categories[x.text] for x in item.findall('category')
             if x.text in categories],
            comments.values(),
            item.findtext('comment_status') != 'closed',
            item.findtext('ping_status') != 'closed',
            parser='html',
            content_type=content_type
        )
        posts.append(post)

    return Blog(
        tree.findtext('title'),
        tree.findtext('link'),
        tree.findtext('description') or '',
        tree.findtext('language') or 'en',
        tags.values(),
        categories.values(),
        posts,
        authors.values()
    )