# NOTE(review): this physical line looks like a whitespace-collapsed script
# fragment (TODO confirm against the original file). Read as separate
# statements it contains:
#   1. The tail of a function whose `def` line is not visible here. It takes
#      the first result of doc.find_class('snap_preview'), stores
#      html.tostring(snap_preview) in post.Contents and returns `post`.
#      The removal of a trailing 'possibly-related' div and the parsing of
#      the comment count are commented out.
#   2. An `if __name__ == "__main__":` driver that constructs
#      Blog(filepath="dorait.blog", createnew=False) and iterates over
#      sitemap.parse_sitemap(...) for krishashok.wordpress.com. Entries whose
#      `loc` does not contain '/20' are skipped. For each remaining entry
#      where blog.has_key(ui.loc) is false it calls post_from_page(ui.loc),
#      sets PostedAt from ui.lastmod, calls blog.persist(ui.loc, p) and
#      prints "Done <loc>"; otherwise it prints "Skipped <loc>".
# As written on a single line, everything after the first `#` is a comment to
# Python, so only the `snap_preview = ...` assignment would actually execute.
# The `print "..."` statements use Python 2 syntax.
# NOTE(review): the store file is "dorait.blog" while the sitemap URL is for
# krishashok.wordpress.com -- confirm this pairing is intended.
snap_preview = doc.find_class('snap_preview')[0] # lastnode = snap_preview[-1] # if lastnode.tag == 'div' and lastnode.get('class') == 'possibly-related': # snap_preview.remove(lastnode) post.Contents = html.tostring(snap_preview) # commentstring = doc.get_element_by_id('comments').text.split()[0] # if commentstring == "No": # post.Comments = 0 # else: # post.Comments = int(commentstring) return post if __name__ == "__main__": blog = Blog(filepath="dorait.blog", createnew=False) for ui in sitemap.parse_sitemap("http://krishashok.wordpress.com/sitemap.xml"): if not '/20' in ui.loc: #guards against non-post pages. Needs to be more robust #But okay for now! continue if not blog.has_key(ui.loc): p = post_from_page(ui.loc) p.PostedAt = ui.lastmod blog.persist(ui.loc, p) print "Done %s" % ui.loc else: print "Skipped %s" % ui.loc
# NOTE(review): this physical line looks like a whitespace-collapsed script
# fragment (TODO confirm against the original file). Read as separate
# statements it contains:
#   1. The tail of a function whose `def` line is not visible here. It takes
#      the first result of doc.find_class('snap_preview') and looks at its
#      last child; if that child is a 'div' whose class attribute equals
#      'possibly-related' it is removed. html.tostring(snap_preview) is
#      stored in post.Contents. The first whitespace-separated word of the
#      text of the element with id 'comments' is then read: "No" sets
#      post.Comments to 0, anything else is passed to int(). Finally `post`
#      is returned.
#   2. An `if __name__ == "__main__":` driver that constructs
#      Blog(filepath="dorait.blog", createnew=False) and iterates over
#      sitemap.parse_sitemap(...) for dorai.wordpress.com. Entries whose
#      `loc` does not contain '/20' are skipped. For each remaining entry
#      where blog.has_key(ui.loc) is false it calls post_from_page(ui.loc),
#      sets PostedAt from ui.lastmod, calls blog.persist(ui.loc, p) and
#      prints "Done <loc>"; otherwise it prints "Skipped <loc>".
# As written on a single line the statements run together without separators,
# so this is not valid Python until the line breaks are restored.
# The `print "..."` statements use Python 2 syntax.
snap_preview = doc.find_class('snap_preview')[0] lastnode = snap_preview[-1] if lastnode.tag == 'div' and lastnode.get('class') == 'possibly-related': snap_preview.remove(lastnode) post.Contents = html.tostring(snap_preview) commentstring = doc.get_element_by_id('comments').text.split()[0] if commentstring == "No": post.Comments = 0 else: post.Comments = int(commentstring) return post if __name__ == "__main__": blog = Blog(filepath="dorait.blog", createnew=False) for ui in sitemap.parse_sitemap("http://dorai.wordpress.com/sitemap.xml"): if not '/20' in ui.loc: #guards against non-post pages. Needs to be more robust #But okay for now! continue if not blog.has_key(ui.loc): p = post_from_page(ui.loc) p.PostedAt = ui.lastmod blog.persist(ui.loc, p) print "Done %s" % ui.loc else: print "Skipped %s" % ui.loc
# NOTE(review): this physical line looks like a whitespace-collapsed script
# fragment (TODO confirm against the original file). Read as separate
# statements it contains:
#   1. The tail of a function whose `def` line is not visible here. When
#      `categories` is truthy, post.Categories is set to the `.text` of each
#      item. The first result of doc.find_class('snap_preview') is taken,
#      html.tostring(snap_preview) is stored in post.Contents, and `post` is
#      returned. The parsing of the comment count is commented out.
#      Presumably only the Categories assignment was nested under
#      `if categories:` -- the original indentation cannot be recovered from
#      this line, verify before reformatting.
#   2. An `if __name__ == "__main__":` driver that constructs
#      Blog(filepath="mona.blog", createnew=False) and iterates over
#      sitemap.parse_sitemap(...) for pixelbits.wordpress.com. Entries whose
#      `loc` does not contain '/20' are skipped. For each remaining entry
#      where blog.has_key(ui.loc) is false it calls post_from_page(ui.loc),
#      sets PostedAt from ui.lastmod, calls blog.persist(ui.loc, p) and
#      prints "Done <loc>"; otherwise it prints "Skipped <loc>".
# As written on a single line the statements run together without separators
# (and everything after the first `#` is a comment), so this is not valid
# Python until the line breaks are restored.
# The `print "..."` statements use Python 2 syntax.
if categories: post.Categories = [c.text for c in categories] snap_preview = doc.find_class('snap_preview')[0] post.Contents = html.tostring(snap_preview) # commentstring = doc.get_element_by_id('comments').text.split()[0] # if commentstring == "No": # post.Comments = 0 # else: # post.Comments = int(commentstring) return post if __name__ == "__main__": blog = Blog(filepath="mona.blog", createnew=False) for ui in sitemap.parse_sitemap("http://pixelbits.wordpress.com/sitemap.xml"): if not '/20' in ui.loc: #guards against non-post pages. Needs to be more robust #But okay for now! continue if not blog.has_key(ui.loc): p = post_from_page(ui.loc) p.PostedAt = ui.lastmod blog.persist(ui.loc, p) print "Done %s" % ui.loc else: print "Skipped %s" % ui.loc