def dataset_blocks(url):
    """Parse the stored page for *url* into blocks and save the result.

    The page is read from a ``Dataset`` opened on ``mongodb_host`` port
    27017, its ``page["dataset"][0]["*"]`` entry is handed to ``mw`` and
    the parsed text is split with ``get_blocks()``.

    The outcome is written back to the same dataset under the key
    ``"<url>/blocks"`` and also returned.

    Returns:
        A dict with the keys ``"structure"`` and ``"blocks"``, holding the
        two values produced by ``get_blocks()``.
    """
    print("blocks: %s" % (url))

    store = Dataset("%s:27017" % (mongodb_host))
    record = store.read(url)

    # NOTE(review): assumes the record keeps its wiki text under
    # ["dataset"][0]["*"] -- confirm against whatever writes these records.
    parsed = mw(record["dataset"][0]["*"])
    blocks, structure = parsed.get_blocks()

    payload = {"structure": structure, "blocks": blocks}
    store.write("%s/blocks" % (url), payload)
    return payload
from wekeypedia.parser.dataset import Dataset

# Placeholder content, parsed when no page/revision pair is passed on the
# command line.
mw_content = "yo"

# Only referenced by the commented-out debug dump further down.
pp = pprint.PrettyPrinter(indent=2)

# Usage: <script> PAGE REVISION
# With both arguments present, the content of that revision is fetched from
# the local dataset and replaces the placeholder.
# NOTE(review): the extent of this `if` body was reconstructed from a
# whitespace-collapsed source -- confirm that only these lines belong to it.
if len(sys.argv) > 2:
    d = Dataset("/Users/tk/datasets/wicrimea")
    page, revision = sys.argv[1:3]
    mw_content = d.get_revision_content(page, revision)

# print mw_content

txt = mw(mw_content)

# print unicode(txt.text)
# pp.pprint(txt.text.nodes)

# Heading outline: one title per line, indented by one space for every
# level beyond 2.
headings = txt.get_headings()
for h in headings:
    print((" " * (h.level - 2)) + str(h.title))

# Separator between the outline and the block dump.
print("—" * 10)

blocks = txt.get_blocks()
print(blocks)