def store_revisions(self, page_url):
    """
    Retrieve and store every revision of a given Wikipedia page.

    Parameters:
    - page_url: a Wikipedia page URL

    Side effects: writes one document per revision into the MongoDB
    dataset and reports progress through self.update_state
    (Celery-style task progress).
    """
    p = Page()
    # NOTE(review): assumes a module-level mongodb_host is in scope -- confirm
    d = Dataset("%s:27017" % (mongodb_host))
    title = url2title(page_url)
    lang = url2lang(page_url)
    p.fetch_from_api_title(title, lang=lang)
    revisions = p.get_all_editors()
    # hoist len() out of the loop; it was recomputed on every iteration
    total = len(revisions)
    # enumerate(..., 1) replaces the hand-rolled counter and keeps the
    # progress numbers identical to the original (1-based)
    for current, revision in enumerate(revisions, 1):
        # ex: en/crimea/revision/999999
        key = "%s/%s/revision/%s" % (lang, title, revision["revid"])
        # fetch the single revision (with content) from the API
        value = p.get_revisions(extra_params={
            "rvstartid": revision["revid"],
            "rvlimit": 1})
        # write it into the database handler
        d.write(key, value)
        self.update_state(
            state='PROGRESS',
            meta={'current': current, 'total': total})
def get_page_info(url, length, index): # print "" # print url title = url2title(url) lang = url2lang(url) print "[%s] %s (%s/%s)" % (lang, title, index, length) wp = Page() # print wp if (lang != "www"): r = wp.fetch_from_api_title(title, lang=lang) file = "dataset/%s.info.json" % (wp.page_id) if not os.path.isfile(file): with open(file, "w") as out: data = { "edits": wp.get_all_editors(), "langs": wp.get_langlinks() } json.dump(data, out)
def dataset_timeline(url): print "timeline: %s" % (url) d = Dataset( "%s:27017" % (mongodb_host) ) title = url2title(url) lang = url2lang(url) url = "%s/%s" % (lang, title) regex_string = "%s\/%s\/revision/([0-9]*$)" % (lang, title) r = d.find({ "url" : { "$regex" : regex_string } }, { "dataset.timestamp" : 1, "dataset.revid" : 1 }) timeline = [] for result in r: i = { "timestamp": result["dataset"][0]["timestamp"], "revid": result["dataset"][0]["revid"] } timeline.append(i) timeline = sorted( timeline, key=lambda rev: rev["timestamp"]) print "start: %s" % (timeline[0]) print "end: %s" % (timeline[-1]) k = "%s/%s/timeline" % (lang, title) d.delete(k) d.write(k, timeline) print r.count()
p_lang.fetch_from_api_title(l["*"], lang=l["lang"]) print u" → [%s] %s (%s)" % (l["lang"], p_lang.title, p_lang.url) out.write(p_lang.url+"\n") out.close() def write_revision(rev_id, file): rev_with_content = p.get_revisions(extra_params={ "rvstartid": rev_id, "rvlimit" : 1}) with open(file, "w") as f: json.dump(rev_with_content, f) with open(source_ext, "r") as file: for l in file: lang = url2lang(l) p = Page() r = p.fetch_from_api_title(url2title(l.strip()), lang=lang) print "" print u"📖 [%s] %s" % (lang, p.title) revisions = p.get_all_editors() revisions_downloaded = 0 # print revisions[0:10] print u" 🔨 revisions: %s" % (len(revisions)) # revs = p.get_revisions(extra_params={ "rvstartid": revisions[0]["revid"], "rvendid": revisions[-1]["revid"] })