# -*- coding: utf-8 -*-
import codecs
import json
import os
import urllib

# Page, Dataset, url2title, url2lang and mongodb_host come from the
# project's own modules and are assumed to be importable in these snippets.

def fetch_page(source):
    print "📄 fetching: %s" % source.encode('utf-8-sig')
    p = Page()
    r = p.fetch_from_api_title(source.strip(), {
        "redirects": "true",
        "rvparse": "true",
        "prop": "info|revisions",
        "inprop": "url",
        "rvprop": "content"
    })
    with open("path_points/%s.json" % (source), "w") as f:
        json.dump(r, f)
    with codecs.open("_path_points/%s.md" % (source), "w", "utf-8-sig") as md:
        # write every page property except the revision body as YAML front matter
        md.write("---\n")
        result = r["query"]["pages"][r["query"]["pages"].keys()[0]]
        for k in result.keys():
            if k != "revisions":
                content = result[k]
                if k == "fullurl":
                    content = urllib.unquote(content)
                md.write("%s: %s\n" % (k, unicode(content)))
        md.write("---\n\n")
        # then append the (parsed) content of the latest revision
        if "revisions" in result:
            md.write(result["revisions"][0]["*"])
def get_page_info(url, length, index):
    title = url2title(url)
    lang = url2lang(url)
    print "[%s] %s (%s/%s)" % (lang, title, index, length)
    wp = Page()
    if lang != "www":
        wp.fetch_from_api_title(title, lang=lang)
        # cache the editor and language-link info once per page id
        path = "dataset/%s.info.json" % (wp.page_id)
        if not os.path.isfile(path):
            with open(path, "w") as out:
                data = {
                    "edits": wp.get_all_editors(),
                    "langs": wp.get_langlinks()
                }
                json.dump(data, out)
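# Usage sketch (hedged): get_page_info takes length/index only for the
# progress printout; the URL list below is invented for illustration.
urls = ["https://en.wikipedia.org/wiki/Crimea",
        "https://fr.wikipedia.org/wiki/Crim%C3%A9e"]
for i, u in enumerate(urls):
    get_page_info(u, len(urls), i + 1)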
def store_revisions(self, page_url):
    """
    Retrieve all the revisions of a given wikipedia page_url.

    parameters:
    - page_url: a wikipedia page URL
    """
    p = Page()
    d = Dataset("%s:27017" % (mongodb_host))
    title = url2title(page_url)
    lang = url2lang(page_url)
    p.fetch_from_api_title(title, lang=lang)
    revisions = p.get_all_editors()
    i = 0
    for revision in revisions:
        i += 1
        # ex: en/crimea/revision/999999
        key = "%s/%s/revision/%s" % (lang, title, revision["revid"])
        # fetch the single revision from the API
        value = p.get_revisions(extra_params={
            "rvstartid": revision["revid"],
            "rvlimit": 1})
        # write it through the database handler
        d.write(key, value)
        self.update_state(
            state='PROGRESS',
            meta={'current': i, 'total': len(revisions)})
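# store_revisions calls self.update_state, the progress API of a bound
# Celery task. A minimal sketch of how such a task could be declared and
# polled, assuming Celery; the app name and broker URL are illustrative,
# not taken from the original code.
from celery import Celery

app = Celery("revisions", broker="redis://localhost:6379/0")

@app.task(bind=True)
def store_revisions(self, page_url):
    pass  # body as shown above; bind=True supplies `self` for update_state

# A caller can then poll the task's progress metadata:
result = store_revisions.delay("https://en.wikipedia.org/wiki/Crimea")
print result.state, result.info  # e.g. PROGRESS, {'current': 3, 'total': 120}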
def fetch_pageviews(source):
    # skip pages whose pageviews were already fetched
    if not os.path.exists("pageviews/%s.json" % (source)):
        print "📄 fetching pageviews: %s" % source.encode('utf-8-sig')
        p = Page()
        p.fetch_from_api_title(source.strip())
        r = p.get_pageviews()
        with codecs.open("pageviews/%s.json" % (source), "w", "utf-8-sig") as f:
            json.dump(r, f)
def fetch_page(source):
    # skip pages that are already cached on disk
    if os.path.exists("pages/%s.json" % (source)):
        return
    print "📄 fetching: %s" % source.encode('utf-8-sig')
    p = Page()
    r = p.fetch_from_api_title(source.strip(), {
        "redirects": "true",
        "rvparse": "true",
        "prop": "info|revisions",
        "inprop": "url",
        "rvprop": "content"
    })
    with codecs.open("pages/%s.json" % (source), "w", "utf-8-sig") as f:
        json.dump(r, f)
def sources_pages(sources):
    # collect the outgoing links of every source page, deduplicated
    pages = []
    for s in sources:
        print s.strip()
        w = Page()
        w.fetch_from_api_title(s.strip())
        pages.extend(w.get_links())
    return list(set(pages))
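# Usage sketch (hedged): the seed titles below are invented; sources_pages
# returns the deduplicated union of links found on each of them.
seeds = ["Crimea", "Sevastopol"]
pages = sources_pages(seeds)
print "%s unique linked pages" % len(pages)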
def fetch_revisions(source):
    # skip pages whose revision list is already cached
    if os.path.exists("revisions/%s.json" % (source)):
        return
    print "📄 fetching revisions: %s" % source.encode('utf-8-sig')
    p = Page()
    p.fetch_from_api_title(source.strip())
    r = p.get_all_editors()
    with codecs.open("revisions/%s.json" % (source), "w", "utf-8-sig") as f:
        json.dump(r, f)
def store_last_revisions(db_url):
    d = Dataset("%s:27017" % (mongodb_host))
    url = db_url.replace("/timeline", "")
    (lang, page) = url.split("/")
    p = Page()
    p.fetch_from_api_title(page, lang=lang)
    last_rev = p.get_revisions(extra_params={"rvlimit": 1})
    print "last revisions: %s" % (url.encode("utf8"))
    # read back only the most recently stored revision ($slice: -1)
    t = list(d.find({"url": db_url},
                    {"url": 1, "dataset": {"$slice": -1}}))
    # fetch everything between the last stored revision and the current one
    extra_params = {
        "rvstartid": t[0]["dataset"][0]["revid"],
        "rvendid": last_rev[0]["revid"],
        "rvdir": "newer"
    }
    print extra_params
    revs = p.get_revisions(extra_params=extra_params)
    print "%s new revisions since %s (%s)" % (
        len(revs), t[0]["dataset"][0]["timestamp"], t[0]["dataset"][0]["revid"])
    print "%s ----> %s" % (t[0]["dataset"][0]["timestamp"], last_rev[0]["timestamp"])
    for r in revs:
        key = "%s/%s/revision/%s" % (lang, page, r["revid"])
        value = [r]
        d.write(key, value)
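# The $slice: -1 projection above is plain MongoDB; a standalone sketch of
# the same query, assuming pymongo, with database/collection names
# ("wiki", "timelines") invented for illustration.
from pymongo import MongoClient

client = MongoClient("%s:27017" % mongodb_host)
doc = client.wiki.timelines.find_one(
    {"url": "en/crimea/timeline"},
    {"url": 1, "dataset": {"$slice": -1}})  # keep only the last array element
last = doc["dataset"][0]
print last["revid"], last["timestamp"]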
from multiprocessing.dummy import Pool as ThreadPool

users = {}
init()

extra_languages = ["en", "simple", "uk", "fr", "ru", "de"]
source_in = "data/in/wicrimea-seeds.txt"
source_ext = "data/out/wicrimea-seeds.extended.txt"

out = open(source_ext, "w")
with open(source_in, "r") as infile:
    for l in infile:
        p = Page()
        r = p.fetch_from_api_title(url2title(l.strip()))
        print ""
        print u"→ %s (%s)" % (p.title, l.strip())
        out.write(l.strip() + "\n")
        langs = p.get_langlinks()
        # use a distinct loop variable so the seed line `l` is not clobbered
        for link in langs:
            if link["lang"] in extra_languages:
                p_lang = Page()
                p_lang.fetch_from_api_title(link["*"], lang=link["lang"])
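# ThreadPool is imported above but never used in this excerpt, so the
# fetches are presumably parallelized elsewhere. A hedged sketch of how it
# could map fetch_page over the seed titles:
pool = ThreadPool(4)               # 4 worker threads
with open(source_in, "r") as infile:
    seeds = [l.strip() for l in infile]
pool.map(fetch_page, seeds)        # fetch_page as defined earlier
pool.close()
pool.join()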