import json
import os

# url2title, url2lang and Page are assumed to come from the project's
# wiki helper module, as in the other examples on this page.

def get_page_info(url, length, index):
	# Derive the page title and language code from the URL.
	title = url2title(url)
	lang = url2lang(url)

	print "[%s] %s (%s/%s)" % (lang, title, index, length)

	wp = Page()

	# Skip portal URLs (www.wikipedia.org) that carry no language edition.
	if lang != "www":
		wp.fetch_from_api_title(title, lang=lang)

		path = "dataset/%s.info.json" % wp.page_id

		# Only write the info file once per page id.
		if not os.path.isfile(path):
			with open(path, "w") as out:
				data = {
					"edits": wp.get_all_editors(),
					"langs": wp.get_langlinks()
				}

				json.dump(data, out)
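
A minimal usage sketch for get_page_info: the seeds file path is borrowed from Example #2 below, and treating index as 1-based is an assumption based on the "(index/length)" progress line.

# Seeds file path taken from Example #2; 1-based index is an assumption.
with open("data/in/wicrimea-seeds.txt") as f:
	urls = [line.strip() for line in f]

for i, url in enumerate(urls, 1):
	get_page_info(url, len(urls), i)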
Example #2
# -*- coding: utf-8 -*-

source_in = "data/in/wicrimea-seeds.txt"
source_ext = "data/out/wicrimea-seeds.extended.txt"

# extra_languages is assumed to be defined earlier in the script,
# e.g. a list of language codes such as ["ru", "uk", "de"].

out = open(source_ext, "w")

with open(source_in, "r") as f:
  for l in f:
    p = Page()
    p.fetch_from_api_title(url2title(l.strip()))

    print ""
    print u"→ %s (%s)" % (p.title, l.strip())

    out.write(l.strip() + "\n")

    # Follow the interlanguage links of the seed page and keep only
    # the versions in the configured extra languages.
    langs = p.get_langlinks()

    for link in langs:
      if link["lang"] in extra_languages:
        p_lang = Page()
        p_lang.fetch_from_api_title(link["*"], lang=link["lang"])

        print u"   → [%s] %s (%s)" % (link["lang"], p_lang.title, p_lang.url)
        out.write(p_lang.url + "\n")

out.close()

Example #3

# p is assumed to be a Page object created earlier in the surrounding script.
def write_revision(rev_id, file):
  # Fetch the single revision identified by rev_id, including its content.
  rev_with_content = p.get_revisions(extra_params={"rvstartid": rev_id, "rvlimit": 1})