Esempio n. 1
0
def get_page_info(url, length, index):
    """Fetch editor and language-link data for one page and cache it as JSON.

    The page is looked up through ``Page.fetch_from_api_title`` using the
    title and language derived from ``url``. The result is written to
    ``dataset/<page_id>.info.json`` with two keys: ``"edits"`` (from
    ``get_all_editors()``) and ``"langs"`` (from ``get_langlinks()``).
    Nothing is fetched or written when that file already exists, or when
    the language derived from the URL is ``"www"``.

    parameters:
      - url: page URL, passed to url2title / url2lang
      - length: total number of pages; only used in the progress message
      - index: position of this page; only used in the progress message
    """
    title = url2title(url)
    lang = url2lang(url)

    print("[%s] %s (%s/%s)" % (lang, title, index, length))

    wp = Page()

    # URLs whose language resolves to "www" are skipped entirely.
    if lang == "www":
        return

    wp.fetch_from_api_title(title, lang=lang)

    path = "dataset/%s.info.json" % (wp.page_id)
    if os.path.isfile(path):
        return

    # Collect everything BEFORE opening the file: if one of the API calls
    # raises, no empty file is left behind that would make the isfile()
    # check above skip this page forever on later runs.
    data = {
        "edits": wp.get_all_editors(),
        "langs": wp.get_langlinks(),
    }

    with open(path, "w") as out:
        json.dump(data, out)
Esempio n. 2
0
def store_revisions(self, page_url):
  """
  Retrieve every revision of the given Wikipedia page and store each one
  in the dataset.

  Each revision is written under the key "<lang>/<title>/revision/<revid>".
  After every write, progress is reported through self.update_state with
  state 'PROGRESS' and the current / total revision counts.

  parameters:
    - page_url: a wikipedia page URL
  """

  page = Page()

  dataset = Dataset("%s:27017" % (mongodb_host))

  title = url2title(page_url)
  lang = url2lang(page_url)

  page.fetch_from_api_title(title, lang=lang)

  revisions = page.get_all_editors()

  # start counting at 1 so 'current' is the number of revisions handled so far
  for done, revision in enumerate(revisions, 1):
    revid = revision["revid"]

    # ex: en/crimea/revision/999999
    key = "%s/%s/revision/%s" % (lang, title, revid)

    # ask for exactly one revision, starting at this revision id
    query = {"rvstartid": revid, "rvlimit": 1}
    value = page.get_revisions(extra_params=query)

    # persist it through the database handler, then report progress
    dataset.write(key, value)
    progress = {'current': done, 'total': len(revisions)}
    self.update_state(state='PROGRESS', meta=progress)
Esempio n. 3
0
def fetch_revisions(source):
  """
  Fetch the revision list of a page and cache it as a JSON file.

  The result of Page.get_all_editors() for the page titled `source` is
  written to "revisions/<source>.json". When that file already exists the
  function returns immediately without fetching anything.

  parameters:
    - source: page title; also used as the cache file name
  """
  path = "revisions/%s.json" % (source)

  if os.path.exists(path):
    return

  # Plain UTF-8 for the console message: 'utf-8-sig' would prepend a
  # byte-order mark to the title, i.e. in the middle of the printed line.
  print("📄  fetching revisions: %s" % source.encode('utf-8'))

  p = Page()
  # NOTE(review): the title is stripped for the API call but not for the
  # cache file name above -- confirm callers never pass surrounding whitespace.
  p.fetch_from_api_title(source.strip())
  revisions = p.get_all_editors()

  # The file encoding is kept as 'utf-8-sig' so files already written and
  # their readers stay compatible.
  with codecs.open(path, "w", "utf-8-sig") as f:
    json.dump(revisions, f)
Esempio n. 4
0
def write_revision(rev_id, file):
  """
  Fetch a single revision and dump it as JSON to the given path.

  Uses the module-level page object `p`; the revision is requested with
  "rvstartid" set to rev_id and "rvlimit" set to 1.

  parameters:
    - rev_id: id of the revision to fetch
    - file: path of the JSON file to write (overwritten if present)
  """
  query = {"rvstartid": rev_id, "rvlimit": 1}
  payload = p.get_revisions(extra_params=query)

  with open(file, "w") as out:
    json.dump(payload, out)

with open(source_ext, "r") as file:
  for l in file:
    lang = url2lang(l)
    p = Page()
    r = p.fetch_from_api_title(url2title(l.strip()), lang=lang)

    print ""
    print u"📖  [%s] %s" % (lang, p.title)

    revisions = p.get_all_editors()
    revisions_downloaded = 0

    # print revisions[0:10]

    print u"  🔨  revisions: %s" % (len(revisions))

    # revs = p.get_revisions(extra_params={ "rvstartid": revisions[0]["revid"], "rvendid": revisions[-1]["revid"] })

    revisions_dir = "data/out/%s/%s-%s/revisions" % (lang, p.page_id, p.title)

    if not os.path.exists(revisions_dir):
      os.makedirs(revisions_dir)

    pool = ThreadPool(8)