# -*- coding: utf-8 -*-
# stdlib imports used by the snippets below; Page, Dataset, url2title,
# url2lang, mongodb_host and init come from the surrounding project.
import codecs
import json
import os
import urllib

def fetch_page(source):
  print "📄  fetching: %s" % source.encode('utf-8-sig')

  p = Page()
  r = p.fetch_from_api_title(source.strip(), {
    "redirects": "true",
    "rvparse": "true",
    "prop": "info|revisions",
    "inprop": "url",
    "rvprop": "content"
  })

  with open("path_points/%s.json" % (source), "w") as f:
    json.dump(r, f)

  with codecs.open("_path_points/%s.md" % (source), "w", "utf-8-sig") as out:
    out.write("---\n")

    # the API keys pages by page id; grab the single returned page
    result = r["query"]["pages"].values()[0]

    for k in result.keys():
      if k != "revisions":
        content = result[k]

        # fullurl comes back percent-encoded; make it human readable
        if k == "fullurl":
          content = urllib.unquote(content)

        out.write("%s: %s\n" % (k, unicode(content)))

    out.write("---\n\n")

    if "revisions" in result:
      out.write(result["revisions"][0]["*"])
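
# For reference, the markdown written above starts with front matter built
# from the API's info fields, roughly (exact keys depend on the response):
#
# ---
# pageid: 12345
# title: Crimea
# fullurl: https://en.wikipedia.org/wiki/Crimea
# ---
#
# followed by the parsed revision content.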


# for k in hl.keywords:
#   fetch_page(k)
def get_page_info(url, length, index):
  title = url2title(url)
  lang = url2lang(url)

  print "[%s] %s (%s/%s)" % (lang, title, index, length)

  wp = Page()

  # skip "www" links, which point to the portal rather than a language edition
  if lang != "www":
    wp.fetch_from_api_title(title, lang=lang)

    path = "dataset/%s.info.json" % (wp.page_id)

    if not os.path.isfile(path):
      with open(path, "w") as out:
        data = {
          "edits": wp.get_all_editors(),
          "langs": wp.get_langlinks()
        }

        json.dump(data, out)
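
# Usage sketch (not from the original source): drive get_page_info over a
# seed list; length and index only feed the progress printout, and the
# seeds path is a hypothetical example.
if __name__ == "__main__":
  with open("data/in/wicrimea-seeds.txt") as f:
    urls = [u.strip() for u in f]

  for i, u in enumerate(urls):
    get_page_info(u, len(urls), i + 1)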
Example #3
def store_revisions(self, page_url):
  """
  Retrieve all the revisions of a given Wikipedia page_url

  parameters:
    - page_url: a wikipedia page URL
  """

  p = Page()

  d = Dataset("%s:27017" % mongodb_host)

  title = url2title(page_url)
  lang = url2lang(page_url)

  p.fetch_from_api_title(title, lang=lang)

  revisions = p.get_all_editors()

  for i, revision in enumerate(revisions, start=1):
    # ex: en/crimea/revision/999999
    key = "%s/%s/revision/%s" % (lang, title, revision["revid"])

    # fetch the revision from the internet
    value = p.get_revisions(extra_params={ "rvstartid": revision["revid"], "rvlimit": 1 })

    # write it to the database handler
    d.write(key, value)

    self.update_state(state='PROGRESS',
      meta={ 'current': i, 'total': len(revisions) })
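
# The self.update_state(state='PROGRESS', ...) call matches Celery's bound-task
# API, so store_revisions presumably runs inside a Celery task. A hedged sketch
# of how it might be wired up; the app object and decorator are assumptions:
#
# @app.task(bind=True)
# def store_revisions(self, page_url):
#   ...  # body as above; self is the bound task instance
#
# store_revisions.delay("https://en.wikipedia.org/wiki/Crimea")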
def fetch_pageviews(source):
  if not os.path.exists("pageviews/%s.json" % (source)):
    print "📄  fetching pageviews: %s" % source.encode('utf-8')

    p = Page()
    p.fetch_from_api_title(source.strip())
    r = p.get_pageviews()

    with codecs.open("pageviews/%s.json" % (source), "w", "utf-8-sig") as f:
      json.dump(r, f)
def fetch_page(source):
  if os.path.exists("pages/%s.json" % (source)):
    return

  print "📄  fetching: %s" % source.encode('utf-8')

  p = Page()
  r = p.fetch_from_api_title(source.strip(), {
    "redirects": "true",
    "rvparse": "true",
    "prop": "info|revisions",
    "inprop": "url",
    "rvprop": "content"
  })

  with codecs.open("pages/%s.json" % (source), "w", "utf-8-sig") as f:
    json.dump(r, f)
def sources_pages(sources):
  pages = []

  for s in sources:
    print s.strip()

    w = Page()
    w.fetch_from_api_title(s.strip())
    pages.extend(w.get_links())

  # dedupe: the same page can be linked from several sources
  return list(set(pages))
def fetch_revisions(source):
  if os.path.exists("revisions/%s.json" % (source)):
    return

  print "📄  fetching revisions: %s" % source.encode('utf-8')

  p = Page()
  p.fetch_from_api_title(source.strip())
  r = p.get_all_editors()

  with codecs.open("revisions/%s.json" % (source), "w", "utf-8-sig") as f:
    json.dump(r, f)
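
# Usage sketch (assumed, not part of the original source): expand a seed list
# with sources_pages, then cache content, pageviews and revision history for
# every linked title; the seeds path is a hypothetical example.
if __name__ == "__main__":
  with open("data/in/wicrimea-seeds.txt") as f:
    seeds = f.readlines()

  for title in sources_pages(seeds):
    fetch_page(title)
    fetch_pageviews(title)
    fetch_revisions(title)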
Example #8
def store_last_revisions(db_url):
  d = Dataset("%s:27017" % mongodb_host)

  url = db_url.replace("/timeline", "")

  (lang, page) = url.split("/")

  p = Page()
  p.fetch_from_api_title(page, lang=lang)

  last_rev = p.get_revisions(extra_params={ "rvlimit" : 1 })

  print "last revisions: %s" % (url.encode("utf8"))

  # keep only the most recently stored revision for this page ($slice: -1)
  t = list(d.find({ "url": db_url }, { "url": 1, "dataset": { "$slice": -1 } }))

  # print t[0]

  # walk forward (rvdir: newer) from the last stored revision to the current head
  extra_params = {
    "rvstartid": t[0]["dataset"][0]["revid"],
    "rvendid": last_rev[0]["revid"],
    "rvdir": "newer"
  }

  print extra_params

  revs = p.get_revisions(extra_params=extra_params)

  print "%s new revisions since %s (%s)" % (len(revs), t[0]["dataset"][0]["timestamp"], t[0]["dataset"][0]["revid"])
  print "%s  ---->  %s" % (t[0]["dataset"][0]["timestamp"], last_rev[0]["timestamp"])

  for r in revs:
    key = "%s/%s/revision/%s" % (lang, page, r["revid"])
    value = [ r ]

    d.write(key, value)
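
# Usage sketch (assumed): refresh one stored timeline entry; the db_url
# format mirrors the lang/page split performed above.
#
# store_last_revisions("en/Crimea/timeline")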
Example #9
from multiprocessing.dummy import Pool as ThreadPool

users = {}

init()

extra_languages = [ "en", "simple", "uk", "fr", "ru", "de" ]

source_in = "data/in/wicrimea-seeds.txt"
source_ext = "data/out/wicrimea-seeds.extended.txt"

out = open(source_ext, "w")

with open(source_in, "r") as seeds:
  for l in seeds:
    p = Page()
    p.fetch_from_api_title(url2title(l.strip()))

    print ""
    print u"→ %s (%s)" % (p.title, l.strip())

    out.write(l.strip() + "\n")

    langs = p.get_langlinks()

    for link in langs:
      if link["lang"] in extra_languages:
        p_lang = Page()
        p_lang.fetch_from_api_title(link["*"], lang=link["lang"])
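
# The ThreadPool import at the top suggests the (truncated) remainder of this
# script parallelizes these fetches. A hedged sketch of that pattern, with an
# assumed per-seed helper that is not in the original snippet:
#
# def expand_seed(line):
#   ...  # the per-seed body of the loop above
#
# pool = ThreadPool(4)
# pool.map(expand_seed, open(source_in).readlines())
# pool.close()
# pool.join()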