def extracts(lang, titles, limit):
    """Collect page extracts for at most ``limit`` of the given titles.

    A ``WapitiClient`` is created against the ``lang`` Wikipedia API
    endpoint and ``get_page_extract`` is called once per title, in the
    order the titles are given.

    :param lang: language subdomain used to build the API URL (e.g. 'en').
    :param titles: sequence of page titles.
    :param limit: maximum number of titles to look up; values larger than
        ``len(titles)`` are clamped to it.
    :returns: dict mapping each title whose lookup returned a truthy result
        to ``{'title': title, 'extract': <extract of the first result>}``.
        Titles with a falsy result are left out.
    """
    api_url = 'https://' + lang + '.wikipedia.org/w/api.php'
    client = WapitiClient('*****@*****.**', api_url=api_url)

    # Never look past the end of the list.
    count = limit
    if count > len(titles):
        count = len(titles)

    found = {}
    # max(..., 0): a non-positive limit means "no lookups", not a
    # from-the-end slice.
    for title in titles[:max(count, 0)]:
        pages = client.get_page_extract(title)
        if not pages:
            continue
        found[title] = {'title': title, 'extract': pages[0].extract}
    return found
# Fetch a few categories one after another through a single client, then
# stop in the debugger so the results can be explored by hand.
from wapiti import WapitiClient

client = WapitiClient('*****@*****.**')
cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics')

# One call per category; `res` keeps the same order as `cats`.
res = [client.get_category_recursive(cat, 1000) for cat in cats]

# First item of the first category's result (the parenthesised
# single-argument form prints exactly like the bare statement).
print(res[0][0])

import pdb
pdb.set_trace()
# -*- coding: utf-8 -*-
# <nbformat>3</nbformat>

# <codecell>

from wapiti import WapitiClient
from pprint import pprint as pp

# Client created without an explicit api_url.
client = WapitiClient('*****@*****.**')

# <codecell>

client.get_ancient_pages(limit=5)

# <codecell>

client.get_category_articles('Africa', limit=5)

# <codecell>

# Second client, pointed at the Spanish Wikipedia API endpoint.
es_client = WapitiClient('*****@*****.**',
                         api_url='http://es.wikipedia.org/w/api.php')

# Handy lookup: canonical namespace name -> title reported by this wiki.
dict((ns.canonical, ns.title)
     for ns in es_client.source_info.namespace_map)

# <codecell>

client.print_usage()
# Fetch a few categories concurrently, one greenlet per category, then stop
# in the debugger so the results can be explored by hand.
import gevent
from gevent import monkey

# Patching happens before wapiti is imported; keep this ordering.
monkey.patch_all()

from wapiti import WapitiClient

client = WapitiClient('*****@*****.**')
cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics')

# Spawn one greenlet per category; `tasks` keeps the same order as `cats`.
tasks = []
for cat in cats:
    tasks.append(gevent.spawn(client.get_category_recursive, cat, 1000))

# Wait for all spawned greenlets before reading any result.
gevent.wait(tasks)

# First item of the first category's result (the parenthesised
# single-argument form prints exactly like the bare statement).
print(tasks[0].value[0])

import pdb
pdb.set_trace()
which results in a list of lists like this: [u'English', u'w:English language', u'en', u'4,234,378', u'129,657', u'763'] """
import os
import sys
import json
import re

from pyquery import PyQuery
from wapiti import WapitiClient

# The path of the wikis JSON file is the first command-line argument; the
# file is read in full (binary mode) and parsed as JSON.
wikis_json_filename = sys.argv[1]
wikis_json = open(wikis_json_filename, 'rb').read()
wikis_list = json.loads(wikis_json)

# Module-level client shared by the helpers below.
wc = WapitiClient('*****@*****.**')

# Captures, as group 'intro', the shortest run of text ending in a period
# that is followed by optional whitespace and then an uppercase ASCII
# letter, '[', a newline, or the end of the string.
# NOTE(review): presumably used to pull the first sentence of an article --
# confirm against the caller.
RE_INTRO = re.compile(r'(?P<intro>.*?\.)\s*([A-Z\[\n]{1}|\Z)')
# Matches a bracketed one- or two-digit number such as "[3]" or "[12]".
RE_FN = re.compile(r'\[[0-9]{1,2}\]')


def parse_count(count_text):
    """Convert a comma-grouped count string such as '4,234,378' to an int.

    All commas are removed and surrounding whitespace is stripped before the
    text is passed to ``int``.

    Raises AttributeError if ``count_text`` has no ``replace`` method (the
    handler below only re-raises it unchanged), and lets ``int``'s
    ValueError propagate for text that is not a valid integer.
    """
    try:
        return int(count_text.replace(',', '').strip())
    except AttributeError:
        # Re-raised as-is; this handler has no other effect.
        raise


def get_desc(article_name):
    # Request the English Wikipedia page for the given article name.
    lang_article = wc.web_request_operation('http://en.wikipedia.org/wiki/' + article_name)
    # Build a PyQuery document from the first element of the response.
    pq = PyQuery(lang_article[0])