Exemple #1
0
def extracts(lang, titles, limit):
    """Collect page extracts for the first ``limit`` entries of ``titles``.

    A WapitiClient is created for the ``lang`` Wikipedia API endpoint and
    ``get_page_extract`` is called once per title. Titles whose lookup
    returns a falsy result are left out of the returned mapping.

    Args:
        lang: Wikipedia subdomain, concatenated into the API URL.
        titles: Indexable sequence of page titles.
        limit: Maximum number of titles to look up; capped at
            ``len(titles)``.

    Returns:
        dict mapping each title that produced a result to
        ``{'title': title, 'extract': <extract of the first result>}``.
    """
    endpoint = 'https://' + lang + '.wikipedia.org/w/api.php'
    client = WapitiClient('*****@*****.**', api_url=endpoint)
    # Never look past the end of the title list.
    count = min(limit, len(titles))
    found = {}
    for index in range(count):
        name = titles[index]
        pages = client.get_page_extract(name)
        if not pages:
            continue
        # Only the first returned item is used for the extract text.
        found[name] = {'title': name, 'extract': pages[0].extract}
    return found
Exemple #2
0
def extracts(lang, titles, limit):
    """Return extracts for at most ``limit`` titles from one Wikipedia.

    Each of the leading ``limit`` titles (or all of them, when ``limit``
    exceeds ``len(titles)``) is passed to ``get_page_extract`` on a
    WapitiClient bound to the ``lang`` API endpoint.

    Args:
        lang: Wikipedia subdomain used to build the API URL.
        titles: Indexable sequence of page titles.
        limit: Upper bound on how many titles are queried.

    Returns:
        dict keyed by title; each value is a dict with the keys
        ``'title'`` and ``'extract'``. Titles with a falsy lookup result
        are omitted.
    """
    wc = WapitiClient(
        '*****@*****.**',
        api_url='https://' + lang + '.wikipedia.org/w/api.php',
    )
    # Clamp the requested count to the number of titles available.
    upper = len(titles) if limit > len(titles) else limit
    collected = {}
    for pos in range(upper):
        page_title = titles[pos]
        result = wc.get_page_extract(page_title)
        if result:
            first = result[0]
            collected[page_title] = {
                'title': page_title,
                'extract': first.extract,
            }
    return collected
Exemple #3
0
from wapiti import WapitiClient

# Demo script: query several categories one after another, print the first
# item of the first category's result, then open a debugger for inspection.
client = WapitiClient('*****@*****.**')

cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics')
# One get_category_recursive(cat, 1000) call per category, in order.
res = [client.get_category_recursive(cat, 1000) for cat in cats]

# The parenthesised single-argument form prints the same on Python 2 and 3;
# the bare ``print x`` statement is a SyntaxError on Python 3.
print(res[0][0])

# Drop into the debugger so the collected results can be explored by hand.
import pdb
pdb.set_trace()
Exemple #4
0
# -*- coding: utf-8 -*-
# <nbformat>3</nbformat>

# <codecell>

from wapiti import WapitiClient
from pprint import pprint as pp

# Client with no api_url argument; used by the cells below.
client = WapitiClient('*****@*****.**')

# <codecell>

# Request at most five results from get_ancient_pages.
client.get_ancient_pages(limit=5)

# <codecell>

# Request at most five articles from the 'Africa' category.
client.get_category_articles('Africa', limit=5)

# <codecell>

# Second client pointed at the Spanish Wikipedia API endpoint.
es_client = WapitiClient('*****@*****.**',
                         api_url='http://es.wikipedia.org/w/api.php')
# a handy mapping of namespace name translations
# (canonical namespace name -> title as reported by es_client)
{ns.canonical: ns.title for ns in es_client.source_info.namespace_map}

# <codecell>

client.print_usage()

Exemple #5
0
# -*- coding: utf-8 -*-
# <nbformat>3</nbformat>

# <codecell>

from wapiti import WapitiClient
from pprint import pprint as pp

# Client created without an explicit api_url; the cells below reuse it.
client = WapitiClient('*****@*****.**')

# <codecell>

# Ask get_ancient_pages for no more than five entries.
client.get_ancient_pages(limit=5)

# <codecell>

# Ask for no more than five articles in the 'Africa' category.
client.get_category_articles('Africa', limit=5)

# <codecell>

# Separate client that talks to the Spanish Wikipedia API endpoint.
es_client = WapitiClient('*****@*****.**',
                         api_url='http://es.wikipedia.org/w/api.php')
# a handy mapping of namespace name translations
# Built directly as a dict instead of dict() over a throwaway list of pairs.
{
    namespace.canonical: namespace.title
    for namespace in es_client.source_info.namespace_map
}

# <codecell>

client.print_usage()
Exemple #6
0
import gevent
from gevent import monkey

# Patch the standard library before wapiti is imported; keep this ordering.
# NOTE(review): presumably needed so wapiti's network calls cooperate with
# gevent -- confirm against the gevent monkey-patching documentation.
monkey.patch_all()

from wapiti import WapitiClient

client = WapitiClient('*****@*****.**')

cats = ('Africa', 'FA-Class_articles', 'GA-Class_articles', 'Physics')
# Spawn one greenlet per category, each calling
# client.get_category_recursive(category, 1000).
tasks = [gevent.spawn(client.get_category_recursive, x, 1000) for x in cats]
# Block until the spawned greenlets are done.
gevent.wait(tasks)

# The parenthesised single-argument form prints the same on Python 2 and 3;
# the bare ``print x`` statement is a SyntaxError on Python 3.
print(tasks[0].value[0])

# Drop into the debugger so the finished tasks can be inspected by hand.
import pdb

pdb.set_trace()
Exemple #7
0
which results in a list of lists like this:

[u'English', u'w:English language', u'en', u'4,234,378', u'129,657', u'763']
"""

import os
import sys
import json
import re
from pyquery import PyQuery
from wapiti import WapitiClient

# Path of the wikis JSON file, taken from the first command-line argument.
wikis_json_filename = sys.argv[1]
# Read through a context manager so the file handle is closed promptly
# instead of being left for the garbage collector.
with open(wikis_json_filename, 'rb') as wikis_json_file:
    wikis_json = wikis_json_file.read()
wikis_list = json.loads(wikis_json)
wc = WapitiClient('*****@*****.**')
# Shortest leading run of text ending in a period that is followed by
# optional whitespace and then an uppercase letter, '[', a newline, or the
# end of the string; captured as the 'intro' group.
RE_INTRO = re.compile(r'(?P<intro>.*?\.)\s*([A-Z\[\n]{1}|\Z)')
# Bracketed one- or two-digit numbers such as "[3]" or "[12]".
RE_FN = re.compile(r'\[[0-9]{1,2}\]')


def parse_count(count_text):
    """Convert a comma-grouped count string such as '4,234,378' to an int.

    Args:
        count_text: String holding an integer, optionally with ',' as
            thousands separators and surrounding whitespace.

    Returns:
        The parsed integer.

    Raises:
        AttributeError: If ``count_text`` has no ``replace`` method
            (for example ``None``).
        ValueError: If the cleaned text is not a valid integer literal.
    """
    # The former ``try/except AttributeError: raise`` wrapper only re-raised
    # the exception unchanged, so it is dropped; errors propagate as before.
    return int(count_text.replace(',', '').strip())


def get_desc(article_name):
    lang_article = wc.web_request_operation('http://en.wikipedia.org/wiki/' +
                                            article_name)
    pq = PyQuery(lang_article[0])