コード例 #1
0
class WikiScratcher:
    """Collect the text of Wikipedia pages that belong to one category.

    Requests go through a ``MediaWiki`` client pointed at the English
    Wikipedia API endpoint.
    """

    def __init__(self, category):
        """Create the API client and remember the category to read.

        Args:
            category: Category name later passed to ``categorymembers``.
        """
        self.wikipedia = MediaWiki(url='https://en.wikipedia.org/w/api.php',
                                   user_agent='wiki-data-loader',
                                   lang='en')
        self.category = category

    def get_sections(self, num_pages):
        """Return the summary and the non-empty sections of the category's pages.

        Args:
            num_pages: Number of pages to request from the category. ``None``
                is forwarded unchanged to ``categorymembers`` and disables
                the "fewer pages than requested" warning.

        Returns:
            ``{page_title: {section_title: section_text, ...}, ...}``. Every
            page dict also carries a ``'summary'`` entry. Sections rejected by
            ``self._ignore_section`` (defined outside this excerpt) and
            sections without text are left out.
        """
        res = {}
        page_titles = self.wikipedia.categorymembers(self.category,
                                                     results=num_pages,
                                                     subcategories=False)
        # Comparing a length with None raises TypeError, so only warn when an
        # explicit limit was requested.
        if num_pages is not None and len(page_titles) < num_pages:
            print('Only ' + str(len(page_titles)) + ' pages found !!!')
        for p_title in page_titles:
            p = self.wikipedia.page(p_title)
            # The summary is stored first, under its own key.
            sections = {'summary': p.summary}
            for s_title in p.sections:
                # Ignore sections like 'references' or 'see also'.
                if self._ignore_section(s_title):
                    continue
                section_text = p.section(s_title)
                # A truthiness test skips empty sections (most likely pure
                # sub-headers) and also a None result, which section() can
                # return when it cannot locate the title in the page text.
                if section_text:
                    sections[s_title] = section_text
            res[p_title] = sections
        return res
コード例 #2
0
from mediawiki import MediaWiki
# Client for the Georgian-language ('ka') wiki.
wikipedia = MediaWiki(lang='ka')

# Prefix carried by the category titles this script handles; it is removed
# before a name is handed to categorymembers().
category_prefix = 'კატეგორია:'

# Breadth-first walk over the category tree: print every page of every
# category reachable from the root, one level of sub-categories per pass.
# NOTE(review): categorymembers() is called without `results`, so the
# library's default result limit applies to each category - confirm intended.
citysubcats = ['კატეგორია:ქალაქები_ქვეყნების_მიხედვით']
visited = set()
while citysubcats:
    subsubcats = []
    for subcat in citysubcats:
        # Strip the prefix only when it is really there; an unconditional
        # slice would chop the start off a name that arrives without it.
        if subcat.startswith(category_prefix):
            subcat = subcat[len(category_prefix):]
        # A category can be reached through several parents, or through a
        # cycle; crawl each one once so the walk is guaranteed to terminate.
        # Underscores and spaces name the same title, so normalise the key.
        visit_key = subcat.replace('_', ' ')
        if visit_key in visited:
            continue
        visited.add(visit_key)
        citypages, ssc = wikipedia.categorymembers(subcat, subcategories=True)
        subsubcats.extend(ssc)
        for citypage in citypages:
            print(citypage)
    citysubcats = subsubcats
コード例 #3
0
# Category to read and how many of its pages to request.
# NUM_RESULTS = None asks for every page in the category.
CATEGORY = "Plant"
NUM_RESULTS = None

# Script-level state: a running count of ingested records, a list for
# section data (not filled within this excerpt), and a pretty-printer.
record_number_ingested = 0
full_section_list = []
pp = pprint.PrettyPrinter(indent=4)

# Wiki client for API_URL with rate limiting enabled (one-second wait
# between requests) and a request timeout of 100.
plantwiki = MediaWiki(
    url=API_URL,
    rate_limit=True,
    rate_limit_wait=timedelta(seconds=1),
    timeout=100,
)

# Titles of the pages in CATEGORY; sub-categories are excluded.
all_plant_names = plantwiki.categorymembers(
    CATEGORY,
    results=NUM_RESULTS,
    subcategories=False,
)

# Ingest every page title returned for the category, one page at a time.
for plant_name in all_plant_names:

    try:
        # parse() is defined outside this excerpt; presumably it fetches the
        # page and returns its parsed wikitext - TODO confirm.
        wikicode = parse(plant_name)
    except ValueError:
        # Best effort: report the page that failed and move on to the next.
        print("Encountered an issue getting", plant_name,
              "from wiki. Continuing on...")
        continue
    # Pull the templates out of the parsed page and hand them to
    # get_param_dict() (defined elsewhere) to build this page's info record.
    templates = wikicode.filter_templates()
    plant_info = get_param_dict(templates)
    # Only pages that parsed without a ValueError are counted.
    record_number_ingested += 1

    # INFO: Getting sections takes quite a bit of time