class WikiScratcher:
    """Collect the text of the Wikipedia pages that belong to one category.

    ``wikipedia`` holds the MediaWiki client for the English Wikipedia API
    and ``category`` the category name whose member pages are read.
    """

    def __init__(self, category):
        """Create the API client and remember *category* for later queries."""
        self.wikipedia = MediaWiki(
            url='https://en.wikipedia.org/w/api.php',
            user_agent='wiki-data-loader',
            lang='en',
        )
        self.category = category

    def get_sections(self, num_pages):
        """Return the summary and the non-empty sections of the category pages.

        Args:
            num_pages: Forwarded as ``results`` to ``categorymembers``.
                ``None`` is forwarded unchanged and merely disables the
                "fewer pages than requested" notice below.

        Returns:
            ``{page_title: {'summary': summary, section_title: text, ...}}``.
            Sections rejected by ``self._ignore_section`` and sections
            without text are left out.
        """
        page_titles = self.wikipedia.categorymembers(
            self.category, results=num_pages, subcategories=False)
        # Comparing a length against None would raise, so only report a
        # shortfall when an explicit page count was requested.
        if num_pages is not None and len(page_titles) < num_pages:
            print('Only ' + str(len(page_titles)) + ' pages found !!!')

        res = {}
        for p_title in page_titles:
            p = self.wikipedia.page(p_title)
            # The summary is stored first, under the fixed key 'summary'.
            sections = {'summary': p.summary}
            for s_title in p.sections:
                # Ignore sections like 'references' or 'see also'.
                if self._ignore_section(s_title):
                    continue
                section_text = p.section(s_title)
                # Empty sections are most likely pure sub-headers. The
                # truthiness test also covers a None result (pymediawiki
                # documents None for a title it cannot locate), which
                # len() would have crashed on.
                if section_text:
                    sections[s_title] = section_text
            res[p_title] = sections
        return res
from mediawiki import MediaWiki

# Breadth-first walk over a category tree of the Georgian ('ka') Wikipedia:
# print the title of every page found in the start category and in all
# categories reachable from it through subcategory links.
wikipedia = MediaWiki(lang='ka')

# Namespace prefix carried by the category names handled below
# (presumably the Georgian spelling of "Category:" -- confirm).
CATEGORY_PREFIX = 'კატეგორია:'

# Categories still to be expanded on the current level of the walk.
citysubcats = ['კატეგორია:ქალაქები_ქვეყნების_მიხედვით']
# Categories already expanded. Without this, a category reachable from
# several parents is fetched repeatedly and a cycle in the category graph
# keeps the loop running forever.
visited_cats = set()

while citysubcats:
    subsubcats = []
    for subcat in citysubcats:
        # Strip the prefix only when it is really there, so a name that
        # arrives without it does not lose its first characters.
        if subcat.startswith(CATEGORY_PREFIX):
            subcat = subcat[len(CATEGORY_PREFIX):]
        # Underscores and spaces are interchangeable in MediaWiki titles;
        # normalise them for the visited check only.
        cat_key = subcat.replace('_', ' ')
        if cat_key in visited_cats:
            continue
        visited_cats.add(cat_key)

        citypages, ssc = wikipedia.categorymembers(subcat, subcategories=True)
        subsubcats.extend(ssc)
        for citypage in citypages:
            print(citypage)
    # Descend one level: the subcategories just collected come next.
    citysubcats = subsubcats
# Pretty-printer with a four-space indent.
pp = pprint.PrettyPrinter(indent=4)

full_section_list = []
record_number_ingested = 0

# Category whose pages are fetched, and how many of them to request.
# NUM_RESULTS = None asks for every page in the category.
CATEGORY = "Plant"
NUM_RESULTS = None

# Client with rate limiting switched on (one-second wait) and a timeout of 100.
plantwiki = MediaWiki(
    url=API_URL,
    rate_limit=True,
    rate_limit_wait=timedelta(seconds=1),
    timeout=100,
)
all_plant_names = plantwiki.categorymembers(
    CATEGORY,
    results=NUM_RESULTS,
    subcategories=False,
)

for plant_name in all_plant_names:
    # A ValueError from parse() is reported and that page is skipped.
    try:
        wikicode = parse(plant_name)
    except ValueError:
        print("Encountered an issue getting", plant_name, "from wiki. Continuing on...")
        continue

    templates = wikicode.filter_templates()
    plant_info = get_param_dict(templates)
    # Counts the pages that were parsed without a ValueError.
    record_number_ingested += 1
    # INFO: Getting sections takes quite a bit of time