def test_missing(self):
    """
    Get missing page
    """
    try:
        wptools.page(pageid=1).get(False)
        self.fail("failed to raise LookupError")
    except LookupError as detail:
        print(detail)
def test_not_found(self):
    """
    Try to get a non-existent page
    """
    try:
        wptools.page(NOPAGE, silent=True).get(False)
        self.fail("failed to raise LookupError")
    except LookupError:
        pass
def test_lookup_unicode_error(self):
    """
    Raise LookupError without UnicodeDecodeError. Issue #29
    """
    try:
        wptools.page('Д北_TEST', silent=True).get(False)
        self.fail("failed to raise LookupError")
    except LookupError:
        pass
def test_lookup_unicode_error(self):
    """
    Potentially raise UnicodeDecodeError on LookupError
    """
    try:
        wptools.page('阿Vane').get(False)  # issue 29
        self.fail("failed to raise LookupError")
    except LookupError as detail:
        print(detail)
def test_unknown_lang(self):
    """
    Mediawiki site function not supported
    """
    # "jp" Wikinews (unknown language code)
    try:
        wptools.page(wiki='jp.wikinews.org')
        self.fail("failed to raise LookupError")
    except LookupError as detail:
        print(detail)
def test_page_get(self):
    """
    Test page.get() without making any requests (for coverage)
    """
    wptools.request.WPToolsRequest.DISABLED = True

    skip = ['parse', 'query', 'restbase', 'wikidata']

    page = wptools.page('TEST', skip=skip)
    page.get()

    page = wptools.page('TEST', wikibase='TEST', skip=skip)
    page.get()
def test_core_init(self):
    page = wptools.page('TEST', skip='SKIP')
    self.assertEqual(page.flags,
                     {'silent': False, 'skip': 'SKIP', 'verbose': False})

    page = wptools.page('TEST', variant='VARIANT')
    self.assertEqual(page.params,
                     {'lang': 'en', 'title': 'TEST', 'variant': 'VARIANT'})

    page = wptools.page('TEST', wiki='WIKI')
    self.assertEqual(page.params,
                     {'lang': 'en', 'title': 'TEST', 'wiki': 'WIKI'})
def test_page_init(self):
    page = wptools.page('TEST', silent=True)
    self.assertEqual(page.params, {'lang': 'en', 'title': 'TEST'})
    self.assertEqual(page.flags, {'silent': True, 'verbose': False})

    page = wptools.page(pageid=123, silent=SILENT_FLAG)
    self.assertEqual(page.params, {'lang': 'en', 'pageid': 123})

    page = wptools.page(wikibase='Q42', silent=SILENT_FLAG)
    self.assertEqual(page.params, {'lang': 'en', 'wikibase': 'Q42'})

    self.assertTrue('requests' not in page.data)
def wiki_test():
    # wiki = wikipediaapi.Wikipedia('ko')
    # page_py = wiki.page('파이썬')
    # wikipage = wikipedia.page('kakao')
    # print(wikipage.images[0])
    # print(page_py.images[0])
    # print("Page - Exists: %s" % page_py.exists())
    # print("Page - Summary: %s" % page_py.summary[0:100])

    # title is passed positionally; lang selects the Korean wiki
    kr = wptools.page('kakao', lang='ko').get_query()
    kakao = wptools.page('kakao').get_query()
    kakao_img = kakao.pageimage
    print(kakao_img)
def executeAStar(pageA=None, pageB=None, printResult=True,
                 userheuristic=heuristicFunction, random=True,
                 timeResult=False, debug=True):
    # Initialize the start and end goal
    DEBUG = debug
    startPage = wiki.page("West Pullman, Chicago", silent=True)
    endPage = wiki.page('Midwestern United States', silent=True)
    path = astar(startPage, endPage, userheuristic)
def get_page(page_search):
    lang_codes = ['en', 'de', 'es', 'fr']
    for lang in lang_codes:
        try:
            return wptools.page(page_search, lang=lang).get_parse()
        except LookupError:
            try:
                wikipedia.set_lang(lang)
                search = wikipedia.search(page_search)
                if search:
                    return wptools.page(search[0], lang=lang).get_parse()
            except LookupError:
                print(f'no wiki page found for {page_search} lang {lang}.')
    return None
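# Usage sketch (not from the original sources; the title below is only an
# example): get_page() returns the wptools page whose parse data (infobox,
# wikitext, wikibase) lives in page.data, or None if no language matched.
if __name__ == "__main__":
    result = get_page("Ada Lovelace")
    if result is not None:
        print(result.data.get("wikibase"))
        print(len(result.data.get("infobox") or {}), "infobox fields")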
def get_wikidata(subject):
    node_data = {}
    base = wptools.page(subject).get_parse().data["wikibase"]
    page = wptools.page(wikibase=base).get_wikidata().data

    # clean data: strip trailing "(PXXX)" property ids from keys and values
    data = page["wikidata"]

    def clean_dict(data=None):
        data = data or {}
        clean = {}
        for key in dict(data):
            if not key:
                continue
            val = data[key]
            if isinstance(val, list):
                for idx, v in enumerate(val):
                    if isinstance(v, str):
                        v = " ".join(v.split(" ")[:-1])
                        val[idx] = v
                    if isinstance(v, dict):
                        v = clean_dict(dict(v))
                        val[idx] = v
                    if isinstance(v, list):
                        for inner_idx, v1 in enumerate(v):
                            v[inner_idx] = " ".join(v1.split(" ")[:-1])
                        val[idx] = v
            elif isinstance(val, dict):
                val = clean_dict(dict(val))
            elif isinstance(val, str) and val.endswith(")"):
                val = " ".join(val.split(" ")[:-1])
            key = " ".join(key.split(" ")[:-1])
            if val and key:
                clean[key] = val
        return clean

    data = clean_dict(data)

    # parse for distant child of
    node_data["description"] = page["description"]
    # direct child of
    # node_data["instance of"] = page["what"]

    # data fields
    for k in data:
        node_data[k] = data[k]

    # related to
    node_data["url"] = page["wikidata_url"]
    # node_data["aliases"] = page["aliases"]
    return node_data
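# Usage sketch (assumption, not part of the original source; "Douglas Adams"
# is only an example subject): get_wikidata() resolves the title to its
# wikibase item and returns the cleaned claims plus description and URL.
if __name__ == "__main__":
    node = get_wikidata("Douglas Adams")
    print(node.get("description"))
    print(node.get("url"))
    for prop in sorted(k for k in node if k not in ("description", "url")):
        print(prop, "->", node[prop])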
def test_random(self):
    """
    Get a random title
    """
    r = wptools.page()
    self.assertTrue(r.pageid is not None)
    self.assertTrue(r.title is not None)
def test_random(self):
    """
    Get random title from random language wiki
    """
    page = wptools.page(lang=random.choice(LANG))
    page.get(show=False)
    self.assertTrue(page.data['pageid'] is not None)
def test_selected(self):
    """
    Get selected title
    """
    t = titles.title()
    p = wptools.page(t['title'], lang=t['lang']).get_query(False)
    self.assertTrue(p.pageid is not None)
def __init__(self, title, lang, silent=False):
    self.title = title

    # Fetch page from wikipedia
    try:
        self._page = wptools.page(title, silent=silent, lang=lang).get()
        logger.info("Wiki parse {}".format(self.url))
    except LookupError:
        err_msg = f"\"{title}\" not found"
        logger.warning(err_msg)
        raise PageNotExists(err_msg)

    self.ast = wtp.parse(self._page.data['wikitext'])
    self.type = None

    # Detect radio pages by searching for a radio infobox
    self.infobox = self.radio_site = None
    self.find_infobox()
    if self.infobox:
        self.type = PageInfo.RADIO
        return None

    # Detect pages with lists of radios
    # Retrieve all sections delimited with an h2
    self.sections = [section for section in self.ast.sections
                     if section.level in [0, 2] and self.allowed_section(section)]

    # Get at least one table with radios listed
    self.manage_datatype()
    if self.have_table:
        self.type = PageInfo.LIST
        return None

    err_msg = "{} : Invalid url, not a radio station or listing"
    raise PageError(err_msg.format(self.url))
def test_page_query(self):
    page = wptools.page('TEST')
    qobj = wptools.query.WPToolsQuery()

    qstr = page._query('random', qobj)
    self.assertTrue('list=random' in qstr)

    qstr = page._query('query', qobj)
    self.assertTrue('action=query' in qstr)
    self.assertTrue('pageprops' in qstr)

    qstr = page._query('querymore', qobj)
    self.assertTrue('action=query' in qstr)
    self.assertTrue('&cllimit=500' in qstr)
    self.assertTrue('&imlimit=500&lllimit=500&pclimit=500' in qstr)

    qstr = page._query('parse', qobj)
    self.assertTrue('action=parse' in qstr)
    self.assertTrue('parsetree' in qstr)

    page.data['entities'] = ['Q1', 'Q2', 'Q3']
    qstr = page._query('labels', qobj)
    self.assertTrue('action=wbgetentities' in qstr)
    self.assertTrue('ids=Q1|Q2|Q3' in qstr)

    qstr = page._query('wikidata', qobj)
    self.assertTrue('action=wbgetentities' in qstr)

    page.params.update({'endpoint': '/page/summary/'})
    qstr = page._query('restbase', qobj)
    self.assertTrue('api/rest' in qstr)

    self.assertTrue('requests' not in page.data)
def test_complex_infobox(self):
    """
    Successfully populate complex infobox dict
    """
    p = wptools.page('Aung San Suu Kyi').get_parse(False)
    self.assertGreaterEqual(len(p.infobox), 32)
    self.assertTrue('errors' not in p.infobox)
def test_page_get_imageinfo(self):
    page = wptools.page('TEST', silent=SILENT_FLAG)
    self.assertRaises(ValueError, page.get_imageinfo)

    page.cache = {'imageinfo': imageinfo.cache}
    page.data['image'] = [{
        'kind': 'parse-image',
        'file': 'Douglas adams portrait cropped.jpg'}]
    page._normalize_images()

    query = page._query('imageinfo', wptools.query.WPToolsQuery())
    self.assertTrue('File%3ADouglas' in query)

    page._set_data('imageinfo')
    image = page.data['image'][0]
    self.assertTrue('/c/c0/' in image['url'])
    self.assertTrue('/commons.' in image['descriptionurl'])
    self.assertTrue(image['file'].startswith('File:'))
    self.assertEqual(image['height'], 386)
    self.assertEqual(image['size'], 32915)
    self.assertEqual(image['width'], 333)

    self.assertTrue('requests' not in page.data)
def download_images(searchquery):
    # https://github.com/siznax/wptools/wiki/Examples#get-a-representative-image
    import wptools
    import requests
    import os

    page = wptools.page(searchquery, lang="de")
    page.get_query()
    result = page.images(['url', 'file'])
    print(result[0]['url'], "File name: ", result[0]['file'])
    url = result[0]['url']
    file = result[0]['file']
    print(file)

    r = requests.get(url)
    save_path = os.path.join(os.path.expanduser('~'), 'learny', 'learny',
                             f"{searchquery}-{file[5:]}")
    with open(save_path, 'wb') as f:
        # with open(f'/home/alex/Downloads/{searchquery}-{file[5:]}', 'wb') as f:
        f.write(r.content)

    # Retrieve HTTP meta-data
    print(r.status_code)
    print(r.headers['content-type'])
    print(r.encoding)
def scrape_stadium_manually(OPTAvenueName, alternativeName, overwrite=False):
    print('Searching for {}'.format(OPTAvenueName))
    if stadiums.find_one({'venueName': OPTAvenueName}) and not overwrite:
        print('Already in db')
        return True

    wiki_name = wikipedia.search(html.unescape(alternativeName + ' stadium'))
    geocode_result = gmaps.geocode(alternativeName)

    if geocode_result:
        elevation_result = gmaps.elevation(
            convert.normalize_lat_lng(
                geocode_result[0]['geometry']['location']))
        stadium_data_to_insert = {
            'venueName': OPTAvenueName,
            'location_data': geocode_result[0],
            'altitude_data': elevation_result[0]
        }
        stadiums.replace_one({'venueName': OPTAvenueName},
                             stadium_data_to_insert, upsert=True)

    if wiki_name:
        wiki_page = wptools.page(wiki_name[0], silent=True)
        wiki_page.get_parse()
        wiki_data = wiki_page.data['infobox']
        wiki_data_to_insert = {
            'venueName': OPTAvenueName,
            'wiki_name': wiki_name[0],
            'wiki_data': wiki_data
        }
        wiki.replace_one({'venueName': OPTAvenueName},
                         wiki_data_to_insert, upsert=True)
def test_page_get_wikidata(self):
    page = wptools.page('TEST', wikibase='WIKIBASE',
                        skip=SKIP_FLAG, silent=SILENT_FLAG)

    page.cache = {'wikidata': wikidata.cache}
    page._set_data('wikidata')

    page.cache['labels'] = labels_1.cache
    page._set_data('labels')
    page.cache['labels'] = labels_2.cache
    page._set_data('labels')
    page.cache['labels'] = labels_3.cache
    page._set_data('labels')
    page._post_labels_updates()

    page.cache['imageinfo'] = imageinfo.cache
    page._set_data('imageinfo')

    data = page.data
    self.assertEqual(data['wikibase'], 'Q42')
    self.assertEqual(data['image'][0]['kind'], 'wikidata-image')
    self.assertEqual(len(data['claims']), 102)
    self.assertEqual(len(data['labels']), 147)
    self.assertEqual(len(data['wikidata']), 102)

    self.assertTrue('requests' not in page.data)
def fetch_wikipedia(qid, lang, normalized):
    """Look up the Wikipedia page for the given QID and language"""
    print(
        'fetching "{}" Infobox/Wikidata for entity: {} ({}) ...'.format(
            lang, qid, normalized
        ),
        file=sys.stderr
    )
    try:
        page = wptools.page(wikibase=qid, lang=lang, silent=True).get()
        return {
            'infobox': get_infobox(page),
            'wikidata': page.data["wikidata"],
            'title': page.data["title"],
            'url': 'https://{}.wikipedia.org/wiki/{}'.format(lang, page.data["title"])
        }
    except LookupError:
        print(
            'No page exists for {} on {}.wikipedia.org'.format(qid, lang),
            file=sys.stderr
        )
        # If the lookup fails, just return an empty dict
        return {}
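# Usage sketch (hypothetical QID and language, not from the original source):
# fetch_wikipedia() is keyed by Wikidata QID, so the same entity can be pulled
# from different language editions of Wikipedia.
if __name__ == "__main__":
    entity = fetch_wikipedia("Q42", "en", "Douglas Adams")
    if entity:
        print(entity["title"], entity["url"])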
def test_page_get_random(self):
    page = wptools.page('TEST', skip=['imageinfo'], silent=True)
    page.cache = {'random': query.cache}
    page._set_data('random')
    page.get_random()
    self.assertEqual(page.data['pageid'], 45564415)
    self.assertEqual(page.data['title'], '1990 NBL Finals')
def main(delay=1):
    """
    GET random pages forever
    """
    print("%d languages" % len(LANGUAGES))

    start = int(time.time())
    count = 0
    while True:
        count += 1
        elapsed = int(time.time()) - start
        lang = random.choice(LANGUAGES)

        page = wptools.page(lang=lang, silent=True)
        page.get()

        print("[%d](%d) %s" % (count, elapsed, page.data.get('url')))

        preview = page.data.get('extext')
        if preview:
            preview = preview.strip().replace("\n", '')[:72]
        print(" %s %s" % (page.data.get('wikibase'), preview))

        time.sleep(delay)
def test_page_query(self):
    page = wptools.page('TEST')
    qobj = wptools.query.WPToolsQuery()

    qstr = page._query('random', qobj)
    self.assertTrue('list=random' in qstr)

    qstr = page._query('query', qobj)
    self.assertTrue('action=query' in qstr)
    self.assertTrue('pageprops' in qstr)

    qstr = page._query('querymore', qobj)
    self.assertTrue('action=query' in qstr)
    self.assertTrue('&cllimit=500' in qstr)
    self.assertTrue('&imlimit=500&lllimit=500&pclimit=500' in qstr)

    qstr = page._query('parse', qobj)
    self.assertTrue('action=parse' in qstr)
    self.assertTrue('parsetree' in qstr)

    page.data['claims'] = {'Q0': 'TEST'}
    qstr = page._query('claims', qobj)
    self.assertTrue('action=wbgetentities' in qstr)
    self.assertTrue('ids=Q0' in qstr)

    qstr = page._query('wikidata', qobj)
    self.assertTrue('action=wbgetentities' in qstr)

    qstr = page._query('restbase', qobj)
    self.assertTrue('api/rest' in qstr)
def getWiki(search, artist):
    print("Search Term: " + search)
    try:
        page = wptools.page(search).get_parse()
        if 'birth_place' in page.data['infobox']:
            birth_place = page.data['infobox']['birth_place']
            birth_place = birth_place.replace("[", "").replace("]", "")
            # print(birth_place)
            updateArtist(birth_place, artist)
            return birth_place
        if 'hometown' in page.data['infobox']:
            print("hometown found")
            hometown = page.data['infobox']['hometown']
            hometown = hometown.replace("[", "").replace("]", "")
            updateArtist(hometown, artist)
            return hometown
        if 'origin' in page.data['infobox']:
            print("origin found")
            origin = page.data['infobox']['origin']
            origin = origin.replace("[", "").replace("]", "")
            updateArtist(origin, artist)
            return origin
        return False
    except Exception:
        return False
def wiki_infobox(text):
    try:
        page = wptools.page(text, silent=True).get_parse()
        infobox = page.data['infobox']
    except Exception:
        infobox = {}
    return infobox
def scrape(stadium, overwrite=False):
    print('Searching for {}'.format(stadium))
    if stadiums.find_one({'venueName': stadium}):
        print('Already in db')
        return True

    # Add the string "stadium" to the venue name for the Wikipedia search
    # and take the most popular result as correct
    wiki_name = wikipedia.search(html.unescape(stadium + ' stadium'))
    geocode_result = gmaps.geocode(stadium)

    if geocode_result:
        elevation_result = gmaps.elevation(
            convert.normalize_lat_lng(
                geocode_result[0]['geometry']['location']))
        stadium_data_to_insert = {
            'venueName': stadium,
            'location_data': geocode_result[0],
            'altitude_data': elevation_result[0]
        }
        stadiums.replace_one({'venueName': stadium},
                             stadium_data_to_insert, upsert=True)

    if wiki_name:
        wiki_page = wptools.page(wiki_name[0], silent=True)
        wiki_page.get_parse()
        wiki_data = wiki_page.data['infobox']
        wiki_data_to_insert = {
            'venueName': stadium,
            'wiki_name': wiki_name[0],
            'wiki_data': wiki_data
        }
        wiki.replace_one({'venueName': stadium},
                         wiki_data_to_insert, upsert=True)
def test_disambiguation_wikibase(self):
    """
    Get an unambiguous page by wikibase
    """
    p = wptools.page(wikibase='Q528917')
    p.get_wikidata(False).get_query(False)
    self.assertTrue(p.pageid == 20974062)
def infoall(title):
    infobox = []
    for i in range(len(title)):
        page = wptools.page(title[i]).get_parse()
        infobox.append(page.data['infobox'])
        print("Current progress", np.round(i / len(title) * 100, 2), "%")
    return infobox
def wiki_infobox_extractor(page_title=None):
    """
    Crawls the infobox values by title and returns all triples.

    Args:
        :param page_title: title to crawl.

    Returns:
        :return: tuple of (wikibase, triples).
    """
    triples = []
    wikipage_triples = {}
    wikibase = None
    ibox = None
    try:
        ibox_json = wptools.page(page_title, lang='de').get_parse()
        # ibox_json.data => shows all available data
        if (ibox_json.data['wikibase'] is not None) and (ibox_json.data['infobox'] is not None):
            wikibase = ibox_json.data["wikibase"]
            ibox = ibox_json.data["infobox"]
            triples = []
            for key, value in ibox.items():
                val = remove_html_tags(value)
                if "_tabelle" not in key:
                    pred = re.sub("_", " ", str(key))
                    pred = re.sub("-", " ", str(pred))
                    if val != "":
                        triples.append((page_title, pred, val))
            # split comma-separated values into separate triples;
            # iterate over a copy so the list can be modified safely
            for triple in list(triples):
                if ", " in triple[2]:
                    pairs = triple[2].split(", ")
                    for value in pairs:
                        triples.append((triple[0], triple[1], value))
                    triples.remove(triple)
    except Exception:
        return wikibase, triples
    return wikibase, triples
def test_unknown_lang(self):
    """
    Mediawiki site function not supported
    """
    # "jp" Wikinews (unknown language code)
    b = wptools.page(wiki='jp.wikinews.org')
    self.assertTrue(b.fatal)
def _get_infobox_of_page(name, check_item, lang):
    try:
        if lang == 'es':
            opt = {
                'boxterm': 'Ficha',
                'skip': ['imageinfo'],
                'silent': True,
                'lang': lang,
            }
        else:
            opt = {'skip': ['imageinfo'], 'silent': True, 'lang': lang}

        page = wptools.page(name, **opt).get_parse()

        # search item must be in data
        search_str = str(page.data.get('wikitext', '')).lower()
        check_item_search = remove_most_common_endings(check_item).lower()
        name_search = remove_most_common_endings(name).lower()
        ctx_name = max((
            search_str.count(check_item_search),
            search_str.count(name_search),
        ))
        if name_search != check_item_search and ctx_name < 5:
            return None

        infobox = page.data['infobox']
        if infobox:
            infobox = {k.lower(): v for k, v in infobox.items()}
        return infobox
    except LookupError:
        return None
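# Usage sketch (assumed arguments, not from the original source):
# _get_infobox_of_page() lowercases the infobox keys and returns None when the
# check item does not appear often enough in the page wikitext.
if __name__ == "__main__":
    box = _get_infobox_of_page("Okapi", "Okapi", "es")
    if box:
        print(sorted(box.keys()))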
def test_core_get(self):
    page = wptools.page('TEST')
    try:
        page._get('TEST', False, None, 0)
        self.fail("failed to raise ValueError")
    except ValueError:
        pass
def test_boxterm(self):
    """
    Spanish taxobox not captured. Issue #91
    """
    page = wptools.page('Okapi', lang='es', boxterm='Ficha',
                        skip=['imageinfo'])
    page.get_parse(show=False)
    self.assertTrue(len(page.data['infobox']) > 0)
def test_get_rest(self):
    '''
    Get random RESTBase
    '''
    t = titles.title()
    r = wptools.page(t['title'], lang=t['lang'])
    r.get_rest()
    self.assertTrue(r.lead is not None)
def test_selected(self):
    """
    Test overall functionality from random i18n choice
    """
    title = titles.title()
    page = wptools.page(title['title'], lang=title['lang'])
    page.get(show=False)
    self.assertTrue(page.data['pageid'] is not None)
def test_wikidata_claims(self):
    '''
    Get wikidata claims
    '''
    p = wptools.page('Paris').get_wikidata(False)
    self.assertTrue('latitude' in p.wikidata['coordinates'])
    self.assertEqual(p.wikidata['country'], 'France')
    self.assertTrue(len(p.wikidata['instance']) > 3)
def test_wikidata_random(self):
    title = titles.title()
    page = wptools.page(title['title'], lang=title['lang'])
    page.get_query(show=False).get_wikidata()
    self.assertTrue(len(page.data['claims']) > 5)
    self.assertTrue(len(page.data['labels']) > 5)
    self.assertTrue(len(page.data['wikidata']) > 5)
def test_normalized_filename(self):
    """
    Ensure parse-image does not cause an infinite loop.

    The normalized API result image title did not match the original
    parse-image file name. Issue #93
    """
    page = wptools.page('Aphra Behn')
    page.get_parse(show=False)
    self.assertTrue(len(page.data['requests']) < 3)
def test_get_claims(self):
    page = wptools.page("test_get_claims")
    page.cache["wikidata"] = wikidata.cache
    page._set_wikidata()
    page.cache["claims"] = claims.cache
    page._set_claims_data()
    self.assertEqual(len(page.claims), 11)
    self.assertEqual(len(page.props), 10)
    self.assertEqual(str(page.what), "human")
    self.assertTrue("science" in page.wikidata["genre"].lower())
    self.assertTrue("Mostly Harmless" in page.wikidata["work"])
def test_get_imageinfo(self):
    page = wptools.page("test_get_imageinfo")
    page.images = [{"file": "Douglas adams portrait cropped.jpg",
                    "kind": "test"}]
    page.cache["imageinfo"] = imageinfo.cache
    page._set_imageinfo_data()
    image = page.images[0]
    self.assertTrue(image["file"].startswith("File:"))
    self.assertTrue("/c/c0/" in image["url"])
    self.assertTrue("/commons." in image["descriptionurl"])
    self.assertTrue(image["size"] > 1024)
    self.assertTrue(image["width"] > 240)
    self.assertTrue(image["height"] > 320)
def test_wikibase(self):
    """
    Get everything wikibase only
    """
    p = wptools.page(wikibase='Q43303').get_wikidata(False)
    self.assertEqual(p.title, 'Malcolm_X')
    self.assertEqual(p.what, 'human')
    self.assertEqual(p.wikibase, 'Q43303')
    self.assertTrue(p.label is not None)
    self.assertTrue(p.description is not None)
    self.assertTrue(p.images.pop()['file'] is not None)
    self.assertTrue(len(p.wikidata) > 5)
def test_get_parse(self):
    page = wptools.page("test_get_parse")
    page.cache["parse"] = parse.cache
    page._set_parse_data()
    self.assertEqual(len(page.infobox), 15)
    self.assertTrue("satire" in page.infobox["genre"])
    self.assertEqual(page.lang, "en")
    self.assertEqual(len(page.links), 2)
    self.assertEqual(page.pageid, 8091)
    self.assertTrue(len(page.parsetree) > 1024 * 64)
    self.assertEqual(str(page.title), "Douglas_Adams")
    self.assertEqual(str(page.wikibase), "Q42")
    self.assertTrue(page.wikidata_url.startswith("http"))
    self.assertTrue(len(page.wikitext) > 1024 * 64)
def test_get_wikidata(self):
    page = wptools.page("test_get_wikidata")
    page.cache["wikidata"] = wikidata.cache
    page._set_wikidata()
    self.assertEqual(len(page.claims), 11)
    self.assertEqual(page.description, "English writer and humorist")
    self.assertEqual(page.label, "Douglas Adams")
    self.assertEqual(page.images[0]["kind"], "wikidata-image")
    self.assertTrue("wikidata" in page.modified)
    self.assertEqual(len(page.props), 10)
    self.assertEqual(str(page.title), "Douglas_Adams")
    self.assertEqual(str(page.wikibase), "Q42")
    self.assertEqual(len(page.wikidata), 5)
    self.assertTrue(str(page.wikidata["birth"]).startswith("+1952"))
    self.assertTrue(page.wikidata_url.endswith("Q42"))
def test_get_rest(self):
    page = wptools.page("test_get_rest")
    page.cache["rest"] = rest.cache
    page._set_rest_data()
    self.assertEqual(page.description, "English writer and humorist")
    self.assertEqual(page.lang, "en")
    self.assertEqual(len(page.images), 2)
    self.assertEqual(page.image("image")["kind"], "rest-image")
    self.assertEqual(page.image("thumb")["kind"], "rest-thumb")
    self.assertTrue(len(page.lead) > 1024 * 3)
    self.assertTrue(page.lead.startswith("<span"))
    self.assertTrue("page" in page.modified)
    self.assertEqual(page.pageid, 8091)
    self.assertEqual(str(page.title), "Douglas_Adams")
    self.assertTrue(page.url.endswith("Adams"))
    self.assertTrue("Douglas_Adams" in page.url_raw)
def test_caching(self):
    abc = wptools.page("test_caching")
    abc.claims = {"Q1": "test"}
    abc.cache["claims"] = {"response"}
    abc.cache["imageinfo"] = {"response"}
    abc.images = [{"url": "URL"}]
    abc.cache["parse"] = {"response"}
    abc.cache["query"] = {"response"}
    abc.cache["rest"] = {"response"}
    abc.cache["wikidata"] = {"response"}
    abc.get_claims()
    abc.get_imageinfo()
    abc.get_parse()
    abc.get_query()
    abc.get_rest()
    abc.get_wikidata()
    self.assertTrue(not abc.pageid)
def test_get_query(self):
    page = wptools.page("test_get_query")
    page.cache["query"] = query.cache
    page._set_query_data()
    self.assertEqual(page.description, "English writer and humorist")
    self.assertTrue(page.extext.startswith("**Douglas"))
    self.assertTrue(page.extract.startswith("<p><b>Douglas"))
    self.assertTrue(len(page.images) > 1)
    self.assertEqual(page.label, "Douglas Adams")
    self.assertEqual(page.lang, "en")
    self.assertTrue("page" in page.modified)
    self.assertEqual(page.pageid, 8091)
    self.assertTrue(wptools.utils.is_text(page.random))
    self.assertEqual(page.title, "Douglas_Adams")
    self.assertTrue(page.url.endswith("Adams"))
    self.assertTrue("Douglas_Adams" in page.url_raw)
    self.assertEqual(str(page.wikibase), "Q42")
    self.assertTrue(page.wikidata_url.endswith("Q42"))
def get(args):
    """
    invoke wptools and assemble selected output
    """
    html = args.H
    lang = args.l
    nowrap = args.n
    query = args.q
    silent = args.s
    title = args.t
    verbose = args.v
    wiki = args.w

    start = time.time()

    if query:
        fetch = WPToolsFetch(lang=lang, verbose=verbose, wiki=wiki)
        if title:
            return fetch.query("query", title)
        return fetch.query("random", None)

    item = wptools.page(title, lang=lang, silent=silent,
                        verbose=verbose, wiki=wiki)
    item.get_query()

    if not hasattr(item, "extract") or not item.extract:
        return "NOT_FOUND"

    out = _item_text(item, nowrap)
    if html:
        out = _item_html(item)

    if not silent:
        print("%5.3f seconds" % (time.time() - start), file=sys.stderr)

    try:
        return out.encode("utf-8")
    except KeyError:
        return out
def get(args):
    """
    invoke wptools and assemble selected output
    """
    html = args.H
    lang = args.l
    nowrap = args.n
    query = args.q
    silent = args.s
    title = args.t
    verbose = args.v
    wiki = args.w

    if query:
        qobj = WPToolsQuery(lang=lang, wiki=wiki)
        if title:
            return qobj.query(title)
        return qobj.random()

    page = wptools.page(title, lang=lang, silent=silent,
                        verbose=verbose, wiki=wiki)

    try:
        page.get_query()
    except (StandardError, ValueError, LookupError):
        return "NOT_FOUND"

    if not page.data.get('extext'):
        out = page.cache['query']['query']

    out = _page_text(page, nowrap)
    if html:
        out = _page_html(page)

    try:
        return out.encode('utf-8')
    except KeyError:
        return out
def main(args):
    """
    GET top pages or random pages forever
    """
    delay = args.delay
    lang = args.lang
    top = args.top

    start = int(time.time())

    pages = ['forever']
    if top:
        pages = popular(lang)

    print_header(delay, lang, pages)

    try:
        count = 0
        requests = 0
        elapsed = 0
        while len(pages) > 0:
            language = lang or random.choice(languages())
            if top and not lang:
                language = 'en'

            page = wptools.page(lang=language, silent=True)
            if top:
                page = wptools.page(pages.pop(0), lang=language, silent=True)
            page.get()

            preview = page.data.get('extext')
            if preview:
                preview = preview.strip().replace("\n", '')[:64]
            url = page.data.get('url')

            elapsed = int(time.time()) - start
            count += 1
            nrq = len(page.data.get('requests'))
            requests += nrq

            rps = float(0)
            if elapsed > 0:
                rps = float(requests) / elapsed
            frps = '{:.1f}'.format(rps)

            print("[%d] %d %s %s" % (count, nrq, frps, url))
            print("%s %s" % (page.data.get('wikibase'), preview))

            time.sleep(delay)
    except KeyboardInterrupt:
        print("Done. %d requests %d seconds" % (requests, elapsed))
    except:
        page.flags['silent'] = False
        page.show()
        print("EXCEPTION %d requests %d seconds" % (requests, elapsed))
        raise
def test_mixed_lang(self):
    """
    Get mixed language
    """
    p = wptools.page('Abraham Lincoln', lang='zh').get_query(False)
    self.assertEqual(p.wikibase, 'Q91')
def test_wikidata_title(self):
    """
    Get wikidata from title only
    """
    w = wptools.page('Les Misérables').get_wikidata(False)
    self.assertTrue(w.wikibase is not None)
def test_random_wiki(self):
    """
    Get random title from random Wikimedia project
    """
    page = wptools.page(wiki=random.choice(WIKIS))
    self.assertTrue(page.data['pageid'] is not None)