Example #1
0
 def test_missing(self):
     """
     Fetching a nonexistent pageid must raise LookupError
     """
     try:
         wptools.page(pageid=1).get(False)
     except LookupError as err:
         print(err)
     else:
         self.fail("failed to raise LookupError")
Example #2
0
 def test_not_found(self):
     """
     A non-existent page title must raise LookupError
     """
     try:
         wptools.page(NOPAGE, silent=True).get(False)
     except LookupError:
         pass
     else:
         self.fail("failed to raise LookupError")
Example #3
0
 def test_lookup_unicode_error(self):
     """
     A missing unicode title must raise LookupError cleanly, without
     a secondary UnicodeDecodeError. Issue #29
     """
     try:
         wptools.page('Д北_TEST', silent=True).get(False)
     except LookupError:
         pass
     else:
         self.fail("failed to raise LookupError")
Example #4
0
 def test_lookup_unicode_error(self):
     """
     LookupError on a unicode title must not degrade into
     UnicodeDecodeError (issue 29)
     """
     try:
         wptools.page('阿Vane').get(False)  # issue 29
     except LookupError as err:
         print(err)
     else:
         self.fail("failed to raise LookupError")
Example #5
0
 def test_unknown_lang(self):
     """
     An unsupported Mediawiki site must raise LookupError
     """
     try:
         # "jp" is not a known Wikinews language code
         wptools.page(wiki='jp.wikinews.org')
     except LookupError as err:
         print(err)
     else:
         self.fail("failed to raise LookupError")
Example #6
0
    def test_page_get(self):
        """
        test page.get() without making any requests, is for coverage

        Disables the request layer and skips every request action so
        get() only exercises its own control flow.
        """
        wptools.request.WPToolsRequest.DISABLED = True
        # fix: 'wikidata' was misspelled 'wikiata', so the wikidata
        # action was never actually in the skip list
        skip = ['parse', 'query', 'restbase', 'wikidata']

        page = wptools.page('TEST', skip=skip)
        page.get()

        page = wptools.page('TEST', wikibase='TEST', skip=skip)
        page.get()
Example #7
0
    def test_core_init(self):
        """
        Constructor must record flags and request parameters as given.
        """
        p = wptools.page('TEST', skip='SKIP')
        self.assertEqual(
            p.flags, {'silent': False, 'skip': 'SKIP', 'verbose': False})

        p = wptools.page('TEST', variant='VARIANT')
        self.assertEqual(
            p.params, {'lang': 'en', 'title': 'TEST', 'variant': 'VARIANT'})

        p = wptools.page('TEST', wiki='WIKI')
        self.assertEqual(
            p.params, {'lang': 'en', 'title': 'TEST', 'wiki': 'WIKI'})
Example #8
0
    def test_page_init(self):
        """
        Page constructor must normalize params/flags and make no
        requests.
        """
        p = wptools.page('TEST', silent=True)
        self.assertEqual(p.params, {'lang': 'en', 'title': 'TEST'})
        self.assertEqual(p.flags, {'silent': True, 'verbose': False})

        p = wptools.page(pageid=123, silent=SILENT_FLAG)
        self.assertEqual(p.params, {'lang': 'en', 'pageid': 123})

        p = wptools.page(wikibase='Q42', silent=SILENT_FLAG)
        self.assertEqual(p.params, {'lang': 'en', 'wikibase': 'Q42'})

        self.assertTrue('requests' not in p.data)
Example #9
0
def wiki_test():
    """
    Smoke test: query the 'kakao' page via wptools (Korean and
    English editions) and print the representative page image.
    """
    # NOTE(review): the original passed action='kakao', which is not a
    # wptools.page() parameter, leaving the Korean lookup broken;
    # fetch by title instead. TODO confirm intended Korean title.
    kr = wptools.page('kakao', lang='ko').get_query()
    kakao = wptools.page('kakao').get_query()
    kakao_img = kakao.pageimage
    print(kakao_img)
Example #10
0
def executeAStar(pageA=None,
                 pageB=None,
                 printResult=True,
                 userheuristic=heuristicFunction,
                 random=True,
                 timeResult=False,
                 debug=True):
    """
    Run A* search between two wiki pages and return the found path.

    :param pageA: start page; defaults to "West Pullman, Chicago"
    :param pageB: goal page; defaults to 'Midwestern United States'
    :param userheuristic: heuristic function passed through to astar()
    :param debug: debug flag (see note below)
    :return: the path produced by astar()
    """
    # NOTE(review): this binds a *local* DEBUG with no effect outside
    # this function — confirm whether `global DEBUG` was intended.
    DEBUG = debug
    # Honor caller-supplied endpoints; previously pageA/pageB were
    # accepted but ignored in favor of the hard-coded pages.
    startPage = pageA or wiki.page("West Pullman, Chicago", silent=True)
    endPage = pageB or wiki.page('Midwestern United States', silent=True)

    path = astar(startPage, endPage, userheuristic)
    # return the result instead of silently dropping it
    return path
Example #11
0
def get_page(page_search):
    """
    Resolve a search term to a parsed wptools page.

    Tries each language edition in order; on a direct-title miss,
    falls back to a wikipedia search in that language. Returns the
    parsed page or None when nothing matches.
    """
    for code in ('en', 'de', 'es', 'fr'):
        try:
            return wptools.page(page_search, lang=code).get_parse()
        except LookupError:
            try:
                wikipedia.set_lang(code)
                hits = wikipedia.search(page_search)
                if hits:
                    return wptools.page(hits[0], lang=code).get_parse()
            except LookupError:
                print(f'no wiki page found for {page_search} lang {code}.')
    return None
Example #12
0
def get_wikidata(subject):
    """
    Fetch wikidata for *subject* and return a cleaned node dict.

    Wikidata keys and string values end in an "(PXXX)"/"(QXXX)" id
    token; clean_dict strips that final whitespace-separated token
    from every key and string value, recursing into lists and dicts.

    :param subject: page title to resolve via its wikibase id
    :return: dict with description, cleaned data fields and url
    """
    node_data = {}
    base = wptools.page(subject).get_parse().data["wikibase"]
    page = wptools.page(wikibase=base).get_wikidata().data

    # clean data (remove (PXXX) )
    data = page["wikidata"]

    def clean_dict(data=None):
        # Strip the trailing id token from keys and string values.
        data = data or {}
        clean = {}
        for key in dict(data):
            if not key:
                continue
            val = data[key]
            if isinstance(val, list):
                for idx, v in enumerate(val):
                    if isinstance(v, str):
                        v = " ".join(v.split(" ")[:-1])
                        val[idx] = v
                    if isinstance(v, dict):
                        v = clean_dict(dict(v))
                        val[idx] = v
                    if isinstance(v, list):
                        for idx, v1 in enumerate(v):
                            v[idx] = " ".join(v1.split(" ")[:-1])
                        val[idx] = v
            elif isinstance(val, dict):
                val = clean_dict(dict(val))
            # fix: non-string scalars (ints, floats, None) have no
            # .endswith and previously raised AttributeError here
            elif isinstance(val, str) and val.endswith(")"):
                val = " ".join(val.split(" ")[:-1])
            key = " ".join(key.split(" ")[:-1])
            if val and key:
                clean[key] = val
        return clean

    data = clean_dict(data)

    # parse for distant child of
    node_data["description"] = page["description"]
    # data fields
    for k in data:
        node_data[k] = data[k]
    # related to
    node_data["url"] = page["wikidata_url"]
    return node_data
Example #13
0
 def test_random(self):
     """
     A page created with no arguments must land on a random article
     """
     rand = wptools.page()
     self.assertIsNotNone(rand.pageid)
     self.assertIsNotNone(rand.title)
Example #14
0
 def test_random(self):
     """
     Fetch a random title from a randomly chosen language wiki
     """
     p = wptools.page(lang=random.choice(LANG))
     p.get(show=False)
     self.assertIsNotNone(p.data['pageid'])
Example #15
0
 def test_random(self):
     """
     Get a random title

     A page created with no arguments fetches a random article; both
     pageid and title must be populated afterwards.
     """
     r = wptools.page()
     self.assertTrue(r.pageid is not None)
     self.assertTrue(r.title is not None)
Example #16
0
 def test_selected(self):
     """
     A curated title/lang pair must resolve to a real pageid
     """
     choice = titles.title()
     page = wptools.page(choice['title'], lang=choice['lang']).get_query(False)
     self.assertIsNotNone(page.pageid)
Example #17
0
 def test_selected(self):
     """
     Get selected title

     titles.title() yields a known-good {'title', 'lang'} pair; a
     query for it must populate pageid.
     """
     t = titles.title()
     p = wptools.page(t['title'], lang=t['lang']).get_query(False)
     self.assertTrue(p.pageid is not None)
Example #18
0
    def __init__(self, title, lang, silent=False):
        """
        Build a page descriptor from a wiki title.

        Fetches the page with wptools, parses its wikitext, then
        classifies it as a radio-station page (an infobox is found)
        or a listing page (at least one table of radios is found).

        :param title: wiki page title to fetch
        :param lang: language code passed to wptools
        :param silent: suppress wptools console output
        :raises PageNotExists: when the title cannot be found
        :raises PageError: when the page is neither a radio station
            nor a listing
        """
        self.title = title
        # Fetch page from wikipedia
        try:
            self._page = wptools.page(title, silent=silent, lang=lang).get()
            logger.info("Wiki parse {}".format(self.url))
        except LookupError:
            err_msg = f"\"{title}\" not found"
            logger.warning(err_msg)
            raise PageNotExists(err_msg)

        # Parse the raw wikitext into an AST for section/table scans
        self.ast = wtp.parse(self._page.data['wikitext'])
        self.type = None

        # Detect radio pages by searching a radio infobox
        self.infobox = self.radio_site = None
        self.find_infobox()
        if self.infobox:
            self.type = PageInfo.RADIO
            return None

        # Detect page with lists of radios:
        # retrieve all sections delimited with an h2 (level 0 or 2)
        self.sections = [section for section in self.ast.sections \
            if section.level in [0, 2] and self.allowed_section(section)]

        # Get at least one table with radio listed
        self.manage_datatype()

        if self.have_table:
            self.type = PageInfo.LIST
            return None

        err_msg = "{} : Invalid url, not a radio station or listing"
        raise PageError(err_msg.format(self.url))
Example #19
0
    def test_page_query(self):
        """
        _query() must build the proper API query string for each
        supported action without issuing any request.
        """
        page = wptools.page('TEST')
        qobj = wptools.query.WPToolsQuery()

        # random: list=random generator
        qstr = page._query('random', qobj)
        self.assertTrue('list=random' in qstr)

        qstr = page._query('query', qobj)
        self.assertTrue('action=query' in qstr)
        self.assertTrue('pageprops' in qstr)

        # querymore: continuation query with raised list limits
        qstr = page._query('querymore', qobj)
        self.assertTrue('action=query' in qstr)
        self.assertTrue('&cllimit=500' in qstr)
        self.assertTrue('&imlimit=500&lllimit=500&pclimit=500' in qstr)

        qstr = page._query('parse', qobj)
        self.assertTrue('action=parse' in qstr)
        self.assertTrue('parsetree' in qstr)

        # labels: entity ids are taken from page.data['entities']
        page.data['entities'] = ['Q1', 'Q2', 'Q3']
        qstr = page._query('labels', qobj)
        self.assertTrue('action=wbgetentities' in qstr)
        self.assertTrue('ids=Q1|Q2|Q3' in qstr)

        qstr = page._query('wikidata', qobj)
        self.assertTrue('action=wbgetentities' in qstr)

        # restbase needs an endpoint parameter
        page.params.update({'endpoint': '/page/summary/'})
        qstr = page._query('restbase', qobj)
        self.assertTrue('api/rest' in qstr)

        # building query strings must not count as requests made
        self.assertTrue('requests' not in page.data)
Example #20
0
 def test_complex_infobox(self):
     """
     A large infobox (Aung San Suu Kyi) must parse fully and
     without errors
     """
     page = wptools.page('Aung San Suu Kyi').get_parse(False)
     self.assertGreaterEqual(len(page.infobox), 32)
     self.assertNotIn('errors', page.infobox)
Example #21
0
    def test_page_get_imageinfo(self):
        """
        Imageinfo requires a known image list; with a cached
        response the image entry must be normalized and enriched.
        """
        page = wptools.page('TEST', silent=SILENT_FLAG)

        # no images known yet: asking for imageinfo is an error
        self.assertRaises(ValueError, page.get_imageinfo)

        page.cache = {'imageinfo': imageinfo.cache}
        page.data['image'] = [{
            'kind': 'parse-image',
            'file': 'Douglas adams portrait cropped.jpg'
        }]

        # normalization prefixes 'File:' and URL-encodes it in the query
        page._normalize_images()
        query = page._query('imageinfo', wptools.query.WPToolsQuery())
        self.assertTrue('File%3ADouglas' in query)

        page._set_data('imageinfo')
        image = page.data['image'][0]
        self.assertTrue('/c/c0/' in image['url'])
        self.assertTrue('/commons.' in image['descriptionurl'])
        self.assertTrue(image['file'].startswith('File:'))
        self.assertEqual(image['height'], 386)
        self.assertEqual(image['size'], 32915)
        self.assertEqual(image['width'], 333)

        self.assertTrue('requests' not in page.data)
Example #22
0
 def test_complex_infobox(self):
     """
     Successfully populate complex infobox dict

     A large infobox (Aung San Suu Kyi) must yield at least 32
     fields and contain no parse errors.
     """
     p = wptools.page('Aung San Suu Kyi').get_parse(False)
     self.assertGreaterEqual(len(p.infobox), 32)
     self.assertTrue('errors' not in p.infobox)
Example #23
0
def download_images(searchquery):
    """
    Download the representative image for *searchquery* (German
    Wikipedia) into ~/learny/learny/ and print response metadata.
    """
    import wptools
    # https://github.com/siznax/wptools/wiki/Examples#get-a-representative-image
    import requests
    import os

    page = wptools.page(searchquery, lang="de")
    page.get_query()
    images = page.images(['url', 'file'])
    first = images[0]
    print(first['url'], "Dateiname: ", first['file'])

    url = first['url']
    file = first['file']
    print(file)
    response = requests.get(url)
    target_dir = os.path.join(os.path.expanduser('~'), 'learny', 'learny')
    save_path = os.path.join(target_dir, f"{searchquery}-{file[5:]}")

    with open(save_path, 'wb') as f:
        f.write(response.content)

    # Retrieve HTTP meta-data
    print(response.status_code)
    print(response.headers['content-type'])
    print(response.encoding)
Example #24
0
def scrape_stadium_manually(OPTAvenueName, alternativeName, overwrite=False):
    """
    Resolve a stadium by an alternative name and store its geo and
    wiki data in the database.

    :param OPTAvenueName: canonical venue name used as the db key
    :param alternativeName: search term for wikipedia/geocoding
    :param overwrite: re-scrape even when the venue already exists
    :return: True when the venue was already stored, otherwise None
    """
    print('Searching for {}'.format(OPTAvenueName))
    if stadiums.find_one({'venueName': OPTAvenueName}) and not overwrite:
        print('Already in db')
        return True

    # most popular wikipedia hit for "<name> stadium" assumed correct
    wiki_name = wikipedia.search(html.unescape(alternativeName + ' stadium'))
    geocode_result = gmaps.geocode(alternativeName)
    if geocode_result:
        elevation_result = gmaps.elevation(
            convert.normalize_lat_lng(
                geocode_result[0]['geometry']['location']))
        stadium_data_to_insert = {
            'venueName': OPTAvenueName,
            'location_data': geocode_result[0],
            'altitude_data': elevation_result[0]
        }
        # upsert keyed on venueName
        stadiums.replace_one({'venueName': OPTAvenueName},
                             stadium_data_to_insert,
                             upsert=True)
    if wiki_name:
        wiki_page = wptools.page(wiki_name[0], silent=True)
        wiki_page.get_parse()
        wiki_data = wiki_page.data['infobox']
        wiki_data_to_insert = {
            'venueName': OPTAvenueName,
            'wiki_name': wiki_name[0],
            'wiki_data': wiki_data
        }
        wiki.replace_one({'venueName': OPTAvenueName},
                         wiki_data_to_insert,
                         upsert=True)
Example #25
0
    def test_page_get_wikidata(self):
        """
        Cached wikidata plus three label batches plus imageinfo must
        fully resolve claims, labels and the wikidata image.
        """
        page = wptools.page('TEST',
                            wikibase='WIKIBASE',
                            skip=SKIP_FLAG,
                            silent=SILENT_FLAG)

        page.cache = {'wikidata': wikidata.cache}
        page._set_data('wikidata')

        # labels arrive in batches (API limit); apply all three
        page.cache['labels'] = labels_1.cache
        page._set_data('labels')

        page.cache['labels'] = labels_2.cache
        page._set_data('labels')

        page.cache['labels'] = labels_3.cache
        page._set_data('labels')

        page._post_labels_updates()

        page.cache['imageinfo'] = imageinfo.cache
        page._set_data('imageinfo')

        data = page.data
        self.assertEqual(data['wikibase'], 'Q42')
        self.assertEqual(data['image'][0]['kind'], 'wikidata-image')
        self.assertEqual(len(data['claims']), 102)
        self.assertEqual(len(data['labels']), 147)
        self.assertEqual(len(data['wikidata']), 102)

        self.assertTrue('requests' not in page.data)
Example #26
0
def fetch_wikipedia(qid, lang, normalized):
    """Look up the Wikipedia page for the given QID and language.

    Returns a dict with infobox, wikidata, title and url, or an
    empty dict when the lookup fails.
    """
    message = 'fetching "{}" Infobox/Wikidata for entity: {} ({}) ...'
    print(message.format(lang, qid, normalized), file=sys.stderr)
    try:
        page = wptools.page(wikibase=qid, lang=lang, silent=True).get()
        title = page.data["title"]
        return {
            'infobox': get_infobox(page),
            'wikidata': page.data["wikidata"],
            'title': title,
            'url': 'https://{}.wikipedia.org/wiki/{}'.format(lang, title)
        }
    except LookupError:
        print(
            'No page exists for {} on {}.wikipedia.org'.format(qid, lang),
            file=sys.stderr
        )
        # If the lookup fails, just return an empty dict
        return {}
Example #27
0
 def test_page_get_random(self):
     """
     A cached 'random' response must populate pageid and title.
     """
     p = wptools.page('TEST', skip=['imageinfo'], silent=True)
     p.cache = {'random': query.cache}
     p._set_data('random')
     p.get_random()
     self.assertEqual(p.data['pageid'], 45564415)
     self.assertEqual(p.data['title'], '1990 NBL Finals')
Example #28
0
 def test_random(self):
     """
     Get random title from random language wiki

     get() on a page built with only a language must fetch a random
     article and populate its pageid.
     """
     page = wptools.page(lang=random.choice(LANG))
     page.get(show=False)
     self.assertTrue(page.data['pageid'] is not None)
Example #29
0
def main(delay=1):
    """
    GET random pages forever

    Endless loop: pick a random language, fetch a random page, print
    its URL, wikibase id and a 72-char extract preview, then sleep
    for *delay* seconds.
    """

    print("%d languages" % len(LANGUAGES))

    start = int(time.time())
    count = 0
    while True:
        count += 1
        elapsed = int(time.time()) - start

        page = wptools.page(lang=random.choice(LANGUAGES), silent=True)
        page.get()

        print("[%d](%d) %s" % (count, elapsed, page.data.get('url')))

        preview = page.data.get('extext')
        if preview:
            preview = preview.strip().replace("\n", '')[:72]

        print("  %s %s" % (page.data.get('wikibase'), preview))

        time.sleep(delay)
Example #30
0
    def test_page_query(self):
        """
        _query() must build the proper API string for every action,
        including claims built from page.data['claims'] keys.
        """
        page = wptools.page('TEST')

        qobj = wptools.query.WPToolsQuery()

        qstr = page._query('random', qobj)
        self.assertTrue('list=random' in qstr)

        qstr = page._query('query', qobj)
        self.assertTrue('action=query' in qstr)
        self.assertTrue('pageprops' in qstr)

        # querymore: continuation query with raised list limits
        qstr = page._query('querymore', qobj)
        self.assertTrue('action=query' in qstr)
        self.assertTrue('&cllimit=500' in qstr)
        self.assertTrue('&imlimit=500&lllimit=500&pclimit=500' in qstr)

        qstr = page._query('parse', qobj)
        self.assertTrue('action=parse' in qstr)
        self.assertTrue('parsetree' in qstr)

        # claims ids come from page.data['claims'] keys
        page.data['claims'] = {'Q0': 'TEST'}
        qstr = page._query('claims', qobj)
        self.assertTrue('action=wbgetentities' in qstr)
        self.assertTrue('ids=Q0' in qstr)

        qstr = page._query('wikidata', qobj)
        self.assertTrue('action=wbgetentities' in qstr)

        qstr = page._query('restbase', qobj)
        self.assertTrue('api/rest' in qstr)
Example #31
0
def getWiki(search, artist):
    """
    Look up *search* on Wikipedia and extract a location for *artist*.

    Checks the page infobox for 'birth_place', then 'hometown', then
    'origin'; the first value found has its wiki-link brackets
    stripped, is stored via updateArtist() and returned. Returns
    False when no location field exists or the lookup fails.
    """
    print("Search Term: " + search)
    try:
        page = wptools.page(search).get_parse()
        infobox = page.data['infobox']
        if ('birth_place' in infobox):
            birth_place = infobox['birth_place']
            birth_place = birth_place.replace("[", "").replace("]", "")
            updateArtist(birth_place, artist)
            return birth_place
        if ('hometown' in infobox):
            print("hometown found")
            hometown = infobox['hometown']
            hometown = hometown.replace("[", "").replace("]", "")
            updateArtist(hometown, artist)
            return hometown
        if ('origin' in infobox):
            print("origin found")
            origin = infobox['origin']
            origin = origin.replace("[", "").replace("]", "")
            updateArtist(origin, artist)
            return origin
        return False
    except Exception:
        # fix: was a bare except, which also swallowed SystemExit and
        # KeyboardInterrupt; still best-effort, returns False
        return False
def wiki_infobox(text):
    """
    Return the parsed infobox for page *text*, or {} on any failure.
    """
    try:
        page = wptools.page(text, silent=True).get_parse()
        infobox = page.data['infobox']
    except Exception:
        # fix: narrowed from a bare except, which also trapped
        # KeyboardInterrupt/SystemExit; lookup is still best-effort
        infobox = {}
    return infobox
Example #33
0
def scrape(stadium, overwrite=False):
    """
    Resolve *stadium* via wikipedia/geocoding and store its geo and
    wiki data in the database.

    :param stadium: stadium name used both as search term and db key
    :param overwrite: accepted but unused here — NOTE(review):
        unlike scrape_stadium_manually, the early-exit check ignores
        it; confirm intent
    :return: True when the venue was already stored, otherwise None
    """
    print('Searching for {}'.format(stadium))
    if stadiums.find_one({'venueName': stadium}):
        print('Already in db')
        return True

    # Add string stadium to name of the stadium for wikipedia search, take most popular result as correct
    wiki_name = wikipedia.search(html.unescape(stadium + ' stadium'))
    geocode_result = gmaps.geocode(stadium)
    if geocode_result:
        elevation_result = gmaps.elevation(
            convert.normalize_lat_lng(
                geocode_result[0]['geometry']['location']))
        stadium_data_to_insert = {
            'venueName': stadium,
            'location_data': geocode_result[0],
            'altitude_data': elevation_result[0]
        }
        # upsert keyed on venueName
        stadiums.replace_one({'venueName': stadium},
                             stadium_data_to_insert,
                             upsert=True)
    if wiki_name:
        wiki_page = wptools.page(wiki_name[0], silent=True)
        wiki_page.get_parse()
        wiki_data = wiki_page.data['infobox']
        wiki_data_to_insert = {
            'venueName': stadium,
            'wiki_name': wiki_name[0],
            'wiki_data': wiki_data
        }
        wiki.replace_one({'venueName': stadium},
                         wiki_data_to_insert,
                         upsert=True)
Example #34
0
 def test_disambiguation_wikibase(self):
     """
     A specific wikibase id must resolve to the expected
     (unambiguous) pageid
     """
     page = wptools.page(wikibase='Q528917')
     page.get_wikidata(False).get_query(False)
     self.assertEqual(page.pageid, 20974062)
Example #35
0
def infoall(title):
    """
    Fetch and collect the infobox of every page title given.

    :param title: sequence of page titles
    :return: list of infobox dicts, one per title
    """
    infobox = []
    for i, name in enumerate(title):
        page = wptools.page(name).get_parse()
        infobox.append(page.data['infobox'])
        # fix: the progress line was indented outside the loop and
        # printed only once, after all titles were processed
        print("Current progress", np.round((i + 1) / len(title) * 100, 2), "%")
    return infobox
Example #36
0
def wiki_infobox_extractor(page_title=None):
    """ Crawls the infobox values by title and returns all triples.
    Args:
        :param page_title: title to crawl (German Wikipedia).
    Returns:
        :return: tuple (wikibase, triples); triples is a list of
            (title, predicate, value). On any error the values
            collected so far are returned.
    """
    triples = []
    wikibase = None
    try:
        ibox_json = wptools.page(page_title, lang='de').get_parse()
        # ibox_json.data => shows all available data
        if ((ibox_json.data['wikibase'] != None)
                and (ibox_json.data['infobox'] != None)):
            wikibase = ibox_json.data["wikibase"]
            ibox = ibox_json.data["infobox"]
            triples = []
            for key, value in ibox.items():
                val = remove_html_tags(value)
                if not "_tabelle" in key:
                    pred = re.sub("_", " ", str(key))
                    pred = re.sub("-", " ", str(pred))
                    if val != "":
                        triples.append((page_title, pred, val))
            # fix: iterate over a snapshot — the original appended to
            # and removed from `triples` while iterating it, which
            # skips elements and can misbehave
            for triple in list(triples):
                if ", " in triple[2]:
                    pairs = triple[2].split(", ")
                    for value in pairs:
                        triples.append((triple[0], triple[1], value))
                    triples.remove(triple)
    except Exception:
        # fix: narrowed from a bare except (trapped KeyboardInterrupt)
        return wikibase, triples
    return wikibase, triples
Example #37
0
 def test_unknown_lang(self):
     """
     Mediawiki site function not supported

     An unknown language wiki must leave the page object in a fatal
     state rather than raising here.
     """
     # "jp" Wikinews (unknown language code)
     b = wptools.page(wiki='jp.wikinews.org')
     self.assertTrue(b.fatal)
Example #38
0
def _get_infobox_of_page(name, check_item, lang):
    """
    Fetch the infobox of page *name* with lower-cased keys, or None.

    Relevance guard against homonym pages: unless the page wikitext
    mentions the normalized *name* or *check_item* at least 5 times,
    or the two normalize to the same string, None is returned.

    :param name: page title to fetch
    :param check_item: item the page must actually be about
    :param lang: wiki language code
    :return: dict with lower-cased keys, the raw falsy infobox, or
        None when the page is missing or fails the relevance check
    """
    try:
        if lang == 'es':
            # Spanish wikis label infoboxes 'Ficha'
            opt = {
                'boxterm': 'Ficha',
                'skip': ['imageinfo'],
                'silent': True,
                'lang': lang,
            }
        else:
            opt = {'skip': ['imageinfo'], 'silent': True, 'lang': lang}
        page = wptools.page(name, **opt).get_parse()
        # search item must be in data
        search_str = str(page.data.get('wikitext', '')).lower()
        check_item_search = remove_most_common_endings(check_item).lower()
        name_search = remove_most_common_endings(name).lower()
        # best mention count of either normalized term
        ctx_name = max((
            search_str.count(check_item_search),
            search_str.count(name_search),
        ))
        if name_search != check_item_search and ctx_name < 5:
            return None
        infobox = page.data['infobox']
        if infobox:
            infobox = {k.lower(): v for k, v in infobox.items()}
        return infobox
    except LookupError:
        return None
Example #39
0
 def test_disambiguation_wikibase(self):
     """
     Get an unambiguous page by wikibase

     Wikidata then query lookups for Q528917 must resolve to the
     expected pageid.
     """
     p = wptools.page(wikibase='Q528917')
     p.get_wikidata(False).get_query(False)
     self.assertTrue(p.pageid == 20974062)
Example #40
0
 def test_core_get(self):
     """
     _get() must reject invalid arguments with ValueError
     """
     page = wptools.page('TEST')
     try:
         page._get('TEST', False, None, 0)
     except ValueError:
         pass
     else:
         self.fail("failed to raise ValueError")
Example #41
0
 def test_boxterm(self):
     """
     The boxterm param must capture Spanish 'Ficha' taxoboxes
     (issue #91)
     """
     okapi = wptools.page('Okapi', lang='es', boxterm='Ficha',
                          skip=['imageinfo'])
     okapi.get_parse(show=False)
     self.assertTrue(len(okapi.data['infobox']) > 0)
Example #42
0
 def test_get_rest(self):
     '''
     A RESTBase fetch of a selected title must populate lead
     '''
     choice = titles.title()
     page = wptools.page(choice['title'], lang=choice['lang'])
     page.get_rest()
     self.assertIsNotNone(page.lead)
Example #43
0
 def test_selected(self):
     """
     Full get() on a selected i18n title must yield a pageid
     """
     choice = titles.title()
     page = wptools.page(choice['title'], lang=choice['lang'])
     page.get(show=False)
     self.assertIsNotNone(page.data['pageid'])
Example #44
0
 def test_wikidata_claims(self):
     '''
     Wikidata claims for Paris must include coordinates, country
     and several instance values
     '''
     paris = wptools.page('Paris').get_wikidata(False)
     self.assertIn('latitude', paris.wikidata['coordinates'])
     self.assertEqual(paris.wikidata['country'], 'France')
     self.assertTrue(len(paris.wikidata['instance']) > 3)
Example #45
0
 def test_wikidata_random(self):
     """
     Wikidata for a selected title must carry claims, labels and
     resolved wikidata entries.
     """
     title = titles.title()
     page = wptools.page(title['title'], lang=title['lang'])
     # removed unused `from pprint import pprint` debug leftover
     page.get_query(show=False).get_wikidata()
     self.assertTrue(len(page.data['claims']) > 5)
     self.assertTrue(len(page.data['labels']) > 5)
     self.assertTrue(len(page.data['wikidata']) > 5)
Example #46
0
 def test_normalized_filename(self):
     """
     Ensure parse-image does not cause an infinite loop when the
     normalized API image title differs from the original
     parse-image file name. Issue #93
     """
     page = wptools.page('Aphra Behn')
     page.get_parse(show=False)
     self.assertLess(len(page.data['requests']), 3)
Example #47
0
 def test_get_claims(self):
     """
     Resolved claims must populate claims/props and classify the
     cached page correctly.
     """
     page = wptools.page("test_get_claims")
     page.cache["wikidata"] = wikidata.cache
     page._set_wikidata()
     page.cache["claims"] = claims.cache
     page._set_claims_data()
     self.assertEqual(len(page.claims), 11)
     self.assertEqual(len(page.props), 10)
     # fix: was assertTrue(str(page.what), "human"), which always
     # passes — the second argument is the failure message, not an
     # expected value
     self.assertEqual(str(page.what), "human")
     self.assertTrue("science" in page.wikidata["genre"].lower())
     self.assertTrue("Mostly Harmless" in page.wikidata["work"])
Example #48
0
 def test_get_imageinfo(self):
     """
     Cached imageinfo must normalize the file name and fill in url,
     size and dimensions.
     """
     page = wptools.page("test_get_imageinfo")
     page.images = [{"file": "Douglas adams portrait cropped.jpg", "kind": "test"}]
     page.cache["imageinfo"] = imageinfo.cache
     page._set_imageinfo_data()
     image = page.images[0]
     self.assertTrue(image["file"].startswith("File:"))
     self.assertIn("/c/c0/", image["url"])
     self.assertIn("/commons.", image["descriptionurl"])
     self.assertGreater(image["size"], 1024)
     self.assertGreater(image["width"], 240)
     self.assertGreater(image["height"], 320)
Example #49
0
 def test_wikibase(self):
     """
     A wikidata-only fetch by wikibase id must fully populate the
     page attributes
     """
     malcolm = wptools.page(wikibase='Q43303').get_wikidata(False)
     self.assertEqual(malcolm.title, 'Malcolm_X')
     self.assertEqual(malcolm.what, 'human')
     self.assertEqual(malcolm.wikibase, 'Q43303')
     self.assertIsNotNone(malcolm.label)
     self.assertIsNotNone(malcolm.description)
     self.assertIsNotNone(malcolm.images.pop()['file'])
     self.assertTrue(len(malcolm.wikidata) > 5)
Example #50
0
 def test_get_parse(self):
     """
     A cached parse response must populate infobox, links, pageid,
     parsetree, title, wikibase and wikitext.
     """
     page = wptools.page("test_get_parse")
     page.cache["parse"] = parse.cache
     page._set_parse_data()
     self.assertEqual(len(page.infobox), 15)
     self.assertTrue("satire" in page.infobox["genre"])
     self.assertEqual(page.lang, "en")
     self.assertEqual(len(page.links), 2)
     self.assertEqual(page.pageid, 8091)
     self.assertTrue(len(page.parsetree) > 1024 * 64)
     self.assertEqual(str(page.title), "Douglas_Adams")
     self.assertEqual(str(page.wikibase), "Q42")
     self.assertTrue(page.wikidata_url.startswith("http"))
     self.assertTrue(len(page.wikitext) > 1024 * 64)
Example #51
0
 def test_get_wikidata(self):
     """
     A cached wikidata response must populate claims, description,
     label, images, props and the wikidata dict.
     """
     page = wptools.page("test_get_wikidata")
     page.cache["wikidata"] = wikidata.cache
     page._set_wikidata()
     self.assertEqual(len(page.claims), 11)
     self.assertEqual(page.description, "English writer and humorist")
     self.assertEqual(page.label, "Douglas Adams")
     self.assertEqual(page.images[0]["kind"], "wikidata-image")
     self.assertTrue("wikidata" in page.modified)
     self.assertEqual(len(page.props), 10)
     self.assertEqual(str(page.title), "Douglas_Adams")
     self.assertEqual(str(page.wikibase), "Q42")
     self.assertEqual(len(page.wikidata), 5)
     self.assertTrue(str(page.wikidata["birth"]).startswith("+1952"))
     self.assertTrue(page.wikidata_url.endswith("Q42"))
Example #52
0
 def test_get_rest(self):
     """
     A cached RESTBase response must populate description, images,
     lead, pageid, title and urls.
     """
     page = wptools.page("test_get_rest")
     page.cache["rest"] = rest.cache
     page._set_rest_data()
     self.assertEqual(page.description, "English writer and humorist")
     self.assertEqual(page.lang, "en")
     self.assertEqual(len(page.images), 2)
     self.assertEqual(page.image("image")["kind"], "rest-image")
     self.assertEqual(page.image("thumb")["kind"], "rest-thumb")
     self.assertTrue(len(page.lead) > 1024 * 3)
     self.assertTrue(page.lead.startswith("<span"))
     self.assertTrue("page" in page.modified)
     self.assertEqual(page.pageid, 8091)
     self.assertEqual(str(page.title), "Douglas_Adams")
     self.assertTrue(page.url.endswith("Adams"))
     self.assertTrue("Douglas_Adams" in page.url_raw)
Example #53
0
 def test_caching(self):
     """
     get_*() must short-circuit when a cached response exists:
     nothing is fetched, so pageid stays unset.
     """
     page = wptools.page("test_caching")
     page.claims = {"Q1": "test"}
     page.images = [{"url": "URL"}]
     for action in ("claims", "imageinfo", "parse", "query", "rest",
                    "wikidata"):
         page.cache[action] = {"response"}
     page.get_claims()
     page.get_imageinfo()
     page.get_parse()
     page.get_query()
     page.get_rest()
     page.get_wikidata()
     self.assertTrue(not page.pageid)
Example #54
0
 def test_get_query(self):
     """
     A cached query response must populate description, extracts,
     images, label, pageid, random, title and urls.
     """
     page = wptools.page("test_get_query")
     page.cache["query"] = query.cache
     page._set_query_data()
     self.assertEqual(page.description, "English writer and humorist")
     self.assertTrue(page.extext.startswith("**Douglas"))
     self.assertTrue(page.extract.startswith("<p><b>Douglas"))
     self.assertTrue(len(page.images) > 1)
     self.assertEqual(page.label, "Douglas Adams")
     self.assertEqual(page.lang, "en")
     self.assertTrue("page" in page.modified)
     self.assertEqual(page.pageid, 8091)
     self.assertTrue(wptools.utils.is_text(page.random))
     self.assertEqual(page.title, "Douglas_Adams")
     self.assertTrue(page.url.endswith("Adams"))
     self.assertTrue("Douglas_Adams" in page.url_raw)
     self.assertEqual(str(page.wikibase), "Q42")
     self.assertTrue(page.wikidata_url.endswith("Q42"))
Example #55
0
def get(args):
    """
    invoke wptools and assemble selected output

    Reads the parsed CLI namespace: H=html, l=lang, n=nowrap,
    q=query-only, s=silent, t=title, v=verbose, w=wiki.

    :return: the raw query string (when -q), "NOT_FOUND" when the
        page has no extract, else the rendered text/html output
        (utf-8 encoded when possible)
    """

    html = args.H
    lang = args.l
    nowrap = args.n
    query = args.q
    silent = args.s
    title = args.t
    verbose = args.v
    wiki = args.w

    start = time.time()

    # -q: emit the query string only, no fetch
    if query:
        fetch = WPToolsFetch(lang=lang, verbose=verbose, wiki=wiki)
        if title:
            return fetch.query("query", title)
        return fetch.query("random", None)

    item = wptools.page(title, lang=lang, silent=silent, verbose=verbose, wiki=wiki)
    item.get_query()

    if not hasattr(item, "extract") or not item.extract:
        return "NOT_FOUND"

    out = _item_text(item, nowrap)
    if html:
        out = _item_html(item)

    if not silent:
        print("%5.3f seconds" % (time.time() - start), file=sys.stderr)

    try:
        return out.encode("utf-8")
    except KeyError:
        # NOTE(review): encode() raises UnicodeError, not KeyError —
        # this handler looks unreachable; confirm intent (py2 legacy?)
        return out
Example #56
0
def get(args):
    """
    invoke wptools and assemble selected output

    Reads the parsed CLI namespace: H=html, l=lang, n=nowrap,
    q=query-only, s=silent, t=title, v=verbose, w=wiki.

    :return: the raw query string (when -q), "NOT_FOUND" on lookup
        failure, else the rendered text/html output
    """

    html = args.H
    lang = args.l
    nowrap = args.n
    query = args.q
    silent = args.s
    title = args.t
    verbose = args.v
    wiki = args.w

    # -q: emit the query string only, no fetch
    if query:
        qobj = WPToolsQuery(lang=lang, wiki=wiki)
        if title:
            return qobj.query(title)
        return qobj.random()

    page = wptools.page(title, lang=lang, silent=silent,
                        verbose=verbose, wiki=wiki)

    try:
        page.get_query()
    # NOTE(review): StandardError is Python 2 only — this except
    # clause raises NameError on Python 3; confirm target version
    except (StandardError, ValueError, LookupError):
        return "NOT_FOUND"

    # NOTE(review): this assignment is unconditionally overwritten
    # just below — dead store or a missing early return; confirm
    if not page.data.get('extext'):
        out = page.cache['query']['query']

    out = _page_text(page, nowrap)
    if html:
        out = _page_html(page)

    try:
        return out.encode('utf-8')
    except KeyError:
        # NOTE(review): encode() raises UnicodeError, not KeyError —
        # likely unreachable; confirm intent
        return out
Example #57
0
def main(args):
    """
    GET top pages or random pages forever

    With --top, works through the popular-pages list once; otherwise
    fetches random pages from random languages until interrupted.
    Prints per-page request counts, requests-per-second, URL,
    wikibase id and an extract preview.

    :param args: namespace with delay, lang and top attributes
    """
    delay = args.delay
    lang = args.lang
    top = args.top

    start = int(time.time())

    # sentinel list keeps the while-loop running in "forever" mode
    pages = ['forever']
    if top:
        pages = popular(lang)

    print_header(delay, lang, pages)

    try:
        count = 0
        requests = 0
        elapsed = 0

        while len(pages) > 0:
            language = lang or random.choice(languages())
            if top and not lang:
                language = 'en'

            page = wptools.page(lang=language, silent=True)
            if top:
                page = wptools.page(pages.pop(0), lang=language, silent=True)

            page.get()

            preview = page.data.get('extext')
            if preview:
                preview = preview.strip().replace("\n", '')[:64]

            url = page.data.get('url')

            elapsed = int(time.time()) - start

            count += 1
            # NOTE(review): len() raises TypeError if 'requests' is
            # missing from page.data — confirm it is always set
            nrq = len(page.data.get('requests'))
            requests += nrq
            rps = float(0)
            if elapsed > 0:
                rps = float(requests) / elapsed
            frps = '{:.1f}'.format(rps)

            print("[%d] %d %s %s" % (count, nrq, frps, url))
            print("%s %s" % (page.data.get('wikibase'), preview))

            time.sleep(delay)

    except KeyboardInterrupt:
        print("Done. %d requests %d seconds" % (requests, elapsed))

    # debugging aid: dump the failing page, report, then re-raise
    except:
        page.flags['silent'] = False
        page.show()
        print("EXCEPTION %d requests %d seconds" % (requests, elapsed))
        raise
Example #58
0
 def test_mixed_lang(self):
     """
     An English title queried against the zh wiki must still resolve
     to the correct wikibase item
     """
     lincoln = wptools.page('Abraham Lincoln', lang='zh').get_query(False)
     self.assertEqual(lincoln.wikibase, 'Q91')
Example #59
0
 def test_wikidata_title(self):
     """
     Wikidata fetched by title alone must resolve a wikibase id
     """
     page = wptools.page('Les Misérables').get_wikidata(False)
     self.assertIsNotNone(page.wikibase)
Example #60
0
 def test_random_wiki(self):
     """
     Get random title from random Wikimedia project

     Presumably page construction with only wiki= triggers the
     random fetch, since no get() is called — TODO confirm.
     """
     page = wptools.page(wiki=random.choice(WIKIS))
     self.assertTrue(page.data['pageid'] is not None)