def articles_by_category():
    """Returns list of published articles (titles) grouped by category.

    @returns: tuple of (categories, titles_by_category) where categories is a
        list of category names and titles_by_category maps category name to
        the list of page dicts in that category.
    """
    categories = []
    titles_by_category = {}
    cache_key = make_cache_key('wikiprox:encyclopedia:articles_by_category')
    cached = cache.get(cache_key)
    data = json.loads(cached) if cached else None
    if isinstance(data, dict):
        # New-style cache entry holds both structures.  (The old code cached
        # only `categories`, so cache hits returned an empty
        # titles_by_category; old-format entries -- a bare list -- fail the
        # isinstance check above and are simply rebuilt.)
        categories = data['categories']
        titles_by_category = data['titles_by_category']
    else:
        published = [page['title'] for page in published_pages()]
        cat_titles = [page['title'] for page in category_article_types()]
        for category in cat_titles:
            category = category.replace('Category:','')  # TODO fix this, this is bad
            titles = [
                page
                for page in category_members(
                    category, namespace_id=namespaces_reversed()['Default'])
                if page['title'] in published]
            if titles:
                categories.append(category)
                titles_by_category[category] = titles
        cache.set(
            cache_key,
            json.dumps({
                'categories': categories,
                'titles_by_category': titles_by_category,
            }),
            settings.CACHE_TIMEOUT)
    return categories,titles_by_category
def events():
    """Returns list of events and a status message.

    Event dicts come from the sources API; start_date/end_date fields are
    parsed into datetime objects before returning.
    """
    cache_key = make_cache_key('wikiprox:events:events')
    cached = cache.get(cache_key)
    if cached:
        objects = json.loads(cached)
    else:
        objects = []
        url = '%s/events/' % settings.SOURCES_API
        r = requests.get(
            url,
            params={'limit':1000},
            headers={'content-type':'application/json'},
            timeout=3)
        if r and r.status_code == 200:
            payload = json.loads(r.text)
            objects.extend(payload['objects'])
        # NOTE: an empty list is cached on failure as well
        cache.set(cache_key, json.dumps(objects), settings.CACHE_TIMEOUT)
    # convert all the dates
    for obj in objects:
        for field in ('start_date', 'end_date'):
            if obj.get(field, None):
                obj[field] = datetime.strptime(obj[field], '%Y-%m-%d')
    return objects
def what_links_here(title):
    """Returns titles of published pages that link to this one.

    @param title: Page title to find backlinks for.
    @returns: list of titles (empty on API failure)
    """
    # Initialize so a non-200 API response doesn't leave `titles` unbound
    # (previously raised NameError at cache.set/return on failure).
    titles = []
    cache_key = make_cache_key('wikiprox:encyclopedia:what_links_here:%s' % title)
    cached = cache.get(cache_key)
    if cached:
        titles = json.loads(cached)
    else:
        url = '%s?format=json&action=query&list=backlinks&bltitle=%s&bllimit=5000' % (settings.MEDIAWIKI_API, title)
        r = requests.get(url, headers={'content-type':'application/json'}, timeout=TIMEOUT)
        if r.status_code == 200:
            titles = _whatlinkshere(published_pages(), r.text)
        cache.set(cache_key, json.dumps(titles), settings.CACHE_TIMEOUT)
    return titles
def namespaces():
    """Returns dict of namespaces and their codes.

    @returns: dict (empty on API failure)
    """
    # Local renamed from `namespaces` to avoid shadowing the function name;
    # initialized so a non-200 response doesn't leave it unbound
    # (previously raised NameError at cache.set/return on failure).
    nspaces = {}
    cache_key = make_cache_key('wikiprox:encyclopedia:namespaces')
    cached = cache.get(cache_key)
    if cached:
        nspaces = json.loads(cached)
    else:
        url = '%s?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json' % (settings.MEDIAWIKI_API)
        r = requests.get(url, headers={'content-type':'application/json'}, timeout=TIMEOUT)
        if r.status_code == 200:
            nspaces = _namespaces(r.text)
        cache.set(cache_key, json.dumps(nspaces), settings.CACHE_TIMEOUT)
    return nspaces
def published_authors(cached_ok=True):
    """Returns a list of *published* authors (pages), with timestamp of latest revision.

    @param cached_ok: boolean Whether cached results are OK.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:published_authors')
    cached = cache.get(cache_key)
    if cached_ok and cached:
        return json.loads(cached)
    authors = _published_authors(
        published_pages(),
        category_authors()
    )
    cache.set(cache_key, json.dumps(authors), settings.CACHE_TIMEOUT)
    return authors
def articles_a_z():
    """Returns a list of published article titles arranged A-Z.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:articles_a_z')
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    published = category_members(
        'Published', namespace_id=namespaces_reversed()['Default'])
    titles = _articles_a_z(published, published_authors(), NON_ARTICLE_PAGES)
    cache.set(cache_key, json.dumps(titles), settings.CACHE_TIMEOUT)
    return titles
def page_categories(title, whitelist=None):
    """Returns list of article subcategories the page belongs to.

    @param title: Page title.
    @param whitelist: list of allowed categories; when empty/None,
        category_article_types() is used.
    @returns: list of categories (empty on API failure)
    """
    # `whitelist=None` replaces the mutable default `[]`; behavior for
    # callers is unchanged (both falsy, so the fallback still triggers).
    # `categories` initialized so a non-200 response doesn't leave it
    # unbound (previously raised NameError at cache.set/return).
    categories = []
    cache_key = make_cache_key('wikiprox:encyclopedia:page_categories:%s' % title)
    cached = cache.get(cache_key)
    if cached:
        categories = json.loads(cached)
    else:
        url = '%s?format=json&action=query&prop=categories&titles=%s' % (settings.MEDIAWIKI_API, title)
        r = requests.get(url, headers={'content-type':'application/json'}, timeout=TIMEOUT)
        if r.status_code == 200:
            if not whitelist:
                whitelist = category_article_types()
            categories = _page_categories(whitelist, r.text)
        cache.set(cache_key, json.dumps(categories), settings.CACHE_TIMEOUT)
    return categories
def published_sources():
    """Returns list of published Sources.

    Each source's 'modified' field is parsed into a datetime before
    returning, on both the cached and non-cached paths.  (Previously only
    the cached path converted, so first-call and cache-hit callers saw
    different types.)

    @returns: list of source dicts (empty on API failure)
    """
    sources = []
    cache_key = make_cache_key('wikiprox:sources:published_sources')
    cached = cache.get(cache_key)
    if cached:
        sources = json.loads(cached)
    else:
        url = '%s/primarysource/sitemap/' % settings.SOURCES_API
        # timeout added for consistency with other SOURCES_API requests
        # (a hung sources server previously blocked this call indefinitely)
        r = requests.get(url, headers={'content-type':'application/json'}, timeout=3)
        if r.status_code == 200:
            response = json.loads(r.text)
            sources = list(response['objects'])
        cache.set(cache_key, json.dumps(sources), settings.CACHE_TIMEOUT)
    # convert timestamps on both paths so callers always see datetimes
    for source in sources:
        if isinstance(source.get('modified'), basestring):
            source['modified'] = datetime.strptime(source['modified'], TS_FORMAT)
    return sources
def all_pages():
    """Returns a list of all pages, with timestamp of latest revision.

    @returns: list of page dicts; on a cache hit, 'timestamp' is parsed to
        datetime (cache-miss values come straight from _all_pages --
        presumably strings, since they are json.dumps-ed below; confirm).
    """
    # Initialized so a non-200 response doesn't leave `pages` unbound
    # (previously raised NameError at cache.set on API failure).
    pages = []
    cache_key = make_cache_key('wikiprox:encyclopedia:all_pages')
    cached = cache.get(cache_key)
    if cached:
        pages = json.loads(cached)
        for page in pages:
            page['timestamp'] = datetime.strptime(page['timestamp'], mediawiki.TS_FORMAT_ZONED)
    else:
        cookies = api_login()
        # all articles
        LIMIT = 5000
        # LIMIT is now actually interpolated into the URL (it was previously
        # assigned but unused, with 5000 hard-coded in the string).
        url = '%s?action=query&generator=allpages&prop=revisions&rvprop=timestamp&gaplimit=%s&format=json' % (settings.MEDIAWIKI_API, LIMIT)
        r = requests.get(url, headers={'content-type':'application/json'}, cookies=cookies, timeout=TIMEOUT)
        if r.status_code == 200:
            pages = _all_pages(r.text)
        api_logout()
        cache.set(cache_key, json.dumps(pages), settings.CACHE_TIMEOUT)
    return pages
def published_pages(cached_ok=True):
    """Returns a list of *published* articles (pages), with timestamp of latest revision.

    'timestamp' is returned as a datetime on both the cached and non-cached
    paths.  (Previously cache hits returned datetimes but cache misses
    returned strings, so callers saw inconsistent types.)

    @param cached_ok: boolean Whether cached results are OK.
    @returns: list of page dicts
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:published_pages')
    cached = cache.get(cache_key)
    if cached and cached_ok:
        pages = json.loads(cached)
    else:
        pages = _published_pages(
            all_pages(),
            category_members('Published', namespace_id=namespaces_reversed()['Default'])
        )
        # serialize timestamps so the list is JSON-cacheable
        for page in pages:
            if not isinstance(page['timestamp'], basestring):
                page['timestamp'] = datetime.strftime(page['timestamp'], mediawiki.TS_FORMAT_ZONED)
        cache.set(cache_key, json.dumps(pages), settings.CACHE_TIMEOUT)
    # parse timestamps on both paths so callers always get datetimes
    for page in pages:
        if isinstance(page['timestamp'], basestring):
            page['timestamp'] = datetime.strptime(page['timestamp'], mediawiki.TS_FORMAT_ZONED)
    return pages
def locations():
    """Returns list of locations and a status message.
    """
    cache_key = make_cache_key('wikiprox:locations:locations')
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    results = []
    url = '%s/locations/' % settings.SOURCES_API
    r = requests.get(
        url,
        params={'limit':'1000'},
        headers={'content-type':'application/json'},
        timeout=3)
    if (r.status_code == 200) and ('json' in r.headers['content-type']):
        payload = json.loads(r.text)
        results.extend(payload['objects'])
    cache.set(cache_key, json.dumps(results), settings.CACHE_TIMEOUT)
    return results
def _term_documents(term_id, size):
    """Get objects for specified term from DDR REST API.

    @param term_id: int
    @param size: int Maximum number of results to return.
    @returns: list of dicts
    @raises requests.exceptions.ConnectionError: on any non-200 response.
    """
    cache_key = make_cache_key('wikiprox:ddr:termdocs:%s:%s' % (term_id,size))
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    url = '%s/facet/topics/%s/objects/?limit=%s&%s=1' % (
        settings.DDR_API, term_id, size, settings.DDR_MEDIA_URL_LOCAL_MARKER)
    r = requests.get(
        url,
        headers={'content-type':'application/json'},
        timeout=3)
    if (r.status_code not in [200]):
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    objects = []
    if ('json' in r.headers['content-type']):
        data = json.loads(r.text)
        # response may be either a wrapped dict or a bare list
        if isinstance(data, dict) and data.get('objects'):
            objects = data['objects']
        elif isinstance(data, list):
            objects = data
        # add img_url_local
        for obj in objects:
            links = obj.get('links',{})
            if links.get('html'):
                obj['absolute_url'] = links['html']
            if links.get('thumb'):
                obj['img_url_local'] = links['thumb']
    cache.set(cache_key, json.dumps(objects), settings.CACHE_TIMEOUT)
    return objects
def category_members(category_name, namespace_id=None):
    """Returns titles of pages with specified Category: tag.

    NOTE: Rather than just returning a list of title strings, this returns
    a list of _dicts_ containing namespace id, title, and sortkey.
    This is so certain views (e.g. Contents A-Z can grab the first letter
    of the title (or sortkey) to use for grouping purposes.

    @param category_name: Category name, without the 'Category:' prefix.
    @param namespace_id: int (optional) Restrict results to one namespace.
    @returns: list of page dicts (empty on API failure)
    """
    # Initialized so a non-200 response doesn't leave `pages` unbound
    # (previously raised NameError at cache.set on API failure).
    pages = []
    cache_key = make_cache_key('wikiprox:encyclopedia:category_members:%s:%s' % (category_name, namespace_id))
    cached = cache.get(cache_key)
    if cached:
        pages = json.loads(cached)
    else:
        cookies = api_login()
        LIMIT = 5000
        url = '%s?format=json&action=query&list=categorymembers&cmsort=sortkey&cmprop=ids|sortkeyprefix|title&cmtitle=Category:%s&cmlimit=%s' % (settings.MEDIAWIKI_API, category_name, LIMIT)
        if namespace_id is not None:
            # list=categorymembers parameters use the 'cm' prefix; the 'gcm'
            # prefix is only recognized in the generator form, so the old
            # 'gcmnamespace' param was silently ignored and the namespace
            # filter never applied.
            url = '%s&cmnamespace=%s' % (url, namespace_id)
        r = requests.get(url, headers={'content-type':'application/json'}, cookies=cookies, timeout=TIMEOUT)
        if r.status_code == 200:
            pages = _category_members(r.text)
        api_logout()
        cache.set(cache_key, json.dumps(pages), settings.CACHE_TIMEOUT)
    return pages