Exemple #1
0
def articles_by_category():
    """Returns list of published articles grouped by category.

    The category names and the per-category members are cached together,
    so a cache hit returns the same data as a fresh lookup.

    @returns: (categories, titles_by_category) categories is a list of
        category names with the "Category:" prefix removed, limited to
        categories that have at least one published member.
        titles_by_category maps each of those names to its list of member
        page dicts as returned by category_members().
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:articles_by_category')
    cached = cache.get(cache_key)
    if cached:
        data = json.loads(cached)
        # Earlier cache entries held only the list of category names, which
        # left titles_by_category empty on every cache hit.  Anything that
        # is not in the combined format is treated as a miss and rebuilt.
        if isinstance(data, dict) \
           and ('categories' in data) and ('titles_by_category' in data):
            return data['categories'], data['titles_by_category']
    categories = []
    titles_by_category = {}
    # A set, because membership is tested once per category member.
    published = set(page['title'] for page in published_pages())
    # Loop-invariant, so look it up once.
    default_namespace = namespaces_reversed()['Default']
    # TODO fix this, this is bad (one category_members() lookup per category)
    for article_type in category_article_types():
        category = article_type['title'].replace('Category:','')
        members = [
            page
            for page in category_members(category, namespace_id=default_namespace)
            if page['title'] in published
        ]
        if members:
            categories.append(category)
            titles_by_category[category] = members
    cache.set(
        cache_key,
        json.dumps({
            'categories': categories,
            'titles_by_category': titles_by_category,
        }),
        settings.CACHE_TIMEOUT
    )
    return categories,titles_by_category
Exemple #2
0
def events():
    """Returns list of events.

    The list is read from the cache when present.  Otherwise it is requested
    from the events endpoint of settings.SOURCES_API and whatever was
    collected (an empty list if the request was not a 200) is cached.
    Non-empty 'start_date' and 'end_date' values are parsed into datetime
    objects ('%Y-%m-%d') before the list is returned.

    @returns: list of dicts
    """
    cache_key = make_cache_key('wikiprox:events:events')
    cached = cache.get(cache_key)
    if cached:
        objects = json.loads(cached)
    else:
        objects = []
        r = requests.get(
            '%s/events/' % settings.SOURCES_API,
            params={'limit':1000},
            headers={'content-type':'application/json'},
            timeout=3)
        if r and r.status_code == 200:
            objects.extend(json.loads(r.text)['objects'])
        # cached before the dates are parsed, while everything is still JSON-friendly
        cache.set(cache_key, json.dumps(objects), settings.CACHE_TIMEOUT)
    # convert all the dates
    for obj in objects:
        for fieldname in ('start_date', 'end_date'):
            if obj.get(fieldname):
                obj[fieldname] = datetime.strptime(obj[fieldname], '%Y-%m-%d')
    return objects
Exemple #3
0
def what_links_here(title):
    """Returns titles of published pages that link to this one.

    Results are cached per title.  Nothing is cached when the lookup fails.

    @param title: Page title, interpolated into the query URL as-is.
    @returns: Whatever _whatlinkshere() builds from the published pages and
        the backlinks response.
    @raises requests.exceptions.ConnectionError: if the API response is not
        a 200.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:what_links_here:%s' % title)
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    # NOTE(review): title is not URL-escaped here; confirm callers pass a URL-safe title.
    url = '%s?format=json&action=query&list=backlinks&bltitle=%s&bllimit=5000' % (settings.MEDIAWIKI_API, title)
    r = requests.get(url, headers={'content-type':'application/json'}, timeout=TIMEOUT)
    if r.status_code != 200:
        # Previously a non-200 response fell through to cache.set() with
        # `titles` never assigned (UnboundLocalError).  Fail with an explicit
        # error instead, the same way _term_documents() does.
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    titles = _whatlinkshere(published_pages(), r.text)
    cache.set(cache_key, json.dumps(titles), settings.CACHE_TIMEOUT)
    return titles
Exemple #4
0
def namespaces():
    """Returns dict of namespaces and their codes.

    The result is cached.  Nothing is cached when the lookup fails.

    @returns: Whatever _namespaces() builds from the siteinfo response.
    @raises requests.exceptions.ConnectionError: if the API response is not
        a 200.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:namespaces')
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    url = '%s?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json' % (settings.MEDIAWIKI_API)
    r = requests.get(url, headers={'content-type':'application/json'}, timeout=TIMEOUT)
    if r.status_code != 200:
        # Previously a non-200 response fell through to cache.set() with the
        # result never assigned (UnboundLocalError).  Fail with an explicit
        # error instead, the same way _term_documents() does.
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    # Local name differs from the function name so the function is not shadowed.
    found = _namespaces(r.text)
    cache.set(cache_key, json.dumps(found), settings.CACHE_TIMEOUT)
    return found
Exemple #5
0
def published_authors(cached_ok=True):
    """Returns a list of *published* authors (pages), with timestamp of latest revision.

    The cached list is used when one exists and cached_ok is set; otherwise
    the list is rebuilt from published_pages() and category_authors() and
    written back to the cache.

    @param cached_ok: boolean Whether cached results are OK.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:published_authors')
    cached = cache.get(cache_key)
    if cached_ok and cached:
        return json.loads(cached)
    pages = published_pages()
    author_pages = category_authors()
    authors = _published_authors(pages, author_pages)
    cache.set(cache_key, json.dumps(authors), settings.CACHE_TIMEOUT)
    return authors
Exemple #6
0
def articles_a_z():
    """Returns a list of published article titles arranged A-Z.

    Served from the cache when available; otherwise built by _articles_a_z()
    from the members of the 'Published' category, the published authors and
    NON_ARTICLE_PAGES, then cached.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:articles_a_z')
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    default_namespace = namespaces_reversed()['Default']
    published = category_members('Published', namespace_id=default_namespace)
    authors = published_authors()
    titles = _articles_a_z(published, authors, NON_ARTICLE_PAGES)
    cache.set(cache_key, json.dumps(titles), settings.CACHE_TIMEOUT)
    return titles
Exemple #7
0
def page_categories(title, whitelist=None):
    """Returns list of article subcategories the page belongs to.

    Results are cached per title.  Nothing is cached when the lookup fails.

    @param title: Page title, interpolated into the query URL as-is.
    @param whitelist: Categories passed to _page_categories(); when empty
        or omitted, category_article_types() is used.
    @returns: Whatever _page_categories() builds from the whitelist and the
        categories response.
    @raises requests.exceptions.ConnectionError: if the API response is not
        a 200.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:page_categories:%s' % title)
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    # NOTE(review): the cache key does not include the whitelist, so calls
    # for the same title with different whitelists share one cache entry.
    url = '%s?format=json&action=query&prop=categories&titles=%s' % (settings.MEDIAWIKI_API, title)
    r = requests.get(url, headers={'content-type':'application/json'}, timeout=TIMEOUT)
    if r.status_code != 200:
        # Previously a non-200 response fell through to cache.set() with
        # `categories` never assigned (UnboundLocalError).  Fail with an
        # explicit error instead, the same way _term_documents() does.
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    if not whitelist:
        whitelist = category_article_types()
    categories = _page_categories(whitelist, r.text)
    cache.set(cache_key, json.dumps(categories), settings.CACHE_TIMEOUT)
    return categories
Exemple #8
0
def published_sources():
    """Returns list of published Sources.

    The list is read from the cache when present.  Otherwise it is requested
    from the primarysource sitemap endpoint of settings.SOURCES_API and
    whatever was collected (an empty list if the request was not a 200) is
    cached.  Each source's 'modified' value is parsed with TS_FORMAT into a
    datetime on both paths.

    @returns: list of dicts
    """
    sources = []
    cache_key = make_cache_key('wikiprox:sources:published_sources')
    cached = cache.get(cache_key)
    if cached:
        sources = json.loads(cached)
    else:
        url = '%s/primarysource/sitemap/' % settings.SOURCES_API
        # timeout added so a stalled API cannot block the caller forever;
        # same value as the other SOURCES_API requests (events, locations).
        r = requests.get(
            url,
            headers={'content-type':'application/json'},
            timeout=3)
        if r.status_code == 200:
            response = json.loads(r.text)
            sources = list(response['objects'])
        # Cache before parsing the timestamps: datetimes are not
        # JSON-serializable.
        cache.set(cache_key, json.dumps(sources), settings.CACHE_TIMEOUT)
    # Parse on both paths.  Previously only a cache hit converted 'modified',
    # so the first (uncached) call returned raw strings.
    for source in sources:
        source['modified'] = datetime.strptime(source['modified'], TS_FORMAT)
    return sources
Exemple #9
0
def all_pages():
    """Returns a list of all pages, with timestamp of latest revision.

    On a cache hit each page's 'timestamp' is parsed with
    mediawiki.TS_FORMAT_ZONED into a datetime.  On a miss the list is
    fetched inside an API login/logout pair and cached; nothing is cached
    when the lookup fails.

    @returns: list of page dicts
    @raises requests.exceptions.ConnectionError: if the API response is not
        a 200.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:all_pages')
    cached = cache.get(cache_key)
    if cached:
        pages = json.loads(cached)
        for page in pages:
            page['timestamp'] = datetime.strptime(page['timestamp'], mediawiki.TS_FORMAT_ZONED)
        return pages
    cookies = api_login()
    try:
        # all articles
        url = '%s?action=query&generator=allpages&prop=revisions&rvprop=timestamp&gaplimit=5000&format=json' % (settings.MEDIAWIKI_API)
        r = requests.get(url, headers={'content-type':'application/json'}, cookies=cookies, timeout=TIMEOUT)
    finally:
        # Log out even when the request raises (e.g. on timeout).
        api_logout()
    if r.status_code != 200:
        # Previously a non-200 response fell through to cache.set() with
        # `pages` never assigned (UnboundLocalError).  Fail with an explicit
        # error instead, the same way _term_documents() does.
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    # NOTE(review): freshly fetched pages carry whatever _all_pages() puts in
    # 'timestamp'; it is not parsed here as it is on a cache hit.
    pages = _all_pages(r.text)
    cache.set(cache_key, json.dumps(pages), settings.CACHE_TIMEOUT)
    return pages
Exemple #10
0
def published_pages(cached_ok=True):
    """Returns a list of *published* articles (pages), with timestamp of latest revision.

    The cached list is used when one exists and cached_ok is set; otherwise
    the list is rebuilt by _published_pages() from all_pages() and the
    members of the 'Published' category, then cached.  Each page's
    'timestamp' is returned as a datetime on both paths.

    @param cached_ok: boolean Whether cached results are OK.
    @returns: list of page dicts
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:published_pages')
    cached = cache.get(cache_key)
    if cached and cached_ok:
        pages = json.loads(cached)
    else:
        pages = _published_pages(
            all_pages(),
            category_members('Published', namespace_id=namespaces_reversed()['Default'])
        )
        # Datetimes are not JSON-serializable, so timestamps go into the
        # cache as strings.
        for page in pages:
            if not isinstance(page['timestamp'], basestring):
                page['timestamp'] = datetime.strftime(page['timestamp'], mediawiki.TS_FORMAT_ZONED)
        cache.set(cache_key, json.dumps(pages), settings.CACHE_TIMEOUT)
    # Parse on both paths.  Previously a rebuild returned the string
    # timestamps while a cache hit returned datetimes.
    for page in pages:
        page['timestamp'] = datetime.strptime(page['timestamp'], mediawiki.TS_FORMAT_ZONED)
    return pages
Exemple #11
0
def locations():
    """Returns list of locations.

    The list is read from the cache when present.  Otherwise it is requested
    from the locations endpoint of settings.SOURCES_API; the response's
    'objects' are used only when the status is 200 and the content-type
    mentions json.  Whatever was collected (possibly an empty list) is
    cached.

    @returns: list of dicts
    """
    cache_key = make_cache_key('wikiprox:locations:locations')
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    found = []
    r = requests.get(
        '%s/locations/' % settings.SOURCES_API,
        params={'limit':'1000'},
        headers={'content-type':'application/json'},
        timeout=3)
    if (r.status_code == 200) and ('json' in r.headers['content-type']):
        found.extend(json.loads(r.text)['objects'])
    cache.set(cache_key, json.dumps(found), settings.CACHE_TIMEOUT)
    return found
Exemple #12
0
def _term_documents(term_id, size):
    """Get objects for specified term from DDR REST API.
    
    Results are cached per (term_id, size).  Objects with a links.html
    value get it copied to 'absolute_url'; objects with a links.thumb value
    get it copied to 'img_url_local'.
    
    @param term_id: int
    @param size: int Maximum number of results to return.
    @returns: list of dicts
    @raises requests.exceptions.ConnectionError: if the API response is not
        a 200.
    """
    cache_key = make_cache_key('wikiprox:ddr:termdocs:%s:%s' % (term_id,size))
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    url = '{api}/facet/topics/{term_id}/objects/?limit={limit}&{local}=1'.format(
        api=settings.DDR_API,
        term_id=term_id,
        limit=size,
        local=settings.DDR_MEDIA_URL_LOCAL_MARKER
    )
    r = requests.get(
        url,
        headers={'content-type':'application/json'},
        timeout=3)
    if r.status_code != 200:
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    objects = []
    if 'json' in r.headers['content-type']:
        data = json.loads(r.text)
        # the payload is either a bare list or a dict wrapping 'objects'
        if isinstance(data, list):
            objects = data
        elif isinstance(data, dict) and data.get('objects'):
            objects = data['objects']
    # add absolute_url and img_url_local
    for o in objects:
        links = o.get('links',{})
        if links.get('html'):
            o['absolute_url'] = links['html']
        if links.get('thumb'):
            o['img_url_local'] = links['thumb']
    cache.set(cache_key, json.dumps(objects), settings.CACHE_TIMEOUT)
    return objects
Exemple #13
0
def category_members(category_name, namespace_id=None):
    """Returns titles of pages with specified Category: tag.
    
    NOTE: Rather than just returning a list of title strings, this returns
    a list of _dicts_ containing namespace id, title, and sortkey.
    This is so certain views (e.g. Contents A-Z) can grab the first letter
    of the title (or sortkey) to use for grouping purposes.
    
    Results are cached per (category_name, namespace_id).  The lookup runs
    inside an API login/logout pair; nothing is cached when it fails.
    
    @param category_name: Category name without the "Category:" prefix;
        interpolated into the query URL as-is.
    @param namespace_id: Optional namespace to restrict the members to.
    @returns: Whatever _category_members() builds from the response.
    @raises requests.exceptions.ConnectionError: if the API response is not
        a 200.
    """
    cache_key = make_cache_key('wikiprox:encyclopedia:category_members:%s:%s' % (category_name, namespace_id))
    cached = cache.get(cache_key)
    if cached:
        return json.loads(cached)
    url = '%s?format=json&action=query&list=categorymembers&cmsort=sortkey&cmprop=ids|sortkeyprefix|title&cmtitle=Category:%s&cmlimit=5000' % (settings.MEDIAWIKI_API, category_name)
    if namespace_id is not None:
        # This is a list=categorymembers query, so its parameters take the
        # "cm" prefix.  The previous "gcmnamespace" is the generator form
        # and did not match the rest of this query.
        url = '%s&cmnamespace=%s' % (url, namespace_id)
    cookies = api_login()
    try:
        r = requests.get(url, headers={'content-type':'application/json'}, cookies=cookies, timeout=TIMEOUT)
    finally:
        # Log out even when the request raises (e.g. on timeout).
        api_logout()
    if r.status_code != 200:
        # Previously a non-200 response fell through to cache.set() with
        # `pages` never assigned (UnboundLocalError).  Fail with an explicit
        # error instead, the same way _term_documents() does.
        raise requests.exceptions.ConnectionError(
            'Error %s' % (r.status_code))
    pages = _category_members(r.text)
    cache.set(cache_key, json.dumps(pages), settings.CACHE_TIMEOUT)
    return pages