Example #1
 def test_normalize_case(self):
     for uri, expected0, expected1 in case_normalization_tests:
         testname = uri
         uri = iri.split_uri_ref(uri)
         self.assertEqual(expected0, iri.unsplit_uri_ref(iri.normalize_case(uri)), testname)
         self.assertEqual(
             expected1, iri.unsplit_uri_ref(iri.normalize_case(uri, doHost=1)), testname + " (host too)"
         )
Example #2
 def test_normalize_case(self):
     for uri, expected0, expected1 in case_normalization_tests:
         testname = uri
         uri = iri.split_uri_ref(uri)
         self.assertEqual(expected0,
                          iri.unsplit_uri_ref(iri.normalize_case(uri)),
                          testname)
         self.assertEqual(
             expected1,
             iri.unsplit_uri_ref(iri.normalize_case(uri, doHost=1)),
             testname + ' (host too)')
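Both versions iterate over case_normalization_tests, which is not shown in either snippet. A minimal sketch of what one entry could look like, assuming the table pairs each input URI with its expected form after default normalization and after host normalization per RFC 3986 section 6.2.2.1 (the URIs and expected values below are illustrative, not taken from the real test data):

# Hypothetical entry: (input, expected by default, expected with doHost=1).
# Assumes normalize_case lowercases the scheme and uppercases percent-encoded
# octets by default, and additionally lowercases the host when doHost=1.
case_normalization_tests = [
    ('HTTP://Example.COM/%3apath',
     'http://Example.COM/%3Apath',    # scheme lowered, %3a -> %3A
     'http://example.com/%3Apath'),   # host lowered as well
]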
Example #3
File: moin.py Project: mredar/akara
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to both the original page and its REST-wrapped equivalent, given a link from another page

    original_base - the base URI of the actual Moin instance
    wrapped_base - the base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came; defaults to wrapped_base
    raw - whether the link is a full hierarchical path rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link)
    
    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL
    
    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b; note the leading '/' must be stripped to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    if raw and not is_absolute(link):
        (scheme, authority, path, query,
         fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    abs_link = absolutize(link, original_base.rstrip('/') + '/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/') + '/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/') + '/')
    return rest_uri, abs_link
Example #4
File: moin.py Project: dpla/akara
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to both the original page and its REST-wrapped equivalent, given a link from another page

    original_base - the base URI of the actual Moin instance
    wrapped_base - the base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came; defaults to wrapped_base
    raw - whether the link is a full hierarchical path rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link)
    
    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL
    
    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b; note the leading '/' must be stripped to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    if raw and not is_absolute(link):
        (scheme, authority, path, query, fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    abs_link = absolutize(link, original_base.rstrip('/')+'/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/')+'/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/')+'/')
    return rest_uri, abs_link
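Both forks of wiki_uri lean on the same IRI helpers: split_uri_ref, is_absolute, absolutize and relativize. A short sketch of the underlying steps, assuming those names come from Amara's amara.lib.iri module (the same library the test code in examples #1 and #2 exercises); the URIs mirror the doctest inputs:

# A sketch, assuming the helpers behave as in amara.lib.iri.
from amara.lib.iri import absolutize, relativize, is_absolute

original_base = 'http://example.com/mywiki/'
wrapped_base = 'http://localhost:8880/moin/w/'
link = '/spam'

assert not is_absolute(link)                            # a path-only link, no scheme
abs_link = absolutize(link.lstrip('/'), original_base)  # 'http://example.com/mywiki/spam'
rel = relativize(abs_link, original_base)               # 'spam'
rest_uri = absolutize(rel, wrapped_base)                # 'http://localhost:8880/moin/w/spam'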
Example #5
# Specifies the default max-age of Moin pages
CACHE_MAX_AGE = module_config().get("CACHE_MAX_AGE", None)

# Specifies a Wiki path (currently only one, FIXME) under which no caching will occur
NO_CACHE_PATHS = module_config().get("NO_CACHE_PATHS", None)

# Look at each Wiki URL and build an appropriate opener object for retrieving
# pages.   If the URL includes HTTP authentication information such as
# http://user:[email protected]/mywiki, the opener is built with
# basic authentication enabled.   For details, see:
# 
#     HTTP basic auth: http://www.voidspace.org.uk/python/articles/urllib2.shtml#id6
for k, v in TARGET_WIKIS.items():
    #The target wiki base URI must end in '/'
    v = v.rstrip('/') + '/'
    (scheme, authority, path, query, fragment) = split_uri_ref(v)
    auth, host, port = split_authority(authority)
    authority = host + ':' + port if port else host
    schemeless_url = authority + path
    if auth:
        TARGET_WIKIS[k] = unsplit_uri_ref((scheme, authority, path, query, fragment))
        auth = auth.split(':')
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        # Not setting the realm for now, so use None
        password_mgr.add_password(None, scheme+"://"+host+path, auth[0], auth[1])
        password_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        TARGET_WIKI_OPENERS[k] = urllib2.build_opener(
            password_handler,
            urllib2.HTTPCookieProcessor(),
            multipart_post_handler.MultipartPostHandler)
    else:
        # Assumed continuation (the snippet is cut off here): with no credentials in the
        # URL, build the opener with just cookie and multipart-POST support.
        TARGET_WIKI_OPENERS[k] = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(),
            multipart_post_handler.MultipartPostHandler)
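The comment block above explains that each target wiki gets its own opener, with basic auth enabled when the configured URL carries credentials. A minimal sketch of how one of those openers would then be used to fetch a page (the wiki key and page name are illustrative; the real request handlers live elsewhere in moin.py):

# Hypothetical use of an opener built in the loop above (urllib2 style).
import urllib2

wikiid = 'mywiki'                              # illustrative key into TARGET_WIKIS
page_url = TARGET_WIKIS[wikiid] + 'FrontPage'  # the base URI already ends in '/'
request = urllib2.Request(page_url, headers={'Accept': 'text/html'})
response = TARGET_WIKI_OPENERS[wikiid].open(request)
try:
    html = response.read()
finally:
    response.close()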
Example #6
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness.  ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record
    for the first item in the collection/query, and so on until all the items
    are returned or the limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    For a nice-sized collection to try:
    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)

            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '') 
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey+'.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
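The per-item block above derives the domain, collection name and record id from each item_viewer.php URL's query string. A stripped-down sketch of just that step, using the Python 2 standard library's urlparse in place of split_uri_ref so it runs on its own (the URL and id are illustrative):

# Minimal reconstruction of the id-extraction step.
from urlparse import urlparse, parse_qs

pageuri = ('http://digital.library.louisville.edu/cdm4/item_viewer.php'
           '?CISOROOT=/jthom&CISOPTR=42')
parts = urlparse(pageuri)
params = parse_qs(parts.query)

entry = {
    'domain': parts.netloc,
    'cdm-coll': params['CISOROOT'][0].strip('/').split('/')[0],  # 'jthom'
    'id': params['CISOPTR'][0],                                  # '42'
}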
Example #7
# Specifies the default max-age of Moin pages
CACHE_MAX_AGE = module_config().get("CACHE_MAX_AGE", None)

# Specifies a Wiki path (currently only one, FIXME) under which no caching will occur
NO_CACHE_PATHS = module_config().get("NO_CACHE_PATHS", None)

# Look at each Wiki URL and build an appropriate opener object for retrieving
# pages.   If the URL includes HTTP authentication information such as
# http://user:[email protected]/mywiki, the opener is built with
# basic authentication enabled.   For details, see:
#
#     HTTP basic auth: http://www.voidspace.org.uk/python/articles/urllib2.shtml#id6
for k, v in TARGET_WIKIS.items():
    #The target wiki base URI must end in '/'
    v = v.rstrip('/') + '/'
    (scheme, authority, path, query, fragment) = split_uri_ref(v)
    auth, host, port = split_authority(authority)
    authority = host + ':' + port if port else host
    schemeless_url = authority + path
    if auth:
        TARGET_WIKIS[k] = unsplit_uri_ref(
            (scheme, authority, path, query, fragment))
        auth = auth.split(':')
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        # Not setting the realm for now, so use None
        password_mgr.add_password(None, scheme + "://" + host + path, auth[0],
                                  auth[1])
        password_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        TARGET_WIKI_OPENERS[k] = urllib2.build_opener(
            password_handler, urllib2.HTTPCookieProcessor(),
            multipart_post_handler.MultipartPostHandler)