def test_normalize_case(self):
    for uri, expected0, expected1 in case_normalization_tests:
        testname = uri
        uri = iri.split_uri_ref(uri)
        self.assertEqual(expected0,
                         iri.unsplit_uri_ref(iri.normalize_case(uri)),
                         testname)
        self.assertEqual(expected1,
                         iri.unsplit_uri_ref(iri.normalize_case(uri, doHost=1)),
                         testname + " (host too)")
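# Illustrative only: an assumed extra entry for case_normalization_tests, matching the
# (uri, expected_without_host, expected_with_host) shape the test unpacks above. Per
# RFC 3986 case normalization, the scheme is always lowercased, while the host is only
# lowercased when doHost is set. The variable name below is hypothetical.
example_case_normalization_entry = (
    'HTTP://Example.ORG/Spam',    # input URI reference
    'http://Example.ORG/Spam',    # expected from normalize_case(uri)
    'http://example.org/Spam',    # expected from normalize_case(uri, doHost=1)
)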
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to the original and REST-wrapper for a page,
    given a link from another page

    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came,
                  defaults to same as wrapped_base
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link)

    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL

    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    if raw and not is_absolute(link):
        (scheme, authority, path, query, fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    abs_link = absolutize(link, original_base.rstrip('/') + '/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/') + '/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/') + '/')
    return rest_uri, abs_link
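# A minimal sketch of the URI arithmetic wiki_uri leans on. The result values are what
# the amara-style absolutize/relativize used above are expected to produce; treat them
# as assumptions illustrating the comments in the function body, not tested output.
#
#   absolutize('a/b', 'http://myhost:8080/mywiki/')
#       -> 'http://myhost:8080/mywiki/a/b'
#   relativize('http://myhost:8080/mywiki/a/b', 'http://myhost:8080/mywiki/')
#       -> 'a/b'
#   relativize('http://google.com/spam', 'http://myhost:8080/mywiki/')
#       -> falsy, which is why wiki_uri returns (None, None) for off-wiki links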
# Specifies the default max-age of Moin pages
CACHE_MAX_AGE = module_config().get("CACHE_MAX_AGE", None)

# Specifies a Wiki path (currently only one, FIXME) under which no caching will occur
NO_CACHE_PATHS = module_config().get("NO_CACHE_PATHS", None)

# Look at each Wiki URL and build an appropriate opener object for retrieving
# pages. If the URL includes HTTP authentication information such as
# http://user:[email protected]/mywiki, the opener is built with
# basic authentication enabled. For details, see:
#
#   : HTTP basic auth: http://www.voidspace.org.uk/python/articles/urllib2.shtml#id6

for k, v in TARGET_WIKIS.items():
    #The target wiki base URI must end in '/'
    v = v.rstrip('/') + '/'
    (scheme, authority, path, query, fragment) = split_uri_ref(v)
    auth, host, port = split_authority(authority)
    authority = host + ':' + port if port else host
    schemeless_url = authority + path
    if auth:
        TARGET_WIKIS[k] = unsplit_uri_ref((scheme, authority, path, query, fragment))
        auth = auth.split(':')
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        # Not setting the realm for now, so use None
        password_mgr.add_password(None, scheme + "://" + host + path, auth[0], auth[1])
        password_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        TARGET_WIKI_OPENERS[k] = urllib2.build_opener(
            password_handler,
            urllib2.HTTPCookieProcessor(),
            multipart_post_handler.MultipartPostHandler)
    else:
        #(assumed completion) No auth info in the URL: build a plain opener with
        #just cookie handling and multipart POST support
        TARGET_WIKI_OPENERS[k] = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(),
            multipart_post_handler.MultipartPostHandler)
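# Minimal usage sketch (assumption: 'mywiki' is one of the keys configured in
# TARGET_WIKIS / TARGET_WIKI_OPENERS; the opener built above carries any basic-auth
# credentials plus cookie handling). Moin's action=raw returns the raw wiki text:
#
#   opener = TARGET_WIKI_OPENERS['mywiki']
#   req = urllib2.Request(TARGET_WIKIS['mywiki'] + 'FrontPage?action=raw')
#   content = opener.open(req).read()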
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness. ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ", ...}

    The first yielded value is global metadata; the second is the record for the first
    item in the collection/query, and so on until all the items are returned, or the
    limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    For a nice-sized collection to try:

    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')
    '''
    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)
            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']
        resp, page, cachekey, cached = cdmsite.item_page(pageuri)
        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '')
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey+'.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass
        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
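# Minimal consumption sketch (assumes network access to the CONTENTdm site; the printed
# shapes are illustrative). The first yielded dict is collection-level metadata; every
# later one is an item record keyed by the Exhibit-safe field names built above:
#
#   from itertools import islice
#   records = read_contentdm('http://digital.library.louisville.edu/cdm4/',
#                            collection='/jthom', limit=5)
#   header = records.next()            # e.g. {'basequeryurl': '...'}
#   for item in islice(records, 3):
#       print item['label'], item['link']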