def _put_page(environ, start_response):
    '''
    Replace the content of a wiki page with the text of the request body (HTTP PUT).
    '''
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    ctype = environ.get('CONTENT_TYPE', 'application/unknown')

    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
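# copy_headers_to_dict is defined elsewhere in this module.  For orientation, here is
# a minimal sketch of what such a helper plausibly does -- mapping WSGI 'HTTP_*'
# environ keys back to ordinary header names while honoring the exclude list.  The
# function name and details below are assumptions for illustration, not the actual
# implementation used above.
def copy_headers_to_dict_sketch(environ, exclude=None):
    exclude = exclude or []
    headers = {}
    for key, value in environ.items():
        if key.startswith('HTTP_') and key not in exclude:
            # e.g. 'HTTP_ACCEPT_ENCODING' -> 'Accept-Encoding'
            name = '-'.join(part.capitalize() for part in key[len('HTTP_'):].split('_'))
            headers[name] = value
    return headers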
def fill_attachment_form(page, attachment, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + '?action=AttachFile', None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to post to a page and the
        # user is not authenticated, Moin responds with either a 403 or a 404,
        # depending on whether the page being edited exists.  For a non-existent page
        # it sends back a misleading 404.  In both cases we raise
        # MoinMustAuthenticateError to signal the error wrapper to issue a 401 back
        # to the client.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def fill_page_edit_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=edit&editor=text", None, headers)
    #logger.debug('GRIPPO ' + repr((headers)))
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to edit a page and the user
        # is not authenticated, Moin responds with either a 403 or a 404, depending on
        # whether the page being edited exists.  For a non-existent page it sends back
        # a misleading 404.  In both cases we raise MoinMustAuthenticateError to signal
        # the error wrapper to issue a 401 back to the client.
        # Note: Moin for some reason seems to give 403 errors on some URLs in response
        # to curl's User-Agent.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support the
    deletion of attachments.
    '''
    # The Moin form asks that this be in multipart/form-data format, but the multipart
    # handler falls back to URL encoding unless you pass it a file.  Luckily, the
    # equivalent URL-encoded request works... for now.
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    form_vars = fill_page_delete_form(page, wiki_id, base, opener, req_headers)
    url = absolutize(page, base)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 404:
            # Moin returns 404 on a successful DeletePage POST; recast it as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def index_page(self, url, logtag="Requesting index at URL: {0}"):
    if self._proxy:
        url = "{0}?url={1}".format(self._proxy, quote(url))
    self._logger.debug(logtag.format(url))
    start_t = time.time()
    resp, content = self._h.request(url)
    retrieved_t = time.time()
    self._logger.debug("Retrieved in {0}s".format(retrieved_t - start_t))
    doc = htmlparse(content)
    parsed_t = time.time()
    self._logger.debug("Parsed in {0}s".format(parsed_t - retrieved_t))
    return resp, doc
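# For reference, the proxy rewrite in index_page/item_page works like this (the proxy
# URL below is a placeholder, not taken from this code): with
# self._proxy = "http://localhost:8880/cache", a request for http://example.org/page
# becomes
#   http://localhost:8880/cache?url=http%3A//example.org/page
# because urllib's quote() escapes the colon but leaves slashes intact by default.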
def __init__(self, source, html=False):
    self.records = []
    if html:
        from amara.bindery.html import parse as htmlparse
        self.doc = htmlparse(source)
        # html5lib generates adjacent text nodes
        self.doc.xml_normalize()
    else:
        self.doc = amara.parse(source)
    self.new_record()
    self.common_ancestor = None
    self.record_pattern = None
    return
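# Hypothetical usage of the constructor above (the enclosing class is not shown here,
# so "RecordScraper" is a placeholder name): pass raw markup plus a flag selecting the
# HTML (html5lib-backed) parser or amara's XML parser.
#
#   scraper = RecordScraper('<html><body><p>hi</p></body></html>', html=True)
#   xml_scraper = RecordScraper('<records><r>hi</r></records>')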
def fill_page_delete_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=DeletePage", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def scrape_page_history(page, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=info", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to post to a page and the
        # user is not authenticated, Moin responds with either a 403 or a 404,
        # depending on whether the page exists.  For a non-existent page it sends back
        # a misleading 404.  In both cases we raise MoinMustAuthenticateError to signal
        # the error wrapper to issue a 401 back to the client.
        # FIXME: wiki_id is not defined in this function's scope; it would need to be
        # passed in (as the other form-scraping helpers do) for this error path to work.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def upstream_handler():
    # Sigh.  Sometimes you have to break some tag-soup eggs to make a RESTful omelette
    with closing(opener.open(request)) as resp:
        rbody = resp.read()
    doc = htmlparse(rbody)
    raise_embedded_error(doc)
    attachment_nodes = doc.xml_select(
        u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
    targets = []
    for node in attachment_nodes:
        target = [ param.split('=', 1)[1] for param in node.href.split(u'&')
                   if param.startswith('target=') ][0]
        targets.append(target)
    output = structencoder(indent=u"yes")
    output.feed(
        ROOT(
            E((u'attachments'),
              (E(u'attachment', {u'href': unicode(t)}) for t in targets)
            )
        ))
    return output.read(), ctype
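# For reference, the structencoder feed above emits a document shaped roughly like the
# following (the attachment names are placeholders):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <attachments>
#     <attachment href="example.png"/>
#     <attachment href="notes.txt"/>
#   </attachments>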
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id, base, opener, original_page, wrapped_wiki_base = "
                 + repr((wiki_id, base, opener, original_page, wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)

    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
    #print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    # Unfortunately, because urllib2's data dicts don't give an option for limiting
    # read length, we must read the body into a temp file and wrap it
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener, req_headers)
    form_vars["file"] = open(temp_fpath, "rb")

    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())
    except urllib2.URLError, e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
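# Example of the URI shape post_page expects, based on the partition call above (the
# page and file names are placeholders): once the leading '/' has been stripped from
# PATH_INFO, a request for
#   /SomePage;attachment=report.pdf
# gives page.partition(';attachment=') == ('SomePage', ';attachment=', 'report.pdf').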
def item_page(self, url, logtag="Requesting item at URL: {0}"):
    if self._proxy:
        url = "{0}?url={1}".format(self._proxy, quote(url))
    self._logger.debug(logtag.format(url))
    start_t = time.time()
    resp, content = self._h.request(url)
    retrieved_t = time.time()
    self._logger.debug("Retrieved in {0}s".format(retrieved_t - start_t))
    cachekey = hashlib.md5(content).hexdigest()
    self._logger.debug('MD5 Hash of HTTP body: {0}'.format(cachekey))
    if self._cachedir:
        try:
            json_stream = open(os.path.join(self._cachedir, cachekey + '.extract.js'))
            cached = json.load(json_stream)
            self._logger.debug('Loaded from cache: {0}'.format(cachekey))
            doc = None
        except (IOError, ValueError):
            doc = htmlparse(content)
            cached = None
    else:
        # No cache directory configured, so there is nothing to look up: always parse.
        # (Without this branch, doc and cached would be unbound at the return below.)
        doc = htmlparse(content)
        cached = None
    parsed_t = time.time()
    self._logger.debug("Parsed in {0}s".format(parsed_t - retrieved_t))
    return resp, doc, cachekey, cached
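# index_page and item_page are methods of a small site-access helper whose class body
# is not shown here.  A minimal constructor sketch of the state the two methods rely
# on (self._h, self._proxy, self._logger, self._cachedir), assuming httplib2 as the
# HTTP client -- the class name and defaults are assumptions, not the original code:
import logging
import httplib2

class CdmSiteSketch(object):
    def __init__(self, proxy=None, cachedir=None, logger=None):
        self._h = httplib2.Http()   # httplib2's request() returns (response, content)
        self._proxy = proxy         # optional caching/rewriting proxy URL
        self._cachedir = cachedir   # directory holding <md5>.extract.js item caches
        self._logger = logger or logging.getLogger(__name__)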
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records.
    First generates header info.

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness. ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record for the first
    item in the collection/query, and so on until all the items are returned, or the
    limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for
    the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    For a nice-sized collection to try:

    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')
    '''
    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]')
                     if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]')
                         if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)
            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'),
                           page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '')
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey + '.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
def execute(top=None):
    '''
    Sample request:
    curl -F "pattern=wiki/path" -F "wiki=http://localhost:8880/moin/foo/" "http://*****:*****@class="navigation"]//@href'):
        link = navchild.xml_value
        #print >> sys.stderr, 'LINK:', link
        #uri = split_fragment(item.resource)[0]
        #relative = uri[wikibase_len:]
        #print >> sys.stderr, uri, relative
        #if rewrite:
        #    uri = uri.replace(rewrite, wikibase)
        rest_uri = wrapped_uri(original_wiki_base, link)
        #print >> sys.stderr, 'rest uri:', rest_uri
        items.append(freemix(rest_uri, opener).render())
    return json.dumps({'items': items}, indent=4)
def test(self):
    """ htmlparse(QUERY) should yield a parsed document node that has content """
    doc = htmlparse(QUERY)
    self.assertEqual(doc.xml_type, 'document')
    self.assertEqual(doc.hasContent(), True)