def wrapped_uri(original_wiki_base, link):
    abs_link = absolutize(link, original_wiki_base)
    #print >> sys.stderr, 'abs_link: ', abs_link
    rel_link = relativize(abs_link, original_wiki_base)
    #print >> sys.stderr, 'rel_link: ', rel_link
    rest_uri = absolutize(rel_link, REST_WIKI_BASE)
    #print >> sys.stderr, 'rest_uri: ', rest_uri
    return rest_uri
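#A minimal round-trip sketch of the rewrapping above, assuming the
#absolutize/relativize pair from amara.lib.iri; REST_WIKI_BASE and the
#other URIs here are invented for illustration:
from amara.lib.iri import absolutize, relativize

REST_WIKI_BASE = 'http://localhost:8880/moin/w/'  #hypothetical wrapper base
original_wiki_base = 'http://example.com/mywiki/'
abs_link = absolutize('FrontPage', original_wiki_base)
#abs_link -> 'http://example.com/mywiki/FrontPage'
rel_link = relativize(abs_link, original_wiki_base)
#rel_link -> 'FrontPage'
wrapped = absolutize(rel_link, REST_WIKI_BASE)
#wrapped  -> 'http://localhost:8880/moin/w/FrontPage'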
def load(self, webget):
    """
    >>> g = XMLNSGlean(u'http://www.w3.org/2003/g/po-doc.xml', Graph())
    >>> g.load(WebMemo())
    >>> g.nsURI
    u'http://www.w3.org/2003/g/po-ex'
    >>> len(g.graph)
    15
    """
    super(XMLNSGlean, self).load(webget)
    self.nsURI = None
    if self.doc:
        self.nsURI = self.doc.xml_select(u'/*')[0].xml_namespace
        #@@DWC: hmm... why is NSDispatchTermination not recursive?
        if not self.nsURI or self.nsURI in NSDispatchTermination \
           or self.nsURI == self.url:
            return
        #Glean the GRDDL result from the namespace document
        try:
            nsresult = Graph()
            GRDDLAgent(absolutize(self.nsURI, self.baseURI), nsresult, webget,
                       DEBUG=self.DEBUG)
            if self.DEBUG:
                print >>sys.stderr, "ns doc graph size", len(nsresult)
        except IOError:
            pass  #Don't bother if we can't get a namespace document
        else:
            continueRecursion = True
            #Set up a set of processed transforms to avoid infinite
            #namespace snooping cycles
            processedNSXForms = set()
            #Recursively find 'new' namespace transformations
            while continueRecursion:
                todoXForms = set()
                pat = (URIRef(absolutize(self.nsURI, self.baseURI)),
                       GRDDL_VOCAB.namespaceTransformation, None)
                for s, p, xform in nsresult.triples(pat):
                    if self.DEBUG:
                        print >>sys.stderr, "found txform in NS doc:", xform
                    if xform not in processedNSXForms:
                        todoXForms.add(xform)
                #Continue only if we have xforms to apply
                continueRecursion = bool(todoXForms)
                #Apply the new namespace transforms on the GRDDL
                #source, merging the GRDDL results as we go
                for newXForm in todoXForms:
                    self.transform(newXForm, webget)
                    processedNSXForms.add(newXForm)
def _put_page(environ, start_response):
    '''
    Overwrites a Wiki page with the request body, via Moin's edit form
    '''
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()
    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support
    the deletion of attachments.
    '''
    #The Moin form asks that this be in multipart-form format, but the
    #multipart handler falls back to url-encoding unless you pass it a file.
    #Luckily, the equivalent url-encoded request works... for now.
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    form_vars = fill_page_delete_form(page, wiki_id, base, opener, req_headers)
    url = absolutize(page, base)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 404:
            #Moin returns 404 on a successful DeletePage POST; recast as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def fill_page_edit_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=edit&editor=text", None, headers)
    #logger.debug('GRIPPO ' + repr((headers)))
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        #Concerning MoinMoin's behavior: if an attempt is made to edit a page
        #and the user is not authenticated, you will get either a 403 or a
        #404 depending on whether the page being edited exists.  If it
        #doesn't exist, MoinMoin sends back a 404, which is misleading.  We
        #raise MoinMustAuthenticateError to signal the error wrapper to issue
        #a 401 back to the client.
        #Note: Moin for some reason seems to give 403 errors on some URLs in
        #response to Curl's UA
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code, error=str(e))
def fill_attachment_form(page, attachment, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + '?action=AttachFile', None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        #Concerning MoinMoin's behavior: if an attempt is made to post to a
        #page and the user is not authenticated, you will get either a 403 or
        #a 404 depending on whether the page being edited exists.  If it
        #doesn't exist, MoinMoin sends back a 404, which is misleading.  We
        #raise MoinMustAuthenticateError to signal the error wrapper to issue
        #a 401 back to the client.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code, error=str(e))
def moincms(wikibase, outputdir, pattern):
    if pattern:
        pattern = re.compile(pattern)
    #print (wikibase, outputdir, rewrite)
    req = urllib2.Request(wikibase, headers={'Accept': RDF_IMT})
    resp = urllib2.urlopen(req)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    feed = bindery.parse(resp)
    process_list = []
    for item in feed.RDF.channel.items.Seq.li:
        uri = split_fragment(item.resource)[0]
        #print >> sys.stderr, (uri, str(item.resource), split_fragment(item.resource))
        #Deal with the wrapped URI
        if original_wiki_base:
            #print >> sys.stderr, (uri, original_wiki_base.rstrip('/')+'/')
            relative = relativize(uri, original_wiki_base.rstrip('/') + '/').lstrip('/')
            uri = absolutize(relative, wikibase)
            #print >> sys.stderr, (uri, relative)
        if pattern and not pattern.match(relative):
            continue
        n = node.factory(uri, relative, outputdir)
        if n.up_to_date():
            pass
            #print >> sys.stderr, 'Up to date.  Skipped...'
        else:
            process_list.append(n)
    #Process nodes needing update according to priority
    for n in sorted(process_list, key=attrgetter('PRIORITY'), reverse=True):
        #print >> sys.stderr, "processing ", n.rest_uri
        n.render()
    return
def check_auth(environ, start_response, base, opener, headers=None):
    '''
    Warning: mutates environ in place

    If HTTP auth succeeds, will also attach a cookie to the opener object
    in place
    '''
    auth = environ.get('HTTP_AUTHORIZATION')
    #logger.debug('GRIPPO ' + repr((headers)))
    if not auth:
        return False
    scheme, data = auth.split(None, 1)
    if scheme.lower() != 'basic':
        raise RuntimeError('Unsupported HTTP auth scheme: %s' % scheme)
    username, password = data.decode('base64').split(':', 1)
    url = absolutize('?action=login&name=%s&password=%s&login=login'
                     % (username, urllib.quote(password)), base)
    request = urllib2.Request(url, None, headers)
    try:
        with closing(opener.open(request)) as resp:
            #No need to do anything with the response; the cookies will be
            #captured automatically
            pass
    except urllib2.URLError, e:
        if e.code == 401:
            #The backend HTTP server has likely rejected our request due to
            #HTTP auth
            raise HTTPAuthorizationError(url=url)
        elif e.code == 403:
            #A forbidden response means we made it to MoinMoin, but the user
            #name/pass was rejected
            raise MoinAuthorizationError(url=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
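#A standard-library sketch of the Basic credential parsing that check_auth
#performs above (the 'user:pass' value is illustrative):
auth = 'Basic ' + 'user:pass'.encode('base64').strip()
scheme, data = auth.split(None, 1)
assert scheme.lower() == 'basic'
username, password = data.decode('base64').split(':', 1)
#username -> 'user', password -> 'pass'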
def test_relativize(self):
    for targetUri, againstUri, relativeUri, subPathUri in relativize_test_cases:
        res = iri.relativize(targetUri, againstUri)
        self.assertEqual(relativeUri, res,
                         "target=%r against=%r (subPathOnly=False)"
                         % (targetUri, againstUri))
        if res is not None:
            res = iri.absolutize(res, againstUri)
            self.assertEqual(res, targetUri,
                             "target=%r against=%r (subPathOnly=False, Absolutize)"
                             % (targetUri, againstUri))
        res = iri.relativize(targetUri, againstUri, True)
        self.assertEqual(subPathUri, res,
                         "target=%r against=%r (subPathOnly=True)"
                         % (targetUri, againstUri))
        if res is not None:
            res = iri.absolutize(res, againstUri)
            self.assertEqual(res, targetUri,
                             "target=%r against=%r (subPathOnly=True, Absolutize)"
                             % (targetUri, againstUri))
def test_xslt_uo_20010503_2():
    _run_xml(
        source_xml="""<?xml version='1.0'?>
<x xmlns:xi="http://www.w3.org/2001/XInclude">
  <xi:include href="include2.xi"/>
</x>
""",
        transform_xml=common_transform,
        expected="""<?xml version="1.0" encoding="UTF-8"?>
<x xmlns:xi="http://www.w3.org/2001/XInclude">
  <foo xml:base="%s">
    <foo xml:base="%s"/>
  </foo>
</x>""" % (iri.absolutize("include2.xi", BASE_URI),
           iri.absolutize("include1.xi", BASE_URI)),
    )
def load(self, webget):
    """
    >>> g = XHTMLProfileGlean(u'http://www.w3.org/2003/g/data-view', Graph())
    >>> g.load(WebMemo())
    >>> GRDDL_PROFILE in g.profiles
    True
    """
    super(XHTMLProfileGlean, self).load(webget)
    self.profiles = []
    if self.doc:
        profile = self.doc.xml_select(u'/xhtml:html/xhtml:head/@profile',
                                      {u'xhtml': XHTML_NS})
        if profile:
            self.profiles = U(profile[0]).split()
        for profile in self.profiles:
            if profile == GRDDL_PROFILE or profile == self.url:
                #@@What about a document that is its own profile?
                continue
            if self.DEBUG:
                print >>sys.stderr, "processing profile url: ", profile
            #Glean the GRDDL result from the profile document
            prresult = Graph()
            GRDDLAgent(absolutize(profile, self.baseURI), prresult, webget,
                       DEBUG=self.DEBUG)
            continueRecursion = True
            #Set up a set of processed transforms to avoid infinite
            #profile snooping cycles
            processedProfileXForms = set()
            #Recursively find 'new' profile transformations
            while continueRecursion:
                todoXForms = set()
                if self.DEBUG:
                    print >>sys.stderr, \
                        "checking for profileTransformation triples with subject of: ", \
                        absolutize(profile, self.baseURI)
                pat = (URIRef(absolutize(profile, self.baseURI)),
                       GRDDL_VOCAB.profileTransformation, None)
                for s, p, xform in prresult.triples(pat):
                    if self.DEBUG:
                        print >>sys.stderr, "Found: (%s,%s)" % (p, xform)
                    if xform not in processedProfileXForms:
                        todoXForms.add(xform)
                #Continue only if we have xforms to apply
                continueRecursion = bool(todoXForms)
                #Apply the new profile transforms on the GRDDL source,
                #merging the GRDDL results as we go
                for newXForm in todoXForms:
                    self.transform(newXForm, webget)
                    processedProfileXForms.add(newXForm)
def test_absolutize(self):
    for uriRef, baseUri, expectedUri in absolutize_test_cases:
        res = iri.absolutize(uriRef, baseUri)
        #In a couple of cases, there's more than one correct result
        if isinstance(expectedUri, tuple):
            self.assertEqual(1, res in expectedUri,
                             "base=%r ref=%r" % (baseUri, uriRef))
        else:
            self.assertEqual(expectedUri, res,
                             "base=%r ref=%r" % (baseUri, uriRef))
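#Hypothetical shape of absolutize_test_cases: (uriRef, baseUri, expectedUri)
#tuples, with a tuple of alternatives where more than one result is correct.
#These rows are illustrative (standard RFC 3986 reference-resolution
#examples), not drawn from the real fixture:
_example_absolutize_cases = [
    ('g', 'http://a/b/c/d;p?q', 'http://a/b/c/g'),
    ('../g', 'http://a/b/c/d;p?q', 'http://a/b/g'),
    ('#s', 'http://a/b/c/d;p?q', 'http://a/b/c/d;p?q#s'),
]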
def zen_type(space, data):
    '''
    Compute a Zen type as a full moinrest URI, as well as a path relative to
    the top of the wiki instance
    '''
    rtype = data['zen:metadata']['zen:type']
    if logger: logger.debug('zen_type link: ' + repr(rtype))
    tpath, tid = rtype, absolutize(rtype, space.remotedb)
    if logger: logger.debug('Retrieved zen_type: ' + repr((tid, tpath)))
    return (tid, tpath)
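#Illustrative call, with a made-up moinrest base on space.remotedb:
#  space.remotedb = 'http://localhost:8880/moin/w/'
#  data = {'zen:metadata': {'zen:type': 'type/webfeed'}}
#  zen_type(space, data)
#  -> ('http://localhost:8880/moin/w/type/webfeed', 'type/webfeed')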
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to the original and REST-wrapper for a page,
    given a link from another page

    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the
        relative link came, defaults to same as wrapped_base
    raw - the link is a full hierarchical path, rather than relative to the
        wiki base

    Returns a tuple (wrapped_uri, abs_link)

    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL

    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #E.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b;
    #abs_link is http://myhost:8080/mywiki/a/b - note the need to strip the
    #leading / to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    if raw and not is_absolute(link):
        (scheme, authority, path, query, fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    abs_link = absolutize(link, original_base.rstrip('/') + '/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/') + '/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/') + '/')
    return rest_uri, abs_link
def handleirilist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as
    Markdown, including absolutizing relative IRIs
    '''
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
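#Illustrative call, assuming a Versa-style model object with add() and
#generate_resource() as used above (the IRIs are invented):
#  newlist = handleirilist('a b ../c', base='http://example.org/x/', model=model)
#adds three VERSA item links from a fresh list resource to:
#  http://example.org/x/a, http://example.org/x/b, http://example.org/c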
def handleiriset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as
    Markdown, including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
def evaluate_as_nodeset(self, context):
    arg0, arg1 = self._args
    if arg1 is None:
        base_uri = context.instruction.baseUri
    else:
        for node in arg1.evaluate_as_nodeset(context):
            base_uri = node.xml_base
            break
        else:
            raise XsltRuntimeError(XsltError.DOC_FUNC_EMPTY_NODESET,
                                   context.instruction)
    arg0 = arg0.evaluate(context)
    if isinstance(arg0, datatypes.nodeset):
        uris = set()
        for node in arg0:
            uri = datatypes.string(node)
            if arg1 is None:
                base_uri = node.xml_base
            assert base_uri or iri.is_absolute(uri)
            uris.add(iri.absolutize(uri, base_uri))
    else:
        uri = datatypes.string(arg0)
        assert base_uri or iri.is_absolute(uri)
        uris = [iri.absolutize(uri, base_uri)]
    documents = context.documents
    sources = context.transform.root.sources
    result = []
    for uri in uris:
        if uri in documents:
            doc = documents[uri]
        else:
            if uri in sources:
                doc = amara.parse(StringIO(sources[uri]), uri)
            else:
                doc = amara.parse(uri)
            documents[uri] = doc
        result.append(doc)
    return datatypes.nodeset(result)
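#The method above implements XSLT 1.0's document() function; a stylesheet
#exercising both the string form and the node-set form might look like this
#(illustrative; 'data.xml' and the /refs/ref paths are invented):
_example_xslt = """<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <!-- single URI, resolved against the instruction's base URI -->
    <xsl:copy-of select="document('data.xml')"/>
    <!-- node-set of URIs, each resolved against its own xml:base -->
    <xsl:copy-of select="document(/refs/ref)"/>
  </xsl:template>
</xsl:stylesheet>"""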
def fill_page_delete_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=DeletePage", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code, error=str(e))
def atom_moin(body, ctype, maxcount=None, folder=None, feed=None):
    #Sample query:
    #curl --request POST "http://localhost:8880/atom.moin?feed=http://bitworking.org/news/feed/&maxcount=10&folder=foo091023"
    #You can set ...&maxcount=100 or whatever number, if you like
    maxcount = int(maxcount if maxcount else DEFAULT_MAX)
    H = httplib2.Http('.cache')
    if USER:
        H.add_credentials(USER, PASSWD)
    #Prepare the envelope for the output (POST response)
    w = structencoder()
    output = w.cofeed(ROOT(E_CURSOR(u'updates', {u'feed': feed})))
    logger.debug('Feed: ' + feed)
    entries = atomtools.ejsonize(feed)
    for entry in islice(entries, 0, maxcount):
        try:
            logger.debug('ENTRY: ' + repr(entry))
            aid = entry[u'label']
            slug = atomtools.slug_from_title(aid)
            #logger.debug('GRIPPO' + repr((id,)))
            dest = folder + '/' + slug
            chunks = [' title:: ' + entry[u'title']]
            chunks.append(' last changed:: ' + entry[u'updated'])
            chunks.append(' link:: ' + (first_item(entry[u'link']) or ''))
            if u'summary' in entry:
                chunks.append('= Summary =\n' + entry[u'summary'])
            if u'content_src' in entry:
                chunks.append('= Content =\n' + entry[u'content_src'])
            if u'content_text' in entry:
                chunks.append('= Content =\n' + entry[u'content_text'])
            #logger.debug("Result IDs: " + ids)
            if u'categories' in entry:
                chunks.append(u'= Categories =')
                for categories in entry[u'categories']:
                    chunks.append(' * ' + categories)
            chunks.append(' id:: ' + entry[u'id'])
            chunks.append('= akara:metadata =\n akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed\n')
            url = absolutize(dest, MOINBASE)
            headers = {'Content-Type': 'text/plain'}
            resp, content = H.request(url, "PUT",
                                      body='\n'.join(chunks).encode('utf-8'),
                                      headers=headers)
            logger.debug("Result: " + repr((resp, content)))
            output.send(E(u'update', {u'entry-id': entry[u'id'], u'page': url}))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            logger.info('Exception handling Entry page: ' + repr(e))
            output.send(E(u'failure', {u'entry-id': entry[u'id']}))
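#The PUT body assembled above is moin wiki text shaped roughly like this
#(field values invented):
#
#   title:: Some entry title
#   last changed:: 2009-10-23T12:00:00Z
#   link:: http://example.org/entry/1
#  = Summary =
#  ...summary text...
#   id:: http://example.org/entry/1
#  = akara:metadata =
#   akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed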
def handle_statement(elem, docuri):
    subject = elem.xml_select(u'ancestor::*/@about')
    subject = absolutize(subject[0].xml_value, docuri) if subject else docuri
    datatype = unicode(elem.xml_select(u'string(@datatype)'))
    if datatype:
        datatype = expand(datatype, elem)
    if elem.xml_select(u'@property') and elem.xml_select(u'@content'):
        return (subject, expand(elem.property, elem), elem.content,
                datatype or None)
    elif elem.xml_select(u'@property'):
        return (subject, expand(elem.property, elem), expand(unicode(elem)),
                datatype or None)
    elif elem.xml_select(u'@rel') and elem.xml_select(u'@resource'):
        return (subject, expand(elem.rel, elem), elem.resource,
                datatype or None)
    elif elem.xml_select(u'@rel') and elem.xml_select(u'@href'):
        return (subject, expand(elem.rel, elem), elem.href,
                datatype or None)
    else:
        return ()
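#For orientation, RDFa of this shape (document URI and vocabulary binding
#invented) is what handle_statement consumes; the <span> would yield a
#(subject, property, value, datatype) tuple via the @property/@content branch:
_example_rdfa = """<div xmlns:dc="http://purl.org/dc/elements/1.1/"
     about="http://example.org/doc">
  <span property="dc:title" content="Some Title"/>
</div>"""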
def scrape_page_history(page, base, opener, headers=None, wiki_id=None):
    #wiki_id is used only for error reporting; assumed optional here
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=info", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        #Concerning MoinMoin's behavior: if an attempt is made to get page
        #info and the user is not authenticated, you will get either a 403
        #or a 404 depending on whether the page exists.  If it doesn't
        #exist, MoinMoin sends back a 404, which is misleading.  We raise
        #MoinMustAuthenticateError to signal the error wrapper to issue a
        #401 back to the client.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code, error=str(e))
def unwrap_uri(original_base, wrapped_base, rest_uri):
    '''
    Constructs an absolute URL to the original Moin page

    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    rest_uri - moinrest-wrapped URI

    Returns the unwrapped URL

    >>> from akara.util.moin import unwrap_uri
    >>> unwrap_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://localhost:8880/moin/w/spam')
    'http://example.com/mywiki/spam'
    >>> unwrap_uri('http://example.com/', 'http://localhost:8880/moin/w/', 'http://localhost:8880/moin/w/spam')
    'http://example.com/spam'
    '''
    rel = relativize(rest_uri, wrapped_base.rstrip('/') + '/')
    return absolutize(rel, original_base.rstrip('/') + '/')
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id, base, opener, original_page, wrapped_wiki_base = "
                 + repr((wiki_id, base, opener, original_page, wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)
    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
    #print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    #Unfortunately, because urllib2's data dicts don't give an option for
    #limiting read length, we must read into memory and wrap
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener,
                                     req_headers)
    form_vars["file"] = open(temp_fpath, "rb")
    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())
    except urllib2.URLError, e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def from_markdown(md, output, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    output -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns the document's base IRI, if set
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header
    #levels in Markdown and associates a resource type with the specified
    #resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap[u'h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap[u'h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap[u'h3'] = config.get('autotype-h3')
    interp = config.get('interpretations', {})
    #Map the interpretation IRIs to functions to do the data prep
    for prop, interp_key in interp.iteritems():
        if interp_key in PREP_METHODS:
            interp[prop] = PREP_METHODS[interp_key]
        else:
            #Just use the identity, i.e. no-op
            interp[prop] = lambda x, **kwargs: x

    #Parse the Markdown
    h = markdown.markdown(md.decode(encoding))
    doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    #Each section contains one resource description, but the special one
    #named @docheader contains info to help interpret the rest
    top_section_fields = results_until(
        doc.xml_select(u'//h1[1]/following-sibling::h2'), u'self::h1')
    docheader = doc.xml_select(u'//h1[.="@docheader"]')[0]
    sections = doc.xml_select(u'//h1|h2|h3[not(.="@docheader")]')

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its
        properties.  This generator parses the list and yields the key/value
        pairs representing the properties.  Some properties have attributes,
        expressed in markdown as a nested list; if present these attributes
        are yielded as well, else None is yielded.
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header.  This
        #accommodates multiple lists in a section
        sect_body_items = results_until(
            sect.xml_select(u'following-sibling::*'),
            u'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select(u'following-sibling::ul') for li in ul.xml_select(u'./li') ]
        field_list = [li for elem in sect_body_items
                      for li in elem.xml_select(u'li')]

        def parse_pair(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(_(
                        u'Syntax error in relationship expression: {0}'.format(pair)))
                prop = matched.group(1).strip()
                val = matched.group(2).strip()
                #prop, val = [ part.strip() for part in U(li.xml_select(u'string(.)')).split(u':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val
            return None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property?
            if li.xml_select(u'ul'):
                main = ''.join([U(node) for node in
                                results_until(li.xml_select(u'node()'),
                                              u'self::ul')])
                #main = li.xml_select(u'string(ul/preceding-sibling::node())')
                prop, val = parse_pair(main)
                subfield_list = [sli for sli in li.xml_select(u'ul/li')]
                subfield_dict = dict([parse_pair(U(pair))
                                      for pair in subfield_list])
                if None in subfield_dict:
                    del subfield_dict[None]
                yield prop, val, subfield_dict
            #Just a regular, unadorned property
            else:
                prop, val = parse_pair(U(li))
                if prop:
                    yield prop, val, None

    #Gather the document-level metadata
    base = propbase = rbase = None
    for prop, val, subfield_dict in fields(docheader):
        if prop == '@base':
            base = val
        if prop == '@property-base':
            propbase = val
        if prop == '@resource-base':
            rbase = val
    if not propbase:
        propbase = base
    if not rbase:
        rbase = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #The header can take one of 4 forms: "ResourceID",
        #"ResourceID [ResourceType]", "[ResourceType]" or "[]".
        #The 3rd form is an anonymous resource with a specified type and the
        #4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(U(sect))
        if not matched:
            raise ValueError(_(
                u'Syntax error in resource header: {0}'.format(U(sect))))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = I(iri.absolutize(output.generate_resource(), base))
        if rtype:
            rtype = I(iri.absolutize(rtype, base))
        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_local)
        if rtype:
            output.add(rid, RDFTYPE, rtype)
        #Add the properties
        for prop, val, subfield_dict in fields(sect):
            attrs = subfield_dict or {}
            fullprop = I(iri.absolutize(prop, propbase))
            resinfo = AB_RESOURCE_PAT.match(val)
            if resinfo:
                val = resinfo.group(1)
                valtype = resinfo.group(3)
                if not val:
                    val = output.generate_resource()
                if valtype:
                    attrs[RDFTYPE] = valtype
            if fullprop in interp:
                val = interp[fullprop](val, rid=rid, fullprop=fullprop,
                                       base=base, model=output)
                if val is not None:
                    output.add(rid, fullprop, val)
            else:
                output.add(rid, fullprop, val, attrs)
    return base
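#A minimal input of the shape from_markdown expects, following the
#conventions implied by the parser above (all IRIs invented): a top-level
#"@docheader" section for document metadata, then one section per resource,
#with "property: value" list items.
_example_md = """\
# @docheader

* @base: http://example.org/

# spam [Dish]

* label: Spam and eggs
"""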
def get_page(environ, start_response):
    #logger.debug('get_page: ' + repr((environ['SCRIPT_NAME'], environ['PATH_INFO'])))
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    upstream_handler = None
    status = httplib.OK
    params = cgi.parse_qs(environ['QUERY_STRING'])
    #Note: probably a better solution here: http://code.google.com/p/mimeparse/
    accepted_imts = environ.get('HTTP_ACCEPT', '').split(',')
    #logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    #logger.debug('imt: ' + repr(imt))
    params_for_moin = {}
    #max-age of this response; if set to None, it will not be used
    cache_max_age = CACHE_MAX_AGE
    if NO_CACHE_PATHS and first_item(dropwhile(lambda x: x not in page,
                                               NO_CACHE_PATHS)):
        cache_max_age = None
    if 'rev' in params:
        #XXX: Not compatible with search
        #params_for_moin = {'rev' : params['rev'][0], 'action': 'recall'}
        params_for_moin = {'rev': params['rev'][0]}
    if 'search' in params:
        searchq = params['search'][0]
        query = urllib.urlencode({'value': searchq, 'action': 'fullsearch',
                                  'context': '180', 'fullsearch': 'Text'})
        #?action=fullsearch&context=180&value=foo&=Text
        url = absolutize('?' + query, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
        cache_max_age = None
    #elif 'action' in params and params['action'][0] == 'recall':
    elif moin.HTML_IMT in environ.get('HTTP_ACCEPT', ''):
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page + '?' + params, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.HTML_IMT
    elif moin.RDF_IMT in environ.get('HTTP_ACCEPT', ''):
        #FIXME: Make unique flag optional
        #url = base + '/RecentChanges?action=rss_rc&unique=1&ddiffs=1'
        url = absolutize('RecentChanges?action=rss_rc&unique=1&ddiffs=1', base)
        #print >> sys.stderr, (url, base, '/RecentChanges?action=rss_rc&unique=1&ddiffs=1', )
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
    elif moin.ATTACHMENTS_IMT in environ.get('HTTP_ACCEPT', ''):
        url = absolutize(page + '?action=AttachFile', base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.ATTACHMENTS_IMT

        def upstream_handler():
            #Sigh.  Sometimes you have to break some tag-soup eggs to make a
            #RESTful omelette
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            doc = htmlparse(rbody)
            raise_embedded_error(doc)
            attachment_nodes = doc.xml_select(
                u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
            targets = []
            for node in attachment_nodes:
                target = [param.split('=', 1)[1]
                          for param in node.href.split(u'&')
                          if param.startswith('target=')][0]
                targets.append(target)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'attachments'),
                      (E(u'attachment', {u'href': unicode(t)})
                       for t in targets))))
            return output.read(), ctype
    #Notes on use of URI parameters - http://markmail.org/message/gw6xbbvx4st6bksw
    elif ';attachment=' in page:
        page, attachment = page.split(';attachment=', 1)
        url = absolutize(page + '?action=AttachFile&do=get&target=' + attachment,
                         base)
        request = urllib2.Request(url, None, req_headers)

        def upstream_handler():
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
                return rbody, dict(resp.info())['content-type']
    elif ';history' in page:
        cache_max_age = None
        page, discard = page.split(';history', 1)
        ctype = moin.XML_IMT

        def upstream_handler():
            revs = scrape_page_history(page, base, opener, req_headers)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'history'),
                      (E(u'rev', {u'id': unicode(r['rev']),
                                  u'editor': unicode(r['editor']),
                                  u'date': unicode(r['date']).replace(' ', 'T')})
                       for r in revs))))
            return output.read(), ctype
    elif imt:
        params_for_moin.update({'mimetype': imt})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.DOCBOOK_IMT
    else:
        params_for_moin.update({'action': 'raw'})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.WIKITEXT_IMT
    try:
        if upstream_handler:
            rbody, ctype = upstream_handler()
        else:
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
        #headers = {moin.ORIG_BASE_HEADER: base}
        #moin_base = absolutize(wiki_id, base)
        moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
        response_headers = [("Content-Type", ctype),
                            ("Vary", "Accept"),
                            (moin.ORIG_BASE_HEADER, moin_base_info)]
        if cache_max_age:
            response_headers.append(("Cache-Control",
                                     "max-age=" + str(cache_max_age)))
        start_response(status_response(status), response_headers)
        return rbody
    except urllib2.URLError, e:
        if e.code == 401:
            raise HTTPAuthorizationError(url=request.get_full_url())
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
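#For reference, the Accept-header dispatch implemented in get_page above:
#  moin.HTML_IMT         -> the page itself (rendered HTML)
#  moin.RDF_IMT          -> RecentChanges?action=rss_rc&unique=1&ddiffs=1
#  moin.ATTACHMENTS_IMT  -> page?action=AttachFile, rewrapped as <attachments>
#  any other exact IMT   -> page?mimetype=<imt> (Moin's converter output)
#  (default)             -> page?action=raw (wiki text)
#plus the ;attachment= and ;history URI path params handled above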
def absolutize(uriref, docuri):
    #Resolve the reference against the document URI via the iri library
    #(assumed in scope; this shadows the module-level name, so the call must
    #be qualified), falling back to the raw reference if resolution fails
    try:
        return iri.absolutize(uriref, docuri)
    except Exception:
        return uriref
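#Behavior sketch for the guarded absolutize above (URIs invented):
#  absolutize('a/b', 'http://example.org/doc')  ->  'http://example.org/a/b'
#  absolutize('a/b', '')                        ->  'a/b'  (falls back unchanged)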
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result


RIF_NS = Namespace('http://www.w3.org/2007/rif#')
XSD_NS = Namespace('http://www.w3.org/2001/XMLSchema#')
ENT = Namespace("http://www.w3.org/ns/entailment/")

mimetypes = {
    'application/rdf+xml': 'xml',
    'text/n3': 'n3',
    'text/turtle': 'turtle',
}

TRANSFORM_URI = iri.absolutize('rif-core-rdf.xsl', iri.os_path_to_uri(__file__))

IMPORT_PARTS = \
"""
SELECT DISTINCT ?location ?profile
{
    [] a rif:Import;
       rif:location ?location;
       rif:profile ?profile .
}"""

IMPLIES_PARTS = \
"""
SELECT DISTINCT ?impl ?body ?bodyType ?head ?headType
{
    ?impl a rif:Implies;
          rif:if ?body;
          rif:then ?head .
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    VERSA_BASEIRI + 'text': lambda x, **kwargs: x,
    VERSA_BASEIRI + 'resource': lambda x, base=VERSA_BASEIRI, **kwargs: I(iri.absolutize(x, base)),
    VERSA_BASEIRI + 'resourceset': handleiriset,
}


#FIXME: Isn't this just itertools.takewhile?
def results_until(items, end_criteria):
    for node in items:
        if node.xml_select(end_criteria):
            break
        else:
            yield node


def from_markdown(md, output, encoding='utf-8', config=None):
def generate_resource(self):
    if self._baseuri:
        return iri.absolutize(str(self._id_counter), self._baseuri)
    else:
        return str(self._id_counter)
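#Illustrative behavior (attribute values invented): with
#_baseuri = 'http://example.org/' and _id_counter = 42 this returns
#'http://example.org/42'; with no base URI it returns just '42'.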
def guess_self_uri(environ):
    return absolutize(environ['SCRIPT_NAME'].rstrip('/'),
                      request_uri(environ, include_query=False))
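#A quick check of guess_self_uri against a hand-built WSGI environ (values
#invented; request_uri is assumed to behave like wsgiref.util.request_uri):
_environ = {
    'wsgi.url_scheme': 'http',
    'HTTP_HOST': 'localhost:8880',
    'SCRIPT_NAME': '/moin/',
    'PATH_INFO': '/FrontPage',
}
#request_uri(_environ, include_query=False) -> 'http://localhost:8880/moin/FrontPage'
#guess_self_uri(_environ)                   -> 'http://localhost:8880/moin'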
elif o == "--no-xi-filename": noXIfilename = a if len(args) < 1: usage() return 2 elif docTesting: test() return 2 graph = Graph() namespace_manager = NamespaceManager(Graph()) for prefix,uri in nsBinds.items(): if DEBUG: print >>sys.stderr, "binding %s to %s"%(prefix,uri) namespace_manager.bind(prefix, uri, override=False) graph.namespace_manager = namespace_manager addr = absolutize(argv[-1], "file://%s/" % os.getcwd()) try: GRDDLAgent(addr, graph, WebMemo(zones,DEBUG),DEBUG=DEBUG) except IOError, e: print >>sys.stderr, str(e) return 2 print graph.serialize(format=output) if noXIfilename is not None: graph = Graph() try: GRDDLAgent(addr, graph, WebMemo(zones), False,DEBUG = DEBUG) except IOError, e: print >>sys.stderr, str(e)
def transform(self, transformURLs, webget):
    """
    Takes a space-separated list of transform URLs and applies them against
    the pre-parsed DOM of the GRDDL source - making sure to avoid
    transformations already applied
    """
    for xformURL in transformURLs.split():
        if self.DEBUG:
            print >>sys.stderr, "applying transformation %s" % (xformURL)
        if xformURL not in self.appliedTransforms:
            self.appliedTransforms.append(xformURL)
            #The transform URL is resolved against the source URL (to
            #accommodate relative URLs)
            stylesheetLoc = absolutize(xformURL, self.baseURI)
            lastUri, (content, info) = webget(stylesheetLoc, (XSLT_MT,))
            _transform = InputSource.DefaultFactory.fromString(content,
                                                               stylesheetLoc)
            iSrc = InputSource.DefaultFactory.fromString(self.docSrc, self.url)
            processor = Processor.Processor()
            processor.appendStylesheet(_transform)
            #See: http://www.w3.org/TR/grddl/#stylepi
            #Note: for the XSLT transform, the base URI of the source document
            #is passed in, instead of the base URI of the root node
            result = processor.run(iSrc, ignorePis=1)
            #Get the output method / media-type
            # <!-- Category: top-level-element -->
            # <xsl:output
            #   method = "xml" | "html" | "text" | qname-but-not-ncname
            #   version = nmtoken
            #   encoding = string
            #   omit-xml-declaration = "yes" | "no"
            #   standalone = "yes" | "no"
            #   doctype-public = string
            #   doctype-system = string
            #   cdata-section-elements = qnames
            #   indent = "yes" | "no"
            #   media-type = string />
            #How to accommodate @media-type?
            method = processor.outputParams.method[-1]
            currLen = len(self.graph)
            if method == 'xml':
                self.graph.parse(StringIO(result), publicID=self.baseURI)
                replace = [(URIRef(self.baseURI), p, o, self.graph)
                           for s, p, o in
                           self.graph.triples((URIRef(''), None, None))]
                if replace:
                    if self.DEBUG:
                        print >>sys.stderr, \
                            "Replacing empty string URI ref with %s" % self.baseURI
                    self.graph.remove((URIRef(''), None, None))
                    self.graph.addN(replace)
                if self.DEBUG:
                    print >>sys.stderr, \
                        "Parsed %s triples (using baseURI: %s) as RDF/XML" % (
                            max(0, len(self.graph) - currLen), self.baseURI)
            elif method == 'text':
                #Attempt a Notation 3 parse (covers NTriples and Turtle)
                try:
                    self.graph.parse(StringIO(result), format='n3',
                                     publicID=self.baseURI)
                    #@@This is mostly a workaround for RDFLib 2.4, which will
                    #force an empty URI string as the subject if xml:base = ''
                    replace = [(URIRef(self.baseURI), p, o, self.graph)
                               for s, p, o in
                               self.graph.triples((URIRef(''), None, None))]
                    if replace:
                        if self.DEBUG:
                            print >>sys.stderr, \
                                "Replacing empty string URI ref with %s" % self.baseURI
                        self.graph.remove((URIRef(''), None, None))
                        self.graph.addN(replace)
                    if self.DEBUG:
                        print >>sys.stderr, \
                            "Parsed %s triples (using baseURI: %s) as Notation 3" % (
                                max(0, len(self.graph) - currLen), self.baseURI)
                except:
                    if self.DEBUG:
                        print >>sys.stderr, \
                            "Unknown text-based RDF serialization"
            else:
                #HTML result - recursive GRDDL mechanism?
                raise Exception("unsupported output type")
def read_contentdm(site, collection=None, query=None, limit=None,
                   logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness. ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record for
    the first item in the collection/query, and so on until all the items
    are returned, or the limit is reached.

    If you want to see the debug messages, just do (before calling
    read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    For a nice-sized collection to try:

    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [l.href for l in doc.xml_select(u'//a[@class="res_submenu"]')
                    if int(l.href.split(u',')[-1]) > page_start]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with
                #just @class=submenu links *sigh*
                next = [l.href for l in doc.xml_select(u'//a[@class="submenu"]')
                        if u'CISOSTART' in l.href
                        and int(l.href.split(u',')[-1]) > page_start]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)
            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)
    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']
        resp, page, cachekey, cached = cdmsite.item_page(pageuri)
        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(
                        dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'],
                        site)
                except AttributeError:
                    logger.debug("No thumbnail")
                #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'),
                           page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_',
                                                    U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [l.replace(' (', ', ').replace(')', '').replace('.', '')
                             for l in locations if l.strip()]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = \
                    entry[u"Date_Original"].strip().replace('-', '5').replace('?', '')
            entry[u"Subject"] = [s for s in entry.get(u"Subject", u'').split(', ')
                                 if s.strip()]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir,
                                                    cachekey + '.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass
        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break