Example #1
def wrapped_uri(original_wiki_base, link):
    abs_link = absolutize(link, original_wiki_base)
    #print >> sys.stderr, 'abs_link: ', abs_link
    rel_link = relativize(abs_link, original_wiki_base)
    #print >> sys.stderr, 'rel_link: ', rel_link
    rest_uri = absolutize(rel_link, REST_WIKI_BASE)
    #print >> sys.stderr, 'rest_uri: ', rest_uri
    return rest_uri
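A minimal usage sketch for wrapped_uri above, assuming the definition stays in scope, that absolutize and relativize behave like the resolvers in amara.lib.iri, and that REST_WIKI_BASE is the REST wrapper's base URI; the concrete values are invented for illustration.

#Assumed, invented wrapper base for the sketch
REST_WIKI_BASE = 'http://localhost:8880/moin/w/'

#A link found on a page of the original wiki is resolved against that wiki's
#base, re-expressed relative to it, then re-rooted under the REST wrapper
rest_uri = wrapped_uri('http://example.com/mywiki/', 'FrontPage')
#rest_uri should come out as something like 'http://localhost:8880/moin/w/FrontPage'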
Example #3
    def load(self, webget):
        """
        
        >>> g = XMLNSGlean(u'http://www.w3.org/2003/g/po-doc.xml', Graph())
        >>> g.load(WebMemo())
        >>> g.nsURI
        u'http://www.w3.org/2003/g/po-ex'
        >>> len(g.graph)
        15
        """
        super(XMLNSGlean, self).load(webget)
        self.nsURI = None
        if self.doc:
            self.nsURI = self.doc.xml_select(u'/*')[0].xml_namespace

            #@@DWC: hmm... why is NSDispatchTermination not recursive?
            if not self.nsURI or self.nsURI in NSDispatchTermination or self.nsURI == self.url:
                return

            #glean GRDDL result from the namespace document
            try:
                nsresult = Graph()
                GRDDLAgent(absolutize(self.nsURI, self.baseURI), nsresult, webget, DEBUG = self.DEBUG)
                if self.DEBUG:
                    print >>sys.stderr, "ns doc graph size", len(nsresult)
            except IOError:
                pass # don't bother if we can't get a namespace document
            else:
                continueRecursion = True
                #setup a set of processed transforms to avoid infinite
                #namespace snooping cycles
                processedNSXForms = set()
                #Recursively find 'new' namespace transformations
                while continueRecursion:
                    todoXForms = set()
                    pat = (URIRef(absolutize(self.nsURI, self.baseURI)), GRDDL_VOCAB.namespaceTransformation, None)
                    for s, p, xform in nsresult.triples(pat):
                        if self.DEBUG:
                            print >>sys.stderr, "found txform in NS doc:", xform
                        if xform not in processedNSXForms:
                            todoXForms.add(xform)
                    #continue only if we have xforms to apply
                    continueRecursion = bool(todoXForms)
                    #apply the new namespace transforms on the GRDDL
                    #source, merging the GRDDL results as we go
                    for newXForm in todoXForms:
                        self.transform(newXForm, webget)
                        processedNSXForms.add(newXForm)
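The while loop above is a worklist-until-fixpoint pattern: keep a set of transforms already applied and stop once a pass turns up nothing new. A stripped-down, generic sketch of the same pattern (the names are generic, not taken from the GRDDL code):

def apply_until_fixpoint(find_candidates, apply_one):
    #find_candidates() returns the currently known items; apply_one() processes
    #a single item and may cause later find_candidates() calls to return more
    processed = set()
    while True:
        todo = set(item for item in find_candidates() if item not in processed)
        if not todo:
            break
        for item in todo:
            apply_one(item)
            processed.add(item)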
Example #4
def _put_page(environ, start_response):
    '''
    '''
    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError,e:
        raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
Example #5
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support
    the deletion of attachments.

    '''
    #The Moin form asks that this be in multipart/form-data format, but the multipart handler
    #falls back to url-encoding unless you pass it a file.  Luckily, the equivalent
    #url-encoded request works... for now.

    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    form_vars = fill_page_delete_form(page, wiki_id, base, opener, req_headers)

    url = absolutize(page, base)

    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:

        if e.code == 404:
            # Moin returns 404 on a successful DeletePage POST; recast as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
Example #6
def fill_page_edit_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=edit&editor=text", None, headers)
    #logger.debug('GRIPPO ' + repr((headers)))
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)

    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to edit a page and the user
        # is not authenticated, you will get either a 403 or a 404 error depending on
        # whether the page being edited exists.  If it doesn't exist, MoinMoin sends
        # back a 404, which is misleading.  We raise MoinMustAuthenticateError to
        # signal the error wrapper to issue a 401 back to the client.

        #Note: Moin for some reason seems to give 403 errors on some URLs in response to Curl's UA
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code,
                                          error=str(e))
Example #7
def fill_attachment_form(page,
                         attachment,
                         wiki_id,
                         base,
                         opener,
                         headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + '?action=AttachFile', None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)

    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to post to a page and the user
        # is not authenticated, you will get either a 403 or a 404 error depending on
        # whether the page being edited exists.  If it doesn't exist, MoinMoin sends
        # back a 404, which is misleading.  We raise MoinMustAuthenticateError to
        # signal the error wrapper to issue a 401 back to the client.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code,
                                          error=str(e))
Example #8
def moincms(wikibase, outputdir, pattern):
    if pattern: pattern = re.compile(pattern)
    #print (wikibase, outputdir, rewrite)
    req = urllib2.Request(wikibase, headers={'Accept': RDF_IMT})
    resp = urllib2.urlopen(req)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    feed = bindery.parse(resp)
    process_list = []
    for item in feed.RDF.channel.items.Seq.li:
        uri = split_fragment(item.resource)[0]
        #print >> sys.stderr, (uri, str(item.resource), split_fragment(item.resource))
        #Deal with the wrapped URI
        if original_wiki_base:
            #print >> sys.stderr, (uri, original_wiki_base.rstrip('/')+'/')
            relative = relativize(uri, original_wiki_base.rstrip('/')+'/').lstrip('/')
            uri = absolutize(relative, wikibase)
        #print >> sys.stderr, (uri, relative)
        if pattern and not pattern.match(relative):
            continue
        n = node.factory(uri, relative, outputdir)
        if n.up_to_date():
            pass
            #print >> sys.stderr, 'Up to date.  Skipped...'
        else:
            process_list.append(n)
            
    #Process nodes needing update according to priority
    for n in sorted(process_list, key=attrgetter('PRIORITY'), reverse=True):
        #print >> sys.stderr, "processing ", n.rest_uri
        n.render()
    return
Example #9
def check_auth(environ, start_response, base, opener, headers=None):
    '''
    Warning: mutates environ in place
    
    If HTTP auth succeeds will also attach a cookie to the opener object in place
    '''
    auth = environ.get('HTTP_AUTHORIZATION')
    #logger.debug('GRIPPO ' + repr((headers)))
    if not auth:
        return False

    scheme, data = auth.split(None, 1)
    if scheme.lower() != 'basic':
        raise RuntimeError('Unsupported HTTP auth scheme: %s' % scheme)
    username, password = data.decode('base64').split(':', 1)
    url = absolutize(
        '?action=login&name=%s&password=%s&login=login' %
        (username, urllib.quote(password)), base)
    request = urllib2.Request(url, None, headers)
    try:
        with closing(opener.open(request)) as resp:
            #Don't need to do anything with the response.  The cookies will be captured automatically
            pass
    except urllib2.URLError, e:
        if e.code == 401:
            # If we're here, the backend HTTP server has likely rejected our request due to HTTP auth
            raise HTTPAuthorizationError(url=url)
        elif e.code == 403:
            # If we get a forbidden response, we made it to MoinMoin but the user name/pass was rejected
            raise MoinAuthorizationError(url=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
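For reference, the header this function parses is plain HTTP Basic authentication. A sketch of building a test WSGI environ for it; the credentials are made up:

import base64

#Made-up credentials, purely for illustration
username, password = 'wikiuser', 'secret'
environ = {
    'HTTP_AUTHORIZATION': 'Basic ' + base64.b64encode('%s:%s' % (username, password)),
}
#check_auth() splits the header on the first space, checks for the 'basic'
#scheme, then base64-decodes and splits on the first ':' to recover the pair
#before logging in against the wiki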
Example #10
def moincms(wikibase, outputdir, pattern):
    if pattern: pattern = re.compile(pattern)
    #print (wikibase, outputdir, rewrite)
    req = urllib2.Request(wikibase, headers={'Accept': RDF_IMT})
    resp = urllib2.urlopen(req)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    feed = bindery.parse(resp)
    process_list = []
    for item in feed.RDF.channel.items.Seq.li:
        uri = split_fragment(item.resource)[0]
        #print >> sys.stderr, (uri, str(item.resource), split_fragment(item.resource))
        #Deal with the wrapped URI
        if original_wiki_base:
            #print >> sys.stderr, (uri, original_wiki_base.rstrip('/')+'/')
            relative = relativize(uri,
                                  original_wiki_base.rstrip('/') +
                                  '/').lstrip('/')
            uri = absolutize(relative, wikibase)
        #print >> sys.stderr, (uri, relative)
        if pattern and not pattern.match(relative):
            continue
        n = node.factory(uri, relative, outputdir)
        if n.up_to_date():
            pass
            #print >> sys.stderr, 'Up to date.  Skipped...'
        else:
            process_list.append(n)

    #Process nodes needing update according to priority
    for n in sorted(process_list, key=attrgetter('PRIORITY'), reverse=True):
        #print >> sys.stderr, "processing ", n.rest_uri
        n.render()
    return
Example #11
def check_auth(environ, start_response, base, opener, headers=None):
    '''
    Warning: mutates environ in place
    
    If HTTP auth succeeds will also attach a cookie to the opener object in place
    '''
    auth = environ.get('HTTP_AUTHORIZATION')
    #logger.debug('GRIPPO ' + repr((headers)))
    if not auth: 
        return False

    scheme, data = auth.split(None, 1)
    if scheme.lower() != 'basic':
        raise RuntimeError('Unsupported HTTP auth scheme: %s'%scheme)
    username, password = data.decode('base64').split(':', 1)
    url = absolutize('?action=login&name=%s&password=%s&login=login'%(username, password), base)
    request = urllib2.Request(url, None, headers)
    try:
        with closing(opener.open(request)) as resp:
            #Don't need to do anything with the response.  The cookies will be captured automatically
            pass
    except urllib2.URLError,e:
        if e.code == 401:
            # If we're here, the backend HTTP server has likely rejected our request due to HTTP auth
            raise HTTPAuthorizationError(url=url)
        elif e.code == 403:
            # If we get a forbidden response, we made it to MoinMoin but the user name/pass was rejected
            raise MoinAuthorizationError(url=url)
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
Example #12
 def test_relativize(self):
     for targetUri, againstUri, relativeUri, subPathUri in relativize_test_cases:
         res = iri.relativize(targetUri, againstUri)
         self.assertEqual(relativeUri, res, "target=%r against=%r (subPathOnly=False)" % (targetUri, againstUri))
         if res is not None:
             res = iri.absolutize(res, againstUri)
             self.assertEqual(
                 res, targetUri, "target=%r against=%r (subPathOnly=False, Absolutize)" % (targetUri, againstUri)
             )
         res = iri.relativize(targetUri, againstUri, True)
         self.assertEqual(subPathUri, res, "target=%r against=%r (subPathOnly=True)" % (targetUri, againstUri))
         if res is not None:
             res = iri.absolutize(res, againstUri)
             self.assertEqual(
                 res, targetUri, "target=%r against=%r (subPathOnly=True, Absolutize)" % (targetUri, againstUri)
             )
Example #13
def test_xslt_uo_20010503_2():
  _run_xml(
    source_xml = """<?xml version='1.0'?>
  <x xmlns:xi="http://www.w3.org/2001/XInclude">
  <xi:include href="include2.xi"/>
  </x>
  """,
    transform_xml = common_transform,
    expected = """<?xml version="1.0" encoding="UTF-8"?>
<x xmlns:xi="http://www.w3.org/2001/XInclude">
<foo xml:base="%s">
  <foo xml:base="%s"/>
</foo>
</x>""" % (iri.absolutize("include2.xi", BASE_URI),
           iri.absolutize("include1.xi", BASE_URI)),
    )
Example #14
def test_xslt_uo_20010503_2():
    _run_xml(
        source_xml="""<?xml version='1.0'?>
  <x xmlns:xi="http://www.w3.org/2001/XInclude">
  <xi:include href="include2.xi"/>
  </x>
  """,
        transform_xml=common_transform,
        expected="""<?xml version="1.0" encoding="UTF-8"?>
<x xmlns:xi="http://www.w3.org/2001/XInclude">
<foo xml:base="%s">
  <foo xml:base="%s"/>
</foo>
</x>""" % (iri.absolutize("include2.xi",
                          BASE_URI), iri.absolutize("include1.xi", BASE_URI)),
    )
Example #15
def _put_page(environ, start_response):
    '''
    '''
    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
Example #16
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support
    the deletion of attachments.

    '''
    #The Moin form asks that this be in multipart/form-data format, but the multipart handler
    #falls back to url-encoding unless you pass it a file.  Luckily, the equivalent
    #url-encoded request works... for now.
    
    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    form_vars = fill_page_delete_form(page, wiki_id, base, opener, req_headers)

    url = absolutize(page, base)

    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError,e:
        
        if e.code == 404:
            # Moin returns 404 on a successful DeletePage POST; recast as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
Example #17
 def load(self, webget):
     """
     >>> g = XHTMLProfileGlean(u'http://www.w3.org/2003/g/data-view', Graph())
     >>> g.load(WebMemo())
     >>> GRDDL_PROFILE in g.profiles
     True
     
     """
     super(XHTMLProfileGlean, self).load(webget)
     self.profiles = []
     if self.doc:
         profile = self.doc.xml_select(u'/xhtml:html/xhtml:head/@profile',
                                  {u'xhtml':XHTML_NS})
         if profile:
             self.profiles = U(profile[0]).split()
             for profile in self.profiles:
                 if profile == GRDDL_PROFILE or profile == self.url:
                     #@@What about if a document is its own profile?
                     continue
                 if self.DEBUG:
                     print >>sys.stderr, "processing profile url: ", profile
                 #glean GRDDL result from the profile document
                 prresult = Graph()
                 GRDDLAgent(absolutize(profile, self.baseURI), prresult, webget, DEBUG = self.DEBUG)
                 continueRecursion = True
                 #setup a set of processed transforms to avoid
                 #infinite profile snooping cycles
                 processedProfileXForms = set()
                 #Recursively find 'new' namespace transformations
                 while continueRecursion:
                     todoXForms = set()
                     if self.DEBUG:
                         print >>sys.stderr, "checking for profileTransformation triples with subject of: ",absolutize(profile, self.baseURI)
                     pat = (URIRef(absolutize(profile, self.baseURI)), GRDDL_VOCAB.profileTransformation, None)
                     for s, p, xform in prresult.triples(pat):
                         if self.DEBUG:
                             print >>sys.stderr, "Found: (%s,%s)"%(p,xform) 
                         if xform not in processedProfileXForms:
                             todoXForms.add(xform)
                     #continue only if we have xforms to apply
                     continueRecursion = bool(todoXForms)
                     #apply the new namespace transforms on the
                     #GRDDL source, merging the GRDDL results as we
                     #go
                     for newXForm in todoXForms:
                         self.transform(newXForm, webget)
                         processedProfileXForms.add(newXForm)
Example #18
 def test_absolutize(self):
     for uriRef, baseUri, expectedUri in absolutize_test_cases:
         res = iri.absolutize(uriRef, baseUri)
         # in a couple cases, there's more than one correct result
         if isinstance(expectedUri, tuple):
             self.assertEqual(1, res in expectedUri, "base=%r ref=%r" % (baseUri, uriRef))
         else:
             self.assertEqual(expectedUri, res, "base=%r ref=%r" % (baseUri, uriRef))
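For context, each entry in absolutize_test_cases is a (uriRef, baseUri, expectedUri) triple, with a tuple of acceptable results in the ambiguous cases. A few hypothetical entries in that shape, taken from the RFC 3986 section 5.4 reference-resolution examples rather than from the project's real data:

#Hypothetical entries in the (uriRef, baseUri, expectedUri) shape used above;
#the expected values follow RFC 3986 section 5.4, not the project's actual list
absolutize_test_cases = [
    ('g',    'http://a/b/c/d;p?q', 'http://a/b/c/g'),
    ('../g', 'http://a/b/c/d;p?q', 'http://a/b/g'),
    ('#s',   'http://a/b/c/d;p?q', 'http://a/b/c/d;p?q#s'),
    ('//g',  'http://a/b/c/d;p?q', 'http://g'),
]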
Example #19
File: couchdb.py Project: dpla/zen
 def zen_type(space, data):
     '''
     Compute a Zen type as a full moinrest URI as well as a path relative to the top of the wiki instance
     '''
     rtype = data['zen:metadata']['zen:type']
     if logger: logger.debug('zen_type link: ' + repr(rtype))
     tpath, tid = rtype, absolutize(rtype, space.remotedb)
     if logger: logger.debug('Retrieved zen_type: ' + repr((tid, tpath)))
     return (tid, tpath)
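A sketch of the input zen_type expects, assuming the definition above plus its module-level logger and absolutize are in scope; the stand-in space object and every value are invented:

class _Space(object):
    #Minimal stand-in for the real space object; only .remotedb is used here
    remotedb = 'http://localhost:5984/zen/'   #invented CouchDB base URI

data = {'zen:metadata': {'zen:type': 'type/webfeed'}}   #invented document fragment
tid, tpath = zen_type(_Space(), data)
#tpath is the relative type path ('type/webfeed'); tid is that path absolutized
#against space.remotedb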
Example #20
File: moin.py Project: mredar/akara
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to both the original page and its REST-wrapped equivalent, given a link from another page
    
    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came, defaults to same as wrapped_base
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link)
    
    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL
    
    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    if raw and not is_absolute(link):
        (scheme, authority, path, query,
         fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    abs_link = absolutize(link, original_base.rstrip('/') + '/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/') + '/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/') + '/')
    return rest_uri, abs_link
Example #21
 def test_absolutize(self):
     for uriRef, baseUri, expectedUri in absolutize_test_cases:
         res = iri.absolutize(uriRef, baseUri)
         # in a couple cases, there's more than one correct result
         if isinstance(expectedUri, tuple):
             self.assertEqual(1, res in expectedUri,
                              'base=%r ref=%r' % (baseUri, uriRef))
         else:
             self.assertEqual(expectedUri, res,
                              'base=%r ref=%r' % (baseUri, uriRef))
Example #22
File: moin.py Project: dpla/akara
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Constructs absolute URLs to both the original page and its REST-wrapped equivalent, given a link from another page
    
    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came, defaults to same as wrapped_base
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link)
    
    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL
    
    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    #rel_link = relativize(abs_link, original_wiki_base)
    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    #from akara import logger; logger.debug('wiki_uri' + repr((original_base, wrapped_base, link, relative_to, absolutize(link, original_base.rstrip('/')+'/'))))
    if raw and not is_absolute(link):
        (scheme, authority, path, query, fragment) = split_uri_ref(original_base)
        link = link[len(path):]
    link = link.lstrip('/')
    abs_link = absolutize(link, original_base.rstrip('/')+'/')
    rel_to_wikibase = relativize(abs_link, original_base.rstrip('/')+'/')
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None
    rest_uri = absolutize(rel_to_wikibase, wrapped_base.rstrip('/')+'/')
    return rest_uri, abs_link
Example #23
def handleirilist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
Example #24
File: md.py Project: erimille/versa
def handleirilist(ltext, **kwargs):
    '''
    A helper that converts lists of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    base=kwargs.get('base', VERSA_BASEIRI)
    model=kwargs.get('model')
    iris = ltext.strip().split()
    newlist = model.generate_resource()
    for i in iris:
        model.add(newlist, VERSA_BASEIRI + 'item', I(iri.absolutize(i, base)))
    return newlist
Example #25
File: md.py Project: erimille/versa
def handleiriset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop=kwargs.get('fullprop')
    rid=kwargs.get('rid')
    base=kwargs.get('base', VERSA_BASEIRI)
    model=kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
Example #26
def handleiriset(ltext, **kwargs):
    '''
    A helper that converts sets of resources from a textual format such as Markdown, including absolutizing relative IRIs
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None
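A small sketch of how handleiriset (and handleirilist above) can be driven, using a toy model object in place of a real Versa model; the toy class and every value are invented, and the definition above with its iri and I imports is assumed to be in scope:

class ToyModel(object):
    #Records relationships as plain tuples; stands in for a real Versa model
    def __init__(self):
        self.links = []
        self._counter = 0
    def add(self, origin, rel, target, attrs=None):
        self.links.append((origin, rel, target, attrs or {}))
    def generate_resource(self):
        self._counter += 1
        return '__r%d' % self._counter

m = ToyModel()
handleiriset('spam eggs ../ham',
             fullprop='http://example.org/vocab/seeAlso',   #invented property IRI
             rid='http://example.org/doc/page1',            #invented resource ID
             base='http://example.org/doc/',                #invented base IRI
             model=m)
#Each whitespace-separated token is absolutized against base and added to the
#model as one link from rid via fullprop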
Example #27
    def evaluate_as_nodeset(self, context):
        arg0, arg1 = self._args
        if arg1 is None:
            base_uri = context.instruction.baseUri
        else:
            for node in arg1.evaluate_as_nodeset(context):
                base_uri = node.xml_base
                break
            else:
                raise XsltRuntimeError(XsltError.DOC_FUNC_EMPTY_NODESET,
                                       context.instruction)
        arg0 = arg0.evaluate(context)
        if isinstance(arg0, datatypes.nodeset):
            uris = set()
            for node in arg0:
                uri = datatypes.string(node)
                if arg1 is None:
                    base_uri = node.xml_base
                assert base_uri or iri.is_absolute(uri)
                uris.add(iri.absolutize(uri, base_uri))
        else:
            uri = datatypes.string(arg0)
            assert base_uri or iri.is_absolute(uri)
            uris = [iri.absolutize(uri, base_uri)]

        documents = context.documents
        sources = context.transform.root.sources
        result = []
        for uri in uris:
            if uri in documents:
                doc = documents[uri]
            else:
                if uri in sources:
                    doc = amara.parse(StringIO(sources[uri]), uri)
                else:
                    doc = amara.parse(uri)
                documents[uri] = doc
            result.append(doc)
        return datatypes.nodeset(result)
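This is an implementation of XSLT's document() function. A hypothetical stylesheet fragment showing the two argument shapes the code above distinguishes, a plain string versus a node-set:

#Hypothetical XSLT usage of the function implemented above
EXAMPLE_XSLT_USAGE = """
<!-- string argument: resolved against the calling instruction's base URI -->
<xsl:copy-of select="document('other.xml')"/>
<!-- node-set argument: each node's string value is resolved against that node's xml:base -->
<xsl:copy-of select="document(/config/include/@href)"/>
"""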
Example #29
 def test_relativize(self):
     for targetUri, againstUri, relativeUri, subPathUri in relativize_test_cases:
         res = iri.relativize(targetUri, againstUri)
         self.assertEqual(
             relativeUri, res, 'target=%r against=%r (subPathOnly=False)' %
             (targetUri, againstUri))
         if res is not None:
             res = iri.absolutize(res, againstUri)
             self.assertEqual(
                 res, targetUri,
                 'target=%r against=%r (subPathOnly=False, Absolutize)' %
                 (targetUri, againstUri))
         res = iri.relativize(targetUri, againstUri, True)
         self.assertEqual(
             subPathUri, res, 'target=%r against=%r (subPathOnly=True)' %
             (targetUri, againstUri))
         if res is not None:
             res = iri.absolutize(res, againstUri)
             self.assertEqual(
                 res, targetUri,
                 'target=%r against=%r (subPathOnly=True, Absolutize)' %
                 (targetUri, againstUri))
Example #30
def fill_page_delete_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url+"?action=DeletePage", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read(); resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)

    except urllib2.URLError,e:
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),code=e.code,error=str(e))
Example #31
File: atom_zen.py Project: dpla/zen
def atom_moin(body, ctype, maxcount=None, folder=None, feed=None):
    #Sample query:
    #curl --request POST "http://localhost:8880/atom.moin?feed=http://bitworking.org/news/feed/&maxcount=10&folder=foo091023"
    #You can set ...&maxcount=100 or whatever number, if you like
    maxcount = int(maxcount if maxcount else DEFAULT_MAX)

    H = httplib2.Http('.cache')
    if USER:
        H.add_credentials(USER, PASSWD)

    #Prepare the envelope for the output (POST response)
    w = structencoder()
    output = w.cofeed(ROOT(E_CURSOR(u'updates', {u'feed': feed})))
    logger.debug('Feed: ' + feed)
    
    entries = atomtools.ejsonize(feed)
    for entry in islice(entries, 0, maxcount):
        try:
            logger.debug('ENTRY: ' + repr(entry))
            aid = entry[u'label']
            slug = atomtools.slug_from_title(aid)
            #logger.debug('GRIPPO' + repr((id,)))
            dest = folder + '/' + slug
            chunks = [ ' title:: ' + entry[u'title'] ]
            chunks.append(' last changed:: ' + entry[u'updated'])
            chunks.append(' link:: ' + (first_item(entry[u'link']) or ''))

            if u'summary' in entry: chunks.append('= Summary =\n' + entry[u'summary'])
            if u'content_src' in entry: chunks.append('= Content =\n' + entry[u'content_src'])
            if u'content_text' in entry: chunks.append('= Content =\n' + entry[u'content_text'])
            #logger.debug("Result IDs: " + ids)
            if u'categories' in entry:
                chunks.append(u'= Categories =')
                for categories in entry[u'categories']:
                    chunks.append(' * ' + categories)

            chunks.append(' id:: ' + entry[u'id'])
            chunks.append('= akara:metadata =\n akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed\n')

            url = absolutize(dest, MOINBASE)
            headers = {'Content-Type' : 'text/plain'}
            resp, content = H.request(url, "PUT", body='\n'.join(chunks).encode('utf-8'), headers=headers)
            logger.debug("Result: " + repr((resp, content)))
            output.send(E(u'update', {u'entry-id': entry[u'id'], u'page': url}))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            logger.info('Exception handling Entry page: ' + repr(e))
            output.send(E(u'failure', {u'entry-id': entry[u'id']}))
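For orientation, the wiki text that gets PUT for each feed entry, reconstructed from the chunks list assembled above; the field values are invented:

#Reconstructed from the chunks built above; every value is invented
EXAMPLE_PAGE_TEXT = '\n'.join([
    ' title:: An example feed entry',
    ' last changed:: 2010-01-01T00:00:00Z',
    ' link:: http://example.org/news/1',
    '= Summary =\nShort summary text here',
    '= Categories =',
    ' * example',
    ' id:: http://example.org/news/1',
    '= akara:metadata =\n akara:type:: http://purl.org/com/zepheira/zen/resource/webfeed\n',
])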
Example #32
def handle_statement(elem, docuri):
    subject = elem.xml_select(u'ancestor::*/@about')
    subject = absolutize(subject[0].xml_value, docuri) if subject else docuri
    
    datatype = unicode(elem.xml_select(u'string(@datatype)'))
    if datatype: datatype = expand(datatype, elem)
    
    if elem.xml_select(u'@property') and elem.xml_select(u'@content'):
        return ( subject , expand(elem.property, elem), elem.content, datatype or None )
    elif elem.xml_select(u'@property'):
        return ( subject, expand(elem.property, elem), expand(unicode(elem)), datatype or None )
    elif elem.xml_select(u'@rel') and elem.xml_select(u'@resource'):
        return ( subject, expand(elem.rel, elem), elem.resource, datatype or None )
    elif elem.xml_select(u'@rel') and elem.xml_select(u'@href'):
        return ( subject, expand(elem.rel, elem), elem.href, datatype or None )
    else:
        return ()
Example #33
def scrape_page_history(page, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url+"?action=info", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)

    except urllib2.URLError,e:
        # Note on MoinMoin behavior: if an attempt is made to post to a page and the user
        # is not authenticated, you will get either a 403 or a 404 error depending on
        # whether the page being edited exists.  If it doesn't exist, MoinMoin sends
        # back a 404, which is misleading.  We raise MoinMustAuthenticateError to
        # signal the error wrapper to issue a 401 back to the client.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),code=e.code,error=str(e))
Example #34
def fill_page_delete_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=DeletePage", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)

    except urllib2.URLError, e:
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),
                                          code=e.code,
                                          error=str(e))
Example #35
File: moin.py Project: dpla/akara
def unwrap_uri(original_base, wrapped_base, rest_uri):
    '''
    Constructs an absolute URL to the original Moin page
    
    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    rest_uri - moinrest-wrapped URI

    Returns unwrapped_link, the full URL of the original Moin page
    
    >>> from akara.util.moin import unwrap_uri
    >>> unwrap_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://localhost:8880/moin/w/spam')
    'http://example.com/mywiki/spam'
    >>> unwrap_uri('http://example.com/', 'http://localhost:8880/moin/w/', 'http://localhost:8880/moin/w/spam')
    'http://example.com/spam'
    '''
    rel = relativize(rest_uri, wrapped_base.rstrip('/')+'/')
    return absolutize(rel, original_base.rstrip('/')+'/')
Example #36
File: moin.py Project: mredar/akara
def unwrap_uri(original_base, wrapped_base, rest_uri):
    '''
    Constructs an absolute URL to the original Moin page
    
    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    rest_uri - moinrest-wrapped URI

    Returns unwrapped_link, the full URL of the original Moin page
    
    >>> from akara.util.moin import unwrap_uri
    >>> unwrap_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://localhost:8880/moin/w/spam')
    'http://example.com/mywiki/spam'
    >>> unwrap_uri('http://example.com/', 'http://localhost:8880/moin/w/', 'http://localhost:8880/moin/w/spam')
    'http://example.com/spam'
    '''
    rel = relativize(rest_uri, wrapped_base.rstrip('/') + '/')
    return absolutize(rel, original_base.rstrip('/') + '/')
Example #37
def handle_statement(elem, docuri):
    subject = elem.xml_select(u'ancestor::*/@about')
    subject = absolutize(subject[0].xml_value, docuri) if subject else docuri
    
    datatype = unicode(elem.xml_select(u'string(@datatype)'))
    if datatype: datatype = expand(datatype, elem)
    
    if elem.xml_select(u'@property') and elem.xml_select(u'@content'):
        return ( subject , expand(elem.property, elem), elem.content, datatype or None )
    elif elem.xml_select(u'@property'):
        return ( subject, expand(elem.property, elem), expand(unicode(elem)), datatype or None )
    elif elem.xml_select(u'@rel') and elem.xml_select(u'@resource'):
        return ( subject, expand(elem.rel, elem), elem.resource, datatype or None )
    elif elem.xml_select(u'@rel') and elem.xml_select(u'@href'):
        return ( subject, expand(elem.rel, elem), elem.href, datatype or None )
    elif elem.xml_select(u'@rel'):
        return ( subject, expand(elem.rel, elem), elem.href, datatype or None )
    else:
        return ()
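For orientation, the kind of RDFa-style markup handle_statement is probing, inferred only from the attributes it inspects (@about on an ancestor, @property with or without @content, @rel with @resource or @href, and @datatype); the fragment below is hypothetical:

#Hypothetical input fragment, matching the attributes the function checks
SAMPLE_MARKUP = """
<div about="http://example.org/book/1">
  <span property="dc:title">A Sample Title</span>
  <span property="dc:date" content="2009-01-01" datatype="xsd:date"/>
  <a rel="dc:source" href="http://example.org/source/9">source</a>
</div>
"""
#Each matching element yields a (subject, predicate, object, datatype) tuple, with
#the subject taken from the nearest ancestor @about, or the document URI when no
#@about is present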
Example #38
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')

    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id,base,opener,original_age,wrapped_wiki_base=" +
                 repr((wiki_id, base, opener, original_page,
                       wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)

    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
    #    print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    #Unfortunately because urllib2's data dicts don't give an option for limiting read length, must read into memory and wrap
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener,
                                     req_headers)
    form_vars["file"] = open(temp_fpath, "rb")

    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())

    except urllib2.URLError, e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
Example #39
    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)

            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return
Example #40
def fill_page_edit_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url+"?action=edit&editor=text", None, headers)
    #logger.debug('GRIPPO ' + repr((headers)))
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read(); resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)

    except urllib2.URLError,e:
        # Note on MoinMoin behavior: if an attempt is made to edit a page and the user
        # is not authenticated, you will get either a 403 or a 404 error depending on
        # whether the page being edited exists.  If it doesn't exist, MoinMoin sends
        # back a 404, which is misleading.  We raise MoinMustAuthenticateError to
        # signal the error wrapper to issue a 401 back to the client.

        #Note: Moin for some reason seems to give 403 errors on some URLs in response to Curl's UA
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(),target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(),code=e.code,error=str(e))
Example #41
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')

    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id,base,opener,original_age,wrapped_wiki_base="+repr((wiki_id,base,opener,original_page,wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)

    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
#    print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    #Unfortunately because urllib2's data dicts don't give an option for limiting read length, must read into memory and wrap
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener, req_headers)
    form_vars["file"] = open(temp_fpath, "rb")

    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())

    except urllib2.URLError,e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
Example #42
def from_markdown(md, output, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    output -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    No return value
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap[u'h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap[u'h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap[u'h3'] = config.get('autotype-h3')
    interp = config.get('interpretations', {})
    #Map the interpretation IRIs to functions to do the data prep
    for prop, interp_key in interp.iteritems():
        if interp_key in PREP_METHODS:
            interp[prop] = PREP_METHODS[interp_key]
        else:
            #just use the identity, i.e. no-op
            interp[prop] = lambda x, **kwargs: x

    #Parse the Markdown
    h = markdown.markdown(md.decode(encoding))

    doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    top_section_fields = results_until(
        doc.xml_select(u'//h1[1]/following-sibling::h2'), u'self::h1')

    docheader = doc.xml_select(u'//h1[.="@docheader"]')[0]
    sections = doc.xml_select(u'//h1|h2|h3[not(.="@docheader")]')

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties
        Some properties have attributes, expressed in markdown as a nested list. If present, these attributes
        are yielded as well; otherwise None is yielded
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        sect_body_items = results_until(
            sect.xml_select(u'following-sibling::*'),
            u'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select(u'following-sibling::ul') for li in ul.xml_select(u'./li') ]
        field_list = [
            li for elem in sect_body_items for li in elem.xml_select(u'li')
        ]

        def parse_pair(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _(u'Syntax error in relationship expression: {0}'.
                          format(pair)))
                prop = matched.group(1).strip()
                val = matched.group(2).strip()
                #prop, val = [ part.strip() for part in U(li.xml_select(u'string(.)')).split(u':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val
            return None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if li.xml_select(u'ul'):
                main = ''.join([
                    U(node) for node in results_until(li.xml_select(u'node()'),
                                                      u'self::ul')
                ])
                #main = li.xml_select(u'string(ul/preceding-sibling::node())')
                prop, val = parse_pair(main)
                subfield_list = [sli for sli in li.xml_select(u'ul/li')]
                subfield_dict = dict(
                    [parse_pair(U(pair)) for pair in subfield_list])
                if None in subfield_dict: del subfield_dict[None]
                yield prop, val, subfield_dict
            #Just a regular, unadorned property
            else:
                prop, val = parse_pair(U(li))
                if prop: yield prop, val, None

    #Gather the document-level metadata
    base = propbase = rbase = None
    for prop, val, subfield_dict in fields(docheader):
        if prop == '@base':
            base = val
        if prop == '@property-base':
            propbase = val
        if prop == '@resource-base':
            rbase = val
    if not propbase: propbase = base
    if not rbase: rbase = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(U(sect))
        if not matched:
            raise ValueError(
                _(u'Syntax error in resource header: {0}'.format(U(sect))))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = I(iri.absolutize(output.generate_resource(), base))
        if rtype:
            rtype = I(iri.absolutize(rtype, base))
        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_local)
        if rtype:
            output.add(rid, RDFTYPE, rtype)
        #Add the property
        for prop, val, subfield_dict in fields(sect):
            attrs = subfield_dict or {}
            fullprop = I(iri.absolutize(prop, propbase))
            resinfo = AB_RESOURCE_PAT.match(val)
            if resinfo:
                val = resinfo.group(1)
                valtype = resinfo.group(3)
                if not val: val = output.generate_resource()
                if valtype: attrs[RDFTYPE] = valtype
            if fullprop in interp:
                val = interp[fullprop](val,
                                       rid=rid,
                                       fullprop=fullprop,
                                       base=base,
                                       model=output)
                if val is not None: output.add(rid, fullprop, val)
            else:
                output.add(rid, fullprop, val, attrs)

    return base
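A sketch of the Markdown conventions from_markdown expects, pieced together from the parsing code above: a @docheader section supplies @base (and optionally @property-base and @resource-base), and every later header opens a resource whose list items are "property: value" pairs, with an optional [ResourceType] in the header. The document below is invented for illustration:

#Invented input document illustrating the conventions parsed above
SAMPLE_VERSA_MD = '''
# @docheader

* @base: http://example.org/

# page1 [Article]

* title: Spam and eggs
* seeAlso: other/page2
'''

#from_markdown(SAMPLE_VERSA_MD, model) would absolutize 'page1' and 'Article'
#against @base, record the [Article] annotation as the resource's type, and add
#one relationship per "property: value" list item, running any configured
#interpretation function on the value first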
Example #43
def get_page(environ, start_response):
    #logger.debug('get_page: ' + repr((environ['SCRIPT_NAME'], environ['PATH_INFO'])))
    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    upstream_handler = None
    status = httplib.OK
    params = cgi.parse_qs(environ['QUERY_STRING'])
    #Note: probably a better solution here: http://code.google.com/p/mimeparse/
    accepted_imts = environ.get('HTTP_ACCEPT', '').split(',')
    #logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    #logger.debug('imt: ' + repr(imt))
    params_for_moin = {}
    cache_max_age = CACHE_MAX_AGE # max-age of this response. If set to None, it will not be used
    if NO_CACHE_PATHS and first_item(dropwhile(lambda x: x not in page, NO_CACHE_PATHS)):
        cache_max_age = None

    if 'rev' in params:
        #XXX: Not compatible with search
        #params_for_moin = {'rev' : params['rev'][0], 'action': 'recall'}
        params_for_moin = {'rev' : params['rev'][0]}
    if 'search' in params:
        searchq = params['search'][0]
        query = urllib.urlencode({'value' : searchq, 'action': 'fullsearch', 'context': '180', 'fullsearch': 'Text'})
        #?action=fullsearch&context=180&value=foo&=Text
        url = absolutize('?'+query, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
        cache_max_age = None
    #elif 'action' in params and params['action'][0] == 'recall':
    elif moin.HTML_IMT in environ.get('HTTP_ACCEPT', ''):
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page+'?'+params, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.HTML_IMT
    elif moin.RDF_IMT in environ.get('HTTP_ACCEPT', ''):
        #FIXME: Make unique flag optional
        #url = base + '/RecentChanges?action=rss_rc&unique=1&ddiffs=1'
        url = absolutize('RecentChanges?action=rss_rc&unique=1&ddiffs=1', base)
        #print >> sys.stderr, (url, base, '/RecentChanges?action=rss_rc&unique=1&ddiffs=1', )
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
    elif moin.ATTACHMENTS_IMT in environ.get('HTTP_ACCEPT', ''):
        url = absolutize(page + '?action=AttachFile', base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.ATTACHMENTS_IMT
        def upstream_handler():
            #Sigh.  Sometimes you have to break some tag soup eggs to make a RESTful omelette
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            doc = htmlparse(rbody)
            raise_embedded_error(doc)
            attachment_nodes = doc.xml_select(u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
            targets = []
            for node in attachment_nodes:
                target = [ param.split('=', 1)[1] for param in node.href.split(u'&') if param.startswith('target=') ][0]
                targets.append(target)
            output = structencoder(indent=u"yes")
            output.feed(
            ROOT(
                E((u'attachments'),
                    (E(u'attachment', {u'href': unicode(t)}) for t in targets)
                )
            ))
            return output.read(), ctype
    #Notes on use of URI parameters - http://markmail.org/message/gw6xbbvx4st6bksw
    elif ';attachment=' in page:
        page, attachment = page.split(';attachment=', 1)
        url = absolutize(page + '?action=AttachFile&do=get&target=' + attachment, base)
        request = urllib2.Request(url, None, req_headers)
        def upstream_handler():
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            return rbody, dict(resp.info())['content-type']
    #
    elif ';history' in page:
        cache_max_age = None
        page, discard = page.split(';history', 1)
        ctype = moin.XML_IMT
        def upstream_handler():
            revs = scrape_page_history(page, base, opener, req_headers)
            output = structencoder(indent=u"yes")
            output.feed(
            ROOT(
                E((u'history'),
                    (E(u'rev', {u'id': unicode(r['rev']), u'editor': unicode(r['editor']), u'date': unicode(r['date']).replace(' ', 'T')}) for r in revs)
                )
            ))
            return output.read(), ctype
    elif imt:
        params_for_moin.update({'mimetype': imt})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.DOCBOOK_IMT
    else:
        params_for_moin.update({'action': 'raw'})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.WIKITEXT_IMT
    try:
        if upstream_handler:
            rbody, ctype = upstream_handler()
        else:
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
        
        #headers = {moin.ORIG_BASE_HEADER: base}
        #moin_base = absolutize(wiki_id, base)
        moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
        response_headers = [("Content-Type", ctype),
                            ("Vary", "Accept"),
                            (moin.ORIG_BASE_HEADER, moin_base_info)]
        if cache_max_age:
            response_headers.append(("Cache-Control","max-age="+str(cache_max_age)))

        start_response(status_response(status), response_headers)
        return rbody
    except urllib2.URLError, e:
        if e.code == 401:
            raise HTTPAuthorizationError(url=request.get_full_url())
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),target=wiki_id)
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ),backurl=url)
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
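
The mimeparse note in the example above hints at a cleaner way to pick the preferred media type than scanning raw Accept values with dropwhile; a minimal sketch, assuming the third-party mimeparse module and an illustrative (hypothetical) list of the IMTs this front end can serve:

import mimeparse

#Hypothetical list of supported IMTs, in server preference order
SUPPORTED_IMTS = ['text/html', 'application/rdf+xml',
                  'application/docbook+xml', 'text/plain']

def preferred_imt(environ):
    #best_match returns '' when nothing in the Accept header is acceptable
    return mimeparse.best_match(SUPPORTED_IMTS,
                                environ.get('HTTP_ACCEPT', '*/*')) or None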
Esempio n. 44
0
def get_page(environ, start_response):
    #logger.debug('get_page: ' + repr((environ['SCRIPT_NAME'], environ['PATH_INFO'])))
    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    upstream_handler = None
    status = httplib.OK
    params = cgi.parse_qs(environ['QUERY_STRING'])
    #Note: probably a better solution here: http://code.google.com/p/mimeparse/
    accepted_imts = environ.get('HTTP_ACCEPT', '').split(',')
    #logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    #logger.debug('imt: ' + repr(imt))
    params_for_moin = {}
    cache_max_age = CACHE_MAX_AGE  # max-age of this response. If set to None, it will not be used
    if NO_CACHE_PATHS and first_item(
            dropwhile(lambda x: x not in page, NO_CACHE_PATHS)):
        cache_max_age = None

    if 'rev' in params:
        #XXX: Not compatible with search
        #params_for_moin = {'rev' : params['rev'][0], 'action': 'recall'}
        params_for_moin = {'rev': params['rev'][0]}
    if 'search' in params:
        searchq = params['search'][0]
        query = urllib.urlencode({
            'value': searchq,
            'action': 'fullsearch',
            'context': '180',
            'fullsearch': 'Text'
        })
        #?action=fullsearch&context=180&value=foo&=Text
        url = absolutize('?' + query, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
        cache_max_age = None
    #elif 'action' in params and params['action'][0] == 'recall':
    elif moin.HTML_IMT in environ.get('HTTP_ACCEPT', ''):
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page + '?' + params, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.HTML_IMT
    elif moin.RDF_IMT in environ.get('HTTP_ACCEPT', ''):
        #FIXME: Make unique flag optional
        #url = base + '/RecentChanges?action=rss_rc&unique=1&ddiffs=1'
        url = absolutize('RecentChanges?action=rss_rc&unique=1&ddiffs=1', base)
        #print >> sys.stderr, (url, base, '/RecentChanges?action=rss_rc&unique=1&ddiffs=1', )
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
    elif moin.ATTACHMENTS_IMT in environ.get('HTTP_ACCEPT', ''):
        url = absolutize(page + '?action=AttachFile', base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.ATTACHMENTS_IMT

        def upstream_handler():
            #Sigh.  Sometimes you have to break some tag soup eggs to make a RESTful omelette
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            doc = htmlparse(rbody)
            raise_embedded_error(doc)
            attachment_nodes = doc.xml_select(
                u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]'
            )
            targets = []
            for node in attachment_nodes:
                target = [
                    param.split('=', 1)[1] for param in node.href.split(u'&')
                    if param.startswith('target=')
                ][0]
                targets.append(target)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'attachments'),
                      (E(u'attachment', {u'href': unicode(t)})
                       for t in targets))))
            return output.read(), ctype
    #Notes on use of URI parameters - http://markmail.org/message/gw6xbbvx4st6bksw
    elif ';attachment=' in page:
        page, attachment = page.split(';attachment=', 1)
        url = absolutize(
            page + '?action=AttachFile&do=get&target=' + attachment, base)
        request = urllib2.Request(url, None, req_headers)

        def upstream_handler():
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            return rbody, dict(resp.info())['content-type']
    #
    elif ';history' in page:
        cache_max_age = None
        page, discard = page.split(';history', 1)
        ctype = moin.XML_IMT

        def upstream_handler():
            revs = scrape_page_history(page, base, opener, req_headers)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'history'), (E(
                        u'rev', {
                            u'id': unicode(r['rev']),
                            u'editor': unicode(r['editor']),
                            u'date': unicode(r['date']).replace(' ', 'T')
                        }) for r in revs))))
            return output.read(), ctype
    elif imt:
        params_for_moin.update({'mimetype': imt})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.DOCBOOK_IMT
    else:
        params_for_moin.update({'action': 'raw'})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.WIKITEXT_IMT
    try:
        if upstream_handler:
            rbody, ctype = upstream_handler()
        else:
            with closing(opener.open(request)) as resp:
                rbody = resp.read()

        #headers = {moin.ORIG_BASE_HEADER: base}
        #moin_base = absolutize(wiki_id, base)
        moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
        response_headers = [("Content-Type", ctype), ("Vary", "Accept"),
                            (moin.ORIG_BASE_HEADER, moin_base_info)]
        if cache_max_age:
            response_headers.append(
                ("Cache-Control", "max-age=" + str(cache_max_age)))

        start_response(status_response(status), response_headers)
        return rbody
    except urllib2.URLError, e:
        if e.code == 401:
            raise HTTPAuthorizationError(url=request.get_full_url())
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
Esempio n. 45
0
from amara.lib import iri

def absolutize(uriref, docuri):
    try:
        #Delegate to the IRI library's resolver (assumed here to be amara's iri
        #module, as in the other examples); fall back to the raw ref on failure
        return iri.absolutize(uriref, docuri)
    except:
        return uriref
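
A quick illustration of the fallback behaviour (a sketch; the second call deliberately passes a base that cannot be resolved):

abs_ref = absolutize('sub/page', 'http://example.org/wiki/')  #-> 'http://example.org/wiki/sub/page'
raw_ref = absolutize('sub/page', None)                        #-> the raw 'sub/page' comes back unchanged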
Esempio n. 46
0
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

RIF_NS = Namespace('http://www.w3.org/2007/rif#')
XSD_NS = Namespace('http://www.w3.org/2001/XMLSchema#')
ENT    = Namespace("http://www.w3.org/ns/entailment/")

mimetypes = {
    'application/rdf+xml' : 'xml',
    'text/n3'             : 'n3',
    'text/turtle'         : 'turtle',
}

TRANSFORM_URI = iri.absolutize('rif-core-rdf.xsl',iri.os_path_to_uri(__file__))

IMPORT_PARTS=\
"""
SELECT DISTINCT ?location ?profile {
    []    a             rif:Import;
          rif:location  ?location;
          rif:profile   ?profile .
}"""

IMPLIES_PARTS=\
"""
SELECT DISTINCT ?impl ?body ?bodyType ?head ?headType {
    ?impl a             rif:Implies;
          rif:if        ?body;
          rif:then      ?head .
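
The IMPORT_PARTS query above would presumably be run against an rdflib graph holding the RIF-in-RDF document; a minimal sketch of that usage, assuming rdflib's Graph.query with the rif: prefix bound through initNs (the helper name is ours):

def rif_imports(graph):
    #Yield (location, profile) pairs for every rif:Import found in the graph
    for location, profile in graph.query(IMPORT_PARTS, initNs={'rif': RIF_NS}):
        yield location, profile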
Esempio n. 47
0
#Signature inferred from the PREP_METHODS 'resourceset' entry below (the original
#fragment starts mid-function, so this def line is an assumption)
def handleiriset(ltext, **kwargs):
    '''
    Add each IRI in a whitespace-separated value as a resource for the property
    '''
    fullprop = kwargs.get('fullprop')
    rid = kwargs.get('rid')
    base = kwargs.get('base', VERSA_BASEIRI)
    model = kwargs.get('model')
    iris = ltext.strip().split()
    for i in iris:
        model.add(rid, fullprop, I(iri.absolutize(i, base)))
    return None


PREP_METHODS = {
    VERSA_BASEIRI + 'text':
    lambda x, **kwargs: x,
    VERSA_BASEIRI + 'resource':
    lambda x, base=VERSA_BASEIRI, **kwargs: I(iri.absolutize(x, base)),
    VERSA_BASEIRI + 'resourceset':
    handleiriset,
}
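
#A sketch of how the PREP_METHODS table is meant to be used: the interp lookup in
#the from_markdown fragment earlier maps a property IRI to one of these callables
#and passes the raw field text plus context keywords (the helper name is ours)
def apply_prep(prop, raw_value, rid, model, base=VERSA_BASEIRI):
    prep = PREP_METHODS.get(prop, lambda x, **kwargs: x)
    val = prep(raw_value, rid=rid, fullprop=prop, base=base, model=model)
    #resourceset-style preps add to the model themselves and return None
    if val is not None:
        model.add(rid, prop, val)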


#FIXME: Isn't this just itertools.islice?
def results_until(items, end_criteria):
    for node in items:
        if node.xml_select(end_criteria):
            break
        else:
            yield node
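
#The FIXME above: this is not quite itertools.islice, but it is equivalent to
#itertools.takewhile with a negated predicate (a sketch):
from itertools import takewhile

def results_until_tw(items, end_criteria):
    return takewhile(lambda node: not node.xml_select(end_criteria), items)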


def from_markdown(md, output, encoding='utf-8', config=None):
Esempio n. 48
0
    def generate_resource(self):
        if self._baseuri:
            return iri.absolutize(str(self._id_counter), self._baseuri)
        else:
            return str(self._id_counter)
Esempio n. 49
0
def guess_self_uri(environ):
    return absolutize(environ['SCRIPT_NAME'].rstrip('/'),
                      request_uri(environ, include_query=False))
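
A rough illustration of the WSGI environ it expects (hypothetical values; request_uri needs at least the scheme, host and path keys):

environ = {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'example.org',
           'SCRIPT_NAME': '/moin', 'PATH_INFO': '/FrontPage',
           'QUERY_STRING': ''}
#Resolves the script root against the reconstructed request URI,
#giving something like 'http://example.org/moin'
self_uri = guess_self_uri(environ)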
Esempio n. 50
0
        elif o == "--no-xi-filename":
            noXIfilename = a
    if len(args) < 1:
        usage()
        return 2
    elif docTesting:
        test()
        return 2
    graph = Graph()
    namespace_manager = NamespaceManager(Graph())
    for prefix,uri in nsBinds.items():
        if DEBUG:
            print >>sys.stderr, "binding %s to %s"%(prefix,uri)
        namespace_manager.bind(prefix, uri, override=False)        
    graph.namespace_manager = namespace_manager
    addr = absolutize(argv[-1], "file://%s/" % os.getcwd())

    try:
        GRDDLAgent(addr, graph, WebMemo(zones,DEBUG),DEBUG=DEBUG)
    except IOError, e:
        print >>sys.stderr, str(e)
        return 2

    print graph.serialize(format=output)

    if noXIfilename is not None:
        graph = Graph()
        try:
            GRDDLAgent(addr, graph, WebMemo(zones), False,DEBUG = DEBUG)
        except IOError, e:
            print >>sys.stderr, str(e)
Esempio n. 51
0
    def transform(self, transformURLs, webget):
        """
        Takes a space-separated list of transform URLs and applies
        them against the pre-parsed DOM of the GRDDL source, making
        sure to avoid transformations that were already applied
        """
        for xformURL in transformURLs.split():
            if self.DEBUG:
                print >>sys.stderr, "applying transformation %s" % (xformURL)
            if xformURL not in self.appliedTransforms:
                self.appliedTransforms.append(xformURL)
            #The transform URL is resolved against the source URL (to
            #accommodate relative URLs)
            stylesheetLoc = absolutize(xformURL, self.baseURI)
            lastUri, (content, info) = webget(stylesheetLoc, (XSLT_MT,))
            _transform = InputSource.DefaultFactory.fromString(content,
                                                              stylesheetLoc)
            iSrc = InputSource.DefaultFactory.fromString(self.docSrc,self.url)
            processor = Processor.Processor()
            processor.appendStylesheet(_transform)
            #see: http://www.w3.org/TR/grddl/#stylepi
            #Note, for the XSLT transform, the base URI of the source document
            #is passed in, instead of the base URI of the root node   
            result = processor.run(
                iSrc,ignorePis=1
            )
            #get output method / media-type
#            <!-- Category: top-level-element -->
#            <xsl:output
#              method = "xml" | "html" | "text" | qname-but-not-ncname
#              version = nmtoken
#              encoding = string
#              omit-xml-declaration = "yes" | "no"
#              standalone = "yes" | "no"
#              doctype-public = string
#              doctype-system = string
#              cdata-section-elements = qnames
#              indent = "yes" | "no"
#              media-type = string />

            #How to accommodate @media-type?
            method = processor.outputParams.method[-1]
            currLen = len(self.graph)
            if method == 'xml':
                self.graph.parse(StringIO(result), 
                                 publicID=self.baseURI)
                replace = [(URIRef(self.baseURI),p,o,self.graph) for s,p,o in \
                               self.graph.triples((URIRef(''),None,None))]
                if replace:
                    if self.DEBUG:
                        print >>sys.stderr, \
                          "Replacing empty string URI ref with %s" % (
                            self.baseURI)                        
                    self.graph.remove((URIRef(''),None,None))
                    self.graph.addN(replace)                
                if self.DEBUG:
                    print >>sys.stderr,\
                     "Parsed %s triples (using baseURI: %s) as RDF/XML" % (
                        max(0,len(self.graph) - currLen),self.baseURI)
            elif method == 'text':
                #Attempt a Notation 3 parse (covers NTriples, and Turtle)
                try:
                    self.graph.parse(StringIO(result), format='n3',
                                     publicID=self.baseURI)
                    #@@This is mostly as a workaround for RDFLib 2.4 which will 
                    #force an empty URI string as the subject if xml:base = ''                    
                    replace = [(URIRef(self.baseURI),p,o,self.graph) for s,p,o in \
                                   self.graph.triples((URIRef(''),None,None))]
                    if replace:
                        if self.DEBUG:
                            print >>sys.stderr, \
                              "Replacing empty string URI ref with %s" % (
                                self.baseURI)                        
                        self.graph.remove((URIRef(''),None,None))
                        self.graph.addN(replace)                    
                    if self.DEBUG:
                        print >>sys.stderr, \
                        "Parsed %s triples (using baseURI: %s) as Notation 3" % (
                            max(0,len(self.graph) - currLen),self.baseURI)
                except:
                    if self.DEBUG:
                        print >>sys.stderr, "Unknown text-based RDF serialization"
            else:
                #HTML result - recursive GRDDL mechanism?
                raise Exception("unsupported output type")
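
The empty-subject cleanup that appears twice above (the RDFLib 2.4 xml:base workaround) could be factored into one helper; a sketch over an rdflib graph:

from rdflib import URIRef

def rebase_empty_subjects(graph, base_uri):
    #Re-attach triples whose subject parsed as an empty URIRef to the real base URI
    replacements = [(URIRef(base_uri), p, o, graph)
                    for s, p, o in graph.triples((URIRef(''), None, None))]
    if replacements:
        graph.remove((URIRef(''), None, None))
        graph.addN(replacements)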
Esempio n. 52
0
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness.  ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record
    for the first item in the collection/query, and so on until all the items
    are returned, or the limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    for a nice-sized collection to try:
    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****")
    '''
    #surviving fragment of the masked setup code: a selector over the results form,
    #//form[@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)

            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '') 
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey+'.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
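
A typical driver loop for the generator above (a sketch; per the docstring, the first yielded dict is the collection-level metadata):

records = read_contentdm('http://digital.library.louisville.edu/cdm4/',
                         collection='/jthom', limit=10)
header = records.next()             #collection-level metadata
for entry in records:               #one dict per item, up to the limit
    print entry['label'], entry['link']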
Esempio n. 53
0
    def generate_resource(self):
        if self._baseuri:
            return iri.absolutize(str(self._id_counter), self._baseuri)
        else:
            return str(self._id_counter)