def _put_page(environ, start_response):
    '''
    Replace the content of a wiki page with the text of the request body (HTTP PUT).
    '''
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    ctype = environ.get('CONTENT_TYPE', 'application/unknown')

    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
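# copy_headers_to_dict is defined elsewhere in this module.  For orientation, here is
# a minimal sketch of what such a helper plausibly does -- mapping WSGI 'HTTP_*'
# environ keys back to ordinary header names while honoring the exclude list.  The
# function name and details below are assumptions for illustration, not the actual
# implementation used above.
def copy_headers_to_dict_sketch(environ, exclude=None):
    exclude = exclude or []
    headers = {}
    for key, value in environ.items():
        if key.startswith('HTTP_') and key not in exclude:
            # e.g. 'HTTP_ACCEPT_ENCODING' -> 'Accept-Encoding'
            name = '-'.join(part.capitalize() for part in key[len('HTTP_'):].split('_'))
            headers[name] = value
    return headers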
def fill_attachment_form(page, attachment, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + '?action=AttachFile', None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to post to a page and the
        # user is not authenticated, Moin responds with either a 403 or a 404,
        # depending on whether the page being edited exists.  For a non-existent page
        # it sends back a misleading 404.  In both cases we raise
        # MoinMustAuthenticateError to signal the error wrapper to issue a 401 back
        # to the client.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def fill_page_edit_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=edit&editor=text", None, headers)
    #logger.debug('GRIPPO ' + repr((headers)))
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to edit a page and the user
        # is not authenticated, Moin responds with either a 403 or a 404, depending on
        # whether the page being edited exists.  For a non-existent page it sends back
        # a misleading 404.  In both cases we raise MoinMustAuthenticateError to signal
        # the error wrapper to issue a 401 back to the client.
        # Note: Moin for some reason seems to give 403 errors on some URLs in response
        # to curl's User-Agent.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support the
    deletion of attachments.
    '''
    # The Moin form asks that this be in multipart/form-data format, but the multipart
    # handler falls back to URL encoding unless you pass it a file.  Luckily, the
    # equivalent URL-encoded request works... for now.
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    form_vars = fill_page_delete_form(page, wiki_id, base, opener, req_headers)
    url = absolutize(page, base)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 404:
            # Moin returns 404 on a successful DeletePage POST; recast it as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def index_page(self, url, logtag="Requesting index at URL: {0}"):
    if self._proxy:
        url = "{0}?url={1}".format(self._proxy, quote(url))
    self._logger.debug(logtag.format(url))
    start_t = time.time()
    resp, content = self._h.request(url)
    retrieved_t = time.time()
    self._logger.debug("Retrieved in {0}s".format(retrieved_t - start_t))
    doc = htmlparse(content)
    parsed_t = time.time()
    self._logger.debug("Parsed in {0}s".format(parsed_t - retrieved_t))
    return resp, doc
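# For reference, the proxy rewrite in index_page/item_page works like this (the proxy
# URL below is a placeholder, not taken from this code): with
# self._proxy = "http://localhost:8880/cache", a request for http://example.org/page
# becomes
#   http://localhost:8880/cache?url=http%3A//example.org/page
# because urllib's quote() escapes the colon but leaves slashes intact by default.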
def __init__(self, source, html=False):
    self.records = []
    if html:
        from amara.bindery.html import parse as htmlparse
        self.doc = htmlparse(source)
        # html5lib generates adjacent text nodes
        self.doc.xml_normalize()
    else:
        self.doc = amara.parse(source)
    self.new_record()
    self.common_ancestor = None
    self.record_pattern = None
    return
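# Hypothetical usage of the constructor above (the enclosing class is not shown here,
# so "RecordScraper" is a placeholder name): pass raw markup plus a flag selecting the
# HTML (html5lib-backed) parser or amara's XML parser.
#
#   scraper = RecordScraper('<html><body><p>hi</p></body></html>', html=True)
#   xml_scraper = RecordScraper('<records><r>hi</r></records>')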
def fill_page_delete_form(page, wiki_id, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=DeletePage", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            x = resp.read()
            resp = x
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def scrape_page_history(page, base, opener, headers=None):
    url = absolutize(page, base)
    request = urllib2.Request(url + "?action=info", None, headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        # Note on MoinMoin behavior: if an attempt is made to post to a page and the
        # user is not authenticated, Moin responds with either a 403 or a 404,
        # depending on whether the page exists.  For a non-existent page it sends back
        # a misleading 404.  In both cases we raise MoinMustAuthenticateError to signal
        # the error wrapper to issue a 401 back to the client.
        # FIXME: wiki_id is not defined in this function's scope; it would need to be
        # passed in (as the other form-scraping helpers do) for this error path to work.
        if e.code == 403 or e.code == 404:
            raise MoinMustAuthenticateError(url=request.get_full_url(), target=wiki_id)
        else:
            raise UnexpectedResponseError(url=request.get_full_url(), code=e.code, error=str(e))
def upstream_handler():
    # Sigh.  Sometimes you have to break some tag-soup eggs to make a RESTful omelette
    with closing(opener.open(request)) as resp:
        rbody = resp.read()
    doc = htmlparse(rbody)
    raise_embedded_error(doc)
    attachment_nodes = doc.xml_select(
        u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
    targets = []
    for node in attachment_nodes:
        target = [ param.split('=', 1)[1] for param in node.href.split(u'&')
                   if param.startswith('target=') ][0]
        targets.append(target)
    output = structencoder(indent=u"yes")
    output.feed(
        ROOT(
            E((u'attachments'),
              (E(u'attachment', {u'href': unicode(t)}) for t in targets)
            )
        ))
    return output.read(), ctype
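# For reference, the structencoder feed above emits a document shaped roughly like the
# following (the attachment names are placeholders):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <attachments>
#     <attachment href="example.png"/>
#     <attachment href="notes.txt"/>
#   </attachments>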
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    req_headers = copy_headers_to_dict(environ, exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id, base, opener, original_page, wrapped_wiki_base = "
                 + repr((wiki_id, base, opener, original_page, wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)

    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
    #print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    # Unfortunately, because urllib2's data dicts don't give an option for limiting
    # read length, we must read the body into a temp file and wrap it
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener, req_headers)
    form_vars["file"] = open(temp_fpath, "rb")

    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())
    except urllib2.URLError, e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
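# Example of the URI shape post_page expects, based on the partition call above (the
# page and file names are placeholders): once the leading '/' has been stripped from
# PATH_INFO, a request for
#   /SomePage;attachment=report.pdf
# gives page.partition(';attachment=') == ('SomePage', ';attachment=', 'report.pdf').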
def item_page(self, url, logtag="Requesting item at URL: {0}"):
    if self._proxy:
        url = "{0}?url={1}".format(self._proxy, quote(url))
    self._logger.debug(logtag.format(url))
    start_t = time.time()
    resp, content = self._h.request(url)
    retrieved_t = time.time()
    self._logger.debug("Retrieved in {0}s".format(retrieved_t - start_t))
    cachekey = hashlib.md5(content).hexdigest()
    self._logger.debug('MD5 Hash of HTTP body: {0}'.format(cachekey))
    if self._cachedir:
        try:
            json_stream = open(os.path.join(self._cachedir, cachekey + '.extract.js'))
            cached = json.load(json_stream)
            self._logger.debug('Loaded from cache: {0}'.format(cachekey))
            doc = None
        except (IOError, ValueError):
            doc = htmlparse(content)
            cached = None
    else:
        # No cache directory configured, so there is nothing to look up: always parse.
        # (Without this branch, doc and cached would be unbound at the return below.)
        doc = htmlparse(content)
        cached = None
    parsed_t = time.time()
    self._logger.debug("Parsed in {0}s".format(parsed_t - retrieved_t))
    return resp, doc, cachekey, cached
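# index_page and item_page are methods of a small site-access helper whose class body
# is not shown here.  A minimal constructor sketch of the state the two methods rely
# on (self._h, self._proxy, self._logger, self._cachedir), assuming httplib2 as the
# HTTP client -- the class name and defaults are assumptions, not the original code:
import logging
import httplib2

class CdmSiteSketch(object):
    def __init__(self, proxy=None, cachedir=None, logger=None):
        self._h = httplib2.Http()   # httplib2's request() returns (response, content)
        self._proxy = proxy         # optional caching/rewriting proxy URL
        self._cachedir = cachedir   # directory holding <md5>.extract.js item caches
        self._logger = logger or logging.getLogger(__name__)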
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records.
    First generates header info.

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness. ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record for the first
    item in the collection/query, and so on until all the items are returned, or the
    limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for
    the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    For a nice-sized collection to try:

    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')
    '''
    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]')
                     if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]')
                         if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)
            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'),
                           page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '')
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey + '.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
def execute(top=None):
    '''
    Sample request:
    curl -F "pattern=wiki/path" -F "wiki=http://localhost:8880/moin/foo/" "http://*****:*****@class="navigation"]//@href'):
        link = navchild.xml_value
        #print >> sys.stderr, 'LINK:', link
        #uri = split_fragment(item.resource)[0]
        #relative = uri[wikibase_len:]
        #print >> sys.stderr, uri, relative
        #if rewrite:
        #    uri = uri.replace(rewrite, wikibase)
        rest_uri = wrapped_uri(original_wiki_base, link)
        #print >> sys.stderr, 'rest uri:', rest_uri
        items.append(freemix(rest_uri, opener).render())
    return json.dumps({'items': items}, indent=4)
def test(self):
    """ htmlparse(QUERY) should yield a parsed document node that has content """
    doc = htmlparse(QUERY)
    self.assertEqual(doc.xml_type, 'document')
    self.assertEqual(doc.hasContent(), True)