Example no. 1
def couch_rev_check_recs_old(docs, src):
    """
    Insert revisions for all records into structure using CouchDB bulk interface.
    Uses key ranges to narrow bulk query to the source being ingested.

    Deprecated: has a performance issue; see the REVU note below.
    """

    uri = join(COUCH_DATABASE,'_all_docs')
    start = quote(COUCH_ID_BUILDER(src,''))
    end = quote(COUCH_ID_BUILDER(src,'Z'*100)) # FIXME. Is this correct?
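    # (Re the FIXME: CouchDB collates keys as Unicode, so a 'Z'*100 sentinel can
    # miss ids with lowercase or non-ASCII suffixes; a high codepoint such as
    # u'\ufff0' is the common endkey sentinel.)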
    uri += '?startkey=%s&endkey=%s'%(start,end)

    # REVU: this requests the full _all_docs listing (ignoring the key-ranged
    # uri built above), fetching ALL docs from the database again on every
    # bulk call. That kills performance and can cause memory issues with big
    # collections: to set revisions for each batch of 100 docs out of 10000,
    # it pulls all 10000 docs once per batch, i.e. 100 times.
    #
    # A new version is implemented in couch_rev_check_recs2; see that function
    # for details.
    resp, cont = H.request(join(COUCH_DATABASE,'_all_docs'), 'GET', headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        rows = json.loads(cont)["rows"]
        #revs = { r["id"]:r["value"]["rev"] for r in rows } # 2.7 specific
        revs = {}
        for r in rows:
            revs[r["id"]] = r["value"]["rev"]
        for doc in docs:
            id = doc['_id']
            if id in revs:
                doc['_rev'] = revs[id]
    else:
        logger.warn('Unable to retrieve document revisions via bulk interface: ' + repr(resp))
        logger.warn('Request old: ' + uri)
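The REVU note above calls for fetching only the revisions actually needed. As a
minimal sketch of one such fix, assuming the same module globals (join,
COUCH_DATABASE, H, COUCH_AUTH_HEADER, json, logger) and a hypothetical helper
name: CouchDB's _all_docs endpoint also accepts a POSTed "keys" list, returning
one row per requested id instead of the whole database.

def couch_revs_for_ids(doc_ids):
    """Sketch: look up current revisions for exactly the given ids."""
    uri = join(COUCH_DATABASE, '_all_docs')
    headers = dict(COUCH_AUTH_HEADER)
    headers['content-type'] = 'application/json'
    resp, cont = H.request(uri, 'POST',
                           body=json.dumps({"keys": list(doc_ids)}),
                           headers=headers)
    revs = {}
    if str(resp.status).startswith('2'):
        for r in json.loads(cont)["rows"]:
            # Rows for unknown ids carry an "error" key and no "value"
            if "value" in r:
                revs[r["id"]] = r["value"]["rev"]
    else:
        logger.warn('Unable to retrieve document revisions: ' + repr(resp))
    return revs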
Example no. 2
def save_document(document):
    """
    Saves the document in the couchdb.

    Arguments:
        document - document to save

    Returns:
        If saving succeeded: the value returned by akara.
        If saving failed: the errors are logged and False is returned.

    """
    logging.info("Updating document in database")
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(conf['AKARA_SERVER'], conf['UPDATE_DOCUMENT_URL'], document[u'id'])
    logging.debug("Calling url: " + url)
    doc = json.dumps(document[u'value'])
    resp, content = h.request(url, 'POST', body=doc)
    if str(resp.status).startswith('2'):
        return content
    else:
        logging.error("Couldn't update document [id=%s]" % (document[u'id']))
        logging.error("    … with data: %s" % (pp.pformat(document)))
        logging.error("    … with raw data: %s" % (doc,))
        return False
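A hedged usage sketch follows; the document shape (u'id' plus u'value') mirrors
the fields the function reads, and the id and title values here are purely
hypothetical:

doc = {u'id': u'example-record-1',
       u'value': {u'title': u'Example record'}}
result = save_document(doc)
if result is False:
    logging.error('Save failed; details are in the log above.')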
Example no. 3
def couch_rev_check_recs(docs):
    """
    Insert revisions for all records into structure using CouchDB bulk interface.
    Uses key ranges to narrow bulk query to the source being ingested.

    Performance improved version of couch_rev_check_recs_old, but it uses another input format:
    Input:
     {doc["_id"]: doc, ...}
    """
    if not docs:
        return
    uri = join(COUCH_DATABASE, '_all_docs')
    docs_ids = sorted(docs)
    start = docs_ids[0]
    end = docs_ids[-1]
#    uri += "?" + urlencode({"startkey": start, "endkey": end})
    uri += '?startkey="%s"&endkey="%s"' % (quote_plus(start), quote_plus(end))
    response, content = H.request(uri, 'GET', headers=COUCH_AUTH_HEADER)
    if str(response.status).startswith('2'):
        rows = json.loads(content)["rows"]
        for r in rows:
            if r["id"] in docs:
                docs[r["id"]]["_rev"] = r["value"]["rev"]
    else:
        logger.warn('Unable to retrieve document revisions via bulk interface: ' + repr(response))
        logger.warn('Request: ' + uri)
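Note the changed contract relative to couch_rev_check_recs_old: callers holding
a plain list of docs must key it by _id first. A minimal sketch of the
adaptation (the ids are hypothetical):

docs_list = [{'_id': 'src--record-1'}, {'_id': 'src--record-2'}]
docs_by_id = dict((doc['_id'], doc) for doc in docs_list)
couch_rev_check_recs(docs_by_id)
# Each doc CouchDB already knows now carries its current '_rev'.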
Example no. 4
def target(environ):
    wiki_id = shift_path_info(environ)
    full_incoming_request = request_uri(environ)
    if wiki_id not in TARGET_WIKIS:
        raise BadTargetError(fronturl=request_uri(environ), target=wiki_id)
    original_page = join(TARGET_WIKIS[wiki_id].rstrip('/')+'/', environ['PATH_INFO'].lstrip('/'))
    #relative_to_wrapped = relativize(, full_incoming_request)
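    # NB: when PATH_INFO is empty the slice below is [:-0], i.e. '' (the variant
    # in Example no. 10 guards this case)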
    wrapped_wiki_base = full_incoming_request[:-len(environ['PATH_INFO'])]
    return wiki_id, TARGET_WIKIS[wiki_id], TARGET_WIKI_OPENERS.get(wiki_id), original_page, wrapped_wiki_base
Example no. 5
File: couchdb.py Project: dpla/zen
    def update_resource(self, path=None):
        '''
        Update a resource based on WSGI environment or a uri path
        '''
        if path:
            docid = path
            if is_absolute(path):
                docid = relativize(path, self.remotedb)
        else:
            docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'

        if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))

        body = self.environ['wsgi.input'].read()

        # If the document already exists, we need to determine its current rev and
        # add it to the input body, skipping the lookup if a rev is already
        # provided in the PUT request body
        body_js = json.loads(body)
        rev = body_js.get('_rev', None)
        if not rev:
            # Need to GET the rev
            resp, content = self.h.request(join(self.remotedb, docid), "GET")
            if str(resp.status).startswith('2'):
                rev = json.loads(content).get('_rev',None)

            logger.debug('update_resource: found existing rev = '+repr(rev))

        if rev:
            body_js['_rev'] = rev
            body = json.dumps(body_js)

        headers = {'content-type':self.environ['CONTENT_TYPE']}
        resp, content = self.h.request(join(self.remotedb, docid), "PUT", body=body, headers=headers)
        
        if logger: logger.debug('resp ' + repr((content[:100], resp)))

        self.prep_slave_response(resp)

        if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
            if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
            return '' #No resource could be retrieved

        return content
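The rev handling above is CouchDB's standard optimistic-concurrency pattern:
fetch the current _rev, inject it into the document, then PUT. As a standalone
sketch of the same dance (not dpla/zen code; db_url and docid are placeholders):

import json
import httplib2

def put_with_rev(db_url, docid, doc):
    # Sketch of the GET-rev-then-PUT pattern used by update_resource above
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = db_url.rstrip('/') + '/' + docid
    resp, content = h.request(url, 'GET')
    if str(resp.status).startswith('2'):
        # Document exists: reuse its current revision to avoid a 409 conflict
        doc['_rev'] = json.loads(content)['_rev']
    return h.request(url, 'PUT', body=json.dumps(doc),
                     headers={'content-type': 'application/json'})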
Example no. 6
File: couchdb.py Project: dpla/zen
 def get_rulesheet(self):
     rsheet = self.data['zen:metadata']['zen:rulesheet']
     if rsheet == '.':
         #The rulesheet is in a standalone attachment to this doc
         rev = self.data['_rev']
         self.rulesheet = join(self.slave_uri, u'attachment?rev=' + rev)
     else:
         #self.rulesheet = UNSPECIFIED
         self.rulesheet = rsheet
     if self.space: self.space.logger.debug('resource_type.get_rulesheet slave_uri, rulesheet: ' + repr((self.slave_uri, self.rulesheet)))
     return self.rulesheet
Example no. 7
def listrecords(limit=100):
    import httplib2
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(COUCH_DATABASE, '_design', VIEW_APP, '_view', VIEW_NAME)
    url += '?limit=' + str(limit)
    logger.debug(url)
    resp, content = h.request(url, "GET", headers=COUCH_AUTH_HEADER)
    logger.debug("Content: " + content)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't get documents via: " + repr(resp))
Example no. 8
def couch_rev_check_recs(docs,src):
    '''
    Insert revisions for all records into structure using CouchDB bulk interface.
    Uses key ranges to narrow bulk query to the source being ingested.
    '''
    uri = join(COUCH_DATABASE,'_all_docs')
    start = quote(COUCH_ID_BUILDER(src,''))
    end = quote(COUCH_ID_BUILDER(src,'Z'*100)) # FIXME. Is this correct?
    uri += '?startkey=%s&endkey=%s'%(start,end)
    resp, cont = H.request(uri, 'GET', headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        rows = json.loads(cont)["rows"]
        #revs = { r["id"]:r["value"]["rev"] for r in rows } # 2.7 specific
        revs = {}
        for r in rows:
            revs[r["id"]] = r["value"]["rev"]
        for doc in docs:
            id = doc['_id']
            if id in revs:
                doc['_rev'] = revs[id]
    else:
        logger.debug('Unable to retrieve document revisions via bulk interface: '+repr(resp))
Example no. 10
def target(environ):
    wiki_id = shift_path_info(environ)
    full_incoming_request = request_uri(environ)
    if wiki_id not in TARGET_WIKIS:
        raise BadTargetError(fronturl=request_uri(environ), target=wiki_id)
    original_page = join(TARGET_WIKIS[wiki_id].rstrip('/') + '/',
                         environ['PATH_INFO'].lstrip('/'))
    #relative_to_wrapped = relativize(, full_incoming_request)
    if len(environ['PATH_INFO']) > 0:
        wrapped_wiki_base = full_incoming_request[:-len(environ['PATH_INFO'])]
    else:
        wrapped_wiki_base = full_incoming_request
    return wiki_id, TARGET_WIKIS[wiki_id], TARGET_WIKI_OPENERS.get(
        wiki_id), original_page, wrapped_wiki_base
Example no. 11
File: couchdb.py Project: dpla/zen
    def delete_resource(self, path=None):
        '''
        Delete a resource based on WSGI environment or a uri path
        '''
        if path:
            docid = path
            if is_absolute(path):
                docid = relativize(path, self.remotedb)
        else:
            docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'

        if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
        resp, content = self.h.request(join(self.remotedb, docid), "DELETE")#, headers=headers)
        
        if logger: logger.debug('resp ' + repr((content[:100], resp)))

        self.prep_slave_response(resp)

        if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
            if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
            return '' #No resource could be retrieved

        return content
Example no. 12
File: couchdb.py Project: dpla/zen
    def __init__(self, space, docid, data, rtype=None):
        '''
        Wrap a CouchDB document (docid plus parsed JSON data) as a resource in the given space
        '''
        self.docid = docid
        self.space = space
        self.slave_uri = join(space.remotedb, docid)
        self.data = data
        self.rulesheet = None

        if logger: logger.debug('GRIPPO: ' + repr(rtype))
        if isinstance(rtype, basestring) and rtype != RESOURCE_TYPE_TYPE:
            self.type = space.resource_factory(rtype)
        else:
            self.type = rtype
        return
Example no. 13
File: couchdb.py Project: dpla/zen
    def resource_factory(self, path=None):
        '''
        Look up and retrieve a new resource based on WSGI environment or a uri path
        '''
        if path:
            docid = path
            if is_absolute(path):
                docid = relativize(path, self.remotedb)
        else:
            docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'
        #resp, content = self.h.request(slave_uri + ';history', "GET", headers=auth_headers)
        if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
        resp, content = self.h.request(join(self.remotedb, urllib.quote_plus(docid)))
        
        if logger: logger.debug('resp ' + repr((content[:100], resp)))

        self.prep_slave_response(resp)

        if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
            if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
            return '' #No resource could be retrieved

        data = json.loads(content)
        return resource.factory(self, docid, data)
Example no. 14
def update_document(body, ctype):
    from StringIO import StringIO
    io = StringIO(body) 
    parsed_doc = json.load(io) 
    document_id = parsed_doc[u"id"]
    document = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " % (document, document_id, ) )
Example no. 15
File: couchdb.py Project: dpla/zen
    def setup_request(self, environ):
        '''
        Prepare to service a forwarded call from Zen central
        environ - the WSGI environ of the original invocation
        '''
        #Prepare the WSGI start_response function, which covers response headers and status
        self.resp_status = None
        self.resp_headers = None
        self.exc_info = None
        self.environ = environ
        #FIXME: Use akara to get the right cache location
        self.h = httplib2.Http('/tmp/.cache')
        self.h.force_exception_to_status_code = True

        #Set up utility environ variable for rulesheets
        self.environ['zen.RESOURCE_URI'] = join(self.ZEN_BASEURI, environ['PATH_INFO'].lstrip('/').split('/')[0])
        self.environ['couchdb.RESOURCE_URI'] = self.remotedb
        return
Example no. 16
def update_document(body, ctype):
    logger.debug(body)
    from StringIO import StringIO
    io = StringIO(body) 
    parsed_doc = json.load(io) 
    document_id = parsed_doc[u"id"]
    document = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " % (document, document_id, ) )
Example no. 17
def find_peer_service(peer_id, environ=None):
    '''
    DEPRECATED! Use discover_service() and work with the resulting query_template property
    Find a peer service endpoint, by ID, mounted on this same Akara instance
    
    Must be called from a running akara service, and it is highly recommended to call it
    at the top of service functions, or at least before the request environ has been manipulated
    '''
    if not in_akara():
        raise RuntimeError('find_peer_service is meant to be called from within Akara process space')
    from akara.registry import _current_registry
    from akara import request
    if environ:
        serverbase = guess_self_uri(environ)
    else:
        serverbase = getattr(global_config, 'server_root')
    for (path, s) in _current_registry._registered_services.iteritems():
        if s.ident == peer_id:
            return join(serverbase, '..', path)
    return None
Example no. 18
def find_peer_service(peer_id, environ=None):
    '''
    DEPRECATED! Use discover_service() and work with the resulting query_template property
    Find a peer service endpoint, by ID, mounted on this same Akara instance
    
    Must be called from a running akara service, and it is highly recommended to call it
    at the top of service functions, or at least before the request environ has been manipulated
    '''
    if not in_akara():
        raise RuntimeError(
            'find_peer_service is meant to be called from within Akara process space'
        )
    from akara.registry import _current_registry
    from akara import request
    if environ:
        serverbase = guess_self_uri(environ)
    else:
        serverbase = getattr(global_config, 'server_root')
    for (path, s) in _current_registry._registered_services.iteritems():
        if s.ident == peer_id:
            return join(serverbase, '..', path)
    return None
Example no. 19
def get_documents():
    """
    Downloads a set of documents via akara. If there is an error while
    downloading the documents, the script exits.

    Arguments:
        None

    Returns:
        The response content on success; exits the process on failure.
    """
    logging.info('Getting documents from akara.')
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(conf['AKARA_SERVER'], conf['GET_DOCUMENTS_URL']) + "?limit=%s" % conf['GET_DOCUMENTS_LIMIT']
    logging.debug('Using akara url: ' + url)
    resp, content = h.request(url, 'GET')
    if str(resp.status).startswith('2'):
        return content
    else:
        logging.error("Couldn't get documents using: " + url)
        logging.error("Emergency exit…")
        exit(1)
Example no. 20
def akara_calendar(highlight=None):
    '''
    Return a calendar in HTML
    Generates a calendar along the lines of:

        <  January, 2007   >
        Mo Tu We Th Fr Sa Su
               1  2  3  4  5
         6  7  8  9 10 11 12
        13 14 15 16 17 18 19
        20 21 22 23 24 25 26
        27 28 29 30 31

    Marks present date and those that have entries with archive links

    Defines the following classes (for use in CSS customization):

      - akaraCalCalendar
        - calendar table (note: month/year header e.g. January 2007 is in table/th)
      - akaraCalCalendarWeekHeaders
        - week header (Su, Mo, Tu, ...)
      - akaraCalCalendarEmpty
        - filler cell (e.g. days after Jan 31)
      - akaraCalCalendarLive
        - day for which there is an entry (also has links to that day's archives)

    And the following IDs:

      - akaraCalCalendarToday
        - today's calendar day
      - akaraCalCalendarSpecificDay
        - specific day being rendered (if any)

    Some ideas (e.g. CSS styling of the table) from pycalendar.py by Will Guaraldi

    Sample request:
    curl http://localhost:8880/akara.calendar
    curl http://localhost:8880/akara.calendar/2008/12
    curl http://localhost:8880/akara.calendar/2008/12?highlight=2008-12-03
    '''
    baseuri = request.environ['SCRIPT_NAME'] + '/'
    today = date.today()
    year = shift_path_info(request.environ)
    month = shift_path_info(request.environ)
    if highlight:
        #Fun axiom: date(*map(int, date.today().isoformat().split('-')))
        highlight = date(*map(int, highlight.split('-')))
    if year and month:
        #Use specified year & month
        year, month = int(year), int(month)
        if (year, month) == (today.year, today.month):
            present_day = today.day
        else:
            present_day = None
    else:
        #XXX We might want to return Bad Request if they specified year but not month
        #Use present year & month
        year, month = today.year, today.month
        present_day = today.day
    #logger.debug("year: " + repr(year))

    dayheaders = ''.join(
        ['<td>%s</td>' % dh for dh in calendar.weekheader(3).split()])
    monthcal = calendar.monthcalendar(year, month)
    c = []
    for wk in monthcal:
        c.append('<tr>\n')
        for d in wk:
            d_int = int(d)
            attrs = ''
            if d_int < 1:
                d = '&#160;'
                fulldate = date.max  #never to be found in archives
                attrs += ' class="akaraCalCalendarEmpty"'
            else:
                fulldate = date(year, month, d_int)
            # "today" trumps "specific day"
            if d_int == present_day:
                attrs += ' id="akaraCalCalendarToday"'
            elif highlight and d_int == highlight.day:
                attrs += ' id="akaraCalCalendarSpecificDay"'
            #if fulldate in archives:
            #    attrs += ' class="akaraCalCalendarLive"'
            #d = '<a href="%s%i/%i/%s/">%s</a>'%(self.weblog_base_url, year, month, d, d)
            #    d = '%s'%(d)
            c.append('\t<td%s>%s</td>\n' % (attrs, d))
        c.append('\n</tr>\n')
    monthname = calendar.month_name[month]
    prevmonth = date(year, month, 1) + relativedelta(months=-1)
    nextmonth = date(year, month, 1) + relativedelta(months=+1)
    #Yes, even prevmonth is checked against today, so if someone surfs
    #3 months into the future, there will be no month nav links
    if prevmonth > today:
        prevmonth = ''
    else:
        #prevmonth = '<th><a href="%s%i/%i/">&lt;&lt;</a></th>'%(self.weblog_base_url, prevmonth.year, prevmonth.month)
        prevmonth = '<th><a href="%s">&lt;&lt;</a></th>' % (join(
            baseuri, str(prevmonth.year), str(prevmonth.month)))
    if nextmonth > today:
        nextmonth = ''
    else:
        nextmonth = '<th><a href="%s">&gt;&gt;</a></th>' % (join(
            baseuri, str(nextmonth.year), str(nextmonth.month)))
    month = ''.join(c)
    cal = CAL_TEMPLATE.safe_substitute(locals())
    return cal
Example no. 21
def enrich(body,ctype):
    '''   
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    '''

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name and source_name):
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll','').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec','').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = "%s-%s"%(source_name,collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
    }

    enriched_coll_text = pipe(COLL, ctype, coll_enrichments, 'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    if COUCH_DATABASE:
        docuri = couch_rev_check(join(COUCH_DATABASE,cid))
        resp, cont = H.request(docuri,'PUT',body=enriched_coll_text,headers=CT_JSON)
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: "+repr((resp,content)))

    # Then the records
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['original_record'] = record.copy()         

        # Add collection information
        record[u'collection'] = {
            '@id' : at_id,
            'title' : enriched_collection.get('title',"")
        }

        # Set id to value of the first handle, disambiguated with source. Not sure if
        # one is guaranteed or on what scale it's unique
        rid = "%s-%s"%(source_name,record[u'handle'][0].strip())
        record[u'_id'] = rid

        enriched_record = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        if COUCH_DATABASE:
            docuri = couch_rev_check(join(COUCH_DATABASE,rid))
            resp, cont = H.request(docuri,'PUT',body=enriched_record,headers=CT_JSON)
            if not str(resp.status).startswith('2'):
                logger.debug("Error storing record in Couch: "+repr((resp,content)))
                continue
    
    return json.dumps({})
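For reference, a hedged sketch of how a client might invoke this service, based
only on the headers the function reads (Source, Collection, Pipeline-Coll,
Pipeline-Rec); the endpoint and pipeline URIs below are placeholders:

import json
import httplib2

h = httplib2.Http()
headers = {
    'Content-Type': 'application/json',
    'Source': 'example-source',          # required
    'Collection': 'example-collection',  # required
    'Pipeline-Coll': 'http://localhost:8879/coll-step1',  # placeholder URIs
    'Pipeline-Rec': 'http://localhost:8879/rec-step1,http://localhost:8879/rec-step2',
}
body = json.dumps({u'items': [{u'handle': [u'record-1']}]})
resp, content = h.request('http://localhost:8879/enrich', 'POST',
                          body=body, headers=headers)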
Example no. 22
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))

    wrapped_url = join(wrapped_wiki_base, page)
    msg = 'Page updated OK: %s (%s)' % (url, wrapped_url)
    #response.add_header("Content-Length", str(len(msg)))
    moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
    headers = [
        ("Content-Type", "text/plain"),
        ("Content-Location", wrapped_url),
        (moin.ORIG_BASE_HEADER, moin_base_info),
        (moin.WIKI_RELATIVE_HEADER, relativize(wrapped_url,
                                               wrapped_wiki_base)),
    ]
    start_response(status_response(httplib.CREATED), headers)
    return [msg]


# POST handler
Example no. 23
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))

    wrapped_url = join(wrapped_wiki_base, page)
    msg = 'Page updated OK: %s (%s)'%(url, wrapped_url)
    #response.add_header("Content-Length", str(len(msg)))
    moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
    headers = [
        ("Content-Type", "text/plain"),
        ("Content-Location", wrapped_url),
        (moin.ORIG_BASE_HEADER, moin_base_info),
        (moin.WIKI_RELATIVE_HEADER, relativize(wrapped_url, wrapped_wiki_base)),
        ]
    start_response(status_response(httplib.CREATED), headers)
    return [msg]


# POST handler
@dispatcher.method("POST")
Example no. 24
def enrich(body,ctype):
    '''   
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    '''

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name and source_name):
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll','').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec','').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = COUCH_ID_BUILDER(source_name,collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
        "ingestType": "collection"
    }
    # Set collection title field from collection_name if no sets
    if not coll_enrichments[0]:
        COLL['title'] = collection_name 
    set_ingested_date(COLL)

    enriched_coll_text = pipe(COLL, ctype, coll_enrichments, 'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    # FIXME. Integrate collection storage into bulk call below
    if COUCH_DATABASE:
        docuri = join(COUCH_DATABASE,cid)
        couch_rev_check_coll(docuri,enriched_collection)
        resp, cont = H.request(docuri,'PUT',body=json.dumps(enriched_collection),headers=dict(CT_JSON.items()+COUCH_AUTH_HEADER.items()))
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: "+repr((resp,cont)))

    # Then the records
    docs = []
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()         

        # Add collection information
        record[u'collection'] = {
            '@id' : at_id,
            'name' : enriched_collection.get('title',"")
        }
        if 'description' in enriched_collection:
            record[u'collection']['description'] = enriched_collection.get('description',"")

        record[u'ingestType'] = 'item'
        set_ingested_date(record)

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs.append(doc)

    couch_rev_check_recs(docs,source_name)
    couch_docs_text = json.dumps({"docs":docs})
    if COUCH_DATABASE:
        resp, content = H.request(join(COUCH_DATABASE,'_bulk_docs'),'POST',body=couch_docs_text,headers=dict(CT_JSON.items()+COUCH_AUTH_HEADER.items()))
        logger.debug("Couch bulk update response: "+content)
        if not str(resp.status).startswith('2'):
            logger.debug('HTTP error posting to CouchDB: '+repr((resp,content)))

    return json.dumps({'docs' : docs})
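One caveat worth noting on the _bulk_docs call above: CouchDB reports
per-document failures (e.g. update conflicts) inside the response body rather
than via the overall HTTP status. Continuing with the names in the function
above, a stricter check might look like this sketch:

results = json.loads(content)
conflicts = [r for r in results if 'error' in r]
if conflicts:
    logger.debug('Bulk update conflicts: ' + repr(conflicts))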
Example no. 25
File: calweb.py Project: dpla/akara
def akara_calendar(highlight=None):
    '''
    Return a calendar in HTML
    Generates a calendar along the lines of:

        <  January, 2007   >
        Mo Tu We Th Fr Sa Su
               1  2  3  4  5
         6  7  8  9 10 11 12
        13 14 15 16 17 18 19
        20 21 22 23 24 25 26
        27 28 29 30 31

    Marks present date and those that have entries with archive links

    Defines the following classes (for use in CSS customization):

      - akaraCalCalendar
        - calendar table (note: month/year header e.g. January 2007 is in table/th)
      - akaraCalCalendarWeekHeaders
        - week header (Su, Mo, Tu, ...)
      - akaraCalCalendarEmpty
        - filler cell (e.g. days after Jan 31)
      - akaraCalCalendarLive
        - day for which there is an entry (also has links to that day's archives)

    And the following IDs:

      - akaraCalCalendarToday
        - today's calendar day
      - akaraCalCalendarSpecificDay
        - specific day being rendered (if any)

    Some ideas (e.g. CSS styling of the table) from pycalendar.py by Will Guaraldi

    Sample request:
    curl http://localhost:8880/akara.calendar
    curl http://localhost:8880/akara.calendar/2008/12
    curl http://localhost:8880/akara.calendar/2008/12?highlight=2008-12-03
    '''
    baseuri = request.environ['SCRIPT_NAME'] + '/'
    today = date.today()
    year = shift_path_info(request.environ)
    month = shift_path_info(request.environ)
    if highlight:
        #Fun axiom: date(*map(int, date.today().isoformat().split('-')))
        highlight = date(*map(int, highlight.split('-')))
    if year and month:
        #Use specified year & month
        year, month = int(year), int(month)
        if (year, month) == (today.year, today.month):
            present_day = today.day
        else:
            present_day = None
    else:
        #XXX We might want to return Bad Request if they specified year but not month
        #Use present year & month
        year, month = today.year, today.month
        present_day = today.day
    #logger.debug("year: " + repr(year))

    dayheaders = ''.join(
        ['<td>%s</td>' % dh
         for dh in calendar.weekheader(3).split()]
    )
    monthcal = calendar.monthcalendar(year, month)
    c = []
    for wk in monthcal:
        c.append('<tr>\n')
        for d in wk:
            d_int = int(d)
            attrs = ''
            if d_int < 1:
                d = '&#160;'
                fulldate = date.max #never to be found in archives
                attrs += ' class="akaraCalCalendarEmpty"'
            else:
                fulldate = date(year, month, d_int)
            # "today" trumps "specific day"
            if d_int == present_day:
                attrs += ' id="akaraCalCalendarToday"'
            elif highlight and d_int == highlight.day:
                attrs += ' id="akaraCalCalendarSpecificDay"'
            #if fulldate in archives:
            #    attrs += ' class="akaraCalCalendarLive"'
                #d = '<a href="%s%i/%i/%s/">%s</a>'%(self.weblog_base_url, year, month, d, d)
            #    d = '%s'%(d)
            c.append('\t<td%s>%s</td>\n' % (attrs, d))
        c.append('\n</tr>\n')
    monthname = calendar.month_name[month]
    prevmonth = date(year, month, 1) + relativedelta(months=-1)
    nextmonth = date(year, month, 1) + relativedelta(months=+1)
    #Yes, even prevmonth is checked against today, so if someone surfs
    #3 months into the future, there will be no month nav links
    if prevmonth > today:
        prevmonth = ''
    else:
        #prevmonth = '<th><a href="%s%i/%i/">&lt;&lt;</a></th>'%(self.weblog_base_url, prevmonth.year, prevmonth.month)
        prevmonth = '<th><a href="%s">&lt;&lt;</a></th>'%(join(baseuri, str(prevmonth.year), str(prevmonth.month)))
    if nextmonth > today:
        nextmonth = ''
    else:
        nextmonth = '<th><a href="%s">&gt;&gt;</a></th>'%(join(baseuri, str(nextmonth.year), str(nextmonth.month)))
    month = ''.join(c)
    cal = CAL_TEMPLATE.safe_substitute(locals())
    return cal
Example no. 26
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness.  ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record
    for the first item in the collection/query, and so on until all the items
    are returned, or the limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    for a nice-sized collection to try:
    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****")
    '''
    # [setup code truncated in source; it defines cdmsite, seen_links and seen_ids,
    #  and selected result links via an XPath along the lines of
    #  u'//form[@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]']

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)

            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '') 
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey+'.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break