Example #1
def copy_auth(environ, top, realm=None):
    '''
    Get auth creds (HTTP basic only, for now) from the incoming request and return an
    HTTP auth handler for urllib2.  This handler allows you to "forward" this auth to
    remote services

    environ - the usual WSGI environ structure. Note: if you are using
    simple_service, this is available in Akara services as
    akara.request.environ, or it may be passed directly into the handler
    top - top URL to be used for this auth.
    '''
    #Useful: http://www.voidspace.org.uk/python/articles/authentication.shtml
    creds = extract_auth(environ)
    if creds:
        username, password = creds
    else:
        return None

    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    # HTTPPasswordMgr top must omit any URL components before the host (i.e. no scheme and no auth info in the authority section)
    #(scheme, authority, path, query, fragment) = split_uri_ref(top)
    #auth, host, port = split_authority(authority)
    #auth_top_url = (host + ':' + port if port else host) + path
    #print >> sys.stderr, 'Auth creds: %s:%s (%s)'%(username, password, auth_top_url)
    logger.debug('Auth creds: %s:%s (%s)' % (username, password, top))

    # Not setting the realm for now, so use None
    #password_mgr.add_password(None, auth_top_url, username, password)
    password_mgr.add_password(None, top, username, password)
    #password_handler = urllib2.HTTPDigestAuthHandler(password_mgr)
    password_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    return password_handler
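A minimal usage sketch; the remote URL is hypothetical, and environ is assumed to come from akara.request.environ as the docstring suggests:

import urllib2

handler = copy_auth(environ, 'http://remote.example.org/')
if handler:
    opener = urllib2.build_opener(handler)
    content = opener.open('http://remote.example.org/protected').read()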
def kentucky_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail
    """
    data = {}
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    relation_field = "sourceResource/relation"
    if exists(data, relation_field):
        url = getprop(data, relation_field)
    else:
        logger.debug("Field %s does not exist" % relation_field)
        return body

    base_url, ext = os.path.splitext(url)
    data["object"] = "%s_tb%s" % (base_url, ext)

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
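A hypothetical round trip, assuming the usual pipeline JSON body and the module-level IGNORE/PENDING status constants:

doc = {"sourceResource": {"relation": "http://example.org/images/photo.jpg"}}
out = json.loads(kentucky_identify_object(json.dumps(doc), "application/json"))
# out["object"] == "http://example.org/images/photo_tb.jpg"
# out["admin"]["object_status"] == PENDING (download defaults to "True")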
Example #3
def find_file_extension(mime):
    """
    Finds out the file extension based on the MIME type from the opened
    connection.

    Implementation:
        Function is using the configuration field 'mime_to_type' stored
        at akara.conf.

    Arguments:
        mime (String)   -   MIME type read from the HTTP headers

    Returns:
        file extension (String) - extension for the file -
        WITH DOT AT THE BEGINNING!!!

    Throws:
        throws exception if it cannot find the extension
    """

    if mime in MIME_TYPES:
        ext = MIME_TYPES[mime]
        logger.debug("MIME type is [%s], returning extension [%s]" % \
                (mime, ext))
        return ext
    else:
        msg = "Cannot find extension for mime type: [%s]." % mime
        logger.error(msg)
        raise FileExtensionException(msg)
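A sketch of the expected mapping and call; the MIME_TYPES contents below are an assumption standing in for the 'mime_to_type' field from akara.conf:

MIME_TYPES = {"image/jpeg": ".jpg", "image/png": ".png"}

ext = find_file_extension("image/jpeg")  # ".jpg" -- note the leading dot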
def freemix(body, ctype, maxcount=None, diagnostics=None):
    '''
    Render the contents of a file as best as possible in Exhibit JSON
    * Supports Excel, BibTex and JSON for now

    Sample queries:
    * curl --request POST --data-binary @- http://localhost:8880/freemix.json?diagnostics=yes < test/data/load/iraq.xml
    * curl --request POST --data-binary @- http://localhost:8880/freemix.json < test/data/load/iraq.xml
    * curl --request POST --data-binary "@foo.xls" --header "Content-Type: application/vnd.ms-excel" "http://*****:*****@foo.xls" --header "Content-Type: application/msword" "http://localhost:8880/freemix.json"
    '''
    #FIXME: OK enough tower-of-pisa code.  Use more functions

    #DIAGNOSTICS config no longer used at all
    #if diagnostics is None:
    #    diagnostics = DIAGNOSTICS
    #else:
    diagnostics = diagnostics == u'yes'
    logger.debug('diagnostics: ' + repr(diagnostics))
    fixup_obj_labels = True
    imt_saved = imt = guess_imt_(body, ctype)
    #logger.debug("IMT: " + imt)
    ss_data = None
    diag_info = []
    if imt == UNKNOWN_IMT:
        try:
            source = spreadsheet.read(body)
            ss_data = [ row for row in source.rows() ]
            imt = EXCEL_IMTS[0]
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            pass
Example #6
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri: continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://"
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue

        body = cont

    return error, body
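A hypothetical invocation, assuming one relative and one absolute enrichment endpoint are mounted on the same Akara server:

record = {"_id": "abc", "sourceResource": {}}
error, body = pipe(record, "application/json",
                   ["/enrich-date", "http://localhost:8880/shred"],
                   "HTTP_PIPELINE_REC")
if error is None:
    record = json.loads(body)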
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection
    ckey field with the value passed in.
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode, value, multivalue))
    if value: #no value don't bother
        if mode=='overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                pp,pn = tuple(field.lstrip('/').split('/',1))
                if not pp in data:
                    data[pp] = {}
                data[pp][pn] = value
        elif mode=='append':
            new_value = []
            if exists(data, field):
                old_value = getprop(data, field)
                if isinstance(old_value, list):
                    new_value.extend(old_value)
                else:
                    new_value.append(old_value)
            if isinstance(value, list):
                new_value.extend(value)
            else:
                new_value.append(value)
            setprop(data, field, new_value)
        else: # fill blanks
            if not exists(data, field) or not getprop(data,
                    field,keyErrorAsNone=True):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data
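A small sketch of the three modes; field paths follow the getprop/setprop "a/b" convention used above, and any mode other than 'overwrite' or 'append' fills blanks:

doc = {"sourceResource": {"title": "old"}}
set_field_from_value_mode(doc, "sourceResource/title", "overwrite", "new",
                          multivalue=False)
set_field_from_value_mode(doc, "sourceResource/subject", "append", ["cats"])
set_field_from_value_mode(doc, "sourceResource/creator", "fill", "anonymous")
# title -> "new"; subject -> ["cats"]; creator -> ["anonymous"]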
Example #8
def copy_auth(environ, top, realm=None):
    '''
    Get auth creds (HTTP basic only, for now) from the incoming request and return an
    HTTP auth handler for urllib2.  This handler allows you to "forward" this auth to
    remote services

    environ - the usual WSGI environ structure. Note: if you are using
    simple_service, this is available in Akara services as
    akara.request.environ, or it may be passed directly into the handler
    top - top URL to be used for this auth.
    '''
    #Useful: http://www.voidspace.org.uk/python/articles/authentication.shtml
    creds = extract_auth(environ)
    if creds:
        username, password = creds
    else:
        return None
    
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    # HTTPPasswordMgr top must omit any URL components before the host (i.e. no scheme and no auth info in the authority section)
    #(scheme, authority, path, query, fragment) = split_uri_ref(top)
    #auth, host, port = split_authority(authority)
    #auth_top_url = (host + ':' + port if port else host) + path
    #print >> sys.stderr, 'Auth creds: %s:%s (%s)'%(username, password, auth_top_url)
    logger.debug('Auth creds: %s:%s (%s)'%(username, password, top))
    
    # Not setting the realm for now, so use None
    #password_mgr.add_password(None, auth_top_url, username, password)
    password_mgr.add_password(None, top, username, password)
    #password_handler = urllib2.HTTPDigestAuthHandler(password_mgr)
    password_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    return password_handler
def kentucky_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail
    """
    data = {}
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    relation_field = "sourceResource/relation"
    if exists(data, relation_field):
        url = getprop(data, relation_field)
    else:
        logger.debug("Field %s does not exist" % relation_field)
        return body

    base_url, ext = os.path.splitext(url)
    data["object"] = "%s_tb%s" % (base_url, ext)

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def add_ejson_profile(data, fixup_obj_labels=True):
    objkeys = dict([ (k, k) for obj in data for k in obj ])
    #FIXME: reduce from 3 full passes through obj to 2 (don't think we can go lower than 2)
    for k in objkeys.keys():  # snapshot keys; entries are deleted below
        kcount = reduce(lambda count, obj, k=k: count + int(k in obj), data, 0)
        logger.debug("Key usage count %s: %i" % (k, kcount))
        if not kcount:
            del objkeys[k]
    logger.debug("Modified data profile keys: " + repr(objkeys))
    if fixup_obj_labels:
        for obj in data:
            for k in obj.keys():  # snapshot keys; obj is mutated below
                #Yes we could receive non-string "labels"
                if not isinstance(k, basestring):
                    k = str(k)
                new_k = UNSUPPORTED_IN_EXHIBITKEY.sub('_', k)
                if not new_k or new_k[0].isdigit():
                    new_k = '_' + new_k
                if k != new_k:
                    objkeys[new_k] = k
                    try:
                        del objkeys[k]
                    except KeyError:
                        pass
                    obj[new_k] = obj[k]
                    del obj[k]
    #print >> sys.stderr, objkeys

    return {"properties": [
                {"property": k, "enabled": (k not in ("id", "label")), "label": v, "types": ["text"]} for k, v in objkeys.iteritems()
            ]}
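A hypothetical input showing the label fixup; UNSUPPORTED_IN_EXHIBITKEY is assumed to match characters Exhibit cannot accept in a property key:

data = [{"id": 1, "label": "a", "bad key!": "x"}]
profile = add_ejson_profile(data)
# data[0] now carries the value under a cleaned key such as "bad_key_", and
# profile["properties"] lists each key with a default "text" type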
Example #11
def pubmed_adapter(search=None, id=None):
    '''
    Sample queries:
    #curl "http://localhost:8880/pubmed?"
    curl "http://localhost:8880/pubmed?search=stem+cells"
    curl "http://localhost:8880/pubmed?id=19358275"
    '''
    #FIXME: How do we handle no search or id param?  Just serve up the latest entries?  Or error as below?
    #assert_(not(search and id), msg="You must specify the 'search' or 'id' query parameter is mandatory.")
    if search:
        #search = first_item(search)
        #reldate: only search for last N days
        #query = urllib.urlencode({'db' : NCBI_DB, 'term': query, 'reldate': '60', 'datetype': 'edat', 'retmax': DEFAULT_MAX_RESULTS, 'usehistory': 'y'})
        query = urllib.urlencode({'term': search, 'db' : NCBI_DB, 'datetype': 'edat', 'retmax': DEFAULT_MAX_RESULTS, 'usehistory': 'y'})
        search_url = NCBI_SEARCH_PATTERN + query
        logger.debug("Term search URL: " + search_url)
        doc = bindery.parse(search_url, standalone=True)
        search_terms = search
        ids = ( unicode(i) for i in doc.eSearchResult.IdList.Id )
        ids = ','.join(ids)
        self_link = '/pubmed?search='+search
    else:
        ids = first_item(id)
        #fulltext = fulltext[0] if fulltext else u'no'
        #if fulltext == 'yes':
        search_terms = ids
        self_link = '/pubmed?id='+ids
    query = urllib.urlencode({'db' : NCBI_DB, 'id': ids, 'retmode': 'xml'})
    search_url = NCBI_ARTICLE_ACCESS_PATTERN + query
    logger.debug("ID search URL: " + search_url)
    alt_link = search_url
    doc = bindery.parse(search_url, standalone=True, model=PUBMED_MODEL)
    #doc = bindery.parse(open('/Users/uche/tmp/efetch.fcgi.html'), standalone=True, model=PUBMED_MODEL)
    metadata, first_id = metadata_dict(generate_metadata(doc))
    return atom_results(doc, metadata, self_link, alt_link, search_terms)
Example #12
File: z.py Project: dpla/zen
def post_resource(environ, start_response):
    '''
    Create a new record with a resource type
    '''
    slaveinfo, space_tag = setup_request(environ)

    temp_fpath = read_http_body_to_temp(environ, start_response)
    body = open(temp_fpath, "r").read()

    resource_type = slaveinfo.resource_factory()
    imt = environ['CONTENT_TYPE'].split(';')[0]
    lang = environ.get('CONTENT_LANGUAGE')

    handler = resource_type.run_rulesheet(environ, environ['REQUEST_METHOD'], imt, lang)

    new_path, content = handler(resource_type, body)

    logger.debug('rulesheet transform output & new uri path (post_resource): ' + repr((content[:100], new_path)))

    #Comes back as Unicode, but we need to feed it to slave as encoded byte string
    content = content.encode('utf-8')
    environ['wsgi.input'] = cStringIO.StringIO(content)
    environ['CONTENT_LENGTH'] = str(len(content))  # WSGI environ values must be strings

    response = slaveinfo.create_resource(new_path)

    if not slaveinfo.resp_status.startswith('2'):
        start_response(status_response(slaveinfo.resp_status), slaveinfo.resp_headers)
        return ["Unable to create resource\n"]

    start_response(slaveinfo.resp_status, slaveinfo.resp_headers)
    return response
Example #13
def tocouch(**params):
    '''
    @xslt - URL to the XSLT transform to be applied
    all other query parameters are passed to the XSLT processor as top-level params
    
    Sample request:
    curl --request POST --data-binary "@foo.xml" --header "Content-Type: application/xml" "http://*****:*****@xslt=http://hg.akara.info/amara/trunk/raw-file/tip/demo/data/identity.xslt"
    
    You can check after the fact by visiting http://sforza.ogbuji.net:5984/test1/_all_docs
    
    Then get the id and surf there
    
    http://sforza.ogbuji.net:5984/test1/b10d978ced600227e663d6503b1abec4
    
    or just explore it in Futon
    
    http://sforza.ogbuji.net:5984/_utils/database.html?test1
    '''
    logger.debug('params: ' + repr(params))
    title = params['t'].decode('UTF-8')
    url = params['url'].decode('UTF-8')
    tags = params['tags'].decode('UTF-8').split(u',')
    desc = params.get('d', u'').decode('UTF-8')
    body = json.dumps({'title': title, 'url': url, 'tags': tags, 'desc': desc}, indent=4)
    headers = {}
    resp, content = H.request(COUCHBASE, 'POST', body, headers=headers)
    return '<div>Couch updated?</div><pre>%s</pre>'%body

#{'ld': '2', 'd': 'I gave a number of talks this spring on jQuery and especially on some of the recent additions made in jQuery 1.4.', 'tlt': '2', 'url': 'http://ejohn.org/', 'blt': '1', 'tt': 'totag', 'nd': '1', 'bt': 'via for to unread', 'tl': '6', 'u': 'uche', 'user': '******', 'ned': '1', 'bld': '1', 'net': '1', 'bbt': '1', 'dt': 'todescribe', 't': 'John Resig - JavaScript Programmer'}
Example #14
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri: continue # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://" 
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue

        body = cont

    return error, body
Example #15
def akara_cache_proxy(url=None):
    '''
    Sample request:
    curl -I "http://localhost:8880/akara.cache-proxy?url=http://poemtree.com/poems/UsefulAdvice.htm"
    '''
    logger.debug('remote URL {0}: '.format(repr(url)))
    if not url:
        raise ValueError('url query parameter required')
    resp, content = H.request(url)

    if OVERRIDE_STALE:
        response.add_header(*MAXAGE_HEADER(get_max_age(url)))
    else:
        (fresh, lifetime) = is_fresh(resp)
        if fresh:
            response.add_header(*MAXAGE_HEADER( max(get_max_age(url),lifetime) ))
        else:
            response.add_header(*MAXAGE_HEADER(0))
    
    logger.debug('remote response headers {0}: '.format(repr(resp)))
    #Oof. What about 'transfer-encoding' and other such headers
    for k in resp:
        if k not in ('server','status', 'transfer-encoding', 'content-length','cache-control','expires','date'):
            response.add_header(normalize_http_header_name(k), resp[k])
    #response.add_header(k, resp[k])
    #FIXME: This might distort return encoding, which would of course throw off content length & encoding.  Workaround for now is removal of e.g. transfer-encoding (above)

    return content
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = "akara.ini"
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id
Example #17
def replace_regex(body, ctype, prop=None, regex=None, new=None):
    """Replaces a regex in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    regex -- the regex to replace
    new -- the substring to replace the regex with
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not regex:
        logger.error("No regex parameter supplied")
    else:
        if not new:
            logger.debug("NO New parameter, will replace with empty string")
            new = ''
        if exists(data, prop):
            v = getprop(data, prop)
            new_val = replace_regex_recurse_field(v, regex, new)
            setprop(data, prop, new_val)

    return json.dumps(data)
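A hypothetical call stripping trailing periods from a title; replace_regex_recurse_field is assumed to apply the substitution to strings nested in lists or dicts:

body = json.dumps({"sourceResource": {"title": "A title."}})
out = json.loads(replace_regex(body, "application/json",
                               prop="sourceResource/title",
                               regex="\.$", new=""))
# getprop(out, "sourceResource/title") == "A title"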
Example #18
def akara_cache_proxy(url=None):
    '''
    Sample request:
    curl -I "http://localhost:8880/akara.cache-proxy?url=http://poemtree.com/poems/UsefulAdvice.htm"
    '''
    logger.debug('remote URL {0}: '.format(repr(url)))
    if not url:
        raise ValueError('url query parameter required')
    resp, content = H.request(url)

    if OVERRIDE_STALE:
        response.add_header(*MAXAGE_HEADER(get_max_age(url)))
    else:
        (fresh, lifetime) = is_fresh(resp)
        if fresh:
            response.add_header(
                *MAXAGE_HEADER(max(get_max_age(url), lifetime)))
        else:
            response.add_header(*MAXAGE_HEADER(0))

    logger.debug('remote response headers {0}: '.format(repr(resp)))
    #Oof. What about 'transfer-encoding' and other such headers
    for k in resp:
        if k not in ('server', 'status', 'transfer-encoding', 'content-length',
                     'cache-control', 'expires', 'date'):
            response.add_header(normalize_http_header_name(k), resp[k])
    #response.add_header(k, resp[k])
    #FIXME: This might distort return encoding, which would of course throw off content length & encoding.  Workaround for now is removal of e.g. transfer-encoding (above)

    return content
def contentdm(collection='all', query=None, site=DEFAULT_SITE, limit=None):
    '''
    Search all collections in Louisville:

    curl "http://localhost:8880/contentdm.json?query=crutches&site=http://digital.library.louisville.edu/cdm4/&limit=100"

    Search just /jthom collection in Louisville:

    curl "http://localhost:8880/contentdm.json?collection=/jthom&query=crutches&site=http://digital.library.louisville.edu/cdm4/&limit=100"

    Search all collections in U Miami:

    curl "http://localhost:8880/contentdm.json?query=crutches&site=http://doyle.lib.muohio.edu/cdm4/&limit=100"
    '''
    limit = int(limit) if limit else None
    results = read_contentdm(site, collection=collection, query=query, limit=limit, logger=logger, proxy=CACHE_PROXY_SERVICE)
    header = results.next()
    url = header['basequeryurl']
    count = 0
    logger.debug("Start URL: {0}, Limit: {1}".format(repr(url), limit))
    entries = list(results)
    logger.debug("Result count: {0}".format(len(entries)))
    properties = profile_properties(entries)
    #logger.debug("DEFAULT_PROPERTIES: {0}".format(DEFAULT_PROPERTIES))
    for prop in properties:
        if prop[u"property"] in DEFAULT_PROPERTIES:
            prop[u"tags"] = DEFAULT_PROPERTIES[prop[u"property"]][u"tags"]
    #checkmem()
    return json.dumps({'items': entries, 'data_profile': {"properties": properties}}, indent=4)
Example #20
File: moin.py Project: mredar/akara
 def factory(rest_uri, moin_link=None, opener=None):
     opener = opener or urllib2.build_opener()
     logger.debug('rest_uri: ' + rest_uri)
     req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
     resp = opener.open(req)
     doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
     original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
     #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
     #amara.xml_print(self.content_cache)
     metadata, first_id = metadata_dict(generate_metadata(doc))
     metadata = metadata[first_id]
     akara_type = U(metadata[u'ak-type'])
     logger.debug('Type: ' + akara_type)
     try:
         #Older Moin CMS resource types are implemented by registration to the global node.NODES
         cls = node.NODES[akara_type]
     except KeyError:
         #Newer Moin CMS resource types are implemented by discovery of a URL,
         #to which a POST request executes the desired action
         return node.ENDPOINTS and (rest_uri, akara_type,
                                    node.ENDPOINTS[akara_type], doc,
                                    metadata, original_wiki_base)
     else:
         instance = cls(rest_uri,
                        moin_link,
                        opener,
                        cache=(doc, metadata, original_wiki_base))
         return instance
Example #21
def rss2translate(url=None, format=None):
    """Convert RSS 2.0 feed to Atom or RSS 1.0
    
    Sample request:
    * curl "http://localhost:8880/akara.rss2translate?url=http://feeds.delicious.com/v2/rss/recent"

    This is a demo and is not meant as an industrial-strength converter.
    """
    # Support content negotiation in addition to the query parameter
    if not format:
        accepted_imts = request.environ.get('HTTP_ACCEPT', '').split(',')
        imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
        if imt == RDF_IMT:
            format = 'rss1'
        else:
            format = 'atom'
    
    if not url:
        raise AssertionError("The 'url' query parameter is mandatory.")

    import feedparser # From http://www.feedparser.org/
    feed = feedparser.parse(url)
    
    # Note: bad URLs might mean the feed doesn't have headers
    logger.debug('Feed info: ' + repr((url, feed.version, feed.encoding, feed.headers.get('Content-type'))))

    updated = getattr(feed.feed, 'updated_parsed', None)
    if updated:
        #FIXME: Double-check this conversion
        updated = datetime(*updated[:7]).isoformat()
    
    f = atomtools.feed(title=feed.feed.title, updated=updated, id=feed.feed.link)
    for e in feed.entries:
        updated = getattr(e, 'updated_parsed', None)
        if updated:
            #FIXME: Double-check this conversion
            updated = datetime(*updated[:7]).isoformat()
        links = [
            #FIXME: self?
            (e.link, u'alternate'),
        ]
        f.append(
            e.link,
            e.title,
            updated = updated,
            summary=e.description,
            #e.author_detail.name
            #authors=authors,
            links=links,
        )

    if format == 'atom':
        result = f.xml_encode()
        response.add_header("Content-Type", ATOM_IMT)
    else:
        result = f.rss1format()
        response.add_header("Content-Type", RDF_IMT)
    return result
Example #22
def texas_enrich_location(body,
                          ctype,
                          action="texas_enrich_location",
                          prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document.

    For use with the texas profile
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    def _get_coordinates(value):
        lat, lon = None, None
        for v in value.split(";"):
            if "north=" in v:
                lat = v.split("=")[-1]
            elif "east=" in v:
                lon = v.split("=")[-1]

        if lat and lon:
            return (lat, lon)
        else:
            return ()

    if exists(data, prop):
        spatial = []
        values = getprop(data, prop)

        for v in values:
            sp = {"name": v}
            shredded = [s.strip() for s in v.split(" - ")]

            coordinates = _get_coordinates(sp["name"])
            if coordinates:
                sp["name"] = "%s, %s" % coordinates

            if len(shredded) < 5:
                if not re.search("\d", sp["name"]):
                    sp["country"] = shredded[0]
                if "country" in sp:
                    if sp["country"] in ["United States", "Canada"]:
                        try:
                            sp["state"] = shredded[1]
                            sp["county"] = shredded[2]
                            sp["city"] = shredded[3]
                        except Exception, e:
                            logger.debug("Error enriching location %s: %s" %
                                         (data["_id"], e))
            spatial.append(sp)
        logger.debug("SPATIAL: %s" % spatial)
        setprop(data, prop, spatial)

    return json.dumps(data)
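A hypothetical input, assuming the hyphen-delimited spatial strings this profile receives:

body = json.dumps({"_id": "x", "sourceResource": {
    "spatial": ["United States - Texas - Travis County - Austin"]}})
out = json.loads(texas_enrich_location(body, "application/json"))
# each spatial entry is expanded with name, country, state, county and city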
Example #23
 def run(self):
     self._sock.setblocking(1)
     logger.debug("Start request from address %r, local socket %r" %
                  (self._addr, self._sock.getsockname()))
     handler = AkaraWSGIDispatcher(self.settings, self.config)
     self.handler = AkaraWSGIHandler(self._sock, self._addr, handler)
     logger.debug("End request from address %r, local socket %r" %
                  (self._addr, self._sock.getsockname()))
     self._sock.close()
Example #24
def akara_echo_body(body, ctype, log=u'no'):
    '''
    Sample request:
    curl --request POST --data-binary "@foo.dat" --header "Content-type: text/plain" "http://localhost:8880/akara.echo"
    '''
    if log == u'yes':
        from akara import logger
        logger.debug('akara_echo_body: ' + body)
    return body
def texas_enrich_location(body, ctype, action="texas_enrich_location",
                          prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document.

    For use with the texas profile
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"


    def _get_coordinates(value):
        lat, lon = None, None
        for v in value.split(";"):
            if "north=" in v:
                lat = v.split("=")[-1]
            elif "east=" in v:
                lon = v.split("=")[-1]

        if lat and lon:
            return (lat, lon)
        else:
            return ()

    if exists(data, prop):
        spatial = []
        values = getprop(data,prop)

        for v in values:
            sp = {"name": v}
            shredded = [s.strip() for s in v.split(" - ")]

            coordinates = _get_coordinates(sp["name"]) 
            if coordinates:
                sp["name"] = "%s, %s" % coordinates

            if len(shredded) < 5:
                if not re.search("\d", sp["name"]):
                    sp["country"] = shredded[0]
                if "country" in sp:
                    if sp["country"] in ["United States", "Canada"]:
                        try:
                            sp["state"] = shredded[1]
                            sp["county"] = shredded[2]
                            sp["city"] = shredded[3]
                        except Exception, e:
                            logger.debug("Error enriching location %s: %s" %
                                         (data["_id"], e))
            spatial.append(sp)
        logger.debug("SPATIAL: %s" % spatial)
        setprop(data, prop, spatial)

    return json.dumps(data)
Example #26
File: couchdb.py Project: dpla/zen
 def zen_type(space, data):
     '''
     Compute a Zen type as a full moinrest URI, as well as a path relative to the top of the wiki instance
     '''
     rtype = data['zen:metadata']['zen:type']
     if logger: logger.debug('zen_type link: ' + repr(rtype))
     tpath, tid = rtype, absolutize(rtype, space.remotedb)
     if logger: logger.debug('Retrieved zen_type: ' + repr((tid, tpath)))
     return (tid, tpath)
Example #27
 def run(self):
     self._sock.setblocking(1)
     logger.debug("Start request from address %r, local socket %r" %
                  (self._addr, self._sock.getsockname()))
     handler = AkaraWSGIDispatcher(self.settings, self.config)
     self.handler = AkaraWSGIHandler(self._sock, self._addr, handler)
     logger.debug("End request from address %r, local socket %r" %
                  (self._addr, self._sock.getsockname()))
     self._sock.close()
def download_preview(body, ctype):
    """
    Responsible for: downloading a preview for a document
    Usage: as a module in a separate pipeline, to be run on existing
    documents in the repository to download the thumbnails.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check the "admin/object_status" field
    status = None
    try:
        status = getprop(data, "admin/object_status")
        if status in ["error", "downloaded"]:
            logger.debug("Status is %s, doing nothing" % status)
            return body
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Thumbnail URL
    url = None
    try:
        url = getprop(data, "object/@id")
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Document ID
    id = None
    try:
        id = getprop(data, "id")
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    download = False
    if status == "pending":
        download = True

    (relative_fname, mime, status) = download_image(url, id, download)

    if not relative_fname:
        logger.error("Cannot save thumbnail from: %s." % (url))

    # so everything is OK and the file is on disk
    doc = update_document(data, relative_fname, mime, status)
    return json.dumps(doc)
Example #29
def unset_prop(body, ctype, prop=None, condition=None, condition_prop=None):
    """Unsets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to unset
    condition -- the condition to be met (uses prop by default) 
    condition_prop -- the prop(s) to use in the condition (comma-separated if
                      multiple props)
    
    """

    CONDITIONS = {
        "is_digit": lambda v: v[0].isdigit(),
        "mwdl_exclude": lambda v: (v[0] == "collections" or
                                   v[0] == "findingAids"),
        "hathi_exclude": lambda v: "Minnesota Digital Library" in v,
        "finding_aid_title": lambda v: v[0].startswith("Finding Aid"),
        "usc_no_contributor": lambda v: not v[0].get("contributor", False)
    }

    def condition_met(condition_prop, condition):
        values = []
        props = condition_prop.split(",")
        for p in props:
            iterified = iterify(getprop(data, p, True))
            values.extend(iterified)

        return CONDITIONS[condition](values)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check if prop exists to avoid key error
    if exists(data, prop):
        if not condition:
            delprop(data, prop)
        else:
            if not condition_prop:
                condition_prop = prop
            try:
                if condition_met(condition_prop, condition):
                    logger.debug("Unsetting prop %s for doc with id %s" % 
                                 (prop, data["_id"]))
                    delprop(data, prop)
            except KeyError:
                logger.error("CONDITIONS does not contain %s" % condition)
                

    return json.dumps(data)
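A hypothetical call dropping a title that matches one of the named conditions:

body = json.dumps({"_id": "x",
                   "sourceResource": {"title": ["Finding Aid: Papers"]}})
out = unset_prop(body, "application/json",
                 prop="sourceResource/title",
                 condition="finding_aid_title")
# json.loads(out)["sourceResource"] no longer contains "title"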
def dataprovider_transform(d, p):
    dataprovider = []
    for s in getprop(d, p):
        if "partner" in s:
            term = s.split(":")[-1]
            try:
                dataprovider.append(DATAPROVIDER_TERM_LABEL[term])
            except:
                logger.debug("TERM %s does not exist %s" % (term, d["_id"]))

    return {"dataProvider": dataprovider} if dataprovider else {}
def unset_prop(body, ctype, prop=None, condition=None, condition_prop=None):
    """Unsets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to unset
    condition -- the condition to be met (uses prop by default) 
    condition_prop -- the prop(s) to use in the condition (comma-separated if
                      multiple props)
    
    """

    CONDITIONS = {
        "is_digit": lambda v: v[0].isdigit(),
        "mwdl_exclude": lambda v:
        (v[0] == "collections" or v[0] == "findingAids"),
        "hathi_exclude": lambda v: "Minnesota Digital Library" in v,
        "finding_aid_title": lambda v: v[0].startswith("Finding Aid"),
        "usc_no_contributor": lambda v: not v[0].get("contributor", False)
    }

    def condition_met(condition_prop, condition):
        values = []
        props = condition_prop.split(",")
        for p in props:
            iterified = iterify(getprop(data, p, True))
            values.extend(iterified)

        return CONDITIONS[condition](values)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check if prop exists to avoid key error
    if exists(data, prop):
        if not condition:
            delprop(data, prop)
        else:
            if not condition_prop:
                condition_prop = prop
            try:
                if condition_met(condition_prop, condition):
                    logger.debug("Unsetting prop %s for doc with id %s" %
                                 (prop, data["_id"]))
                    delprop(data, prop)
            except KeyError:
                logger.error("CONDITIONS does not contain %s" % condition)

    return json.dumps(data)
Example #32
 def register_service(self, ident, path, handler, doc=None, query_template=None):
     if "/" in path:
         raise ValueError("Registered path %r may not contain a '/'" % (path,))
     if doc is None:
         doc = inspect.getdoc(handler) or ""
     if ident in self._registered_services:
         logger.warn("Replacing mount point %r (%r)" % (path, ident))
     else:
         logger.debug("Created new mount point %r (%r)" % (path, ident))
     serv = Service(handler, path, ident, doc, query_template)
     self._registered_services[path] = serv
Example #33
File: geonames.py Project: dpla/zen
 def url(self):
     domain = "ws.geonames.org"
     user = ''
     if self.user:
         domain = "ba-ws.geonames.net"
         user = '******' % self.user
         logger.debug('Using Commercial GeoNames service (ba-ws.geonames.org). Username: ' + self.user)
     append_formats = {'json': 'JSON'}
     resource = "search" + append_formats.get(output_format, '')
     return "http://%(domain)s/%(resource)s?%(user)s%%s" % locals()
Example #34
def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(
        multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
Example #35
def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))    

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: "+request.environ["HTTP_CONTRIBUTOR"]+"---"+repr(e))

    # Strip out keys with None/null values?
    out = dict((k,v) for (k,v) in out.items() if v)

    return json.dumps(out)
Example #36
    def search(self, term):
        qstr = urllib.urlencode({'verb' : 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
        url = DSPACE_OAI_ENDPOINT + '?' + qstr
        logger.debug('DSpace URL: ' + str(url))
        #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]

        doc = bindery.parse(url, model=OAI_MODEL)
        #print >> sys.stderr, list(generate_metadata(doc))
        resources, first_id = metadata_dict(generate_metadata(doc), nesteddict=False)
        record = doc.OAI_PMH

        resource = resources[first_id]
Example #37
def arctodpla(body, ctype, geoprop=None):
    """   
    Convert output of JSON-ified ARC (NARA) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in data.keys():
        if p in CHO_TRANSFORMER:
            out["sourceResource"].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Apply transformations that are dependent on more than one
    # original document  field
    out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(rights_transform(data))
    out["sourceResource"].update(subject_and_spatial_transform(data))
    out.update(has_view_transform(data))
    out["sourceResource"].update(transform_state_located_in(data))

    if exists(out, "sourceResource/date"):
        logger.debug("OUTTYPE: %s" % getprop(out, "sourceResource/date"))

    if exists(data, "objects/object"):
        out.update(transform_thumbnail(data))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
Example #38
    def search(self, term):
        qstr = urllib.urlencode({'verb' : 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
        url = DSPACE_OAI_ENDPOINT + '?' + qstr
        logger.debug('DSpace URL: ' + str(url))
        #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]

        doc = bindery.parse(url, model=OAI_MODEL)
        #print >> sys.stderr, list(generate_metadata(doc))
        resources, first_id = metadata_dict(generate_metadata(doc), nesteddict=False)
        record = doc.OAI_PMH

        resource = resources[first_id]
Example #39
def pipe(content,ctype,enrichments,wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if len(uri) < 1: continue # in case there's no pipeline
        headers = copy_headers_to_dict(request.environ,exclude=[wsgi_header])
        headers['content-type'] = ctype
        resp, cont = H.request(uri,'POST',body=body,headers=headers)
        if not str(resp.status).startswith('2'):
            logger.debug("Error in enrichment pipeline at %s: %s"%(uri,repr(resp)))
            continue

        body = cont
    return body
Example #40
def listrecords(limit=100):
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, '_design', VIEW_APP, '_view', VIEW_NAME)
    url += '?limit=' + str(limit)
    logger.debug(url)
    resp, content = h.request(url, "GET", headers=COUCH_AUTH_HEADER)
    logger.debug("Content: " + content)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't get documents via: " + repr(resp))
Example #41
def listrecords(limit=100):
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, '_design', VIEW_APP, '_view', VIEW_NAME)
    url += '?limit=' + str(limit)
    logger.debug(url)
    resp, content = h.request(url, "GET", headers=COUCH_AUTH_HEADER)
    logger.debug("Content: " + content)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't get documents via: " + repr(resp))
Example #42
def edantodpla(body, ctype, geoprop=None):
    """   
    Convert output of JSON-ified EDAN (Smithsonian) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for k, v in CHO_TRANSFORMER.items():
        if exists(data, k):
            out["sourceResource"].update(v(data))
    for k, v in AGGREGATION_TRANSFORMER.items():
        if exists(data, k):
            out.update(v(data))

    # Apply transformations that are dependent on more than one
    # original document  field
    #out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(transform_rights(data))
    out["sourceResource"].update(transform_subject(data))
    out["sourceResource"].update(transform_spatial(data))

    out.update(transform_is_shown_at(data))
    out.update(transform_object(data))
    out.update(transform_data_provider(data))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def subject_and_spatial_transform(d, p):
    val = {}
    val["subject"] = []
    val["spatial"] = []

    v = getprop(d, p)
    for s in (iterify(v)):
        subject = []
        if "name" in s:
            subject.append(name_from_name_part(getprop(s, "name/namePart")))

        if "topic" in s:
            for t in (s["topic"] if isinstance(s["topic"], list) else
                      [s["topic"]]):
                if t not in subject:
                    subject.append(t)

        if "geographic" in s:
            for g in iterify(s["geographic"]):
                if g not in subject:
                    subject.append(g)
                if g not in val["spatial"]:
                    val["spatial"].append(g)

        if "hierarchicalGeographic" in s:
            for h in iterify(s["hierarchicalGeographic"]):
                if isinstance(h, dict):
                    for k in h.keys():
                        if k not in ["city", "county", "state", "country",
                                     "coordinates"]:
                            del h[k]
                    if h not in val["spatial"]:
                        val["spatial"].append(h)
                    if "country" in h:
                        subject.append(h["country"])

        coords = getprop(s, "cartographics/coordinates", True)
        if coords and coords not in val["spatial"]:
            val["spatial"].append(coords)

        if "temporal" in s:
            logger.debug("TEMPORAL: %s" % s["temporal"])

        val["subject"].append("--".join(subject))

    if not val["subject"]:
        del val["subject"]
    if not val["spatial"]:
        del val["spatial"]

    return val
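A hypothetical MODS-style subject entry and its transform:

doc = {"subject": [{"topic": "Maps", "geographic": "Texas",
                    "hierarchicalGeographic": {"state": "Texas",
                                               "extra": "dropped"}}]}
val = subject_and_spatial_transform(doc, "subject")
# val["subject"] == ["Maps--Texas"]
# val["spatial"] == ["Texas", {"state": "Texas"}] -- "extra" is stripped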
def scraper_json(url=None):
    '''
    End-point for bookmarklet that scrapes a site for RDFa then using Calais

    Sample request:
    * curl "http://localhost:8880/z.scraper.json?url=http://zepheira.com"
    '''
    for s in SCRAPER_SERVICES:
        logger.debug("Not found: " + place)
        #print >> sys.stderr, 'Trying:', s%{'url': url[0]}
        #result = urllib.urlopen(s%{'url': url[0]}).read()
        result = urllib.urlopen(s + url[0]).read()
        if result:
            return result
    return '{}'
Example #45
File: couchdb.py Project: dpla/zen
    def __init__(self, space, docid, data, rtype=None):
        '''
        '''
        self.docid = docid
        self.space = space
        self.slave_uri = join(space.remotedb, docid)
        self.data = data
        self.rulesheet = None

        if logger: logger.debug('GRIPPO: ' + repr(rtype))
        if isinstance(rtype, basestring) and rtype != RESOURCE_TYPE_TYPE:
            self.type = space.resource_factory(rtype)
        else:
            self.type = rtype
        return
Example #46
def find_conversion_dictionary(mapping_key):
    """Finds the dictionary with values to use for conversion.

    Args:
        mapping_key (Str): Name of conversion key read from Akara.conf

    Returns:
        Dictionary used for converting values.
    """
    # Mapping should be in akara.conf
    mapping = module_config().get('lookup_mapping')
    logger.debug("Looking for mapping using key [%s]" % mapping_key)
    dict_name = mapping[mapping_key].upper()
    logger.debug("Found substitution dict [%s] for key mapping [%s]" % (dict_name, mapping_key,))
    return globals()[dict_name]
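A sketch of the configuration this relies on; the key and dictionary names here are assumptions:

# akara.conf:   lookup_mapping = {'dc_type': 'DC_TYPE_DICT'}
# module level: DC_TYPE_DICT = {'image': 'Image', 'text': 'Text'}
conversion_dict = find_conversion_dictionary('dc_type')
# conversion_dict is DC_TYPE_DICT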
def scraper_json(url=None):
    '''
    End-point for bookmarklet that scrapes a site for RDFa then using Calais

    Sample request:
    * curl "http://localhost:8880/z.scraper.json?url=http://zepheira.com"
    '''
    for s in SCRAPER_SERVICES:
        logger.debug("Not found: " + place)
        #print >> sys.stderr, 'Trying:', s%{'url': url[0]}
        #result = urllib.urlopen(s%{'url': url[0]}).read()
        result = urllib.urlopen(s + url[0]).read()
        if result:
            return result
    return '{}'
Example #48
    def map_data_provider(self):
        prop = "originalRecord/header/setSpec"

        if exists(self.provider_data, prop):
            dataprovider = []
            for s in iterify(getprop(self.provider_data, prop)):
                if "partner" in s:
                    term = s.split(":")[-1]
                    try:
                        dataprovider.append(self.dataprovider_term_label[term])
                    except:
                        logger.debug("Term %s does not exist in " % term +
                                     "self.dataprovider_term_label for %s" %
                                     self.provider_data["_id"])

            if dataprovider:
                self.mapped_data.update({"dataProvider": dataprovider})
Example #49
 def register_service(self,
                      ident,
                      path,
                      handler,
                      doc=None,
                      query_template=None):
     if "/" in path:
         raise ValueError("Registered path %r may not contain a '/'" %
                          (path, ))
     if doc is None:
         doc = inspect.getdoc(handler) or ""
     if ident in self._registered_services:
         logger.warn("Replacing mount point %r (%r)" % (path, ident))
     else:
         logger.debug("Created new mount point %r (%r)" % (path, ident))
     serv = Service(handler, path, ident, doc, query_template)
     self._registered_services[path] = serv
Example #50
def generate_file_path(id, file_extension):
    """
    Generates and returns the file path based in provided params.

    Algorithm for generating the file path:

      The file path is generated using the following algorithm:

        -   convert all not allowed characters from the document id to "_"
        -   to the above string add number and extension getting FILE_NAME
        -   fetch id (it will already be the md5 of the _id field)
        -   convert to uppercase
        -   insert "/" between each to characters of this hash getting CALCULATED_PATH
        -   join the MAIN_PATH, CALCULATED_PATH and FILE_NAME

    Arguments:
        id             - document id from couchdb
        file_extension - extension of the file

    Returns:
        filepath       - path, without file name
        full_filepath  - path, with file name
        relative_fname - path, relative, without ROOT_PATH

    Example:
        Function call:
            generate_file_path('clemsontest--hcc001-hcc016', ".jpg")

        Generated values for the algorithm steps:

        TODO: Update doc here for the new algorithm.

        CLEARED_ID: clemsontest__hcc001_hcc016
        FILE_NAME:  clemsontest__hcc001_hcc016.jpg
        HASHED_ID:  8E393B3B5DA0E0B3A7AEBFB91FE1278A
        PATH:       8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/
        FULL_NAME:  /main_pic_dir/8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/clemsontest__hcc001_hcc016.jpg
    """

    cleared_id = id.upper()
    logger.debug("Generating filename for document with id: [%s].", id)

    fname = "%s%s" % (cleared_id, file_extension)
    logger.debug("File name:  " + fname)

    # Insert a path separator after every two characters of the hash.
    path = re.sub("(.{2})", "\\1" + os.sep, cleared_id)
    logger.debug("PATH:       " + path)

    relative_fname = os.path.join(path, fname)

    path = os.path.join(THUMBS_ROOT_PATH, path)
    full_fname = os.path.join(path, fname)
    logger.debug("FULL PATH:  " + full_fname)

    return (path, full_fname, relative_fname)
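A hedged illustration of what the current code actually produces (assuming THUMBS_ROOT_PATH is "/main_pic_dir"; per the TODO above, the file name is now derived from the uppercased id rather than the original document id):

path, full_fname, relative_fname = generate_file_path(
    "8e393b3b5da0e0b3a7aebfb91fe1278a", ".jpg")
# path           == "/main_pic_dir/8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/"
# full_fname     == path + "8E393B3B5DA0E0B3A7AEBFB91FE1278A.jpg"
# relative_fname == the same, without the "/main_pic_dir" prefix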
Example #51
0
def find_conversion_dictionary(mapping_key):
    """Finds the dictionary with values to use for conversion.

    Args:
        mapping_key (Str): Name of conversion key read from Akara.conf

    Returns:
        Dictionary used for converting values.
    """
    # Mapping should be in akara.conf
    mapping = module_config().get('lookup_mapping')
    logger.debug("Looking for mapping using key [%s]" % mapping_key)
    dict_name = mapping[mapping_key].upper()
    logger.debug("Found substitution dict [%s] for key mapping [%s]" % (
        dict_name,
        mapping_key,
    ))
    return globals()[dict_name]
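A minimal sketch of the configuration this lookup assumes; the key, dict name, and contents below are hypothetical:

# In akara.conf, under this module's config:
#     'lookup_mapping': {'iso639_3': 'iso639_3_subst'}
# And at module level, the dict that name (uppercased) resolves to:
ISO639_3_SUBST = {"eng": "English"}
# find_conversion_dictionary("iso639_3") would then return ISO639_3_SUBST.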
Example #52
0
def update_document(body, ctype):
    logger.debug(body)
    parsed_doc = json.loads(body)
    document_id = parsed_doc[u"id"]
    document = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " % (document, document_id, ) )
Example #53
0
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if not uri: continue # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            prefix += request.environ.get('HTTP_HOST') or request.environ['SERVER_NAME']
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s"%(uri,repr(resp)))
            continue

        body = cont
    return body
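A minimal sketch of driving this pipeline; note that it relies on a live request context (request.environ, H), and the enrichment paths and WSGI header name below are hypothetical:

# Each enrichment is POSTed the progressively enriched JSON body in turn.
enrichments = [
    "/enrich_date",
    "http://localhost:8880/enrich_location",
]
enriched = pipe({"sourceResource": {}}, "application/json",
                enrichments, "HTTP_PIPELINE_REST")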
Example #54
0
def oaitodpla(body, ctype, geoprop=None):
    '''
    Convert output of the Freemix OAI service into the DPLA JSON-LD format.

    Does not currently require any enrichments to run earlier in the pipeline,
    but supports geocoding if it is used. In the future, subject shredding may
    be assumed as well.

    Parameter "geoprop" specifies the property name containing lat/long coords.
    '''

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document to sourceResource
    for p in data.keys():
        if p in CHO_TRANSFORMER:
            out['sourceResource'].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Additional content not from original document

    if 'HTTP_CONTRIBUTOR' in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ['HTTP_CONTRIBUTOR']))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ['HTTP_CONTRIBUTOR'] + "---" + repr(e))

    # Strip out keys with empty or null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
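For illustration, the dispatch above expects module-level transformer tables mapping source property names to functions that return partial output dicts; the entries below are hypothetical:

CHO_TRANSFORMER = {
    "title": lambda d: {"title": d["title"]},
}
AGGREGATION_TRANSFORMER = {
    "id": lambda d: {"id": d["id"],
                     "@id": "http://dp.la/api/items/" + d["id"]},
}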
Example #55
0
def kentucky_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail
    """

    LOG_JSON_ON_ERROR = True

    def log_json():
        if LOG_JSON_ON_ERROR:
            logger.debug(body)

    data = {}
    try:
        data = json.loads(body)
    except Exception as e:
        msg = "Bad JSON: " + e.args[0]
        logger.error(msg)
        log_json()
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return msg

    relation_field = "sourceResource/relation"
    if exists(data, relation_field):
        url = getprop(data, relation_field)
    else:
        msg = "Field %s does not exist" % relation_field
        logger.debug(msg)
        return body

    # Derive the thumbnail URL, e.g. ".../image123.jpg" -> ".../image123_tb.jpg"
    base_url, ext = os.path.splitext(url)
    data["object"] = "%s_tb%s" % (base_url, ext)

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Example #56
0
    def _twofishes_data(self, url):
        """Return a dict of Twofishes data for the given URL.

        Relies on the response being Unicode-encoded JSON.
        """
        logger.debug("GET %s" % url)
        try:
            response = urlopen(url, None, 2)
            http_status = response.getcode()
            if http_status != 200:
                logger.error("Got status %d from %s" % (http_status, url))
                return {}
            return json.loads(response.read())
        except URLError as e:
            logger.error("Could not open %s (%s)" % (url, e))
            return {}
        except Exception as e:
            logger.error("Unexpected exception from %s: %e" % (url, e))
            return {}
Example #57
0
def is_fresh(resp):
    """
    Returns a tuple, the first element a boolean whether the response can be
    considered (for our purposes) fresh or not, and the second the freshness
    lifetime of the response.

    Much of this is reworked from httplib2._entry_disposition. We can't reuse it
    directly since it assumes responses are stale unless otherwise marked as
    fresh, and we want to do the opposite.
    """
    fresh = True
    freshness_lifetime = 0

    cc_response = httplib2._parse_cache_control(resp)
    if 'no-cache' in cc_response or 'private' in cc_response:
        fresh = False
    elif 'date' in resp:
        date = calendar.timegm(email.Utils.parsedate_tz(resp['date']))
        now = time.time()
        # Give us 5 seconds to get this far
        current_age = max(0, now - date - 5)
        if 'max-age' in cc_response:
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0

        elif 'expires' in resp:
            expires = email.Utils.parsedate_tz(resp['expires'])
            if expires is None:
                freshness_lifetime = 0
            else:
                freshness_lifetime = calendar.timegm(expires) - date
        else:
            freshness_lifetime = 0

        if freshness_lifetime < current_age:
            logger.debug(
                'lifetime = {0}, age = {1}, so marking explicitly stale'.
                format(freshness_lifetime, current_age))
            fresh = False

    return fresh, freshness_lifetime
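A minimal usage sketch; httplib2 response objects behave like dicts of headers, so a plain dict (with illustrative header values) works here:

resp = {
    'date': 'Mon, 01 Jan 2024 00:00:00 GMT',
    'cache-control': 'max-age=3600',
}
fresh, lifetime = is_fresh(resp)
# fresh is False once the response's age exceeds its max-age lifetime.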
Example #58
0
def david_rumsey_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail
    """

    LOG_JSON_ON_ERROR = True

    def log_json():
        if LOG_JSON_ON_ERROR:
            logger.debug(body)

    data = {}
    try:
        data = json.loads(body)
    except Exception as e:
        msg = "Bad JSON: " + e.args[0]
        logger.error(msg)
        log_json()
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return msg

    handle_field = "originalRecord/handle"
    if exists(data, handle_field):
        handle = getprop(data, handle_field)
    else:
        msg = "Field %s does not exist" % handle_field
        logger.debug(msg)
        return body

    data["object"] = handle[1]

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)