Code example #1
File: moinrest.py  Project: pombredanne/akara
def _put_page(environ, start_response):
    '''
    '''
    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError,e:
        raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
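
Every example on this page leans on the copy_headers_to_dict helper, whose definition is not included in the snippets. Judging from the call sites (a WSGI environ in, a plain header dict out, with an optional exclude list of environ keys such as 'HTTP_ACCEPT_ENCODING'), it behaves roughly like the following minimal sketch; the real helper in akara/ingestion may differ in detail:

def copy_headers_to_dict(environ, exclude=None):
    '''Collect WSGI HTTP_* entries into a plain header dict (sketch only).

    HTTP_PIPELINE_ITEM becomes "Pipeline-Item", HTTP_SOURCE becomes
    "Source", and any environ key named in exclude is skipped.
    '''
    exclude = exclude or []
    headers = {}
    for key, value in environ.items():
        if key.startswith('HTTP_') and key not in exclude:
            name = '-'.join(p.capitalize() for p in key[5:].split('_'))
            headers[name] = value
    return headers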
Code example #2
File: moinrest.py  Project: pombredanne/akara
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support
    the deletion of attachments.

    '''
    #The Moin form asks that this be in multipart-form format, but the multipart handler
    #falls back to url-encoding unless you pass it a file.  Luckily, the equivalent
    #url-encoded request works... for now.
    
    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    form_vars = fill_page_delete_form(page, wiki_id, base, opener, req_headers)

    url = absolutize(page, base)

    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError,e:
        
        if e.code == 404:
            # Moin returns 404 on a successful DeletePage POST; recast as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
Code example #3
File: enrich.py  Project: dpla/ingestion
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri: continue # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://" 
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue

        body = cont

    return error, body
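
The re.sub call above uses a negative lookahead to make the pipeline path absolute without ever doubling the slash; a quick demonstration of the idiom:

import re

# The pattern matches the empty string at the start of uri only when the
# next character is not "/", so the substitution is a no-op for paths
# that are already rooted.
for uri in ('select-id', '/select-id'):
    print re.sub(r"^(?!/)", "/", uri)   # prints /select-id both times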
Code example #4
def ucsb_aleph_marc_id(body, ctype):
    '''MARC sucks'''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '856' in field:
            subfields = field['856']['subfields']
            for subf in subfields:
                if 'u' in subf:
                    # restrict to ones that have url like
                    # http://www.library.ucsb.edu/OBJID/Cylinder0002
                    if 'OBJID' in subf['u']:
                        ident = subf['u']

    if not ident:
        logger.error('NO 856 u for doc leader:{}'.format(data['leader']))
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
Code example #5
def sfpl_marc_id(body, ctype):
    '''MARC sucks'''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '010' in field:
            subfields = field['010']['subfields']
            for subf in subfields:
                if 'a' in subf:
                    ident = subf['a']

    if not ident:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
Code example #6
File: enrich.py  Project: dpla/ingestion
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
       provided in request header "Pipeline-Item"
    """

    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u"Pipeline-Item","").split(",")

    records = json.loads(body)

    # Counts
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        error, enriched_record_text = pipe(record, ctype, rec_enrichments,
                                           "HTTP_PIPELINE_ITEM")
        if error:
            errors.append(error)

        enriched_record = json.loads(enriched_record_text)

        if enriched_record.get("_id", None):
            ingest_type = enriched_record.get("ingestType")
            # Item records should have sourceResource
            if (ingest_type == "item" and not
                "sourceResource" in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            logger.error("Found a record without an _id %s" % enriched_record)
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
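
The wsgi_header argument handed to pipe ("HTTP_PIPELINE_ITEM") is simply the WSGI spelling of the "Pipeline-Item" request header named in the docstring. A hypothetical client call illustrating that contract (the endpoint URL and enrichment paths here are placeholders, not taken from the project):

import json
import httplib2

H = httplib2.Http()
resp, content = H.request(
    'http://localhost:8889/enrich_storage',    # placeholder service URL
    'POST',
    body=json.dumps([{'_id': 'source--1', 'ingestType': 'item'}]),
    headers={
        'Content-Type': 'application/json',
        # Ordered, comma-separated enrichment URIs; WSGI presents this
        # header to the service as environ['HTTP_PIPELINE_ITEM'].
        'Pipeline-Item': '/select-id,/enrich-date',
    })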
Code example #7
File: enrich.py  Project: ranti/ingestion
def pipe(content,ctype,enrichments,wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if len(uri) < 1: continue # in case there's no pipeline
        headers = copy_headers_to_dict(request.environ,exclude=[wsgi_header])
        headers['content-type'] = ctype
        resp, cont = H.request(uri,'POST',body=body,headers=headers)
        if not str(resp.status).startswith('2'):
            logger.debug("Error in enrichment pipeline at %s: %s"%(uri,repr(resp)))
            continue

        body = cont
    return body
Code example #8
File: select-id.py  Project: mlhale7/ingestion
def selid(body, ctype, prop='handle', use_source='yes'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property to
    the value of the property named by the "prop" parameter
    '''

    if not prop:
        # Remove this document
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            record_id = v
        else:
            if v:
                for h in (v if isinstance(v, list) else [v]):
                    if is_absolute(h):
                        record_id = h
                if not record_id:
                    record_id = v[0]

    if not record_id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"
    '''
    If the useSource parameter is True (default) then prepend it to
    the id and use that value when hashing for the DPLA id
    '''
    if use_source.lower() == 'yes':
        data[u'_id'] = couch_rec_id_builder(source_name, record_id)
    else:
        data[u'_id'] = clean_id(record_id)

    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
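
selid reaches into nested records with slash-separated property paths (the EDAN variant below defaults prop to 'descriptiveNonRepeating/record_link'). The exists and getprop helpers come from elsewhere in the ingestion code; a minimal sketch of the behavior these snippets rely on, including the third argument seen in the getprop(data, prop, True) call, which is assumed to suppress missing-key errors:

def getprop(obj, path, keyerror_as_none=False):
    '''Walk a slash-separated path into nested dicts (sketch only).'''
    current = obj
    for key in path.split('/'):
        try:
            current = current[key]
        except (KeyError, TypeError):
            if keyerror_as_none:
                return None
            raise
    return current

def exists(obj, path):
    '''True when the slash-separated path resolves in obj (sketch only).'''
    try:
        getprop(obj, path)
        return True
    except (KeyError, TypeError):
        return False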
Code example #9
File: edan_select_id.py  Project: mlhale7/ingestion
def selid(body,
          ctype,
          prop='descriptiveNonRepeating/record_link',
          alternative_prop='descriptiveNonRepeating/record_ID'):
    '''   
    Service that accepts a JSON document and adds or sets the "id" property to the
    value of the property named by the "prop" parameter
    '''
    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    if prop:
        try:
            data = json.loads(body)
        except:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "Unable to parse body as JSON"

        request_headers = copy_headers_to_dict(request.environ)
        source_name = request_headers.get('Source')

        id = None

        if exists(data, prop) or exists(data, alternative_prop):
            v = getprop(data, prop, True)
            if not v:
                v = getprop(data, alternative_prop)
                v = tmpl % v
            if isinstance(v, basestring):
                id = v
            else:
                if v:
                    for h in v:
                        if is_absolute(h):
                            id = h
                    if not id:
                        id = v[0]

        if not id:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "No id property was found"

        data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
        data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    else:
        logger.error("Prop param in None in %s" % __name__)

    return json.dumps(data)
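
Note the doubled percent sign in tmpl: under %-formatting, %% is a literal %, so the rendered query contains the URL-encoded colon of record_ID:<value> (the record id below is illustrative):

tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"
print tmpl % "edanmdm-example-id"   # hypothetical record_ID value
# http://collections.si.edu/search/results.htm?q=record_ID%3Aedanmdm-example-id&repo=DPLA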
Code example #10
File: enrich.py  Project: amber-reichert/ingestion
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
       provided in request header 'Pipeline-Rec'
    """

    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u'Pipeline-Rec','').split(',')

    data = json.loads(body)

    docs = {}
    for record in data:
        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs[doc["_id"]] = doc

    return json.dumps(docs)
Code example #11
File: enrich.py  Project: chadfennell/ingestion
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
       provided in request header 'Pipeline-Rec'
    """

    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u'Pipeline-Rec','').split(',')

    data = json.loads(body)

    docs = {}
    for record in data:
        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs[doc["_id"]] = doc

    return json.dumps(docs)
Code example #12
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property to
    the value of the property named by the "prop" parameter
    '''

    if not prop:
        # Remove this document
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    id = None
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            id = v
        else:
            if v:
                for h in (v if isinstance(v, list) else [v]):
                    if is_absolute(h):
                        id = h
                if not id:
                    id = v[0]

    if not id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
    #   we don't use this, dump it
    #    data[u'id']  = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
Code example #13
File: enrich.py  Project: chadfennell/ingestion
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if not uri: continue # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://' 
            prefix += request.environ['HTTP_HOST'] if request.environ.get('HTTP_HOST') else request.environ['SERVER_NAME']
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s"%(uri,repr(resp)))
            continue

        body = cont
    return body
Code example #14
File: select-id.py  Project: chadfennell/ingestion
def selid(body, ctype, prop='handle'):
    '''   
    Service that accepts a JSON document and adds or sets the "id" property to
    the value of the property named by the "prop" parameter
    '''   
    
    if not prop:
        # Remove this document
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    id = None
    if exists(data,prop):
        v = getprop(data,prop)
        if isinstance(v,basestring):
            id = v
        else:
            if v:
                for h in (v if isinstance(v, list) else [v]):
                    if is_absolute(h):
                        id = h
                if not id:
                    id = v[0]

    if not id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
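
COUCH_REC_ID_BUILDER is defined elsewhere in the ingestion code and is not shown in these snippets. The hand-rolled collection id later on this page ("%s-%s" % (source_name, collection_name)) suggests it joins the Source header value and the selected identifier into a CouchDB document id, which is then MD5-hashed into the stable public id; a sketch under that assumption, with a made-up separator and identifier:

import hashlib

# Assumed shape only; the real builder may use a different separator or
# clean the identifier further.
COUCH_REC_ID_BUILDER = lambda source, ident: "%s--%s" % (source, ident.strip())

data = {}
data[u'_id'] = COUCH_REC_ID_BUILDER('some-source', 'http://example.org/handle/123')  # made-up values
data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()  # stable 32-char hex id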
Code example #15
def selectid(body, ctype):
    '''   
    Service that accepts a JSON document and adds or sets the "id" property to
    the value of the property named by the "prop" parameter
    '''   
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    try:
        select_id(source_name, data)
    except ValueError, e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"
Code example #16
File: enrich.py  Project: amber-reichert/ingestion
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if not uri: continue # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://' 
            if request.environ.get('HTTP_HOST'):
                prefix += request.environ['HTTP_HOST']
            else:
                prefix += request.environ['SERVER_NAME']
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" % 
                        (uri, repr(resp)))
            continue
        body = cont

    return body
Code example #17
def selectid(body, ctype):
    '''   
    Service that accepts a JSON document and adds or sets the "id" property to
    the value of the property named by the "prop" parameter
    '''   
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    objid = None
    v = getprop(data, 'identifier')
    if isinstance(v,basestring):
        objid = v
    else:
        if v:
            for h in (v if isinstance(v, list) else [v]):
                if h['text'].startswith('http://ark.cdlib.org/ark:'):
                    if is_absolute(h['text']):
                        objid = h['text']
            if not objid:
                objid = v[0]

    if not objid:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, objid)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    data[u'isShownAt'] = objid
    data[u'isShownBy'] = objid + '/thumbnail'

    return json.dumps(data)
Code example #18
File: moinrest.py  Project: pombredanne/akara
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')

    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id,base,opener,original_age,wrapped_wiki_base="+repr((wiki_id,base,opener,original_page,wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)

    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
#    print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    #Unfortunately because urllib2's data dicts don't give an option for limiting read length, must read into memory and wrap
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener, req_headers)
    form_vars["file"] = open(temp_fpath, "rb")

    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())

    except urllib2.URLError,e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ), backurl=url)
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
Code example #19
File: moinrest.py  Project: pombredanne/akara
def get_page(environ, start_response):
    #logger.debug('get_page: ' + repr((environ['SCRIPT_NAME'], environ['PATH_INFO'])))
    req_headers = copy_headers_to_dict(environ,exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)
    upstream_handler = None
    status = httplib.OK
    params = cgi.parse_qs(environ['QUERY_STRING'])
    #Note: probably a better solution here: http://code.google.com/p/mimeparse/
    accepted_imts = environ.get('HTTP_ACCEPT', '').split(',')
    #logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    #logger.debug('imt: ' + repr(imt))
    params_for_moin = {}
    cache_max_age = CACHE_MAX_AGE # max-age of this response. If set to None, it will not be used
    if NO_CACHE_PATHS and first_item(dropwhile(lambda x: x not in page, NO_CACHE_PATHS)):
        cache_max_age = None

    if 'rev' in params:
        #XXX: Not compatible with search
        #params_for_moin = {'rev' : params['rev'][0], 'action': 'recall'}
        params_for_moin = {'rev' : params['rev'][0]}
    if 'search' in params:
        searchq = params['search'][0]
        query = urllib.urlencode({'value' : searchq, 'action': 'fullsearch', 'context': '180', 'fullsearch': 'Text'})
        #?action=fullsearch&context=180&value=foo&=Text
        url = absolutize('?'+query, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
        cache_max_age = None
    #elif 'action' in params and params['action'][0] == 'recall':
    elif moin.HTML_IMT in environ.get('HTTP_ACCEPT', ''):
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page+'?'+params, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.HTML_IMT
    elif moin.RDF_IMT in environ.get('HTTP_ACCEPT', ''):
        #FIXME: Make unique flag optional
        #url = base + '/RecentChanges?action=rss_rc&unique=1&ddiffs=1'
        url = absolutize('RecentChanges?action=rss_rc&unique=1&ddiffs=1', base)
        #print >> sys.stderr, (url, base, '/RecentChanges?action=rss_rc&unique=1&ddiffs=1', )
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
    elif moin.ATTACHMENTS_IMT in environ.get('HTTP_ACCEPT', ''):
        url = absolutize(page + '?action=AttachFile', base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.ATTACHMENTS_IMT
        def upstream_handler():
            #Sigh.  Sometimes you have to break some Tag soup eggs to make a RESTful omelette
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            doc = htmlparse(rbody)
            raise_embedded_error(doc)
            attachment_nodes = doc.xml_select(u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
            targets = []
            for node in attachment_nodes:
                target = [ param.split('=', 1)[1] for param in node.href.split(u'&') if param.startswith('target=') ][0]
                targets.append(target)
            output = structencoder(indent=u"yes")
            output.feed(
            ROOT(
                E((u'attachments'),
                    (E(u'attachment', {u'href': unicode(t)}) for t in targets)
                )
            ))
            return output.read(), ctype
    #Notes on use of URI parameters - http://markmail.org/message/gw6xbbvx4st6bksw
    elif ';attachment=' in page:
        page, attachment = page.split(';attachment=', 1)
        url = absolutize(page + '?action=AttachFile&do=get&target=' + attachment, base)
        request = urllib2.Request(url, None, req_headers)
        def upstream_handler():
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            return rbody, dict(resp.info())['content-type']
    #
    elif ';history' in page:
        cache_max_age = None
        page, discard = page.split(';history', 1)
        ctype = moin.XML_IMT
        def upstream_handler():
            revs = scrape_page_history(page, base, opener, req_headers)
            output = structencoder(indent=u"yes")
            output.feed(
            ROOT(
                E((u'history'),
                    (E(u'rev', {u'id': unicode(r['rev']), u'editor': unicode(r['editor']), u'date': unicode(r['date']).replace(' ', 'T')}) for r in revs)
                )
            ))
            return output.read(), ctype
    elif imt:
        params_for_moin.update({'mimetype': imt})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.DOCBOOK_IMT
    else:
        params_for_moin.update({'action': 'raw'})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.WIKITEXT_IMT
    try:
        if upstream_handler:
            rbody, ctype = upstream_handler()
        else:
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
        
        #headers = {moin.ORIG_BASE_HEADER: base}
        #moin_base = absolutize(wiki_id, base)
        moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
        response_headers = [("Content-Type", ctype),
                            ("Vary", "Accept"),
                            (moin.ORIG_BASE_HEADER, moin_base_info)]
        if cache_max_age:
            response_headers.append(("Cache-Control","max-age="+str(cache_max_age)))

        start_response(status_response(status), response_headers)
        return rbody
    except urllib2.URLError, e:
        if e.code == 401:
            raise HTTPAuthorizationError(url=request.get_full_url())
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),target=wiki_id)
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ),backurl=url)
        else:
            raise UnexpectedResponseError(url=url,code=e.code,error=str(e))
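
get_page selects the first concrete (non-wildcard) internet media type from the Accept header with dropwhile. first_item is another project helper not shown here; assuming it returns the first element of an iterator (or a default when the iterator is empty), the idiom works like this:

from itertools import dropwhile

def first_item(iterator, default=None):
    # Assumed behavior of the helper: first element, else the default.
    for item in iterator:
        return item
    return default

accepted_imts = '*/*;q=0.8,application/docbook+xml,text/html'.split(',')
print first_item(dropwhile(lambda x: '*' in x, accepted_imts))
# application/docbook+xml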
Code example #20
File: enrich.py  Project: dpla/ingestion
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for items.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    item_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")
    coll_enrichments = request_headers.get(u"Pipeline-Coll", "").split(",")

    records = json.loads(body)

    # Counts for enrich script
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        if record.get("ingestType") == "collection":
            wsgi_header = "HTTP_PIPELINE_COLL"
            enrichments = coll_enrichments
        else:
            wsgi_header = "HTTP_PIPELINE_ITEM"
            enrichments = item_enrichments
            # Preserve record prior to any enrichments
            record["originalRecord"] = record.copy()         
            record["ingestType"] = "item"

        # Explicitly populate ingestDate as UTC
        record["ingestDate"] = iso_utc_with_tz()

        error, enriched_record_text = pipe(record, ctype, enrichments,
                                           wsgi_header)
        enriched_record = json.loads(enriched_record_text)
        if error:
            errors.append(error)

        ingest_type = record.get("ingestType")
        # Enriched record should have an _id
        if enriched_record.get("_id", None):
            # Item records should have sourceResource
            if (ingest_type == "item" and not "sourceResource" in
                enriched_record):
                logger.error("Records %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
Code example #21
File: enrich.py  Project: ranti/ingestion
def enrich(body,ctype):
    '''   
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    '''

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll','').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec','').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = "%s-%s"%(source_name,collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
    }

    enriched_coll_text = pipe(COLL, ctype, coll_enrichments, 'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    if COUCH_DATABASE:
        docuri = couch_rev_check(join(COUCH_DATABASE,cid))
        resp, cont = H.request(docuri,'PUT',body=enriched_coll_text,headers=CT_JSON)
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: "+repr((resp,content)))

    # Then the records
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['original_record'] = record.copy()         

        # Add collection information
        record[u'collection'] = {
            '@id' : at_id,
            'title' : enriched_collection.get('title',"")
        }

        # Set id to value of the first handle, disambiguated w source. Not sure if
        # one is guaranteed or on what scale it's unique
        rid = "%s-%s"%(source_name,record[u'handle'][0].strip())
        record[u'_id'] = rid

        enriched_record = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        if COUCH_DATABASE:
            docuri = couch_rev_check(join(COUCH_DATABASE,rid))
            resp, cont = H.request(docuri,'PUT',body=enriched_record,headers=CT_JSON)
            if not str(resp.status).startswith('2'):
                logger.debug("Error storing record in Couch: "+repr((resp,content)))
                continue
    
    return json.dumps({})
Code example #22
File: enrich.py  Project: eldios/ingestion
def enrich(body,ctype):
    '''   
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    '''

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll','').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec','').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = COUCH_ID_BUILDER(source_name,collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
        "ingestType": "collection"
    }
    # Set collection title field from collection_name if no sets
    if not coll_enrichments[0]:
        COLL['title'] = collection_name 
    set_ingested_date(COLL)

    enriched_coll_text = pipe(COLL, ctype, coll_enrichments, 'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    # FIXME. Integrate collection storage into bulk call below
    if COUCH_DATABASE:
        docuri = join(COUCH_DATABASE,cid)
        couch_rev_check_coll(docuri,enriched_collection)
        resp, cont = H.request(docuri,'PUT',body=json.dumps(enriched_collection),headers=dict(CT_JSON.items()+COUCH_AUTH_HEADER.items()))
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: "+repr((resp,cont)))

    # Then the records
    docs = []
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()         

        # Add collection information
        record[u'collection'] = {
            '@id' : at_id,
            'name' : enriched_collection.get('title',"")
        }
        if 'description' in enriched_collection:
            record[u'collection']['description'] = enriched_collection.get('description',"")

        record[u'ingestType'] = 'item'
        set_ingested_date(record)

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs.append(doc)

    couch_rev_check_recs(docs,source_name)
    couch_docs_text = json.dumps({"docs":docs})
    if COUCH_DATABASE:
        resp, content = H.request(join(COUCH_DATABASE,'_bulk_docs'),'POST',body=couch_docs_text,headers=dict(CT_JSON.items()+COUCH_AUTH_HEADER.items()))
        logger.debug("Couch bulk update response: "+content)
        if not str(resp.status).startswith('2'):
            logger.debug('HTTP error posting to CouchDB: '+repr((resp,content)))

    return json.dumps({'docs' : docs})
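
The final write goes through CouchDB's standard _bulk_docs endpoint: a single POST of {"docs": [...]}, where each document carries its _id and, for updates, the _rev that couch_rev_check_recs is presumably filling in. A minimal illustration of the payload shape (ids and revs are made up):

import json

docs = [
    {"_id": "source--1", "_rev": "2-abc123", "ingestType": "item"},  # update
    {"_id": "source--2", "ingestType": "item"},                      # create
]
couch_docs_text = json.dumps({"docs": docs})
# POST couch_docs_text to <COUCH_DATABASE>/_bulk_docs with JSON headers;
# CouchDB answers with one status object per document.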
Code example #23
File: enrich.py  Project: amber-reichert/ingestion
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for records.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')
    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')

    data = json.loads(body)
    provider = data['provider']
    collection = data['collection']
    contributor = data['contributor']

    # Enrich collection first
    if collection:
        coll_id = collection.get('id')
        desc = collection.get('description')
        title = collection.get('title')
        COLLECTIONS[coll_id] = enrich_coll(ctype, provider, coll_id,
                                           coll_enrichments, title, desc)

    docs = {}
    for record in data['records']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()         

        # Set ingestType, provider, and ingestDate
        record[u'ingestType'] = 'item'
        record[u'provider'] = contributor
        set_ingested_date(record)

        # Add collection(s)
        record[u'collection'] = []
        # OAI records can be part of multiple collections whose titles are
        # listed in the record's "setSpec" property
        sets = record.get('setSpec')
        if sets:
            for set_id in iterify(sets):
                if set_id not in COLLECTIONS:
                    COLLECTIONS[set_id] = enrich_coll(ctype, provider, set_id,
                                                      coll_enrichments)
                record[u'collection'].append(
                    create_record_collection(COLLECTIONS[set_id]))

            if len(record[u'collection']) == 1:
                record[u'collection'] = record[u'collection'][0]
        elif collection:
            record[u'collection'] = create_record_collection(
                COLLECTIONS[coll_id])

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        # After pipe doc must have _id and sourceResource
        if doc.get("_id", None):
            if "sourceResource" in doc:
                logger.debug("Enriched record %s" % doc["_id"])
                docs[doc["_id"]] = doc
            else:
                logger.error("Document %s does not have sourceResource: %s" %
                             (doc["_id"], doc))
        else:
            logger.error("Document does not have an _id: %s" % doc)

    enriched_records_count = len(docs)

    # Add collections to docs
    for collection in COLLECTIONS.values():
        docs[collection["_id"]] = collection

    data = {
        "enriched_records": docs,
        "enriched_records_count": enriched_records_count
    }

    return json.dumps(data)
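
iterify is a small helper imported from the ingestion code; above, it lets a scalar setSpec and a list of them be handled uniformly. A sketch of the assumed behavior, mirroring the inline (v if isinstance(v, list) else [v]) idiom used in the selid examples earlier on this page:

def iterify(value):
    '''Wrap a bare value in a list so callers can always iterate (sketch).'''
    return value if isinstance(value, list) else [value]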
Code example #24
File: enrich.py  Project: chadfennell/ingestion
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    """

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    # For non-OAI, the collection title is included as part of the data,
    # so we extract it here to pass to enrich_coll a few lines down.
    # For OAI, the collection enrichment pipeline will set the title, so the
    # None default will be overridden.
    collection_title = data.get("title", None)

    docs = {}
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()         

        # Add collection(s)
        record[u'collection'] = []
        sets = record.get('setSpec', collection_name)
        for s in (sets if isinstance(sets, list) else [sets]):
            if s not in COLLECTIONS:
                COLLECTIONS[s] = enrich_coll(ctype, source_name, s,
                                             collection_title, coll_enrichments)
            rec_collection = {
                'id': COLLECTIONS[s].get('id', None),
                '@id': COLLECTIONS[s].get('@id', None),
                'title': COLLECTIONS[s].get('title', None),
                'description': COLLECTIONS[s].get('description', None)
            }
            record[u'collection'].append(dict((k, v) for k, v in
                                         rec_collection.items() if v))
                    
        if len(record[u'collection']) == 1:
            record[u'collection'] = record[u'collection'][0]

        record[u'ingestType'] = 'item'
        set_ingested_date(record)

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        # After pipe doc must have _id and sourceResource
        if doc.get("_id", None):
            if "sourceResource" in doc:
                docs[doc["_id"]] = doc
            else:
                logger.error("Document does not have sourceResource: %s" % doc["_id"])

    # Add collections to docs
    for collection in COLLECTIONS.values():
        docs[collection["_id"]] = collection

    return json.dumps(docs)