Beispiel #1
0
def sfpl_marc_id(body, ctype):
    '''Set the '_id' and 'id' of a MARC JSON record from field 010, subfield a.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content

    Every '010' entry in the record's 'fields' list is inspected and the last
    'a' subfield found is used as the identifier.  '_id' is produced by
    COUCH_REC_ID_BUILDER from the request's 'Source' header and that
    identifier; 'id' is the md5 hex digest of '_id'.

    Returns the updated document as JSON.  If the body cannot be parsed, or
    no identifier is found, the response code is set to 500 and a plain-text
    message is returned instead.
    '''
    try:
        record = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    identifier = None
    for marc_field in record['fields']:
        if '010' not in marc_field:
            continue
        for subfield in marc_field['010']['subfields']:
            if 'a' in subfield:
                # Later matches overwrite earlier ones: the last one wins.
                identifier = subfield['a']

    if not identifier:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    source_name = copy_headers_to_dict(request.environ).get('Source')

    couch_id = COUCH_REC_ID_BUILDER(source_name, identifier)
    record[u'_id'] = couch_id
    record[u'id'] = hashlib.md5(couch_id).hexdigest()

    return json.dumps(record)
Beispiel #2
0
def enrich_temporal_date(body, ctype, prop="aggregatedCHO/temporal", date_key="name"):
    """
    Service that accepts a JSON document and expands the date strings held in
    the 'prop' field(s) into begin/end/displayDate dictionaries.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- comma-separated list of field paths to process
    date_key -- key under which each entry of a field stores its date string

    For every listed field that exists in the document:

    a) Each entry's date string is passed to parse_date_or_range
    b) The field is replaced by the list of resulting
       {"begin", "end", "displayDate"} dictionaries

    Fields that do not exist, or that yield no entries, are left untouched.
    Returns the document as JSON; on unparsable input the response code is
    set to HTTP_INTERNAL_SERVER_ERROR and a plain-text message is returned.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    for p in prop.split(','):
        if not exists(data, p):
            continue

        # Candidates are collected and written back per field.  Writing them
        # after the loop would store the dates of every field under whichever
        # path happened to be listed last, even one absent from the document.
        date_candidates = []
        for s in getprop(data, p):
            display_date = s[date_key]
            begin, end = parse_date_or_range(display_date)
            date_candidates.append({
                "begin": begin,
                "end": end,
                "displayDate": display_date
            })

        if date_candidates:
            setprop(data, p, date_candidates)

    return json.dumps(data)
Beispiel #3
0
def capitalize_value(body, ctype, prop=",".join(DEFAULT_PROP), exclude=None):
    """
    Service that accepts a JSON document and capitalizes the listed fields
    of that document.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- comma-separated list of field paths handed to capitalize()
    exclude -- a single field path to drop from that list (first occurrence)

    Returns the document as JSON.  A None prop or an unparsable body sets the
    response code to 500 and returns a plain-text message.
    """

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    fields = prop.split(",")
    if exclude in fields:
        fields.remove(exclude)

    # Empty entries (e.g. from a trailing comma) are skipped.
    for field in [name for name in fields if name]:
        capitalize(data, field)

    return json.dumps(data)
def scdl_enrich_location(body, ctype, action="scdl_enrich_location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document.

    Each spatial entry has its "name" right-stripped and passed through
    replace_state_abbreviations.  A "county" or "city" key is then added when
    the name matches one of the patterns handled below.

    For use with the scdl profiles
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not exists(data, prop):
        return json.dumps(data)

    for place in iterify(getprop(data, prop)):
        name = replace_state_abbreviations(place["name"].rstrip())
        place["name"] = name

        if " county " in name.lower():
            # "XXX County (S.C.)" => county: XXX
            county_end = name.lower().index("county")
            place["county"] = name[0:county_end].strip()
        elif "(S.C.)" in name:
            # "XXX (S.C)" => city: XXX
            city_end = name.index("(S.C.)")
            place["city"] = name[0:city_end].strip()

    return json.dumps(data)
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replaces every occurrence of a substring in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replace old with

    Both old and new must be non-empty; otherwise an error is logged and the
    document is returned unchanged.  Returns the document as JSON.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if old and new:
        if exists(data, prop):
            current = getprop(data, prop)
            setprop(data, prop, current.replace(old, new))
    else:
        logger.error("No old or new parameters were provided")

    return json.dumps(data)
def decode_html(body, ctype, prop=None):
    """Decodes any encoded html in the prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to decode

    The entities &quot;, &lt;, &gt; and &amp; are decoded in every string
    value of prop.  The field is always written back as a list, and
    non-string entries are kept as they are.  Without a prop, or when the
    prop does not exist, the document is returned unchanged.

    Returns the document as JSON; on unparsable input the response code is
    set to 500 and a plain-text message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # '&amp;' has to be decoded last.  Decoding it first turns an escaped
    # literal such as '&amp;lt;' into '&lt;', which the next replacement
    # would then wrongly decode a second time into '<'.
    ENTITIES = ('&quot;', '"'), ('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')

    if prop and exists(data, prop):
        v = getprop(data, prop)
        if not isinstance(v, list):
            v = [v]

        decoded = []
        for s in v:
            if isinstance(s, basestring):
                for entity, char in ENTITIES:
                    # The entities are plain literals, so no regex is needed.
                    s = s.replace(entity, char)
            decoded.append(s)

        setprop(data, prop, decoded)

    return json.dumps(data)
def ucsb_aleph_marc_id(body, ctype):
    '''Set the '_id' and 'id' of a MARC JSON record from field 856, subfield u.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content

    Only 'u' subfields whose value contains 'OBJID' are considered; the last
    one found is used as the identifier.  '_id' is produced by
    COUCH_REC_ID_BUILDER from the request's 'Source' header and that
    identifier; 'id' is the md5 hex digest of '_id'.

    Returns the updated document as JSON.  If the body cannot be parsed, or
    no identifier is found, the response code is set to 500 and a plain-text
    message is returned instead.
    '''
    try:
        record = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    identifier = None
    for marc_field in record['fields']:
        if '856' not in marc_field:
            continue
        for subfield in marc_field['856']['subfields']:
            # restrict to ones that have url like
            # http://www.library.ucsb.edu/OBJID/Cylinder0002
            if 'u' in subfield and 'OBJID' in subfield['u']:
                identifier = subfield['u']

    if not identifier:
        logger.error('NO 856 u for doc leader:{}'.format(record['leader']))
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    source_name = copy_headers_to_dict(request.environ).get('Source')

    couch_id = COUCH_REC_ID_BUILDER(source_name, identifier)
    record[u'_id'] = couch_id
    record[u'id'] = hashlib.md5(couch_id).hexdigest()

    return json.dumps(record)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing each CLEANUP pattern from the
    start of the field value (case-insensitively).

    A single resulting value is written back as a scalar, several values as
    a list.  Returns the document as JSON.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        creators = getprop(data, prop)
        if not isinstance(creators, list):
            creators = [creators]

        for position, creator in enumerate(creators):
            # Patterns are applied one after another, each to the result of
            # the previous one.
            for pattern in CLEANUP:
                creator = re.sub(r"(?i)^{0}".format(pattern), "",
                                 creator.strip()).lstrip()
            creators[position] = creator

        if len(creators) == 1:
            setprop(data, prop, creators[0])
        else:
            setprop(data, prop, creators)

    return json.dumps(data)
Beispiel #9
0
def replace_regex(body, ctype, prop=None, regex=None, new=None):
    """Replaces matches of a regex in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    regex -- the regex to replace
    new -- the substring to replace regex matches with (empty if not given)

    The replacement itself is delegated to replace_regex_recurse_field.
    Without a regex an error is logged and the document is returned
    unchanged.  Returns the document as JSON.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if regex:
        if not new:
            logger.debug("NO New parameter, will replace with empty string")
            new = ''
        if exists(data, prop):
            current = getprop(data, prop)
            setprop(data, prop, replace_regex_recurse_field(current, regex, new))
    else:
        logger.error("No regex parameter supplied")

    return json.dumps(data)
Beispiel #10
0
def oaitodpla(body, ctype, geoprop=None):
    '''
    Convert output of Freemix OAI service into the DPLA JSON-LD format.

    For every key of the input document, the matching CHO_TRANSFORMER result
    is merged into "sourceResource" and the matching AGGREGATION_TRANSFORMER
    result into the top level of the output.  Top-level keys with falsy
    values are dropped from the output.

    Parameter "geoprop" specifies the property name containing lat/long
    coords; it is stored in the module-level GEOPROP.
    '''
    global GEOPROP

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document to sourceResource
    for key in data.keys():
        if key in CHO_TRANSFORMER:
            cho_fields = CHO_TRANSFORMER[key](data)
            out['sourceResource'].update(cho_fields)
        if key in AGGREGATION_TRANSFORMER:
            aggregation_fields = AGGREGATION_TRANSFORMER[key](data)
            out.update(aggregation_fields)

    # Keep only the top-level keys that carry a truthy value
    out = {k: v for k, v in out.items() if v}

    return json.dumps(out)
def setcontext(body, ctype, prop="@context"):
    """
    Service that accepts a JSON document and sets the "@context" and "@type"
    fields of that document according to its "ingestType".

    Items additionally get "aggregatedCHO" and a "sourceResource/@id" derived
    from the document's "@id"; every other ingest type is treated as a
    collection.  The prop parameter is not used.  Returns the document as
    JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    is_item = data["ingestType"] == "item"

    if is_item:
        data.update({
            "@context": "http://dp.la/api/items/context",
            "aggregatedCHO": "#sourceResource",
            "@type": "ore:Aggregation"
        })
        source_resource_id = "%s#sourceResource" % data["@id"]
        setprop(data, "sourceResource/@id", source_resource_id)
    else:
        data.update({
            "@context": "http://dp.la/api/collections/context",
            "@type": "dcmitype:Collection"
        })

    return json.dumps(data)
def decode_html(body, ctype, prop=None):
    """Decodes any encoded html in the prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to decode

    The entities &quot;, &lt;, &gt; and &amp; are decoded in every string
    value of prop.  The field is always written back as a list, and
    non-string entries are kept as they are.  Without a prop, or when the
    prop does not exist, the document is returned unchanged.

    Returns the document as JSON; on unparsable input the response code is
    set to 500 and a plain-text message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # '&amp;' has to be decoded last.  Decoding it first turns an escaped
    # literal such as '&amp;lt;' into '&lt;', which the next replacement
    # would then wrongly decode a second time into '<'.
    ENTITIES = ('&quot;', '"'), ('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')

    if prop and exists(data, prop):
        v = getprop(data, prop)
        if not isinstance(v, list):
            v = [v]

        decoded = []
        for s in v:
            if isinstance(s, basestring):
                for entity, char in ENTITIES:
                    # The entities are plain literals, so no regex is needed.
                    s = s.replace(entity, char)
            decoded.append(s)

        setprop(data, prop, decoded)

    return json.dumps(data)
def nypl_select_hasview(body, ctype):
    """
    Service that accepts a JSON document and sets its "hasView" field to
    {"@id": <originalRecord/tmp_high_res_link>, "format": None}.

    If "originalRecord" or its "tmp_high_res_link" key is missing, an error
    is logged and the body is returned unchanged.  Returns the document as
    JSON otherwise.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    record_key = u"originalRecord"
    link_key = u"tmp_high_res_link"

    if record_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     record_key, data[u'id'])
        return body

    original_record = data[record_key]
    if link_key not in original_record:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     record_key, link_key, data[u'id'])
        return body

    data[u"hasView"] = {"@id": original_record[link_key], "format": None}
    return json.dumps(data)
def kentucky_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    The URL is the value of sourceResource/relation with "_tb" inserted
    before the file extension.  admin/object_status is set to PENDING when
    download is the string "True" and to IGNORE otherwise.  If the relation
    field is missing the body is returned unchanged.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    relation_field = "sourceResource/relation"
    if not exists(data, relation_field):
        logger.debug("Field %s does not exist" % relation_field)
        return body

    base_url, ext = os.path.splitext(getprop(data, relation_field))
    data["object"] = "%s_tb%s" % (base_url, ext)

    status = PENDING if download == "True" else IGNORE

    # Reuse an existing "admin" section, create it otherwise.
    data.setdefault("admin", {})["object_status"] = status

    return json.dumps(data)
Beispiel #15
0
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    action -- not used by this function
    prop -- path of the field to scan for date values (required)
    to_prop -- path of the field that receives the dates

    A value is moved only when, after parentheses, periods and question marks
    are stripped, it consists of exactly one date expression and nothing
    else.  Moved values are removed from prop, and prop is deleted once all
    of its values have been moved.

    Returns the document as JSON.  Without a prop an error is logged and the
    body is returned unchanged; on unparsable input the response code is set
    to 500 and a plain-text message is returned.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    # Ordered from most to least specific: the first pattern that accounts
    # for the whole value wins.
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
        ]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # Work on a list from here on.  A lone string would otherwise be
        # counted and filtered character by character further down.
        if not isinstance(values, list):
            values = [values]

        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        # An existing scalar target cannot be appended to; wrap it.
        if not isinstance(toprop, list):
            toprop = [toprop]

        patterns = [re.compile(pattern, re.I) for pattern in REGSEARCH]
        remove = []

        for v in values:
            c = cleanup(v)
            for pattern in patterns:
                m = pattern.findall(c)
                # The match is removed as literal text; feeding it back into
                # re.sub would interpret characters such as '[' or '+' in the
                # matched value as regex syntax.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            elif remove:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def digital_commonwealth_enrich_location(
        body,
        ctype,
        action="digital_commonwealth_enrich_location",
        prop="sourceResource/spatial"):
    """
    Service that massages the spatial field of a Digital Commonwealth JSON
    document.

    The field is rewritten as a list of {"name": ...} dictionaries, one per
    string value, with each name produced by format_spatial.  Non-string
    values and the known non-location strings listed below are dropped.
    Returns the document as JSON.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Strings which are present in the spatial field, which do end up being
    #  geocoded, but are not locations
    NON_SPATIALS = [
        "Aerial views.", "Church history.", "Dwellings", "Dwellings.",
        "History", "Pictorial works"
    ]

    if exists(data, prop):
        # Spatial field is simply a list of strings, convert to a list
        #  of dictionaries with the name key set to the string value
        spatials = [
            {"name": format_spatial(entry)}
            for entry in iterify(getprop(data, prop))
            if isinstance(entry, basestring) and entry not in NON_SPATIALS
        ]
        setprop(data, prop, spatials)

    return json.dumps(data)
def setspectype(body, ctype, prop="sourceResource/type"):
    """
    Service that accepts a JSON document and sets the "sourceResource/specType"
    field of that document from the prop field.

    Each value of prop is lower-cased and checked for the keywords of
    TYPE_TO_SPEC_TYPE; the labels of all keywords found are collected without
    duplicates.  specType is only set when at least one label was found.
    Returns the document as JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "book": "Book",
        "government": "Government Document",
        "periodical": "Serial",
        "nonmusic": "Nonmusic",
        "still image": "Photograph/Pictorial Works",
        "mixed material": "Mixed Material"
    }

    if not exists(data, prop):
        return json.dumps(data)

    labels = []
    for type_value in iterify(getprop(data, prop)):
        for keyword, label in TYPE_TO_SPEC_TYPE.items():
            if keyword in type_value.lower() and label not in labels:
                labels.append(label)

        if labels:
            setprop(data, "sourceResource/specType", labels)

    return json.dumps(data)
def mwdlenrichstatelocatedin(body, ctype, action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by splitting it on ";", replacing every part that
    has a truthy entry in STATE_CODES with that entry, and joining the parts
    back together with "; ".

    For primary use with MWDL documents.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        parts = getprop(data, prop).split(";")
        # Parts without a (truthy) STATE_CODES entry are kept verbatim.
        expanded = [STATE_CODES.get(part) or part for part in parts]
        setprop(data, prop, "; ".join(expanded))

    return json.dumps(data)
Beispiel #19
0
def uscenrichlocation(body, ctype, action="usc_enrich_location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document by:

    1. If one of the spatial values is a lat/lon coordinate, removing all other
       values
    2. Removing 1-3 digit numbers and values that contain "s.d"

    Step 1 relies on find_coordinates, step 2 on clean followed by
    join_values.

    For primary use with USC documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not exists(data, prop):
        return json.dumps(data)

    original = getprop(data, prop)
    coordinates = find_coordinates(original)

    if coordinates:
        enriched = [{"name": "%s, %s" % coordinates}]
    else:
        enriched = join_values(clean(original))

    setprop(data, prop, enriched)

    return json.dumps(data)
Beispiel #20
0
def get_isostate(strg, frm_abbrev=None):
    """
    Look up the STATES entries mentioned in a ";"-separated string.

    Keyword arguments:
    strg -- the string to search
    frm_abbrev -- when truthy, each part is first expanded with from_abbrev

    Returns a tuple (iso, state): the ";"-joined STATES values and the
    ";"-joined title-cased STATES keys of every key contained in the
    upper-cased input, or (None, None) when nothing matched.  For a
    non-string argument the response code is set to 500 and a plain-text
    message is returned instead of a tuple.
    """
    if not isinstance(strg, basestring):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Non-string parameter supplied to get_isostate"

    iso_arr, state_arr = [], []
    for part in strg.split(';'):
        candidates = from_abbrev(part) if frm_abbrev else [part]
        for candidate in candidates:
            for name in STATES:
                if name in candidate.upper():
                    iso_arr.append(STATES[name])
                    state_arr.append(name.title())

    if not iso_arr:
        return (None, None)

    iso = ';'.join(iso_arr)
    state = ';'.join(state_arr) if state_arr else None
    return (iso, state)
Beispiel #21
0
def david_rumsey_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to find the thumbnail.

    The "object" field is set to the second entry of originalRecord/handle.
    admin/object_status is set to PENDING when download is the string "True"
    and to IGNORE otherwise.

    If the handle field is missing, or is not a list with at least two
    entries, an error is logged and the body is returned unchanged.  On
    unparsable input the response code is set to 500 and a plain-text
    message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if exists(data, handle_field):
        handle = getprop(data, handle_field)
    else:
        logger.error("Field %s does not exist" % handle_field)
        return body

    # Indexing blindly would raise IndexError on a short list and would pick
    # the second character of a plain string.
    if not isinstance(handle, list) or len(handle) < 2:
        logger.error("Field %s has no second entry" % handle_field)
        return body

    data["object"] = handle[1]

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    action -- not used by this function
    prop -- path of the field to scan for date values (required)
    to_prop -- path of the field that receives the dates

    A value is moved only when, after parentheses, periods and question marks
    are stripped, it consists of exactly one date expression and nothing
    else.  Moved values are removed from prop, and prop is deleted once all
    of its values have been moved.

    Returns the document as JSON.  Without a prop an error is logged and the
    body is returned unchanged; on unparsable input the response code is set
    to 500 and a plain-text message is returned.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Ordered from most to least specific: the first pattern that accounts
    # for the whole value wins.
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
        ]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # Work on a list from here on.  A lone string would otherwise be
        # counted and filtered character by character further down.
        if not isinstance(values, list):
            values = [values]

        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        # An existing scalar target cannot be appended to; wrap it.
        if not isinstance(toprop, list):
            toprop = [toprop]

        patterns = [re.compile(pattern, re.I) for pattern in REGSEARCH]
        remove = []

        for v in values:
            c = cleanup(v)
            for pattern in patterns:
                m = pattern.findall(c)
                # The match is removed as literal text; feeding it back into
                # re.sub would interpret characters such as '[' or '+' in the
                # matched value as regex syntax.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            elif remove:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def digital_commonwealth_enrich_location(body, ctype, action="digital_commonwealth_enrich_location", prop="sourceResource/spatial"):
    """
    Service that massages the spatial field of a Digital Commonwealth JSON
    document.

    The field is rewritten as a list of {"name": ...} dictionaries, one per
    string value, with each name produced by format_spatial.  Non-string
    values and the known non-location strings listed below are dropped.
    Returns the document as JSON.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Strings which are present in the spatial field, which do end up being
    #  geocoded, but are not locations
    NON_SPATIALS = ["Aerial views.", "Church history.", "Dwellings",
                    "Dwellings.", "History", "Pictorial works"]

    if not exists(data, prop):
        return json.dumps(data)

    # Spatial field is simply a list of strings, convert to a list
    #  of dictionaries with the name key set to the string value
    spatials = []
    for entry in iterify(getprop(data, prop)):
        if not isinstance(entry, basestring) or entry in NON_SPATIALS:
            continue
        spatials.append({"name": format_spatial(entry)})

    setprop(data, prop, spatials)

    return json.dumps(data)
Beispiel #24
0
def nypl_identify_object(body, ctype, download="True"):
    """
    Service that accepts a JSON document and sets its "object" field to the
    images.nypl.org URL built from originalRecord/tmp_image_id.

    admin/object_status is set to PENDING when download is the string "True"
    and to IGNORE otherwise.  If "originalRecord" or its "tmp_image_id" key
    is missing, an error is logged and the body is returned unchanged.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    record_key = u"originalRecord"
    image_id_key = u"tmp_image_id"

    if record_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].", record_key, data[u'id'])
        return body

    original_record = data[record_key]
    if image_id_key not in original_record:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].", record_key, image_id_key, data[u'id'])
        return body

    data["object"] = "http://images.nypl.org/index.php?id={0}&t=t".format(
        original_record[image_id_key])

    status = PENDING if download == "True" else IGNORE

    # Reuse an existing "admin" section, create it otherwise.
    data.setdefault("admin", {})["object_status"] = status

    return json.dumps(data)
Beispiel #25
0
def set_prop(body, ctype, prop=None, value=None, condition_prop=None,
             condition_value=None):
    """Sets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to
    condition_prop -- (optional) the field that must exist to set the prop
    condition_value -- accepted but not used by this function

    A falsy value is rejected: an error is logged and the document is
    returned unchanged.  Returns the document as JSON.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if value:
        # Without a condition_prop the prop is always set (and created if it
        # does not exist).  With one, the prop is only set when the
        # condition_prop exists.
        unconditional = not condition_prop
        if unconditional or exists(data, condition_prop):
            setprop(data, prop, value)
    else:
        logger.error("No value was supplied to set_prop.")

    return json.dumps(data)
Beispiel #26
0
def nypl_select_hasview(body, ctype):
    """
    Service that accepts a JSON document and sets its "hasView" field to
    {"@id": <originalRecord/tmp_high_res_link>, "format": None}.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content; must equal HTTP_TYPE_JSON (case-insensitive)

    If "originalRecord" or its "tmp_high_res_link" key is missing, an error
    is logged and the body is returned unchanged.  A wrong content type or an
    unparsable body sets the response code to HTTP_INTERNAL_SERVER_ERROR and
    returns a plain-text "Bad JSON: ..." message.
    """

    try:
        # Raised explicitly instead of using an assert statement: asserts are
        # stripped when Python runs with -O, which would silently disable the
        # content-type check.  AssertionError is kept so that the error text
        # sent to the client does not change.
        if ctype.lower() != HTTP_TYPE_JSON:
            raise AssertionError("%s is not %s" % (
                HTTP_HEADER_TYPE, HTTP_TYPE_JSON))
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_high_res_link"
    source_key = u"hasView"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     original_document_key, original_preview_key, data[u'id'])
        return body

    data[source_key] = {
        "@id": data[original_document_key][original_preview_key],
        "format": None
    }
    return json.dumps(data)
def oaimodstodpladigitalnc(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    For every CHO_TRANSFORMER path that exists in the input document, the
    transformer's result is merged into "sourceResource"; afterwards every
    existing AGGREGATION_TRANSFORMER path is merged into the top level of the
    output.  Top-level keys with falsy values are dropped from the output.

    Parameter "geoprop" specifies the property name containing lat/long
    coords; it is stored in the module-level GEOPROP.
    """
    global GEOPROP

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for path in CHO_TRANSFORMER:
        if not exists(data, path):
            continue
        transform = CHO_TRANSFORMER[path]
        out["sourceResource"].update(transform(data, path))

    for path in AGGREGATION_TRANSFORMER:
        if not exists(data, path):
            continue
        transform = AGGREGATION_TRANSFORMER[path]
        out.update(transform(data, path))

    # Keep only the top-level keys that carry a truthy value
    out = {k: v for k, v in out.items() if v}

    return json.dumps(out)
Beispiel #28
0
def enrichdate(body, ctype, action="enrich-format", prop="aggregatedCHO/date"):
    """
    Service that accepts a JSON document and extracts the "created date" of the item, using the
    following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates of a field, and sets that field to the earliest date

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    action -- not used by this function
    prop -- comma-separated list of field paths to process

    Each processed field is replaced by a single
    {"begin", "end", "displayDate"} dictionary.  Dates whose "begin" is None
    are ordered as if they began at DEFAULT_DATETIME_STR.  Fields that do not
    exist are left untouched.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    for p in prop.split(','):
        if not exists(data, p):
            # Nothing to do for this field.  Falling through to the write
            # below would store the candidates of a previously processed
            # field under this missing path.
            continue

        v = getprop(data, p)
        date_candidates = []
        for s in (v if not isinstance(v, basestring) else [v]):
            a, b = parse_date_or_range(s)
            date_candidates.append({
                "begin": a,
                "end": b,
                "displayDate": s
            })

        date_candidates.sort(key=lambda d: d["begin"] if d["begin"] is not None else DEFAULT_DATETIME_STR)
        if date_candidates:
            setprop(data, p, date_candidates[0])

    return json.dumps(data)
Beispiel #29
0
def ia_identify_object(body, ctype, download="True"):
    """
    Service that accepts a JSON document and sets its "object" field to the
    archive.org download URL built from originalRecord/_id and
    originalRecord/files/gif.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content; must equal HTTP_TYPE_JSON (case-insensitive)
    download -- the string "True" marks the object as PENDING, anything else
                as IGNORE (stored in admin/object_status)

    If either source field is missing (getprop raises KeyError), an error is
    logged and the body is returned unchanged.  A wrong content type or an
    unparsable body sets the response code to HTTP_INTERNAL_SERVER_ERROR and
    returns a plain-text "Bad JSON: ..." message.
    """

    try:
        # Raised explicitly instead of using an assert statement: asserts are
        # stripped when Python runs with -O, which would silently disable the
        # content-type check.  AssertionError is kept so that the error text
        # sent to the client does not change.
        if ctype.lower() != HTTP_TYPE_JSON:
            raise AssertionError("%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON))
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(getprop(data, "originalRecord/_id"), getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]", original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Beispiel #30
0
def ia_identify_object(body, ctype, download="True"):
    """
    Service that accepts a JSON document and sets its "object" field to the
    archive.org download URL built from originalRecord/_id and
    originalRecord/files/gif.

    admin/object_status is set to PENDING when download is the string "True"
    and to IGNORE otherwise.  If either source field is missing (getprop
    raises KeyError), an error is logged and the body is returned unchanged.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    gif_path = "originalRecord/files/gif"

    try:
        item_id = getprop(data, "originalRecord/_id")
        gif_name = getprop(data, gif_path)
        preview_url = "http://www.archive.org/download/{0}/{1}".format(
            item_id, gif_name)
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     gif_path, data[u"id"])
        return body

    data["object"] = preview_url

    status = PENDING if download == "True" else IGNORE

    # Reuse an existing "admin" section, create it otherwise.
    data.setdefault("admin", {})["object_status"] = status

    return json.dumps(data)
Beispiel #31
0
def setspectype(body, ctype, prop="sourceResource/type"):
    """
    Service that accepts a JSON document and sets the "sourceResource/specType"
    field of that document from the prop field

    Each value of the prop field is lower-cased and searched for the keywords
    below; every keyword found contributes its spec type once.  The
    specType field is only written when at least one spec type was found.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    prop -- the field whose value(s) are examined

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Ordered pairs rather than a dict: dict iteration order is arbitrary on
    # Python 2, which made the order of the resulting specType list vary.
    TYPE_TO_SPEC_TYPE = (
        ("book", "Book"),
        ("government", "Government Document"),
        ("periodical", "Serial"),
        ("nonmusic", "Nonmusic"),
        ("still image", "Photograph/Pictorial Works"),
        ("mixed material", "Mixed Material"),
    )

    if exists(data, prop):
        spec_type = []
        for s in iterify(getprop(data, prop)):
            lowered = s.lower()
            for keyword, label in TYPE_TO_SPEC_TYPE:
                if keyword in lowered and label not in spec_type:
                    spec_type.append(label)

        # Written once, after every value has been examined, instead of
        # being re-set on each loop iteration.
        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the field named by prop
    (sourceResource/creator by default) by removing, case-insensitively, any
    of the patterns in CLEANUP that the field value begins with

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content; must equal HTTP_TYPE_JSON
    prop -- the field to clean; may hold a single value or a list of values

    Returns the document serialised as JSON, or an error message (with the
    response code set to HTTP_INTERNAL_SERVER_ERROR) when the content type
    or the body is not acceptable.
    """

    try:
        if ctype.lower() != HTTP_TYPE_JSON:
            # Raised explicitly instead of using an assert statement: asserts
            # are stripped when Python runs with -O, which would silently
            # disable the content-type check.
            raise AssertionError("%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON))
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]

        cleaned = []
        for value in item:
            # Every CLEANUP entry is applied in turn, each to the result of
            # the previous one, anchored at the start of the value.
            for s in CLEANUP:
                value = re.sub(r"(?i)^{0}".format(s), "", value.strip()).lstrip()
            cleaned.append(value)

        # A single value is stored as a scalar, several values as a list.
        setprop(data, prop, cleaned[0] if len(cleaned) == 1 else cleaned)

    return json.dumps(data)
def mdlenrichlocation(body,ctype,action="mwdl_enrich_location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document.

    Every value of the field that is_spatial() accepts is replaced by the
    result of format_spatial(); all other values are dropped.  When no value
    remains the field is deleted from the document.

    For primary use with MWDL documents.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    action -- not read by this service
    prop -- the field to enrich

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        spatials = [format_spatial(spatial)
                    for spatial in iterify(getprop(data, prop))
                    if is_spatial(spatial)]

        if spatials:
            setprop(data, prop, spatials)
        else:
            delprop(data, prop)

    return json.dumps(data)
Beispiel #34
0
def geocode(body,ctype,prop=None,newprop=None):
    '''
    Service that accepts a JSON document and passes the value of the
    top-level field named by the "prop" parameter through lookup_place(),
    storing the result in the field named by "newprop" (or back in "prop"
    when "newprop" is not given)

    A string value is looked up as a whole; any other iterable value is
    looked up element by element and stored as a list.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    prop -- the top-level field holding the place(s) to look up
    newprop -- (optional) the field that receives the result

    Returns the document serialised as JSON (unchanged when "prop" is not a
    key of the document), or an error message (with the response code set to
    500) when the body is not valid JSON.
    '''

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop not in data:
        return json.dumps(data) # graceful abort

    if not newprop:
        newprop = prop

    value = data[prop]
    # str and unicode on Python 2, str on Python 3.  Python 3 strings define
    # __iter__, so without this check a single place name would be looked up
    # one character at a time.
    string_types = (type(""), type(u""))
    if hasattr(value, '__iter__') and not isinstance(value, string_types):
        data[newprop] = [lookup_place(place) for place in value]
    else:
        data[newprop] = lookup_place(value)

    return json.dumps(data)
def get_isostate(strg, abbrev=None):
    """Look up the STATES entries mentioned in a semicolon-delimited string.

    Each ";"-separated part of strg (first expanded through from_abbrev()
    when abbrev is truthy) is upper-cased and searched for every key of
    STATES.  For each key found, its STATES value and its title-cased key are
    collected; a part without any match contributes a pair of empty strings
    so both results keep one slot per part.

    NOTE(review): STATES presumably maps upper-case state names to ISO codes
    -- confirm against its definition.

    Keyword arguments:
    strg -- the string to examine
    abbrev -- truthy to expand each part with from_abbrev() first

    Returns a tuple (iso, state) of ";"-joined strings, or (None, None) when
    nothing matched.  When strg is not a string the response code is set to
    500 and an error message string is returned instead of a tuple.
    """
    if not isinstance(strg, basestring):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Non-string parameter supplied to get_isostate"

    iso_arr, state_arr = [], []
    for s in strg.split(";"):
        states = from_abbrev(s) if abbrev else s
        for state in (states if isinstance(states, list) else [states]):
            upper_state = state.upper()
            matched = False
            for st in STATES:
                if st in upper_state:
                    iso_arr.append(STATES[st])
                    state_arr.append(st.title())
                    matched = True
            if not matched:
                iso_arr.append("")
                state_arr.append("")

    # any() instead of filter(None, ...): on Python 3 filter() returns a lazy
    # object that is always truthy, so all-empty results were reported as a
    # match.  On Python 2 both forms give the same answer.
    if any(iso_arr):
        return (';'.join(iso_arr), ';'.join(state_arr))
    return (None, None)
Beispiel #36
0
def set_prop(body,
             ctype,
             prop=None,
             value=None,
             condition_prop=None,
             condition_value=None):
    """Sets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to; a falsy value is rejected and logged
    condition_prop -- (optional) the field that must exist to set the prop
    condition_value -- accepted for interface compatibility; this
                       implementation does not read it

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.

    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not value:
        logger.error("No value was supplied to set_prop.")
    elif not condition_prop or exists(data, condition_prop):
        # If there is no condition_prop, set the prop, creating it if it does
        # not exist. If there is a condition_prop, only set the prop if the
        # condition_prop exists.
        setprop(data, prop, value)

    return json.dumps(data)
Beispiel #37
0
def nypl_identify_object(body, ctype, list_sets=None):
    """Replace a collection UUID used as the document title with the collection title.

    The XML document at the list_sets URL is fetched and parsed, and the
    entries under nyplAPI/response/collection are examined.  For every entry
    that has both a "uuid" and a "title", the document title is replaced by
    the entry's title when the uuid equals the current document title or
    occurs in the document's "@id".

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    list_sets -- the URL of the collection listing

    Returns the document serialised as JSON, the unmodified body when no URL
    was supplied or the request did not succeed, or an error message (with
    the response code set to 500) when the body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not list_sets:
        # The parameter defaults to None; there is nothing to request then.
        logger.error("No list_sets URL was supplied to nypl_identify_object")
        return body

    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(list_sets)
    if not resp[u'status'].startswith('2'):
        logger.error('  HTTP error (' + resp[u'status'] + ') resolving URL: ' + list_sets)
        return body
    content_dict = xmltodict.parse(content, xml_attribs=True, attr_prefix='', force_cdata=False, ignore_whitespace_cdata=True)
    sets = content_dict["nyplAPI"]["response"]

    collections = sets.get("collection") if isinstance(sets, dict) else None
    if isinstance(collections, dict):
        # xmltodict yields a bare dict, not a one-element list, when the
        # parent has a single <collection> child; iterating that dict would
        # walk its keys instead of the collection itself.
        collections = [collections]

    for coll_dict in collections or []:
        if not isinstance(coll_dict, dict):
            continue
        if "uuid" not in coll_dict or "title" not in coll_dict:
            continue
        uuid = coll_dict["uuid"]
        # Membership is checked first so a document without "title" or "@id"
        # no longer raises KeyError.
        matches_title = "title" in data and uuid == data["title"]
        matches_id = "@id" in data and uuid in data["@id"]
        if matches_title or matches_id:
            data["title"] = coll_dict["title"]

    return json.dumps(data)
def ia_identify_object(body, ctype, download="True"):
    """Attach an Internet Archive preview URL and an object status to a document.

    The preview URL is built from the values found at "originalRecord/_id"
    and "originalRecord/files/gif" and stored under "object".  The status
    written to "admin/object_status" is PENDING when ``download`` is the
    string "True" and IGNORE otherwise.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    download -- the string "True" to mark the object as PENDING

    Returns the modified document serialised as JSON, the unmodified body
    when the preview URL cannot be built, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        record_id = getprop(data, "originalRecord/_id")
        preview_file = getprop(data, original_preview_key)
        preview_url = preview_format.format(record_id, preview_file)
    except KeyError:
        # .get() instead of data["id"]: a document without an id must not
        # raise a second KeyError from inside this handler.
        logger.error("Can not build preview url by path \"%s\" for doc [%s]", original_preview_key, data.get(u"id"))
        return body

    data["object"] = preview_url
    status = PENDING if download == "True" else IGNORE

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Beispiel #39
0
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the field named by prop
    (sourceResource/creator by default) by removing, case-insensitively, any
    of the patterns in CLEANUP that the field value begins with

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content; must equal HTTP_TYPE_JSON
    prop -- the field to clean; may hold a single value or a list of values

    Returns the document serialised as JSON, or an error message (with the
    response code set to HTTP_INTERNAL_SERVER_ERROR) when the content type
    or the body is not acceptable.
    """

    try:
        if ctype.lower() != HTTP_TYPE_JSON:
            # Raised explicitly instead of using an assert statement: asserts
            # are stripped when Python runs with -O, which would silently
            # disable the content-type check.
            raise AssertionError("%s is not %s" % (
                HTTP_HEADER_TYPE, HTTP_TYPE_JSON))
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]

        cleaned = []
        for value in item:
            # Every CLEANUP entry is applied in turn, each to the result of
            # the previous one, anchored at the start of the value.
            for s in CLEANUP:
                value = re.sub(r"(?i)^{0}".format(s), "",
                               value.strip()).lstrip()
            cleaned.append(value)

        # A single value is stored as a scalar, several values as a list.
        setprop(data, prop, cleaned[0] if len(cleaned) == 1 else cleaned)

    return json.dumps(data)
Beispiel #40
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that
    document by removing duplicate array elements

    Two elements are duplicates when _stripped() returns the same value for
    both; the first occurrence is kept and the original order is preserved.
    Elements for which _stripped() returns a falsy value are dropped.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    action -- not read by this service
    prop -- comma-delimited names of the fields to de-duplicate; when it is
            not given the document is returned as parsed

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    '''

    # Parsed unconditionally: previously the document was only parsed when
    # prop was given, so the final json.dumps(data) raised
    # UnboundLocalError for a missing prop.
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop:
        for p in prop.split(","):
            if exists(data, p):
                v = getprop(data, p)
                if isinstance(v, list):
                    # Walk the original list so the kept elements are taken
                    # from the right positions; indexing a filtered copy
                    # shifted every position after a dropped element.
                    seen = set()
                    unique = []
                    for s in v:
                        key = _stripped(s)
                        if key and key not in seen:
                            seen.add(key)
                            unique.append(s)
                    setprop(data, p, unique)

    return json.dumps(data)
Beispiel #41
0
def enrich_language(body, ctype, action="enrich_language", prop="sourceResource/language"):
    '''
    Service that accepts a JSON document and enriches the "language" field of that document
    by:

    a) converting a single language string into a dictionary: {"name": language}
    b) converting a list of language values into list of dictionaries: {"name": language}

    Values of any other type are left untouched.

    By default it works on the 'language' field, but can be overridden by passing the name of the field to use
    as a parameter

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    '''

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        langs = getprop(data, prop)

        if isinstance(langs, basestring):
            setprop(data, prop, {"name": langs})
        elif isinstance(langs, list):
            setprop(data, prop, [{"name": lang} for lang in langs])

    return json.dumps(data)
def set_ucldc_dataprovider(body, ctype):
    '''For ucldc, we always have a originalRecord/collection entry.
    This has a repository object which may or may not have a list of
    campuses.
    Concatenate the repo & campus if existing, separated by a ,
    for dataProvider value

    The same value is stored as provider/name, the collection's "@id" as
    provider/@id, and sourceResource/stateLocatedIn is set to California.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    '''
    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Only the first collection and its first repository are used.
    collection = getprop(data, 'originalRecord/collection')[0]
    repo = collection['repository'][0]

    # The campus list is optional: .get() covers a repository without a
    # "campus" key as well as one with an empty list.
    campuses = repo.get('campus')
    campus = campuses[0] if campuses else None

    dataProvider = repo['name']
    if campus:
        dataProvider = ', '.join((campus['name'], repo['name']))

    setprop(data, 'dataProvider', dataProvider)
    data['provider'] = {}
    setprop(data, 'provider/name', dataProvider)
    setprop(data, 'provider/@id', collection['@id'])
    # setdefault so a document that has no sourceResource yet does not raise
    # KeyError here.
    data.setdefault('sourceResource', {})['stateLocatedIn'] = [{'name': 'California'}]
    return json.dumps(data)
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that rewrites the "dataProvider" field of a JSON document.

    For the chs set (originalRecord/setSpec equal to p15799coll65) the field
    becomes the first element of its current value; for every other set it
    becomes "University of Southern California. Libraries".

    The prop parameter is accepted but not read; the field name is fixed.

    For primary use with USC documents
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # NOTE(review): the third getprop argument presumably makes a missing
    # field yield None instead of raising -- confirm against getprop.
    current_provider = getprop(data, "dataProvider", True)
    set_spec = getprop(data, "originalRecord/setSpec")

    new_provider = (current_provider[0]
                    if set_spec == "p15799coll65"
                    else "University of Southern California. Libraries")
    setprop(data, "dataProvider", new_provider)

    return json.dumps(data)
def mwdlenrichstatelocatedin(body,
                             ctype,
                             action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by:

    a) splitting the value on ";"
    b) replacing every part that has a (truthy) entry in STATE_CODES with
       that entry, leaving other parts as they are
    c) joining the parts again with "; "

    For primary use with MWDL documents.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    action -- not read by this service
    prop -- the field to enrich; its value is split as a string

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # One lookup per part; a missing or falsy mapping keeps the part.
        sli = [STATE_CODES.get(v) or v for v in values.split(";")]
        setprop(data, prop, "; ".join(sli))

    return json.dumps(data)
Beispiel #45
0
def shred(body,ctype,action="shred",prop=None,delim=';'):
    '''
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This requires
    that the fields share a common delimiter however.

    With action "shred" a list value is first joined with delim; the value is
    then split on delim into a list of stripped parts (values that do not
    contain delim are left as they are).  With action "unshred" a list value
    is joined with delim.  Any other action leaves the document untouched.

    Returns the document serialised as JSON (unchanged when no "prop" is
    given), or an error message (with the response code set to 500) when the
    body is not valid JSON.
    '''

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not prop:
        # prop defaults to None; prop.split() used to raise AttributeError.
        return json.dumps(data)

    for p in prop.split(','):
        if not exists(data, p):
            continue
        v = getprop(data, p)
        if action == "shred":
            if isinstance(v, list):
                # Joined first so elements that themselves contain delim are
                # split as well.
                v = delim.join(v)
                setprop(data, p, v)
            if delim not in v:
                continue
            setprop(data, p, [s.strip() for s in v.split(delim)])
        elif action == "unshred":
            if isinstance(v, list):
                setprop(data, p, delim.join(v))

    return json.dumps(data)
Beispiel #46
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) Removing duplicates

    Two list elements are duplicates when they are equal after removing
    spaces, periods and parentheses and lower-casing.  The first occurrence
    is kept and the original order is preserved.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    action -- not read by this service
    prop -- comma-delimited names of the fields to de-duplicate (required)

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when prop is None or the body is not valid
    JSON.
    '''

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # A single ordered pass: the kept elements used to be
                # collected through a set of indices, whose iteration order
                # is not guaranteed to follow the original list.
                seen = set()
                unique = []
                for s in v:
                    # Remove whitespace, periods, parens
                    key = re.sub(r"[ \.\(\)]", "", s).lower()
                    if key not in seen:
                        seen.add(key)
                        unique.append(s)

                setprop(data, p, unique)

    return json.dumps(data)
def scdl_enrich_location(body,
                         ctype,
                         action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document.

    For every value of the field, the "name" is right-stripped and passed
    through replace_state_abbreviations(); then either a "county" or a
    "city" is extracted from it:

    "XXX County (S.C.)" => county: XXX
    "XXX (S.C.)"        => city: XXX

    For use with the scdl profiles

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    action -- not read by this service
    prop -- the field to enrich; its values are indexed by "name"

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        for v in iterify(value):
            name = replace_state_abbreviations(v["name"].rstrip())
            v["name"] = name

            # Try to extract a County
            lowered = name.lower()
            if " county " in lowered:
                # "XXX County (S.C.)" => county: XXX
                v["county"] = name[0:lowered.index(" county")]
            elif "(S.C.)" in name:
                # "XXX (S.C.)" => city: XXX
                # The marker is located with the same text the test above
                # uses; searching for " (S.C.)" (with a leading space)
                # raised ValueError when the marker had no space before it.
                v["city"] = name[0:name.index("(S.C.)")].rstrip()

    return json.dumps(data)
Beispiel #48
0
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and overwrites its "dataProvider"
    field:

    1. For the chs set (originalRecord/setSpec is p15799coll65) with the
       first element of the field's current value
    2. For all other sets with the string
       "University of Southern California. Libraries"

    The prop parameter is accepted but not read; the field name is fixed.

    For primary use with USC documents
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # NOTE(review): the third getprop argument presumably makes a missing
    # field yield None instead of raising -- confirm against getprop.
    existing = getprop(data, "dataProvider", True)
    belongs_to_chs = getprop(data, "originalRecord/setSpec") == "p15799coll65"

    if belongs_to_chs:
        replacement = existing[0]
    else:
        replacement = "University of Southern California. Libraries"
    setprop(data, "dataProvider", replacement)

    return json.dumps(data)
def georgiasetspectype(body, ctype):
    """
    Service that accepts a JSON document and sets the "sourceResource/specType"
    field of that document from the "sourceResource/type" field

    Each type value is lower-cased and searched for the keywords below;
    every keyword found contributes its spec type once.  The specType field
    is only written when at least one spec type was found.

    For primary use with DLG documents

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Ordered pairs rather than a dict: dict iteration order is arbitrary on
    # Python 2, which made the order of the resulting specType list vary.
    TYPE_TO_SPEC_TYPE = (
        ("books", "Book"),
        ("government", "Government Document"),
        ("periodicals", "Serial"),
    )

    # Named "types" so the builtin type() is not shadowed.
    # NOTE(review): the third getprop argument presumably makes a missing
    # field yield None instead of raising -- confirm against getprop.
    types = getprop(data, "sourceResource/type", True)
    if types:
        spec_type = []
        for s in iterify(types):
            lowered = s.lower()
            for keyword, label in TYPE_TO_SPEC_TYPE:
                if keyword in lowered and label not in spec_type:
                    spec_type.append(label)

        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the field named by prop
    (sourceResource/creator by default) by removing, case-insensitively, any
    of the patterns in CLEANUP that the field value begins with

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    prop -- the field to clean; may hold a single value or a list of values

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]

        cleaned = []
        for value in item:
            # Every CLEANUP entry is applied in turn, each to the result of
            # the previous one, anchored at the start of the value.
            for s in CLEANUP:
                value = re.sub(r"(?i)^{0}".format(s), "", value.strip()).lstrip()
            cleaned.append(value)

        # A single value is stored as a scalar, several values as a list.
        setprop(data, prop, cleaned[0] if len(cleaned) == 1 else cleaned)

    return json.dumps(data)
Beispiel #51
0
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replaces a substring in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replace old with

    Both old and new must be non-empty; otherwise an error is logged and the
    document is returned as parsed.  The value of prop is replaced through
    its own replace() method.

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.

    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a JSON problem.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not old or not new:
        logger.error("No old or new parameters were provided")
    elif exists(data, prop):
        v = getprop(data, prop)
        setprop(data, prop, v.replace(old, new))

    return json.dumps(data)
Beispiel #52
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) Removing duplicates

    Two list elements are duplicates when they are equal after removing
    spaces, periods, parentheses, brackets and braces and lower-casing.  The
    first occurrence is kept and the original order is preserved.

    Keyword arguments:
    body -- the JSON document to load
    ctype -- the type of content (not inspected here)
    action -- not read by this service
    prop -- comma-delimited names of the fields to de-duplicate; when it is
            not given the document is returned as parsed

    Returns the document serialised as JSON, or an error message (with the
    response code set to 500) when the body is not valid JSON.
    '''

    # Parsed exactly once (the document used to be parsed twice when prop
    # was given).
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not prop:
        # prop defaults to None; prop.split() used to raise AttributeError.
        return json.dumps(data)

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # A single ordered pass: the kept elements used to be
                # collected through a set of indices, whose iteration order
                # is not guaranteed to follow the original list.
                seen = set()
                unique = []
                for s in v:
                    # Remove whitespace, periods, parens, brackets
                    key = re.sub(r"[ \.\(\)\[\]\{\}]", "", s).lower()
                    if key not in seen:
                        seen.add(key)
                        unique.append(s)

                setprop(data, p, unique)

    return json.dumps(data)
def get_isostate(strg, abbrev=None):
    """Look up the STATES entries mentioned in a semicolon-delimited string.

    Each ";"-separated part of strg (first expanded through from_abbrev()
    when abbrev is truthy) is upper-cased and searched for every key of
    STATES.  For each key found, its STATES value and its title-cased key are
    collected; a part without any match contributes a pair of empty strings
    so both results keep one slot per part.

    NOTE(review): STATES presumably maps upper-case state names to ISO codes
    -- confirm against its definition.

    Keyword arguments:
    strg -- the string to examine
    abbrev -- truthy to expand each part with from_abbrev() first

    Returns a tuple (iso, state) of ";"-joined strings, or (None, None) when
    nothing matched.  When strg is not a string the response code is set to
    500 and an error message string is returned instead of a tuple.
    """
    if not isinstance(strg, basestring):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Non-string parameter supplied to get_isostate"

    iso_arr, state_arr = [], []
    for s in strg.split(";"):
        states = from_abbrev(s) if abbrev else s
        for state in (states if isinstance(states, list) else [states]):
            upper_state = state.upper()
            matched = False
            for st in STATES:
                if st in upper_state:
                    iso_arr.append(STATES[st])
                    state_arr.append(st.title())
                    matched = True
            if not matched:
                iso_arr.append("")
                state_arr.append("")

    # any() instead of filter(None, ...): on Python 3 filter() returns a lazy
    # object that is always truthy, so all-empty results were reported as a
    # match.  On Python 2 both forms give the same answer.
    if any(iso_arr):
        return (';'.join(iso_arr), ';'.join(state_arr))
    return (None, None)