def sfpl_marc_id(body, ctype):
    """Assign Couch ``_id`` and md5 ``id`` to an SFPL MARC record.

    Scans MARC field 010 for subfield 'a' and uses it as the record
    identifier (the last matching subfield wins). Returns the updated
    record as JSON, or a 500 plain-text error when the body cannot be
    parsed or no identifier is found.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '010' in field:
            subfields = field['010']['subfields']
            for subf in subfields:
                if 'a' in subf:
                    ident = subf['a']
    if not ident:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    return json.dumps(data)
def enrich_temporal_date(body, ctype, prop="aggregatedCHO/temporal",
                         date_key="name"):
    """
    Service that accepts a JSON document and extracts the "created date" of
    the item, using the following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates, and sets the created date to the earliest date
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    date_candidates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            for s in v:
                a, b = parse_date_or_range(s[date_key])
                date_candidates.append({"begin": a,
                                        "end": b,
                                        "displayDate": s[date_key]})
            # NOTE(review): date_candidates is never reset between props,
            # so with a multi-valued ``prop`` later fields also receive the
            # earlier fields' candidates — confirm whether that is intended
            # (the sibling enrichdate service resets per prop).
            if date_candidates:
                setprop(data, p, date_candidates)
    return json.dumps(data)
def capitalize_value(body, ctype, prop=",".join(DEFAULT_PROP), exclude=None):
    """
    Service that accepts a JSON document and capitalizes the prop field of
    that document.

    ``prop`` is a comma-separated list of field paths; ``exclude`` names one
    path to skip.
    """
    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    paths = prop.split(",")
    # Drop the excluded path, if it is present at all.
    if exclude in paths:
        paths.remove(exclude)

    for path in paths:
        if path:
            capitalize(data, path)

    return json.dumps(data)
def scdl_enrich_location(body, ctype, action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document. For use with the scdl profiles.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        for v in iterify(value):
            name = replace_state_abbreviations(v["name"].rstrip())
            v["name"] = name
            # Try to extract a County
            if " county " in name.lower():
                # "XXX County (S.C.)" => county: XXX
                # The guard requires " county " with surrounding spaces but
                # the slice cuts at the first "county" occurrence.
                v["county"] = name[0:name.lower().index("county")].strip()
            elif "(S.C.)" in name:
                # "XXX (S.C)" => city: XXX
                v["city"] = name[0:name.index("(S.C.)")].strip()

    return json.dumps(data)
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replaces a substring in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replaced old with (may be "" to delete ``old``)
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # BUGFIX: the original tested "not new", which rejected new="" and made
    # deleting a substring impossible; only a missing/empty ``old`` is
    # actually invalid.
    if not old or new is None:
        logger.error("No old or new parameters were provided")
    else:
        if exists(data, prop):
            v = getprop(data, prop)
            setprop(data, prop, v.replace(old, new))

    return json.dumps(data)
def decode_html(body, ctype, prop=None):
    """Decodes any encoded html in the prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to decode
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # BUGFIX: the pairs had degenerated into identity replacements
    # (e.g. ('"', '"')), making the whole service a no-op — the patterns
    # were evidently HTML-unescaped at some point. Restored to the entity
    # forms implied by the function's purpose.
    REGEX = ('&quot;', '"'), ('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')

    if prop and exists(data, prop):
        decoded = []
        v = getprop(data, prop)
        if not isinstance(v, list):
            v = [v]
        for s in v:
            if isinstance(s, basestring):
                for p, r in REGEX:
                    s = re.sub(p, r, s)
            decoded.append(s)
        setprop(data, prop, decoded)

    return json.dumps(data)
def ucsb_aleph_marc_id(body, ctype):
    """Assign Couch ``_id`` and md5 ``id`` to a UCSB Aleph MARC record.

    Scans MARC field 856 for subfield 'u' containing an OBJID-style URL
    and uses it as the record identifier (the last match wins). Returns
    the updated record as JSON, or a 500 plain-text error when the body
    cannot be parsed or no identifier is found.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '856' in field:
            subfields = field['856']['subfields']
            for subf in subfields:
                if 'u' in subf:
                    # restrict to ones that have url like
                    # http://www.library.ucsb.edu/OBJID/Cylinder0002
                    if 'OBJID' in subf['u']:
                        ident = subf['u']
    if not ident:
        logger.error('NO 856 u for doc leader:{}'.format(data['leader']))
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing the values in CLEANUP if the
    field value begins with them.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            # Strip each unwanted prefix (case-insensitively) in turn.
            for s in CLEANUP:
                item[i] = re.sub(r"(?i)^{0}".format(s), "",
                                 item[i].strip()).lstrip()
        # Collapse a single-element list back to a scalar.
        setprop(data, prop, item[0] if len(item) == 1 else item)

    return json.dumps(data)
def replace_regex(body, ctype, prop=None, regex=None, new=None):
    """Replaces a regex in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    regex -- the regex to replace
    new -- the substring to replaced regex with
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not regex:
        logger.error("No regex parameter supplied")
    else:
        if not new:
            logger.debug("NO New parameter, will replace with empty string")
            new = ''
        if exists(data, prop):
            v = getprop(data, prop)
            new_val = replace_regex_recurse_field(v, regex, new)
            setprop(data, prop, new_val)

    return json.dumps(data)
def oaitodpla(body, ctype, geoprop=None):
    '''
    Convert output of Freemix OAI service into the DPLA JSON-LD format.

    Does not currently require any enrichments to be ahead in the pipeline,
    but supports geocoding if used. In the future, subject shredding may be
    assumed too.

    Parameter "geoprop" specifies the property name containing lat/long
    coords.
    '''
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document to sourceResource
    for p in data.keys():
        if p in CHO_TRANSFORMER:
            out['sourceResource'].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def setcontext(body, ctype, prop="@context"):
    """
    Service that accepts a JSON document and sets the "@context" field of
    that document (plus @type, and aggregatedCHO/sourceResource linkage for
    items).
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if data["ingestType"] == "item":
        data.update({
            "@context": "http://dp.la/api/items/context",
            "aggregatedCHO": "#sourceResource",
            "@type": "ore:Aggregation"
        })
        setprop(data, "sourceResource/@id",
                "%s#sourceResource" % data["@id"])
    else:
        data.update({
            "@context": "http://dp.la/api/collections/context",
            "@type": "dcmitype:Collection"
        })

    return json.dumps(data)
def nypl_select_hasview(body, ctype):
    """Copy the NYPL high-res link into a ``hasView`` structure.

    Reads originalRecord/tmp_high_res_link and stores it as
    hasView/@id (format is left as None). Returns the body unchanged
    when the expected keys are missing.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_high_res_link"
    source_key = u"hasView"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     original_document_key, original_preview_key,
                     data[u'id'])
        return body

    data[source_key] = {
        "@id": data[original_document_key][original_preview_key],
        "format": None
    }

    return json.dumps(data)
def kentucky_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.

    The thumbnail URL is derived from sourceResource/relation by inserting
    "_tb" before the file extension.
    """
    # Removed the dead "data = {}" pre-initialization: every path either
    # assigns data from json.loads or returns before using it.
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    relation_field = "sourceResource/relation"
    if exists(data, relation_field):
        url = getprop(data, relation_field)
    else:
        logger.debug("Field %s does not exist" % relation_field)
        return body

    base_url, ext = os.path.splitext(url)
    data["object"] = "%s_tb%s" % (base_url, ext)

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    # Raw strings: the originals relied on "\d" surviving as a literal
    # backslash, which is a DeprecationWarning/SyntaxWarning in newer
    # Pythons.
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
    ]

    def cleanup(s):
        # Strip parens, periods and question marks, then whitespace.
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move only if the whole cleaned value is a single date.
                # BUGFIX: escape the matched text — it was previously fed
                # back to re.sub as a pattern, so regex metacharacters in
                # the match could raise or mis-match.
                if len(m) == 1 and not re.sub(re.escape(m[0]), "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def digital_commonwealth_enrich_location(
        body, ctype, action="digital_commonwealth_enrich_location",
        prop="sourceResource/spatial"):
    """
    Service that massages a Digital Commonwealth JSON document.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Strings which are present in the spatial field, which do end up
    # being geocoded, but are not locations
    NON_SPATIALS = [
        "Aerial views.",
        "Church history.",
        "Dwellings",
        "Dwellings.",
        "History",
        "Pictorial works"
    ]

    if exists(data, prop):
        # Spatial field is simply a list of strings, convert to a list
        # of dictionaries with the name key set to the string value
        spatials = []
        for spatial in iterify(getprop(data, prop)):
            if (isinstance(spatial, basestring)
                    and spatial not in NON_SPATIALS):
                spatials.append({"name": format_spatial(spatial)})
        setprop(data, prop, spatials)

    return json.dumps(data)
def setspectype(body, ctype, prop="sourceResource/type"):
    """
    Service that accepts a JSON document and sets the
    "sourceResource/specType" field of that document from the prop field.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "book": "Book",
        "government": "Government Document",
        "periodical": "Serial",
        "nonmusic": "Nonmusic",
        "still image": "Photograph/Pictorial Works",
        "mixed material": "Mixed Material"
    }

    if exists(data, prop):
        spec_type = []
        for type_value in iterify(getprop(data, prop)):
            lowered = type_value.lower()
            # Collect every mapped specType whose key appears in the
            # value, without duplicates.
            for key, mapped in TYPE_TO_SPEC_TYPE.items():
                if key in lowered and mapped not in spec_type:
                    spec_type.append(mapped)
        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def mwdlenrichstatelocatedin(body, ctype,
                             action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by mapping state codes to state names.

    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sli = []
        values = getprop(data, prop)
        for v in values.split(";"):
            # Translate known state codes; pass other values through.
            if STATE_CODES.get(v):
                sli.append(STATE_CODES[v])
            else:
                sli.append(v)
        setprop(data, prop, "; ".join(sli))

    return json.dumps(data)
def uscenrichlocation(body, ctype, action="usc_enrich_location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field
    of that document by:

    1. If one of the spatial values is a lat/lon coordinate, removing all
       other values
    2. Removing 1-3 digit numbers and values that contain "s.d"

    For primary use with USC documents.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        spatial = getprop(data, prop)
        coordinates = find_coordinates(spatial)
        if coordinates:
            # A coordinate pair replaces everything else.
            spatial = [{"name": "%s, %s" % coordinates}]
        else:
            spatial = clean(spatial)
            spatial = join_values(spatial)
        setprop(data, prop, spatial)

    return json.dumps(data)
def get_isostate(strg, frm_abbrev=None):
    """Map a semicolon-separated string of state names (or abbreviations,
    when ``frm_abbrev`` is truthy) to a pair of semicolon-joined
    (iso_codes, state_names); each is None when nothing matched.
    """
    if not isinstance(strg, basestring):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Non-string parameter supplied to get_isostate"

    iso_codes = []
    names = []
    for piece in strg.split(';'):
        candidates = from_abbrev(piece) if frm_abbrev else [piece]
        for candidate in candidates:
            upper = candidate.upper()
            for st in STATES:
                if st in upper:
                    iso_codes.append(STATES[st])
                    names.append(st.title())

    iso = ';'.join(iso_codes) if iso_codes else None
    state = ';'.join(names) if names else None
    return (iso, state)
def david_rumsey_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if exists(data, handle_field):
        handle = getprop(data, handle_field)
    else:
        logger.error("Field %s does not exist" % handle_field)
        return body

    # NOTE(review): assumes handle is a sequence with at least two
    # elements and that element [1] is the thumbnail URL — confirm
    # against the provider's record format.
    data["object"] = handle[1]

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Raw strings: the originals relied on "\d" surviving as a literal
    # backslash, which is a DeprecationWarning/SyntaxWarning in newer
    # Pythons.
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
    ]

    def cleanup(s):
        # Strip parens, periods and question marks, then whitespace.
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move only if the whole cleaned value is a single date.
                # BUGFIX: escape the matched text — it was previously fed
                # back to re.sub as a pattern, so regex metacharacters in
                # the match could raise or mis-match.
                if len(m) == 1 and not re.sub(re.escape(m[0]), "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def digital_commonwealth_enrich_location(
        body, ctype, action="digital_commonwealth_enrich_location",
        prop="sourceResource/spatial"):
    """
    Service that massages a Digital Commonwealth JSON document.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Strings which are present in the spatial field, which do end up
    # being geocoded, but are not locations
    NON_SPATIALS = ["Aerial views.",
                    "Church history.",
                    "Dwellings",
                    "Dwellings.",
                    "History",
                    "Pictorial works"]

    if exists(data, prop):
        # Spatial field is simply a list of strings, convert to a list
        # of dictionaries with the name key set to the string value
        spatials = []
        for spatial in iterify(getprop(data, prop)):
            if (isinstance(spatial, basestring)
                    and spatial not in NON_SPATIALS):
                spatials.append({"name": format_spatial(spatial)})
        setprop(data, prop, spatials)

    return json.dumps(data)
def nypl_identify_object(body, ctype, download="True"):
    """Set the thumbnail ("object") URL for an NYPL record.

    Builds the preview URL from originalRecord/tmp_image_id and records
    the download status under admin/object_status. Returns the body
    unchanged when the expected keys are missing.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_image_id"
    preview_format = "http://images.nypl.org/index.php?id={0}&t=t"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     original_document_key, original_preview_key,
                     data[u'id'])
        return body

    preview_url = preview_format.format(
        data[original_document_key][original_preview_key])
    data["object"] = preview_url

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def set_prop(body, ctype, prop=None, value=None, condition_prop=None,
             condition_value=None):
    """Sets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to
    condition_prop -- (optional) the field that must exist to set the prop
    condition_value -- currently unused; kept for caller compatibility

    Note: falsy values ("", 0, False) are treated as missing and logged
    as an error, matching the original behavior.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not value:
        logger.error("No value was supplied to set_prop.")
    else:
        # If there is no condition_prop, set the prop, creating it if it
        # does not exist. If there is a condition_prop, only set the prop
        # if the condition_prop exists.
        if not condition_prop or exists(data, condition_prop):
            setprop(data, prop, value)

    return json.dumps(data)
def nypl_select_hasview(body, ctype):
    """Copy the NYPL high-res link into a ``hasView`` structure.

    Validates the content type, then reads
    originalRecord/tmp_high_res_link into hasView/@id (format left as
    None). Returns the body unchanged when the expected keys are
    missing.
    """
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    record_key = u"originalRecord"
    preview_key = u"tmp_high_res_link"

    if record_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     record_key, data[u'id'])
        return body
    if preview_key not in data[record_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     record_key, preview_key, data[u'id'])
        return body

    data[u"hasView"] = {"@id": data[record_key][preview_key],
                        "format": None}
    return json.dumps(data)
def oaimodstodpladigitalnc(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def enrichdate(body, ctype, action="enrich-format",
               prop="aggregatedCHO/date"):
    """
    Service that accepts a JSON document and extracts the "created date" of
    the item, using the following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates, and sets the created date to the earliest date
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    date_candidates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            # Reset per prop so each field is judged on its own values.
            date_candidates = []
            for s in (v if not isinstance(v, basestring) else [v]):
                a, b = parse_date_or_range(s)
                date_candidates.append({"begin": a,
                                        "end": b,
                                        "displayDate": s})
            # Sort by begin date; undated entries sort to the end.
            date_candidates.sort(
                key=lambda d: d["begin"] if d["begin"] is not None
                else DEFAULT_DATETIME_STR)
            if date_candidates:
                # Keep only the earliest candidate.
                setprop(data, p, date_candidates[0])

    return json.dumps(data)
def ia_identify_object(body, ctype, download="True"):
    """Set the thumbnail ("object") URL for an Internet Archive record.

    Validates the content type, builds the preview URL from the record
    id and its gif file, and records the download status under
    admin/object_status.
    """
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    gif_path = "originalRecord/files/gif"
    url_template = "http://www.archive.org/download/{0}/{1}"
    try:
        data["object"] = url_template.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, gif_path))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     gif_path, data[u"id"])
        return body

    status = PENDING if download == "True" else IGNORE
    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def ia_identify_object(body, ctype, download="True"):
    """Set the thumbnail ("object") URL for an Internet Archive record.

    Builds the preview URL from the record id and its gif file, and
    records the download status under admin/object_status. Returns the
    body unchanged when the needed keys are missing.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def setspectype(body, ctype, prop="sourceResource/type"):
    """
    Service that accepts a JSON document and sets the
    "sourceResource/specType" field of that document from the prop field.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "book": "Book",
        "government": "Government Document",
        "periodical": "Serial",
        "nonmusic": "Nonmusic",
        "still image": "Photograph/Pictorial Works",
        "mixed material": "Mixed Material"
    }

    if exists(data, prop):
        spec_type = []
        for value in iterify(getprop(data, prop)):
            # Every mapped specType whose key occurs in this value.
            hits = [mapped for key, mapped in TYPE_TO_SPEC_TYPE.items()
                    if key in value.lower()]
            for mapped in hits:
                if mapped not in spec_type:
                    spec_type.append(mapped)
        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing the values in CLEANUP if the
    field value begins with them.
    """
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    if exists(data, prop):
        creators = getprop(data, prop)
        if not isinstance(creators, list):
            creators = [creators]
        for idx, creator in enumerate(creators):
            # Strip each unwanted prefix (case-insensitively) in turn.
            cleaned = creator
            for prefix in CLEANUP:
                cleaned = re.sub(r"(?i)^{0}".format(prefix), "",
                                 cleaned.strip()).lstrip()
            creators[idx] = cleaned
        # Collapse a single-element list back to a scalar.
        setprop(data, prop,
                creators[0] if len(creators) == 1 else creators)

    return json.dumps(data)
def mdlenrichlocation(body, ctype, action="mwdl_enrich_location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field
    of that document. For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        spatials = []
        for spatial in iterify(getprop(data, prop)):
            if is_spatial(spatial):
                spatials.append(format_spatial(spatial))

        # Drop the field entirely when nothing survived the filter.
        if len(spatials) > 0:
            setprop(data, prop, spatials)
        else:
            delprop(data, prop)

    return json.dumps(data)
def geocode(body, ctype, prop=None, newprop=None):
    '''
    Service that accepts a JSON document and "unshreds" the value of the
    field named by the "prop" parameter, geocoding each place via
    lookup_place. Results are written to "newprop" (defaults to "prop").
    '''
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop not in data:
        return json.dumps(data)  # graceful abort

    if not newprop:
        newprop = prop

    # Handle strings and iterables. NOTE: under Python 2, str/unicode do
    # not have __iter__, so a plain string takes the scalar branch.
    if hasattr(data[prop], '__iter__'):
        data[newprop] = [lookup_place(place) for place in data[prop]]
    else:
        data[newprop] = lookup_place(data[prop])

    return json.dumps(data)
def get_isostate(strg, abbrev=None):
    """Map a semicolon-separated string of state names (or abbreviations,
    when ``abbrev`` is truthy) to a pair of semicolon-joined
    (iso_codes, state_names). Pieces with no match contribute empty
    strings so positions stay aligned; iso is None when no piece matched.
    """
    if not isinstance(strg, basestring):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Non-string parameter supplied to get_isostate"

    iso_arr = []
    state_arr = []
    for piece in strg.split(";"):
        expanded = from_abbrev(piece) if abbrev else piece
        if not isinstance(expanded, list):
            expanded = [expanded]
        for state in expanded:
            matched = False
            upper = state.upper()
            for st in STATES:
                if st in upper:
                    iso_arr.append(STATES[st])
                    state_arr.append(st.title())
                    matched = True
            # Keep list positions aligned for unmatched pieces.
            if not matched:
                iso_arr.append("")
                state_arr.append("")

    iso = ';'.join(iso_arr) if filter(None, iso_arr) else None
    state = ';'.join(state_arr) if state_arr else None
    return (iso, state)
def nypl_identify_object(body, ctype, list_sets=None):
    """Replace an NYPL collection-uuid title with the collection's name.

    Fetches the sets listing from ``list_sets``, parses it as XML, and
    when a collection's uuid matches the document title (or appears in
    its @id) substitutes the human-readable collection title.
    Returns the body unchanged on HTTP failure.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(list_sets)
    if not resp[u'status'].startswith('2'):
        # Lazy %-style logging instead of eager string concatenation.
        logger.error(' HTTP error (%s) resolving URL: %s',
                     resp[u'status'], list_sets)
        return body

    content_dict = xmltodict.parse(content, xml_attribs=True,
                                   attr_prefix='', force_cdata=False,
                                   ignore_whitespace_cdata=True)
    sets = content_dict["nyplAPI"]["response"]

    for r in sets:
        if "collection" == r:
            for coll_dict in sets[r]:
                if "uuid" in coll_dict and "title" in coll_dict \
                        and (coll_dict["uuid"] == data["title"]
                             or coll_dict["uuid"] in data["@id"]):
                    data["title"] = coll_dict["title"]

    return json.dumps(data)
def ia_identify_object(body, ctype, download="True"):
    """Set the thumbnail ("object") URL for an Internet Archive record.

    Builds the preview URL from the record id and its gif file, and
    records the download status under admin/object_status. Returns the
    body unchanged when the needed keys are missing.
    """
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepst a JSON document and removes cleans the
    sourceResource/creator field by removing the values in REGEXES if the
    field value begins with them
    """
    # Reject non-JSON content types up front; the AssertionError is
    # caught below and reported like a parse failure.
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text
    if exists(data, prop):
        item = getprop(data, prop)
        # Normalize a scalar value to a one-element list for processing.
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            # Strip each CLEANUP prefix (case-insensitively) in turn.
            for s in CLEANUP:
                item[i] = re.sub(r"(?i)^{0}".format(s), "",
                                 item[i].strip()).lstrip()
        # Collapse a single-element list back to a scalar on write.
        setprop(data, prop, item[0] if len(item) == 1 else item)
    return json.dumps(data)
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of
    that document by removing duplicate array elements (compared after
    stripping whitespace, periods, parens and brackets).
    '''
    # BUGFIX: the original fell through to "return json.dumps(data)" with
    # ``data`` undefined when prop was empty, raising NameError. Return
    # the body untouched instead.
    if not prop:
        return body

    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Remove whitespace, periods, parens, brackets
                clone = [_stripped(s) for s in v if _stripped(s)]
                # Indices of the first occurrence of each unique value.
                # BUGFIX: sorted() keeps the surviving elements in their
                # original order; the old set-iteration order was
                # implementation-dependent.
                index = sorted(set(clone.index(s) for s in set(clone)))
                setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    '''
    Service that accepts a JSON document and enriches the "language" field
    of that document by:

    a) converting a list of language values into list of dictionaries:
       {"name": language}

    By default it works on the 'language' field, but can be overridden by
    passing the name of the field to use as a parameter.
    '''
    try:
        data = json.loads(body)
    # Was a bare "except:": narrowed so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        langs = getprop(data, prop)
        if isinstance(langs, basestring):
            setprop(data, prop, {"name": langs})
        elif isinstance(langs, list):
            languages = []
            for l in langs:
                languages.append({"name": l})
            setprop(data, prop, languages)

    return json.dumps(data)
def set_ucldc_dataprovider(body, ctype):
    '''For ucldc, we always have a originalRecord/collection entry.
    This has a repository object which may or may not have a list of
    campuses.  Concatenate the repo & campus if existing, separated by
    a comma, for the dataProvider value.

    Also mirrors dataProvider into provider/name, sets provider/@id from
    the collection's @id, and hard-codes stateLocatedIn to California.
    '''
    try:
        data = json.loads(body)
    # Narrowed from a bare except: json.loads raises ValueError on bad
    # JSON (TypeError on a non-string body).
    except (ValueError, TypeError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    collection = getprop(data, 'originalRecord/collection')[0]
    repo = collection['repository'][0]

    # BUG FIX: the docstring says the campus list "may or may not" be
    # present, but the original did repo['campus'], which raises KeyError
    # when the key is absent.  Treat a missing key like an empty list.
    campus = None
    campus_list = repo.get('campus') or []
    if campus_list:
        campus = campus_list[0]

    dataProvider = repo['name']
    if campus:
        dataProvider = ', '.join((campus['name'], repo['name']))
    setprop(data, 'dataProvider', dataProvider)

    data['provider'] = {}
    setprop(data, 'provider/name', dataProvider)
    setprop(data, 'provider/@id', collection['@id'])
    data['sourceResource']['stateLocatedIn'] = [{'name': 'California'}]

    return json.dumps(data)
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and sets the "dataProvider" field of
    that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set
       (setSpec p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    data_provider = getprop(data, "dataProvider", True)
    is_chs_set = getprop(data, "originalRecord/setSpec") == "p15799coll65"
    # data_provider is only indexed for the chs set, exactly as before.
    provider = (data_provider[0] if is_chs_set
                else "University of Southern California. Libraries")
    setprop(data, "dataProvider", provider)

    return json.dumps(data)
def mwdlenrichstatelocatedin(body, ctype, action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by mapping state codes to names via STATE_CODES.

    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        # NOTE(review): assumes the property value is a ";"-delimited
        # string — confirm against upstream enrichments.
        codes = getprop(data, prop)
        # A code with a truthy STATE_CODES entry is translated; anything
        # else passes through unchanged (same as the original if/else).
        names = [STATE_CODES.get(code) or code for code in codes.split(";")]
        setprop(data, prop, "; ".join(names))

    return json.dumps(data)
def shred(body,ctype,action="shred",prop=None,delim=';'):
    '''
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This
    requires that the fields share a common delimiter however.
    '''
    try :
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(','):
        if exists(data,p):
            v = getprop(data,p)
            if action == "shred":
                # List input is first collapsed to a delimited string and
                # written back; note this means list elements that themselves
                # contain the delimiter get re-split below.
                if isinstance(v,list):
                    v = delim.join(v)
                    setprop(data,p,v)
                # A string with no delimiter is left as the (possibly just
                # joined) scalar value.
                if delim not in v:
                    continue
                # Otherwise overwrite with the trimmed split parts.
                setprop(data,p,[ s.strip() for s in v.split(delim) ])
            elif action == "unshred":
                # Only lists are collapsed; scalar values pass through.
                if isinstance(v,list):
                    setprop(data,p,delim.join(v))
    return json.dumps(data)
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that
    document by:

    a) Removing duplicates

    Values are compared with whitespace, periods and parens removed,
    case-insensitively; the first occurrence of each is kept, in the
    original order.
    '''
    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # BUG FIX: the original collected first-occurrence indices
                # into a set and indexed back via list(set(...)), so the
                # output order was arbitrary.  A single ordered pass with a
                # seen-set keeps the same elements deterministically.
                seen = set()
                deduped = []
                for s in v:
                    # Remove whitespace, periods, parens (same normalization
                    # as the original re.sub) and lowercase for comparison.
                    key = re.sub(r"[ .()]", "", s).lower()
                    if key not in seen:
                        seen.add(key)
                        deduped.append(s)
                setprop(data, p, deduped)

    return json.dumps(data)
def scdl_enrich_location(body, ctype, action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document.

    For use with the scdl profiles
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        for v in iterify(value):
            # Each spatial entry is assumed to be a dict with a "name" key —
            # a KeyError/TypeError here would propagate; TODO confirm the
            # upstream shape.
            name = replace_state_abbreviations(v["name"].rstrip())
            v["name"] = name
            # Try to extract a County.  Note the containment test requires a
            # trailing space (" county ") while the slice uses index(" county"),
            # so a name ending in "County" with nothing after it is NOT treated
            # as a county.
            if " county " in name.lower():
                # "XXX County (S.C.)" => county: XXX
                v["county"] = name[0:name.lower().index(" county")]
            elif "(S.C.)" in name:
                # "XXX (S.C)" => city: XXX
                # NOTE(review): the containment check is "(S.C.)" but the slice
                # looks up " (S.C.)" with a leading space — a name containing
                # "(S.C.)" without a preceding space would raise ValueError.
                v["city"] = name[0:name.index(" (S.C.)")]
    return json.dumps(data)
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and sets the "dataProvider" field of
    that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set
       (setSpec p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents

    NOTE(review): an identical uscsetdataprovider definition appears earlier
    in this file; in Python the later definition silently replaces the
    earlier one — consider removing one copy.
    """
    try :
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    # True-flag getprop returns a default instead of raising when the
    # property is missing; data_provider is only indexed in the chs branch.
    data_provider = getprop(data, "dataProvider", True)

    if getprop(data, "originalRecord/setSpec") == "p15799coll65":
        setprop(data, "dataProvider", data_provider[0])
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")

    return json.dumps(data)
def georgiasetspectype(body, ctype):
    """
    Service that accepts a JSON document and sets the
    "sourceResource/specType" field of that document from the
    "sourceResource/type" field

    For primary use with DLG documents
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "books": "Book",
        "government": "Government Document",
        "periodicals": "Serial"
    }

    # Renamed from `type`, which shadowed the builtin of the same name.
    resource_type = getprop(data, "sourceResource/type", True)
    if resource_type:
        spec_type = []
        for s in iterify(resource_type):
            # Substring match against each known key; each specType value is
            # appended at most once.
            for k, v in TYPE_TO_SPEC_TYPE.items():
                if k in s.lower() and v not in spec_type:
                    spec_type.append(v)
        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that
    document by:

    a) Removing duplicates

    Values are compared with whitespace, periods, parens and brackets
    removed, case-insensitively; the first occurrence of each is kept, in
    the original order.
    '''
    if not prop:
        # Fail loudly on a missing prop (matching the variant of this
        # function that logs "Prop param is None") instead of reaching the
        # final return with `data` never assigned.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    # BUG FIX: the original contained two identical try/except blocks and
    # parsed `body` twice; once is enough.
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # BUG FIX: indexing back through list(set(indices)) made the
                # output order arbitrary; a single ordered pass with a
                # seen-set keeps the same elements deterministically.
                seen = set()
                deduped = []
                for s in v:
                    # Remove whitespace, periods, parens, brackets (same
                    # normalization as the original re.sub) and lowercase.
                    key = re.sub(r"[ .()\[\]{}]", "", s).lower()
                    if key not in seen:
                        seen.add(key)
                        deduped.append(s)
                setprop(data, p, deduped)

    return json.dumps(data)