def map_spatial(self):
    spatial = []
    prop = "subject"
    if exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            if "hierarchicalGeographic" in s:
                spatial = s["hierarchicalGeographic"]
                name = ", ".join(filter(None, [spatial.get("city"),
                                               spatial.get("county"),
                                               spatial.get("state"),
                                               spatial.get("country")]))
                spatial["name"] = name
                spatial = [spatial]

    prop = "originInfo/place"
    if not spatial and exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            if "placeTerm" in s:
                for place in iterify(s["placeTerm"]):
                    if "type" in place and place["type"] != "code":
                        spatial.append(place["#text"])

    if spatial:
        self.update_source_resource({"spatial": spatial})
def oaimodstodpladigitalnc(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Strip out keys with None/null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*"
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
    ]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
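# A minimal standalone sketch of the regex cascade above, runnable without
# the dplaingestion helpers. The pattern subset and sample values are
# illustrative assumptions, not data from a real feed. A value counts as a
# date only when exactly one pattern match consumes the whole cleaned
# string, mirroring the `len(m) == 1 and not re.sub(...)` test above.
import re

PATTERNS = [r"\d{4}\s*[-/]\s*\d{4}", r"\d{4}s?",
            r"\d{1,2}\s*(?:st|nd|rd|th)\s*century", r".*circa.*"]

def looks_like_date(value):
    cleaned = re.sub(r"[\(\)\.\?]", "", value).strip()
    for p in PATTERNS:
        matches = re.compile(p, re.I).findall(cleaned)
        if len(matches) == 1 and not re.sub(matches[0], "", cleaned).strip():
            return True
    return False

for sample in ["1850-1900", "(1920s)", "19th century", "circa 1875",
               "Philadelphia, 1900 and later"]:
    print("%-28s -> %s" % (sample, looks_like_date(sample)))
# The last value is left alone: it contains a date but is not only a date.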
def map_format(self):
    if exists(self.provider_data, "medium"):
        self.update_source_resource(
            {"format": getprop(self.provider_data, "medium")})
    elif exists(self.provider_data, "format"):
        self.update_source_resource(
            {"format": getprop(self.provider_data, "format")})
def map_subject(self):
    # Mapped from subject and genre
    #
    # Per discussion with Amy on 10 April 2014, don't worry about
    # checking whether heading maps to authority file. Amy simplified the
    # crosswalk.
    #
    # TODO: When present, we should probably pull in the valueURI and
    #       authority values into the sourceResource.subject - this would
    #       represent an index/API change, however.
    subject = []
    if exists(self.provider_data, "subject"):
        for v in iterify(getprop(self.provider_data, "subject")):
            if "topic" in v:
                if isinstance(v["topic"], basestring):
                    subject.append(v["topic"])
                elif isinstance(v["topic"], dict):
                    subject.append(v["topic"].get("#text"))
                else:
                    logger.error("Topic is not a string nor a dict; %s" %
                                 self.provider_data["_id"])
            if exists(v, "name/namePart"):
                subject.append(getprop(v, "name/namePart"))

    if exists(self.provider_data, "genre"):
        for v in iterify(getprop(self.provider_data, "genre")):
            if isinstance(v, basestring):
                subject.append(v)
            elif isinstance(v, dict):
                subject.append(v.get("#text"))
            else:
                logger.error("Genre is not a string nor a dict; %s" %
                             self.provider_data["_id"])

    if subject:
        self.update_source_resource({"subject": subject})
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection ckey field
    with the value passed in.
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(
        field, mode, value, multivalue))
    if value:  # no value, don't bother
        if mode == 'overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                pp, pn = tuple(field.lstrip('/').split('/', 1))
                if pp not in data:
                    data[pp] = {}
                data[pp][pn] = value
        elif mode == 'append':
            new_value = []
            if exists(data, field):
                old_value = getprop(data, field)
                if isinstance(old_value, list):
                    new_value.extend(old_value)
                else:
                    new_value.append(old_value)
            if isinstance(value, list):
                new_value.extend(value)
            else:
                new_value.append(value)
            setprop(data, field, new_value)
        else:  # fill blanks
            if not exists(data, field) or not getprop(data, field,
                                                      keyErrorAsNone=True):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data
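# A minimal sketch of the three modes above on a plain two-level dict, using
# flat stand-ins for exists/getprop/setprop (hypothetical helpers, not the
# dplaingestion.selector API). It shows the semantics: "overwrite" replaces,
# "append" accumulates into a list, anything else fills blanks only.
def _demo(mode, start):
    data = {"sourceResource": dict(start)}
    parent, key = "sourceResource/title".split("/", 1)
    current = data[parent].get(key)
    if mode == "overwrite":
        data[parent][key] = "New Title"
    elif mode == "append":
        merged = current if isinstance(current, list) else \
            ([current] if current else [])
        data[parent][key] = merged + ["New Title"]
    elif not current:  # fill blanks only
        data[parent][key] = ["New Title"]
    return data["sourceResource"]

print(_demo("overwrite", {"title": "Old"}))  # {'title': 'New Title'}
print(_demo("append", {"title": "Old"}))     # {'title': ['Old', 'New Title']}
print(_demo("fill", {"title": "Old"}))       # {'title': 'Old'}  (kept)
print(_demo("fill", {}))                     # {'title': ['New Title']}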
def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from original document
    transformer_pipeline = {}
    transformer_pipeline.update(CHO_TRANSFORMER.get(provider, {}),
                                **CHO_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out["sourceResource"].update(transformer_pipeline[p](data, p))

    transformer_pipeline = {}
    transformer_pipeline.update(AGGREGATION_TRANSFORMER.get(provider, {}),
                                **AGGREGATION_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out.update(transformer_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL
    if provider == "BPL":
        try:
            ipo = getprop(out, "dataProvider") + ". " + \
                  getprop(out, "sourceResource/isPartOf")
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except:
            pass

    # Strip out keys with None/null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def map_rights_note(self):
    rightsnotes = []
    if exists(self.provider_data_source, 'ucldc_schema:rightsnotice'):
        rightsnotes.append(
            self.provider_data_source.get('ucldc_schema:rightsnotice'))
    if exists(self.provider_data_source, 'ucldc_schema:rightsnote'):
        rightsnotes.append(
            self.provider_data_source.get('ucldc_schema:rightsnote'))
    if rightsnotes:
        self.update_original_record({'rightsNote': rightsnotes})
def map_rights(self):
    rights = []
    if exists(self.provider_data_source, 'ucldc_schema:rightsstatus'):
        rights_status = self.provider_data_source.get(
            'ucldc_schema:rightsstatus')
        rights.append(self.map_rights_codes(rights_status))
    if exists(self.provider_data_source, 'ucldc_schema:rightsstatement'):
        rights.append(
            self.provider_data_source.get('ucldc_schema:rightsstatement'))
    self.update_source_resource({'rights': rights})
def map_rights_holder(self):
    rightsholders = []
    if exists(self.provider_data_source, 'ucldc_schema:rightsholder'):
        rightsholders = [rh['name'] for rh in
                         self.provider_data_source.get(
                             'ucldc_schema:rightsholder')]
    if exists(self.provider_data_source, 'ucldc_schema:rightscontact'):
        rightsholders.append(
            self.provider_data_source.get('ucldc_schema:rightscontact'))
    if rightsholders:
        self.update_original_record({'rightsHolder': rightsholders})
def map_date(self):
    if exists(self.provider_data, "date"):
        self.update_source_resource(
            {"date": getprop(self.provider_data, "date")})
    elif exists(self.provider_data, "created"):
        self.update_source_resource(
            {"date": getprop(self.provider_data, "created")})
def map_identifier(self):
    identifiers = []
    if exists(self.provider_data_source, 'ucldc_schema:identifier'):
        identifiers.append(
            self.provider_data_source.get('ucldc_schema:identifier'))
    if exists(self.provider_data_source, 'ucldc_schema:localidentifier'):
        localids = self.provider_data_source.get(
            'ucldc_schema:localidentifier')
        identifiers.extend(localids)
    if identifiers:
        self.update_source_resource({'identifier': identifiers})
def map_object(self):
    path = "/metadata/mods/location"
    if exists(self.provider_data, path):
        for locations in iterify(getprop(self.provider_data, path)):
            if exists(locations, "url"):
                for url in iterify(getprop(locations, "url")):
                    if (exists(url, "access") and
                            url["access"].lower() == "preview"):
                        self.mapped_data.update({"object": textnode(url)})
def ia_to_dpla(body, ctype, geoprop=None):
    """
    Convert output of Internet Archive service into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON" + "\n" + str(e)

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    def multi_path_processor(data, paths, transformation):
        value = {}
        for sub_p in paths:
            if exists(data, sub_p):
                fetched = transformation[paths](data, sub_p)
                for k in fetched:
                    if k in value:
                        if isinstance(value[k], list):
                            value[k].append(fetched[k])
                        elif (isinstance(value[k], basestring) and
                              value[k] != fetched[k]):
                            value[k] = [value[k], fetched[k]]
                        elif isinstance(value[k], dict):
                            value[k].update(fetched[k])
                    else:
                        value[k] = fetched[k]
        return value

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if isinstance(p, tuple):
            out["sourceResource"].update(
                multi_path_processor(data, p, CHO_TRANSFORMER))
        elif exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if isinstance(p, tuple):
            out.update(multi_path_processor(data, p,
                                            AGGREGATION_TRANSFORMER))
        elif exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Strip out keys with None/null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
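# A standalone sketch of the merge rule in multi_path_processor above: when
# two paths yield the same key, string values accumulate into a list and
# dicts merge in place. The sample data is an illustrative assumption.
def merge(value, fetched):
    for k, f in fetched.items():
        if k in value:
            if isinstance(value[k], list):
                value[k].append(f)
            elif isinstance(value[k], str) and value[k] != f:
                value[k] = [value[k], f]
            elif isinstance(value[k], dict):
                value[k].update(f)
        else:
            value[k] = f
    return value

out = {}
merge(out, {"title": "A"})
merge(out, {"title": "B"})
merge(out, {"title": "C"})
print(out)  # -> {'title': ['A', 'B', 'C']}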
def map_is_shown_at(self):
    path = "/metadata/mods/location"
    if exists(self.provider_data, path):
        for locations in iterify(getprop(self.provider_data, path)):
            if exists(locations, "url"):
                for url in iterify(getprop(locations, "url")):
                    if (exists(url, "usage") and exists(url, "access") and
                            url["usage"].lower().startswith("primary") and
                            url["access"].lower() == "object in context"):
                        self.mapped_data.update(
                            {"isShownAt": textnode(url)})
def map_title(self):
    path = "/metadata/mods/titleInfo"
    titles = []
    if exists(self.provider_data, path):
        for t in iterify(getprop(self.provider_data, path)):
            if exists(t, "title") and not exists(t, "title/type"):
                titles.append(textnode(getprop(t, "title")))
    if titles:
        self.update_source_resource({"title": titles})
def enrichformat(body, ctype, action="enrich-format",
                 prop="isShownAt/format",
                 alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) setting the format to be all lowercase
    b) running through a set of cleanup regexes (e.g. image/jpg ->
       image/jpeg)
    c) checking to see if the field is a valid IMT, and moving it to a
       separate field if not. See
       http://www.iana.org/assignments/media-types for the list of valid
       media-types. We do not require that a subtype be defined.
    d) removing any extra text after the IMT

    By default works on the 'format' field, but can be overridden by passing
    the name of the field to use as the 'prop' parameter. Non-IMTs are moved
    to the field defined by the 'alternate' parameter.
    """
    REGEXPS = (('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'),
               ('img/jpg', 'image/jpeg'), (r'\W$', ''))
    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        imt_regexes = [re.compile('^' + x + r'(/|\Z)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        format = []
        physicalFormat = getprop(data, alternate) if exists(data, alternate) \
            else []
        if not isinstance(physicalFormat, list):
            physicalFormat = [physicalFormat]

        for s in (v if not isinstance(v, basestring) else [v]):
            if is_imt(cleanup(s)):
                format.append(cleanup(s))
            else:
                physicalFormat.append(s)

        if format:
            setprop(data, prop, format[0] if len(format) == 1 else format)
        else:
            setprop(data, prop, None)
        if physicalFormat:
            setprop(data, alternate,
                    physicalFormat[0] if len(physicalFormat) == 1
                    else physicalFormat)

    return json.dumps(data)
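# A small runnable sketch of the cleanup/is_imt pair above, with
# illustrative sample values (assumptions, not real feed data). Anything
# that normalizes to a valid IMT stays a format; everything else would be
# routed to the alternate (physicalMedium) field.
import re

REGEXPS = (('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'),
           ('img/jpg', 'image/jpeg'), (r'\W$', ''))
IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
             'multipart', 'text', 'video']

def cleanup(s):
    s = s.lower().strip()
    for pattern, replace in REGEXPS:
        s = re.sub(pattern, replace, s)
    return re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)

def is_imt(s):
    return any(re.match('^' + t + r'(/|\Z)', s) for t in IMT_TYPES)

for sample in ["Image/JPG", "image/jpeg (600 dpi)", "text/html",
               "Photograph"]:
    c = cleanup(sample)
    print("%-22s -> %-12s imt=%s" % (sample, c, is_imt(c)))
# "Photograph" fails the IMT test, so the service would move it to the
# alternate field rather than keep it as a format.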
def copyprop(body, ctype, prop=None, to_prop=None, skip_if_exists=None):
    """Copies value in one prop to another prop.

    For use with string and/or list prop value types. If to_prop exists, its
    value is iterified then extended with the iterified value of prop. If
    the to_prop parent prop (ie hasView in hasView/rights) does not exist,
    the from_prop value is not copied and an error is logged.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    skip_if_exists -- set to True to not copy if to_prop exists
    """
    def is_string_or_list(value):
        return isinstance(value, (basestring, list))

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not (exists(data, to_prop) and skip_if_exists):
        if exists(data, prop):
            if exists(data, to_prop):
                from_value = getprop(data, prop)
                if not is_string_or_list(from_value):
                    msg = "Prop %s " % prop + \
                          "is not a string/list for record %s" % data["id"]
                    logger.error(msg)
                    return body

                to_value = getprop(data, to_prop)
                if not is_string_or_list(to_value):
                    msg = "Prop %s " % to_prop + \
                          "is not a string/list for record %s" % data["id"]
                    logger.error(msg)
                    return body

                to_value = iterify(to_value)
                to_value.extend(iterify(from_value))
                setprop(data, to_prop, to_value)
            else:
                try:
                    setprop(data, to_prop, getprop(data, prop))
                except Exception, e:
                    logger.error("Could not copy %s to %s: %s" %
                                 (prop, to_prop, e))

    return json.dumps(data)
def map_is_part_of(self):
    prop = self.root_key + "relatedItem"
    _dict = {"relation": []}
    if exists(self.provider_data, prop):
        for relatedItem in iterify(getprop(self.provider_data, prop)):
            title_prop = "titleInfo/title"
            if exists(relatedItem, title_prop):
                _dict["relation"].append(getprop(relatedItem, title_prop))
    self.update_source_resource(self.clean_dict(_dict))
def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(
        multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(
        multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(
        multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" +
                         repr(e))

    # Strip out keys with None/null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def arctodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified ARC (NARA) format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in data.keys():
        if p in CHO_TRANSFORMER:
            out["sourceResource"].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Apply transformations that are dependent on more than one
    # original document field
    out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(rights_transform(data))
    out["sourceResource"].update(subject_and_spatial_transform(data))
    out.update(has_view_transform(data))
    out["sourceResource"].update(transform_state_located_in(data))

    if exists(out, "sourceResource/date"):
        logger.debug("OUTTYPE: %s" % getprop(out, "sourceResource/date"))

    if exists(data, "objects/object"):
        out.update(transform_thumbnail(data))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" +
                         repr(e))

    # Strip out keys with None/null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def enrichtype(body, ctype, action="enrich-type",
               prop="aggregatedCHO/type",
               alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and enriches the "type" field of
    that document by:

    a) making the type lowercase
    b) converting "image" to "still image"
       (TODO: Amy to confirm that this is ok)
    c) applying a set of regexps to do data cleanup (remove plural forms)
    d) moving all items that are not standard DC types to the physical
       format field (http://dublincore.org/documents/resource-typelist/)

    By default works on the 'type' field, but can be overridden by passing
    the name of the field to use as a parameter.
    """
    REGEXPS = (('images', 'image'), ('still image', 'image'))
    DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
                'interactive resource', 'model', 'party', 'physical object',
                'place', 'service', 'software', 'sound', 'text']

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        dctype = []
        physicalFormat = getprop(data, alternate) if exists(data, alternate) \
            else []
        if not isinstance(physicalFormat, list):
            physicalFormat = [physicalFormat]

        for s in (v if not isinstance(v, basestring) else [v]):
            if is_dc_type(cleanup(s)):
                dctype.append(cleanup(s))
            else:
                physicalFormat.append(s)

        if dctype:
            setprop(data, prop, dctype[0] if len(dctype) == 1 else dctype)
        else:
            setprop(data, prop, None)
        if physicalFormat:
            setprop(data, alternate,
                    physicalFormat[0] if len(physicalFormat) == 1
                    else physicalFormat)

    return json.dumps(data)
def map_type(self):
    path = "/metadata/mods/typeOfResource"
    path_form = "/metadata/mods/physicalDescription/form"
    if not exists(self.provider_data, path):
        path = path_form
    if exists(self.provider_data, path):
        types = []
        for t in iterify(getprop(self.provider_data, path)):
            types.append(textnode(t))
        if types:
            self.update_source_resource({"type": types})
def name_part(self, role_type):
    prop = "/metadata/mods/name"
    results = []
    if exists(self.provider_data, prop):
        for name in getprop(self.provider_data, prop):
            if "role" in name and "namePart" in name:
                for role in iterify(name["role"]):
                    role_prop = "roleTerm/#text"
                    if exists(role, role_prop) \
                            and getprop(role, role_prop) == role_type:
                        results.append(name["namePart"])
    return results
def harvard_enrich_location(body, ctype, action="harvard_enrich_location",
                            prop="sourceResource/spatial"):
    """
    Service that accepts a Harvard JSON document and enriches the "spatial"
    field by translating any MARC country codes contained within the
    originalDocument place element into their names, for better geocoding
    accuracy.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, "originalRecord/metadata/mods/originInfo/place"):
        places = getprop(data,
                         "originalRecord/metadata/mods/originInfo/place")
        country = ""
        countryCode = ""
        name = ""

        # Add non-country terms
        for place in iterify(places):
            logger.info("place: %s" % place)
            placeTerm = getprop(place, "placeTerm", True)
            if isinstance(placeTerm, basestring):
                name += " " + placeTerm
            elif not exists(placeTerm, "authority"):
                name += " " + getprop(placeTerm, "#text", True)

        # Add country
        for place in iterify(places):
            placeTerm = getprop(place, "placeTerm", True)
            if (exists(placeTerm, "authority") and
                    "marccountry" == getprop(placeTerm, "authority", True)):
                countryCode = getprop(placeTerm, "#text", True)
                country = get_country_from_marccode(countryCode)
                if country:
                    name += ", " + country

        # logger.info("geocode: harvard: Converting name to %s" % name)
        spatial = {"name": re.sub(r"[\[\]]", "", name.strip(", "))}
        if (country and
                (2 == len(countryCode) or countryCode.startswith("xx"))):
            spatial["country"] = country

        setprop(data, prop, [spatial])

    return json.dumps(data)
def movedatestotemporal(body, ctype, action="move_dates_to_temporal",
                        prop=None):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    REGSUB = ((r"\(", ""), (r"\)", ""))
    REGSEARCH = [r"(\( *)?(\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4})( *\))?",
                 r"(\( *)?(\d{4} *[-/] *\d{4})( *\))?",
                 r"(\( *)?(\d{4})( *\))?"]

    def cleanup(s):
        for p, r in REGSUB:
            s = re.sub(p, r, s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        p = []
        temporal_field = "aggregatedCHO/temporal"
        temporal = getprop(data, temporal_field) \
            if exists(data, temporal_field) else []

        for d in getprop(data, prop):
            for regsearch in REGSEARCH:
                pattern = re.compile(regsearch)
                for match in pattern.findall(d["name"]):
                    m = "".join(match)
                    # TODO: (\( *)? matches 0 and produces '' in m
                    if m:
                        d["name"] = re.sub(re.escape(m), "", d["name"])
                        temporal.append({"name": cleanup(m)})
            if d["name"].strip():
                # Append to p, which will overwrite data[prop]
                p.append(d)

        if temporal:
            setprop(data, temporal_field, temporal)
            if p:
                setprop(data, prop, p)
            else:
                delprop(data, prop)

    return json.dumps(data)
def edantodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified EDAN (Smithsonian) format into the DPLA
    JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for k, v in CHO_TRANSFORMER.items():
        if exists(data, k):
            out["sourceResource"].update(v(data))
    for k, v in AGGREGATION_TRANSFORMER.items():
        if exists(data, k):
            out.update(v(data))

    # Apply transformations that are dependent on more than one
    # original document field
    #out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(transform_rights(data))
    out["sourceResource"].update(transform_subject(data))
    out["sourceResource"].update(transform_spatial(data))

    out.update(transform_is_shown_at(data))
    out.update(transform_object(data))
    out.update(transform_data_provider(data))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" +
                         repr(e))

    # Strip out keys with None/null values
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def type_transform(d):
    type = []

    if "general-records-types" in d:
        type = arc_group_extraction(d, "general-records-types",
                                    "general-records-type",
                                    "general-records-type-desc")
    if exists(d, "physical-occurrences/physical-occurrence"):
        phys_occur = getprop(d, "physical-occurrences/physical-occurrence")
        type_key = "media-occurrences/media-occurrence/media-type"
        for p in phys_occur:
            if exists(p, type_key):
                type.append(getprop(p, type_key))

    return {"type": "; ".join(type)} if type else {}
def map_type(self):
    _type = self.extract_xml_items("general-records-types",
                                   "general-records-type",
                                   "general-records-type-desc")

    prop = "physical-occurrences/physical-occurrence"
    if exists(self.provider_data, prop):
        type_key = "media-occurrences/media-occurrence/media-type"
        for s in iterify(getprop(self.provider_data, prop)):
            if exists(s, type_key):
                _type.append(getprop(s, type_key))

    if _type:
        self.update_source_resource({"type": "; ".join(_type)})
def selid(body, ctype, prop='descriptiveNonRepeating/record_link',
          alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property
    to the value of the property named by the "prop" parameter
    '''
    tmpl = "http://collections.si.edu/search/results.htm?" \
           "q=record_ID%%3A%s&repo=DPLA"

    if prop:
        try:
            data = json.loads(body)
        except:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "Unable to parse body as JSON"

        request_headers = copy_headers_to_dict(request.environ)
        source_name = request_headers.get('Source')

        id = None
        if exists(data, prop) or exists(data, alternative_prop):
            v = getprop(data, prop, True)
            if not v:
                v = getprop(data, alternative_prop)
                v = tmpl % v
            if isinstance(v, basestring):
                id = v
            elif v:
                for h in v:
                    if is_absolute(h):
                        id = h
                if not id:
                    id = v[0]

        if not id:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "No id property was found"

        data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
        data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    else:
        logger.error("Prop param is None in %s" % __name__)
        return body

    return json.dumps(data)
def kentucky_identify_object(body, ctype,
                             rights_field="aggregatedCHO/rights",
                             download="True"):
    """
    Responsible for adding a field to a document with the URL where we
    should expect to find the thumbnail.
    """
    LOG_JSON_ON_ERROR = True

    def log_json():
        if LOG_JSON_ON_ERROR:
            logger.debug(body)

    data = {}
    try:
        data = json.loads(body)
    except Exception as e:
        msg = "Bad JSON: " + e.args[0]
        logger.error(msg)
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return msg

    relation_field = "aggregatedCHO/relation"
    if exists(data, relation_field):
        url = getprop(data, relation_field)
    else:
        msg = "Field %s does not exist" % relation_field
        logger.error(msg)
        return body

    base_url, ext = os.path.splitext(url)
    thumb_url = "%s_tb%s" % (base_url, ext)

    rights = None
    if exists(data, rights_field):
        rights = getprop(data, rights_field)

    data["object"] = {"@id": thumb_url, "format": "", "rights": rights}

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def map_rights(self):
    edm_rights_prop = "rightsStatementURI"
    tmp_rights_prop = "tmp_rights_statement"
    map_tmp = True

    if exists(self.provider_data, edm_rights_prop):
        self.mapped_data.update(
            {"rights": getprop(self.provider_data, edm_rights_prop)})
        map_tmp = False

    if map_tmp and exists(self.provider_data, tmp_rights_prop):
        self.update_source_resource(
            {"rights": getprop(self.provider_data, tmp_rights_prop)})
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of
    that document by:

    a) Removing duplicates
    '''
    if prop:
        try:
            data = json.loads(body)
        except:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "Unable to parse body as JSON"

        for p in prop.split(","):
            if exists(data, p):
                v = getprop(data, p)
                if isinstance(v, list):
                    # Remove whitespace, periods, parens, brackets
                    clone = [re.sub(r"[ \.\(\)\[\]\{\}]", "", s).lower()
                             for s in v]
                    # Get index of unique values
                    index = list(set([clone.index(s)
                                      for s in list(set(clone))]))
                    setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
def map_creator(self):
    prop = "name"
    if exists(self.provider_data, prop):
        personal_creator = []
        corporate_creator = []
        for s in iterify(getprop(self.provider_data, prop)):
            creator = [None, None, None]
            for name in iterify(s.get("namePart")):
                if isinstance(name, basestring):
                    creator[0] = name
                elif isinstance(name, dict):
                    type = name.get("type")
                    if type == "family":
                        creator[0] = name.get("#text")
                    elif type == "given":
                        creator[1] = name.get("#text")
                    elif type == "termsOfAddress":
                        creator[1] = name.get("#text")
                    elif type == "date":
                        creator[2] = name.get("#text")
            creator = ", ".join(filter(None, creator))

            if (s.get("type") == "personal" and
                    creator not in personal_creator):
                personal_creator.append(creator)
            elif (s.get("type") == "corporate" and
                    creator not in corporate_creator):
                corporate_creator.append(creator)

        if personal_creator:
            self.update_source_resource({"creator": personal_creator})
        elif corporate_creator:
            self.update_source_resource({"creator": corporate_creator})
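# A standalone sketch of the namePart assembly above, with a fabricated
# MODS-style name (illustrative only). Family, given/termsOfAddress, and
# date slots are joined in order, skipping anything missing.
def assemble_creator(name_parts):
    slots = [None, None, None]  # family, given/termsOfAddress, date
    for part in name_parts:
        if isinstance(part, str):
            slots[0] = part
        elif isinstance(part, dict):
            kind = part.get("type")
            if kind == "family":
                slots[0] = part.get("#text")
            elif kind in ("given", "termsOfAddress"):
                slots[1] = part.get("#text")
            elif kind == "date":
                slots[2] = part.get("#text")
    return ", ".join(s for s in slots if s)

print(assemble_creator([{"type": "family", "#text": "Doe"},
                        {"type": "given", "#text": "Jane"},
                        {"type": "date", "#text": "1850-1920"}]))
# -> "Doe, Jane, 1850-1920"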
def map_is_shown_at(self):
    prop = "descriptiveNonRepeating/record_ID"
    if exists(self.provider_data, prop):
        prefix = ("http://collections.si.edu/search/results.htm?"
                  "q=record_ID%%3A%s&repo=DPLA")
        obj = getprop(self.provider_data, prop)
        self.mapped_data.update({"isShownAt": prefix % obj})
def scdl_enrich_location(body, ctype, action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document. For use with the scdl profiles.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        for v in iterify(value):
            name = replace_state_abbreviations(v["name"].rstrip())
            v["name"] = name

            # Try to extract a county
            if " county " in name.lower():
                # "XXX County (S.C.)" => county: XXX
                v["county"] = name[0:name.lower().index("county")].strip()
            elif "(S.C.)" in name:
                # "XXX (S.C.)" => city: XXX
                v["city"] = name[0:name.index("(S.C.)")].strip()

    return json.dumps(data)
def map_is_show_at_object_has_view_and_dataprovider(self):
    def _get_media_type(d):
        pd = iterify(getprop(d, "physicalDescription"))
        for _dict in pd:
            if exists(_dict, "internetMediaType"):
                return getprop(_dict, "internetMediaType")

    prop = "location"
    if exists(self.provider_data, prop):
        location = iterify(getprop(self.provider_data, prop))
        format = _get_media_type(self.provider_data)
        out = {}
        try:
            for _dict in location:
                if "url" in _dict:
                    for url_dict in _dict["url"]:
                        if url_dict and "access" in url_dict:
                            if url_dict["access"] == "object in context":
                                out["isShownAt"] = url_dict.get("#text")
                            elif url_dict["access"] == "preview":
                                out["object"] = url_dict.get("#text")
                            elif url_dict["access"] == "raw object":
                                has_view = {"@id": url_dict.get("#text"),
                                            "format": format}
                                out["hasView"] = has_view
                if ("physicalLocation" in _dict and
                        isinstance(_dict["physicalLocation"], basestring)):
                    out["dataProvider"] = _dict["physicalLocation"]
        except Exception as e:
            logger.error(e)

        if out:
            self.mapped_data.update(out)
def map_date(self):
    originInfoPath = self.root_key + "originInfo"
    dateCreated = []
    dateIssued = []
    date_begin, date_end = None, None

    if exists(self.provider_data, originInfoPath):
        for date in iterify(getprop(self.provider_data, originInfoPath)):
            if "dateCreated" in date:
                dateCreated.append(textnode(date["dateCreated"]))
            if "dateIssued" in date:
                t = date["dateIssued"]
                try:
                    if "point" not in t:
                        dateIssued.append(textnode(t))
                    elif "point" in t and t["point"] == "start":
                        date_begin = textnode(t)
                    elif "point" in t and t["point"] == "end":
                        date_end = textnode(t)
                except Exception as e:
                    logger.error("Exception when trying to map date values "
                                 "for record %s\n\n%s" %
                                 (self.provider_data["_id"], e.message))

    # If there are no dateIssued or dateCreated properties then construct
    # a date range from begin and end points (if they exist).
    if date_begin and date_end and not dateCreated and not dateIssued:
        dateIssued.append(date_begin + "-" + date_end)

    if dateCreated:
        self.update_source_resource({"date": dateCreated})
    elif dateIssued:
        self.update_source_resource({"date": dateIssued})
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing the values in CLEANUP if the
    field value begins with them
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            for s in CLEANUP:
                item[i] = re.sub(r"(?i)^{0}".format(s), "",
                                 item[i].strip()).lstrip()
        setprop(data, prop, item[0] if len(item) == 1 else item)

    return json.dumps(data)
def capitalize(data, prop):
    """
    Capitalizes the value of the related property path.

    Modifies the given dictionary (data argument).
    """
    def str_capitalize(s):
        """
        Changes the first letter of the string to uppercase.

        Python's "aaa".capitalize() is not used here because it also
        lowercases the rest of the string.
        """
        if s:
            return s[0].upper() + s[1:]
        return s

    if exists(data, prop):
        v = getprop(data, prop, keyErrorAsNone=True)
        if v:
            if isinstance(v, basestring):
                setprop(data, prop, str_capitalize(v))
            elif isinstance(v, list):
                new_v = []
                for s in v:
                    if isinstance(s, basestring):
                        new_v.append(str_capitalize(s))
                    else:
                        new_v.append(s)
                setprop(data, prop, new_v)
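# A quick runnable comparison showing why str.capitalize() is avoided
# above: it lowercases everything after the first character, which would
# mangle proper nouns and acronyms in the data. The sample string is
# illustrative.
print("van Gogh, Vincent".capitalize())  # -> "Van gogh, vincent"  (bad)
s = "van Gogh, Vincent"
print(s[0].upper() + s[1:])              # -> "Van Gogh, Vincent"  (kept)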
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of
    that document by removing duplicate array elements
    '''
    if prop:
        try:
            data = json.loads(body)
        except:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "Unable to parse body as JSON"

        for p in prop.split(","):
            if exists(data, p):
                v = getprop(data, p)
                if isinstance(v, list):
                    # Remove whitespace, periods, parens, brackets
                    clone = [_stripped(s) for s in v if _stripped(s)]
                    # Get index of unique values
                    index = list(
                        set([clone.index(s) for s in list(set(clone))]))
                    setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of
    that document by:

    a) Removing duplicates
    '''
    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Remove whitespace, periods, parens
                clone = [re.sub(r"[ \.\(\)]", "", s).lower() for s in v]
                # Get index of unique values
                index = list(set([clone.index(s)
                                  for s in list(set(clone))]))
                setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
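# A standalone sketch of the dedup trick used above: normalize each value
# (strip spaces/punctuation, lowercase), find the first index of each
# distinct normalized form, and keep the original values at those indices
# (sorted here for a deterministic demo). Sample values are illustrative.
import re

v = ["Maps.", "maps", "Atlases", "Maps"]
clone = [re.sub(r"[ \.\(\)]", "", s).lower() for s in v]
index = sorted(set(clone.index(s) for s in set(clone)))
print([v[i] for i in index])  # -> ['Maps.', 'Atlases']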
def map_contributor(self):
    prop = "contributor"
    if exists(self.provider_data, prop):
        contributors = iterify(self.provider_data.get(prop))
        setprop(self.mapped_data, "dataProvider", contributors[-1])
        if len(contributors) > 1:
            self.update_source_resource(
                {"contributor": contributors[:-1]})
def map_date(self):
    path = "/metadata/mods/originInfo/dateCreated/#text"
    if exists(self.provider_data, path):
        date_created = getprop(self.provider_data, path)
        if date_created:
            self.update_source_resource({"date": date_created})
def map_relation(self):
    prop = self.root_key + "relatedItem"
    if exists(self.provider_data, prop):
        relation = []
        host = None
        series = None
        for s in iterify(getprop(self.provider_data, prop)):
            title = getprop(s, "titleInfo/title", True)
            if title is not None:
                if s.get("type") == "host":
                    host = title
                if s.get("type") == "series":
                    series = title
        if host:
            val = host
            if series:
                val += ". " + series
            relation.append(val)

        relation = relation[0] if len(relation) == 1 else relation
        if relation:
            self.update_source_resource({"relation": relation})
def mwdlenrichstatelocatedin(body, ctype,
                             action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by mapping state codes to state names.

    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sli = []
        values = getprop(data, prop)
        for v in values.split(";"):
            if STATE_CODES.get(v):
                sli.append(STATE_CODES[v])
            else:
                sli.append(v)
        setprop(data, prop, "; ".join(sli))

    return json.dumps(data)
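# A minimal runnable sketch of the lookup above, with a tiny assumed
# STATE_CODES mapping (the real module presumably defines the full table).
# Unknown codes pass through unchanged, as in the service.
STATE_CODES = {"UT": "Utah", "NV": "Nevada"}

value = "UT;NV;Somewhere Else"
print("; ".join(STATE_CODES.get(v, v) for v in value.split(";")))
# -> "Utah; Nevada; Somewhere Else"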