def map_creator_and_contributor(self):
    """Map MODS <name> elements to creator and contributor.

    The first name carrying a "creator" role (or, failing that, the
    first name seen) becomes the creator; every remaining name becomes
    a contributor.
    """
    prop = self.root_key + "name"
    if not exists(self.provider_data, prop):
        return
    out = {}
    names = []
    for src in iterify(getprop(self.provider_data, prop)):
        entry = {"name": self.name_from_name_part(
            getprop(src, "namePart", True))}
        if not entry["name"]:
            continue
        entry["type"] = getprop(src, "type", True)
        entry["roles"] = []
        if "role" in src:
            for r in iterify(getprop(src, "role")):
                role = r["roleTerm"]
                if isinstance(role, dict):
                    role = role["#text"]
                entry["roles"].append(role)
        names.append(entry)
    # Creator: first explicitly-tagged creator, else the first name.
    tagged = [n for n in names if "creator" in n["roles"]]
    creator = tagged[0] if tagged else names[0]
    names.remove(creator)
    out["creator"] = creator["name"]
    # Contributors: everyone else.
    contributors = [n["name"] for n in names]
    if contributors:
        out["contributor"] = contributors
    self.update_source_resource(out)
def creator_and_contributor_transform(d, p):
    """Build creator/contributor values from MODS name data at path p.

    The first name with a "creator" role (or the first name overall)
    becomes the creator; remaining names become contributors.
    """
    val = {}
    v = getprop(d, p)
    names = []
    for src in (v if isinstance(v, list) else [v]):
        entry = {"name": name_from_name_part(getprop(src, "namePart", True))}
        if not entry["name"]:
            continue
        entry["type"] = getprop(src, "type", True)
        entry["roles"] = []
        if "role" in src:
            roles = getprop(src, "role")
            for r in (roles if isinstance(roles, list) else [roles]):
                role = r["roleTerm"]
                if isinstance(role, dict):
                    role = role["#text"]
                entry["roles"].append(role)
        names.append(entry)
    # Set creator
    tagged = [n for n in names if "creator" in n["roles"]]
    creator = tagged[0] if tagged else names[0]
    names.remove(creator)
    val["creator"] = creator["name"]
    # Set contributor
    contributors = [n["name"] for n in names]
    if contributors:
        val["contributor"] = contributors
    return val
def ia_identify_object(body, ctype, download="True"):
    """Add a preview "object" URL and admin object_status to an IA record.

    Requires a JSON content type and body; on failure returns a
    plain-text error with an HTTP 500. When the gif preview path is
    missing, the body is returned unchanged.
    """
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, \
            "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"
    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    # Downloads are only queued when explicitly requested.
    status = PENDING if download == "True" else IGNORE
    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}
    return json.dumps(data)
def map_relation(self):
    """Map relatedItem host/series titles to relation.

    The host title (joined with a trailing ". series" when a series
    title is present) becomes the relation; a single value is unwrapped
    from its list.
    """
    prop = self.root_key + "relatedItem"
    if not exists(self.provider_data, prop):
        return
    relation = []
    host = None
    series = None
    for item in iterify(getprop(self.provider_data, prop)):
        title = getprop(item, "titleInfo/title", True)
        if title is None:
            continue
        if item.get("type") == "host":
            host = title
        if item.get("type") == "series":
            series = title
    if host:
        joined = host
        if series:
            joined += ". " + series
        relation.append(joined)
    relation = relation[0] if len(relation) == 1 else relation
    if relation:
        self.update_source_resource({"relation": relation})
def map_spatial_and_subject_and_temporal(self):
    """Map MODS subject data to spatial, temporal and subject fields.

    Cartographic coordinates and geographic terms feed spatial,
    <temporal> feeds temporal, and topic/genre/occupation/title
    sub-elements feed subject.
    """
    path = "/metadata/mods/subject"
    subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]
    spatials = []
    temporals = []
    subjects = []
    if exists(self.provider_data, path):
        for subject in iterify(getprop(self.provider_data, path)):
            if ("cartographics" in subject and
                    "coordinates" in subject["cartographics"]):
                spatials.append(
                    {"name": subject["cartographics"]["coordinates"]})
            if "geographic" in subject:
                for g in iterify(getprop(subject, "geographic")):
                    spatials.append({"name": textnode(g)})
            if "temporal" in subject:
                for t in iterify(getprop(subject, "temporal")):
                    temporals.append(textnode(t))
            for s_path in subject_props:
                for s in iterify(getprop(subject, s_path, True)):
                    subjects.append(s)
    if spatials:
        self.update_source_resource({"spatial": spatials})
    if temporals:
        self.update_source_resource({"temporal": temporals})
    if subjects:
        self.update_source_resource({"subject": subjects})
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.

    A value moves when, after stripping parentheses/periods/question
    marks, it consists entirely of one recognized date pattern. Moved
    values are deduplicated into to_prop; prop is deleted when all of
    its values move, otherwise the moved values are filtered out of it.
    """
    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
    ]

    def cleanup(s):
        # Strip parens, periods and question marks before matching.
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # BUGFIX: normalize to a list once, up front. Previously a string
        # value was only wrapped inside the loop, so the trailing
        # len(values) == len(remove) test and the list-comprehension
        # filter iterated over the string's *characters*, corrupting prop.
        if not isinstance(values, list):
            values = [values]
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in values:
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move only if the whole cleaned value is the match.
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and sets the "dataProvider" field
    of that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set
       (setSpec p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    current = getprop(data, "dataProvider", True)
    if getprop(data, "originalRecord/setSpec") == "p15799coll65":
        setprop(data, "dataProvider", current[0])
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")
    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.

    A value moves when, after stripping parentheses/periods/question
    marks, it consists entirely of one recognized date pattern. Moved
    values are deduplicated into to_prop; prop is deleted when all of
    its values move, otherwise the moved values are filtered out of it.
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
    ]

    def cleanup(s):
        # Strip parens, periods and question marks before matching.
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # BUGFIX: normalize to a list once, up front. Previously a string
        # value was only wrapped inside the loop, so the trailing
        # len(values) == len(remove) test and the list-comprehension
        # filter iterated over the string's *characters*, corrupting prop.
        if not isinstance(values, list):
            values = [values]
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in values:
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move only if the whole cleaned value is the match.
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def ia_identify_object(body, ctype, download="True"):
    """Set the preview "object" URL and admin/object_status on an
    Internet Archive record; return the updated JSON document."""
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"
    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    # Queue the download only when explicitly requested.
    status = PENDING if download == "True" else IGNORE
    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}
    return json.dumps(data)
def map_spatial_and_subject_and_temporal(self):
    """Map MODS subject data to spatial, temporal and subject fields.

    Cartographic coordinates and geographic terms feed spatial,
    <temporal> feeds temporal, and topic/genre/occupation/title
    sub-elements feed subject.
    """
    path = "/metadata/mods/subject"
    subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]
    spatials = []
    temporals = []
    subjects = []
    if exists(self.provider_data, path):
        for subject in iterify(getprop(self.provider_data, path)):
            if ("cartographics" in subject and
                    "coordinates" in subject["cartographics"]):
                spatials.append(
                    {"name": subject["cartographics"]["coordinates"]})
            if "geographic" in subject:
                for g in iterify(getprop(subject, "geographic")):
                    spatials.append({"name": textnode(g)})
            if "temporal" in subject:
                for t in iterify(getprop(subject, "temporal")):
                    temporals.append(textnode(t))
            for s_path in subject_props:
                for s in iterify(getprop(subject, s_path, True)):
                    subjects.append(s)
    if spatials:
        self.update_source_resource({"spatial": spatials})
    if temporals:
        self.update_source_resource({"temporal": temporals})
    if subjects:
        self.update_source_resource({"subject": subjects})
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection ckey
    field with the value passed in.

    mode "overwrite": replace the existing value (creating the parent
    dict for a two-part path when missing); "append": extend existing
    value(s) with the new one(s); anything else fills only a missing or
    empty field, wrapping in a list when multivalue is true.
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(
        field, mode, value, multivalue))
    if value:  # no value - don't bother
        if mode == 'overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                parent, child = tuple(field.lstrip('/').split('/', 1))
                if parent not in data:
                    data[parent] = {}
                data[parent][child] = value
        elif mode == 'append':
            combined = []
            if exists(data, field):
                current = getprop(data, field)
                if isinstance(current, list):
                    combined.extend(current)
                else:
                    combined.append(current)
            if isinstance(value, list):
                combined.extend(value)
            else:
                combined.append(value)
            setprop(data, field, combined)
        else:  # fill blanks
            if (not exists(data, field)
                    or not getprop(data, field, keyErrorAsNone=True)):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and sets the "dataProvider" field
    of that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set
       (setSpec p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    current = getprop(data, "dataProvider", True)
    if getprop(data, "originalRecord/setSpec") == "p15799coll65":
        setprop(data, "dataProvider", current[0])
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")
    return json.dumps(data)
def map_source_resource(self):
    """Extend the base sourceResource mapping with Solr *_ss fields,
    merging (deduplicated) with any values the base mapping produced."""
    super(CDLJSONMapper, self).map_source_resource()
    field_map = {
        "alternative_title_ss": "alternative",
        "contributor_ss": "contributor",
        "creator_ss": "creator",
        "date_ss": "date",
        "description": "description",
        "extent_ss": "extent",
        "format_ss": "format",
        "genre_ss": "hasType",
        "identifier_ss": "identifier",
        "language_ss": "language",
        "coverage_ss": "spatial",
        "publisher_ss": "publisher",
        "relation_ss": "relation",
        "rights_ss": "rights",
        "rights_note_ss": "rights",
        "rights_date_ss": "rights",
        "rightsholder_ss": "rights",
        "subject_ss": "subject",
        "temporal_ss": "temporal",
        "title_ss": "title",
        "type_ss": "type"
    }
    for source, dest in field_map.iteritems():
        values = iterify(getprop(self.provider_data, source, True))
        if values:
            existing = getprop(self.mapped_data["sourceResource"],
                               dest, True)
            if existing:
                # Merge and dedupe with whatever the base mapping set.
                values = list(set(values + existing))
            self.update_source_resource({dest: values})
def map_creator_and_contributor(self):
    """Map MODS <name> elements by role: names with a "creator" role
    become creator, names with a "contributor" role become contributor."""
    prop = self.root_key + "name"
    if not exists(self.provider_data, prop):
        return
    out = {}
    names = []
    for src in iterify(getprop(self.provider_data, prop)):
        entry = {"name": self.name_from_name_part(
            getprop(src, "namePart", True))}
        if not entry["name"]:
            continue
        entry["type"] = getprop(src, "type", True)
        entry["roles"] = []
        if "role" in src:
            for r in iterify(getprop(src, "role")):
                role = r["roleTerm"]
                if isinstance(role, dict):
                    role = role["#text"]
                entry["roles"].append(role)
        names.append(entry)
    # Set creator
    creators = [n["name"] for n in names if "creator" in n["roles"]]
    if creators:
        out["creator"] = creators
    # Set contributor
    contributors = [n["name"] for n in names
                    if "contributor" in n["roles"]]
    if contributors:
        out["contributor"] = contributors
    self.update_source_resource(out)
def ia_identify_object(body, ctype, download="True"):
    """Set the preview "object" URL and admin/object_status on an
    Internet Archive record; return the updated JSON document."""
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"
    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    # Queue the download only when explicitly requested.
    status = PENDING if download == "True" else IGNORE
    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}
    return json.dumps(data)
def map_format(self):
    """Map format, preferring <medium> over <format>."""
    for prop in ("medium", "format"):
        if exists(self.provider_data, prop):
            self.update_source_resource(
                {"format": getprop(self.provider_data, prop)})
            break
def map_spatial(self):
    """Map spatial from subject/hierarchicalGeographic (with a joined
    "name" of city/county/state/country), falling back to non-code
    originInfo/place placeTerm text values."""
    spatial = []
    prop = "subject"
    if exists(self.provider_data, prop):
        for subj in iterify(getprop(self.provider_data, prop)):
            if "hierarchicalGeographic" in subj:
                hg = subj["hierarchicalGeographic"]
                hg["name"] = ", ".join(filter(None, [
                    hg.get("city"),
                    hg.get("county"),
                    hg.get("state"),
                    hg.get("country"),
                ]))
                # Last hierarchicalGeographic seen wins.
                spatial = [hg]
    prop = "originInfo/place"
    if not spatial and exists(self.provider_data, prop):
        for place in iterify(getprop(self.provider_data, prop)):
            if "placeTerm" in place:
                for term in iterify(place["placeTerm"]):
                    if "type" in term and term["type"] != "code":
                        spatial.append(term["#text"])
    if spatial:
        self.update_source_resource({"spatial": spatial})
def map_subject(self):
    # Mapped from subject and genre
    #
    # Per discussion with Amy on 10 April 2014, don't worry about
    # checking whether heading maps to authority file. Amy simplified the
    # crosswalk.
    #
    # TODO: When present, we should probably pull in the valueURI and
    # authority values into the sourceResource.subject - this would
    # represent an index/API change, however.
    subject = []
    if exists(self.provider_data, "subject"):
        for v in iterify(getprop(self.provider_data, "subject")):
            if "topic" in v:
                # BUGFIX: inspect the type of v["topic"], not v. The old
                # isinstance(v, basestring) checked the subject element
                # itself, so plain-string topics fell through to the
                # error branch (and a string v would have raised
                # TypeError on v["topic"]).
                topic = v["topic"]
                if isinstance(topic, basestring):
                    subject.append(topic)
                elif isinstance(topic, dict):
                    subject.append(topic.get("#text"))
                else:
                    logger.error("Topic is not a string nor a dict; %s" %
                                 self.provider_data["_id"])
            if exists(v, "name/namePart"):
                subject.append(getprop(v, "name/namePart"))
    if exists(self.provider_data, "genre"):
        for v in iterify(getprop(self.provider_data, "genre")):
            if isinstance(v, basestring):
                subject.append(v)
            elif isinstance(v, dict):
                subject.append(v.get("#text"))
            else:
                logger.error("Genre is not a string nor a dict; %s" %
                             self.provider_data["_id"])
    if subject:
        self.update_source_resource({"subject": subject})
def map_subject_spatial_and_temporal(self, geographic_subject=True):
    """Map MODS <subject> to subject, spatial and temporal.

    Subject strings are built by joining name/topic/geographic parts
    with "--"; geographic and hierarchicalGeographic data (plus any
    cartographic coordinates) feed spatial; <temporal> feeds temporal.
    """
    prop = self.root_key + "subject"
    if not exists(self.provider_data, prop):
        return
    ret_dict = {"subject": [], "spatial": [], "temporal": []}
    for s in iterify(getprop(self.provider_data, prop)):
        subject = []
        if "name" in s:
            namepart = getprop(s, "name/namePart", True)
            name = self.name_from_name_part(namepart)
            if name and name not in subject:
                subject.append(name)
        if "topic" in s:
            for t in iterify(s["topic"]):
                if t and t not in subject:
                    subject.append(t)
        if "geographic" in s:
            for g in iterify(s["geographic"]):
                if g:
                    if geographic_subject and g not in subject:
                        subject.append(g)
                    if g not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(g)
        if "hierarchicalGeographic" in s:
            for h in iterify(s["hierarchicalGeographic"]):
                if isinstance(h, dict):
                    # TODO: use set logic and declarative style, as
                    # in MissouriMapper, instead of deleting list
                    # elements
                    for k in h.keys():
                        if k not in ["city", "county", "state",
                                     "country", "coordinates"]:
                            del h[k]
                    if h not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(h)
                    if "country" in h:
                        ret_dict["spatial"].append(h["country"])
        coords = getprop(s, "cartographics/coordinates", True)
        if coords and coords not in ret_dict["spatial"]:
            ret_dict["spatial"].append(coords)
        if "temporal" in s:
            ret_dict["temporal"].append(s["temporal"])
        ret_dict["subject"].append("--".join(subject))
    # Drop empty result lists before updating.
    for k in ret_dict.keys():
        if not ret_dict[k]:
            del ret_dict[k]
    self.update_source_resource(ret_dict)
def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD
    format. Parameter "geoprop" specifies the property name containing
    lat/long coords.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from the original document.
    cho_pipeline = dict(CHO_TRANSFORMER.get(provider, {}),
                        **CHO_TRANSFORMER["common"])
    for p in cho_pipeline:
        if exists(data, p):
            out["sourceResource"].update(cho_pipeline[p](data, p))

    agg_pipeline = dict(AGGREGATION_TRANSFORMER.get(provider, {}),
                        **AGGREGATION_TRANSFORMER["common"])
    for p in agg_pipeline:
        if exists(data, p):
            out.update(agg_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field.
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL.
    if provider == "BPL":
        try:
            ipo = (getprop(out, "dataProvider") + ". " +
                   getprop(out, "sourceResource/isPartOf"))
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except:
            pass

    # Strip out keys with None/null values.
    out = dict((k, v) for (k, v) in out.items() if v)
    return json.dumps(out)
def update_relation(self):
    """Prefix sourceResource/relation with dataProvider, collapsing
    doubled periods; silently does nothing when either field is missing
    or not string-concatenable."""
    try:
        joined = (getprop(self.mapped_data, "dataProvider") + ". " +
                  getprop(self.mapped_data, "sourceResource/relation"))
        self.update_source_resource(
            {"relation": joined.replace("..", ".").strip()})
    except:
        pass
def map_date(self):
    """Map date from <date>, falling back to <created>."""
    for prop in ("date", "created"):
        if exists(self.provider_data, prop):
            self.update_source_resource(
                {"date": getprop(self.provider_data, prop)})
            break
def map_object(self):
    """Set "object" from a location URL whose access attribute is
    "preview"."""
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return
    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for url in iterify(getprop(location, "url")):
            if (exists(url, "access") and
                    url["access"].lower() == "preview"):
                self.mapped_data.update({"object": textnode(url)})
def map_subject_spatial_and_temporal(self, geographic_subject=True):
    """Map MODS <subject> to subject, spatial and temporal.

    Subject strings are built by joining name/topic/geographic parts
    with "--"; geographic and hierarchicalGeographic data (plus any
    cartographic coordinates) feed spatial; <temporal> feeds temporal.
    """
    prop = self.root_key + "subject"
    if not exists(self.provider_data, prop):
        return
    ret_dict = {"subject": [], "spatial": [], "temporal": []}
    for s in iterify(getprop(self.provider_data, prop)):
        subject = []
        if "name" in s:
            namepart = getprop(s, "name/namePart", True)
            name = self.name_from_name_part(namepart)
            if name and name not in subject:
                subject.append(name)
        if "topic" in s:
            for t in iterify(s["topic"]):
                if t and t not in subject:
                    subject.append(t)
        if "geographic" in s:
            for g in iterify(s["geographic"]):
                if g:
                    if geographic_subject and g not in subject:
                        subject.append(g)
                    if g not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(g)
        if "hierarchicalGeographic" in s:
            for h in iterify(s["hierarchicalGeographic"]):
                if isinstance(h, dict):
                    # TODO: use set logic and declarative style, as
                    # in MissouriMapper, instead of deleting list
                    # elements
                    for k in h.keys():
                        if k not in ["city", "county", "state",
                                     "country", "coordinates"]:
                            del h[k]
                    if h not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(h)
                    if "country" in h:
                        ret_dict["spatial"].append(h["country"])
        coords = getprop(s, "cartographics/coordinates", True)
        if coords and coords not in ret_dict["spatial"]:
            ret_dict["spatial"].append(coords)
        if "temporal" in s:
            ret_dict["temporal"].append(s["temporal"])
        ret_dict["subject"].append("--".join(subject))
    # Drop empty result lists before updating.
    for k in ret_dict.keys():
        if not ret_dict[k]:
            del ret_dict[k]
    self.update_source_resource(ret_dict)
def get_enrich_dir(ingestion_document_id):
    """Return the enrich-process data directory for an ingestion doc.

    Raises AssertionError when the enrich process has not completed.
    """
    ingestion_doc = Couch().dashboard_db[ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        raise AssertionError(
            "Cannot save Avro files, enrich process did not complete")
    return getprop(ingestion_doc, "enrich_process/data_dir")
def map_object(self):
    """Set "object" from a location URL whose access attribute is
    "preview"."""
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return
    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for url in iterify(getprop(location, "url")):
            if (exists(url, "access") and
                    url["access"].lower() == "preview"):
                self.mapped_data.update({"object": textnode(url)})
def _get_media_type(d):
    """Return the first internetMediaType found among d's
    physicalDescription entries, or None."""
    descriptions = iterify(getprop(d, "physicalDescription", True))
    for entry in filter(None, descriptions):
        try:
            return getprop(entry, "internetMediaType")
        except KeyError:
            pass
    return None
def download_preview(body, ctype):
    """
    Responsible for: downloading a preview for a document.

    Usage: as a module in a separate pipeline, to be run on existing
    documents in the repository to download the thumbnails.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check the "admin/object_status" field.
    status = None
    try:
        status = getprop(data, "admin/object_status")
        if status in ["error", "downloaded"]:
            logger.debug("Status is %s, doing nothing" % status)
            return body
    except KeyError as e:
        logger.error(e.args[0])
        return json.dumps(set_error(data))

    # Thumbnail URL.
    try:
        url = getprop(data, "object/@id")
    except KeyError as e:
        logger.error(e.args[0])
        return json.dumps(set_error(data))

    # Document ID.
    try:
        doc_id = getprop(data, "id")
    except KeyError as e:
        logger.error(e.args[0])
        return json.dumps(set_error(data))

    # Only actually fetch the image when the status is pending.
    download = (status == "pending")
    (relative_fname, mime, status) = download_image(url, doc_id, download)
    if not relative_fname:
        logger.error("Cannot save thumbnail from: %s." % (url))

    # So everything is OK and the file is on disk.
    doc = update_document(data, relative_fname, mime, status)
    return json.dumps(doc)
def map_date(self):
    """<mods:originInfo><mods:dateCreated>"""
    prop = self.root_key + "originInfo"
    dates = [textnode(d)
             for oi in iterify(getprop(self.provider_data, prop, True))
             for d in iterify(getprop(oi, "dateCreated", True))]
    if dates:
        self.update_source_resource({"date": dates})
def map_date(self):
    """<mods:originInfo><mods:dateCreated>"""
    prop = self.root_key + "originInfo"
    dates = []
    for origin_info in iterify(getprop(self.provider_data, prop, True)):
        for created in iterify(getprop(origin_info, "dateCreated", True)):
            dates.append(textnode(created))
    if dates:
        self.update_source_resource({"date": dates})
def map_title(self):
    """Map titleInfo/title values that carry no type qualifier."""
    path = "/metadata/mods/titleInfo"
    titles = []
    if exists(self.provider_data, path):
        for title_info in iterify(getprop(self.provider_data, path)):
            if (exists(title_info, "title") and
                    not exists(title_info, "title/type")):
                titles.append(textnode(getprop(title_info, "title")))
    if titles:
        self.update_source_resource({"title": titles})
def map_spatial(self):
    """<mods:subject><mods:geographic>"""
    prop = self.root_key + "subject"
    geographics = []
    for subj in iterify(getprop(self.provider_data, prop, True)):
        for g in iterify(getprop(subj, "geographic", True)):
            geographics.append(textnode(g))
    if geographics:
        self.update_source_resource({"spatial": geographics})
def map_extent(self):
    """Collect unique extent values from physicalDescription."""
    extents = set()
    for description in iterify(
            getprop(self.provider_data, "physicalDescription", True)):
        if exists(description, "extent"):
            extents.update(iterify(getprop(description, "extent", True)))
    if extents:
        self.update_source_resource({"extent": list(extents)})
def _get_media_type():
    """Return the first internetMediaType found among the record's
    physicalDescription entries, or None.

    NOTE(review): takes no parameters but reads ``self`` — presumably a
    closure defined inside a mapper method; confirm against the caller.
    """
    descriptions = iterify(getprop(self.provider_data,
                                   self.root_key + "physicalDescription",
                                   True))
    for entry in filter(None, descriptions):
        try:
            # BUGFIX: don't pass keyErrorAsNone=True here — it made
            # getprop return None instead of raising, so the except
            # branch was dead and the first non-empty entry ended the
            # search with None even when a later entry had the key.
            # (Matches the parallel _get_media_type(d) implementation.)
            return getprop(entry, "internetMediaType")
        except KeyError:
            pass
    return None
def map_rights(self):
    """Map rights by OAI set: 'eda' gets a CC license string, 'cna'
    uses the record's accessCondition, everything else a Harvard
    default statement."""
    set_spec = getprop(self.provider_data, "header/setSpec", True)
    if set_spec == 'eda':
        rights = 'CC BY-NC-ND 3.0 http://www.edickenson.org/terms'
    elif set_spec == 'cna':
        rights = getprop(self.provider_data,
                         self.root_key + 'accessCondition', True)
    else:
        rights = 'Held in the collections of Harvard University.'
    self.update_source_resource({'rights': rights})
def map_is_shown_at(self):
    """Set isShownAt from the location URL flagged as the primary
    "object in context" link."""
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return
    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for url in iterify(getprop(location, "url")):
            if (exists(url, "usage") and exists(url, "access") and
                    url["usage"].lower().startswith("primary") and
                    url["access"].lower() == "object in context"):
                self.mapped_data.update({"isShownAt": textnode(url)})
def enrichformat(body, ctype, action="enrich-format",
                 prop="isShownAt/format",
                 alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and enriches the "format" field
    of that document by:

    a) setting the format to be all lowercase
    b) running through a set of cleanup regex's (e.g. image/jpg ->
       image/jpeg)
    c) checking to see if the field is a valid IMT, and moving it to a
       separatee field if not. See
       http://www.iana.org/assignments/media-types for list of valid
       media-types. We do not require that a subtype be defined.
    d) Remove any extra text after the IMT

    By default works on the 'format' field, but can be overridden by
    passing the name of the field to use as the 'prop' parameter.
    Non-IMT's are moved the field defined by the 'alternate' parameter.
    """
    REGEXPS = (('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'),
               ('img/jpg', 'image/jpeg'), ('\W$', ''))
    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']

    def cleanup(s):
        # Lowercase, apply fixups, then keep only the leading IMT token.
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)

    def is_imt(s):
        imt_regexes = [re.compile('^' + x + '(/|\Z)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        imt_values = []
        physical = getprop(data, alternate) if exists(data, alternate) else []
        if not isinstance(physical, list):
            physical = [physical]

        for s in (v if not isinstance(v, basestring) else [v]):
            if is_imt(cleanup(s)):
                imt_values.append(cleanup(s))
            else:
                physical.append(s)

        if imt_values:
            setprop(data, prop,
                    imt_values[0] if len(imt_values) == 1 else imt_values)
        else:
            setprop(data, prop, None)

        if physical:
            setprop(data, alternate,
                    physical[0] if len(physical) == 1 else physical)

    return json.dumps(data)
def is_part_of_transform(d, p):
    """Build an isPartOf value from relatedItem entries of type "series".

    Returns {"isPartOf": title-or-titles} or {} when nothing matches.
    """
    ipo = []
    v = getprop(d, p)
    for s in (v if isinstance(v, list) else [v]):
        # BUGFIX: inspect each item "s", not the container "v" — when v
        # is a list, '"type" in v' was a membership test on the list and
        # never matched, so series titles were silently dropped.
        if "type" in s and s["type"] == "series":
            ipo.append(getprop(s, "titleInfo/title", True))
    ipo = filter(None, ipo)
    ipo = ipo[0] if len(ipo) == 1 else ipo
    return {"isPartOf": ipo} if ipo else {}
def origin_info_transform(d, p):
    """Build date and publisher values from a MODS originInfo element.

    Date comes from dateCreated (or a keyDate-flagged dateOther); list
    dates are folded into a {displayDate, begin, end} dict. Publisher
    strings are assembled as "place: publisher[, dateIssued]".
    """
    val = {}
    v = getprop(d, p)

    # date
    date = None
    if "dateCreated" in v:
        date = v["dateCreated"]
    if not date and getprop(v, "dateOther/keyDate", True) == "yes":
        date = getprop(v, "dateOther/#text", True)
    if isinstance(date, list):
        dd = {}
        for i in date:
            if isinstance(i, basestring):
                dd["displayDate"] = i
            elif "point" in i:
                # BUGFIX: store the element's date text, not the "point"
                # attribute — the old code set begin/end to the literal
                # strings "start"/"end".
                if i["point"] == "start":
                    dd["begin"] = i.get("#text")
                else:
                    dd["end"] = i.get("#text")
            else:
                # Odd date? Log error and investigate
                logger.error("Invalid date in record %s" % d["_id"])
        date = dd if dd else None
    if date and date != "unknown":
        val["date"] = date

    # publisher
    if "publisher" in v:
        val["publisher"] = []
        pub = v["publisher"]
        di = v.get("dateIssued", None)
        di = di[0] if isinstance(di, list) else di
        # Get all placeTerms of type "text"
        terms = []
        if "place" in v:
            place = v["place"]
            for pl in (place if isinstance(place, list) else [place]):
                if getprop(pl, "placeTerm/type", True) == "text":
                    terms.append(getprop(pl, "placeTerm/#text", True))
        for t in filter(None, terms):
            if di:
                val["publisher"].append("%s: %s, %s" % (t, pub, di))
            else:
                val["publisher"].append("%s: %s" % (t, pub))
        if len(val["publisher"]) == 1:
            val["publisher"] = val["publisher"][0]

    return val
def map_is_part_of(self):
    """Collect relatedItem titles into the "relation" field."""
    prop = self.root_key + "relatedItem"
    relations = {"relation": []}
    if exists(self.provider_data, prop):
        for item in iterify(getprop(self.provider_data, prop)):
            if exists(item, "titleInfo/title"):
                relations["relation"].append(
                    getprop(item, "titleInfo/title"))
    self.update_source_resource(self.clean_dict(relations))
def map_rights(self):
    """Map accessCondition elements typed "local rights statement"."""
    path = "/metadata/mods/accessCondition"
    rights = []
    if exists(self.provider_data, path):
        for condition in iterify(getprop(self.provider_data, path)):
            if getprop(condition, "type", True) == "local rights statement":
                rights.append(textnode(condition))
    if rights:
        self.update_source_resource({"rights": rights})
def map_language(self):
    """Collect unique language terms from language/languageTerm."""
    languages = set()
    for lang in iterify(getprop(self.provider_data, "language", True)):
        for term in iterify(getprop(lang, "languageTerm", True)):
            text = self.txt(term)
            if text:
                languages.add(text)
    if languages:
        self.update_source_resource({"language": list(languages)})
def copyprop(body, ctype, prop=None, to_prop=None, skip_if_exists=None):
    """Copies value in one prop to another prop.

    For use with string and/or list prop value types. If to_prop exists,
    its value is iterified then extended with the iterified value of
    prop. If the to_prop parent prop (ie hasView in hasView/rights) does
    not exist, the from_prop value is not copied and an error is logged.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    skip_if_exists -- set to True to not copy if to_prop exists
    """
    def is_string_or_list(value):
        return isinstance(value, (basestring, list))

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not (exists(data, to_prop) and skip_if_exists):
        if exists(data, prop):
            if exists(data, to_prop):
                from_value = getprop(data, prop)
                if not is_string_or_list(from_value):
                    msg = "Prop %s " % prop + \
                          "is not a string/list for record %s" % data["id"]
                    logger.error(msg)
                    return body
                to_value = getprop(data, to_prop)
                if not is_string_or_list(to_value):
                    msg = "Prop %s " % to_prop + \
                          "is not a string/list for record %s" % data["id"]
                    logger.error(msg)
                    return body
                to_value = iterify(to_value)
                to_value.extend(iterify(from_value))
                setprop(data, to_prop, to_value)
            else:
                try:
                    setprop(data, to_prop, getprop(data, prop))
                except Exception as e:
                    logger.error("Could not copy %s to %s: %s" %
                                 (prop, to_prop, e))
    # BUGFIX: the function previously fell off the end and returned
    # None on every successful path, handing the pipeline an empty body.
    return json.dumps(data)
def map_title(self):
    """<mods:titleInfo><mods:title>"""
    prop = self.root_key + "titleInfo"
    titles = []
    for title_info in iterify(getprop(self.provider_data, prop, True)):
        for t in iterify(getprop(title_info, "title", True)):
            titles.append(textnode(t))
    if titles:
        self.update_source_resource({"title": titles})
def is_part_of_transform_harvard(d, p):
    """Build an isPartOf value from relatedItem entries of type "series".

    Returns {"isPartOf": title-or-titles} or {} when nothing matches.
    """
    ipo = []
    v = getprop(d, p)
    for s in (v if isinstance(v, list) else [v]):
        # BUGFIX: inspect each item "s", not the container "v" — when v
        # is a list, '"type" in v' was a membership test on the list and
        # never matched, so series titles were silently dropped.
        if "type" in s and s["type"] == "series":
            ipo.append(getprop(s, "titleInfo/title", True))
    ipo = filter(None, ipo)
    ipo = ipo[0] if len(ipo) == 1 else ipo
    return {"isPartOf": ipo} if ipo else {}
def origin_info_transform_harvard(d, p):
    """Build date and publisher values from a MODS originInfo element.

    Date comes from dateCreated (or a keyDate-flagged dateOther); list
    dates are folded into a {displayDate, begin, end} dict. Publisher
    strings are assembled as "place: publisher[, dateIssued]".
    """
    val = {}
    v = getprop(d, p)

    # date
    date = None
    if "dateCreated" in v:
        date = v["dateCreated"]
    if not date and getprop(v, "dateOther/keyDate", True) == "yes":
        date = getprop(v, "dateOther/#text", True)
    if isinstance(date, list):
        dd = {}
        for i in date:
            if isinstance(i, basestring):
                dd["displayDate"] = i
            elif "point" in i:
                # BUGFIX: store the element's date text, not the "point"
                # attribute — the old code set begin/end to the literal
                # strings "start"/"end".
                if i["point"] == "start":
                    dd["begin"] = i.get("#text")
                else:
                    dd["end"] = i.get("#text")
            else:
                # Odd date? Log error and investigate
                logger.error("Invalid date in record %s" % d["_id"])
        date = dd if dd else None
    if date and date != "unknown":
        val["date"] = date

    # publisher
    if "publisher" in v:
        val["publisher"] = []
        pub = v["publisher"]
        di = v.get("dateIssued", None)
        di = di[0] if isinstance(di, list) else di
        # Get all placeTerms of type "text"
        terms = []
        if "place" in v:
            place = v["place"]
            for pl in (place if isinstance(place, list) else [place]):
                if getprop(pl, "placeTerm/type", True) == "text":
                    terms.append(getprop(pl, "placeTerm/#text", True))
        for t in filter(None, terms):
            if di:
                val["publisher"].append("%s: %s, %s" % (t, pub, di))
            else:
                val["publisher"].append("%s: %s" % (t, pub))
        if len(val["publisher"]) == 1:
            val["publisher"] = val["publisher"][0]

    return val
def map_subject(self):
    """<mods:subject><mods:topic>"""
    prop = self.root_key + "subject"
    topics = []
    for subj in iterify(getprop(self.provider_data, prop, True)):
        for t in iterify(getprop(subj, "topic", True)):
            topics.append(textnode(t))
    if topics:
        self.update_source_resource({"subject": topics})