def map_datafield_tags(self):
    """Run the configured mapping functions over every numeric datafield.

    Each entry found under ``datafield`` in ``self.provider_data`` is
    inspected; entries whose ``tag`` cannot be converted to an integer are
    skipped.  For the remaining entries every ``(matcher, func_tuples)`` pair
    in ``self.mapping_dict`` is consulted and, when the matcher accepts the
    tag, each function tuple is invoked:

    * ``(func, codes)``        -> ``func(_dict, tag, codes)``
    * ``(func, index, codes)`` -> ``func(_dict, tag, index, codes)``

    Side effects: sets ``self.datafield_086_or_087`` to True when a tag
    "086" or "087" is seen.
    """
    for item in iterify(getprop(self.provider_data, "datafield")):
        for _dict in iterify(item):
            tag = _dict.get("tag", None)
            # Skip cases where there is no tag or where tag == "ERR".
            # Only conversion failures are swallowed so that unrelated
            # errors (and KeyboardInterrupt) still propagate.
            try:
                int(tag)
            except (TypeError, ValueError):
                continue

            if tag == "086" or tag == "087":
                self.datafield_086_or_087 = True

            # Tag 264 is only mapped when its second indicator is "1".
            if tag == "264" and _dict.get("ind2") != "1":
                continue

            for match, func_tuples in self.mapping_dict.items():
                if not match(tag):
                    continue
                for func_tuple in func_tuples:
                    if len(func_tuple) == 2:
                        func, codes = func_tuple
                        func(_dict, tag, codes)
                    elif len(func_tuple) == 3:
                        func, index, codes = func_tuple
                        func(_dict, tag, index, codes)
def map_creator(self):
    """Map sourceResource.creator from the record's ``name`` entries.

    For every name entry a display string is assembled from its ``namePart``
    values, joined with ", ".  Entries typed "personal" and "corporate" are
    collected separately (without duplicates); personal creators win, and
    corporate creators are only used when no personal creator was found.
    """
    prop = "name"
    if not exists(self.provider_data, prop):
        return

    personal = []
    corporate = []
    for entry in iterify(getprop(self.provider_data, prop)):
        # Slots: [plain or family name, given name / terms of address, date]
        parts = [None, None, None]
        for part in iterify(entry.get("namePart")):
            if isinstance(part, basestring):
                parts[0] = part
            elif isinstance(part, dict):
                part_type = part.get("type")
                if part_type == "family":
                    parts[0] = part.get("#text")
                elif part_type in ("given", "termsOfAddress"):
                    parts[1] = part.get("#text")
                elif part_type == "date":
                    parts[2] = part.get("#text")

        # Empty slots are dropped before joining.
        full_name = ", ".join(filter(None, parts))

        entry_type = entry.get("type")
        if entry_type == "personal" and full_name not in personal:
            personal.append(full_name)
        elif entry_type == "corporate" and full_name not in corporate:
            corporate.append(full_name)

    if personal:
        self.update_source_resource({"creator": personal})
    elif corporate:
        self.update_source_resource({"creator": corporate})
def map_creator_and_contributor(self):
    """Map sourceResource.creator / contributor from ``<root_key>name``.

    Each name entry with a non-empty ``namePart`` is classified by its
    lower-cased role terms: a "creator" role puts the namePart into
    ``creator``; otherwise a "contributor" role puts it into
    ``contributor``.  Entries whose roles cannot be read are logged and
    skipped.  The result is passed through ``self.clean_dict`` before being
    handed to ``self.update_source_resource``.
    """
    prop = self.root_key + "name"
    mapped_props = {
        "creator": [],
        "contributor": []
    }

    if exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            name = s.get("namePart")
            if not name:
                continue

            role_terms = []
            try:
                for r in iterify(s.get("role")):
                    role_term = r.get("roleTerm")
                    if isinstance(role_term, dict):
                        role_terms.append(role_term.get("#text").lower())
                    elif isinstance(role_term, list):
                        for rt in role_term:
                            role_terms.append(rt.lower())
                    else:
                        role_terms.append(role_term.lower())
            except Exception as e:
                # The previous format string contained "%\n", which is not a
                # valid conversion and made this handler raise ValueError
                # instead of logging.
                logger.error("Error getting name/role/roleTerm for " +
                             "record %s\nException:\n%s" %
                             (self.provider_data["_id"], e))
                continue

            if "creator" in role_terms:
                mapped_props["creator"].append(name)
            elif "contributor" in role_terms:
                mapped_props["contributor"].append(name)

    self.update_source_resource(self.clean_dict(mapped_props))
def uva_extract_records(self, content, url):
    """Yield a single ``(error, records)`` pair extracted from ``content``.

    ``content`` is expected to hold one MODS item under ``"mods"`` or
    ``"mods:mods"`` (the ``mods:`` prefix is then applied to the child keys
    that are read).  The item's ``_id`` is taken from an identifier of type
    "uri"; failing that, from a location url whose usage is
    "primary display".  The item is appended to ``records`` each time an
    ``_id`` is assigned.

    ``error`` is None when at least one record was found, otherwise a
    message naming the ``url`` that was requested.
    """
    error = None
    records = []

    # Handle "mods:<key>" in UVA book collection
    key_prefix = ""
    if "mods:mods" in content:
        key_prefix = "mods:"

    if key_prefix + "mods" in content:
        item = content[key_prefix + "mods"]
        for _id_dict in iterify(item[key_prefix + "identifier"]):
            if _id_dict["type"] == "uri":
                item["_id"] = _id_dict["#text"]
                records.append(item)

        if "_id" not in item:
            # Handle location url
            for _loc_dict in iterify(item[key_prefix + "location"]):
                if "url" not in _loc_dict:
                    continue
                # A dedicated loop name keeps the ``url`` parameter intact
                # for the error message below; iterify lets a single url
                # dict be handled like a list of them.
                for loc_url in iterify(_loc_dict["url"]):
                    if (isinstance(loc_url, dict) and
                            loc_url.get("usage") == "primary display"):
                        item["_id"] = loc_url.get("#text")
                        records.append(item)

    if not records:
        error = "Error, no records found in content from URL %s" % url

    yield error, records
def map_date_and_publisher(self):
    """Map sourceResource.date and publisher from ``<root_key>originInfo``.

    For each originInfo entry:

    * ``dateCreated`` values flagged ``keyDate == "yes"`` are collected.  If
      the last one already looks like a range (contains "-" or "/") it is
      used as is; with several dates a "first-last" range is built;
      otherwise the single date is used.
    * ``publisher`` is appended to the publisher list.

    The result is passed through ``self.clean_dict`` and then to
    ``self.update_source_resource``.
    """
    prop = self.root_key + "originInfo"
    _dict = {
        "date": None,
        "publisher": []
    }

    if exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            if "dateCreated" in s:
                date_list = iterify(s.get("dateCreated"))
                # Only dict entries carrying text can be key dates; plain
                # strings and entries without "#text" used to crash here.
                date = [t.get("#text") for t in date_list
                        if isinstance(t, dict) and
                        t.get("keyDate") == "yes" and t.get("#text")]
                # Without any key date there is nothing to map (indexing
                # the empty list previously raised IndexError).
                if date:
                    # Check if last date is already a range
                    if "-" in date[-1] or "/" in date[-1]:
                        _dict["date"] = date[-1]
                    elif len(date) > 1:
                        _dict["date"] = "%s-%s" % (date[0], date[-1])
                    else:
                        _dict["date"] = date[0]

            if "publisher" in s:
                _dict["publisher"].append(s.get("publisher"))

    self.update_source_resource(self.clean_dict(_dict))
def map_creator_and_contributor(self):
    """Map sourceResource.creator / contributor from ``<root_key>name``.

    A name entry's ``namePart`` goes to ``creator`` when its role terms
    include "creator", otherwise to ``contributor`` when they include
    "contributor".  Entries whose roles cannot be read are logged and
    skipped.  The cleaned result is passed to
    ``self.update_source_resource``.
    """
    prop = self.root_key + "name"
    _dict = {
        "creator": [],
        "contributor": []
    }

    if exists(self.provider_data, prop):
        for s in iterify(getprop(self.provider_data, prop)):
            name = s.get("namePart")
            if not name:
                continue

            try:
                role_terms = [r.get("roleTerm")
                              for r in iterify(s.get("role"))]
            except Exception:
                # Was a bare ``except:``, which would also have swallowed
                # KeyboardInterrupt / SystemExit.
                logger.error("Error getting name/role/roleTerm for " +
                             "record %s" % self.provider_data["_id"])
                continue

            if "creator" in role_terms:
                _dict["creator"].append(name)
            elif "contributor" in role_terms:
                _dict["contributor"].append(name)

    self.update_source_resource(self.clean_dict(_dict))
def map_subject(self):
    """Map sourceResource.subject from the record's subject and genre.

    Collected values, in order:

    * for each ``subject`` entry, its ``topic`` (a string, or the "#text"
      of a dict) and its ``name/namePart`` when present;
    * each ``genre`` value (a string, or the "#text" of a dict).

    Values of any other type are reported through ``logger.error``.  The
    list is passed to ``self.update_source_resource`` when non-empty.
    """
    # Mapped from subject and genre
    #
    # Per discussion with Amy on 10 April 2014, don't worry about
    # checking whether heading maps to authority file. Amy simplified the
    # crosswalk.
    #
    # TODO: When present, we should probably pull in the valueURI and
    # authority values into the sourceResource.subject - this would
    # represent an index/API change, however.
    subject = []

    if exists(self.provider_data, "subject"):
        for v in iterify(getprop(self.provider_data, "subject")):
            if "topic" in v:
                topic = v["topic"]
                # The type test must look at the topic itself; it used to
                # test the enclosing subject entry, so string topics always
                # ended up in the error branch.
                if isinstance(topic, basestring):
                    subject.append(topic)
                elif isinstance(topic, dict):
                    subject.append(topic.get("#text"))
                else:
                    logger.error("Topic is not a string nor a dict; %s" %
                                 self.provider_data["_id"])

            if exists(v, "name/namePart"):
                subject.append(getprop(v, "name/namePart"))

    if exists(self.provider_data, "genre"):
        for v in iterify(getprop(self.provider_data, "genre")):
            if isinstance(v, basestring):
                subject.append(v)
            elif isinstance(v, dict):
                subject.append(v.get("#text"))
            else:
                logger.error("Genre is not a string nor a dict; %s" %
                             self.provider_data["_id"])

    if subject:
        self.update_source_resource({"subject": subject})
def map_spatial(self):
    """Map sourceResource.spatial.

    Preferred source is a subject's ``hierarchicalGeographic`` dict, which
    gets a ``name`` built from its city, county, state and country (joined
    with ", ") and becomes the only spatial value; when several subjects
    carry one, the last wins.  Note that ``name`` is written into the
    provider data's own dict.  If no such dict exists, the "#text" of every
    ``originInfo/place`` placeTerm whose type is not "code" is used instead.
    """
    spatial = []

    subject_prop = "subject"
    if exists(self.provider_data, subject_prop):
        for subject in iterify(getprop(self.provider_data, subject_prop)):
            if "hierarchicalGeographic" not in subject:
                continue
            geo = subject["hierarchicalGeographic"]
            pieces = [geo.get(key)
                      for key in ("city", "county", "state", "country")]
            geo["name"] = ", ".join(filter(None, pieces))
            spatial = [geo]

    place_prop = "originInfo/place"
    if not spatial and exists(self.provider_data, place_prop):
        for place_el in iterify(getprop(self.provider_data, place_prop)):
            if "placeTerm" not in place_el:
                continue
            for term in iterify(place_el["placeTerm"]):
                if "type" in term and term["type"] != "code":
                    spatial.append(term["#text"])

    if spatial:
        self.update_source_resource({"spatial": spatial})
def map_creator_and_contributor(self):
    """Map sourceResource.creator / contributor from ``<root_key>name``.

    Every name entry that yields a non-empty display name (via
    ``self.name_from_name_part``) is recorded together with its type and
    role terms.  Names having a "creator" role become creators, names
    having a "contributor" role become contributors; a name may appear in
    both.  Nothing happens when the property is absent.
    """
    prop = self.root_key + "name"
    if not exists(self.provider_data, prop):
        return

    names = []
    for entry in iterify(getprop(self.provider_data, prop)):
        display = self.name_from_name_part(getprop(entry, "namePart", True))
        if not display:
            continue

        entry_type = getprop(entry, "type", True)
        role_terms = []
        if "role" in entry:
            for role_el in iterify(getprop(entry, "role")):
                term = role_el["roleTerm"]
                # A roleTerm may be a dict wrapping its text.
                if isinstance(term, dict):
                    term = term["#text"]
                role_terms.append(term)

        names.append({"name": display,
                      "type": entry_type,
                      "roles": role_terms})

    result = {}

    # Set creator
    creators = [n["name"] for n in names if "creator" in n["roles"]]
    if creators:
        result["creator"] = creators

    # Set contributor
    contributors = [n["name"] for n in names if "contributor" in n["roles"]]
    if contributors:
        result["contributor"] = contributors

    self.update_source_resource(result)
def map_creator_and_contributor(self):
    """Map sourceResource.creator / contributor from ``<root_key>name``.

    Every name entry that yields a non-empty display name (via
    ``self.name_from_name_part``) is recorded with its type and role terms.
    The creator is the first name having a "creator" role, or simply the
    first name when none has that role; ``creator`` is set to that single
    display name (a string, not a list).  All remaining names become
    contributors.  When no usable name exists, nothing is mapped.
    """
    prop = self.root_key + "name"
    if not exists(self.provider_data, prop):
        return

    creator_and_contributor = {}
    names = []
    for s in iterify(getprop(self.provider_data, prop)):
        name = {}
        name["name"] = self.name_from_name_part(
            getprop(s, "namePart", True))
        if name["name"]:
            name["type"] = getprop(s, "type", True)
            name["roles"] = []
            if "role" in s:
                roles = getprop(s, "role")
                for r in iterify(roles):
                    role = r["roleTerm"]
                    if isinstance(role, dict):
                        role = role["#text"]
                    name["roles"].append(role)
            names.append(name)

    # Guard: with no usable names, ``names[0]`` used to raise IndexError.
    if names:
        # Set creator
        with_creator_role = [n for n in names if "creator" in n["roles"]]
        creator = with_creator_role[0] if with_creator_role else names[0]
        names.remove(creator)
        creator_and_contributor["creator"] = creator["name"]

        # Set contributor
        contributor = [n["name"] for n in names]
        if contributor:
            creator_and_contributor["contributor"] = contributor

    self.update_source_resource(creator_and_contributor)
def creator_transform_uva(d, p):
    """Build a ``{"creator": [...]}`` dict from the name entries at ``p``.

    For every entry under property ``p`` of ``d`` a display string is
    assembled from its ``namePart`` values, joined with ", ".  Entries typed
    "personal" and "corporate" are collected separately without duplicates.
    Returns the personal creators if any, else the corporate creators if
    any, else an empty dict.
    """
    personal = []
    corporate = []

    for entry in iterify(getprop(d, p)):
        # Slots: [plain or family name, given name / terms of address, date]
        parts = [None, None, None]
        for part in iterify(entry.get("namePart")):
            if isinstance(part, basestring):
                parts[0] = part
            elif isinstance(part, dict):
                part_type = part.get("type")
                if part_type == "family":
                    parts[0] = part.get("#text")
                elif part_type in ("given", "termsOfAddress"):
                    parts[1] = part.get("#text")
                elif part_type == "date":
                    parts[2] = part.get("#text")

        full_name = ", ".join(filter(None, parts))

        entry_type = entry.get("type")
        if entry_type == "personal" and full_name not in personal:
            personal.append(full_name)
        elif entry_type == "corporate" and full_name not in corporate:
            corporate.append(full_name)

    if personal:
        return {"creator": personal}
    if corporate:
        return {"creator": corporate}
    return {}
def map_creator(self):
    """<mods:name><mods:namePart> when <mods:role><mods:roleTerm> equals
    Creator

    Each name entry under ``<root_key>name`` that has a ``namePart`` and
    whose role terms include "creator" (compared case-insensitively)
    contributes its namePart text(s) to sourceResource.creator.  Entries
    whose role terms cannot be read are skipped.
    """
    prop = self.root_key + "name"
    _dict = {"creator": []}

    for s in iterify(getprop(self.provider_data, prop, True)):
        name = s.get("namePart")
        if not name:
            continue

        try:
            # Get all the roleTerm values for a given mods:name entity
            role_types = [textnode(r.get("roleTerm"))
                          for r in iterify(s.get("role"))]
        except Exception:
            continue

        # Map the namePart value to creator only when a role term is
        # "creator".  Calling .lower() on each value works for both byte
        # and unicode strings, whereas unicode.lower raised TypeError on
        # byte strings.
        if any(role.lower() == "creator" for role in role_types):
            if isinstance(name, list):
                for n in name:
                    clean_name = textnode(n)
                    if isinstance(clean_name, basestring):
                        _dict["creator"].append(clean_name)
            else:
                _dict["creator"].append(textnode(name))

    self.update_source_resource(self.clean_dict(_dict))
def map_creator(self):
    """<mods:name><mods:namePart> when <mods:role><mods:roleTerm> equals
    Creator

    Each name entry under ``<root_key>name`` that has a ``namePart`` and
    whose role terms include "creator" (compared case-insensitively)
    contributes its namePart text(s) to sourceResource.creator.  Entries
    whose role terms cannot be read are skipped.
    """
    prop = self.root_key + "name"
    _dict = {"creator": []}

    for s in iterify(getprop(self.provider_data, prop, True)):
        name = s.get("namePart")
        if not name:
            continue

        try:
            # Get all the roleTerm values for a given mods:name entity
            role_types = [
                textnode(r.get("roleTerm"))
                for r in iterify(s.get("role"))
            ]
        except Exception:
            continue

        # Map the namePart value to creator only when a role term is
        # "creator".  Calling .lower() on each value works for both byte
        # and unicode strings, whereas unicode.lower raised TypeError on
        # byte strings.
        if any(role.lower() == "creator" for role in role_types):
            if isinstance(name, list):
                for n in name:
                    clean_name = textnode(n)
                    if isinstance(clean_name, basestring):
                        _dict["creator"].append(clean_name)
            else:
                _dict["creator"].append(textnode(name))

    self.update_source_resource(self.clean_dict(_dict))
def map_spatial_and_subject_and_temporal(self):
    """Map spatial, temporal and subject from ``/metadata/mods/subject``.

    Per subject entry:

    * spatial  <- ``cartographics/coordinates`` (raw value) and the text of
      each ``geographic`` value, each wrapped as ``{"name": ...}``;
    * temporal <- the text of each ``temporal`` value;
    * subject  <- the values of topic, genre, occupation and
      titleInfo/title.

    Each non-empty list is written with ``update_source_resource``.
    """
    path = "/metadata/mods/subject"
    subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]

    spatials, temporals, subjects = [], [], []

    if exists(self.provider_data, path):
        for entry in iterify(getprop(self.provider_data, path)):
            has_coordinates = ("cartographics" in entry and
                               "coordinates" in entry["cartographics"])
            if has_coordinates:
                spatials.append(
                    {"name": entry["cartographics"]["coordinates"]})

            if "geographic" in entry:
                for place in iterify(getprop(entry, "geographic")):
                    spatials.append({"name": textnode(place)})

            if "temporal" in entry:
                for period in iterify(getprop(entry, "temporal")):
                    temporals.append(textnode(period))

            for sub_path in subject_props:
                subjects.extend(iterify(getprop(entry, sub_path, True)))

    for field, values in (("spatial", spatials),
                          ("temporal", temporals),
                          ("subject", subjects)):
        if values:
            self.update_source_resource({field: values})
def map_datafield_tags(self):
    """Run the configured mapping functions over every numeric datafield.

    Datafields are read from ``self.provider_data[self.datafield_tag]``.
    When ``self.pymarc`` is set, each entry is treated as a pymarc-style
    dict keyed by its tag and its ``subfields`` are copied to the entry's
    ``subfield`` key; otherwise the tag is read from the entry's ``tag``
    key.  Entries whose tag cannot be converted to an integer are skipped.

    For the remaining entries every ``(matcher, func_tuples)`` pair in
    ``self.mapping_dict`` is consulted and, when the matcher accepts the
    tag, each function tuple is invoked:

    * ``(func, codes)``        -> ``func(_dict, tag, codes)``
    * ``(func, index, codes)`` -> ``func(_dict, tag, index, codes)``

    Side effects: sets ``self.datafield_086_or_087`` to True when a tag
    "086" or "087" is seen.
    """
    for item in iterify(getprop(self.provider_data, self.datafield_tag)):
        for _dict in iterify(item):
            if self.pymarc:
                # This is a pymarc record: the tag is the first key.
                # next(iter(...)) also copes with an empty dict, which
                # ``keys()[0]`` did not.
                tag = next(iter(_dict), None)
                # grab "subfields" as data dict
                if tag is not None and 'subfields' in _dict[tag]:
                    _dict['subfield'] = _dict[tag]['subfields']
            else:
                tag = _dict.get("tag", None)

            # Skip cases where there is no tag or where tag == "ERR".
            # Only conversion failures are swallowed.
            try:
                int(tag)
            except (TypeError, ValueError):
                continue

            if tag == "086" or tag == "087":
                self.datafield_086_or_087 = True

            # Tag 264 is only mapped when its second indicator is "1".
            if tag == "264" and _dict.get("ind2") != "1":
                continue

            for match, func_tuples in self.mapping_dict.items():
                if not match(tag):
                    continue
                for func_tuple in func_tuples:
                    if len(func_tuple) == 2:
                        func, codes = func_tuple
                        func(_dict, tag, codes)
                    elif len(func_tuple) == 3:
                        func, index, codes = func_tuple
                        func(_dict, tag, index, codes)
def first_date(els):
    """Return first date string from originInfo elements

    For each element, in order of preference:

    1. a ``dateCreated`` dict with ``keyDate == 'yes'``
    2. a ``dateIssued`` dict with ``point == 'start'``
    3. the first ``dateOther``
    4. the first ``dateIssued``
    5. the first ``sortDate``

    An element whose chosen node has no text is skipped.  Returns None when
    no element yields a date.
    """
    for el in els:
        # Allow for each of the following elements to be an array of
        # dicts or strings, or one on its own.
        date_created = iterify(el.get('dateCreated', []))
        date_issued = iterify(el.get('dateIssued', []))
        date_other = iterify(el.get('dateOther', []))
        sort_date = iterify(el.get('sortDate', []))
        try:
            # isinstance (rather than an exact type comparison) also
            # accepts dict subclasses.
            for d in date_created:
                if isinstance(d, dict) and d.get('keyDate') == 'yes':
                    return textnode(d)
            for d in date_issued:
                if isinstance(d, dict) and d.get('point') == 'start':
                    return textnode(d)
            # Nothing yet?  Take first dateOther, ignoring attributes:
            if date_other:
                return textnode(date_other[0])
            # OK, then take the first dateIssued we can get, ignoring
            # attribute ...
            if date_issued:
                return textnode(date_issued[0])
            # Still nothing?  Try sortDate:
            if sort_date:
                return textnode(sort_date[0])
        except NoTextNodeError:
            # Weird, but date is not required.
            pass
    return None
def origin_info_transform(d, p):
    """Build a dict with ``date`` and ``publisher`` from originInfo at ``p``.

    For each entry under property ``p`` of ``d``:

    * ``dateCreated`` values flagged ``keyDate == "yes"`` are collected.  If
      the last one already looks like a range (contains "-" or "/") it is
      used as is; with several dates a "first-last" range is built;
      otherwise the single date is used.
    * ``publisher`` is appended to the publisher list.

    Returns a dict containing only the keys for which a value was found.
    """
    val = {}
    date = []
    publisher = []

    for s in iterify(getprop(d, p)):
        # date
        if "dateCreated" in s:
            date_list = iterify(s.get("dateCreated"))
            # Only dict entries carrying text can be key dates; plain
            # strings and entries without "#text" used to crash here.
            key_dates = [t.get("#text") for t in date_list
                         if isinstance(t, dict) and
                         t.get("keyDate") == "yes" and t.get("#text")]
            # Without any key date there is nothing to map (indexing the
            # empty list previously raised IndexError).
            if key_dates:
                # Check if last date is already a range
                if "-" in key_dates[-1] or "/" in key_dates[-1]:
                    date = key_dates[-1]
                elif len(key_dates) > 1:
                    date = "%s-%s" % (key_dates[0], key_dates[-1])
                else:
                    date = key_dates[0]

        # publisher
        if "publisher" in s:
            publisher.append(s.get("publisher"))

    if date:
        val["date"] = date
    if publisher:
        val["publisher"] = publisher

    return val
def map_spatial_and_subject_and_temporal(self):
    """Map spatial, temporal and subject from ``/metadata/mods/subject``.

    Per subject entry:

    * spatial  <- ``cartographics/coordinates`` (raw value) and the text of
      each ``geographic`` value, each wrapped as ``{"name": ...}``;
    * temporal <- the text of each ``temporal`` value;
    * subject  <- the values of topic, genre, occupation and
      titleInfo/title.

    Each non-empty list is written with ``update_source_resource``.
    """
    path = "/metadata/mods/subject"
    subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]

    spatials, temporals, subjects = [], [], []

    if exists(self.provider_data, path):
        for entry in iterify(getprop(self.provider_data, path)):
            has_coordinates = ("cartographics" in entry and
                               "coordinates" in entry["cartographics"])
            if has_coordinates:
                spatials.append(
                    {"name": entry["cartographics"]["coordinates"]})

            if "geographic" in entry:
                for place in iterify(getprop(entry, "geographic")):
                    spatials.append({"name": textnode(place)})

            if "temporal" in entry:
                for period in iterify(getprop(entry, "temporal")):
                    temporals.append(textnode(period))

            for sub_path in subject_props:
                subjects.extend(iterify(getprop(entry, sub_path, True)))

    for field, values in (("spatial", spatials),
                          ("temporal", temporals),
                          ("subject", subjects)):
        if values:
            self.update_source_resource({field: values})
def creator_and_contributor_transform(d, p):
    """Build a dict with ``creator`` / ``contributor`` from names at ``p``.

    A name entry's ``namePart`` goes to ``creator`` when its role terms
    include "creator", otherwise to ``contributor`` when they include
    "contributor".  Entries whose roles cannot be read are logged and
    skipped.  Returns a dict containing only the non-empty lists.
    """
    val = {}
    creator = []
    contributor = []

    for s in iterify(getprop(d, p)):
        name_part = s.get("namePart")
        if not name_part:
            continue

        try:
            role_terms = [r.get("roleTerm") for r in iterify(s.get("role"))]
        except Exception:
            # Was a bare ``except:``, which would also have swallowed
            # KeyboardInterrupt / SystemExit.
            logger.error("Error getting name/role/roleTerm for record %s" %
                         d["_id"])
            continue

        if "creator" in role_terms:
            creator.append(name_part)
        elif "contributor" in role_terms:
            contributor.append(name_part)

    if creator:
        val["creator"] = creator
    if contributor:
        val["contributor"] = contributor

    return val
def first_date(els):
    """Return first date string from originInfo elements

    For each element, in order of preference:

    1. a ``dateCreated`` dict with ``keyDate == 'yes'``
    2. a ``dateIssued`` dict with ``point == 'start'``
    3. the first ``dateOther``
    4. the first ``dateIssued``
    5. the first ``sortDate``

    An element whose chosen node has no text is skipped.  Returns None when
    no element yields a date.
    """
    for el in els:
        # Allow for each of the following elements to be an array of
        # dicts or strings, or one on its own.
        date_created = iterify(el.get('dateCreated', []))
        date_issued = iterify(el.get('dateIssued', []))
        date_other = iterify(el.get('dateOther', []))
        sort_date = iterify(el.get('sortDate', []))
        try:
            # isinstance (rather than an exact type comparison) also
            # accepts dict subclasses.
            for d in date_created:
                if isinstance(d, dict) and d.get('keyDate') == 'yes':
                    return textnode(d)
            for d in date_issued:
                if isinstance(d, dict) and d.get('point') == 'start':
                    return textnode(d)
            # Nothing yet?  Take first dateOther, ignoring attributes:
            if date_other:
                return textnode(date_other[0])
            # OK, then take the first dateIssued we can get, ignoring
            # attribute ...
            if date_issued:
                return textnode(date_issued[0])
            # Still nothing?  Try sortDate:
            if sort_date:
                return textnode(sort_date[0])
        except NoTextNodeError:
            # Weird, but date is not required.
            pass
    return None
def map_creator(self):
    """Map sourceResource.creator from the record's ``name`` entries.

    For every name entry a display string is assembled from its ``namePart``
    values, joined with ", ".  Entries typed "personal" and "corporate" are
    collected separately (without duplicates); personal creators win, and
    corporate creators are only used when no personal creator was found.
    """
    prop = "name"
    if not exists(self.provider_data, prop):
        return

    personal = []
    corporate = []
    for entry in iterify(getprop(self.provider_data, prop)):
        # Slots: [plain or family name, given name / terms of address, date]
        parts = [None, None, None]
        for part in iterify(entry.get("namePart")):
            if isinstance(part, basestring):
                parts[0] = part
            elif isinstance(part, dict):
                part_type = part.get("type")
                if part_type == "family":
                    parts[0] = part.get("#text")
                elif part_type in ("given", "termsOfAddress"):
                    parts[1] = part.get("#text")
                elif part_type == "date":
                    parts[2] = part.get("#text")

        # Empty slots are dropped before joining.
        full_name = ", ".join(filter(None, parts))

        entry_type = entry.get("type")
        if entry_type == "personal" and full_name not in personal:
            personal.append(full_name)
        elif entry_type == "corporate" and full_name not in corporate:
            corporate.append(full_name)

    if personal:
        self.update_source_resource({"creator": personal})
    elif corporate:
        self.update_source_resource({"creator": corporate})
def uva_extract_records(self, content, url):
    """Yield a single ``(error, records)`` pair extracted from ``content``.

    ``content`` is expected to hold one MODS item under ``"mods"`` or
    ``"mods:mods"`` (the ``mods:`` prefix is then applied to the child keys
    that are read).  The item's ``_id`` is taken from an identifier of type
    "uri"; failing that, from a location url whose usage is
    "primary display".  The item is appended to ``records`` each time an
    ``_id`` is assigned.

    ``error`` is None when at least one record was found, otherwise a
    message naming the ``url`` that was requested.
    """
    error = None
    records = []

    # Handle "mods:<key>" in UVA book collection
    key_prefix = ""
    if "mods:mods" in content:
        key_prefix = "mods:"

    if key_prefix + "mods" in content:
        item = content[key_prefix + "mods"]
        for _id_dict in iterify(item[key_prefix + "identifier"]):
            if _id_dict["type"] == "uri":
                item["_id"] = _id_dict["#text"]
                records.append(item)

        if "_id" not in item:
            # Handle location url
            for _loc_dict in iterify(item[key_prefix + "location"]):
                if "url" not in _loc_dict:
                    continue
                # A dedicated loop name keeps the ``url`` parameter intact
                # for the error message below; iterify lets a single url
                # dict be handled like a list of them.
                for loc_url in iterify(_loc_dict["url"]):
                    if (isinstance(loc_url, dict) and
                            loc_url.get("usage") == "primary display"):
                        item["_id"] = loc_url.get("#text")
                        records.append(item)

    if not records:
        error = "Error, no records found in content from URL %s" % url

    yield error, records
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value of ``prop`` is moved when, after cleanup (removal of
    parentheses, periods and question marks, then stripping), it consists
    of exactly one match of one of the date patterns and nothing else.

    Keyword arguments:
    body -- the JSON document, as a string
    ctype -- the type of content
    action -- unused in this function body
    prop -- the prop to take date values from (default None)
    to_prop -- the prop to add them to (default "sourceResource/temporal")

    Returns the (possibly modified) document serialized as JSON, the
    unmodified ``body`` when ``prop`` is not given, or an error message with
    response code 500 when ``body`` is not valid JSON.
    """
    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Ordered from most to least specific; the first pattern that accounts
    # for the whole cleaned value wins.
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*",
    ]
    # Compile once per call instead of once per value and pattern.
    compiled = [re.compile(pattern, re.I) for pattern in REGSEARCH]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = iterify(getprop(data, prop))
        remove = []
        toprop = iterify(getprop(data, to_prop)) if exists(data, to_prop) \
            else []

        for v in iterify(values):
            if not isinstance(v, basestring):
                continue
            c = cleanup(v)
            for regex in compiled:
                m = regex.findall(c)
                # The matched text is removed literally.  It used to be
                # passed to re.sub as a pattern, which misbehaves (or
                # raises re.error) when it contains regex metacharacters.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                # Every value was a date: drop the source prop entirely.
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def map_subject_spatial_and_temporal(self, geographic_subject=True):
    """Map subject, spatial and temporal from ``<root_key>subject``.

    Per subject entry:

    * subject  <- the name (via ``self.name_from_name_part``), the topics
      and, when ``geographic_subject`` is true, the geographic values,
      de-duplicated and joined with "--" into one string;
    * spatial  <- geographic values, each ``hierarchicalGeographic`` dict
      reduced in place to the keys city/county/state/country/coordinates,
      that dict's country, and ``cartographics/coordinates``;
    * temporal <- the entry's ``temporal`` value.

    Empty result lists are dropped before the dict is handed to
    ``self.update_source_resource``.  Nothing happens when the property is
    absent.
    """
    prop = self.root_key + "subject"
    if not exists(self.provider_data, prop):
        return

    allowed_geo_keys = ("city", "county", "state", "country", "coordinates")
    ret_dict = {
        "subject": [],
        "spatial": [],
        "temporal": []
    }

    for s in iterify(getprop(self.provider_data, prop)):
        subject = []

        if "name" in s:
            namepart = getprop(s, "name/namePart", True)
            name = self.name_from_name_part(namepart)
            if name and name not in subject:
                subject.append(name)

        if "topic" in s:
            for t in iterify(s["topic"]):
                if t and t not in subject:
                    subject.append(t)

        if "geographic" in s:
            for g in iterify(s["geographic"]):
                if g:
                    if geographic_subject and g not in subject:
                        subject.append(g)
                    if g not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(g)

        if "hierarchicalGeographic" in s:
            for h in iterify(s["hierarchicalGeographic"]):
                if isinstance(h, dict):
                    # TODO: use set logic and declarative style, as
                    # in MissouriMapper, instead of deleting list
                    # elements
                    # Iterate over a snapshot of the keys: deleting while
                    # iterating a live key view is an error.
                    for k in list(h):
                        if k not in allowed_geo_keys:
                            del h[k]
                    if h not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(h)
                    if "country" in h:
                        ret_dict["spatial"].append(h["country"])

        coords = getprop(s, "cartographics/coordinates", True)
        if coords and coords not in ret_dict["spatial"]:
            ret_dict["spatial"].append(coords)

        if "temporal" in s:
            ret_dict["temporal"].append(s["temporal"])

        # An entry without name/topic/geographic parts used to contribute
        # an empty string to the subject list.
        if subject:
            ret_dict["subject"].append("--".join(subject))

    for k in list(ret_dict):
        if not ret_dict[k]:
            del ret_dict[k]

    self.update_source_resource(ret_dict)
def fetch_all_data(self, set):
    """A generator to yield batches of records fetched, and any errors
       encountered in the process, via the self.response dictionary.

       The endpoint is requested repeatedly until a request or XML
       extraction error occurs or self.request_records() reports that no
       more requests are needed.  self.response is yielded (and then reset)
       whenever it holds at least self.batch_size records; pending retries
       are processed afterwards, and a final batch is yielded after
       self.add_collection_records_to_response().

       The ``set`` argument is not used in this function body.
    """
    request_more = True
    while request_more:

        error, content = self.request_content_from(
            self.endpoint_url, self.endpoint_url_params
            )
        print "Requesting %s?%s" % (self.endpoint_url,
                urlencode(self.endpoint_url_params, True))

        if error is not None:
            # Stop requesting from this set
            request_more = False
            self.response["errors"].append(error)
            break

        error, content = self.extract_xml_content(content,
                                                  self.endpoint_url)
        if error is not None:
            request_more = False
            self.response["errors"].extend(iterify(error))
        else:
            # request_more is rebound by every tuple produced here, so the
            # last tuple decides whether the while loop continues.
            for error, records, request_more in \
                    self.request_records(content):
                if error is not None:
                    self.response["errors"].extend(iterify(error))

                self.add_provider_to_item_records(records)
                self.add_collection_to_item_records(records)
                self.response["records"].extend(records)

                # Flush a full batch to the consumer.
                if len(self.response["records"]) >= self.batch_size:
                    yield self.response
                    self.reset_response()

    # Retry fetches, if any
    if self.retry:
        print >> sys.stderr, "Retrying %s fetches..." % \
                             len(self.retry)
        for error, records in self.retry_fetches():
            self.response["errors"].extend(error)
            self.response["records"].extend(records)

            if len(self.response["records"]) >= self.batch_size:
                yield self.response
                self.reset_response()

        # NOTE(review): self.response is not reset after this yield, so
        # the same contents are still present at the final yield below --
        # confirm that consumers tolerate seeing them twice.
        if self.response["errors"] or self.response["records"]:
            yield self.response

    # Last yield
    self.add_collection_records_to_response()
    if self.response["errors"] or self.response["records"]:
        yield self.response
        self.reset_response()
def map_object(self):
    """Map ``object`` from the preview url under /metadata/mods/location.

    Every location url whose ``access`` attribute equals "preview"
    (case-insensitively) has its text written to ``self.mapped_data``; when
    several qualify, the last one wins.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for candidate in iterify(getprop(location, "url")):
            is_preview = (exists(candidate, "access") and
                          candidate["access"].lower() == "preview")
            if is_preview:
                self.mapped_data.update({"object": textnode(candidate)})
def map_object(self):
    """Map ``object`` from the preview url under /metadata/mods/location.

    Every location url whose ``access`` attribute equals "preview"
    (case-insensitively) has its text written to ``self.mapped_data``; when
    several qualify, the last one wins.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for candidate in iterify(getprop(location, "url")):
            is_preview = (exists(candidate, "access") and
                          candidate["access"].lower() == "preview")
            if is_preview:
                self.mapped_data.update({"object": textnode(candidate)})
def map_spatial(self):
    """Map sourceResource.spatial

    See map_subject().

    The JSON is formatted as follows:
    {
        "subject": [
            {
                "hierarchicalGeographic": {
                    "country": "United States",
                    "state": "MO",
                    "continent": "North America",
                    "city": "St. Louis"
                },
                "cartographics": {
                    "coordinates": "38.6277,-90.1995"
                }
            }
        ]
    }

    Each hierarchicalGeographic dict is reduced to the keys city, county,
    state, country and coordinates; coordinate strings from cartographics
    are added as list elements of their own.  Duplicates are not repeated.
    """
    def coordinates(el):
        """Return the coordinates string from the given dict, or ''"""
        try:
            return textnode(el.get('coordinates'))
        except Exception:
            # Best effort: a missing or malformed node simply yields no
            # coordinates.  (Was a bare ``except:``, which would also have
            # swallowed KeyboardInterrupt / SystemExit.)
            return ''

    prop = self.root_key + 'subject'
    subjects = None
    if exists(self.provider_data, prop):
        subjects = iterify(getprop(self.provider_data, prop))

    if subjects:
        ok_spatial = set(
            ['city', 'county', 'state', 'country', 'coordinates'])
        spatial = []
        # This logic of append() calls comes from oai_mods_mapper.
        # The assignment of coordinate strings as their own list elements
        # is by design.
        # For the treatment of 'continent', see
        # https://github.com/dpla/ingestion/pull/37
        for s in subjects:
            for hg in iterify(s.get('hierarchicalGeographic', [])):
                keys = set.intersection(ok_spatial, hg.keys())
                clean_hg = dict((k, hg[k]) for k in keys)
                # 'continent' is not allowed in MAP v3.1 but it can be used
                # for spatial.name.
                if 'continent' in hg and len(hg) == 1:
                    clean_hg['name'] = hg['continent']
                if clean_hg and clean_hg not in spatial:
                    spatial.append(clean_hg)
            for carto in iterify(s.get('cartographics', [])):
                c = coordinates(carto)
                if c and c not in spatial:
                    spatial.append(c)
        if spatial:
            self.update_source_resource({'spatial': spatial})
def map_subject_spatial_and_temporal(self, geographic_subject=True):
    """Map subject, spatial and temporal from ``<root_key>subject``.

    Per subject entry:

    * subject  <- the name (via ``self.name_from_name_part``), the topics
      and, when ``geographic_subject`` is true, the geographic values,
      de-duplicated and joined with "--" into one string;
    * spatial  <- geographic values, each ``hierarchicalGeographic`` dict
      reduced in place to the keys city/county/state/country/coordinates,
      that dict's country, and ``cartographics/coordinates``;
    * temporal <- the entry's ``temporal`` value.

    Empty result lists are dropped before the dict is handed to
    ``self.update_source_resource``.  Nothing happens when the property is
    absent.
    """
    prop = self.root_key + "subject"
    if not exists(self.provider_data, prop):
        return

    allowed_geo_keys = ("city", "county", "state", "country", "coordinates")
    ret_dict = {"subject": [], "spatial": [], "temporal": []}

    for s in iterify(getprop(self.provider_data, prop)):
        subject = []

        if "name" in s:
            namepart = getprop(s, "name/namePart", True)
            name = self.name_from_name_part(namepart)
            if name and name not in subject:
                subject.append(name)

        if "topic" in s:
            for t in iterify(s["topic"]):
                if t and t not in subject:
                    subject.append(t)

        if "geographic" in s:
            for g in iterify(s["geographic"]):
                if g:
                    if geographic_subject and g not in subject:
                        subject.append(g)
                    if g not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(g)

        if "hierarchicalGeographic" in s:
            for h in iterify(s["hierarchicalGeographic"]):
                if isinstance(h, dict):
                    # TODO: use set logic and declarative style, as
                    # in MissouriMapper, instead of deleting list
                    # elements
                    # Iterate over a snapshot of the keys: deleting while
                    # iterating a live key view is an error.
                    for k in list(h):
                        if k not in allowed_geo_keys:
                            del h[k]
                    if h not in ret_dict["spatial"]:
                        ret_dict["spatial"].append(h)
                    if "country" in h:
                        ret_dict["spatial"].append(h["country"])

        coords = getprop(s, "cartographics/coordinates", True)
        if coords and coords not in ret_dict["spatial"]:
            ret_dict["spatial"].append(coords)

        if "temporal" in s:
            ret_dict["temporal"].append(s["temporal"])

        # An entry without name/topic/geographic parts used to contribute
        # an empty string to the subject list.
        if subject:
            ret_dict["subject"].append("--".join(subject))

    for k in list(ret_dict):
        if not ret_dict[k]:
            del ret_dict[k]

    self.update_source_resource(ret_dict)
def map_date_and_publisher(self):
    """Map sourceResource.date and publisher from ``<root_key>originInfo``.

    From the ``dateCreated`` dicts of each originInfo entry three
    candidates are derived:

    * date       -- keyDate "yes" and no ``point`` attribute
    * early_date -- keyDate "yes" and point "start"
    * late_date  -- point "end"

    Only the first value of each candidate is kept, with any time component
    (everything from the first "T") removed.  The mapped date is ``date``
    if present, else "early-late" when both exist, else ``early_date``.
    Every ``publisher`` value is collected as well.  The cleaned result is
    passed to ``self.update_source_resource``.
    """
    prop = self.root_key + "originInfo"
    mapped_props = {
        "date": "",
        "publisher": []
    }

    if exists(self.provider_data, prop):
        dates = {
            "date": None,
            "early_date": None,
            "late_date": None
        }
        for s in iterify(getprop(self.provider_data, prop)):
            if "dateCreated" in s:
                date_list = iterify(s.get("dateCreated"))

                # This is an uber kludge
                try:
                    dates['date'] = [
                        t.get("#text") for t in date_list
                        if isinstance(t, dict) and
                        t.get("keyDate") == "yes" and "point" not in t]
                    dates['early_date'] = [
                        t.get("#text") for t in date_list
                        if isinstance(t, dict) and
                        t.get("keyDate") == "yes" and
                        t.get("point") == "start"]
                    dates['late_date'] = [
                        t.get("#text") for t in date_list
                        if isinstance(t, dict) and
                        t.get("point") == "end"]
                except Exception as e:
                    logger.error(
                        "Unable to map date data:\n\t %s" % date_list)
                    logger.error(e)

            if "publisher" in s:
                mapped_props["publisher"].append(s.get("publisher"))

        # Remove Time component from date
        for k in list(dates):
            if dates.get(k):
                first = dates[k][0]
                # A dateCreated dict without "#text" yields None here; the
                # substring test used to raise TypeError in that case.
                if first and 'T' in first:
                    first = first[:first.index('T')]
                dates[k] = first

        if dates.get("date"):
            mapped_props["date"] = dates.get("date")
        elif dates.get("early_date") and dates.get("late_date"):
            mapped_props["date"] = dates.get("early_date") + "-" + \
                dates.get("late_date")
        elif dates.get("early_date"):
            mapped_props["date"] = dates.get("early_date")

    self.update_source_resource(self.clean_dict(mapped_props))
def fetch_all_data(self, set):
    """A generator to yield batches of records fetched, and any errors
       encountered in the process, via the self.response dictionary.

       The endpoint is requested repeatedly until a request or XML
       extraction error occurs or self.request_records() reports that no
       more requests are needed.  self.response is yielded (and then reset)
       whenever it holds at least self.batch_size records; pending retries
       are processed afterwards, and a final batch is yielded after
       self.add_collection_records_to_response().

       The ``set`` argument is not used in this function body.
    """
    request_more = True
    while request_more:

        error, content = self.request_content_from(
            self.endpoint_url, self.endpoint_url_params)
        print "Requesting %s?%s" % (
            self.endpoint_url, urlencode(self.endpoint_url_params, True))

        if error is not None:
            # Stop requesting from this set
            request_more = False
            self.response["errors"].append(error)
            break

        error, content = self.extract_xml_content(content,
                                                  self.endpoint_url)
        if error is not None:
            request_more = False
            self.response["errors"].extend(iterify(error))
        else:
            # request_more is rebound by every tuple produced here, so the
            # last tuple decides whether the while loop continues.
            for error, records, request_more in \
                    self.request_records(content):
                if error is not None:
                    self.response["errors"].extend(iterify(error))

                self.add_provider_to_item_records(records)
                self.add_collection_to_item_records(records)
                self.response["records"].extend(records)

                # Flush a full batch to the consumer.
                if len(self.response["records"]) >= self.batch_size:
                    yield self.response
                    self.reset_response()

    # Retry fetches, if any
    if self.retry:
        print >> sys.stderr, "Retrying %s fetches..." % \
            len(self.retry)
        for error, records in self.retry_fetches():
            self.response["errors"].extend(error)
            self.response["records"].extend(records)

            if len(self.response["records"]) >= self.batch_size:
                yield self.response
                self.reset_response()

        # NOTE(review): self.response is not reset after this yield, so
        # the same contents are still present at the final yield below --
        # confirm that consumers tolerate seeing them twice.
        if self.response["errors"] or self.response["records"]:
            yield self.response

    # Last yield
    self.add_collection_records_to_response()
    if self.response["errors"] or self.response["records"]:
        yield self.response
        self.reset_response()
def map_spatial(self):
    """Map sourceResource.spatial

    See map_subject().

    The JSON is formatted as follows:
    {
        "subject": [
            {
                "hierarchicalGeographic": {
                    "country": "United States",
                    "state": "MO",
                    "continent": "North America",
                    "city": "St. Louis"
                },
                "cartographics": {
                    "coordinates": "38.6277,-90.1995"
                }
            }
        ]
    }

    Each hierarchicalGeographic dict is reduced to the keys city, county,
    state, country and coordinates; coordinate strings from cartographics
    are added as list elements of their own.  Duplicates are not repeated.
    """
    def coordinates(el):
        """Return the coordinates string from the given dict, or ''"""
        try:
            return textnode(el.get('coordinates'))
        except Exception:
            # Best effort: a missing or malformed node simply yields no
            # coordinates.  (Was a bare ``except:``, which would also have
            # swallowed KeyboardInterrupt / SystemExit.)
            return ''

    prop = self.root_key + 'subject'
    subjects = None
    if exists(self.provider_data, prop):
        subjects = iterify(getprop(self.provider_data, prop))

    if subjects:
        ok_spatial = set(['city', 'county', 'state', 'country',
                          'coordinates'])
        spatial = []
        # This logic of append() calls comes from oai_mods_mapper.
        # The assignment of coordinate strings as their own list elements
        # is by design.
        # For the treatment of 'continent', see
        # https://github.com/dpla/ingestion/pull/37
        for s in subjects:
            for hg in iterify(s.get('hierarchicalGeographic', [])):
                keys = set.intersection(ok_spatial, hg.keys())
                clean_hg = dict((k, hg[k]) for k in keys)
                # 'continent' is not allowed in MAP v3.1 but it can be used
                # for spatial.name.
                if 'continent' in hg and len(hg) == 1:
                    clean_hg['name'] = hg['continent']
                if clean_hg and clean_hg not in spatial:
                    spatial.append(clean_hg)
            for carto in iterify(s.get('cartographics', [])):
                c = coordinates(carto)
                if c and c not in spatial:
                    spatial.append(c)
        if spatial:
            self.update_source_resource({'spatial': spatial})
def map_extent(self):
    """Map sourceResource.extent from ``physicalDescription`` entries.

    The distinct ``extent`` values of all physical descriptions are
    gathered in a set, so the order of the resulting list is not defined.
    Nothing is written when no extent is found.
    """
    extents = set()

    descriptions = iterify(
        getprop(self.provider_data, "physicalDescription", True))
    for description in descriptions:
        if not exists(description, "extent"):
            continue
        extents.update(iterify(getprop(description, "extent", True)))

    if extents:
        self.update_source_resource({"extent": list(extents)})
def map_date(self):
    """<mods:originInfo><mods:dateCreated>

    Collects the text of every dateCreated value of every originInfo entry
    under ``<root_key>originInfo`` and maps the list to sourceResource.date
    when it is non-empty.
    """
    prop = self.root_key + "originInfo"
    origin_infos = iterify(getprop(self.provider_data, prop, True))

    dates = [textnode(created)
             for info in origin_infos
             for created in iterify(getprop(info, "dateCreated", True))]

    if dates:
        self.update_source_resource({"date": dates})
def map_spatial(self):
    """<mods:subject><mods:geographic>

    Collects the text of every geographic value of every subject entry
    under ``<root_key>subject`` and maps the list to sourceResource.spatial
    when it is non-empty.
    """
    prop = self.root_key + "subject"
    subject_entries = iterify(getprop(self.provider_data, prop, True))

    geo = [textnode(place)
           for entry in subject_entries
           for place in iterify(getprop(entry, "geographic", True))]

    if geo:
        self.update_source_resource({"spatial": geo})
def map_date(self):
    """<mods:originInfo><mods:dateCreated>

    Collects the text of every dateCreated value of every originInfo entry
    under ``<root_key>originInfo`` and maps the list to sourceResource.date
    when it is non-empty.
    """
    prop = self.root_key + "originInfo"
    origin_infos = iterify(getprop(self.provider_data, prop, True))

    dates = [textnode(created)
             for info in origin_infos
             for created in iterify(getprop(info, "dateCreated", True))]

    if dates:
        self.update_source_resource({"date": dates})
def map_is_shown_at(self):
    """Map isShownAt from /metadata/mods/location/url.

    A url qualifies when it has both "usage" and "access" attributes, the
    usage starts with "primary" and the access is "object in context"
    (both compared case-insensitively).  If several urls qualify, the last
    one wins because each match overwrites the previous value.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for link in iterify(getprop(location, "url")):
            if not (exists(link, "usage") and exists(link, "access")):
                continue
            if not link["usage"].lower().startswith("primary"):
                continue
            if link["access"].lower() == "object in context":
                self.mapped_data.update({"isShownAt": textnode(link)})
def map_spatial(self):
    """<mods:subject><mods:geographic>

    Gathers the text of every geographic element under every subject and
    stores the list as sourceResource.spatial when it is non-empty.
    """
    subject_list = iterify(
        getprop(self.provider_data, self.root_key + "subject", True))
    places = [
        textnode(node)
        for subject in subject_list
        for node in iterify(getprop(subject, "geographic", True))
    ]
    if places:
        self.update_source_resource({"spatial": places})
def map_data_provider(self):
    """Map dataProvider from campus_name and repository_name.

    The value is "<campus>, <repository>" when both are present, or just
    the repository when there is no campus.  Only the first entry of each
    is used.  Nothing is mapped without a repository name.
    """
    campuses = iterify(getprop(self.provider_data, "campus_name", True))
    repositories = iterify(
        getprop(self.provider_data, "repository_name", True))

    if not repositories:
        return

    if campuses:
        provider = "%s, %s" % (campuses[0], repositories[0])
    else:
        provider = repositories[0]

    if provider:
        self.mapped_data.update({"dataProvider": provider})
def copyprop(body, ctype, prop=None, to_prop=None, skip_if_exists=None):
    """Copies value in one prop to another prop.

    For use with string and/or list prop value types. If to_prop exists, its
    value is iterified then extended with the iterified value of prop. If the
    to_prop parent prop (ie hasView in hasView/rights) does not exist, the
    from_prop value is not copied and an error is logged.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    skip_if_exists -- set to True to not copy if to_prop exists

    Returns the (possibly updated) document serialized as JSON.  When either
    value is not a string/list the original body is returned unchanged, and
    when the body is not valid JSON a plain-text error message is returned
    with response code 500.
    """

    def is_string_or_list(value):
        return isinstance(value, (basestring, list))

    def log_bad_type(name):
        # data.get() keeps a record without "id" from masking the real
        # problem with a KeyError raised while building the message.
        logger.error("Prop %s " % name +
                     "is not a string/list for record %s" % data.get("id"))

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and skip_if_exists:
        # Caller asked us to leave an existing target untouched.
        return json.dumps(data)

    if exists(data, prop):
        if exists(data, to_prop):
            from_value = getprop(data, prop)
            if not is_string_or_list(from_value):
                log_bad_type(prop)
                return body

            to_value = getprop(data, to_prop)
            if not is_string_or_list(to_value):
                log_bad_type(to_prop)
                return body

            merged = iterify(to_value)
            merged.extend(iterify(from_value))
            setprop(data, to_prop, merged)
        else:
            try:
                setprop(data, to_prop, getprop(data, prop))
            except Exception as e:
                logger.error("Could not copy %s to %s: %s" %
                             (prop, to_prop, e))

    # Every non-error path must hand the document back to the caller.
    return json.dumps(data)
def map_data_provider(self):
    """Map dataProvider from campus_name and repository_name.

    The value is "<campus>, <repository>" when both are present, or just
    the repository when there is no campus.  Only the first entry of each
    is used.  Nothing is mapped without a repository name.
    """
    campuses = iterify(getprop(self.provider_data, "campus_name", True))
    repositories = iterify(
        getprop(self.provider_data, "repository_name", True))

    if not repositories:
        return

    if campuses:
        provider = "%s, %s" % (campuses[0], repositories[0])
    else:
        provider = repositories[0]

    if provider:
        self.mapped_data.update({"dataProvider": provider})
def map_publisher(self):
    """<mods:originInfo><mods:publisher>

    Gathers the text of every publisher under every originInfo element and
    stores the list as sourceResource.publisher when it is non-empty.
    """
    origin_infos = iterify(
        getprop(self.provider_data, self.root_key + "originInfo", True))
    names = [
        textnode(node)
        for info in origin_infos
        for node in iterify(getprop(info, "publisher", True))
    ]
    if names:
        self.update_source_resource({"publisher": names})
def map_language(self):
    """Map sourceResource.language from language/languageTerm.

    Each languageTerm is passed through self.txt(); non-empty results are
    de-duplicated and stored as a list (order is that of the backing set).
    """
    found = set()
    for entry in iterify(getprop(self.provider_data, "language", True)):
        for term in iterify(getprop(entry, "languageTerm", True)):
            text = self.txt(term)
            if not text:
                continue
            found.add(text)

    if found:
        self.update_source_resource({"language": list(found)})
def map_title(self):
    """<mods:titleInfo><mods:title>

    Gathers the text of every title under every titleInfo element and
    stores the list as sourceResource.title when it is non-empty.
    """
    title_infos = iterify(
        getprop(self.provider_data, self.root_key + "titleInfo", True))
    found = [
        textnode(node)
        for info in title_infos
        for node in iterify(getprop(info, "title", True))
    ]
    if found:
        self.update_source_resource({"title": found})
def map_publisher(self):
    """<mods:originInfo><mods:publisher>

    Gathers the text of every publisher under every originInfo element and
    stores the list as sourceResource.publisher when it is non-empty.
    """
    origin_infos = iterify(
        getprop(self.provider_data, self.root_key + "originInfo", True))
    names = [
        textnode(node)
        for info in origin_infos
        for node in iterify(getprop(info, "publisher", True))
    ]
    if names:
        self.update_source_resource({"publisher": names})
def map_subject(self):
    """<mods:subject><mods:topic>

    Gathers the text of every topic under every subject element and stores
    the list as sourceResource.subject when it is non-empty.
    """
    subject_list = iterify(
        getprop(self.provider_data, self.root_key + "subject", True))
    topics = [
        textnode(node)
        for subject in subject_list
        for node in iterify(getprop(subject, "topic", True))
    ]
    if topics:
        self.update_source_resource({"subject": topics})
def map_title(self):
    """<mods:titleInfo><mods:title>

    Gathers the text of every title under every titleInfo element and
    stores the list as sourceResource.title when it is non-empty.
    """
    title_infos = iterify(
        getprop(self.provider_data, self.root_key + "titleInfo", True))
    found = [
        textnode(node)
        for info in title_infos
        for node in iterify(getprop(info, "title", True))
    ]
    if found:
        self.update_source_resource({"title": found})
def map_subject(self):
    """<mods:subject><mods:topic>

    Gathers the text of every topic under every subject element and stores
    the list as sourceResource.subject when it is non-empty.
    """
    subject_list = iterify(
        getprop(self.provider_data, self.root_key + "subject", True))
    topics = [
        textnode(node)
        for subject in subject_list
        for node in iterify(getprop(subject, "topic", True))
    ]
    if topics:
        self.update_source_resource({"subject": topics})
def map_is_shown_at(self):
    """Map isShownAt from /metadata/mods/location/url.

    A url qualifies when it has both "usage" and "access" attributes, the
    usage starts with "primary" and the access is "object in context"
    (both compared case-insensitively).  If several urls qualify, the last
    one wins because each match overwrites the previous value.
    """
    path = "/metadata/mods/location"
    if not exists(self.provider_data, path):
        return

    for location in iterify(getprop(self.provider_data, path)):
        if not exists(location, "url"):
            continue
        for link in iterify(getprop(location, "url")):
            if not (exists(link, "usage") and exists(link, "access")):
                continue
            if not link["usage"].lower().startswith("primary"):
                continue
            if link["access"].lower() == "object in context":
                self.mapped_data.update({"isShownAt": textnode(link)})
def map_data_provider(self):
    """Map dataProvider for a Harvard record.

    For the set specs "ihp", "ward", "rubbings", "dag" and "cna" the value
    is taken from the first available of:

      1. <mods:relatedItem type="host">
             <location><physicalLocation type="repository">
      2. <mods:relatedItem type="constituent">
             <location><physicalLocation type="repository">
      3. <mods:location><physicalLocation type="repository">

    Only the text before the first ";" is used.  A set spec listed in
    self.set_to_data_provider always overrides the derived value.
    ", Harvard University" is appended when the value does not already end
    with it.  Nothing is mapped when no value is found.
    """

    def repository_name(container):
        """Repository text under container's physicalLocation, or None."""
        if not container:
            return None
        phyloc = getprop(container, "physicalLocation", True)
        if phyloc and is_repository(phyloc):
            return getprop(phyloc, "#text").split(";")[0]
        return None

    dp = None
    set_spec = getprop(self.provider_data, "header/setSpec", True)
    location = getprop(self.provider_data, self.root_key + "location",
                       True)
    rel_repo = getprop(self.provider_data, self.root_key + "relatedItem",
                       True)

    # Conditional mapping for dataProvider
    if set_spec in ["ihp", "ward", "rubbings", "dag", "cna"]:
        host_dp = None
        constituent_dp = None
        for repo in iterify(rel_repo):
            if not isinstance(repo, dict):
                continue
            rel_type = repo.get("type")
            # The original compared against "cconstituent", which can never
            # match the documented type="constituent".
            if rel_type not in ("host", "constituent"):
                continue
            found = repository_name(getprop(repo, "location", True))
            if found is None:
                continue
            if rel_type == "host":
                host_dp = found
            else:
                constituent_dp = found

        # 'host' takes precedence; 'constituent' is only a fallback, no
        # matter in which order the relatedItem elements appear.
        dp = host_dp if host_dp is not None else constituent_dp

        # If neither of the above is present use the record's own location.
        # NOTE(review): this fallback is treated as part of the conditional
        # mapping for the listed set specs -- confirm against the spec.
        if dp is None and location is not None:
            for loc in iterify(location):
                found = repository_name(loc)
                if found is not None:
                    dp = found

    if set_spec in self.set_to_data_provider:
        dp = self.set_to_data_provider[set_spec]

    if dp is not None and not dp.endswith(", Harvard University"):
        dp = dp + ", Harvard University"

    if dp is not None:
        self.mapped_data.update({"dataProvider": dp})
def map_object(self):
    """<mods:location><mods:url> @access=preview

    Uses the first url whose "access" attribute equals "preview" as the
    record's object.  The text of every matching url is extracted, but only
    the first is stored.
    """
    locations = iterify(
        getprop(self.provider_data, self.root_key + "location", True))
    previews = []
    for location in locations:
        for link in iterify(getprop(location, "url", True)):
            if getprop(link, "access", True) == "preview":
                previews.append(textnode(link))

    if previews:
        self.mapped_data.update({"object": previews[0]})
def map_object(self):
    """<mods:location><mods:url> @access=preview

    Uses the first url whose "access" attribute equals "preview" as the
    record's object.  The text of every matching url is extracted, but only
    the first is stored.
    """
    locations = iterify(
        getprop(self.provider_data, self.root_key + "location", True))
    previews = []
    for location in locations:
        for link in iterify(getprop(location, "url", True)):
            if getprop(link, "access", True) == "preview":
                previews.append(textnode(link))

    if previews:
        self.mapped_data.update({"object": previews[0]})
def map_data_provider(self):
    """Map dataProvider for a Harvard record.

    For the set specs "ihp", "ward", "rubbings", "dag" and "cna" the value
    is taken from the first available of:

      1. <mods:relatedItem type="host">
             <location><physicalLocation type="repository">
      2. <mods:relatedItem type="constituent">
             <location><physicalLocation type="repository">
      3. <mods:location><physicalLocation type="repository">

    Only the text before the first ";" is used.  A set spec listed in
    self.set_to_data_provider always overrides the derived value.
    ", Harvard University" is appended when the value does not already end
    with it.  Nothing is mapped when no value is found.
    """

    def repository_name(container):
        """Repository text under container's physicalLocation, or None."""
        if not container:
            return None
        phyloc = getprop(container, "physicalLocation", True)
        if phyloc and is_repository(phyloc):
            return getprop(phyloc, "#text").split(";")[0]
        return None

    dp = None
    set_spec = getprop(self.provider_data, "header/setSpec", True)
    location = getprop(self.provider_data, self.root_key + "location",
                       True)
    rel_repo = getprop(self.provider_data, self.root_key + "relatedItem",
                       True)

    # Conditional mapping for dataProvider
    if set_spec in ["ihp", "ward", "rubbings", "dag", "cna"]:
        host_dp = None
        constituent_dp = None
        for repo in iterify(rel_repo):
            if not isinstance(repo, dict):
                continue
            rel_type = repo.get("type")
            # The original compared against "cconstituent", which can never
            # match the documented type="constituent".
            if rel_type not in ("host", "constituent"):
                continue
            found = repository_name(getprop(repo, "location", True))
            if found is None:
                continue
            if rel_type == "host":
                host_dp = found
            else:
                constituent_dp = found

        # 'host' takes precedence; 'constituent' is only a fallback, no
        # matter in which order the relatedItem elements appear.
        dp = host_dp if host_dp is not None else constituent_dp

        # If neither of the above is present use the record's own location.
        # NOTE(review): this fallback is treated as part of the conditional
        # mapping for the listed set specs -- confirm against the spec.
        if dp is None and location is not None:
            for loc in iterify(location):
                found = repository_name(loc)
                if found is not None:
                    dp = found

    if set_spec in self.set_to_data_provider:
        dp = self.set_to_data_provider[set_spec]

    if dp is not None and not dp.endswith(", Harvard University"):
        dp = dp + ", Harvard University"

    if dp is not None:
        self.mapped_data.update({"dataProvider": dp})
def harvard_enrich_location(body, ctype, action="harvard_enrich_location",
                            prop="sourceResource/spatial"):
    """
    Service that accepts a Harvard JSON document and enriches the "spatial"
    field by translating any MARC country codes contained within the
    originalDocument place element into their names, for better geocoding
    accuracy.

    Keyword arguments:
    body -- the JSON document to enrich
    ctype -- the type of content
    action -- the name of this service
    prop -- the prop that receives the enriched spatial value

    The spatial name is built from every place term that has no "authority"
    attribute, followed by the country name of every "marccountry" term.
    Square brackets are removed from the name.  "country" is only set when
    the (last) country code has two characters or starts with "xx".

    Returns the document serialized as JSON, or a plain-text error message
    (with response code 500) when the body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    place_path = "originalRecord/metadata/mods/originInfo/place"
    if exists(data, place_path):
        places = getprop(data, place_path)
        country = ""
        countryCode = ""
        name = ""

        # Add non-country terms.  A place may carry one placeTerm or a list
        # of them, so each is iterified; terms without usable text are
        # skipped instead of being concatenated as None.
        for place in iterify(places):
            logger.info("place: %s" % place)
            for term in iterify(getprop(place, "placeTerm", True)):
                if isinstance(term, basestring):
                    name += " " + term
                elif isinstance(term, dict) and \
                        not exists(term, "authority"):
                    text = getprop(term, "#text", True)
                    if isinstance(text, basestring):
                        name += " " + text

        # Add country
        for place in iterify(places):
            for term in iterify(getprop(place, "placeTerm", True)):
                # Only dict terms can carry an authority attribute.
                if not isinstance(term, dict):
                    continue
                if getprop(term, "authority", True) != "marccountry":
                    continue
                code = getprop(term, "#text", True)
                if not isinstance(code, basestring):
                    continue
                countryCode = code
                country = get_country_from_marccode(countryCode)
                if country:
                    name += ", " + country

        spatial = {"name": re.sub(r"[\[\]]", "", name.strip(", "))}
        if country and (len(countryCode) == 2 or
                        countryCode.startswith("xx")):
            spatial["country"] = country

        setprop(data, prop, [spatial])

    return json.dumps(data)
def map_format_and_spec_type(self):
    """Map sourceResource.format and specType from physicalDescription/form.

    A form whose lower-cased value is "books" or "government records" is
    recorded, capitalized, as a specType; every other form is recorded
    verbatim as a format.  Both lists are free of duplicates.
    """
    spec_type_forms = ("books", "government records")
    path = self.root_key + "physicalDescription"
    formats = []
    spec_types = []

    if exists(self.provider_data, path):
        for description in iterify(getprop(self.provider_data, path)):
            if "form" not in description:
                continue
            for form in iterify(description.get("form")):
                if form.lower() in spec_type_forms:
                    # Decide the category first and de-duplicate second, so
                    # a repeated spec-type form cannot leak into "format".
                    label = form.capitalize()
                    if label not in spec_types:
                        spec_types.append(label)
                elif form not in formats:
                    formats.append(form)

    self.update_source_resource(
        self.clean_dict({"format": formats, "specType": spec_types}))
def map_format_and_spec_type(self):
    """Map sourceResource.format from physicalDescription/form.

    Only forms whose "authority" attribute is "marcform" contribute; their
    "#text" values are collected in document order, duplicates included.
    """
    path = self.root_key + "physicalDescription"
    formats = []

    if exists(self.provider_data, path):
        for description in iterify(getprop(self.provider_data, path)):
            if "form" not in description:
                continue
            for form in iterify(description.get("form")):
                if form.get("authority") != "marcform":
                    continue
                formats.append(form["#text"])

    self.update_source_resource(self.clean_dict({"format": formats}))
def map_type(self):
    """Map sourceResource.type

    In feed XML:
        //record/metadata/mods/typeOfResource
    In body of mapper request JSON:
        {
            "typeOfResource": {
                "#text": "the type",
                "xmlns:default": "http://www.loc.gov/mods/v3"
            }
        }

    Elements without a text node are skipped.  When typeOfResource is
    present the type is updated even if every element was skipped.
    """
    # missing from MDH and WUSTL collections
    resources = iterify(self.provider_data.get('typeOfResource', []))
    if not resources:
        return

    types = []
    for resource in resources:
        try:
            types.append(textnode(resource))
        except NoTextNodeError:
            continue

    self.update_source_resource({'type': types})
def map_has_view(self):
    """Map hasView

    See location_url() and map_is_shown_at().

    Use //record/metadata/mods/location/url[@access='object in context']
    for hasView.@id.
    Use //record/metadata/mods/physicalDescription/internetMediaType
    for hasView.format.

    hasView is optional: KeyError, IndexError and TypeError raised while
    mapping are swallowed.
    """

    def first_media_type(phys_descs):
        """First internetMediaType string from physicalDescription list"""
        for description in phys_descs:
            media = description.get('internetMediaType')
            if not media:
                continue
            try:
                return textnode(media)
            except NoTextNodeError:
                continue
        return None

    try:
        if 'hasView' not in self.mapped_data:
            self.mapped_data['hasView'] = {}
        view = self.mapped_data['hasView']
        view.update(
            {'@id': self.location_url('access', 'object in context')})

        descriptions = iterify(
            self.provider_data.get('physicalDescription', []))
        if descriptions:
            found = first_media_type(descriptions)
            if found:
                self.mapped_data['hasView'].update({'format': found})
    except (KeyError, IndexError, TypeError):
        # Not required
        pass
def map_creator(self):
    """Map sourceResource.creator

    In feed XML:
        //record/metadata/mods/name/namePart[./role/roleTerm='creator']
    In body of mapper request JSON:
        .name like this:
        {'namePart': 'Creator Name',
         'role': {
            'roleTerm': {'#text': 'creator', 'type': 'text'}
            }
        }

    A name counts as a creator when its roleTerm equals 'creator' directly
    or through its text node.  Names lacking the expected keys or a text
    node are skipped; a TypeError is allowed to propagate.
    """
    # only in MHM, SLU, and WUSTL collections
    names = iterify(self.provider_data.get('name', []))
    if not names:
        return

    creators = []
    for entry in names:
        try:
            role_term = entry['role']['roleTerm']
            if role_term == 'creator' or textnode(role_term) == 'creator':
                creators.append(entry['namePart'])
        except (KeyError, NoTextNodeError):
            # No name with a roleTerm of "creator," but it's not required.
            continue

    if creators:
        self.update_source_resource({'creator': creators})