def map_type(self):
    """Get type from objectType or object_type element

    Specifically, look at freetext/objectType[label=Type] and
    indexedStructured/object_type.
    """
    # Extract the three candidate element lists up front.
    typed_ots = self.extract_xml_items("freetext", "objectType")
    phys_descs = self.extract_xml_items("freetext", "physicalDescription")
    plain_ots = self.extract_xml_items("indexedStructured", "object_type")

    # Physical-description keywords: "#text" values, lowercased,
    # with blanks dropped.
    phys_type_strings = [
        text.lower()
        for text in (pd.get("#text", "").strip() for pd in phys_descs)
        if text
    ]

    # Object-type keywords: objectType entries labeled "Type" first,
    # then every indexedStructured object_type string.
    object_type_strings = [
        text.lower()
        for text in (ot.get("#text", "").strip() for ot in typed_ots
                     if ot.get("@label", "") == "Type")
        if text
    ]
    object_type_strings.extend(
        text.lower()
        for text in (ot.strip() for ot in plain_ots)
        if text
    )

    try:
        new_type = itemtype.type_for_strings_and_mappings(
            [(phys_type_strings, self.type_for_phys_keyword),
             (object_type_strings, self.type_for_ot_keyword)]
        )
    except itemtype.NoTypeError:
        # Fall back to "image" when no keyword maps to a type.
        logger.warning("Can not deduce type for item with _id: %s"
                       % self.provider_data.get("_id", "[no _id]"))
        new_type = "image"

    self.update_source_resource({"type": new_type})
def map_rights(self, _dict, tag, codes):
    """Map the first rights code in ``codes`` to a rights statement.

    Looks the code up in self.rights_desc, appends the HathiTrust
    access-use URL, and stores the result at sourceResource/rights.
    Problems are logged rather than raised.
    """
    values = self._get_values(_dict, codes)
    if not values:
        # Previously `values[0]` raised an uncaught IndexError here.
        logger.warning("No rights code given for %s"
                       % self.provider_data["_id"])
        return
    code = values[0]
    try:
        rights = self.rights_desc[code]
        rights += ". Learn more at http://www.hathitrust.org/access_use"
        setprop(self.mapped_data, "sourceResource/rights", rights)
    except KeyError as e:
        # Code is not in the rights_desc lookup table.
        logger.warning("Unacceptable rights code for %s: %s"
                       % (self.provider_data["_id"], e.message))
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # still propagate while any other failure is logged best-effort.
        logger.error("Could not get rights from %s"
                     % self.provider_data["_id"])
def validatemapv3(body, ctype):
    """
    Service that accepts a JSON document and validates it against the DPLA
    Metadata Application Profile v3 JSON Schema.
    """
    # TODO: Send GET request to API once schema endpoint is created
    try:
        data = json.loads(body)
        id_for_msg = data.get('_id', '[no id]')
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    valid = None
    validation_message = None
    ingest_type = data.get('ingestType', None)
    try:
        if ingest_type is None:
            logger.warning(
                'Could not get ingestType for record with id %s; '
                'refusing to validate.' % id_for_msg)
        elif ingest_type not in MAPV3_SCHEMAS:
            # An unrecognized ingestType used to raise an uncaught KeyError
            # at the MAPV3_SCHEMAS lookup; refuse to validate instead.
            logger.warning(
                'No schema for ingestType %s (record id %s); '
                'refusing to validate.' % (ingest_type, id_for_msg))
        else:
            validate(data, MAPV3_SCHEMAS[ingest_type])
            valid = True
    except ValidationError as err:
        valid = False
        logger.warning('Could not validate %s record with id %s: %s' %
                       (ingest_type, id_for_msg, err.message))
        validation_message = err.message

    # Record the outcome on the document's admin section.
    if "admin" in data:
        data["admin"]["valid_after_enrich"] = valid
        data["admin"]["validation_message"] = validation_message
    else:
        data["admin"] = {
            "valid_after_enrich": valid,
            "validation_message": validation_message
        }
    return json.dumps(data)
def validatemapv3(body, ctype):
    """
    Service that accepts a JSON document and validates it against the DPLA
    Metadata Application Profile v3 JSON Schema.
    """
    # TODO: Send GET request to API once schema endpoint is created
    try:
        data = json.loads(body)
        id_for_msg = data.get('_id', '[no id]')
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    valid = None
    validation_message = None
    ingest_type = data.get('ingestType', None)
    try:
        if ingest_type is not None:
            schema = MAPV3_SCHEMAS.get(ingest_type)
            if schema is not None:
                validate(data, schema)
                valid = True
            else:
                # An unknown ingestType formerly raised an uncaught KeyError
                # at MAPV3_SCHEMAS[ingest_type].
                logger.warning('No MAPv3 schema for ingestType %s (record id %s); refusing to validate.' % (ingest_type, id_for_msg))
        else:
            logger.warning('Could not get ingestType for record with id %s; refusing to validate.' % id_for_msg)
    except ValidationError as err:
        valid = False
        logger.warning('Could not validate %s record with id %s: %s' % (ingest_type, id_for_msg, err.message))
        validation_message = err.message

    # Stamp the validation outcome onto the admin section.
    if "admin" in data:
        data["admin"]["valid_after_enrich"] = valid
        data["admin"]["validation_message"] = validation_message
    else:
        data["admin"] = {"valid_after_enrich": valid,
                         "validation_message": validation_message}
    return json.dumps(data)
def map_type(self):
    """Get type from objectType or object_type element

    Specifically, look at freetext/objectType[label=Type] and
    indexedStructured/object_type.
    """
    typed_entries = self.extract_xml_items("freetext", "objectType")
    descriptions = self.extract_xml_items("freetext", "physicalDescription")
    plain_entries = self.extract_xml_items("indexedStructured", "object_type")

    phys_keywords = []
    ot_keywords = []

    # Keywords from physical descriptions ("#text", lowercased, non-blank).
    for desc in descriptions:
        text = desc.get("#text", "").strip()
        if text:
            phys_keywords.append(text.lower())

    # Keywords from objectType entries labeled "Type".
    for entry in typed_entries:
        if entry.get("@label", "") != "Type":
            continue
        text = entry.get("#text", "").strip()
        if text:
            ot_keywords.append(text.lower())

    # Keywords from the indexedStructured object_type strings.
    for entry in plain_entries:
        text = entry.strip()
        if text:
            ot_keywords.append(text.lower())

    mappings = [(phys_keywords, self.type_for_phys_keyword),
                (ot_keywords, self.type_for_ot_keyword)]
    try:
        new_type = itemtype.type_for_strings_and_mappings(mappings)
    except itemtype.NoTypeError:
        # Default to "image" when no keyword yields a type.
        logger.warning("Can not deduce type for item with _id: %s"
                       % self.provider_data.get("_id", "[no _id]"))
        new_type = 'image'

    self.update_source_resource({"type": new_type})
def filter_path(_dict, path):
    """Remove empty values found at ``path`` in a deep copy of ``_dict``.

    Arguments:
        _dict - dictionary to clean
        path - an xpath-like path to the value that must be checked

    Returns:
        cleaned dictionary; the original ``_dict`` is returned untouched
        when the path does not exist
    """
    clone = copy.deepcopy(_dict)
    parent_path, _, leaf_key = path.rpartition(PATH_DELIM)
    try:
        target = getprop(clone, parent_path)
    except KeyError:
        logger.warning("Attempt to clean non existent path \"%s\"",
                       parent_path)
        return _dict
    if not leaf_key:
        # Path had no delimiter: clean using the whole (pre-partition) path.
        return filter_dict(target, filter_fields, parent_path)
    # Clean the leaf value and write it back into the copied document.
    setprop(clone, parent_path,
            filter_dict(target, filter_fields, leaf_key))
    return clone
def enrichtype(body, ctype, action="enrich-type", prop="sourceResource/type",
               format_field="sourceResource/format", default=None,
               send_rejects_to_format=False):
    """
    Service that accepts a JSON document and enriches the "type" field of
    that document by mapping existing sourceResource/type and
    sourceResource/format strings onto a normalized type value.

    By default works on the 'type' field, but can be overridden by passing
    the name of the field to use as a parameter.

    A default type, if none can be determined, may be specified with the
    "default" querystring parameter. If no default is given, the type field
    will be unmodified, or not added, in the result.
    """
    global type_for_type_keyword, type_for_format_keyword

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    type_strings = []
    format_strings = []
    try:
        sr_type = data['sourceResource'].get('type', [])
        sr_format = data['sourceResource'].get('format', [])
    except KeyError:
        # sourceResource is not present, so give up and return the
        # original data unmodified.
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('enrich-type lacks sourceResource for _id %s' %
                       id_for_msg)
        return body

    if sr_type:
        for t in (sr_type if isinstance(sr_type, list) else [sr_type]):
            t_flat = t
            if isinstance(t, dict):
                # Accept either "#text" or "text" as the value key.
                t_flat = t.get('#text', None) or t.get('text', '')
            if t_flat:
                # Skip None / empty values: previously a dict with neither
                # key appended '' as a keyword, and a None entry crashed
                # on .lower().
                type_strings.append(t_flat.lower())
    if sr_format:
        for f in (sr_format if isinstance(sr_format, list) else [sr_format]):
            if f:
                format_strings.append(f.lower())

    try:
        data['sourceResource']['type'] = \
            itemtype.type_for_strings_and_mappings([
                (type_strings, type_for_type_keyword),
                (format_strings, type_for_format_keyword),
            ])
    except itemtype.NoTypeError:
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('Can not deduce type for item with _id: %s' %
                       id_for_msg)
        if default:
            data['sourceResource']['type'] = default
        else:
            try:
                del data['sourceResource']['type']
            except KeyError:
                # Was a bare except; only a missing key is expected here.
                pass
    finally:
        # Optionally keep type keywords the mapping rejected as extra
        # format values.
        if send_rejects_to_format and type_strings:
            rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
            if rej:
                if not isinstance(sr_format, list):
                    sr_format = [sr_format]
                sr_format.extend(rej)
                data['sourceResource']['format'] = sr_format
    return json.dumps(data)
def enrichtype(body, ctype, action="enrich-type", prop="sourceResource/type",
               format_field="sourceResource/format", default=None,
               send_rejects_to_format=False):
    """
    Service that accepts a JSON document and enriches the "type" field of
    that document by mapping existing sourceResource/format and
    sourceResource/type strings onto a normalized type value.

    By default works on the 'type' field, but can be overridden by passing
    the name of the field to use as a parameter.

    A default type, if none can be determined, may be specified with the
    "default" querystring parameter. If no default is given, the type field
    will be unmodified, or not added, in the result.
    """
    global type_for_type_keyword, type_for_format_keyword

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    type_strings = []
    format_strings = []
    try:
        sr_type = data['sourceResource'].get('type', [])
        sr_format = data['sourceResource'].get('format', [])
    except KeyError:
        # sourceResource is not present; give up and return the original
        # data unmodified.
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('enrich-type lacks sourceResource for _id %s' %
                       id_for_msg)
        return body

    if sr_type:
        for t in (sr_type if isinstance(sr_type, list) else [sr_type]):
            if isinstance(t, dict):
                t = t.get('#text', '')
            if t:
                # Skip None and empty strings; '' used to slip through as
                # a meaningless keyword when a dict had no '#text' key.
                type_strings.append(t.lower())
    if sr_format:
        for f in (sr_format if isinstance(sr_format, list) else [sr_format]):
            if f:
                format_strings.append(f.lower())

    try:
        # NOTE: this variant passes the format mapping before the type
        # mapping; order preserved from the original.
        data['sourceResource']['type'] = \
            itemtype.type_for_strings_and_mappings([
                (format_strings, type_for_format_keyword),
                (type_strings, type_for_type_keyword)
            ])
    except itemtype.NoTypeError:
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('Can not deduce type for item with _id: %s' %
                       id_for_msg)
        if default:
            data['sourceResource']['type'] = default
        else:
            try:
                del data['sourceResource']['type']
            except KeyError:
                # Was a bare except; only a missing key is expected here.
                pass
    finally:
        # Optionally route rejected type keywords into the format field.
        if send_rejects_to_format and type_strings:
            rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
            if rej:
                if not isinstance(sr_format, list):
                    sr_format = [sr_format]
                sr_format.extend(rej)
                data['sourceResource']['format'] = sr_format
    return json.dumps(data)
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value
    """
    def iso1_to_iso3(s):
        # Strip any regional/script suffix ("en-US", "en_GB", "en/…").
        s = re.sub(r"[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                    re.sub(r"[\.\[\]\(\)]", "", lang_string).lower().strip()
                )
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches.
                    # `match` must be pre-bound: it was previously unbound
                    # (NameError below) when the regex table was empty.
                    match = None
                    for iso_code, regex in \
                            EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break
                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in \
                                WB_LANGUAGE_NAME_REGEXES.items():
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            # De-duplicate while preserving first-seen order.
            seen = set()
            language = [{"iso639_3": code, "name": ISO639_3_SUBST[code]}
                        for code in iso_codes
                        if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s"
                           % (language_strings, data["_id"]))
            delprop(data, prop)
    return json.dumps(data)
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value
    """
    def iso1_to_iso3(s):
        # Drop any regional/script suffix before the 639-1 lookup.
        s = re.sub(r"[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = v if isinstance(v, list) else [v]

        iso_codes = []
        for lang_string in language_strings:
            # 1) Raw value is already an ISO 639-3 code.
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
                continue

            # 2) Try converting an ISO 639-1 code to ISO 639-3.
            iso3 = iso1_to_iso3(
                re.sub(r"[\.\[\]\(\)]", "", lang_string).lower().strip())
            if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                iso_codes.append(iso3)
                continue

            # 3) Exact language name match. Pre-bind `match` so the check
            # below cannot hit an unbound local when the table is empty
            # (formerly a latent NameError).
            match = None
            for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                match = regex.match(lang_string.strip())
                if match:
                    iso_codes.append(iso_code)
                    break
            if match is None:
                # 4) Fall back to word-boundary name matches in the value.
                for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items():
                    if regex.search(lang_string):
                        iso_codes.append(iso_code)

        if iso_codes:
            # De-duplicate while keeping first-seen order.
            seen = set()
            language = [{"iso639_3": code, "name": ISO639_3_SUBST[code]}
                        for code in iso_codes
                        if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s"
                           % (language_strings, data["_id"]))
            delprop(data, prop)
    return json.dumps(data)