def mdlenrichlocation(body,ctype,action="mwdl_enrich_location", prop="sourceResource/spatial"):
    """Enrich the "spatial" field of a JSON document (MWDL records).

    Keeps only the values accepted by is_spatial(), reformatted via
    format_spatial(); removes the property entirely when nothing valid
    remains.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        kept = [format_spatial(s) for s in iterify(getprop(data, prop))
                if is_spatial(s)]
        if kept:
            setprop(data, prop, kept)
        else:
            delprop(data, prop)

    return json.dumps(data)
def geocode_region(spatial):
    """Fill in coordinates/state/country for a known South Carolina region.

    Looks the region name up in REGIONS for its coordinate pair, drops any
    county value, and pins state/country to fixed values. Returns the
    mutated spatial dict.
    """
    coordinate_pair = REGIONS[getprop(spatial, "name")]
    setprop(spatial, "coordinates", "%s, %s" % coordinate_pair)
    delprop(spatial, "county")
    setprop(spatial, "state", "South Carolina")
    setprop(spatial, "country", "United States")
    return spatial
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    # Candidate date patterns, ordered most- to least-specific; the first
    # one that matches the whole cleaned value wins.
    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
    ]

    def cleanup(s):
        # Drop parens, periods and question marks before matching.
        s = re.sub("[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # FIX: normalize to a list up front. The original compared
        # len(<string>) (character count) to len(remove) and, in the
        # partial case, rebuilt the prop by iterating the string
        # character by character.
        if not isinstance(values, list):
            values = [values]
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in values:
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move v only when the single match accounts for the whole
                # cleaned value. FIX: re.escape guards against regex
                # metacharacters in the matched text.
                if len(m) == 1 and not re.sub(re.escape(m[0]), "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Candidate date patterns, ordered most- to least-specific; matched
    # case-insensitively against the cleaned value below.
    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
    ]

    def cleanup(s):
        # Drop parens, periods and question marks before matching.
        s = re.sub("[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        # NOTE(review): if values is a single string, the
        # len(values) == len(remove) comparison below counts characters —
        # confirm callers always pass list-valued props here.
        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move only when exactly one match covers the whole
                # cleaned value.
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def mdlenrichlocation(body, ctype, action="mwdl_enrich_location", prop="sourceResource/spatial"):
    """Enrich the "spatial" field of a JSON document (MWDL records).

    Values failing is_spatial() are discarded; the rest are reformatted
    with format_spatial(). If nothing survives, the property is dropped.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        enriched = []
        for candidate in iterify(getprop(data, prop)):
            if not is_spatial(candidate):
                continue
            enriched.append(format_spatial(candidate))
        if enriched:
            setprop(data, prop, enriched)
        else:
            delprop(data, prop)

    return json.dumps(data)
def update_title(self):
    """Join each multi-part title into a single space-separated string.

    Drops the field entirely when no non-empty titles remain.
    """
    prop = "sourceResource/title"
    title_list = filter(None, getprop(self.mapped_data, prop))
    if not title_list:
        delprop(self.mapped_data, prop)
        return
    setprop(self.mapped_data, prop,
            [" ".join(parts) for parts in title_list])
def delete_field_and_queue_image_harvest(doc, field, cdb, enq): print 'Delete {} for {}'.format(field, doc['_id']) delprop(doc, field, keyErrorAsNone=True) cdb.save(doc) timeout = 10000 results = enq.queue_list_of_ids([doc['_id']], timeout, harvest_image_for_doc, )
def delete_field_and_queue_image_harvest(doc, field, cdb, enq):
    """Delete `field` from doc, save the doc, and enqueue it for image
    harvesting.

    Arguments:
    doc -- the document (must carry an _id)
    field -- path of the field to delete
    cdb -- database handle used to persist the change
    enq -- queue client exposing queue_list_of_ids()
    """
    print 'Delete {} for {}'.format(field, doc['_id'])
    # keyErrorAsNone: a missing field is not an error here.
    delprop(doc, field, keyErrorAsNone=True)
    cdb.save(doc)
    timeout = 10000
    results = enq.queue_list_of_ids( [doc['_id']],
                                     timeout,
                                     harvest_image_for_doc,
                                     )
def unset_prop(body, ctype, prop=None, condition=None, condition_prop=None):
    """Unsets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to unset
    condition -- the condition to be met (uses prop by default)
    condition_prop -- the prop(s) to use in the condition (comma-separated
                      if multiple props)
    """
    # Named predicates; each receives the combined list of values gathered
    # from the condition props.
    CONDITIONS = {
        "is_digit": lambda v: v[0].isdigit(),
        "mwdl_exclude": lambda v: (v[0] == "collections" or
                                   v[0] == "findingAids"),
        "hathi_exclude": lambda v: "Minnesota Digital Library" in v,
        "finding_aid_title": lambda v: v[0].startswith("Finding Aid"),
        "usc_no_contributor": lambda v: not v[0].get("contributor", False)
    }

    def condition_met(condition_prop, condition):
        # Gather the values of every condition prop, then apply the named
        # predicate to the combined list.
        values = []
        for p in condition_prop.split(","):
            # FIX: extend() instead of a side-effecting list comprehension.
            values.extend(iterify(getprop(data, p, True)))
        return CONDITIONS[condition](values)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check if prop exists to avoid key error
    if exists(data, prop):
        if not condition:
            delprop(data, prop)
        else:
            if not condition_prop:
                condition_prop = prop
            try:
                if condition_met(condition_prop, condition):
                    logger.debug("Unsetting prop %s for doc with id %s" %
                                 (prop, data["_id"]))
                    delprop(data, prop)
            except KeyError:
                logger.error("CONDITIONS does not contain %s" % condition)

    return json.dumps(data)
def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data     Dict - Data for conversion.
    prop     Str  - Properties dividided with comma.
    earliest Bool - True  - the function will set only the earliest date.
                    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    for p in prop.split(','):
        # Dates are collected per-prop (reset each iteration).
        dates = []
        if exists(data, p):
            v = getprop(data, p)
            if not isinstance(v, dict):
                if is_year_range_list(v):
                    # A list of years: collapse to one begin/end range.
                    dates.append(
                        {
                            "begin": v[0],
                            "end": v[-1],
                            "displayDate": "%s-%s" % (v[0], v[-1])
                        })
                else:
                    for s in (v if not isinstance(v, basestring) else [v]):
                        # Multiple dates may be packed into one
                        # semicolon-separated string.
                        for part in s.split(";"):
                            display_date = remove_single_brackets_and_strip(
                                part
                            )
                            stripped = clean_date(
                                remove_all_brackets_and_strip(part)
                            )
                            # Too short to contain a year.
                            if len(stripped) < 4:
                                continue
                            a, b = parse_date_or_range(stripped)

                            # DEFAULT_DATETIME_STR as the end marks an
                            # unparseable date; skip it.
                            if b != DEFAULT_DATETIME_STR:
                                dates.append(
                                    {
                                        "begin": a,
                                        "end": b,
                                        "displayDate": display_date
                                    })
            else:
                # Already filled in, probably by mapper
                continue

            # Sort by begin date; None sorts as the sentinel default.
            dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
                       else DEFAULT_DATETIME_STR)

            if dates:
                if earliest:
                    value_to_set = dates[0]
                else:
                    value_to_set = dates
                setprop(data, p, value_to_set)
            else:
                delprop(data, p)
def update_language(self):
    """Normalize sourceResource/language entries.

    Dict entries pass through unchanged; bare strings are wrapped as
    {"name": <string>}. Any other types are dropped.
    """
    raw = iterify(getprop(self.mapped_data, "sourceResource/language", True))
    normalized = []
    for lang in raw:
        if isinstance(lang, dict):
            normalized.append(lang)
        elif isinstance(lang, basestring):
            normalized.append({"name": lang})
    if normalized:
        self.update_source_resource({"language": normalized})
    else:
        delprop(self.mapped_data, "language", True)
def update_language(self):
    """Normalize sourceResource/language entries.

    Dict entries pass through unchanged; bare strings are wrapped as
    {"name": <string>}. Any other types are dropped.
    """
    out_languages = []
    for language in iterify(
            getprop(self.mapped_data, "sourceResource/language", True)):
        if isinstance(language, dict):
            out_languages.append(language)
        elif isinstance(language, basestring):
            out_languages.append({"name": language})
    if out_languages:
        self.update_source_resource({"language": out_languages})
    else:
        # NOTE(review): deletes top-level "language", not the
        # "sourceResource/language" path read above — confirm intended.
        delprop(self.mapped_data, "language", True)
def artstor_spatial_to_dataprovider(body, ctype, prop="sourceResource/spatial"):
    """Sets the dataProvider from sourceResource/spatial by:

    1. Deleting the dataProvider field
    2. Splitting on semicolon if sourceResource/spatial is a string
    3. Moving the first sourceResource/spatial value to dataProvider for
       DPLA* collections
    4. Moving the "Repository: " value to dataProvider for SS* collections
    5. Removing the sourceResource/spatial field for DPLA* collections
    6. Removing any "Accession number: " values from sourceResource/spatial
       for SS* collections
    7. Removing the string "Repository: " from the dataProvider value
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    delprop(data, "dataProvider")
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            v = v.split(";")

        spatial = []
        data_provider = None
        collections = getprop(data, "originalRecord/setSpec", True)
        # The first DPLA*/SS* set spec determines the handling.
        for coll in iterify(collections):
            if coll.startswith("DPLA"):
                # DPLA* collections: first spatial value is the provider.
                data_provider = v[0]
                break
            elif coll.startswith("SS"):
                # SS* collections: the "Repository" entry becomes the
                # provider; "Accession" entries are dropped, the rest kept.
                spatial = []
                for s in v:
                    if "Repository" in s:
                        data_provider = s
                    elif "Accession" not in s:
                        spatial.append(s)
                break

        delprop(data, prop)
        if spatial:
            setprop(data, prop, spatial)
        if data_provider:
            setprop(data, "dataProvider",
                    data_provider.replace("Repository: ", ""))

    return json.dumps(data)
def movedatestotemporal(body,ctype,action="move_dates_to_temporal",prop=None):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    # Paren-stripping substitutions applied to matched date strings.
    REGSUB = ("\(", ""), ("\)", "")
    # Date patterns with optional surrounding parens captured as groups.
    REGSEARCH = ["(\( *)?(\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4})( *\))?",
                 "(\( *)?(\d{4} *[-/] *\d{4})( *\))?",
                 "(\( *)?(\d{4})( *\))?"]

    def cleanup(s):
        for p, r in REGSUB:
            s = re.sub(p, r, s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        p = []
        temporal_field = "aggregatedCHO/temporal"
        temporal = getprop(data, temporal_field) if \
            exists(data, temporal_field) else []

        for d in getprop(data, prop):
            for regsearch in REGSEARCH:
                pattern = re.compile(regsearch)
                for match in pattern.findall(d["name"]):
                    # findall returns group tuples; rejoin into one string.
                    m = "".join(match)
                    #TODO (\( *)? matches 0 and produces '' in m
                    if m:
                        # Cut the date out of the name and store the
                        # paren-stripped form as a temporal value.
                        d["name"] = re.sub(re.escape(m), "", d["name"])
                        temporal.append({"name": cleanup(m)})
            if d["name"].strip():
                # Append to p, which will overwrite data[prop]
                p.append(d)

        if temporal:
            setprop(data, temporal_field, temporal)
        if p:
            setprop(data, prop, p)
        else:
            delprop(data, prop)

    return json.dumps(data)
def update_subject(self):
    """Flatten sourceResource/subject into a list of plain strings.

    String entries are kept as-is; dict entries contribute their "name"
    value when present. The original field is removed and any surviving
    subjects are written back via update_source_resource.
    """
    subjects = []
    if exists(self.mapped_data, "sourceResource/subject"):
        for entry in iterify(getprop(self.mapped_data,
                                     "sourceResource/subject")):
            if isinstance(entry, basestring):
                subjects.append(entry)
            elif isinstance(entry, dict):
                name = getprop(entry, "name", True)
                if name:
                    subjects.append(name)
        delprop(self.mapped_data, "sourceResource/subject", True)
    if subjects:
        self.update_source_resource({"subject": subjects})
def update_subject(self):
    """Flatten sourceResource/subject into a list of plain strings.

    String entries are kept as-is; dict entries contribute their "name"
    value when present. The original field is removed and any surviving
    subjects are written back via update_source_resource.
    """
    subjects = []
    if exists(self.mapped_data, "sourceResource/subject"):
        for subject in iterify(
                getprop(self.mapped_data, "sourceResource/subject")):
            if isinstance(subject, basestring):
                subjects.append(subject)
            elif isinstance(subject, dict):
                s = getprop(subject, "name", True)
                if s:
                    subjects.append(s)
            else:
                # Any other type is silently ignored.
                pass
        delprop(self.mapped_data, "sourceResource/subject", True)
    if subjects:
        self.update_source_resource({"subject": subjects})
def convert(data, prop):
    """Keep only well-formed date dicts under prop.

    For list values, filters out entries failing check_date_dict() and
    drops the prop when nothing survives. For a single non-list value,
    drops the prop unless it passes check_date_dict().
    """
    value = getprop(data, prop)
    if isinstance(value, list):
        valid = [entry for entry in value if check_date_dict(entry)]
        if valid:
            setprop(data, prop, valid)
        else:
            delprop(data, prop)
    elif not check_date_dict(value):
        delprop(data, prop)
def update_data_provider(self):
    """Derive dataProvider from the originalRecord facet-institution when
    the mapped value is not already a plain string.
    """
    new_data_provider = getprop(self.mapped_data, "dataProvider", True)
    # Only strings pass through untouched; unset/dict/list triggers lookup.
    if not isinstance(new_data_provider, basestring):
        facet = getprop(self.provider_data,
                        "doc/originalRecord/facet-institution")
        if isinstance(facet, dict):
            new_data_provider = facet.pop("text", None)
        elif isinstance(facet, list) and len(facet) > 0:
            new_data_provider = facet[0].pop("text", None)
        if not isinstance(new_data_provider, basestring):
            new_data_provider = None
    if new_data_provider:
        self.mapped_data.update(
            {"dataProvider": new_data_provider.replace("::", ", ")})
    else:
        delprop(self.mapped_data, "dataProvider", True)
def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    url = None
    # Take the first absolute URL from "object", but let an absolute
    # originalRecord/doc/isShownBy value override it.
    if exists(data, "object"):
        handle = getprop(data, "object")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break
    if exists(data, "originalRecord/doc/isShownBy"):
        handle = getprop(data, "originalRecord/doc/isShownBy")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break

    if url:
        if 'content.cdlib.org' in url:
            # CDL URLs end in <ark id>/<object type>; the ark id must
            # agree with the one at the end of isShownAt.
            base_url, obj_id, object_type = url.rsplit("/", 2)
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            if obj_id != is_shown_at_id:
                logger.warn(
                    "Object url for %s has ARK value (%s) that does not match isShownAt (%s)" %
                    (data["_id"], obj_id, is_shown_at_id))
                # Trust isShownAt's id and rebuild the object URL.
                obj_id = is_shown_at_id
                url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                # Keep the hi-res URL as hasView; point object at the
                # thumbnail variant instead.
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')
        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)

    return json.dumps(data)
def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    url = None
    # Take the first absolute URL from "object", but let an absolute
    # originalRecord/doc/isShownBy value override it.
    if exists(data, "object"):
        handle = getprop(data, "object")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break
    if exists(data, "originalRecord/doc/isShownBy"):
        handle = getprop(data, "originalRecord/doc/isShownBy")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break

    if url:
        if 'content.cdlib.org' in url:
            # CDL URLs end in <ark id>/<object type>; the ark id must
            # agree with the one at the end of isShownAt.
            base_url, obj_id, object_type = url.rsplit("/", 2)
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            if obj_id != is_shown_at_id:
                logger.warn("Object url for %s has ARK value (%s) that does not match isShownAt (%s)" %
                            (data["_id"], obj_id, is_shown_at_id))
                # Trust isShownAt's id and rebuild the object URL.
                obj_id = is_shown_at_id
                url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                # Keep the hi-res URL as hasView; point object at the
                # thumbnail variant instead.
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')
        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)

    return json.dumps(data)
def comparewithschema(body, ctype):
    """
    Service that accepts a JSON document and removes any fields not listed
    as part of the schema.
    """
    # TODO: Send GET request to API once schema endpoint is created
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    # Items need both an _id and a sourceResource to be validated.
    if "_id" not in data or ("sourceResource" not in data and
                             data.get("ingestType") == "item"):
        return body

    # FIX: renamed from "type" to avoid shadowing the builtin.
    ingest_type = data.get("ingestType")
    if ingest_type:
        props = ["collection/properties"] if ingest_type == "collection" \
            else ["item/properties",
                  "item/properties/sourceResource/properties"]
        for prop in props:
            # Keys allowed by the (module-level) schema at this level.
            schema_keys = getprop(schema, prop).keys()
            if "sourceResource" in prop:
                data_keys = data["sourceResource"].keys()
                field_prefix = "sourceResource/"
            else:
                data_keys = data.keys()
                # _id is never in the schema; presence guaranteed above.
                data_keys.remove("_id")
                field_prefix = ""

            # Remove any keys in the document that are not found in the
            # schema
            for field in [k for k in data_keys if k not in schema_keys]:
                field = field_prefix + field
                logger.error("Field %s for %s not found in schema; deleting" %
                             (field, data.get("_id")))
                delprop(data, field)
    else:
        logger.error("Unknown type %s for %s" % (ingest_type,
                                                 data.get("_id")))

    return json.dumps(data)
def comparewithschema(body, ctype): """ Service that accepts a JSON document and removes any fields not listed as part of the schema. """ # TODO: Send GET request to API once schema endpoint is created try: data = json.loads(body) except: response.code = 500 response.add_header("content-type", "text/plain") return "Unable to parse body as JSON" if "_id" not in data or ("sourceResource" not in data and data.get("ingestType") == "item"): return body type = data.get("ingestType") if type: props = ["collection/properties"] if type == "collection" else \ ["item/properties", "item/properties/sourceResource/properties"] for prop in props: schema_keys = getprop(schema, prop).keys() if "sourceResource" in prop: data_keys = data["sourceResource"].keys() field_prefix = "sourceResource/" else: data_keys = data.keys() data_keys.remove("_id") field_prefix = "" # Remove any keys in the document that are not found in the schema for field in [k for k in data_keys if k not in schema_keys]: field = field_prefix + field logger.error("Field %s for %s not found in schema; deleting" % (field, data.get("_id"))) delprop(data, field) else: logger.error("Unknown type %s for %s" % (type, data.get("_id"))) return json.dumps(data)
def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data     Dict - Data for conversion.
    prop     Str  - Properties dividided with comma.
    earliest Bool - True - the function will set only the earliest date.
                    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    # NOTE(review): dates is shared across every prop in the split —
    # values parsed from an earlier prop are re-set on later props.
    # Confirm this accumulation is intended.
    dates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            # A dict value means the date was already converted; skip
            # re-parsing it.
            if not isinstance(v, dict):
                for s in (v if not isinstance(v, basestring) else [v]):
                    # Semicolons separate multiple dates in one string.
                    for part in s.split(";"):
                        display_date = remove_brackets_and_strip(part)
                        stripped = clean_date(display_date)
                        # Too short to contain a year.
                        if len(stripped) < 4:
                            continue
                        a, b = parse_date_or_range(stripped)

                        # '3000-01-01' marks an unparseable end date.
                        if b != '3000-01-01':
                            dates.append({
                                "begin": a,
                                "end": b,
                                "displayDate": display_date
                            })

        # Sort by begin date; None sorts as the sentinel default.
        dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
                   else DEFAULT_DATETIME_STR)

        value_to_set = dates
        if earliest and dates:
            value_to_set = dates[0]

        if value_to_set:
            setprop(data, p, value_to_set)
        else:
            if exists(data, p):
                delprop(data, p)
def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data     Dict - Data for conversion.
    prop     Str  - Properties dividided with comma.
    earliest Bool - True - the function will set only the earliest date.
                    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    # NOTE(review): dates is shared across every prop in the split —
    # values parsed from an earlier prop are re-set on later props.
    # Confirm this accumulation is intended.
    dates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            # A dict value means the date was already converted; skip
            # re-parsing it.
            if not isinstance(v, dict):
                for s in (v if not isinstance(v, basestring) else [v]):
                    # Semicolons separate multiple dates in one string.
                    for part in s.split(";"):
                        display_date = remove_brackets_and_strip(part)
                        stripped = clean_date(display_date)
                        # Too short to contain a year.
                        if len(stripped) < 4:
                            continue
                        a, b = parse_date_or_range(stripped)

                        # '3000-01-01' marks an unparseable end date.
                        if b != '3000-01-01':
                            dates.append(
                                {
                                    "begin": a,
                                    "end": b,
                                    "displayDate" : display_date
                                })

        # Sort by begin date; None sorts as the sentinel default.
        dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
                   else DEFAULT_DATETIME_STR)

        value_to_set = dates
        if earliest and dates:
            value_to_set = dates[0]

        if value_to_set:
            setprop(data, p, value_to_set)
        else:
            if exists(data, p):
                delprop(data, p)
def convert(data, prop):
    """Run convert_field over prop's value(s), dropping "creator" entries.

    A bare "creator" string removes the prop; in a list, "creator" items
    are skipped and the remaining converted values are written back.
    """
    value = getprop(data, prop, True)
    if not value:
        return
    if isinstance(value, basestring):
        if value == "creator":
            delprop(data, prop)
        else:
            setprop(data, prop, convert_field(value))
    elif isinstance(value, list):
        converted = [convert_field(item) for item in value
                     if item != "creator"]
        setprop(data, prop, converted)
def artstor_spatial_to_dataprovider(body, ctype, prop="sourceResource/spatial"):
    """ Splits spatial on semicolon and copies the first value to dataProvider """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        provider = getprop(data, prop)
        # Reduce list -> first element, then string -> first ";" segment.
        if isinstance(provider, list):
            provider = provider[0]
        if isinstance(provider, basestring):
            provider = provider.split(";")[0]
        setprop(data, "dataProvider", provider)
        delprop(data, prop)

    return json.dumps(data)
def remove_list_values(body, ctype, prop=None, values=None):
    """Given a comma-separated string of values, removes any instance of
    each value from the prop.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    current = getprop(data, prop, True)
    if isinstance(current, list) and values is not None:
        unwanted = values.split(",")
        kept = [item for item in current if item not in unwanted]
        if kept:
            setprop(data, prop, kept)
        else:
            # Nothing left: remove the prop entirely.
            delprop(data, prop)

    return json.dumps(data)
def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field
    of that document by:

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg ->
       image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and its
       format is not set
    f) Setting type field from format field, if it is not set. The format
       field is taken if it is a string, or the first element if it is a
       list. It is then split and the first part of IMT is taken.

    By default works on the 'sourceResource/format' field but can be
    overridden by passing the name of the field to use as the 'prop'
    parameter.
    """
    # IMT major type -> DPLA type value.
    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    # Cleanup substitutions applied in order to the lowercased value.
    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')

    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']

    def get_ext(s):
        # File extension without the dot, or "" when there is none.
        ext = os.path.splitext(s)[1].split('.')
        return ext[1] if len(ext) == 2 else ""

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        # Keep only the leading IMT-like token; drop trailing text.
        s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        logger.debug("Checking: " + s)
        # A valid IMT must carry a subtype, hence the required "/".
        imt_regexes = [re.compile('^' + x + '(/)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    imt_values = []
    if exists(data, prop):
        v = getprop(data, prop)
        format = []
        hasview_format = []

        for s in (v if not isinstance(v, basestring) else [v]):
            if s.startswith("http") and is_absolute(s):
                # Use the URL's file extension as the candidate format.
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                   exists(data, "hasView/format") and \
                   cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            else:
                # Retain non-IMT values in sourceResource/format,
                # non-cleaned
                if s not in format:
                    format.append(s)

        if format:
            # Single values are unwrapped from their list.
            if len(format) == 1:
                format = format[0]
            setprop(data, prop, format)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        type = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in type:
                type.append(t)
        if type:
            if len(type) == 1:
                type = type[0]
            setprop(data, type_field, type)

    return json.dumps(data)
def enrichtype(body, ctype, action="enrich-type", prop="sourceResource/type",
               format_field="sourceResource/format"):
    """
    Service that accepts a JSON document and enriches the "type" field of
    that document by:

    a) making the type lowercase
    b) converting "image" to "still image" (TODO: Amy to confirm that this
       is ok)
    c) applying a set of regexps to do data cleanup (remove plural forms)
    d) moving all items that are not standard DC types to the
       sourceResource/format
       (http://dublincore.org/documents/resource-typelist/)

    By default works on the 'type' field, but can be overridden by passing
    the name of the field to use as a parameter
    """
    # Substring substitutions applied in order to the lowercased value.
    REGEXPS = ('images', 'image'), ('still image', 'image'),\
              ('textual records', 'text'),\
              ('photographs and other graphic materials', 'image'),\
              ('texts', 'text')
    DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
                'interactive resource', 'moving image', 'physical object',
                'service', 'software', 'sound', 'text']

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        dctype = []
        # Non-DC values get appended to the existing format field values.
        f = getprop(data, format_field) if exists(data, format_field) else []
        if not isinstance(f, list):
            f = [f]

        for s in (v if not isinstance(v, basestring) else [v]):
            if is_dc_type(cleanup(s)):
                dctype.append(cleanup(s))
            else:
                f.append(s)

        if dctype:
            # Single values are unwrapped from their list.
            if len(dctype) == 1:
                dctype = dctype[0]
            setprop(data, prop, dctype)
        else:
            delprop(data, prop)
        if len(f) > 1:
            setprop(data, format_field, f)
        elif len(f) == 1:
            setprop(data, format_field, f[0])

    return json.dumps(data)
def copyprop(body, ctype, prop=None, to_prop=None, create=False, key=None,
             remove=None, no_replace=None, no_overwrite=None):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove -- removes prop if True (default False)
    no_replace -- creates list of to_prop string and appends prop if True
    no_overwrite -- skips the copy entirely if to_prop already exists
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        pass
    else:
        if exists(data, prop) and create and not exists(data, to_prop):
            # Seed to_prop with a type that matches how it will be filled.
            val = {} if key else ""
            setprop(data, to_prop, val)

        if exists(data, prop) and exists(data, to_prop):
            val = getprop(data, prop)
            to_element = getprop(data, to_prop)

            if isinstance(to_element, basestring):
                if no_replace:
                    el = [to_element] if to_element else []
                    el.append(val)
                    # Flatten
                    val = [e for s in el for e in
                           (s if not isinstance(s, basestring) else [s])]
                setprop(data, to_prop, val)
            else:
                # If key is set, assume to_element is dict or list of dicts
                if key:
                    if not isinstance(to_element, list):
                        to_element = [to_element]
                    # FIX: loop variable renamed from "dict" to avoid
                    # shadowing the builtin.
                    for d in to_element:
                        if exists(d, key) or create:
                            setprop(d, key, val)
                        else:
                            msg = "Key %s does not exist in %s" % (key,
                                                                   to_prop)
                            logger.debug(msg)
                else:
                    # Handle case where to_element is a list
                    if isinstance(to_element, list):
                        if isinstance(val, list):
                            to_element = to_element + val
                        else:
                            to_element.append(val)
                        setprop(data, to_prop, to_element)
                    else:
                        # to_prop is dictionary but no key was passed.
                        msg = "%s is a dictionary but no key was passed" % \
                              to_prop
                        logger.warn(msg)
                        setprop(data, to_prop, val)

            if remove:
                delprop(data, prop)

    return json.dumps(data)
def __init__(self, provider_data, key_prefix=None, datafield_tag='datafield',
             controlfield_tag='controlfield', pymarc=False):
    """Set up MARC mapper state: control-field slots, identifier labels,
    the tag -> handler mapping, and the format-code -> type tables.
    """
    super(MARCMapper, self).__init__(provider_data, key_prefix)

    # Fields controlfield, datafield, and leader may be nested within the
    # metadata/record field for DPLA fetcher items
    prop = "metadata/record"
    if exists(self.provider_data, prop):
        self.provider_data.update(getprop(self.provider_data, prop))
        delprop(self.provider_data, prop)

    # Slices of the MARC control fields, filled in during mapping.
    self.control_001 = ""
    self.control_007_01 = ""
    self.control_008_18 = ""
    self.control_008_21 = ""
    self.control_008_28 = ""
    self.control_format_char = ""

    self.datafield_tag = datafield_tag
    self.controlfield_tag = controlfield_tag
    self.datafield_086_or_087 = False
    self.pymarc = pymarc

    # Labels prepended to identifier values, keyed by MARC tag.
    self.identifier_tag_labels = {
        "020": "ISBN:",
        "022": "ISSN:",
        "050": "LC call number:"
    }

    # Mapping dictionary for use with datafield
    # Keys are used to check if there is a tag match. If so, the value
    # provides a list of (property, code) tuples. In the case where
    # certain tags have prominence over others, an index is used and the
    # tuples will be of the form (property, index, code). To exclude a
    # code, prefix it with a "!": [("format", "!cd")] will exclude the "c"
    # "d" codes (see method _get_values).
    self.mapping_dict = {
        lambda t: t == "856": [(self.map_is_shown_at, "u"),
                               (self.map_is_shown_by, "u")],
        lambda t: t == "041": [(self.map_language, "a")],
        lambda t: t == "260": [(self.map_display_date, "c"),
                               (self.map_publisher, "ab")],
        lambda t: t == "300": [(self.map_extent, None)],
        lambda t: t in ("337", "338"): [(self.map_format, "a")],
        lambda t: t == "340": [(self.map_extent, "b"),
                               (self.map_format, "a")],
        lambda t: t == "050": [(self.map_identifier, "ab")],
        lambda t: t in ("020", "022", "035"): [(self.map_identifier, "a")],
        lambda t: t in ("100", "110", "111"): [(self.map_creator, None)],
        lambda t: (760 <= int(t) <= 787): [(self.map_relation, None)],
        lambda t: (t != "538" and t.startswith("5")):
            [(self.map_description, "a")],
        lambda t: t in ("506", "540"): [(self.map_rights, None)],
        lambda t: t == "648": [(self.map_temporal, None)],
        lambda t: t in ("700", "710", "711", "720"):
            [(self.map_contributor, None)],
        # Titles: 245 takes prominence (index 0), then 242, then 240.
        lambda t: t == "245": [(self.map_title, 0, "!c")],
        lambda t: t == "242": [(self.map_title, 1, None)],
        lambda t: t == "240": [(self.map_title, 2, None)],
        lambda t: t == "651": [(self.map_spatial, "a")],
        lambda t: (int(t) in set([600, 630, 650, 651] +
                                 range(610, 620) +
                                 range(653, 659) +
                                 range(690, 700))):
            [(self.map_subject, None),
             (self.map_format, "v"),
             (self.map_temporal, "y"),
             (self.map_spatial, "z")],
    }

    # (label, DPLA type) pairs keyed by two-letter format code
    # ("datafield") or by a regex over the leader ("leader").
    self.type_mapping = {
        "datafield": OrderedDict([
            ("AJ", ("Journal", "Text")),
            ("AN", ("Newspaper", "Text")),
            ("BI", ("Biography", "Text")),
            ("BK", ("Book", "Text")),
            ("CF", ("Computer File", "Interactive Resource")),
            ("CR", ("CDROM", "Interactive Resource")),
            ("CS", ("Software", "Software")),
            ("DI", ("Dictionaries", "Text")),
            ("DR", ("Directories", "Text")),
            ("EN", ("Encyclopedias", "Text")),
            ("HT", ("HathiTrust", None)),
            ("MN", ("Maps-Atlas", "Image")),
            ("MP", ("Map", "Image")),
            ("MS", ("Musical Score", "Text")),
            ("MU", ("Music", "Text")),
            ("MV", ("Archive", "Collection")),
            ("MW", ("Manuscript", "Text")),
            ("MX", ("Mixed Material", "Collection")),
            ("PP", ("Photograph/Pictorial Works", "Image")),
            ("RC", ("Audio CD", "Sound")),
            ("RL", ("Audio LP", "Sound")),
            ("RM", ("Music", "Sound")),
            ("RS", ("Spoken word", "Sound")),
            ("RU", (None, "Sound")),
            ("SE", ("Serial", "Text")),
            ("SX", ("Serial", "Text")),
            ("VB", ("Video (Blu-ray)", "Moving Image")),
            ("VD", ("Video (DVD)", "Moving Image")),
            ("VG", ("Video Games", "Moving Image")),
            ("VH", ("Video (VHS)", "Moving Image")),
            ("VL", ("Motion Picture", "Moving Image")),
            ("VM", ("Visual Material", "Image")),
            ("WM", ("Microform", "Text")),
            ("XC", ("Conference", "Text")),
            ("XS", ("Statistics", "Text"))
        ]),
        "leader": OrderedDict([
            ("am", ("Book", "Text")),
            ("asn", ("Newspapers", "Text")),
            ("as", ("Serial", "Text")),
            ("aa", ("Book", "Text")),
            ("a(?![mcs])", ("Serial", "Text")),
            ("[cd].*", ("Musical Score", "Text")),
            ("t.*", ("Manuscript", "Text")),
            ("[ef].*", ("Maps", "Image")),
            ("g.[st]", ("Photograph/Pictorial Works", "Image")),
            ("g.[cdfo]", ("Film/Video", "Moving Image")),
            ("g.*", (None, "Image")),
            ("k.*", ("Photograph/Pictorial Works", "Image")),
            ("i.*", ("Nonmusic", "Sound")),
            ("j.*", ("Music", "Sound")),
            ("r.*", (None, "Physical object")),
            ("p[cs].*", (None, "Collection")),
            ("m.*", (None, "Interactive Resource")),
            ("o.*", (None, "Collection"))
        ])
    }
from akara import response
from akara.services import simple_service
from amara.thirdparty import json
from dplaingestion.selector import delprop


@simple_service('POST', 'http://purl.org/la/dp/remove_property',
                'remove_property', 'application/json')
def remove_property(body, ctype, prop, action="remove_property"):
    """Service that deletes the named property from a JSON document.

    Responds with HTTP 500 and a plain-text message when the body is not
    valid JSON; otherwise returns the re-serialized document with ``prop``
    removed.
    """
    try:
        data = json.loads(body)
    except Exception as err:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON: " + str(err)

    delprop(data, prop, True)
    return json.dumps(data)
def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and its
       format is not set
    f) Setting type field from format field, if it is not set. The format
       field is taken if it is a string, or the first element if it is a
       list. It is then split and the first part of IMT is taken.

    By default works on the 'sourceResource/format' field but can be
    overridden by passing the name of the field to use as the 'prop'
    parameter.
    """
    # Maps the major IMT type (the part before "/") to a DPLA type value.
    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    # (pattern, replacement) pairs applied in order by cleanup() below.
    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')

    # Major media types recognized by is_imt(); the "(/)" group there
    # requires a subtype to be present.
    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']

    def get_ext(s):
        # Return the file extension of a path/URL without its dot, or "".
        ext = os.path.splitext(s)[1].split('.')
        return ext[1] if len(ext) == 2 else ""

    def cleanup(s):
        # Lowercase/trim, normalize via REGEXPS, then drop any trailing
        # free text after the first "type/subtype"-shaped token.
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        # True when s starts with "<major-type>/" for a known major type.
        imt_regexes = [re.compile('^' + x + '(/)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    imt_values = []

    if exists(data, prop):
        v = getprop(data, prop)
        format = []
        hasview_format = []

        # NOTE: filter(None, v) silently drops empty/None entries when the
        # incoming value is a list.
        for s in (filter(None, v) if not isinstance(v, basestring) else [v]):
            if s is not None and s.startswith("http") and is_absolute(s):
                # For URL values, derive the format from the file extension.
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                   exists(data, "hasView/format") and \
                   cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            else:
                # Retain non-IMT values in sourceResource/format, non-cleaned
                if s not in format:
                    format.append(s)

        if format:
            # Collapse a single-element list to a bare string.
            if len(format) == 1:
                format = format[0]
            setprop(data, prop, format)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        type = []
        for imt in imt_values:
            # Map the major IMT type ("image/jpeg" -> "image") to a DPLA type.
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in type:
                type.append(t)

        if type:
            if len(type) == 1:
                type = type[0]
            setprop(data, type_field, type)

    return json.dumps(data)
def copyprop(body, ctype, prop=None, to_prop=None, create=False, key=None,
             remove=None, no_replace=None, no_overwrite=None):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove -- removes prop if True (default False)
    no_replace -- creates list of to_prop string and appends prop if True
    no_overwrite -- skips the copy entirely if to_prop already exists
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        # to_prop is already set and the caller asked us not to clobber it.
        pass
    else:
        if exists(data, prop) and create and not exists(data, to_prop):
            # Seed to_prop with an empty container of the appropriate shape.
            val = {} if key else ""
            setprop(data, to_prop, val)

        if exists(data, prop) and exists(data, to_prop):
            val = getprop(data, prop)
            to_element = getprop(data, to_prop)

            if isinstance(to_element, basestring):
                if no_replace:
                    # Keep the existing string and append the new value,
                    # flattening one level in case val is itself a list.
                    el = [to_element] if to_element else []
                    el.append(val)
                    # Flatten
                    val = [e for s in el for e in
                           (s if not isinstance(s, basestring) else [s])]
                setprop(data, to_prop, val)
            else:
                # If key is set, assume to_element is dict or list of dicts
                if key:
                    if not isinstance(to_element, list):
                        to_element = [to_element]
                    # Loop var renamed from "dict" to stop shadowing the
                    # builtin of the same name.
                    for d in to_element:
                        if exists(d, key) or create:
                            setprop(d, key, val)
                        else:
                            logger.error("Key %s does not exist in %s" %
                                         (key, to_prop))
                else:
                    # Handle case where to_element is a list
                    if isinstance(to_element, list):
                        if isinstance(val, list):
                            to_element = to_element + val
                        else:
                            to_element.append(val)
                        setprop(data, to_prop, to_element)
                    else:
                        # to_prop is dictionary but no key was passed.
                        logger.warn("%s is a dict but no key was passed" %
                                    to_prop)
                        setprop(data, to_prop, val)

            if remove:
                delprop(data, prop)

    return json.dumps(data)
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value
    """

    def iso1_to_iso3(s):
        # Strip any regional/script suffix ("en-US" -> "en") before lookup;
        # unknown codes pass through unchanged.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                    re.sub("[\.\[\]\(\)]", "", lang_string).lower().strip()
                )
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches.
                    # match is initialized so the fallback below cannot hit
                    # a NameError when EXACT_LANGUAGE_NAME_REGEXES is empty.
                    match = None
                    for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break
                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items():
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            # Preserve first-seen order while dropping duplicate codes.
            seen = set()
            language = [{"iso639_3": code, "name": ISO639_3_SUBST[code]}
                        for code in iso_codes
                        if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data["_id"]))
            delprop(data, prop)

    return json.dumps(data)
def delete_field(doc, field):
    """Delete *field* from *doc* in place.

    NOTE(review): judging by the flag name, keyErrorAsNone makes a missing
    field a no-op rather than an error -- confirm against delprop.
    """
    delprop(doc, field, keyErrorAsNone=True)
def enrichtype(body, ctype, action="enrich-type", prop="sourceResource/type",
               format_field="sourceResource/format"):
    """
    Service that accepts a JSON document and enriches the "type" field of
    that document by:

    a) making the type lowercase
    b) applying a set of regexps to do data cleanup, normalizing variant
       and plural forms to a DC type (e.g. "images"/"still image" -> "image",
       "texts"/"textual records" -> "text"; see REGEXPS)
    c) moving all items that are not standard DC types to the
       sourceResource/format
       (http://dublincore.org/documents/resource-typelist/)

    By default works on the 'type' field, but can be overridden by passing
    the name of the field to use as a parameter
    """
    # (pattern, replacement) pairs applied in order by cleanup().
    REGEXPS = ('images', 'image'), ('still image', 'image'), \
              ('textual records', 'text'), \
              ('photographs and other graphic materials', 'image'), \
              ('texts', 'text')
    DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
                'interactive resource', 'moving image', 'physical object',
                'service', 'software', 'sound', 'text']

    def cleanup(s):
        # Lowercase/trim, then apply each normalization regex in order.
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        dctype = []
        # Start from any existing format value(s); non-DC types get appended.
        f = getprop(data, format_field) if exists(data, format_field) else []
        if not isinstance(f, list):
            f = [f]

        for s in (v if not isinstance(v, basestring) else [v]):
            # Compute the cleaned value once (was computed twice per item).
            cleaned = cleanup(s)
            if is_dc_type(cleaned):
                dctype.append(cleaned)
            else:
                # Retain the original, non-cleaned value in format.
                f.append(s)

        if dctype:
            # Collapse a single-element list to a bare string.
            if len(dctype) == 1:
                dctype = dctype[0]
            setprop(data, prop, dctype)
        else:
            delprop(data, prop)

        if len(f) > 1:
            setprop(data, format_field, f)
        elif len(f) == 1:
            setprop(data, format_field, f[0])

    return json.dumps(data)
def cleanup_language(body, ctype, action="cleanup_language",
                     prop="sourceResource/language"):
    """
    Service that accepts a JSON document and cleans each value of the
    language field of that document by:

    a) stripping periods, brackets and parentheses
    b) convert from ISO 639-1 to ISO 639-3
    c) looking for matches in the value using LANGUAGE_NAME_REGEXES
    """

    def iso1_to_iso3(s):
        # Strip any regional/script suffix ("en-US" -> "en") before lookup;
        # unknown codes pass through unchanged.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        v = [v] if not isinstance(v, list) else v
        languages = []
        for s in v:
            # Raw value is already a known ISO 639-3 code.
            if s not in languages and s in ISO639_3_SUBST:
                languages.append(s)
            else:
                s = re.sub("[\.\[\]]", "", s).lower().strip()
                iso = re.sub("[\(\)]", "", s)
                # First convert iso1 to iso3
                iso = iso1_to_iso3(iso)
                if iso in ISO639_3_SUBST and iso not in languages:
                    languages.append(iso)
                else:
                    for n in iso.split(" "):
                        # Since we split on whitespace, we only want to check
                        # against single word reference names so we use
                        # ISO639_3_1
                        n = n.title()
                        if n in ISO639_3_1.values() and n not in languages:
                            languages.append(n)
                    # Use s (with parentheses intact)
                    match = [r.search(s).group() for r in
                             LANGUAGE_NAME_REGEXES if r.search(s)]
                    if match:
                        languages += list(set([m.strip().title()
                                               for m in match]) -
                                          set(languages))

        if languages:
            # Remove duplicates -- keep a value only if its substituted name
            # is not itself already in the list. A plain loop replaces the
            # previous side-effecting list comprehension.
            lang = []
            for l in languages:
                if ISO639_3_SUBST.get(l, None) not in languages:
                    lang.append(l)
            setprop(data, prop, filter(None, lang))
        else:
            delprop(data, prop)

    return json.dumps(data)
def geocode_region(spatial):
    """Fill in coordinates/state/country for a known South Carolina region.

    Looks up the region's coordinate pair in REGIONS by the spatial "name",
    drops the county, and sets fixed state/country values.
    """
    # "%s, %s" matches the canonical copy of this helper elsewhere in the
    # codebase; the previous "%s %s" produced "lat lon" without the
    # conventional comma separator.
    setprop(spatial, "coordinates",
            "%s, %s" % REGIONS[getprop(spatial, "name")])
    delprop(spatial, "county")
    setprop(spatial, "state", "South Carolina")
    setprop(spatial, "country", "United States")

    return spatial
def cleanup_language(body, ctype, action="cleanup_language",
                     prop="sourceResource/language"):
    """
    Service that accepts a JSON document and cleans each value of the
    language field of that document by:

    a) stripping periods, brackets and parentheses
    b) convert from ISO 639-1 to ISO 639-3
    c) looking for matches in the value using LANGUAGE_NAME_REGEXES
    """

    def iso1_to_iso3(s):
        # Strip any regional/script suffix ("en-US" -> "en") before lookup;
        # unknown codes pass through unchanged.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        v = [v] if not isinstance(v, list) else v
        languages = []
        for s in v:
            # Raw value is already a known ISO 639-3 code.
            if s not in languages and s in ISO639_3_SUBST:
                languages.append(s)
            else:
                s = re.sub("[\.\[\]]", "", s).lower().strip()
                iso = re.sub("[\(\)]", "", s)
                # First convert iso1 to iso3
                iso = iso1_to_iso3(iso)
                if iso in ISO639_3_SUBST and iso not in languages:
                    languages.append(iso)
                else:
                    for n in iso.split(" "):
                        # Since we split on whitespace, we only want to check
                        # against single word reference names so we use
                        # ISO639_3_1
                        n = n.title()
                        if n in ISO639_3_1.values() and n not in languages:
                            languages.append(n)
                    # Use s (with parentheses intact)
                    match = [r.search(s).group() for r in
                             LANGUAGE_NAME_REGEXES if r.search(s)]
                    if match:
                        languages += list(set([m.strip().title()
                                               for m in match]) -
                                          set(languages))

        if languages:
            # Remove duplicates -- keep a value only if its substituted name
            # is not itself already in the list. A plain loop replaces the
            # previous side-effecting list comprehension.
            lang = []
            for l in languages:
                if ISO639_3_SUBST.get(l, None) not in languages:
                    lang.append(l)
            setprop(data, prop, filter(None, lang))
        else:
            delprop(data, prop)

    return json.dumps(data)
def __init__(self, provider_data, key_prefix=None):
    # Set up per-record scratch state and the tag -> mapping-method dispatch
    # tables used while walking the record's MARC fields.
    super(MARCMapper, self).__init__(provider_data, key_prefix)

    # Fields controlfield, datafield, and leader may be nested within the
    # metadata/record field
    prop = "metadata/record"
    if exists(self.provider_data, prop):
        self.provider_data.update(getprop(self.provider_data, prop))
        delprop(self.provider_data, prop)

    # Scratch values filled in from MARC control fields during mapping.
    self.control_001 = ""
    self.control_007_01 = ""
    self.control_008_18 = ""
    self.control_008_21 = ""
    self.control_008_28 = ""
    self.control_format_char = ""
    self.datafield_086_or_087 = False

    # Labels prepended to identifier values for the given MARC tags.
    self.identifier_tag_labels = {
        "020": "ISBN:",
        "022": "ISSN:",
        "050": "LC call number:"
    }

    # Mapping dictionary for use with datafield
    # Keys are used to check if there is a tag match. If so, the value
    # provides a list of (property, code) tuples. In the case where certain
    # tags have prominence over others, an index is used and the tuples
    # will be of the form (property, index, code). To exclude a code,
    # prefix it with a "!": [("format", "!cd")] will exclude the "c"
    # "d" codes (see method _get_values).
    # NOTE: each key is a predicate lambda taking the tag string; some
    # predicates call int(t), so they assume numeric tag strings.
    self.mapping_dict = {
        lambda t: t == "856": [(self.map_is_shown_at, "u")],
        lambda t: t == "041": [(self.map_language, "a")],
        lambda t: t == "260": [(self.map_display_date, "c"),
                               (self.map_publisher, "ab")],
        lambda t: t == "300": [(self.map_extent, "ac")],
        lambda t: t in ("337", "338"): [(self.map_format, "a")],
        lambda t: t == "340": [(self.map_extent, "b"),
                               (self.map_format, "a")],
        lambda t: t == "050": [(self.map_identifier, "ab")],
        lambda t: t in ("020", "022", "035"): [(self.map_identifier, "a")],
        lambda t: t in ("100", "110", "111"): [(self.map_creator, None)],
        lambda t: (760 <= int(t) <= 787): [(self.map_relation, None)],
        lambda t: (t != "538" and t.startswith("5")):
            [(self.map_description, "a")],
        lambda t: t in ("506", "540"): [(self.map_rights, None)],
        lambda t: t == "648": [(self.map_temporal, None)],
        lambda t: t in ("700", "710", "711", "720"):
            [(self.map_contributor, None)],
        lambda t: t == "245": [(self.map_title, 0, "!c")],
        lambda t: t == "242": [(self.map_title, 1, None)],
        lambda t: t == "240": [(self.map_title, 2, None)],
        lambda t: t == "651": [(self.map_spatial, "a")],
        # Subject-bearing 6xx tags. NOTE: range() returns lists here
        # (Python 2), so the "+" concatenation is valid.
        lambda t: (int(t) in set([600, 630, 650, 651] +
                                 range(610, 620) +
                                 range(653, 659) +
                                 range(690, 700))):
            [(self.map_subject, None), (self.map_format, "v"),
             (self.map_temporal, "y"), (self.map_spatial, "z")],
    }

    # (label, DPLA type) pairs keyed by datafield code or by a regex to be
    # matched against the leader; OrderedDict preserves match priority.
    self.type_mapping = {
        "datafield": OrderedDict([
            ("AJ", ("Journal", "Text")),
            ("AN", ("Newspaper", "Text")),
            ("BI", ("Biography", "Text")),
            ("BK", ("Book", "Text")),
            ("CF", ("Computer File", "Interactive Resource")),
            ("CR", ("CDROM", "Interactive Resource")),
            ("CS", ("Software", "Software")),
            ("DI", ("Dictionaries", "Text")),
            ("DR", ("Directories", "Text")),
            ("EN", ("Encyclopedias", "Text")),
            ("HT", ("HathiTrust", None)),
            ("MN", ("Maps-Atlas", "Image")),
            ("MP", ("Map", "Image")),
            ("MS", ("Musical Score", "Text")),
            ("MU", ("Music", "Text")),
            ("MV", ("Archive", "Collection")),
            ("MW", ("Manuscript", "Text")),
            ("MX", ("Mixed Material", "Collection")),
            ("PP", ("Photograph/Pictorial Works", "Image")),
            ("RC", ("Audio CD", "Sound")),
            ("RL", ("Audio LP", "Sound")),
            ("RM", ("Music", "Sound")),
            ("RS", ("Spoken word", "Sound")),
            ("RU", (None, "Sound")),
            ("SE", ("Serial", "Text")),
            ("SX", ("Serial", "Text")),
            ("VB", ("Video (Blu-ray)", "Moving Image")),
            ("VD", ("Video (DVD)", "Moving Image")),
            ("VG", ("Video Games", "Moving Image")),
            ("VH", ("Video (VHS)", "Moving Image")),
            ("VL", ("Motion Picture", "Moving Image")),
            ("VM", ("Visual Material", "Image")),
            ("WM", ("Microform", "Text")),
            ("XC", ("Conference", "Text")),
            ("XS", ("Statistics", "Text"))
        ]),
        "leader": OrderedDict([
            ("am", ("Book", "Text")),
            ("asn", ("Newspapers", "Text")),
            ("as", ("Serial", "Text")),
            ("aa", ("Book", "Text")),
            ("a(?![mcs])", ("Serial", "Text")),
            ("[cd].*", ("Musical Score", "Text")),
            ("t.*", ("Manuscript", "Text")),
            ("[ef].*", ("Maps", "Image")),
            ("g.[st]", ("Photograph/Pictorial Works", "Image")),
            ("g.[cdfo]", ("Film/Video", "Moving Image")),
            ("g.*", (None, "Image")),
            ("k.*", ("Photograph/Pictorial Works", "Image")),
            ("i.*", ("Nonmusic", "Sound")),
            ("j.*", ("Music", "Sound")),
            ("r.*", (None, "Physical object")),
            ("p[cs].*", (None, "Collection")),
            ("m.*", (None, "Interactive Resource")),
            ("o.*", (None, "Collection"))
        ])
    }
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value
    """

    def iso1_to_iso3(s):
        # Strip any regional/script suffix ("en-US" -> "en") before lookup;
        # unknown codes pass through unchanged.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                    re.sub("[\.\[\]\(\)]", "", lang_string).lower().strip())
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches.
                    # match is initialized so the fallback below cannot hit
                    # a NameError when EXACT_LANGUAGE_NAME_REGEXES is empty.
                    match = None
                    for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break
                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items():
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            # Preserve first-seen order while dropping duplicate codes.
            seen = set()
            language = [{
                "iso639_3": code,
                "name": ISO639_3_SUBST[code]
            } for code in iso_codes if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data["_id"]))
            delprop(data, prop)

    return json.dumps(data)