def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """Service that accepts a JSON document and cleans the
    sourceResource/creator field by stripping any CLEANUP pattern that a
    creator value begins with (case-insensitive).

    Returns the (possibly modified) document serialized as JSON, or an
    error string with a 500 response code when the body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except ValueError:
        # json.loads raises ValueError on bad input; the previous bare
        # except also swallowed unrelated errors (even SystemExit).
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            # Strip each unwanted leading pattern, then any leftover
            # leading whitespace.
            for s in CLEANUP:
                item[i] = re.sub(r"(?i)^{0}".format(s), "",
                                 item[i].strip()).lstrip()
        # Preserve scalar shape for single-valued fields.
        setprop(data, prop, item[0] if len(item) == 1 else item)

    return json.dumps(data)
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """Set the "dataProvider" field for USC documents.

    The field becomes:
    1. The first value of the originalRecord/source field (placed in
       dataProvider by the oai-to-dpla module) for the chs set
       (setSpec p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    current_provider = getprop(data, "dataProvider", True)
    is_chs_set = getprop(data, "originalRecord/setSpec") == "p15799coll65"
    if is_chs_set:
        setprop(data, "dataProvider", current_provider[0])
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")

    return json.dumps(data)
def update_document(document, filepath, mime, status):
    """Record a downloaded thumbnail's filepath, MIME type and status
    on the document.

    Arguments:
        document object - document for updating (decoded by json module)
        filepath string - filepath to insert
    Returns:
        The same document, with the "object" (and, when given, the
        admin/object_status) fields updated.
    """
    if filepath:
        # Point object/@id at the thumbnail under the configured root URL.
        thumb = document["object"]
        thumb["@id"] = module_config().get('thumbs_root_url') + filepath
        thumb["format"] = mime
        document["object"] = thumb
    if mime:
        document["object"]["format"] = mime
    if status:
        setprop(document, "admin/object_status", status)
    return document
def mwdlenrichstatelocatedin(body, ctype,
                             action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """Enrich the "stateLocatedIn" field by mapping state codes to state
    names via STATE_CODES. For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        enriched = []
        for code in getprop(data, prop).split(";"):
            # Fall back to the raw value when it is not a known code.
            enriched.append(STATE_CODES.get(code) or code)
        setprop(data, prop, "; ".join(enriched))

    return json.dumps(data)
def enrich_temporal_date(body, ctype, prop="aggregatedCHO/temporal",
                         date_key="name"):
    """
    Service that accepts a JSON document and extracts the "created date" of
    the item, using the following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates, and sets the created date to the earliest date
    """
    try:
        data = json.loads(body)
    except:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    date_candidates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            # NOTE(review): each element of v is assumed to be a dict with
            # the display date stored under date_key -- confirm with callers.
            for s in v:
                a, b = parse_date_or_range(s[date_key])
                date_candidates.append({
                    "begin": a,
                    "end": b,
                    "displayDate": s[date_key]
                })
            # Candidates accumulate across props; every matching prop is
            # replaced with everything collected so far.
            if date_candidates:
                setprop(data, p, date_candidates)
    return json.dumps(data)
def decode_html(body, ctype, prop=None):
    """Decode HTML-encoded entities in the values of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to decode
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Map each HTML entity to the character it encodes. As previously
    # written every pair replaced a character with itself, making the
    # whole substitution loop a no-op.
    REGEX = ("&quot;", '"'), ("&amp;", "&"), ("&lt;", "<"), ("&gt;", ">")

    if prop and exists(data, prop):
        decoded = []
        v = getprop(data, prop)
        if not isinstance(v, list):
            v = [v]
        for s in v:
            if isinstance(s, basestring):
                for p, r in REGEX:
                    s = re.sub(p, r, s)
            decoded.append(s)
        setprop(data, prop, decoded)

    return json.dumps(data)
def setcontext(body, ctype, prop="@context"):
    """Set the "@context" field of a JSON document according to its
    ingestType (item vs. collection).
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if data["ingestType"] == "item":
        data.update({
            "@context": "http://dp.la/api/items/context",
            "aggregatedCHO": "#sourceResource",
            "@type": "ore:Aggregation"
        })
        setprop(data, "sourceResource/@id",
                "%s#sourceResource" % data["@id"])
    else:
        data.update({
            "@context": "http://dp.la/api/collections/context",
            "@type": "dcmitype:Collection"
        })

    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """Move any date values found in prop to the to_prop (temporal) field.

    A value is moved when exactly one date pattern matches it and nothing
    but that date remains once the match is removed. prop is deleted when
    every value was moved.
    """
    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Patterns that recognize a value as a date (ranges, single dates,
    # decades, centuries, "circa ...").
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
    ]

    def cleanup(s):
        """Strip parens, periods and question marks plus outer whitespace."""
        return re.sub(r"[\(\)\.\?]", "", s).strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # Normalize to a list up front. The previous code iterated a
        # listified copy but compared lengths and filtered against the raw
        # value, so a plain-string prop was filtered character by character.
        value_list = values if isinstance(values, list) else [values]
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in value_list:
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                # Move only when the cleaned value is exactly one match.
                # Escape the matched text: it is data, not a regex, and
                # unescaped metacharacters could alter or break the sub.
                if len(m) == 1 and not re.sub(re.escape(m[0]), "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Remember the original (non-cleaned) value for removal
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
        if len(value_list) == len(remove):
            delprop(data, prop)
        else:
            setprop(data, prop, [v for v in value_list if v not in remove])

    return json.dumps(data)
def mdlenrichlocation(body, ctype, action="mwdl_enrich_location",
                      prop="sourceResource/spatial"):
    """Enrich the "spatial" field of a JSON document.
    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        formatted = [format_spatial(s)
                     for s in iterify(getprop(data, prop))
                     if is_spatial(s)]
        if formatted:
            setprop(data, prop, formatted)
        else:
            # Nothing valid remained; drop the field entirely.
            delprop(data, prop)

    return json.dumps(data)
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''Remove duplicate values from each comma-separated prop.

    Values are compared after stripping whitespace, periods, parens and
    brackets and lowercasing; the first occurrence of each value is kept
    in its original form and order.
    '''
    if not prop:
        # Previously the parse happened only under "if prop", but the
        # final return referenced "data" unconditionally, raising
        # NameError when prop was None. Pass the document through instead.
        return body

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"
    # (A second, identical copy-pasted json.loads try-block was removed.)

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Remove whitespace, periods, parens, brackets
                clone = [re.sub("[ \.\(\)\[\]\{\}]", "", s).lower()
                         for s in v]
                # First-occurrence index of each unique value, sorted so
                # the original order is preserved deterministically.
                index = sorted(set(clone.index(s) for s in set(clone)))
                setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
def georgiasetspectype(body, ctype):
    """Set sourceResource/specType from the sourceResource/type field.
    For primary use with DLG documents.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "books": "Book",
        "government": "Government Document",
        "periodicals": "Serial"
    }

    # Renamed from "type", which shadowed the builtin.
    type_value = getprop(data, "sourceResource/type", True)
    if type_value:
        spec_type = []
        for s in iterify(type_value):
            for k, v in TYPE_TO_SPEC_TYPE.items():
                if k in s.lower() and v not in spec_type:
                    spec_type.append(v)
        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def uscenrichlocation(body, ctype, action="usc_enrich_location",
                      prop="sourceResource/spatial"):
    """Enrich the "spatial" field of a JSON document by:

    1. If one of the spatial values is a lat/lon coordinate, removing all
       other values
    2. Removing 1-3 digit numbers and values that contain "s.d"

    For primary use with USC documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        spatial = getprop(data, prop)
        coords = find_coordinates(spatial)
        if coords:
            # A coordinate pair trumps every other spatial value.
            spatial = [{"name": "%s, %s" % coords}]
        else:
            spatial = join_values(clean(spatial))
        setprop(data, prop, spatial)

    return json.dumps(data)
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    '''Convert language values into {"name": language} dictionaries.

    Works on the "language" field by default; override via the prop
    parameter.
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        langs = getprop(data, prop)
        if isinstance(langs, basestring):
            setprop(data, prop, {"name": langs})
        elif isinstance(langs, list):
            setprop(data, prop, [{"name": lang} for lang in langs])

    return json.dumps(data)
def capitalize(data, prop):
    """Capitalize the value at the given property path, in place.

    Unlike str.capitalize(), only the first character is uppercased and
    the rest of the string is left untouched.
    """
    def upper_first(s):
        """Return s with just its first character uppercased."""
        return s[0].upper() + s[1:] if s else s

    if not exists(data, prop):
        return
    v = getprop(data, prop, keyErrorAsNone=True)
    if not v:
        return
    if isinstance(v, basestring):
        setprop(data, prop, upper_first(v))
    elif isinstance(v, list):
        # Non-string elements pass through untouched.
        setprop(data, prop,
                [upper_first(s) if isinstance(s, basestring) else s
                 for s in v])
def check_date_format(data, prop): """Checks that the begin and end dates are in the proper format""" date = getprop(data, prop, True) if date: for d in iterify(date): for k, v in d.items(): if v and k != "displayDate": try: ymd = [int(s) for s in v.split("-")] except: err = "Invalid date.%s: non-integer in %s for %s" % \ (k, v, data.get("_id")) logger.error(err) setprop(d, k, None) continue year = ymd[0] month = ymd[1] if len(ymd) > 1 else 1 day = ymd[2] if len(ymd) > 2 else 1 try: datetime.datetime(year=year, month=month, day=day) except ValueError, e: logger.error("Invalid date.%s: %s for %s" % (k, e, data.get("_id"))) setprop(d, k, None)
def setspectype(body, ctype, prop="sourceResource/type"):
    """Set sourceResource/specType from the values of the prop field."""
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "book": "Book",
        "government": "Government Document",
        "periodical": "Serial",
        "nonmusic": "Nonmusic",
        "still image": "Photograph/Pictorial Works",
        "mixed material": "Mixed Material"
    }

    if exists(data, prop):
        spec_type = []
        for value in iterify(getprop(data, prop)):
            lowered = value.lower()
            for key, mapped in TYPE_TO_SPEC_TYPE.items():
                if key in lowered and mapped not in spec_type:
                    spec_type.append(mapped)
        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def enrichdate(body, ctype, action="enrich-format", prop="aggregatedCHO/date"):
    """
    Service that accepts a JSON document and extracts the "created date" of
    the item, using the following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates, and sets the created date to the earliest date
    """
    try:
        data = json.loads(body)
    except:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    date_candidates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            # Reset per property: candidates never carry across fields
            # (the initialization above only covers the no-match case).
            date_candidates = []
            for s in (v if not isinstance(v, basestring) else [v]):
                a, b = parse_date_or_range(s)
                date_candidates.append({
                    "begin": a,
                    "end": b,
                    "displayDate": s
                })
            # Earliest begin date first; a None begin sorts using the
            # default datetime string.
            date_candidates.sort(
                key=lambda d: d["begin"] if d["begin"] is not None
                else DEFAULT_DATETIME_STR)
            if date_candidates:
                setprop(data, p, date_candidates[0])
    return json.dumps(data)
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replace a substring in prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replace old with (may be empty to delete old)
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # "new is None" (rather than "not new") so that an empty replacement
    # string can be used to strip the old substring entirely; the old
    # truthiness test made deletion impossible.
    if not old or new is None:
        logger.error("No old or new parameters were provided")
    else:
        if exists(data, prop):
            v = getprop(data, prop)
            setprop(data, prop, v.replace(old, new))

    return json.dumps(data)
def set_prop(body, ctype, prop=None, value=None, condition_prop=None,
             condition_value=None):
    """Set the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to
    condition_prop -- (optional) the field that must exist to set the prop
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not value:
        logger.error("No value was supplied to set_prop.")
    elif not condition_prop or exists(data, condition_prop):
        # Without a condition_prop the prop is set unconditionally
        # (created when missing); otherwise only when condition_prop
        # exists in the document.
        setprop(data, prop, value)

    return json.dumps(data)
def digital_commonwealth_enrich_location(body, ctype,
        action="digital_commonwealth_enrich_location",
        prop="sourceResource/spatial"):
    """Massage the spatial field of a Digital Commonwealth JSON document."""
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Strings that appear in the spatial field and do get geocoded, but
    # are not actually locations.
    NON_SPATIALS = ["Aerial views.", "Church history.", "Dwellings",
                    "Dwellings.", "History", "Pictorial works"]

    if exists(data, prop):
        # The field is a plain list of strings; convert each into a
        # {"name": value} dictionary, skipping the non-location strings.
        spatials = [{"name": format_spatial(s)}
                    for s in iterify(getprop(data, prop))
                    if isinstance(s, basestring) and s not in NON_SPATIALS]
        setprop(data, prop, spatials)

    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """Clean the sourceResource/creator field by stripping any CLEANUP
    pattern that a creator value begins with (case-insensitive).
    """
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    if exists(data, prop):
        creators = getprop(data, prop)
        if not isinstance(creators, list):
            creators = [creators]
        for i, _ in enumerate(creators):
            # Strip each unwanted leading pattern and leftover whitespace.
            for pattern in CLEANUP:
                creators[i] = re.sub(r"(?i)^{0}".format(pattern), "",
                                     creators[i].strip()).lstrip()
        # Single values keep their scalar shape.
        setprop(data, prop,
                creators[0] if len(creators) == 1 else creators)

    return json.dumps(data)
def map_contributor(self):
    """Map the last contributor value to dataProvider; any remaining
    contributors stay in sourceResource.
    """
    prop = "contributor"
    if not exists(self.provider_data, prop):
        return
    contributors = iterify(self.provider_data.get(prop))
    setprop(self.mapped_data, "dataProvider", contributors[-1])
    if len(contributors) > 1:
        self.update_source_resource({"contributor": contributors[:-1]})
def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long
    coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    # Transformers read the geo property name from this module global.
    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # BPL records carry a "mods:" prefix on every key; drop it first so
    # the transformer property paths match.
    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from original document
    transformer_pipeline = {}
    transformer_pipeline.update(CHO_TRANSFORMER.get(provider, {}),
                                **CHO_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out["sourceResource"].update(transformer_pipeline[p](data, p))

    transformer_pipeline = {}
    transformer_pipeline.update(AGGREGATION_TRANSFORMER.get(provider, {}),
                                **AGGREGATION_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out.update(transformer_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL
    if provider == "BPL":
        try:
            ipo = getprop(out, "dataProvider") + ". " + \
                  getprop(out, "sourceResource/isPartOf")
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except:
            # Best effort: either field may be missing; leave isPartOf
            # unchanged in that case.
            pass

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
def format_spatial(spatial):
    """Normalize the "name" of a spatial dict using REGEX_REPLACEMENTS,
    updating the dict in place, and return it.
    """
    name = getprop(spatial, "name")
    for pattern, replacement in REGEX_REPLACEMENTS:
        if pattern.search(name):
            name = pattern.sub(replacement, name).strip()
            setprop(spatial, "name", name)
    return spatial
def map_provider(self, _dict, tag, codes):
    """Set provider to HathiTrust when the record carries both of the
    HT availability markers.
    """
    values = self._get_values(_dict, codes)
    if "HT" in values and "avail_ht" in values:
        setprop(self.mapped_data, "provider", {
            "@id": "http://dp.la/api/contributor/hathitrust",
            "name": "HathiTrust"
        })
def update_title(self):
    """Join each title's parts with spaces; drop the field entirely when
    no titles remain after filtering empties.
    """
    prop = "sourceResource/title"
    title_list = filter(None, getprop(self.mapped_data, prop))
    if not title_list:
        delprop(self.mapped_data, prop)
    else:
        setprop(self.mapped_data, prop,
                [" ".join(parts) for parts in title_list])
def map_data_provider(self, _dict, tag, codes):
    """Map each value's namespace prefix (text before the first ".") to a
    dataProvider name; unknown prefixes are skipped.
    """
    providers = []
    for value in self._get_values(_dict, codes):
        mapped = self.data_provider_mapping.get(value.split(".")[0])
        if mapped:
            providers.append(mapped)
    if providers:
        setprop(self.mapped_data, "dataProvider", providers)
def texas_enrich_location(body, ctype, action="texas_enrich_location", prop="sourceResource/spatial"): """ Service that accepts a JSON document and enriches the "spatial" field of that document. For use with the texas profile """ try: data = json.loads(body) except: response.code = 500 response.add_header("content-type", "text/plain") return "Unable to parse body as JSON" def _get_coordinates(value): lat, lon = None, None for v in value.split(";"): if "north=" in v: lat = v.split("=")[-1] elif "east=" in v: lon = v.split("=")[-1] if lat and lon: return (lat, lon) else: return () if exists(data, prop): spatial = [] values = getprop(data,prop) for v in values: sp = {"name": v} shredded = [s.strip() for s in v.split(" - ")] coordinates = _get_coordinates(sp["name"]) if coordinates: sp["name"] = "%s, %s" % coordinates if len(shredded) < 5: if not re.search("\d", sp["name"]): sp["country"] = shredded[0] if "country" in sp: if sp["country"] in ["United States", "Canada"]: try: sp["state"] = shredded[1] sp["county"] = shredded[2] sp["city"] = shredded[3] except Exception, e: logger.debug("Error enriching location %s: %s" % (data["_id"], e)) spatial.append(sp) logger.debug("SPATIAL: %s" % spatial) setprop(data, prop, spatial)
def extend_prop(self, prop, _dict, codes, label=None, values=None):
    """Extend the mapped prop with values extracted from _dict.

    values -- optional pre-extracted values; when omitted they are pulled
              from _dict via codes. An optional label is prepended before
              joining.
    """
    if values is None:
        values = self._get_values(_dict, codes)
    if values:
        if label:
            # Build a new list rather than insert(0, ...) so that a
            # caller-supplied list (or the _get_values result) is never
            # mutated as a side effect.
            values = [label] + list(values)
        prop_value = self._get_mapped_value(prop)
        prop_value.extend(self._join_values(prop, values))
        setprop(self.mapped_data, prop, prop_value)
def update_is_shown_at(self):
    """Derive isShownAt from the first "Hathi: <id>" identifier."""
    prop = "sourceResource/identifier"
    if not exists(self.mapped_data, prop):
        return
    for identifier in iterify(getprop(self.mapped_data, prop)):
        if identifier.startswith("Hathi: "):
            record_id = identifier.split("Hathi: ")[-1]
            setprop(self.mapped_data, "isShownAt",
                    "http://catalog.hathitrust.org/Record/%s" % record_id)
            # Only the first Hathi identifier is used.
            break
def artstor_cleanup(body, ctype):
    """Strip a leading "Repository:" label from the dataProvider value."""
    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    key = u"dataProvider"
    if exists(data, key):
        value = getprop(data, key)
        if isinstance(value, basestring):
            setprop(data, key, value.replace("Repository:", "").lstrip())

    return json.dumps(data)
def test_geocode_skip_united_states():
    """Should not add coordinates when name or country value is
    'United States' or 'États-Unis' or 'USA'
    """
    INPUT = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {"spatial": ""}
    }
    url = server() + "geocode"
    country_values = ["United States", "United States.", u"États-Unis",
                      u"États-Unis.", "USA"]
    for value in country_values:
        for field in ["name", "country"]:
            setprop(INPUT, "sourceResource/spatial", {field: value})
            resp, content = H.request(url, "POST", body=json.dumps(INPUT))
            assert resp.status == 200
            # No enriched place may carry coordinates for these values.
            for place in json.loads(content)['sourceResource']['spatial']:
                assert 'coordinates' not in place.keys()
def set_ucldc_dataprovider(body, ctype):
    '''For ucldc, we always have an originalRecord/collection entry whose
    repository object may or may not have a list of campuses.

    dataProvider becomes "<campus name>, <repo name>" when a campus
    exists, otherwise just the repository name. provider/@id is set to
    the collection @id and stateLocatedIn to California.
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    collection = getprop(data, 'originalRecord/collection')[0]
    repo = collection['repository'][0]
    # The docstring says the campus list "may or may not" exist, but the
    # old code indexed repo['campus'] unconditionally and raised KeyError
    # when it was absent; .get() honors the stated contract.
    campus_list = repo.get('campus')
    campus = campus_list[0] if campus_list else None
    if campus:
        dataProvider = ', '.join((campus['name'], repo['name']))
    else:
        dataProvider = repo['name']
    setprop(data, 'dataProvider', dataProvider)
    data['provider'] = {}
    setprop(data, 'provider/name', dataProvider)
    setprop(data, 'provider/@id', collection['@id'])
    data['sourceResource']['stateLocatedIn'] = [{'name': 'California'}]
    return json.dumps(data)
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection ckey
    field with the value passed in.

    mode:
      "overwrite" -- replace any existing value, creating the path when
                     missing;
      "append"    -- extend the existing value(s) with the new value(s),
                     always producing a list;
      otherwise   -- "fill blanks": set only when the field is missing or
                     empty, wrapping in a list when multivalue is True.

    Returns the data dict (also modified in place).
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode,
                                                          value, multivalue))
    if value:  # no value, don't bother
        if mode == 'overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                # Create the parent dict for a "parent/child" path.
                # NOTE(review): only one level of nesting is created here;
                # a deeper path would become a compound child key --
                # confirm callers only pass two-level paths.
                pp, pn = tuple(field.lstrip('/').split('/', 1))
                if not pp in data:
                    data[pp] = {}
                data[pp][pn] = value
        elif mode == 'append':
            # Flatten the existing value and the new value into one list.
            new_value = []
            if exists(data, field):
                old_value = getprop(data, field)
                if isinstance(old_value, list):
                    new_value.extend(old_value)
                else:
                    new_value.append(old_value)
            if isinstance(value, list):
                new_value.extend(value)
            else:
                new_value.append(value)
            setprop(data, field, new_value)
        else:  # fill blanks
            if not exists(data, field) or not getprop(data, field,
                                                      keyErrorAsNone=True):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data
def geocode_region(spatial):
    """Fill in coordinates/state/country for a known South Carolina
    region; the county key no longer applies and is dropped.
    """
    region_name = getprop(spatial, "name")
    setprop(spatial, "coordinates", "%s, %s" % REGIONS[region_name])
    delprop(spatial, "county")
    setprop(spatial, "state", "South Carolina")
    setprop(spatial, "country", "United States")
    return spatial
def artstor_spatial_to_dataprovider(body, ctype,
                                    prop="sourceResource/spatial"):
    """Split spatial on semicolon and copy the first value to
    dataProvider; the spatial prop itself is removed.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        if isinstance(value, list):
            value = value[0]
        if isinstance(value, basestring):
            setprop(data, "dataProvider", value.split(";")[0])
            delprop(data, prop)

    return json.dumps(data)
def enrichlocation(body, ctype, action="enrich_location",
                   prop="sourceResource/spatial"):
    """Normalize the "spatial" field of a JSON document.

    Every spatial entry is coerced to a dictionary (plain strings become
    {"name": value}) and whitespace around semicolons is removed. When
    any value still contains semicolons, the entries are expanded into
    multiple dictionaries via the create_dictionaries function.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        entries = iterify(getprop(data, prop))
        for i, entry in enumerate(entries):
            if isinstance(entry, dict):
                for key in entry.keys():
                    entry[key] = remove_space_around_semicolons(entry[key])
            else:
                entries[i] = {"name": remove_space_around_semicolons(entry)}

        # Semicolons signal multiple values packed into a single entry,
        # which must be expanded into multiple dictionaries.
        has_semicolons = None
        for entry in entries:
            for key in entry.keys():
                if entry[key] and ';' in entry[key]:
                    has_semicolons = True
                    break

        setprop(data, prop,
                create_dictionaries(entries) if has_semicolons else entries)

    return json.dumps(data)
def bhlcontributortocollection(body, ctype,
        contributor_field="sourceResource/contributor"):
    """Copy the BHL contributor value into the collection field, using an
    acronym of the contributor for the collection @id.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, contributor_field):
        contributor = getprop(data, contributor_field)
        # First letter of each whitespace-separated word.
        acronym = "".join(word[0] for word in contributor.split())
        setprop(data, "sourceResource/collection/@id",
                "http://dp.la/api/collections/bhl--" + acronym)
        setprop(data, "sourceResource/collection/name", contributor)

    return json.dumps(data)
def mdlstatelocatedin(body, ctype):
    """Extract the state from the address in the first dataProvider value
    and store it in sourceResource/stateLocatedIn.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    prop = "dataProvider"
    if exists(data, prop):
        address = iterify(getprop(data, prop))[0]
        for abbreviation, state in states.items():
            # Match either the abbreviation or the full state name,
            # surrounded by whitespace.
            found = (re.search("\s+%s\s+" % abbreviation, address) or
                     re.search("\s+%s\s+" % state, address))
            if found:
                setprop(data, "sourceResource/stateLocatedIn", state)
                break

    return json.dumps(data)
def nara_enrich_location(body, ctype, action="nara_enrich_location",
                         prop="sourceResource/spatial"):
    """Massage the spatial field of a NARA JSON document."""
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        # Run every spatial dictionary through the formatter.
        setprop(data, prop,
                [format_spatial(s) for s in iterify(getprop(data, prop))])

    return json.dumps(data)
def remove_list_values(body, ctype, prop=None, values=None):
    """Given a comma-separated string of values, remove every instance of
    each value from the prop; the prop is deleted when nothing remains.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    current = getprop(data, prop, True)
    if isinstance(current, list) and values is not None:
        unwanted = values.split(",")
        remaining = [s for s in current if s not in unwanted]
        if remaining:
            setprop(data, prop, remaining)
        else:
            delprop(data, prop)

    return json.dumps(data)
def jsonfy_prop(body, ctype, prop=None):
    """Parse JSON packed inside string values (UCSD data).

    Any sub-value under prop (or the whole document when prop is None)
    that json.loads can read is turned into a real JSON object.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    target = getprop(data, prop, True) if prop else data
    jsonfied = jsonfy_obj(target)
    if prop:
        setprop(data, prop, jsonfied)
    else:
        data = jsonfied

    return json.dumps(data)
def filter_path(_dict, path):
    """
    Repeatedly runs cleaner function until all empty values are removed
    from given path (hash stops changing).

    Arguments:
        _dict - dictionary to clean;
        path - a xpath-like path to the value, that must be checked

    Returns:
        cleaned dictionary
    """
    # Work on a copy so the caller's dictionary is never mutated.
    d = copy.deepcopy(_dict)
    # Split "a/b/c" into the containing path ("a/b") and the value
    # key ("c"). With no delimiter, rpartition yields ("", "", path).
    embracing_path, sep, value_key = path.rpartition(PATH_DELIM)
    try:
        dict_to_clean = getprop(d, embracing_path)
    except KeyError:
        logger.warning("Attempt to clean non existent path \"%s\"",
                       embracing_path)
        # Nothing to clean; hand back the original untouched dict.
        return _dict
    else:
        if value_key:
            cleaned_dict = filter_dict(dict_to_clean, filter_fields,
                                       value_key)
            setprop(d, embracing_path, cleaned_dict)
            return d
        else:
            # Path ended with the delimiter, so there is no value key.
            # NOTE(review): this branch returns the filtered sub-dict,
            # not the full copied document like the branch above --
            # confirm that asymmetry is intended.
            return filter_dict(dict_to_clean, filter_fields,
                               embracing_path)
def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the
    value of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the
    delim property is used only for the fields to be shredded/unshredded).
    This requires that the fields share a common delimiter however.
    """
    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    def mismatch_parens(s):
        # True when the parens in s are unbalanced, i.e. the delimiter
        # was split inside a parenthesized group.
        return s.count("(") != s.count(")")

    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            if action == "shred":
                # Lists are first flattened into one delimited string.
                if isinstance(v, list):
                    try:
                        v = delim.join(v)
                    except Exception as e:
                        logger.error("Can't join on delim. ID: %s\n%s" %
                                     (data["_id"], str(e)))
                if delim in v:
                    setprop(data, p, v)
                else:
                    # Nothing to shred for this prop.
                    continue
                shredded = [""]
                for s in re.split(re.escape(delim), v):
                    if mismatch_parens(shredded[-1]):
                        # Previous piece has an open paren: rejoin the
                        # delimiter-split fragment onto it.
                        shredded[-1] += "%s%s" % (delim, s)
                    else:
                        shredded.append(s)
                shredded = [i.strip() for i in shredded if i.strip()]
                if not keepdup:
                    # Drop duplicates, keeping first occurrences in order.
                    result = []
                    for s in shredded:
                        if s not in result:
                            result.append(s)
                    shredded = result
                setprop(data, p, shredded)
            elif action == "unshred":
                if isinstance(v, list):
                    setprop(data, p, delim.join(v))
    return json.dumps(data)
def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
        data Dict - Data for conversion.
        prop Str - Properties dividided with comma.
        earliest Bool - True - the function will set only the earliest date.
                        False - the function will set all dates.

    Returns:
        Nothing, the replacement is done in place.
    """
    for p in prop.split(','):
        dates = []
        if exists(data, p):
            v = getprop(data, p)

            if isinstance(v, list):
                # fix for duplicate values in list
                v = list(OrderedDict.fromkeys(v))

            if not isinstance(v, dict) and len(v):
                if is_year_range_list(v):
                    # A list of bare years becomes a single range.
                    dates.append({
                        "begin": v[0],
                        "end": v[-1],
                        "displayDate": "%s-%s" % (v[0], v[-1])
                    })
                else:
                    for s in (v if not isinstance(v, basestring) else [v]):
                        for part in s.split(";"):
                            display_date = \
                                remove_single_brackets_and_strip(part)
                            stripped = clean_date(
                                remove_all_brackets_and_strip(part))
                            # Stripping bogus -00-00 data
                            if stripped[-6:] == "-00-00":
                                stripped = stripped[:-6]
                                display_date = stripped
                            # Too short to be even a year.
                            if len(stripped) < 4:
                                continue
                            a, b = parse_date_or_range(stripped)
                            # Keep only parses that produced a real end.
                            if b != DEFAULT_DATETIME_STR:
                                dates.append({
                                    "begin": a,
                                    "end": b,
                                    "displayDate": display_date
                                })
            else:
                # Already filled in, probably by mapper
                continue

            # Earliest begin first; None sorts via the default datetime.
            dates.sort(
                key=lambda d: d["begin"] if d["begin"] is not None
                else DEFAULT_DATETIME_STR
            )

            if dates:
                # NOTE(review): the "earliest" parameter is currently
                # ignored -- the disabled code below used it to pick only
                # dates[0]. Confirm whether that is intentional.
                ### if earliest:
                ###     value_to_set = dates[0]
                ### else:
                ###     value_to_set = dates
                ### setprop(data, p, value_to_set)
                setprop(data, p, dates)
            else:
                delprop(data, p)
def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the
    delim property is used only for the fields to be shredded/unshredded).
    This requires that the fields share a common delimiter however.

    The 'shred' action splits values by delimeter. It handles some complex
    edge cases beyond what split() expects. For example:
      ["a,b,c", "d,e,f"] -> ["a","b","c","d","e","f"]
      'a,b(,c)' -> ['a', 'b(,c)']
    Duplicate values are removed unless keepdup evaluates true.

    The 'unshred' action joins a list of values with delim.

    See:
    https://issues.dp.la/issues/2940
    https://issues.dp.la/issues/4251
    https://issues.dp.la/issues/4266
    https://issues.dp.la/issues/4578
    https://issues.dp.la/issues/4600
    """
    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    def index_for_first_open_paren(values):
        """
        Accepts a list of values. Returns the index of the first value
        containing an unmatched opening paren, or None if there is none.
        """
        for v in values:
            if v.count("(") > v.count(")"):
                return values.index(v)
        return None

    def index_for_matching_close_paren(values):
        """
        Accepts a list of values. Returns the index of the first value
        containing an unmatched closing paren, or None if there is none.
        """
        # The candidate index is remembered and only returned once a later
        # unmatched *open* paren (or the end of the list) is seen.
        index = None
        for v in values:
            if index is not None and v.count("(") > v.count(")"):
                return index
            elif v.count(")") > v.count("("):
                index = values.index(v)
        return index

    def rejoin_partials(values, delim):
        """
        Accepts a list of values which have been split by delim. Searches
        for values that have been separated

        For example, this value:
            'my (somewhat contrived; value) with a delimeter enclosed in
            parens'
        would be split into:
            ['my (somewhat contrived',
             'value) with a delimeter enclosed in parens']

        This method rejoins it.
        """
        index1 = index_for_first_open_paren(values)
        index2 = index_for_matching_close_paren(values)
        if index1 is not None and index2 is not None:
            if index1 == 0 and index2 == len(values) - 1:
                # The parenthesized group spans the whole list
                return [delim.join(values)]
            elif index1 == 0:
                # Group is a prefix of the list
                values = [delim.join(values[:index2 + 1])
                          ] + values[index2 + 1:]
            elif index2 == len(values) - 1:
                # Group is a suffix of the list
                values = values[:index1] + [delim.join(values[index1:])]
            else:
                # Group is in the middle of the list
                values = values[:index1] + [
                    delim.join(values[index1:index2 + 1])
                ] + values[index2 + 1:]
            # Recurse in case more than one group was split apart
            return rejoin_partials(values, delim)
        else:
            return values

    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            if action == "shred":
                if isinstance(v, list):
                    # Drop empty/None items before joining
                    v = filter(None, v)
                    try:
                        v = delim.join(v)
                        # Collapse doubled delimiters produced by the join
                        v = v.replace("%s%s" % (delim, delim), delim)
                    except Exception as e:
                        # Non-string list items cannot be joined; v stays a
                        # list in that case.
                        logger.warn("Can't join list %s on delim for %s, %s" %
                                    (v, data["_id"], e))
                if delim in v:
                    setprop(data, p, v)
                else:
                    # No delimiter present: nothing to shred for this prop
                    continue
                shredded = [""]
                for s in re.split(re.escape(delim), v):
                    shredded.append(s)
                # Re-attach pieces whose parentheses were split apart
                shredded = rejoin_partials(shredded, delim)
                # Strip whitespace and drop empty pieces (including the
                # seed empty string above)
                shredded = [i.strip() for i in shredded if i.strip()]
                if not keepdup:
                    # Remove duplicates while preserving order
                    result = []
                    for s in shredded:
                        if s not in result:
                            result.append(s)
                    shredded = result
                setprop(data, p, shredded)
            elif action == "unshred":
                if isinstance(v, list):
                    setprop(data, p, delim.join(v))

    return json.dumps(data)
def copyprop(body, ctype, prop=None, to_prop=None, create=False, key=None,
             remove=None, no_replace=None, no_overwrite=None):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove -- removes prop if True (default False)
    no_replace -- creates list of to_prop string and appends prop if True
    no_overwrite -- skips the copy entirely if to_prop already exists
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        # Destination already set and caller asked us not to clobber it
        pass
    else:
        if exists(data, prop) and create and not exists(data, to_prop):
            # Seed to_prop so the copy below has a destination to land in
            val = {} if key else ""
            setprop(data, to_prop, val)

        if exists(data, prop) and exists(data, to_prop):
            val = getprop(data, prop)
            to_element = getprop(data, to_prop)

            if isinstance(to_element, basestring):
                if no_replace:
                    el = [to_element] if to_element else []
                    el.append(val)
                    # Flatten one level of nesting so the result is a flat
                    # list of strings
                    val = [e for s in el for e in
                           (s if not isinstance(s, basestring) else [s])]
                setprop(data, to_prop, val)
            else:
                # If key is set, assume to_element is dict or list of dicts
                if key:
                    if not isinstance(to_element, list):
                        to_element = [to_element]
                    # Renamed from "dict", which shadowed the builtin
                    for to_dict in to_element:
                        if exists(to_dict, key) or create:
                            setprop(to_dict, key, val)
                        else:
                            msg = "Key %s does not exist in %s" % (key,
                                                                   to_prop)
                            logger.debug(msg)
                else:
                    # Handle case where to_element is a list
                    if isinstance(to_element, list):
                        if isinstance(val, list):
                            to_element = to_element + val
                        else:
                            to_element.append(val)
                        setprop(data, to_prop, to_element)
                    else:
                        # to_prop is dictionary but no key was passed.
                        msg = ("%s is a dictionary but no key was passed"
                               % to_prop)
                        logger.warn(msg)
                        setprop(data, to_prop, val)

            if remove:
                delprop(data, prop)

    return json.dumps(data)
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value

    The property is replaced with a list of {"iso639_3": ..., "name": ...}
    dicts, or deleted when no code could be identified.
    """
    def iso1_to_iso3(s):
        # Drop any locale/region suffix (e.g. "en-US" -> "en") before the
        # lookup; unknown codes pass through unchanged.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v
        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                    re.sub("[\.\[\]\(\)]", "", lang_string).lower().strip())
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches.
                    # Initialize match so the fall-through branch below
                    # cannot raise NameError when the regex dict is empty
                    # (previously `match` was only bound inside the loop).
                    match = None
                    for iso_code, regex in \
                            EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break
                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in \
                                WB_LANGUAGE_NAME_REGEXES.items():
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            # Build the final value, deduplicating while preserving order
            seen = set()
            language = [{"iso639_3": code,
                         "name": ISO639_3_SUBST[code]}
                        for code in iso_codes
                        if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s"
                           % (language_strings, data["_id"]))
            delprop(data, prop)

    return json.dumps(data)
def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg ->
       image/jpeg)
    c) Checking to see if the field is a valid IMT, see
       http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and its
       format is not set
    f) Setting type field from format field, if it is not set. The format
       field is taken if it is a string, or the first element if it is a
       list. It is then split and the first part of IMT is taken.

    By default works on the 'sourceResource/format' field but can be
    overridden by passing the name of the field to use as the 'prop'
    parameter.
    """
    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    # (pattern, replacement) pairs applied in order by cleanup()
    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')

    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']
    # Compile once here instead of on every is_imt() call
    IMT_REGEXES = [re.compile('^' + x + '(/)') for x in IMT_TYPES]

    def get_ext(s):
        # Return the bare file extension of a URL/path, or "" if none
        ext = os.path.splitext(s)[1].split('.')
        return ext[1] if len(ext) == 2 else ""

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        # Drop any trailing text after the media type token
        s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        logger.debug("Checking: " + s)
        return any(regex.match(s) for regex in IMT_REGEXES)

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    imt_values = []
    if exists(data, prop):
        v = getprop(data, prop)
        # Renamed from "format"/"type", which shadowed builtins
        formats = []
        hasview_format = []
        for s in (v if not isinstance(v, basestring) else [v]):
            if s.startswith("http") and is_absolute(s):
                # A URL: reduce it to its file extension before cleanup
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                   exists(data, "hasView/format") and \
                   cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            else:
                # Retain non-IMT values in sourceResource/format, non-cleaned
                if s not in formats:
                    formats.append(s)

        if formats:
            # Single values are unwrapped from the list
            if len(formats) == 1:
                formats = formats[0]
            setprop(data, prop, formats)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        types = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in types:
                types.append(t)
        if types:
            if len(types) == 1:
                types = types[0]
            setprop(data, type_field, types)

    return json.dumps(data)
def all_transform(d, p):
    """Transform a MARC record dict `d` (datafields under property path `p`)
    into a partial DPLA document with a populated "sourceResource".

    Reads the module-global PROVIDER to apply provider-specific handling
    (Hathi identifiers/isShownAt, UIUC dataProvider). Returns the new
    document dict; empty sourceResource fields are removed before return.
    """
    global PROVIDER
    logger.debug("TRANSFORMING %s" % d["_id"])

    # For spec_type use
    control_008_28 = None
    datafield_086_or_087 = None

    # Skeleton result; "title" has three slots so that tags 245/242/240 can
    # be placed in priority order (index 0 is highest priority)
    data = {
        "sourceResource": {
            "identifier": [],
            "contributor": [],
            "creator": [],
            "date": [],
            "description": [],
            "extent": [],
            "language": [],
            "spatial": [],
            "publisher": [],
            "isPartOf": [],
            "rights": [],
            "stateLocatedIn": [],
            "subject": [],
            "temporal": [],
            "title": [None, None, None],
            "format": [],
            "type": [],
            "specType": []
        }
    }

    # Mapping dictionaries for use with datafield:
    # Keys are used to check if there is a tag match. If so, the value
    # provides a list of (property, code) tuples. In the case where certain
    # tags have prominence over others, the tuples will be of the form
    # (property, index, code). To exclude a code, prefix it with a "!":
    # [("format", "!cd")] will exclude the "c" and "d" codes (see def
    # _get_values).
    data_map = {
        lambda t: t == "856": [("isShownAt", "u")],
        lambda t: t == "973": [("provider", "ab")],
        lambda t: t == "974": [("dataProvider", "u")],
        lambda t: t == "852": [("dataProvider", "a")]
    }
    source_resource_map = {
        lambda t: t in ("020", "022", "035"): [("identifier", "a")],
        lambda t: t == "050": [("identifier", "ab")],
        lambda t: t in ("100", "110", "111"): [("creator", None)],
        lambda t: t == "041": [("language", "a")],
        lambda t: t == "260": [("date", "c"), ("publisher", "ab")],
        lambda t: t == "270": [("stateLocatedIn", "c")],
        lambda t: t == "300": [("extent", "ac")],
        lambda t: t in ("337", "338"): [("format", "a")],
        lambda t: t == "340": [("format", "a"), ("extent", "b")],
        lambda t: t.startswith("5"): [("description", "a")],
        lambda t: t in ("506", "540"): [("rights", None)],
        lambda t: t == "648": [("temporal", None)],
        lambda t: t in ("700", "710", "711", "720"): [("contributor", None)],
        #lambda t: t == "662": [("sourceResource/spatial", None)],
        lambda t: t == "240": [("title", 2, None)],
        lambda t: t == "242": [("title", 1, None)],
        lambda t: t == "245": [("title", 0, "!c")],
        lambda t: t == "970": [("type", "a")],
        lambda t: t == "651": [("spatial", "a")],
        lambda t: int(t) in set([600, 650, 651] + range(610, 620) +
                                range(653, 659) + range(690, 700)):
            [("subject", None), ("format", "v"), ("temporal", "y"),
             ("spatial", "z")],
        lambda t: (760 <= int(t) <= 787): [("isPartOf", None)],
    }

    # Handle datafield
    for item in _as_list(getprop(d, p)):
        for _dict in _as_list(item):
            tag = _dict.get("tag", None)
            # Skip cases where there is no tag or where tag == "ERR"
            try:
                int(tag)
            except:
                continue
            # Handle data_map matches
            for match, tuples in data_map.iteritems():
                if match(tag):
                    for tup in tuples:
                        prop, codes = tup
                        values = _get_values(_dict, codes)
                        if prop == "provider":
                            data.update(provider_transform(values))
                        elif prop == "dataProvider":
                            # Provider-specific dataProvider handling
                            if tag == "974" and PROVIDER == "hathitrust":
                                dp = dataprovider_transform_hathi(values)
                                data.update(dp)
                            elif tag == "852" and PROVIDER == "uiuc":
                                if values:
                                    data["dataProvider"] = values[0]
                        else:
                            # e.g. isShownAt: first value wins
                            if values:
                                data[prop] = values[0]
            # Handle source_resource_map matches
            for match, tuples in source_resource_map.iteritems():
                if match(tag):
                    for tup in tuples:
                        if len(tup) == 2:
                            prop, codes = tup
                            if prop == "contributor":
                                # Handle values for contributor
                                values = _get_contributor_values(_dict, codes)
                            elif prop == "subject":
                                # Handle values for subject
                                values = _get_subject_values(_dict, tag)
                            elif prop == "spatial":
                                # Handle values for spatial
                                values = _get_spatial_values(_dict, tag,
                                                             codes)
                            else:
                                # Handle values for all other sourceResource
                                # fields
                                values = _get_values(_dict, codes)
                            if prop == "identifier":
                                # Handle identifier labeling
                                label = None
                                if tag == "020":
                                    label = "ISBN:"
                                elif tag == "022":
                                    label = "ISSN:"
                                elif tag == "050":
                                    label = "LC call number:"
                                if label:
                                    # Insert label as first value item as
                                    # values will be joined
                                    values.insert(0, label)
                            values = _join_sourceresource_values(prop, values)
                            if prop == "type":
                                data["sourceResource"].update(
                                    datafield_type_transform(values)
                                )
                            else:
                                data["sourceResource"][prop].extend(values)
                        elif len(tup) == 3:
                            # Prominence-ordered properties (title): write
                            # into the fixed slot for this tag
                            prop, index, codes = tup
                            values = _get_values(_dict, codes)
                            data["sourceResource"][prop][index] = values
            if tag == "662":
                # Test: Log document with 662 (spatial)
                logger.debug("Document has 662: %s" % d["_id"])
            elif tag == "086" or tag == "087":
                datafield_086_or_087 = True

    # Handle sourceResource/title: drop unset slots, join each slot's
    # subfield values into a single string
    title = filter(None, data["sourceResource"]["title"])
    if title:
        for i in range(len(title)):
            title[i] = " ".join(title[i])
        data["sourceResource"]["title"] = title
    else:
        del data["sourceResource"]["title"]

    # Handle controlfield: values from here are needed to update
    # sourceResource/identifier, sourceResource/language, and
    # sourceResource/format
    format_char_control = None
    format_char_leader = None
    for item in _as_list(getprop(d, "controlfield")):
        if "#text" in item and "tag" in item:
            # Map tag 001 only for Hathi
            if item["tag"] == "001" and PROVIDER == "hathitrust":
                value = "Hathi: " + item["#text"]
                data["sourceResource"]["identifier"].append(value)
            if item["tag"] == "007":
                # For format use
                format_char_control = item["#text"][0]
            if item["tag"] == "008":
                if len(item["#text"]) > 28:
                    # For spec_type use
                    control_008_28 = item["#text"][28]
                if len(item["#text"]) > 37:
                    # Positions 35-37 carry the language code
                    data["sourceResource"]["language"].append(
                        item["#text"][35:38]
                    )

    leader = getprop(d, "leader")
    if len(leader) > 6:
        format_char_leader = leader[6]

    format_values = format_transform(format_char_control, format_char_leader)
    data["sourceResource"]["format"].extend(format_values)

    # Split language: each language string may pack several 3-letter codes
    language = []
    for lang_str in data["sourceResource"]["language"]:
        language.extend([lang_str[i:i+3] for i in
                         range(0, len(lang_str), 3)])
    data["sourceResource"]["language"] = language

    # Add "Government Document" to spec_type if applicable
    gov_spec_type = get_gov_spec_type(control_008_28, datafield_086_or_087)
    if gov_spec_type:
        data["sourceResource"]["specType"].append(gov_spec_type)

    # Remove empty sourceResource values
    del_keys = [key for key in data["sourceResource"] if
                not data["sourceResource"][key]]
    for key in del_keys:
        del data["sourceResource"][key]

    # Handle Hathi isShownAt: derive the catalog URL from the Hathi-prefixed
    # identifier added above
    is_shown_at = None
    for id in _as_list(getprop(data, "sourceResource/identifier")):
        if id.startswith("Hathi: "):
            id = id.split("Hathi: ")[-1]
            is_shown_at = "http://catalog.hathitrust.org/Record/%s" % id
            break
    if is_shown_at:
        setprop(data, "isShownAt", is_shown_at)

    return data
def enrichlocation(body, ctype, action="enrich_location",
                   prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document by iterating through the spatial fields and mapping to the
    state and iso3166-2, if not already mapped, through the get_isostate
    function. This function takes the optional parameter abbrev, and if it is
    set it will search the fields for State name abbreviations. If a previous
    provider-specific location enrichment module ran, the default is to not
    search those fields for State name abbreviations, but only for full State
    names.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        # If prior provider-specific location enrichment occurred,
        # v[0] will be a dictionary. The truthiness check guards against an
        # empty list, which previously raised IndexError on v[0].
        if v and isinstance(v[0], dict):
            for k in v[0].keys():
                v[0][k] = remove_space_around_semicolons(v[0][k])
            # Disabled isostate mapping kept verbatim for reference:
            """
            if 'state' in v[0]:
                # Handle case where a previous provider-specific location
                # enrichment set the state field
                isostate = get_isostate(v[0]['state'])
                # It may be the case that the 'state' field does not contain
                # a State name
                if isostate[0]:
                    v[0]['iso3166-2'] = isostate[0]
                    v[0]['state'] = isostate[1]
                else:
                    # We may want to keep whatever non-State value was placed
                    # in state
                    v[0]['name'] = v[0]['state']
                    # Remove bogus state
                    del v[0]['state']
            else:
                # Handle case where a previous provider-specific location
                # enrichment did not set the state field
                for val in v[0].values():
                    isostate = get_isostate(val)
                    if isostate[0]:
                        v[0]['iso3166-2'] = isostate[0]
                        v[0]['state'] = isostate[1]
                        break
            """
        else:
            # Handle the case where no previous provider-specific location
            # enrichment occured. Convert spatial from list of strings to
            # dictionary.
            sp = []
            for s in (v if not isinstance(v, basestring) else [v]):
                d = {}
                d['name'] = remove_space_around_semicolons(s)
                """
                isostate = get_isostate(d['name'], abbrev="Yes")
                if isostate[0]:
                    d['iso3166-2'] = isostate[0]
                    d['state'] = isostate[1]
                """
                sp.append(d)
            v = sp

        # If any of the spatial fields contain semi-colons, we need to create
        # multiple dictionaries.
        semicolons = None
        for d in v:
            for k in d.keys():
                if d[k] and ';' in d[k]:
                    semicolons = True
                    break

        setprop(data, prop, (create_dictionaries(v) if semicolons else v))

    return json.dumps(data)
def map_intermediate_provider(self):
    """Copy a non-empty "source" value from the provider data into the
    mapped data's "intermediateProvider" field, if one is present.
    """
    source_field = "source"
    if not exists(self.provider_data, source_field):
        return
    intermediate = getprop(self.provider_data, source_field)
    if intermediate:
        setprop(self.mapped_data, "intermediateProvider", intermediate)
def enrichtype(body, ctype, action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format"):
    """
    Service that accepts a JSON document and enriches the "type" field of
    that document by:

    a) making the type lowercase
    b) converting "image" to "still image" (TODO: Amy to confirm that this
       is ok)
       NOTE(review): the REGEXPS below actually do the reverse -- they map
       "still image" to "image" -- so the docstring and the code disagree;
       confirm which is intended.
    c) applying a set of regexps to do data cleanup (remove plural forms)
    d) moving all items that are not standard DC types to the
       sourceResource/format
       (http://dublincore.org/documents/resource-typelist/)

    By default works on the 'type' field, but can be overridden by passing
    the name of the field to use as a parameter
    """
    # (pattern, replacement) pairs applied in order by cleanup()
    REGEXPS = ('images','image'), ('still image','image'),\
              ('textual records', 'text'),\
              ('photographs and other graphic materials', 'image'),\
              ('texts', 'text')
    DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
                'interactive resource', 'moving image', 'physical object',
                'service', 'software', 'sound', 'text']

    def cleanup(s):
        # Lowercase, trim, then normalize via the substitution table above
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        dctype = []
        # Non-DC values get appended to the existing format field values
        f = getprop(data, format_field) if exists(data, format_field) else []
        if not isinstance(f, list):
            f = [f]
        for s in (v if not isinstance(v, basestring) else [v]):
            if is_dc_type(cleanup(s)):
                dctype.append(cleanup(s))
            else:
                # Not a DC type: move the original (uncleaned) value to
                # the format field
                f.append(s)

        if dctype:
            # Single values are unwrapped from the list
            if len(dctype) == 1:
                dctype = dctype[0]
            setprop(data, prop, dctype)
        else:
            delprop(data, prop)
        if len(f) > 1:
            setprop(data, format_field, f)
        elif len(f) == 1:
            setprop(data, format_field, f[0])

    return json.dumps(data)
def update_ingestion_doc(self, ingestion_doc, **kwargs):
    """Set each keyword argument as a property on the ingestion document,
    then persist the document via the dashboard database.
    """
    for field_name in kwargs:
        setprop(ingestion_doc, field_name, kwargs[field_name])
    self.dashboard_db.save(ingestion_doc)
def cleanup_language(body, ctype, action="cleanup_language",
                     prop="sourceResource/language"):
    """
    Service that accepts a JSON document and cleans each value of the
    language field of that document by:

    a) stripping periods, brackets and parentheses
    b) converting from ISO 639-1 to ISO 639-3
    c) looking for matches in the value using LANGUAGE_NAME_REGEXES

    The property is replaced with the cleaned list, or deleted when nothing
    could be matched.
    """
    def iso1_to_iso3(s):
        # Drop any locale/region suffix (e.g. "en-US" -> "en") before the
        # lookup; unknown codes pass through unchanged.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        v = [v] if not isinstance(v, list) else v
        languages = []
        for s in v:
            if s not in languages and s in ISO639_3_SUBST:
                # Raw value is already a known ISO 639-3 code
                languages.append(s)
            else:
                s = re.sub("[\.\[\]]", "", s).lower().strip()
                iso = re.sub("[\(\)]", "", s)
                # First convert iso1 to iso3
                iso = iso1_to_iso3(iso)
                if iso in ISO639_3_SUBST and iso not in languages:
                    languages.append(iso)
                else:
                    for n in iso.split(" "):
                        # Since we split on whitespace, we only want to check
                        # against single word reference names so we use
                        # ISO639_3_1
                        n = n.title()
                        if n in ISO639_3_1.values() and n not in languages:
                            languages.append(n)
                # Use s (with parentheses intact)
                match = [r.search(s).group() for r in LANGUAGE_NAME_REGEXES
                         if r.search(s)]
                if match:
                    languages += list(set([m.strip().title() for m in match])
                                      - set(languages))

        if languages:
            # Remove duplicates: skip entries whose mapped reference name is
            # also present in the list. (Previously a list comprehension was
            # used purely for its append side effects.)
            lang = []
            for l in languages:
                if ISO639_3_SUBST.get(l, None) not in languages:
                    lang.append(l)
            setprop(data, prop, filter(None, lang))
        else:
            delprop(data, prop)

    return json.dumps(data)
def geocode(body, ctype, prop="sourceResource/spatial", newprop='coordinates'):
    '''
    Adds geocode data to the record coming as follows:

    1. If the coordinates property does not exist, attempt to extract it
       from name.
    2. Run GeoNames enrichment, reverse encoding coordinate values to
       identify parent features, or (if none exist) searching for name
       values. Put parent features in appropriate state/country values.
    3. If we still haven't identified the place, use Bing to get lat/long
       values. If one is found, pass the coordinates through Geonames again
       to identify parent features.
    4. Add any non-existing features to the spatial dictionary.
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if (not exists(data, prop)):
        # Nothing to geocode
        pass
    else:
        logger.debug("Geocoding %s" % data["_id"])
        value = getprop(data, prop)
        places = []
        for v in iterify(value):
            bing_geocode = True
            if not isinstance(v, dict):
                logger.error("Spatial value must be a dictionary; record %s" %
                             data["_id"])
                continue

            place = Place(v)
            if place.name:
                # Step 1: try to pull explicit coordinates out of the name
                coords = get_coordinates(place.name)
                if coords:
                    place.coordinates = coords
                    place.name = None
                    place.set_name()

            # Run Geonames enrichment to do initial search
            place.enrich_geodata(DplaGeonamesGeocoder())

            # Don't enrich with geodata if place is 'United States'
            pattern = ur" *(United States(?!-)|États-Unis|USA)"
            if (place.name and re.search(pattern, place.name)):
                bing_geocode = False

            if bing_geocode:
                # Attempt to find this item's lat/lng coordinates
                if not place.coordinates:
                    api_key = module_config().get("bing_api_key")
                    place.enrich_geodata(DplaBingGeocoder(api_key=api_key))
                    # rerun geonames enrichment with new coordinates
                    place.enrich_geodata(DplaGeonamesGeocoder())

            if not place.validate():
                # Last resort: synthesize a name from whatever fields the
                # place has; log when even that fails
                if not place.set_name():
                    logger.error("Spatial dictionary must have a " +
                                 "'name' property. Could not enhance input " +
                                 "data to include a name property; " +
                                 "record %s" % data["_id"])
            places.append(place)

        # Merge related places before writing back
        values = map(lambda x: x.to_map_json(), Place.merge_related(places))
        setprop(data, prop, values)

    return json.dumps(data)
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the
    prop field to the temporal field.

    A value is moved only when exactly one date pattern matches it and
    nothing remains after removing the match (i.e. the whole value is a
    date). Moved values are removed from prop; prop itself is deleted when
    all of its values were moved.
    """
    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Date patterns, matched case-insensitively
    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
    ]
    # Compile once instead of per value (previously re.compile ran inside
    # the inner loop for every value)
    patterns = [re.compile(p, re.I) for p in REGSEARCH]

    def cleanup(s):
        # Strip parentheses, periods and question marks before matching
        s = re.sub("[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = iterify(getprop(data, prop))
        remove = []
        toprop = iterify(getprop(data, to_prop)) if exists(data, to_prop) \
            else []

        for v in values:
            if isinstance(v, basestring):
                c = cleanup(v)
                for pattern in patterns:
                    m = pattern.findall(c)
                    # Move only if the match covers the whole cleaned value
                    if len(m) == 1 and not re.sub(m[0], "", c).strip():
                        if m[0] not in toprop:
                            toprop.append(m[0])
                        # Append the non-cleaned value to remove
                        remove.append(v)
                        break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                # Everything moved: drop the source property
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])

    return json.dumps(data)
def add_identifier(self, value):
    """Append a "Hathi: "-prefixed identifier to the mapped data's
    sourceResource/identifier list.
    """
    id_prop = "sourceResource/identifier"
    current_ids = self._get_mapped_value(id_prop)
    current_ids.append("Hathi: " + value)
    setprop(self.mapped_data, id_prop, current_ids)