def test_substitution_with_deleting_missing_values():
    """Entries whose value has no mapping in the "test2" dict should have
    the key deleted (delete flag passed as True)."""
    data = {
        "xxx": "yyy",
        "aaa": {
            "bbb": "ccc",
            "xxx": [
                {"eee": "aaa"},
                {"xxx": "eee"},
                {"eee": "bbb"},
                {"eee": "doesnt exist"},
                {"eee": "doesnt exist"}
            ]
        },
    }
    INPUT = json.dumps(data)
    # The two "doesnt exist" entries lose their "eee" key entirely,
    # leaving empty dicts behind.
    data["aaa"]["xxx"] = [
        {"eee": "AAA222"},
        {"xxx": "eee"},
        {"eee": "BBB222"},
        {},
        {}
    ]
    EXPECTED_OUTPUT = json.dumps(data)
    resp, content = _get_server_response(INPUT, "aaa/xxx/eee",
                                         "aaa/xxx/eee", "test2", None, True)
    assert resp.status == 200
    assert_same_jsons(content, EXPECTED_OUTPUT)
def test_enrich_date_parse_century_date():
    """Correctly transform a date of format '19th c.'"""
    url = server() + "enrich_earliest_date?prop=date"
    # (input date string, expected displayDate); the trailing period of
    # "19th c." is stripped by the enrichment.
    cases = [("19th c.", "19th c"),
             ("19th century", "19th century")]
    for raw_date, display in cases:
        INPUT = {"date": raw_date}
        EXPECTED = {
            "date": {
                "begin": None,
                "end": None,
                "displayDate": display
            }
        }
        resp, content = H.request(url, "POST", body=json.dumps(INPUT))
        result = json.loads(content)
        assert result["date"] == EXPECTED["date"], \
            "%s != %s" % (result["date"], EXPECTED["date"])
def test_remove_spaces_around_dashes():
    """Should remove spaces around dashes."""
    INPUT = {
        "id": "123",
        "spatial": [
            {"name": "Asheville"},
            {"name": "North Carolina"}
        ],
        "subject": [
            "hello there",
            "aaa--bbb",
            "aaa --bbb",
            "aaa-- bbb",
            "aaa -- bbb",
            "aaa -- bbb -- ccc - - ddd -- "
        ]
    }
    EXPECTED = {
        "id": "123",
        "spatial": [
            {"name": "Asheville"},
            {"name": "North Carolina"}
        ],
        "subject": [
            # Subjects are also wrapped in dicts and capitalized, and
            # duplicates collapse to a single entry.
            {"name": "Hello there"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb"},
            # Single dashes separated by spaces ("- -") are preserved.
            {"name": "Aaa--bbb--ccc - - ddd--"},
        ]
    }
    resp, content = _get_server_response(json.dumps(INPUT))
    assert_same_jsons(json.dumps(EXPECTED), content)
    assert resp.status == 200
def test_substitute_with_list_of_dictionaries():
    """ Should convert all dicts in a list. """
    data = {
        "xxx": "yyy",
        "aaa": {
            "bbb": "ccc",
            "xxx": [
                {"eee": "aaa"},
                {"xxx": "eee"},
                {"eee": "bbb"}
            ]
        }
    }
    INPUT = json.dumps(data)
    # Only dicts carrying the "eee" key are rewritten; the middle dict
    # has no "eee" and is left untouched.
    data["aaa"]["xxx"] = [
        {"eee": "AAA222"},
        {"xxx": "eee"},
        {"eee": "BBB222"},
    ]
    EXPECTED_OUTPUT = json.dumps(data)
    resp, content = _get_server_response(INPUT, "aaa/xxx/eee",
                                         "aaa/xxx/eee", "test2")
    print_error_log()
    pinfo(resp, content)
    assert resp.status == 200
    assert_same_jsons(content, EXPECTED_OUTPUT)
def geocode(body, ctype, prop=None, newprop=None):
    """Geocode the value of the field named by the "prop" parameter.

    Accepts a JSON document, runs each place found in "prop" through
    lookup_place(), and stores the result in "newprop" (which defaults
    to overwriting "prop" itself).  Returns the serialized document.

    BUG FIX: the original docstring (copied from an "unshred" service)
    misdescribed this function; the bare except is narrowed to
    ValueError, the only error json.loads raises for bad input.
    """
    try:
        data = json.loads(body)
    except ValueError:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop not in data:
        return json.dumps(data)  # graceful abort
    if not newprop:
        newprop = prop

    # Handle strings and iterables: on Python 2, str has no __iter__,
    # so plain strings take the scalar branch.
    if hasattr(data[prop], '__iter__'):
        data[newprop] = [lookup_place(place) for place in data[prop]]
    else:
        data[newprop] = lookup_place(data[prop])

    return json.dumps(data)
def test_substitution_using_scdl_format_dict(): formats = \ ("Pamphlets", "Pamphlets"), \ ("Pamphlet", "Pamphlets"), \ ("pamphlets", "Pamphlets"), \ ("Manuscripts", "Manuscripts"), \ ("Manuscript", "Manuscripts"), \ ("manuscripts", "Manuscripts"), \ ("Photograph", "Photographs"), \ ("Photographs", "Photographs"), \ ("Photograph", "Photographs") \ data = { "xxx": "yyy", "aaa": "" } for f in formats: data["aaa"] = f[0] INPUT = json.dumps(data) data["aaa"] = f[1] EXPECTED_OUTPUT = json.dumps(data) print "Checking: %s" + repr(f) resp, content = _get_server_response(INPUT, "aaa", "aaa", "scdl_fix_format", None, False) print_error_log() assert resp.status == 200 assert_same_jsons(EXPECTED_OUTPUT, content)
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list
    of URIs provided in request header "Pipeline-Item".

    Runs each record in the request body through the pipeline and
    returns a JSON summary containing the enriched records keyed by _id
    plus item/collection/error counters.

    BUG FIX: removed the unreachable trailing `return json.dumps(docs)`
    that followed the real return and referenced an undefined name.
    """
    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")

    records = json.loads(body)

    # Counts
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0
    errors = []

    enriched_records = {}
    for record in records:
        error, enriched_record_text = pipe(record, ctype, rec_enrichments,
                                           "HTTP_PIPELINE_ITEM")
        if error:
            errors.append(error)
        enriched_record = json.loads(enriched_record_text)

        if enriched_record.get("_id", None):
            ingest_type = enriched_record.get("ingestType")
            # Item records should have sourceResource
            if (ingest_type == "item" and
                    not "sourceResource" in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            logger.error("Found a record without an _id %s" %
                         enriched_record)
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
def download_preview(body, ctype):
    """
    Responsible for: downloading a preview for a document
    Usage: as a module in separate pipeline, to be run on existing
    documents in the repository to download the thumbnails.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check the "admin/object_status" field
    status = None
    try:
        status = getprop(data, "admin/object_status")
        if status in ["error", "downloaded"]:
            # Terminal states: nothing to do, return the body unchanged.
            logger.debug("Status is %s, doing nothing" % status)
            return body
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Thumbnail URL
    url = None
    try:
        url = getprop(data, "object/@id")
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Document ID
    id = None
    try:
        id = getprop(data, "id")
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Only actually fetch the image when the record is still pending.
    download = False
    if status == "pending":
        download = True

    (relative_fname, mime, status) = download_image(url, id, download)
    if not relative_fname:
        logger.error("Cannot save thumbnail from: %s." % (url))

    # so everything is OK and the file is on disk
    doc = update_document(data, relative_fname, mime, status)
    return json.dumps(doc)
def test_substitution_with_missing_value_and_the_same_field():
    """Should remove the field if value is missing from dict."""
    INPUT = json.dumps(
        {"xxx": "yyy", "aaa": {"bbb": "ccc", "xxx": "doesnt exist"}})
    # "aaa/xxx" has no mapping in "test2", so the key is dropped.
    EXPECTED_OUTPUT = json.dumps({"xxx": "yyy", "aaa": {"bbb": "ccc"}})
    resp, content = _get_server_response(INPUT, "aaa/xxx", "aaa/xxx",
                                         "test2", None, True)
    print_error_log()
    assert resp.status == 200
    assert_same_jsons(content, EXPECTED_OUTPUT)
def test_enrichment_for_creator_field():
    """Creator values should be cleaned (leading punctuation/quotes
    stripped, capitalized, spaces around dashes removed) and wrapped in
    name dicts."""
    INPUT = {
        "id": "123",
        "spatial": [
            {"name": "Asheville"},
            {"name": "North Carolina"}
        ],
        "creator": [
            "hello there",
            "123",
            ". hi ",
            ". hi",
            " . hi there ",
            "a banana",
            "''.more complicated....",
            '""""....even more complicated....."\'""""',
            "hello there;;",
            ";;hello there;;",
            "aaa--bbb",
            "aaa --bbb",
            "aaa-- bbb",
            "aaa -- bbb",
            "aaa -- bbb -- ccc - - ddd -- ",
            "aaa --- bbb --- ccc--- ddd---",
            "aaa ---- bbb ----ccc---- ddd----"
        ]
    }
    # NOTE(review): the expected list is shorter than the input list —
    # presumably cleaned duplicates collapse to one entry; confirm
    # against the enrichment's dedup behavior.
    EXPECTED = {
        "id": "123",
        "spatial": [
            {"name": "Asheville"},
            {"name": "North Carolina"}
        ],
        "creator": [
            {"name": "Hello there"},
            {"name": "123"},
            {"name": "Hi there"},
            {"name": "A banana"},
            {"name": "More complicated"},
            {"name": "Even more complicated"},
            {"name": "Hello there"},
            {"name": "Hello there"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb"},
            {"name": "Aaa--bbb--ccc--ddd--"},
            {"name": "Aaa--bbb--ccc--ddd--"},
            {"name": "Aaa--bbb--ccc--ddd--"}
        ]
    }
    resp, content = _get_server_response(json.dumps(INPUT), "creator")
    assert_same_jsons(json.dumps(EXPECTED), content)
    assert resp.status == 200
def mdlenrichlocation(body, ctype, action="mdl-enrich-location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial"
    field of that document by combining all spatial fields into one.
    Will also split out country and state on a best-efforts basis.

    For primary use with MDL documents.

    Possible avenues of improvement:
    - For fields with semi-colons, permute and create multiple spatial
      elements
    - Create an ordered list of "names" for the geocoder to attempt to
      lookup as opposed to our single concatenated list:
      - Everything concatenated together
      - Everything concatenated together up to "United States"
      - Remove left-most elements one by one
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sp = {}
        v = getprop(data, prop)
        fields = len(v)
        if not fields:
            logger.error("Spatial is empty.")
            return json.dumps(data)
        else:
            # Concatenate all values together to form the name field
            sp["name"] = ", ".join(v)
            logger.info("mdl-enrich-location: %s => %s" %
                        (fields, sp["name"],))
            if (1 == fields):
                # If there is only one element present, it is a country
                sp["country"] = clean(v[0])
            elif "United States" in v:
                country_index = v.index("United States")
                sp["country"] = clean(v[country_index])
                # The prior item is almost always a state
                # NOTE(review): a state sitting at index 0 is skipped by
                # this "> 1" guard — confirm whether "> 0" was intended.
                if (country_index > 1):
                    state = clean(v[country_index - 1])
                    if (is_state(state)):
                        sp["state"] = state
        if sp:
            sp = [sp]
            setprop(data, prop, sp)

    return json.dumps(data)
def test_substitution_for_different_fields_and_array():
    """ Should return json when original json is array. """
    payload = {"xxx": "yyy", "aaa": ["aa", "bbb", "ccc", "ddd"]}
    INPUT = json.dumps(payload)
    # Substituted values are written into a new "zzz" field.
    payload["zzz"] = ["aa", "BBB", "CCC", "DDD"]
    EXPECTED_OUTPUT = json.dumps(payload)
    resp, content = _get_server_response(INPUT, "aaa", "zzz", "test")
    print_error_log()
    assert resp.status == 200
    assert_same_jsons(content, EXPECTED_OUTPUT)
def test_dictionary_subsitution():
    """ Should substitute when there is dictionary field. """
    payload = {"xxx": "yyy", "aaa": {"bbb": "ccc"}}
    INPUT = json.dumps(payload)
    payload["aaa"] = {"bbb": "CCC"}
    EXPECTED_OUTPUT = json.dumps(payload)
    resp, content = _get_server_response(INPUT, "aaa/bbb", "aaa/bbb", "test")
    print_error_log()
    assert resp.status == 200
    assert_same_jsons(content, EXPECTED_OUTPUT)
def test_dict_substitution_in_different_field():
    """ Should add another field when prop is dictionary field. """
    payload = {"xxx": "yyy", "aaa": {"bbb": "ccc", "xxx": {"eee": "aaa"}}}
    INPUT = json.dumps(payload)
    # The substituted value lands in the new "ccc" key next to "eee".
    payload["aaa"]["xxx"]["ccc"] = "AAA222"
    EXPECTED_OUTPUT = json.dumps(payload)
    resp, content = _get_server_response(INPUT, "aaa/xxx/eee",
                                         "aaa/xxx/ccc", "test2")
    print_error_log()
    assert resp.status == 200
    assert_same_jsons(content, EXPECTED_OUTPUT)
def mdlenrichlocation(body, ctype, action="mdl-enrich-location",
                      prop="aggregatedCHO/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial"
    field of that document by:

    a) Mapping to city, county, state, country, and iso3166-2 if there
       are 4 fields OR
    b) Mapping to city, state, country, and iso3166-2 if there are 3
       fields OR
    c) Mapping to county and country if there are 2 fields

    For primary use with MDL documents.

    NOTE(review): a function with this same name but a different prop
    default exists elsewhere in this file; the later definition shadows
    the earlier at import time — confirm which is current.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sp = {}
        v = getprop(data, prop)
        fields = len(v)
        if not fields:
            logger.error("Spatial is empty.")
            return json.dumps(data)
        elif fields == 1:
            sp["country"] = v[0]["name"]
        elif fields == 2:
            sp["state"] = v[0]["name"]
            sp["country"] = v[1]["name"]
        elif fields == 3:
            sp["county"] = v[0]["name"]
            sp["state"] = v[1]["name"]
            sp["country"] = v[2]["name"]
        elif fields == 4:
            sp["city"] = v[0]["name"]
            sp["county"] = v[1]["name"]
            sp["state"] = v[2]["name"]
            sp["country"] = v[3]["name"]
        else:
            # NOTE(review): with 5+ fields, index 1 is deliberately(?)
            # skipped — confirm what that position holds in MDL data.
            sp["city"] = v[0]["name"]
            sp["county"] = v[2]["name"]
            sp["state"] = v[3]["name"]
            sp["country"] = v[4]["name"]
        if sp:
            sp = [sp]
            setprop(data, prop, sp)

    return json.dumps(data)
def test_copy_prop_to_prop_create_dict_key1():
    """Should copy to_prop into new dict with key"""
    prop1 = "key1"
    prop2 = "sourceResource/key2"
    to_prop = "sourceResource/to_dict"
    key1 = "key1"
    key2 = "key2"
    create = True
    INPUT = {
        "key1": "value1",
        "sourceResource": {
            "key2": "value2",
            "key3": "value3"
        },
        "key4": "value4"
    }
    # First copy creates the "to_dict" container with key1.
    EXPECTED1 = {
        "key1": "value1",
        "sourceResource": {
            "key2": "value2",
            "key3": "value3",
            "to_dict": {"key1": "value1"}
        },
        "key4": "value4"
    }
    # Second copy adds key2 into the existing "to_dict".
    EXPECTED2 = {
        "key1": "value1",
        "sourceResource": {
            "key2": "value2",
            "key3": "value3",
            "to_dict": {
                "key1": "value1",
                "key2": "value2"
            }
        },
        "key4": "value4"
    }
    resp, content = _get_server_response(json.dumps(INPUT), prop=prop1,
                                         to_prop=to_prop, key=key1,
                                         create=create)
    assert resp.status == 200
    assert json.loads(content) == EXPECTED1
    resp, content = _get_server_response(json.dumps(EXPECTED1), prop=prop2,
                                         to_prop=to_prop, key=key2,
                                         create=create)
    assert resp.status == 200
    assert json.loads(content) == EXPECTED2
def test_convert_spatial_string_to_dictionary():
    """ Format UIUC spatial dictionaries """
    INPUT = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                {"name": "Honolulu, HI"},
                # Date-range junk in spatial should be dropped.
                {"name": "1972 to Present"}
            ]
        },
        "creator": "David"
    }
    EXPECTED = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                {"name": "Honolulu, HI"}
            ]
        },
        "creator": "David"
    }
    url = server() + "uiuc_enrich_location"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def dedup_value(body, ctype, action="dedup_value", prop=None):
    """Service that accepts a JSON document and enriches the prop field
    of that document by:

    a) Removing duplicates

    BUG FIX: the body was previously parsed only when prop was set (and
    parsed twice at that), so with prop=None the final return hit an
    undefined `data`.  Parse once, unconditionally.
    """
    try:
        data = json.loads(body)
    except ValueError:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop:
        for p in prop.split(","):
            if exists(data, p):
                v = getprop(data, p)
                if isinstance(v, list):
                    # Remove whitespace, periods, parens, brackets
                    clone = [re.sub(r"[ \.\(\)\[\]\{\}]", "", s).lower()
                             for s in v]
                    # Get index of first occurrence of each unique value
                    index = list(set([clone.index(s)
                                      for s in list(set(clone))]))
                    setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
def test_unset_prop6():
    """Should unset prop since conditions are met for multiple condition
    props"""
    INPUT = {
        "_id": "12345",
        "dataProvider": ["Hathitrust", "University of Minnesota"],
        "sourceResource": {"type": "image"}
    }
    # Same document minus "_id".
    EXPECTED = {
        "dataProvider": ["Hathitrust", "University of Minnesota"],
        "sourceResource": {"type": "image"}
    }
    resp, content = _get_server_response(
        json.dumps(INPUT),
        action="unset",
        prop="_id",
        condition="hathi_exclude",
        condition_prop="dataProvider%2CsourceResource%2Ftype")
    print_error_log()
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_unset_prop2():
    """Should unset prop since condition is met"""
    INPUT = {
        "_id": "12345",
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            # All-digit rights value triggers the "is_digit" condition.
            "rights": "20010983784"
        },
        "key2": "value2"
    }
    EXPECTED = {
        "_id": "12345",
        "key1": "value1",
        "sourceResource": {"key1": "value1"},
        "key2": "value2"
    }
    resp, content = _get_server_response(json.dumps(INPUT),
                                         action="unset",
                                         prop="sourceResource/rights",
                                         condition="is_digit")
    assert resp.status == 200
    print_error_log()
    assert json.loads(content) == EXPECTED
def scdl_enrich_location(body, ctype, action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial"
    field of that document.

    For use with the scdl profiles

    NOTE(review): another scdl_enrich_location definition elsewhere in
    this file additionally sets state/country; the later definition
    shadows the earlier at import time — confirm which is current.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        for v in iterify(value):
            name = replace_state_abbreviations(v["name"].rstrip())
            v["name"] = name
            # Try to extract a County
            if " county " in name.lower():
                # "XXX County (S.C.)" => county: XXX
                v["county"] = name[0:name.lower().index("county")].strip()
            elif "(S.C.)" in name:
                # "XXX (S.C)" => city: XXX
                v["city"] = name[0:name.index("(S.C.)")].strip()

    return json.dumps(data)
def test_set_prop2():
    """Should create the prop and set its value"""
    INPUT = {
        "key1": "value1",
        "sourceResource": {"key1": "value1"},
        "key2": "value2"
    }
    EXPECTED = {
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            "rights": "rights"
        },
        "key2": "value2"
    }
    resp, content = _get_server_response(json.dumps(INPUT),
                                         prop="sourceResource/rights",
                                         value="rights")
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def nypl_identify_object(body, ctype, download="True"):
    """Build the NYPL preview URL from originalRecord/tmp_image_id and
    set admin/object_status (PENDING when download == "True", IGNORE
    otherwise)."""
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_image_id"
    preview_format = "http://images.nypl.org/index.php?id={0}&t=t"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     original_document_key, original_preview_key,
                     data[u'id'])
        return body

    preview_url = preview_format.format(
        data[original_document_key][original_preview_key])
    data["object"] = preview_url

    # Mark the thumbnail for the downloader pipeline.
    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def test_enrich_location_after_provider_specific_enrich_location4():
    """ Previous specific-provider location did not set state. """
    INPUT = {
        "id": "12345",
        "sourceResource": {
            # Semicolon-delimited city/county values should be split
            # into parallel spatial dicts.
            "spatial": [{"city": "Asheville; La Jolla",
                         "county": "Buncombe;San Diego",
                         "country": "United States"}]
        },
        "creator": "Miguel",
    }
    EXPECTED = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                {"city": "Asheville",
                 "county": "Buncombe",
                 "country": "United States"},
                # Country is only carried on the first split entry.
                {"city": "La Jolla",
                 "county": "San Diego"},
            ]
        },
        "creator": "Miguel",
    }
    url = server() + "enrich_location"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_set_prop5():
    """Should set prop to value, since condition_prop exists"""
    prop = "sourceResource/rights"
    value = "rights"
    condition_prop = "sourceResource"
    INPUT = {
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            "rights": "value2"
        },
        "key2": "value2"
    }
    EXPECTED = {
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            "rights": "rights"
        },
        "key2": "value2"
    }
    # FIX: pass the condition_prop variable instead of repeating the
    # literal; the original defined the local and then ignored it.
    resp, content = _get_server_response(json.dumps(INPUT), prop=prop,
                                         value=value,
                                         condition_prop=condition_prop)
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_dedup_value1():
    """Should remove duplicate values"""
    # NOTE(review): a test with this exact name is defined again later
    # in this file; the later definition shadows this one, so this test
    # never runs under the collector — rename one of them.
    props = "subject,spatial,description"
    INPUT = {
        "subject": [
            "This is a subject",
            "This is a subject.",
            " this is a SuBject . ",
            "This is another subject (1780).",
            "This is another subject 1780",
            " thiS IS anOther subject (1780)"
        ],
        "spatial": ["North Carolina", "New York"],
        "description": "A description"
    }
    # Dedup keys ignore case, whitespace and punctuation; the first
    # occurrence of each normalized value survives.
    EXPECTED = {
        "subject": [
            "This is a subject",
            "This is another subject (1780)."
        ],
        "spatial": ["North Carolina", "New York"],
        "description": "A description"
    }
    resp, content = _get_server_response(json.dumps(INPUT), props)
    assert resp.status == 200
    assert_same_jsons(EXPECTED, content)
def test_removing_bracket():
    """Should remove bracket from the beginning of the name"""
    INPUT = {
        "id": "12345",
        # "[Germany" carries a stray opening bracket that must be
        # stripped when the semicolon-delimited string is split.
        "sourceResource": {"spatial": [
            "Charleston (S.C.); [Germany; Poland; Israel; "
            "New York (N.Y.); Georgia (U.S.)"]},
        "creator": "Miguel",
    }
    EXPECTED = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                {"name": "Charleston (S.C.)"},
                {"name": "Germany"},
                {"name": "Poland"},
                {"name": "Israel"},
                {"name": "New York (N.Y.)"},
                {"name": "Georgia (U.S.)"},
            ]
        },
        "creator": "Miguel",
    }
    url = server() + "enrich_location"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_enrich_list_of_dictionaries_and_strings():
    """Should handle list of dictionaries and strings"""
    INPUT = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                # Already-structured dicts pass through unchanged...
                {"country": "United States",
                 "county": "Buncombe",
                 "state": "North Carolina"},
                # ...while bare strings are wrapped in name dicts.
                "Rushmore, Mount",
                "Mount Rushmore National Memorial",
            ]
        },
    }
    EXPECTED = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                {"country": "United States",
                 "county": "Buncombe",
                 "state": "North Carolina"},
                {"name": "Rushmore, Mount"},
                {"name": "Mount Rushmore National Memorial"},
            ]
        },
    }
    url = server() + "enrich_location"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_unset_prop1():
    """Should unset prop"""
    INPUT = {
        "_id": "12345",
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            "rights": "value2"
        },
        "key2": "value2"
    }
    EXPECTED = {
        "_id": "12345",
        "key1": "value1",
        "sourceResource": {"key1": "value1"},
        "key2": "value2"
    }
    resp, content = _get_server_response(json.dumps(INPUT),
                                         action="unset",
                                         prop="sourceResource/rights")
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing the values in CLEANUP if
    the field value begins with them
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            for s in CLEANUP:
                # Strip a leading CLEANUP value, case-insensitively,
                # then drop any leftover leading whitespace.
                item[i] = re.sub(r"(?i)^{0}".format(s), "",
                                 item[i].strip()).lstrip()
        # Unwrap single-element lists back to a scalar.
        setprop(data, prop, item[0] if len(item) == 1 else item)

    return json.dumps(data)
def test_drop_long_values():
    """Correctly transform a date value that cannot be parsed"""
    long_desc = ("this is a long string will blow up flake 8, "
                 "should drop this")
    INPUT = {
        "sourceResource": {
            "description": ["could be 1928ish?", long_desc, "short"]
        }
    }
    # Only values within max_length survive.
    EXPECTED = {
        "sourceResource": {
            "description": ["could be 1928ish?", "short"]
        }
    }
    url = server() + "drop-long-values?field=description&max_length=20"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    TC.assertEqual(resp.status, 200)
    TC.assertEqual(json.loads(content), EXPECTED)
def test_unset_prop8():
    """Should not unset prop since condition is not met with dataProvider"""
    INPUT = {
        "_id": "12345",
        # String (not list) dataProvider does not satisfy hathi_exclude.
        "dataProvider": "Hathitrust",
        "sourceResource": {"type": "image"}
    }
    resp, content = _get_server_response(
        json.dumps(INPUT),
        action="unset",
        prop="_id",
        condition="hathi_exclude",
        condition_prop="dataProvider%2CsourceResource%2Ftype")
    print_error_log()
    assert resp.status == 200
    assert json.loads(content) == INPUT
def test_copy_prop_str_to_str():
    """Should extend to_prop"""
    INPUT = {
        "note": "This is a note",
        "sourceResource": {"description": "This is a description"}
    }
    # Copying a string onto an existing string turns the target into a
    # two-element list.
    EXPECTED = {
        "note": "This is a note",
        "sourceResource": {
            "description": ["This is a description", "This is a note"]
        }
    }
    resp, content = _get_server_response(
        json.dumps(INPUT), prop="note",
        to_prop="sourceResource/description")
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_date_with_brackets():
    """Should transform date with brackets."""
    for bracketed in ["[1960-05-01]", "[ 1960-05-01 ]"]:
        INPUT = {"date": bracketed}
        # begin/end are parsed from inside the brackets; displayDate
        # keeps the original string verbatim.
        EXPECTED = {
            u'date': {
                u'begin': u'1960-05-01',
                u'end': u'1960-05-01',
                "displayDate": bracketed
            }
        }
        url = server() + "enrich_earliest_date?prop=date"
        resp, content = H.request(url, "POST", body=json.dumps(INPUT))
        assert str(resp.status).startswith("2")
        assert_same_jsons(EXPECTED, content)
def test_dedup_value1():
    """Should remove duplicate values"""
    # NOTE(review): this duplicates the name of an earlier test in this
    # file and shadows it at import time — rename one of them.
    props = "subject,spatial,description"
    INPUT = {
        "subject": [
            "This is a subject",
            "This is a subject.",
            " this is a SuBject . ",
            "This is another subject (1780).",
            "This is another subject 1780",
            " thiS IS anOther subject (1780)"
        ],
        "spatial": ["North Carolina", "New York"],
        "description": "A description"
    }
    EXPECTED = {
        "subject": ["This is a subject", "This is another subject (1780)."],
        "spatial": ["North Carolina", "New York"],
        "description": "A description"
    }
    resp, content = _get_server_response(json.dumps(INPUT), props)
    assert resp.status == 200
    assert_same_jsons(EXPECTED, content)
def mdlstatelocatedin(body, ctype):
    """
    Service that accepts a JSON document and extracts the state from
    the address in the first dataProvider value
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    prop = "dataProvider"
    if exists(data, prop):
        address = iterify(getprop(data, prop))[0]
        # Match either the abbreviation (st) or the full name (state).
        # NOTE(review): both patterns require whitespace on BOTH sides,
        # so a state at the very start or end of the address string will
        # not match — confirm that is intended.
        for st, state in states.items():
            if (re.search("\s+%s\s+" % st, address) or
                    re.search("\s+%s\s+" % state, address)):
                setprop(data, "sourceResource/stateLocatedIn", state)
                break

    return json.dumps(data)
def nara_enrich_location(body, ctype, action="nara_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that massages a NARA JSON document.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        # Run every spatial dictionary through format_spatial to
        # validate/normalize it, then write the list back.
        formatted = [format_spatial(spatial)
                     for spatial in iterify(getprop(data, prop))]
        setprop(data, prop, formatted)

    return json.dumps(data)
def test_full_date_range():
    """Should handle full date range"""
    # Every supported separator/order combination for the same range.
    INPUT = [
        "1901-01-01-1902-01-01",
        "1901-01-01/1902-01-01",
        "1901/01/01-1902/01/01",
        "1901/01/01/1902/01/01",
        "01/01/1901-01/01/1902",
        "1/1/1901/1/1/1902",
        "01-01-1901/01-01-1902",
        "1-1-1901-1-1-1902"
    ]
    url = server() + "enrich_earliest_date?prop=date"
    for i in range(len(INPUT)):
        input = {"date": INPUT[i]}
        expected = {
            "date": [{
                "begin": "1901-01-01",
                "end": "1902-01-01",
                "displayDate": INPUT[i]
            }]
        }
        resp, content = H.request(url, "POST", body=json.dumps(input))
        assert str(resp.status).startswith("2")
        assert_same_jsons(expected, content)
def test_copy_prop_dict_to_list():
    """Should append to to_prop"""
    INPUT = {
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            "from_dict": {"key1": "value1"},
            "to_list": ["a", "b", "c"],
            "key2": "value2"
        },
        "key2": "value2"
    }
    # The whole from_dict is appended as one element; the source field
    # itself is left in place.
    EXPECTED = {
        "key1": "value1",
        "sourceResource": {
            "key1": "value1",
            "from_dict": {"key1": "value1"},
            "to_list": ["a", "b", "c", {"key1": "value1"}],
            "key2": "value2"
        },
        "key2": "value2"
    }
    resp, content = _get_server_response(
        json.dumps(INPUT), prop="sourceResource/from_dict",
        to_prop="sourceResource/to_list")
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def test_geocode_coordinate_provided2():
    """Should use coordinates provided in the coordinates property"""
    INPUT = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": [
                {
                    "name": "United States--Massachussetts",
                    "coordinates": "42.358631134, -71.0567016602"
                }
            ]
        },
        "creator": "David"
    }
    # Reverse-geocoding the given coordinates fills in county/state/
    # country while preserving the original name and coordinates.
    EXPECTED = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": [
                {
                    "county": "Suffolk County",
                    "state": "Massachusetts",
                    "country": "United States",
                    "name": "United States--Massachussetts",
                    "coordinates": "42.358631134, -71.0567016602"
                }
            ]
        },
        "creator": "David"
    }
    url = server() + "geocode"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert_same_jsons(EXPECTED, json.loads(content))
def scdl_enrich_location(body, ctype, action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial"
    field of that document.

    For use with the scdl profiles

    NOTE(review): a simpler scdl_enrich_location without the
    state/country assignments exists elsewhere in this file; the later
    definition shadows the earlier at import time — confirm which is
    current.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        value = getprop(data, prop)
        for v in iterify(value):
            name = replace_state_abbreviations(v["name"].rstrip())
            v["name"] = name
            # Try to extract a County
            if " county " in name.lower():
                # "XXX County (S.C.)" => county: XXX
                v["county"] = name[0:name.lower().index("county")].strip()
                # The "(S.C.)" suffix also pins state and country.
                if "(S.C.)" in name:
                    v["state"] = "South Carolina"
                    v["country"] = "United States"
            elif "(S.C.)" in name:
                # "XXX (S.C)" => city: XXX
                v["city"] = name[0:name.index("(S.C.)")].strip()
                v["state"] = "South Carolina"
                v["country"] = "United States"

    return json.dumps(data)
def test_texas_enrich_location4():
    """Should do nothing with limits"""
    INPUT = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                "Canada - British Columbia Province - Vancouver Island"
                " - Victoria",
                "north=34.19; east=-99.94;",
                "northlimit=34.25; eastlimit=-99.88; southlimit=34.13;"
                " westlimit=-100;"
            ]
        }
    }
    EXPECTED = {
        "id": "12345",
        "sourceResource": {
            "spatial": [
                {
                    # Dash-delimited hierarchy splits into components.
                    "name": "Canada - British Columbia Province - "
                            "Vancouver Island - Victoria",
                    "country": "Canada",
                    "state": "British Columbia Province",
                    "county": "Vancouver Island",
                    "city": "Victoria"
                },
                {
                    # north/east pairs become a coordinate name...
                    "name": "34.19, -99.94"
                },
                {
                    # ...but four-sided limits are passed through as-is.
                    "name": "northlimit=34.25; eastlimit=-99.88; "
                            "southlimit=34.13; westlimit=-100;"
                }
            ]
        }
    }
    url = server() + "texas_enrich_location"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert json.loads(content) == EXPECTED
def jsonfy_prop(body, ctype, prop=None):
    """
    Some data is packed as strings that contain json. (UCSD)
    Take the data in the given property and turn any sub-values that
    can be read by json.loads into json object.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # With no prop, the whole document is converted in place.
    target = getprop(data, prop, True) if prop else data
    converted = jsonfy_obj(target)
    if prop:
        setprop(data, prop, converted)
    else:
        data = converted

    return json.dumps(data)
def test_geocode():
    """
    Simple geocode
    """
    INPUT = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": [
                {"name": "Boston, MA"}
            ]
        },
        "creator": "David"
    }
    # Geocoding the name fills in state, country and coordinates.
    EXPECTED = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": [
                {
                    "name": "Boston, MA",
                    "state": "Massachusetts",
                    "country": "United States",
                    "coordinates": "42.358631134, -71.0567016602"
                }
            ]
        },
        "creator": "David"
    }
    url = server() + "geocode"
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp.status == 200
    assert_same_jsons(json.loads(content), EXPECTED)
def test_usc_enrich_location_find_coordinates_and_flip():
    """Should flip long/lat value, then remove all spatial values
    except the lat/long coordinate
    """
    INPUT = {
        "sourceResource": {
            "spatial": [{
                "name": " 123 "
            }, {
                "name": "-130.4560,,32.9870"
            }, {
                "name": "1234"
            }, {
                "name": "Asheville"
            }, {
                # Valid "long, lat" pair — the one value that survives.
                "name": "92.5542, 35.6008"
            }]
        }
    }
    # Output keeps only the coordinate, flipped to "lat, long" order.
    EXPECTED = {"sourceResource": {"spatial": [{"name": "35.6008, 92.5542"}]}}
    resp, content = H.request(url, "POST", body=json.dumps(INPUT))
    assert resp["status"] == "200"
    assert_same_jsons(EXPECTED, json.loads(content))
def test_suppress_creator_value_for_string():
    """Should remove creator value if the value is 'creator'."""
    INPUT = {
        "sourceResource": {
            # Placeholder value — the whole field should be dropped.
            "creator": "creator",
            "aaa": [
                "Fisker, Kay--Architect--Danish--Male",
                "Fisker, Kay--Architect--Danish--Male",
                "bbb",
                "ccc"
            ]
        }
    }
    # Other fields (including their duplicates) are untouched.
    EXPECTED_OUTPUT = {
        "sourceResource": {
            "aaa": [
                "Fisker, Kay--Architect--Danish--Male",
                "Fisker, Kay--Architect--Danish--Male",
                "bbb",
                "ccc"
            ]
        }
    }
    resp, content = _get_server_response(json.dumps(INPUT))
    print_error_log()
    assert resp["status"].startswith("2")
    assert_same_jsons(EXPECTED_OUTPUT, content)
def test_usc_enrich_location():
    """Should join values on whitespace"""
    spatial_in = ["-130.4560,,32.9870", "1234", "Asheville"]
    record = {
        "sourceResource": {"spatial": [{"name": n} for n in spatial_in]}
    }
    expected = {
        "sourceResource": {
            "spatial": [{"name": "-130.4560,,32.9870 1234 Asheville"}]
        }
    }
    resp, content = H.request(url, "POST", body=json.dumps(record))
    assert resp["status"] == "200"
    assert_same_jsons(expected, json.loads(content))
def augment_freemix(body, ctype):
    #See: http://foundry.zepheira.com/issues/133#note-4
    '''
    Render the contents of a file as best as possible in Exhibit JSON
    * Supports Excel, BibTex and JSON for now

    Sample queries:
    * curl "http://*****:*****@foo.xls" --header "Content-Type: application/vnd.ms-excel" "http://localhost:8880/freemix.json"
    '''
    # NOTE(review): fixup_obj_labels and objkeys are assigned but never
    # used in this function — candidates for removal, verify no
    # reflection/debug use elsewhere before deleting.
    fixup_obj_labels = True
    obj = json.loads(body)
    # Expected body shape: {"data_profile": {"properties": [...]},
    # "items": [...]} — inferred from the keys read below; confirm
    # against the freemix data-profile producer.
    dataprofile = obj['data_profile']
    objkeys = {}
    source = obj[u'items']
    augmented_items = []
    failed_items = {}
    for prop in dataprofile["properties"]:
        # Skip properties the profile has disabled.
        if not prop["enabled"]: continue
        # Collect the declared types for this property from its tags,
        # stripping the PROP_TYPE_MARKER prefix.
        prop_types = [ t[PROP_TYPE_MARKER_LEN:] for t in prop["tags"] if t.startswith(PROP_TYPE_MARKER) ]
        #logger.debug("PROPERTY TYPES: " + repr(prop_types))
        if prop_types:
            # Dispatch each matching augmentation service; handlers
            # mutate augmented_items / failed_items in place.
            # presumably 'shredded_list' applies even without a
            # composite property — TODO confirm against AUGMENTATIONS.
            for aug, sid in AUGMENTATIONS.items():
                handler = service_proxy(sid)
                if aug in prop_types and (u"composite" in prop or aug == u'shredded_list'):
                    handler(source, prop, augmented_items, failed_items)
        #logger.debug('AUGMENTATION: ' + repr((prop['property'], augmented_items)))
    #Inefficiency of creating a dict only to get its values
    response = {'items': augmented_items, 'failed': failed_items}
    return json.dumps(response, indent=4)
def test_unset_prop4():
    """Should do nothing to INPUT but catch keyError since condition is
    not in CONDITIONS
    """
    record = {
        "_id": "12345",
        "key1": "value1",
        "sourceResource": {"key1": "value1", "rights": "value2"},
        "key2": "value2"
    }
    resp, content = _get_server_response(json.dumps(record),
                                         action="unset",
                                         prop="sourceResource/rights",
                                         condition="is_digits")
    assert resp.status == 200
    assert json.loads(content) == record
def scdl_geocode_regions(body, ctype, action="scdl_geocode_regions", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and forcibly sets the
    coordinates for South Carolina regions.

    For use with the scdl profiles.

    Returns the (possibly modified) document, or a plain-text error
    with a 500 response code when the body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except Exception:
        # Narrowed from a bare `except:`; any parse failure still
        # yields the same 500 response.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        # Only values recognized as regions get forced coordinates;
        # everything else passes through untouched.
        for v in iterify(getprop(data, prop)):
            if is_region(v):
                geocode_region(v)

    return json.dumps(data)
def test_enrich_dates_with_tildes_and_x(): """Should remove tildes and x characters from dates""" INPUT = [{"date": "1946-10x"}, {"date": "1946-10~"}] EXPECTED = [{ "date": [{ "begin": "1946-10", "end": "1946-10", "displayDate": "1946-10x" }] }, { "date": [{ "begin": "1946-10", "end": "1946-10", "displayDate": "1946-10~" }] }] url = server() + "enrich_earliest_date?prop=date" for i in range(len(INPUT)): resp, content = H.request(url, "POST", body=json.dumps(INPUT[i])) print 'CONTENT:{}'.format(content) assert str(resp.status).startswith("2") assert_same_jsons(EXPECTED[i], content)
def cleanup_value(body, ctype, action="cleanup_value", prop=",".join(DEFAULT_PROP + DONT_STRIP_DOT_END)):
    '''
    Service that accepts a JSON document and enriches the prop field of
    that document by:

    a) applying a set of regexps to do data cleanup

    Returns the cleaned document, or a plain-text error with a 500
    response code when `prop` is empty or the body is not valid JSON.
    '''
    if not prop:
        # Bug fix: the original fell through to `return json.dumps(data)`
        # with `data` undefined, raising NameError. Fail explicitly,
        # matching dc_clean_invalid_dates' error handling.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None in %s" % __name__
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception:
        # Narrowed from a bare `except:`.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        convert(data, p)

    return json.dumps(data)
def test_geocode_name_search_context():
    """Contextualize a place name using any additional feature names

    If feature names for city, county, or state are given, use them to
    disambiguate place names that have multiple interpretations.
    """
    record = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": {"name": "Portland", "state": "Maine"}
        }
    }
    expected = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": [{
                "city": "Portland",
                "county": "Cumberland County",
                "country": "United States",
                "state": "Maine",
                "name": "Portland",
                "coordinates": "43.66147, -70.25533"
            }]
        }
    }
    resp, content = H.request(server() + "geocode", "POST",
                              body=json.dumps(record))
    assert resp.status == 200
    assert_same_jsons(expected, json.loads(content))
def test_enrich_temporal_date():
    """Correctly enrich temporal dates"""
    record = {
        "sourceResource": {
            "spatial": ["1901-1999", " 1901 - 1999 ", " 1901 / 01 / 01",
                        "1905-04-12", "01/01/1901", "1901",
                        "North Carolina"]
        }
    }
    expected = {
        "sourceResource": {
            "temporal": [
                {"begin": "1901", "end": "1999",
                 "displayDate": "1901-1999"},
                {"begin": "1901", "end": "1999",
                 "displayDate": "1901 - 1999"},
                {"begin": "1901", "end": "1901", "displayDate": "1901"},
                {"begin": "1901-01-01", "end": "1901-01-01",
                 "displayDate": "1901 / 01 / 01"},
                {"begin": "1901-01-01", "end": "1901-01-01",
                 "displayDate": "01/01/1901"},
                {"begin": "1905-04-12", "end": "1905-04-12",
                 "displayDate": "1905-04-12"},
            ],
            "spatial": ["North Carolina"]
        }
    }
    # Stage 1: pull date-like values out of spatial into temporal.
    resp, content = H.request(
        server() + "move_date_values?prop=sourceResource/spatial",
        "POST", body=json.dumps(record))
    assert resp.status == 200
    # Stage 2: enrich the moved dates.
    resp, content = H.request(server() + "enrich_date", "POST", body=content)
    assert resp.status == 200
    assert_same_jsons(expected, content)
def test_capitalize_value():
    """Should capitalize first letter of each property"""
    record = {
        "id": "123",
        "spatial": {"key1": "asheville", "key2": "north Carolina"},
        "subject": ["subject", "hi there", "hello"]
    }
    expected = {
        "id": "123",
        "spatial": {"key1": "Asheville", "key2": "North Carolina"},
        "subject": ["Subject", "Hi there", "Hello"]
    }
    resp, content = _get_server_response(
        json.dumps(record), prop="spatial/key1,spatial/key2,subject")
    assert resp.status == 200
    fetched = json.loads(content)
    assert fetched == expected, DictDiffer(expected, fetched).diff()
def nypl_identify_object(body, ctype, list_sets=None):
    """Replace an NYPL record's collection-UUID title with the human
    title fetched from the NYPL collections feed at `list_sets`.

    On any fetch failure the body is returned unchanged; on bad JSON
    input a 500 plain-text error is returned.
    """
    try:
        # Content type must be JSON; the assert doubles as validation
        # since its failure is caught below.
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    # Fetch the collections XML, using an on-disk cache; HTTP errors
    # come back as status codes instead of exceptions.
    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(list_sets)
    if not resp[u'status'].startswith('2'):
        # Best-effort: on fetch failure, pass the record through untouched.
        logger.error(' HTTP error (' + resp[u'status'] + ') resolving URL: ' + list_sets)
        return body

    content_dict = xmltodict.parse(content, xml_attribs=True,
                                   attr_prefix='', force_cdata=False,
                                   ignore_whitespace_cdata=True)
    sets = content_dict["nyplAPI"]["response"]
    for r in sets:
        if "collection" == r:
            for coll_dict in sets[r]:
                # Match when the record's title IS a collection uuid, or
                # the uuid appears in the record's @id; then substitute
                # the collection's display title.
                if "uuid" in coll_dict and "title" in coll_dict and (
                        coll_dict["uuid"] == data["title"] or
                        coll_dict["uuid"] in data["@id"]):
                    data["title"] = coll_dict["title"]

    return json.dumps(data)
def test_enrich_format_cleanup_multiple():
    "Test format normalization and removal of non IMT formats"
    formats = [
        "Still Images", "image/JPEG", "audio", "Images", "application",
        "audio/mp3 (1.46 MB; 1 min., 36 sec.)",
        "Still Images", "image/JPEG", "audio", "Images", "application",
        "audio/mp3 (1.46 MB; 1 min., 36 sec.)",
        "Images/jpeg", "images/jpeg"
    ]
    expected = {
        u'format': ["Still Images", "image/jpeg", "audio", "Images",
                    "application", "audio/mpeg"],
        u'type': ["image", "sound"]
    }
    resp, content = H.request(
        server() + "enrich-format?prop=format&type_field=type",
        "POST", body=json.dumps({"format": formats}))
    assert_same_jsons(expected, content)
    assert str(resp.status).startswith("2")
def dc_clean_invalid_dates(body, ctype, action="cleanup_value", prop="sourceResource/date"):
    """Run the cleanup conversion over each comma-separated path in
    `prop` that exists in the posted JSON document.

    Returns the (possibly modified) document, or a plain-text error
    with a 500 response code when `prop` is None or the body is not
    valid JSON.
    """
    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception:
        # Narrowed from a bare `except:`.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            convert(data, p)

    return json.dumps(data)
def required_values_from_collection_registry(body, ctype, field, mode):
    '''Get values for the required fields sourceResource.rights &
    sourceResource.type from the collection registry data.

    Default mode is to fill in missing data.
    mode='overwrite' will overwrite existing data
    mode='append' will add the values

    Returns the updated document, or a plain-text error with a 500
    response code when the body is not valid JSON.
    '''
    try:
        data = json.loads(body)
    except Exception:
        # Narrowed from a bare `except:`.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if field == 'rights':
        data = set_rights_from_collection(data, mode)
    elif field == 'type':
        data = set_type_from_collection(data, mode)
    elif field == 'title':
        data = set_title_for_object(data)

    # Ensure "@context" is there.
    if not exists(data, "@context"):
        data["@context"] = "http://dp.la/api/items/context"

    return json.dumps(data)
def capitalize_value(body, ctype, prop=",".join(DEFAULT_PROP), exclude=None):
    """
    Service that accepts a JSON document and capitalizes the prop field
    of that document.

    Returns the updated document, or a plain-text error with a 500
    response code when `prop` is empty or the body is not valid JSON.
    """
    if not prop:
        # Bug fix: the original fell through to `return json.dumps(data)`
        # with `data` undefined, raising NameError. Fail explicitly,
        # matching dc_clean_invalid_dates' error handling.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None in %s" % __name__
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception:
        # Narrowed from a bare `except:`.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    props = prop.split(",")
    if exclude in props:
        # Only the first occurrence is removed, matching the original.
        props.remove(exclude)
    for p in props:
        if p:
            capitalize(data, p)

    return json.dumps(data)