Example #1
0
def nypl_select_hasview(body, ctype):

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_high_res_link"
    source_key = u"hasView"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     original_document_key, original_preview_key, data[u'id'])
        return body

    data[source_key] = {
        "@id": data[original_document_key][original_preview_key],
        "format": None
    }
    return json.dumps(data)
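A minimal standalone sketch of the hasView selection above, using plain dicts in place of the akara request/response machinery (the sample record is made up):

import json

body = json.dumps({
    "id": "abc123",
    "originalRecord": {"tmp_high_res_link": "http://example.org/img/full.jpg"}
})

data = json.loads(body)
if "tmp_high_res_link" in data.get("originalRecord", {}):
    # Same selection as above: promote the preview link to hasView/@id.
    data["hasView"] = {
        "@id": data["originalRecord"]["tmp_high_res_link"],
        "format": None
    }
print(json.dumps(data))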
Example #2
0
    def map_subject_and_spatial_and_temporal(self):
        prop = self.root_key + "subject"
        mapped_props = {
            "subject": [],
            "spatial": [],
            "temporal": []
        }

        if exists(self.provider_data, prop):
            for s in iterify(getprop(self.provider_data, prop)):
                try:
                    if "geographic" in s and s.get("geographic"):
                        mapped_props["spatial"].append(s.get("geographic"))
                    elif "topic" in s and s.get("topic"):
                        mapped_props["subject"].append(s.get("topic"))
                    elif "temporal" in s and s.get("temporal"):
                        mapped_props["temporal"].append(s.get("temporal"))
                except Exception as e:
                    logger.error("Error mapping geo/subject/temporal"
                                 "for record %s\nException:\n%s" %
                                 (self.provider_data["_id"], e))

            for k in mapped_props.keys():
                mapped_props[k] = self.unnest_list(mapped_props.get(k), [])

            if mapped_props:
                self.update_source_resource(self.clean_dict(
                    mapped_props))
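The bucketing above can be exercised without the mapper class; this sketch assumes the subject entries are already plain dicts (iterify/getprop omitted, sample values made up):

subjects = [
    {"geographic": "Austin (Tex.)"},
    {"topic": "Cotton growing"},
    {"temporal": "1930-1940"},
]

mapped_props = {"subject": [], "spatial": [], "temporal": []}
for s in subjects:
    # Mirror the precedence above: geographic, then topic, then temporal.
    if s.get("geographic"):
        mapped_props["spatial"].append(s["geographic"])
    elif s.get("topic"):
        mapped_props["subject"].append(s["topic"])
    elif s.get("temporal"):
        mapped_props["temporal"].append(s["temporal"])

print(mapped_props)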
Example #3
0
def nypl_select_hasview(body, ctype):

    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_high_res_link"
    source_key = u"hasView"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].",
                     original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].",
                     original_document_key, original_preview_key, data[u'id'])
        return body

    data[source_key] = {
        "@id": data[original_document_key][original_preview_key],
        "format": None
    }
    return json.dumps(data)
Example #4
0
    def map_subject(self):
        # Mapped from subject and genre
        #
        # Per discussion with Amy on 10 April 2014, don't worry about
        # checking whether heading maps to authority file. Amy simplified the
        # crosswalk.
        #
        # TODO: When present, we should probably pull in the valueURI and
        # authority values into the sourceResource.subject - this would
        # represent an index/API change, however.
        subject = []

        if exists(self.provider_data, "subject"):
            for v in iterify(getprop(self.provider_data, "subject")):
                if "topic" in v:
                    if isinstance(v["topic"], basestring):
                        subject.append(v["topic"])
                    elif isinstance(v["topic"], dict):
                        subject.append(v["topic"].get("#text"))
                    else:
                        logger.error("Topic is neither a string nor a dict; %s"
                                     % self.provider_data["_id"])
                if exists(v, "name/namePart"):
                    subject.append(getprop(v, "name/namePart"))

        if exists(self.provider_data, "genre"):
            for v in iterify(getprop(self.provider_data, "genre")):
                if isinstance(v, basestring):
                    subject.append(v)
                elif isinstance(v, dict):
                    subject.append(v.get("#text"))
                else:
                    logger.error("Genre is neither a string nor a dict; %s"
                                 % self.provider_data["_id"])

        if subject:
            self.update_source_resource({"subject": subject})
Example #5
0
def nypl_select_hasview(body, ctype):

    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_high_res_link"
    source_key = u"hasView"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].", original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].", original_document_key, original_preview_key, data[u'id'])
        return body

    data[source_key] = {"@id": data[original_document_key][original_preview_key], "format": None}
    return json.dumps(data)
Example #6
0
def cleanup_value(body, ctype, action="cleanup_value", prop=",".join(DEFAULT_PROP + DONT_STRIP_DOT_END)):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) applying a set of regexps to do data cleanup
    '''

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        convert(data, p)

    return json.dumps(data)
Example #7
0
def nypl_identify_object(body, ctype, list_sets=None):

    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(list_sets)
    if not resp[u'status'].startswith('2'):
        logger.error('  HTTP error (' + resp[u'status'] + ') resolving URL: ' + list_sets)
        return body
    content_dict = xmltodict.parse(content, xml_attribs=True, attr_prefix='', force_cdata=False, ignore_whitespace_cdata=True)
    sets = content_dict["nyplAPI"]["response"]

    for r in sets:
        if "collection" == r:
            for coll_dict in sets[r]:
                if "uuid" in coll_dict and "title" in coll_dict and (coll_dict["uuid"] == data["title"] or coll_dict["uuid"] in data["@id"]):
                    data["title"] = coll_dict["title"]

    return json.dumps(data)
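A sketch of the collection-title lookup on an already-parsed ListSets response; the nested dict below stands in for xmltodict output and is made up:

sets = {
    "collection": [
        {"uuid": "uuid-1", "title": "Lawrence & Houseworth photographs"},
        {"uuid": "uuid-2", "title": "Farm Security Administration"},
    ]
}
data = {"@id": "http://example.org/item/uuid-2", "title": "uuid-2"}

for coll_dict in sets.get("collection", []):
    # Match either on the temporary title (a uuid) or on the @id URL.
    if "uuid" in coll_dict and "title" in coll_dict and (
            coll_dict["uuid"] == data["title"] or coll_dict["uuid"] in data["@id"]):
        data["title"] = coll_dict["title"]

print(data["title"])  # Farm Security Administration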
Example #8
0
    def map_creator_and_contributor(self):
        prop = self.root_key + "name"
        mapped_props = {
            "creator": [],
            "contributor": []
        }

        if exists(self.provider_data, prop):
            for s in iterify(getprop(self.provider_data, prop)):
                name = s.get("namePart")
                if name:
                    role_terms = []
                    try:
                        for r in iterify(s.get("role")):
                            role_term = r.get("roleTerm")
                            if isinstance(role_term, dict):
                                role_terms.append(
                                        role_term.get("#text").lower())
                            elif isinstance(role_term, list):
                                for rt in role_term:
                                    role_terms.append(rt.lower())
                            else:
                                role_terms.append(role_term.lower())
                    except Exception as e:
                        logger.error("Error getting name/role/roleTerm for " +
                                     "record %s\nException:%\n%s" %
                                     (self.provider_data["_id"], e))
                        continue

                    if "creator" in role_terms:
                        mapped_props["creator"].append(name)
                    elif "contributor" in role_terms:
                        mapped_props["contributor"].append(name)

            self.update_source_resource(self.clean_dict(mapped_props))
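The roleTerm normalization above accepts a dict, a list, or a bare string; a small standalone sketch with made-up role structures:

def role_terms_of(role_entries):
    terms = []
    for r in role_entries:
        role_term = r.get("roleTerm")
        if isinstance(role_term, dict):
            terms.append(role_term.get("#text").lower())
        elif isinstance(role_term, list):
            terms.extend(rt.lower() for rt in role_term)
        else:
            terms.append(role_term.lower())
    return terms

print(role_terms_of([{"roleTerm": {"#text": "Creator"}}]))   # ['creator']
print(role_terms_of([{"roleTerm": ["Creator", "Editor"]}]))  # ['creator', 'editor']
print(role_terms_of([{"roleTerm": "Contributor"}]))          # ['contributor']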
Example #9
0
def nypl_identify_object(body, ctype, list_sets=None):

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    H = httplib2.Http('/tmp/.cache')
    H.force_exception_as_status_code = True
    resp, content = H.request(list_sets)
    if not resp[u'status'].startswith('2'):
        logger.error('  HTTP error (' + resp[u'status'] + ') resolving URL: ' + list_sets)
        return body
    content_dict = xmltodict.parse(content, xml_attribs=True, attr_prefix='', force_cdata=False, ignore_whitespace_cdata=True)
    sets = content_dict["nyplAPI"]["response"]

    for r in sets:
        if "collection" == r:
            for coll_dict in sets[r]:
                if "uuid" in coll_dict and "title" in coll_dict and (coll_dict["uuid"] == data["title"] or coll_dict["uuid"] in data["@id"]):
                    data["title"] = coll_dict["title"]

    return json.dumps(data)
Example #10
0
    def map_rights(self):
        prop = self.root_key + "rights"

        if exists(self.provider_data, prop):
            rights = None
            license = None
            statement = None
            for s in iterify(getprop(self.provider_data, prop)):
                try:
                    qualifier = s.get("qualifier")
                    text = s.get("#text")
                except:
                    continue

                if qualifier == "license":
                    try:
                        license = "License: " + self.rights_term_label[text]
                    except:
                        msg = ("Term %s not in self.rights_term_label for %s" %
                               (text, self.provider_data["_id"]))
                        logger.error(msg)
                elif qualifier == "statement":
                    statement = text

            rights = "; ".join(filter(None, [rights, statement]))

            if rights:
                self.update_source_resource({"rights": rights})
Example #11
0
def ucsb_aleph_marc_id(body, ctype):
    '''MARC sucks'''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '856' in field:
            subfields = field['856']['subfields']
            for subf in subfields:
                if 'u' in subf:
                    # restrict to ones that have url like
                    # http://www.library.ucsb.edu/OBJID/Cylinder0002
                    if 'OBJID' in subf['u']:
                        ident = subf['u']

    if not ident:
        logger.error('NO 856 u for doc leader:{}'.format(data['leader']))
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
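The 856$u scan and the derived ids can be tried on a toy MARC-in-JSON record; the record and the id prefix below are made up, and the md5 input is encoded so the sketch also runs on Python 3:

import hashlib
import json

record = {
    "leader": "00000nam a2200000 a 4500",
    "fields": [
        {"245": {"subfields": [{"a": "Cylinder recording"}]}},
        {"856": {"subfields": [
            {"u": "http://www.library.ucsb.edu/OBJID/Cylinder0002"}]}},
    ],
}

ident = None
for field in record["fields"]:
    if "856" in field:
        for subf in field["856"]["subfields"]:
            if "u" in subf and "OBJID" in subf["u"]:
                ident = subf["u"]

if ident:
    record["_id"] = "ucsb--" + ident  # stand-in for COUCH_REC_ID_BUILDER
    record["id"] = hashlib.md5(record["_id"].encode("utf-8")).hexdigest()
print(json.dumps(record, indent=2))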
Example #12
0
    def map_is_show_at_object_has_view_and_dataprovider(self):
        def _get_media_type(d):
            pd = iterify(getprop(d, "physicalDescription"))
            for _dict in pd:
                if exists(_dict, "internetMediaType"):
                    return getprop(_dict, "internetMediaType")

        prop = "location"
        if exists(self.provider_data, prop):
            location = iterify(getprop(self.provider_data, prop))
            format = _get_media_type(self.provider_data)
            out = {}
            try:
                for _dict in location:
                    if "url" in _dict:
                        for url_dict in _dict["url"]:
                            if url_dict and "access" in url_dict:
                                if url_dict["access"] == "object in context":
                                    out["isShownAt"] = url_dict.get("#text")
                                elif url_dict["access"] == "preview":
                                    out["object"] = url_dict.get("#text")
                                elif url_dict["access"] == "raw object":
                                    has_view = {
                                        "@id": url_dict.get("#text"),
                                        "format": format
                                    }
                                    out["hasView"] = has_view
                    if ("physicalLocation" in _dict and isinstance(
                            _dict["physicalLocation"], basestring)):
                        out["dataProvider"] = _dict["physicalLocation"]
            except Exception as e:
                logger.error(e)

            if out:
                self.mapped_data.update(out)
Example #13
0
def replace_regex(body, ctype, prop=None, regex=None, new=None):
    """Replaces a regex in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    regex -- the regex to replace
    new -- the substring to replace the regex with
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not regex:
        logger.error("No regex parameter supplied")
    else:
        if not new:
            logger.debug("NO New parameter, will replace with empty string")
            new = ''
        if exists(data, prop):
            v = getprop(data, prop)
            new_val = replace_regex_recurse_field(v, regex, new)
            setprop(data, prop, new_val)

    return json.dumps(data)
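replace_regex_recurse_field is not shown in this example; a plausible recursive implementation (an assumption, not the project's actual helper) would walk strings, lists and dicts:

import re

def replace_regex_recurse_field(value, regex, new):
    # Hypothetical helper: apply the substitution to every string reachable
    # from value, preserving the container structure.
    if isinstance(value, str):
        return re.sub(regex, new, value)
    if isinstance(value, list):
        return [replace_regex_recurse_field(v, regex, new) for v in value]
    if isinstance(value, dict):
        return dict((k, replace_regex_recurse_field(v, regex, new))
                    for k, v in value.items())
    return value

print(replace_regex_recurse_field(["ca. 1900", {"d": "ca. 1910"}],
                                  r"ca\.\s*", "circa "))
# ['circa 1900', {'d': 'circa 1910'}]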
Example #14
0
    def map_creator_and_contributor(self):
        prop = self.root_key + "name"
        _dict = {
            "creator": [],
            "contributor": []
        }

        if exists(self.provider_data, prop):
            for s in iterify(getprop(self.provider_data, prop)):
                name = s.get("namePart")
                if name:
                    try:
                        role_terms = [r.get("roleTerm") for r in
                                      iterify(s.get("role"))]
                    except:
                        logger.error("Error getting name/role/roleTerm for " +
                                     "record %s" % self.provider_data["_id"])
                        continue

                    if "creator" in role_terms:
                       _dict["creator"].append(name)
                    elif "contributor" in role_terms:
                       _dict["contributor"].append(name)

            self.update_source_resource(self.clean_dict(_dict))
Example #15
0
def nypl_identify_object(body, ctype, download="True"):

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_document_key = u"originalRecord"
    original_preview_key = u"tmp_image_id"
    preview_format = "http://images.nypl.org/index.php?id={0}&t=t"

    if original_document_key not in data:
        logger.error("There is no '%s' key in JSON for doc [%s].", original_document_key, data[u'id'])
        return body

    if original_preview_key not in data[original_document_key]:
        logger.error("There is no '%s/%s' key in JSON for doc [%s].", original_document_key, original_preview_key, data[u'id'])
        return body

    preview_url = preview_format.format(data[original_document_key][original_preview_key])
    data["object"] = preview_url

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Example #16
0
def location_transform_uva(d, p):
    def _get_media_type(d):
        pd = _as_list(getprop(d, "physicalDescription"))
        for _dict in pd:
            try:
                return selector_getprop(_dict, "internetMediaType")
            except KeyError:
                pass

    location = _as_list(getprop(d, p))
    format = _get_media_type(d)
    out = {}
    try:
        for _dict in location:
            if "url" in _dict:
                for url_dict in _dict["url"]:
                    if url_dict and "access" in url_dict:
                        if url_dict["access"] == "object in context":
                            out["isShownAt"] = url_dict.get("#text")
                        elif url_dict["access"] == "preview":
                            out["object"] = url_dict.get("#text")
                        elif url_dict["access"] == "raw object":
                            out["hasView"] = {
                                "@id:": url_dict.get("#text"),
                                "format": format
                            }
            if "physicalLocation" in _dict and isinstance(
                    _dict["physicalLocation"], basestring):
                out["dataProvider"] = _dict["physicalLocation"]
    except Exception as e:
        logger.error(e)
    finally:
        return out
Example #17
0
    def reverse_geocode(self, lat, lng):
        """
        Accepts latitude and longitude values and returns a geonames place
        that matches their value.
        """
        params = {
            "lat": lat,
            "lng": lng,
            "username": module_config().get("geonames_username"),
            "token": module_config().get("geonames_token")
        }
        url = DplaGeonamesGeocoder.base_uri + \
              "findNearbyJSON?%s" % urlencode(params)
        if (url not in DplaGeonamesGeocoder.resultCache):
            result = DplaGeonamesGeocoder._get_result(url)
            if ("geonames" in result \
                and len(result["geonames"]) > 0):
                DplaGeonamesGeocoder.resultCache[url] = result["geonames"][0]
            else:
                logger.error("Could not reverse geocode (%s, %s)" % (
                    lat,
                    lng,
                ))
                return None

        return DplaGeonamesGeocoder.resultCache[url]
Example #18
0
def check_date_format(data, prop):
    """Checks that the begin and end dates are in the proper format"""
    date = getprop(data, prop, True)
    if date:
        for d in iterify(date):
            for k, v in d.items():
                if v and k != "displayDate":
                    try:
                        ymd = [int(s) for s in v.split("-")]
                    except:
                        err = "Invalid date.%s: non-integer in %s for %s" % \
                              (k, v, data.get("_id"))
                        logger.error(err)
                        setprop(d, k, None)
                        continue

                    year = ymd[0]
                    month = ymd[1] if len(ymd) > 1 else 1
                    day = ymd[2] if len(ymd) > 2 else 1
                    try:
                        datetime.datetime(year=year, month=month, day=day)
                    except ValueError, e:
                        logger.error("Invalid date.%s: %s for %s" %
                                     (k, e, data.get("_id")))
                        setprop(d, k, None)
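The year/month/day validation above can be tried on bare strings; a minimal sketch using only the standard library:

import datetime

def is_valid_ymd(value):
    # Accepts "YYYY", "YYYY-MM" or "YYYY-MM-DD"; missing parts default to 1.
    try:
        parts = [int(s) for s in value.split("-")]
    except ValueError:
        return False
    year = parts[0]
    month = parts[1] if len(parts) > 1 else 1
    day = parts[2] if len(parts) > 2 else 1
    try:
        datetime.datetime(year=year, month=month, day=day)
        return True
    except ValueError:
        return False

print(is_valid_ymd("1923-02-28"))  # True
print(is_valid_ymd("1923-02-30"))  # False
print(is_valid_ymd("1923-xx"))     # False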
Example #19
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) Removing duplicates
    '''

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Remove whitespace, periods, parens
                clone = [re.sub("[ \.\(\)]", "", s).lower() for s in v]
                # Get index of unique values
                index = list(set([clone.index(s) for s in list(set(clone))]))
            
                setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
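The normalization-based dedup above keeps the first spelling of each value; a standalone sketch of the same idea, sorting the surviving indices so the original order is kept (the service itself does not guarantee order, since set() is unordered):

import re

values = ["United States.", "United  States", "Texas", "texas"]

clone = [re.sub(r"[ .()]", "", s).lower() for s in values]
# Index of the first occurrence of each normalized value.
index = sorted(set(clone.index(s) for s in set(clone)))
print([values[i] for i in index])  # ['United States.', 'Texas']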
Example #20
0
def location_transform_uva(d, p):
    def _get_media_type(d):
        pd = iterify(getprop(d, "physicalDescription"))
        for _dict in pd:
            try:
                return selector_getprop(_dict, "internetMediaType")
            except KeyError:
                pass

    location = iterify(getprop(d, p))
    format = _get_media_type(d)
    out = {}
    try:
        for _dict in location:
            if "url" in _dict:
                for url_dict in _dict["url"]:
                    if url_dict and "access" in url_dict:
                        if url_dict["access"] == "object in context":
                            out["isShownAt"] = url_dict.get("#text")
                        elif url_dict["access"] == "preview":
                            out["object"] = url_dict.get("#text")
                        elif url_dict["access"] == "raw object":
                            out["hasView"] = {"@id:": url_dict.get("#text"), "format": format}
            if "physicalLocation" in _dict and isinstance(_dict["physicalLocation"], basestring):
                out["dataProvider"] = _dict["physicalLocation"]
    except Exception as e:
        logger.error(e)
    finally:
        return out
Example #21
0
    def map_is_show_at_object_has_view_and_dataprovider(self):
        def _get_media_type(d):
            pd = iterify(getprop(d, "physicalDescription"))
            for _dict in pd:
                if exists(_dict, "internetMediaType"):
                    return getprop(_dict, "internetMediaType")

        prop = "location"
        if exists(self.provider_data, prop):
            location = iterify(getprop(self.provider_data, prop))
            format = _get_media_type(self.provider_data)
            out = {}
            try:
                for _dict in location:
                    if "url" in _dict:
                        for url_dict in _dict["url"]:
                            if url_dict and "access" in url_dict:
                                if url_dict["access"] == "object in context":
                                    out["isShownAt"] = url_dict.get("#text")
                                elif url_dict["access"] == "preview":
                                    out["object"] = url_dict.get("#text")
                                elif url_dict["access"] == "raw object":
                                    has_view = {"@id": url_dict.get("#text"),
                                                "format": format}
                                    out["hasView"] = has_view
                    if ("physicalLocation" in _dict and
                        isinstance(_dict["physicalLocation"], basestring)):
                        out["dataProvider"] = _dict["physicalLocation"]
            except Exception as e:
                logger.error(e)

            if out:
                self.mapped_data.update(out)
Example #22
0
    def map_date(self):
        originInfoPath = self.root_key + "originInfo"
        dateCreated = []
        dateIssued = []
        date_begin, date_end = None, None

        if exists(self.provider_data, originInfoPath):
            for date in iterify(getprop(self.provider_data, originInfoPath)):
                if "dateCreated" in date:
                    dateCreated.append(textnode(date["dateCreated"]))

                if "dateIssued" in date:
                    t = date["dateIssued"]
                    try:
                        if "point" not in t:
                            dateIssued.append(textnode(t))
                        elif "point" in t and t["point"] == "start":
                            date_begin = textnode(t)
                        elif "point" in t and t["point"] == "end":
                            date_end = textnode(t)
                    except Exception as e:
                        logger.error("Exception when trying to map date "
                                     "values. for record %s \n\n%s" %
                                     (self.provider_data % e.message))

        # If there are no dateIssued or dateCreated properties then construct
        # a date range from begin and end points (if they exist).
        if date_begin and date_end and not dateCreated and not dateIssued:
            dateIssued.append(date_begin + "-" + date_end)

        if dateCreated:
            self.update_source_resource({"date": dateCreated})
        elif dateIssued:
            self.update_source_resource({"date": dateIssued})
Example #23
0
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to return
    a single ISO 8601 date from the input string. No range checking is performed,
    and any date other than the first occurring will be ignored.

    We use timelib for its ability to make at least some sense of invalid dates,
    e.g. 2012/02/31 -> 2012/03/03

    We rely only on dateutil.parser for picking out dates from nearly arbitrary
    strings (fuzzy=True), but at the cost of being forgiving of invalid dates
    in those kinds of strings.

    Returns None if it fails
    """
    dd = dateparser.to_iso8601(d)
    if dd is None or out_of_range(d):
        try:
            dd = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
            if dd.year == DEFAULT_DATETIME.year:
                dd = None
        except Exception:
            try:
                dd = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
            except ValueError:
                pass
            except Exception as e:
                logger.error(e)

        if dd:
            ddiso = dd.isoformat()
            return ddiso[:ddiso.index('T')]

    return dd
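The DEFAULT_DATETIME trick above (parse with a sentinel default, then discard the result if the year is still the sentinel's) can be shown with dateutil alone; this sketch assumes python-dateutil is installed, defines its own sentinel, and skips the timelib fallback:

import datetime
from dateutil.parser import parse as dateutil_parse

DEFAULT_DATETIME = datetime.datetime(3000, 1, 1)

def fuzzy_iso_date(text):
    try:
        dd = dateutil_parse(text, fuzzy=True, default=DEFAULT_DATETIME)
    except (ValueError, OverflowError):
        return None
    if dd.year == DEFAULT_DATETIME.year:
        # No year was actually present in the input string.
        return None
    return dd.date().isoformat()

print(fuzzy_iso_date("Photograph taken July 4, 1923"))  # 1923-07-04
print(fuzzy_iso_date("no date"))                        # None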
Example #24
0
def set_prop(body,
             ctype,
             prop=None,
             value=None,
             condition_prop=None,
             condition_value=None):
    """Sets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to
    condition_prop -- (optional) the field that must exist to set the prop
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not value:
        logger.error("No value was supplied to set_prop.")
    else:
        # If there is no condition_prop, set the prop, creating it if it does
        #not exist. If there is a condition_prop, only set the prop if the
        # condition_prop exists.
        if not condition_prop or exists(data, condition_prop):
            setprop(data, prop, value)

    return json.dumps(data)
Example #25
0
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
        ]

    def cleanup(s):
        s = re.sub("[\(\)\.\?]", "",s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        
        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])
            

    return json.dumps(data)
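The move decision above fires only when a single regex match accounts for the whole cleaned value; a trimmed sketch with two of the patterns and made-up values:

import re

REGSEARCH = [r"\d{4}\s*[-/]\s*\d{4}", r"\d{4}s?"]

def movable_date(value):
    cleaned = re.sub(r"[().?]", "", value).strip()
    for pattern in REGSEARCH:
        matches = re.compile(pattern, re.I).findall(cleaned)
        # Move only if exactly one match was found and nothing else remains
        # once that match is removed.
        if len(matches) == 1 and not re.sub(matches[0], "", cleaned).strip():
            return matches[0]
    return None

print(movable_date("(1900-1910)"))         # 1900-1910
print(movable_date("circa 1900, Austin"))  # None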
Example #26
0
def find_file_extension(mime):
    """
    Finds out the file extension based on the MIME type from the opened
    connection.

    Implementation:
        Function is using the configuration field 'mime_to_type' stored
        at akara.conf.

    Arguments:
        mime (String)   -   MIME type read from the HTTP headers

    Returns:
        file extension (String) - extension for the file -
        WITH DOT AT THE BEGINNING!!!

    Throws:
        throws exception if it cannot find the extension
    """

    if mime in MIME_TYPES:
        ext = MIME_TYPES[mime]
        logger.debug("MIME type is [%s], returning extension [%s]" % \
                (mime, ext))
        return ext
    else:
        msg = "Cannot find extension for mime type: [%s]." % mime
        logger.error(msg)
        raise FileExtensionException(msg)
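A self-contained version of the same lookup, with an inline mapping standing in for the mime_to_type configuration read from akara.conf (both the mapping and the exception class here are illustrative):

class FileExtensionException(Exception):
    pass

MIME_TYPES = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "application/pdf": ".pdf",
}

def find_file_extension(mime):
    if mime in MIME_TYPES:
        return MIME_TYPES[mime]
    raise FileExtensionException("Cannot find extension for mime type: [%s]." % mime)

print(find_file_extension("image/jpeg"))  # .jpg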
Example #27
0
def ia_identify_object(body, ctype, download="True"):
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Example #28
0
def cleanup_value(body,
                  ctype,
                  action="cleanup_value",
                  prop=",".join(DEFAULT_PROP + DONT_STRIP_DOT_END)):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) applying a set of regexps to do data cleanup
    '''

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        convert(data, p)

    return json.dumps(data)
Example #29
0
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replaces a substring in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replace old with
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not old or not new:
        logger.error("No old or new parameters were provided")
    else:
        if exists(data, prop):
            v = getprop(data, prop)
            setprop(data, prop, v.replace(old, new))

    return json.dumps(data)
Example #30
0
def ia_identify_object(body, ctype, download="True"):

    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(getprop(data, "originalRecord/_id"), getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]", original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Example #31
0
    def map_date(self):
        originInfoPath = self.root_key + "originInfo"
        dateCreated = []
        dateIssued = []
        date_begin, date_end = None, None

        if exists(self.provider_data, originInfoPath):
            for date in iterify(getprop(self.provider_data, originInfoPath)):
                if "dateCreated" in date:
                    dateCreated.append(textnode(date["dateCreated"]))

                if "dateIssued" in date:
                    t = date["dateIssued"]
                    try:
                        if "point" not in t:
                            dateIssued.append(textnode(t))
                        elif "point" in t and t["point"] == "start":
                            date_begin = textnode(t)
                        elif "point" in t and t["point"] == "end":
                            date_end = textnode(t)
                    except Exception as e:
                        logger.error("Exception when trying to map date "
                                     "values. for record %s \n\n%s" %
                                     (self.provider_data % e.message))

        # If there are no dateIssued or dateCreated properties then construct
        # a date range from begin and end points (if they exist).
        if date_begin and date_end and not dateCreated and not dateIssued:
            dateIssued.append(date_begin + "-" + date_end)

        if dateCreated:
            self.update_source_resource({"date": dateCreated})
        elif dateIssued:
            self.update_source_resource({"date": dateIssued})
Example #32
0
def creator_and_contributor_transform(d, p):
    val = {}
    creator = []
    contributor = []

    for s in iterify(getprop(d, p)):
        name_part = s.get("namePart")
        if name_part:
            try:
                role_terms = [r.get("roleTerm") for r in
                              iterify(s.get("role"))]
            except:
                logger.error("Error getting name/role/roleTerm for record %s" %
                             d["_id"])
                continue

            if "creator" in role_terms:
                creator.append(name_part)
            elif "contributor" in role_terms:
                contributor.append(name_part)

    if creator:
        val["creator"] = creator
    if contributor:
        val["contributor"] = contributor

    return val
Example #33
0
def set_prop(body, ctype, prop=None, value=None, condition_prop=None,
             condition_value=None):
    """Sets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to
    condition_prop -- (optional) the field that must exist to set the prop
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not value:
        logger.error("No value was supplied to set_prop.")
    else:
        # If there is no condition_prop, set the prop, creating it if it does
        #not exist. If there is a condition_prop, only set the prop if the
        # condition_prop exists.
        if not condition_prop or exists(data, condition_prop):
            setprop(data, prop, value)

    return json.dumps(data)
Example #34
0
def david_rumsey_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if exists(data, handle_field):
        handle = getprop(data, handle_field)
    else:
        logger.error("Field %s does not exist" % handle_field)
        return body

    data["object"] = handle[1]

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Example #35
0
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri: continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://"
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue

        body = cont

    return error, body
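The URI-joining step in pipe() relies on a lookahead substitution to guarantee exactly one leading slash on the module path; in isolation:

import re

prefix = "http://localhost:8889"  # made-up Akara host
for path in ["enrich/select-id", "/enrich/select-id"]:
    print(prefix + re.sub(r"^(?!/)", "/", path))
# http://localhost:8889/enrich/select-id  (both cases)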
def ia_identify_object(body, ctype, download="True"):
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(getprop(data, "originalRecord/_id"), getprop(data, original_preview_key))
    except KeyError:
        logger.error("Can not build preview url by path \"%s\" for doc [%s]", original_preview_key, data[u"id"])
        return body

    data["object"] = preview_url
    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
Example #37
0
def capitalize_value(body, ctype, prop=",".join(DEFAULT_PROP), exclude=None):
    """
    Service that accepts a JSON document and capitalizes the prop field of that document
    """

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    prop = prop.split(",")
    if exclude in prop:
        prop.remove(exclude)

    for p in prop:
        if p:
            capitalize(data, p)

    return json.dumps(data)
Example #38
0
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replaces a substring in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replace old with
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not old or not new:
        logger.error("No old or new parameters were provided")
    else:
        if exists(data, prop):
            v = getprop(data, prop)
            setprop(data, prop, v.replace(old, new))

    return json.dumps(data)
Example #39
0
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = ("akara.ini")
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id
Example #40
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) Removing duplicates
    '''

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Remove whitespace, periods, parens
                clone = [re.sub("[ \.\(\)]", "", s).lower() for s in v]
                # Get index of unique values
                index = list(set([clone.index(s) for s in list(set(clone))]))

                setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
Example #41
0
def find_file_extension(mime):
    """
    Finds out the file extension based on the MIME type from the opened
    connection.

    Implementation:
        Function is using the configuration field 'mime_to_type' stored
        at akara.conf.

    Arguments:
        mime (String)   -   MIME type read from the HTTP headers

    Returns:
        file extension (String) - extension for the file -
        WITH DOT AT THE BEGINNING!!!

    Throws:
        throws exception if it cannot find the extension
    """

    if mime in MIME_TYPES:
        ext = MIME_TYPES[mime]
        logger.debug("MIME type is [%s], returning extension [%s]" % \
                (mime, ext))
        return ext
    else:
        msg = "Cannot find extension for mime type: [%s]." % mime
        logger.error(msg)
        raise FileExtensionException(msg)
Example #42
0
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri: continue # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://" 
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue

        body = cont

    return error, body
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
        ]

    def cleanup(s):
        s = re.sub("[\(\)\.\?]", "",s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        
        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])
            

    return json.dumps(data)
Example #44
0
def robust_date_parser(d):
    """
    Robust wrapper around some date parsing libs, making a best effort to return
    a single ISO 8601 date from the input string. No range checking is performed,
    and any date other than the first occurring will be ignored.

    We use timelib for its ability to make at least some sense of invalid dates,
    e.g. 2012/02/31 -> 2012/03/03

    We rely only on dateutil.parser for picking out dates from nearly arbitrary
    strings (fuzzy=True), but at the cost of being forgiving of invalid dates
    in those kinds of strings.

    Returns None if it fails
    """
    dd = dateparser.to_iso8601(d)
    if dd is None or out_of_range(d):
        try:
            dd = dateutil_parse(d, fuzzy=True, default=DEFAULT_DATETIME)
            if dd.year == DEFAULT_DATETIME.year:
                dd = None
        except Exception:
            try:
                dd = timelib.strtodatetime(d, now=DEFAULT_DATETIME_SECS)
            except ValueError:
                pass
            except Exception as e:
                logger.error(e)

        if dd:
            ddiso = dd.isoformat()
            return ddiso[:ddiso.index('T')]

    return dd
Example #45
0
def check_date_format(data, prop):
    """Checks that the begin and end dates are in the proper format"""
    date = getprop(data, prop, True)
    if date:
        for d in iterify(date):
            for k, v in d.items():
                if v and k != "displayDate":
                    try:
                        ymd = [int(s) for s in v.split("-")]
                    except:
                        err = "Invalid date.%s: non-integer in %s for %s" % \
                              (k, v, data.get("_id"))
                        logger.error(err)
                        setprop(d, k, None)
                        continue

                    year = ymd[0]
                    month = ymd[1] if len(ymd) > 1 else 1
                    day = ymd[2] if len(ymd) > 2 else 1
                    try:
                        datetime.datetime(year=year, month=month, day=day)
                    except ValueError, e:
                        logger.error("Invalid date.%s: %s for %s" %
                                     (k, e, data.get("_id")))
                        setprop(d, k, None)
Example #46
0
def convert(data, path, name, conv, path_delim="/"):
    """ Converts data using converters.

    Args:
        data (obj) : Structure changed in place.

        path (Str) : Path to the key to read from and convert value.

        name (Str) : Name of the key for writing the converted value.
                     This key is stored in the same dictionary as value read
                     from using the path argument.

        conv (dict): Dictionary used for conversion

    Returns:
        Nothing, the data argument is changed in place.

    Raises:
        Nothing

    This function is called recursively. Inspired by the setprop from selector
    module.

    On each call the path is stripped by one element.

    If the data argument is a dictionary, then data[path_part] is passed to
    the next recursive call.

    If the data is a list, then the function is called for each element.

    """
    # So we should convert now
    if not path_delim in path:
        # There is a list to convert.
        # If there is a list of dictionaries, each dictionary has to be
        # converted.
        if isinstance(data, list):
            for el in data:
                convert_last(el, path, name, conv)
        else:
            convert_last(data, path, name, conv)

        return

    # If there is deeper path, let's check it
    pp, pn = tuple(path.lstrip(path_delim).split(path_delim, 1))

    # For list: iterate over elements
    if isinstance(data, list):
        for el in data:
            convert(el, path, name, conv)
    elif isinstance(data, dict):
        if pp not in data:
            # Then just do nothing
            logger.error("Couldn't find {0} in data.".format(pp))
        else:
            convert(data[pp], pn, name, conv)
    else:
        logger.error("Got data of unknown type")
Example #47
0
def remove_pid(pid_file):
    "Remove the given filename (which should be the PID file)"
    try:
        os.remove(pid_file)
    except Exception, error:
        if not os.path.exists(pid_file):
            logger.error("Unable to remove PID file %r: %s",
                      pid_file, error)
Example #48
0
    def map_is_shown_at(self):
        # these live on calisphere. so the isShownAt is:
        # https://calisphere.org/item/<ID>
        self.mapped_data.update({"isShownAt": url_calisphere_item_base +
            self.provider_data.get('uid', None)})
        logger.error("keys in PDS:{}".format(self.provider_data.keys()))
        self.mapped_data.update({"isShownBy":
                                 self.provider_data.get('isShownBy')})
Example #49
0
    def _get_result(url):
        try:
            result = json.loads(util.decode_page(urlopen_with_retries(url)))
            return result
        except URLError, e:
            logger.error("GeoNames error, could not open URL: %s, error: %s" %
                         (url, e))
            return {}
Example #50
0
    def _get_result(url):
        try:
            result = json.loads(util.decode_page(urlopen_with_retries(url)))
            return result
        except URLError, e:
            logger.error("GeoNames error, could not open URL: %s, error: %s" %
                         (url, e))
            return {}
Example #51
0
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
       provided in request header "Pipeline-Item"
    """

    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u"Pipeline-Item","").split(",")

    records = json.loads(body)

    # Counts
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        error, enriched_record_text = pipe(record, ctype, rec_enrichments,
                                           "HTTP_PIPELINE_ITEM")
        if error:
            errors.append(error)

        enriched_record = json.loads(enriched_record_text)

        if enriched_record.get("_id", None):
            ingest_type = enriched_record.get("ingestType")
            # Item records should have sourceResource
            if (ingest_type == "item" and not
                "sourceResource" in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            logger.error("Found a record without an _id %s" % enriched_record)
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
Example #52
0
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
       provided in request header "Pipeline-Item"
    """

    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")

    records = json.loads(body)

    # Counts
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        error, enriched_record_text = pipe(record, ctype, rec_enrichments,
                                           "HTTP_PIPELINE_ITEM")
        if error:
            errors.append(error)

        enriched_record = json.loads(enriched_record_text)

        if enriched_record.get("_id", None):
            ingest_type = enriched_record.get("ingestType")
            # Item records should have sourceResource
            if (ingest_type == "item"
                    and not "sourceResource" in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            logger.error("Found a record without an _id %s" % enriched_record)
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
Example #53
0
def mdlenrichlocation(body,
                      ctype,
                      action="mdl-enrich-location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document by
    combining all spatial fields into one. Will also split out country and state on a 
    best-efforts basis.

    For primary use with MDL documents.

    Possible avenues of improvement:
      - For fields with semi-colons, permute and create multiple spatial elements 
      - Create an ordered list of "names" for the geocoder to attempt to lookup 
        as opposed to our single concatenated list:
          - Everything concatenated together 
          - Everything concatenated together up to "United States" 
          - Remove left-most elements one by one
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sp = {}
        v = getprop(data, prop)
        fields = len(v)
        if not fields:
            logger.error("Spatial is empty.")
            return json.dumps(data)
        else:
            # Concatenate all values together to form the name field
            sp["name"] = ", ".join(v)

            if (1 == fields):
                # If there is only one element present, it is a country
                sp["country"] = clean(v[0])
            elif "United States" in v:
                country_index = v.index("United States")
                sp["country"] = clean(v[country_index])

                # The prior item is almost always a state
                if (country_index > 1):
                    state = clean(v[country_index - 1])
                    if (is_state(state)):
                        sp["state"] = state

        if sp:
            sp = [sp]
            setprop(data, prop, sp)

    return json.dumps(data)
Example #54
0
def origin_info_transform(d, p):
    val = {}
    v = getprop(d, p)

    # date
    date = None
    if "dateCreated" in v:
        date = v["dateCreated"]
    if not date and getprop(v, "dateOther/keyDate", True) == "yes":
        date = getprop(v, "dateOther/#text", True)

    if isinstance(date, list):
        dd = {}
        for i in date:
            if isinstance(i, basestring):
                dd["displayDate"] = i
            elif "point" in i:
                if i["point"] == "start":
                    dd["begin"] = i["point"]
                else:
                    dd["end"] = i["point"]
            else:
                # Odd date? Log error and investigate
                logger.error("Invalid date in record %s" % d["_id"])
        date = dd if dd else None

    if date and date != "unknown":
        val["date"] = date

    # publisher
    if "publisher" in v:
        val["publisher"] = []
        pub = v["publisher"]

        di = v.get("dateIssued", None)
        di = di[0] if isinstance(di, list) else di

        # Get all placeTerms of type "text"
        terms = []
        if "place" in v:
            place = v["place"]
            for p in (place if isinstance(place, list) else [place]):
                if getprop(p, "placeTerm/type", True) == "text":
                    terms.append(getprop(p, "placeTerm/#text", True))

        for t in filter(None, terms):
            if di:
                val["publisher"].append("%s: %s, %s" % (t, pub, di))
            else:
                val["publisher"].append("%s: %s" % (t, pub))
        if len(val["publisher"]) == 1:
            val["publisher"] = val["publisher"][0]

    return val
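The publisher strings assembled at the end combine place, publisher and dateIssued; in isolation, with made-up values:

publisher = "University of Virginia Press"
date_issued = "1923"
place_terms = ["Charlottesville, Va."]

val_publisher = []
for t in filter(None, place_terms):
    if date_issued:
        val_publisher.append("%s: %s, %s" % (t, publisher, date_issued))
    else:
        val_publisher.append("%s: %s" % (t, publisher))
print(val_publisher)  # ['Charlottesville, Va.: University of Virginia Press, 1923']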