コード例 #1
0
    def map_creator_and_contributor(self):
        """Map ``name`` entries to sourceResource creator and contributor.

        Reads the entries found at ``self.root_key + "name"``. Entries for
        which ``name_from_name_part`` yields nothing are ignored. The first
        entry carrying a "creator" role becomes the creator; when no entry
        has that role the first named entry is used. All remaining names
        become contributors. Nothing is mapped when the property is missing
        or when no entry yields a name.
        """
        prop = self.root_key + "name"
        if not exists(self.provider_data, prop):
            return

        names = []
        for s in iterify(getprop(self.provider_data, prop)):
            display_name = self.name_from_name_part(
                getprop(s, "namePart", True))
            if not display_name:
                continue

            name = {
                "name": display_name,
                "type": getprop(s, "type", True),
                "roles": [],
            }
            if "role" in s:
                for r in iterify(getprop(s, "role")):
                    role = r["roleTerm"]
                    # roleTerm may be a plain value or a dict with "#text"
                    if isinstance(role, dict):
                        role = role["#text"]
                    name["roles"].append(role)
            names.append(name)

        if not names:
            # No usable name at all: previously names[0] raised IndexError.
            return

        creator_and_contributor = {}

        # Set creator
        with_creator_role = [n for n in names if "creator" in n["roles"]]
        creator = with_creator_role[0] if with_creator_role else names[0]
        names.remove(creator)
        creator_and_contributor["creator"] = creator["name"]

        # Set contributor
        contributor = [n["name"] for n in names]
        if contributor:
            creator_and_contributor["contributor"] = contributor

        self.update_source_resource(creator_and_contributor)
コード例 #2
0
def creator_and_contributor_transform(d, p):
    """Build creator/contributor values from the name entries at path p.

    Entries for which ``name_from_name_part`` yields nothing are ignored.
    The first entry carrying a "creator" role becomes ``creator``; when no
    entry has that role the first named entry is used. Remaining names are
    returned under ``contributor``.

    Returns a dict with "creator" and, when there are other names,
    "contributor". Returns an empty dict when no entry yields a name.
    """
    val = {}

    v = getprop(d, p)
    names = []
    for s in (v if isinstance(v, list) else [v]):
        display_name = name_from_name_part(getprop(s, "namePart", True))
        if not display_name:
            continue

        name = {
            "name": display_name,
            "type": getprop(s, "type", True),
            "roles": [],
        }
        if "role" in s:
            roles = getprop(s, "role")
            for r in (roles if isinstance(roles, list) else [roles]):
                role = r["roleTerm"]
                # roleTerm may be a plain value or a dict with "#text"
                if isinstance(role, dict):
                    role = role["#text"]
                name["roles"].append(role)
        names.append(name)

    if not names:
        # No usable name at all: previously names[0] raised IndexError.
        return val

    # Set creator
    with_creator_role = [n for n in names if "creator" in n["roles"]]
    creator = with_creator_role[0] if with_creator_role else names[0]
    names.remove(creator)
    val["creator"] = creator["name"]

    # Set contributor
    contributor = [n["name"] for n in names]
    if contributor:
        val["contributor"] = contributor

    return val
コード例 #3
0
def ia_identify_object(body, ctype, download="True"):
    """Add a preview URL ("object") and a download status to a document.

    body     -- JSON document as a string
    ctype    -- content type of the request; must equal HTTP_TYPE_JSON
                (compared case-insensitively)
    download -- the string "True" marks the object status as PENDING, any
                other value marks it as IGNORE

    Returns the updated document as a JSON string. Returns the error text
    (after setting the response code and content type) when the content
    type or body is invalid, and returns ``body`` unchanged when the
    preview URL cannot be built.
    """
    try:
        # Raise explicitly rather than using ``assert`` so the content-type
        # check is not stripped when Python runs with -O.
        if ctype.lower() != HTTP_TYPE_JSON:
            raise AssertionError(
                "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON))
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        # Look the id up defensively: a document without "id" must not
        # raise a second KeyError from inside this handler.
        doc_id = data.get(u"id") if isinstance(data, dict) else None
        logger.error("Can not build preview url by path \"%s\" for doc [%s]", original_preview_key, doc_id)
        return body

    data["object"] = preview_url
    status = PENDING if download == "True" else IGNORE

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
コード例 #4
0
    def map_relation(self):
        """Map ``relatedItem`` host/series titles to sourceResource relation.

        Walks the entries at ``self.root_key + "relatedItem"``, remembering
        the most recent "host" and "series" titles, and collects
        "<host>. <series>" (or just the host) strings. A single collected
        value is stored as a scalar, several as a list.
        """
        prop = self.root_key + "relatedItem"
        if not exists(self.provider_data, prop):
            return

        relation = []
        host = None
        series = None
        for item in iterify(getprop(self.provider_data, prop)):
            title = getprop(item, "titleInfo/title", True)
            if title is not None:
                item_type = item.get("type")
                if item_type == "host":
                    host = title
                elif item_type == "series":
                    series = title

            # NOTE(review): this runs once per related item, so every item
            # processed after a host has been seen appends another entry --
            # confirm whether it was meant to run once after the loop.
            if host:
                val = host
                if series:
                    val += ". " + series
                relation.append(val)

        # Collapse a one-element list to its only value
        if len(relation) == 1:
            relation = relation[0]

        if relation:
            self.update_source_resource({"relation": relation})
コード例 #5
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
    def map_spatial_and_subject_and_temporal(self):
        """Map MODS subject entries to spatial, temporal and subject.

        For every entry under "/metadata/mods/subject": cartographic
        coordinates and geographic values become spatial names, temporal
        values become temporals, and the topic/genre/occupation/title
        properties become subjects. Each non-empty collection is written
        to the source resource.
        """
        path = "/metadata/mods/subject"
        subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]
        spatials, temporals, subjects = [], [], []

        if exists(self.provider_data, path):
            entries = iterify(getprop(self.provider_data, path))
        else:
            entries = []

        for subject in entries:
            has_coords = ("cartographics" in subject and
                          "coordinates" in subject["cartographics"])
            if has_coords:
                spatials.append(
                    {"name": subject["cartographics"]["coordinates"]})

            if "geographic" in subject:
                spatials.extend(
                    {"name": textnode(g)}
                    for g in iterify(getprop(subject, "geographic")))

            if "temporal" in subject:
                temporals.extend(
                    textnode(t)
                    for t in iterify(getprop(subject, "temporal")))

            for s_path in subject_props:
                subjects.extend(iterify(getprop(subject, s_path, True)))

        # Write each collection only when something was gathered for it
        for key, collected in (("spatial", spatials),
                               ("temporal", temporals),
                               ("subject", subjects)):
            if collected:
                self.update_source_resource({key: collected})
コード例 #6
0
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value is moved when, after stripping parentheses, periods and question
    marks, exactly one date pattern match accounts for the whole value. Moved
    values are removed from ``prop``; ``prop`` is deleted when nothing is
    left. Returns the (possibly updated) document as a JSON string, or
    ``body`` unchanged when no ``prop`` is given.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
        ]
    # Compile once instead of once per value and pattern
    patterns = [re.compile(pattern, re.I) for pattern in REGSEARCH]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # Treat a scalar as a one-item list. Otherwise the length check and
        # the filtering below would work on the characters of a string.
        if not isinstance(values, list):
            values = [values]
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in values:
            c = cleanup(v)
            for pattern in patterns:
                m = pattern.findall(c)
                # Remove the match literally; feeding it back into re.sub
                # would interpret characters such as "[" or "+" as regex.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        # Only rewrite the document when at least one value was moved
        if remove:
            setprop(data, to_prop, toprop)
            remaining = [v for v in values if v not in remove]
            if remaining:
                setprop(data, prop, remaining)
            else:
                delprop(data, prop)

    return json.dumps(data)
コード例 #7
0
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and sets the "dataProvider"
    field of that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set (setSpec
       p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents

    The ``prop`` parameter is accepted but not used; the "dataProvider"
    field is always the one read and written.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    data_provider = getprop(data, "dataProvider", True)
    if getprop(data, "originalRecord/setSpec") == "p15799coll65":
        # Only index into real lists: indexing a string would keep just
        # its first character, and None or [] would raise.
        if isinstance(data_provider, list):
            data_provider = data_provider[0] if data_provider else None
        if data_provider:
            setprop(data, "dataProvider", data_provider)
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")

    return json.dumps(data)
コード例 #8
0
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value is moved when, after stripping parentheses, periods and question
    marks, exactly one date pattern match accounts for the whole value. Moved
    values are removed from ``prop``; ``prop`` is deleted when nothing is
    left. Returns the (possibly updated) document as a JSON string, or
    ``body`` unchanged when no ``prop`` is given.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
        ]
    # Compile once instead of once per value and pattern
    patterns = [re.compile(pattern, re.I) for pattern in REGSEARCH]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # Treat a scalar as a one-item list. Otherwise the length check and
        # the filtering below would work on the characters of a string.
        if not isinstance(values, list):
            values = [values]
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []

        for v in values:
            c = cleanup(v)
            for pattern in patterns:
                m = pattern.findall(c)
                # Remove the match literally; feeding it back into re.sub
                # would interpret characters such as "[" or "+" as regex.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        # Only rewrite the document when at least one value was moved
        if remove:
            setprop(data, to_prop, toprop)
            remaining = [v for v in values if v not in remove]
            if remaining:
                setprop(data, prop, remaining)
            else:
                delprop(data, prop)

    return json.dumps(data)
コード例 #9
0
def ia_identify_object(body, ctype, download="True"):
    """Add a preview URL ("object") and a download status to a document.

    body     -- JSON document as a string
    ctype    -- content type of the request (not inspected here)
    download -- the string "True" marks the object status as PENDING, any
                other value marks it as IGNORE

    Returns the updated document as a JSON string, an error message when
    the body is not valid JSON, or ``body`` unchanged when the preview URL
    cannot be built.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(
            getprop(data, "originalRecord/_id"),
            getprop(data, original_preview_key))
    except KeyError:
        # Look the id up defensively: a document without "id" must not
        # raise a second KeyError from inside this handler.
        doc_id = data.get(u"id") if isinstance(data, dict) else None
        logger.error("Can not build preview url by path \"%s\" for doc [%s]",
                     original_preview_key, doc_id)
        return body

    data["object"] = preview_url
    status = PENDING if download == "True" else IGNORE

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
コード例 #10
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def map_spatial_and_subject_and_temporal(self):
        """Map MODS subject entries to spatial, temporal and subject.

        For every entry under "/metadata/mods/subject": cartographic
        coordinates and geographic values become spatial names, temporal
        values become temporals, and the topic/genre/occupation/title
        properties become subjects. Each non-empty collection is written
        to the source resource.
        """
        path = "/metadata/mods/subject"
        subject_props = ['topic', 'genre', 'occupation', "/titleInfo/title"]
        collected = {"spatial": [], "temporal": [], "subject": []}

        entries = []
        if exists(self.provider_data, path):
            entries = iterify(getprop(self.provider_data, path))

        for subject in entries:
            if "cartographics" in subject:
                cartographics = subject["cartographics"]
                if "coordinates" in cartographics:
                    collected["spatial"].append(
                        {"name": cartographics["coordinates"]})

            if "geographic" in subject:
                for geo in iterify(getprop(subject, "geographic")):
                    collected["spatial"].append({"name": textnode(geo)})

            if "temporal" in subject:
                for period in iterify(getprop(subject, "temporal")):
                    collected["temporal"].append(textnode(period))

            for s_path in subject_props:
                collected["subject"].extend(
                    iterify(getprop(subject, s_path, True)))

        # Same write order as before: spatial, temporal, subject
        for key in ("spatial", "temporal", "subject"):
            if collected[key]:
                self.update_source_resource({key: collected[key]})
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set ``field`` (a "/"-delimited path) in ``data`` to ``value``.

    Nothing is changed when ``value`` is falsy. Otherwise ``mode`` selects
    the behavior:

    overwrite -- replace the field, creating missing parent dicts
    append    -- add ``value`` to whatever the field already holds and
                 store the result as a list
    other     -- fill blanks: set the field only when it is missing or
                 falsy, wrapping ``value`` in a list when ``multivalue``
                 is true

    Returns ``data`` (modified in place).
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode, value, multivalue))
    if value: #no value don't bother
        if mode == 'overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                # Walk the path segment by segment, creating missing
                # parents. The former split('/', 1) unpack failed for
                # single-segment fields and stored deeper paths under one
                # key containing "/".
                segments = [seg for seg in field.split('/') if seg]
                if segments:
                    node = data
                    for seg in segments[:-1]:
                        if seg not in node:
                            node[seg] = {}
                        node = node[seg]
                    node[segments[-1]] = value
        elif mode == 'append':
            new_value = []
            if exists(data, field):
                old_value = getprop(data, field)
                if isinstance(old_value, list):
                    new_value.extend(old_value)
                else:
                    new_value.append(old_value)
            if isinstance(value, list):
                new_value.extend(value)
            else:
                new_value.append(value)
            setprop(data, field, new_value)
        else: # fill blanks
            if not exists(data, field) or not getprop(data,
                    field, keyErrorAsNone=True):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data
コード例 #12
0
def creator_and_contributor_transform(d, p):
    """Build creator/contributor values from the name entries at path p.

    Entries for which ``name_from_name_part`` yields nothing are ignored.
    The first entry carrying a "creator" role becomes ``creator``; when no
    entry has that role the first named entry is used. Remaining names are
    returned under ``contributor``.

    Returns a dict with "creator" and, when there are other names,
    "contributor". Returns an empty dict when no entry yields a name.
    """
    val = {}

    v = getprop(d, p)
    names = []
    for s in (v if isinstance(v, list) else [v]):
        display_name = name_from_name_part(getprop(s, "namePart", True))
        if not display_name:
            continue

        name = {
            "name": display_name,
            "type": getprop(s, "type", True),
            "roles": [],
        }
        if "role" in s:
            roles = getprop(s, "role")
            for r in (roles if isinstance(roles, list) else [roles]):
                role = r["roleTerm"]
                # roleTerm may be a plain value or a dict with "#text"
                if isinstance(role, dict):
                    role = role["#text"]
                name["roles"].append(role)
        names.append(name)

    if not names:
        # No usable name at all: previously names[0] raised IndexError.
        return val

    # Set creator
    with_creator_role = [n for n in names if "creator" in n["roles"]]
    creator = with_creator_role[0] if with_creator_role else names[0]
    names.remove(creator)
    val["creator"] = creator["name"]

    # Set contributor
    contributor = [n["name"] for n in names]
    if contributor:
        val["contributor"] = contributor

    return val
コード例 #13
0
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """
    Service that accepts a JSON document and sets the "dataProvider"
    field of that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set (setSpec
       p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents

    The ``prop`` parameter is accepted but not used; the "dataProvider"
    field is always the one read and written.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    data_provider = getprop(data, "dataProvider", True)
    if getprop(data, "originalRecord/setSpec") == "p15799coll65":
        # Only index into real lists: indexing a string would keep just
        # its first character, and None or [] would raise.
        if isinstance(data_provider, list):
            data_provider = data_provider[0] if data_provider else None
        if data_provider:
            setprop(data, "dataProvider", data_provider)
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")

    return json.dumps(data)
コード例 #14
0
ファイル: cdl_json_mapper.py プロジェクト: mlhale7/ingestion
    def map_source_resource(self):
        """Copy provider fields into sourceResource after the base mapping.

        Each provider field listed below is read and written to its
        sourceResource property. When the property already holds values,
        the existing and new values are merged without duplicates, existing
        values first.
        """
        super(CDLJSONMapper, self).map_source_resource()
        # Ordered pairs rather than a dict so that fields sharing a target
        # (the four "rights" sources) are always merged in the same order.
        maps = (
            ("alternative_title_ss", "alternative"),
            ("contributor_ss", "contributor"),
            ("creator_ss", "creator"),
            ("date_ss", "date"),
            ("description", "description"),
            ("extent_ss", "extent"),
            ("format_ss", "format"),
            ("genre_ss", "hasType"),
            ("identifier_ss", "identifier"),
            ("language_ss", "language"),
            ("coverage_ss", "spatial"),
            ("publisher_ss", "publisher"),
            ("relation_ss", "relation"),
            ("rights_ss", "rights"),
            ("rights_note_ss", "rights"),
            ("rights_date_ss", "rights"),
            ("rightsholder_ss", "rights"),
            ("subject_ss", "subject"),
            ("temporal_ss", "temporal"),
            ("title_ss", "title"),
            ("type_ss", "type"),
        )

        for source, dest in maps:
            values = \
                iterify(getprop(self.provider_data, source, True))
            if values:
                existing_values = \
                    getprop(self.mapped_data["sourceResource"], dest, True)
                if existing_values:
                    # Order-preserving merge. list(set(...)) shuffled the
                    # values and failed on unhashable items, and list +
                    # scalar raised when the existing value was not a list.
                    merged = []
                    for item in list(iterify(existing_values)) + list(values):
                        if item not in merged:
                            merged.append(item)
                    values = merged
                self.update_source_resource({dest: values})
コード例 #15
0
ファイル: bhl_mods.py プロジェクト: dpla/ingestion
    def map_creator_and_contributor(self):
        """Map ``name`` entries to creator and contributor by role.

        Reads the entries at ``self.root_key + "name"`` and skips those for
        which ``name_from_name_part`` yields nothing. Names with a
        "creator" role go to creator, names with a "contributor" role go
        to contributor; the source resource is updated with whichever of
        the two is non-empty.
        """
        prop = self.root_key + "name"
        if not exists(self.provider_data, prop):
            return

        names = []
        for s in iterify(getprop(self.provider_data, prop)):
            display_name = self.name_from_name_part(
                getprop(s, "namePart", True))
            if not display_name:
                continue

            entry = {
                "name": display_name,
                "type": getprop(s, "type", True),
                "roles": [],
            }
            if "role" in s:
                for r in iterify(getprop(s, "role")):
                    role = r["roleTerm"]
                    # roleTerm may be a plain value or a dict with "#text"
                    if isinstance(role, dict):
                        role = role["#text"]
                    entry["roles"].append(role)
            names.append(entry)

        creator_and_contributor = {}
        # Same selection rule for both targets: names carrying that role
        for target in ("creator", "contributor"):
            matched = [n["name"] for n in names if target in n["roles"]]
            if matched:
                creator_and_contributor[target] = matched

        self.update_source_resource(creator_and_contributor)
コード例 #16
0
def ia_identify_object(body, ctype, download="True"):
    """Add a preview URL ("object") and a download status to a document.

    body     -- JSON document as a string
    ctype    -- content type of the request (not inspected here)
    download -- the string "True" marks the object status as PENDING, any
                other value marks it as IGNORE

    Returns the updated document as a JSON string, an error message when
    the body is not valid JSON, or ``body`` unchanged when the preview URL
    cannot be built.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    original_preview_key = "originalRecord/files/gif"
    preview_format = "http://www.archive.org/download/{0}/{1}"

    try:
        preview_url = preview_format.format(getprop(data, "originalRecord/_id"), getprop(data, original_preview_key))
    except KeyError:
        # Look the id up defensively: a document without "id" must not
        # raise a second KeyError from inside this handler.
        doc_id = data.get(u"id") if isinstance(data, dict) else None
        logger.error("Can not build preview url by path \"%s\" for doc [%s]", original_preview_key, doc_id)
        return body

    data["object"] = preview_url
    status = PENDING if download == "True" else IGNORE

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
コード例 #17
0
 def map_format(self):
     """Set sourceResource format from "medium", else from "format"."""
     # First provider field that exists wins; "medium" takes priority
     for source in ("medium", "format"):
         if exists(self.provider_data, source):
             value = getprop(self.provider_data, source)
             self.update_source_resource({"format": value})
             break
コード例 #18
0
    def map_spatial(self):
        """Map spatial from hierarchicalGeographic, else from place terms.

        The last subject entry carrying "hierarchicalGeographic" is used:
        a "name" built from its city, county, state and country is stored
        on that dict, which becomes the single spatial value. When no such
        entry exists, the "#text" of every "originInfo/place" place term
        whose type is present and not "code" is collected instead.
        """
        spatial = []

        if exists(self.provider_data, "subject"):
            for s in iterify(getprop(self.provider_data, "subject")):
                if "hierarchicalGeographic" not in s:
                    continue
                geo = s["hierarchicalGeographic"]
                parts = [geo.get(key)
                         for key in ("city", "county", "state", "country")]
                # The name is written onto the provider's own dict
                geo["name"] = ", ".join(part for part in parts if part)
                spatial = [geo]

        place_prop = "originInfo/place"
        if not spatial and exists(self.provider_data, place_prop):
            for s in iterify(getprop(self.provider_data, place_prop)):
                if "placeTerm" not in s:
                    continue
                for place in iterify(s["placeTerm"]):
                    if "type" in place and place["type"] != "code":
                        spatial.append(place["#text"])

        if spatial:
            self.update_source_resource({"spatial": spatial})
コード例 #19
0
ファイル: cdl_json_mapper.py プロジェクト: dpla/ingestion
    def map_source_resource(self):
        """Copy provider fields into sourceResource after the base mapping.

        Each provider field listed below is read and written to its
        sourceResource property. When the property already holds values,
        the existing and new values are merged without duplicates, existing
        values first.
        """
        super(CDLJSONMapper, self).map_source_resource()
        # Ordered pairs rather than a dict so that fields sharing a target
        # (the four "rights" sources) are always merged in the same order.
        maps = (
            ("alternative_title_ss", "alternative"),
            ("contributor_ss", "contributor"),
            ("creator_ss", "creator"),
            ("date_ss", "date"),
            ("description", "description"),
            ("extent_ss", "extent"),
            ("format_ss", "format"),
            ("genre_ss", "hasType"),
            ("identifier_ss", "identifier"),
            ("language_ss", "language"),
            ("coverage_ss", "spatial"),
            ("publisher_ss", "publisher"),
            ("relation_ss", "relation"),
            ("rights_ss", "rights"),
            ("rights_note_ss", "rights"),
            ("rights_date_ss", "rights"),
            ("rightsholder_ss", "rights"),
            ("subject_ss", "subject"),
            ("temporal_ss", "temporal"),
            ("title_ss", "title"),
            ("type_ss", "type"),
        )

        for source, dest in maps:
            values = \
                iterify(getprop(self.provider_data, source, True))
            if values:
                existing_values = \
                    getprop(self.mapped_data["sourceResource"], dest, True)
                if existing_values:
                    # Order-preserving merge. list(set(...)) shuffled the
                    # values and failed on unhashable items, and list +
                    # scalar raised when the existing value was not a list.
                    merged = []
                    for item in list(iterify(existing_values)) + list(values):
                        if item not in merged:
                            merged.append(item)
                    values = merged
                self.update_source_resource({dest: values})
コード例 #20
0
ファイル: nypl_mapper.py プロジェクト: dpla/ingestion
    def map_subject(self):
        """Map provider "subject" and "genre" values to sourceResource subject.

        From each subject entry the topic (a string, or the "#text" of a
        dict) and the "name/namePart" value are collected; from each genre
        the string itself or the "#text" of a dict. Values of any other
        type are logged as errors and skipped.
        """
        # Mapped from subject and genre
        #
        # Per discussion with Amy on 10 April 2014, don't worry about
        # checking whether heading maps to authority file. Amy simplified the
        # crosswalk.
        #
        # TODO: When present, we should probably pull in the valueURI and
        # authority values into the sourceResource.subject - this would
        # represent an index/API change, however.
        subject = []

        if exists(self.provider_data, "subject"):
            for v in iterify(getprop(self.provider_data, "subject")):
                # Require a dict: on a string, "in" is a substring test and
                # v["topic"] would raise.
                if isinstance(v, dict) and "topic" in v:
                    topic = v["topic"]
                    # Test the topic itself. The old check tested the
                    # enclosing entry, so string topics were never mapped
                    # and were logged as errors instead.
                    if isinstance(topic, basestring):
                        subject.append(topic)
                    elif isinstance(topic, dict):
                        subject.append(topic.get("#text"))
                    else:
                        logger.error("Topic is not a string nor a dict; %s" % self.provider_data["_id"])
                if exists(v, "name/namePart"):
                    subject.append(getprop(v, "name/namePart"))

        if exists(self.provider_data, "genre"):
            for v in iterify(getprop(self.provider_data, "genre")):
                if isinstance(v, basestring):
                    subject.append(v)
                elif isinstance(v, dict):
                    subject.append(v.get("#text"))
                else:
                    logger.error("Genre is not a string nor a dict; %s" % self.provider_data["_id"])

        if subject:
            self.update_source_resource({"subject": subject})
コード例 #21
0
ファイル: oai_mods_mapper.py プロジェクト: dpla/ingestion
    def map_subject_spatial_and_temporal(self, geographic_subject=True):
        """Map subject entries to subject, spatial and temporal values.

        For each entry at ``self.root_key + "subject"`` the name, topics
        and (when ``geographic_subject`` is true) geographic values are
        joined with "--" into one subject string. Geographic values,
        trimmed hierarchicalGeographic dicts, their country and any
        cartographic coordinates are collected as spatial; temporal values
        as temporal. Only non-empty collections are written to the source
        resource.
        """
        prop = self.root_key + "subject"

        if exists(self.provider_data, prop):
            ret_dict = {
                "subject": [],
                "spatial": [],
                "temporal": []
            }
            geo_keys = ("city", "county", "state", "country", "coordinates")
            for s in iterify(getprop(self.provider_data, prop)):
                subject = []
                if "name" in s:
                    namepart = getprop(s, "name/namePart", True)
                    name = self.name_from_name_part(namepart)
                    if name and name not in subject:
                        subject.append(name)

                if "topic" in s:
                    for t in iterify(s["topic"]):
                        if t and t not in subject:
                            subject.append(t)

                if "geographic" in s:
                    for g in iterify(s["geographic"]):
                        if g:
                            if geographic_subject and g not in subject:
                                subject.append(g)
                            if g not in ret_dict["spatial"]:
                                ret_dict["spatial"].append(g)

                if "hierarchicalGeographic" in s:
                    for h in iterify(s["hierarchicalGeographic"]):
                        if isinstance(h, dict):
                            # Iterate over a snapshot of the keys: deleting
                            # while iterating a live key view raises
                            # RuntimeError on Python 3.
                            for k in list(h):
                                if k not in geo_keys:
                                    del h[k]
                            if h not in ret_dict["spatial"]:
                                ret_dict["spatial"].append(h)
                            if "country" in h:
                                ret_dict["spatial"].append(h["country"])

                coords = getprop(s, "cartographics/coordinates", True)
                if coords and coords not in ret_dict["spatial"]:
                    ret_dict["spatial"].append(coords)

                if "temporal" in s:
                    ret_dict["temporal"].append(s["temporal"])

                # Skip entries that produced no subject parts; joining an
                # empty list would add an empty-string subject.
                if subject:
                    ret_dict["subject"].append("--".join(subject))

            # Drop empty collections without mutating during iteration
            ret_dict = dict((k, v) for (k, v) in ret_dict.items() if v)

            self.update_source_resource(ret_dict)
コード例 #22
0
def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Parameter "provider" selects the provider-specific transformer tables and
    the extra steps below ("BPL" and "HARVARD" are the values handled here).

    Returns the converted document as a JSON string, or an error message
    (with response code 500) when the body cannot be parsed as JSON.
    """

    try :
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type","text/plain")
        return "Unable to parse body as JSON"

    # Stored in a module-level global rather than passed to the transformers
    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from original document
    # NOTE: the "common" entries are passed as keyword arguments, which
    # dict.update applies last, so they take precedence over provider
    # entries that use the same key.
    transformer_pipeline = {}
    transformer_pipeline.update(CHO_TRANSFORMER.get(provider, {}),
                                **CHO_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out["sourceResource"].update(transformer_pipeline[p](data, p))
    # Same scheme for the aggregation-level fields, written to the top
    # level of the output document
    transformer_pipeline = {}
    transformer_pipeline.update(AGGREGATION_TRANSFORMER.get(provider, {}),
                                **AGGREGATION_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out.update(transformer_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL
    if provider == "BPL":
        try:
            ipo = getprop(out, "dataProvider") + ". " + \
                  getprop(out, "sourceResource/isPartOf")
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except:
            # Best effort: any failure here leaves isPartOf as it was
            pass

    # Strip out keys with None/null values?
    # (this drops every top-level key whose value is falsy, not only None)
    out = dict((k,v) for (k,v) in out.items() if v)

    return json.dumps(out)
コード例 #23
0
 def update_relation(self):
     """Prefix sourceResource relation with the dataProvider value.

     Best effort: when the two values cannot be read or joined, the
     relation is left unchanged.
     """
     # Join dataProvider with relation
     try:
         relation = getprop(self.mapped_data, "dataProvider") + ". " + \
                    getprop(self.mapped_data, "sourceResource/relation")
         self.update_source_resource({"relation":
                                       relation.replace("..", ".").strip()})
     except Exception:
         # Still best effort, but no longer swallows SystemExit or
         # KeyboardInterrupt as the bare except did.
         pass
コード例 #24
0
 def map_date(self):
     """Set sourceResource date from "date", else from "created"."""
     # First provider field that exists wins; "date" takes priority
     for source in ("date", "created"):
         if exists(self.provider_data, source):
             value = getprop(self.provider_data, source)
             self.update_source_resource({"date": value})
             break
コード例 #25
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
 def map_object(self):
     """Set mapped_data "object" from location URLs with access "preview".

     Every matching URL under "/metadata/mods/location" is applied in
     turn, so the last match is the one that remains.
     """
     path = "/metadata/mods/location"
     if not exists(self.provider_data, path):
         return

     for locations in iterify(getprop(self.provider_data, path)):
         if not exists(locations, "url"):
             continue
         for url in iterify(getprop(locations, "url")):
             if not exists(url, "access"):
                 continue
             if url["access"].lower() == "preview":
                 self.mapped_data.update({"object": textnode(url)})
コード例 #26
0
    def map_subject_spatial_and_temporal(self, geographic_subject=True):
        """Map subject entries to subject, spatial and temporal values.

        For each entry at ``self.root_key + "subject"`` the name, topics
        and (when ``geographic_subject`` is true) geographic values are
        joined with "--" into one subject string. Geographic values,
        trimmed hierarchicalGeographic dicts, their country and any
        cartographic coordinates are collected as spatial; temporal values
        as temporal. Only non-empty collections are written to the source
        resource.
        """
        prop = self.root_key + "subject"

        if exists(self.provider_data, prop):
            ret_dict = {"subject": [], "spatial": [], "temporal": []}
            geo_keys = ("city", "county", "state", "country", "coordinates")
            for s in iterify(getprop(self.provider_data, prop)):
                subject = []
                if "name" in s:
                    namepart = getprop(s, "name/namePart", True)
                    name = self.name_from_name_part(namepart)
                    if name and name not in subject:
                        subject.append(name)

                if "topic" in s:
                    for t in iterify(s["topic"]):
                        if t and t not in subject:
                            subject.append(t)

                if "geographic" in s:
                    for g in iterify(s["geographic"]):
                        if g:
                            if geographic_subject and g not in subject:
                                subject.append(g)
                            if g not in ret_dict["spatial"]:
                                ret_dict["spatial"].append(g)

                if "hierarchicalGeographic" in s:
                    for h in iterify(s["hierarchicalGeographic"]):
                        if isinstance(h, dict):
                            # Iterate over a snapshot of the keys: deleting
                            # while iterating a live key view raises
                            # RuntimeError on Python 3.
                            for k in list(h):
                                if k not in geo_keys:
                                    del h[k]
                            if h not in ret_dict["spatial"]:
                                ret_dict["spatial"].append(h)
                            if "country" in h:
                                ret_dict["spatial"].append(h["country"])

                coords = getprop(s, "cartographics/coordinates", True)
                if coords and coords not in ret_dict["spatial"]:
                    ret_dict["spatial"].append(coords)

                if "temporal" in s:
                    ret_dict["temporal"].append(s["temporal"])

                # Skip entries that produced no subject parts; joining an
                # empty list would add an empty-string subject.
                if subject:
                    ret_dict["subject"].append("--".join(subject))

            # Drop empty collections without mutating during iteration
            ret_dict = dict((k, v) for (k, v) in ret_dict.items() if v)

            self.update_source_resource(ret_dict)
コード例 #27
0
ファイル: save_avro.py プロジェクト: mlhale7/ingestion
def get_enrich_dir(ingestion_document_id):
    """Return the enrich data directory recorded on an ingestion document.

    The document is read from the dashboard database and its
    "enrich_process/data_dir" property is returned.

    Raises:
        AssertionError: if "enrich_process/status" is not "complete".
    """
    ingestion_doc = Couch().dashboard_db[ingestion_document_id]
    enrich_status = getprop(ingestion_doc, "enrich_process/status")

    if enrich_status == "complete":
        return getprop(ingestion_doc, "enrich_process/data_dir")

    raise AssertionError(
        "Cannot save Avro files, enrich process did not complete")
コード例 #28
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
 def map_object(self):
     """Map the preview URL found under mods/location to "object".

     Every url entry of every location is scanned; each entry whose
     "access" value is "preview" (case-insensitive) is written to
     mapped_data["object"], so the last match is the one that remains.
     """
     location_path = "/metadata/mods/location"
     if not exists(self.provider_data, location_path):
         return

     for location in iterify(getprop(self.provider_data, location_path)):
         if not exists(location, "url"):
             continue
         for url_node in iterify(getprop(location, "url")):
             if not exists(url_node, "access"):
                 continue
             if url_node["access"].lower() == "preview":
                 self.mapped_data.update({"object": textnode(url_node)})
コード例 #29
0
    def _get_media_type(d):
        """Return the first "internetMediaType" under physicalDescription.

        Falsy physicalDescription entries are skipped, as are entries for
        which the lookup raises KeyError.  Returns None when no entry
        yields a value.
        """
        descriptions = iterify(getprop(d, "physicalDescription", True))
        for description in descriptions:
            if not description:
                continue
            try:
                media_type = getprop(description, "internetMediaType")
            except KeyError:
                # This entry has no media type; try the next one
                continue
            return media_type

        return None
コード例 #30
0
def download_preview(body, ctype):
    """
    Responsible for:  downloading a preview for a document
    Usage: as a module in separate pipeline, to be run on existing
    documents in the repository to download the thumbnails.

    Arguments:
    body -- the JSON document, as a string
    ctype -- the content type of the request (not used by this function)

    Returns:
    - a plain-text error message (response code 500) when body cannot be
      parsed as JSON;
    - body unchanged when "admin/object_status" is "error" or "downloaded";
    - json.dumps(set_error(data)) when one of the required properties
      ("admin/object_status", "object/@id", "id") is missing;
    - otherwise json.dumps() of the document returned by update_document().
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed here.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check the "admin/object_status" field
    try:
        status = getprop(data, "admin/object_status")
    except KeyError as e:
        logger.error(e.args[0])
        return json.dumps(set_error(data))

    if status in ["error", "downloaded"]:
        logger.debug("Status is %s, doing nothing" % status)
        return body

    # Thumbnail URL
    try:
        url = getprop(data, "object/@id")
    except KeyError as e:
        logger.error(e.args[0])
        return json.dumps(set_error(data))

    # Document ID
    try:
        doc_id = getprop(data, "id")
    except KeyError as e:
        logger.error(e.args[0])
        return json.dumps(set_error(data))

    # The download flag is only set for documents still marked "pending"
    download = (status == "pending")

    (relative_fname, mime, status) = download_image(url, doc_id, download)

    if not relative_fname:
        logger.error("Cannot save thumbnail from: %s." % (url))

    # The document is updated with whatever download_image() reported,
    # including the case logged just above.
    doc = update_document(data, relative_fname, mime, status)
    return json.dumps(doc)
コード例 #31
0
ファイル: montana_mapper.py プロジェクト: mlhale7/ingestion
    def map_date(self):
        """Map <mods:originInfo><mods:dateCreated> values to "date".

        textnode() is applied to every dateCreated entry of every
        originInfo entry; the resulting list is stored on the source
        resource only when it is non-empty.
        """
        origin_infos = iterify(
            getprop(self.provider_data, self.root_key + "originInfo", True))
        created = [
            textnode(date_node)
            for origin_info in origin_infos
            for date_node in iterify(
                getprop(origin_info, "dateCreated", True))
        ]
        if created:
            self.update_source_resource({"date": created})
コード例 #32
0
ファイル: montana_mapper.py プロジェクト: dpla/ingestion
    def map_date(self):
        """Map <mods:originInfo><mods:dateCreated> values to "date".

        textnode() is applied to every dateCreated entry of every
        originInfo entry; the resulting list is stored on the source
        resource only when it is non-empty.
        """
        origin_path = self.root_key + "originInfo"
        created = [
            textnode(date_node)
            for origin_info in iterify(
                getprop(self.provider_data, origin_path, True))
            for date_node in iterify(
                getprop(origin_info, "dateCreated", True))
        ]
        if not created:
            return
        self.update_source_resource({"date": created})
コード例 #33
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
    def map_title(self):
        """Map mods/titleInfo titles to "title".

        Only titleInfo entries that have a "title" and no "title/type" are
        used.  Nothing is stored when titleInfo is absent or no entry
        qualifies.
        """
        title_info_path = "/metadata/mods/titleInfo"
        if not exists(self.provider_data, title_info_path):
            return

        titles = [
            textnode(getprop(info, "title"))
            for info in iterify(getprop(self.provider_data, title_info_path))
            if exists(info, "title") and not exists(info, "title/type")
        ]
        if titles:
            self.update_source_resource({"title": titles})
コード例 #34
0
ファイル: montana_mapper.py プロジェクト: dpla/ingestion
    def map_spatial(self):
        """Map <mods:subject><mods:geographic> values to "spatial".

        textnode() is applied to every geographic entry of every subject
        entry; the list is stored only when it is non-empty.
        """
        subjects = iterify(
            getprop(self.provider_data, self.root_key + "subject", True))
        places = [
            textnode(place)
            for subject in subjects
            for place in iterify(getprop(subject, "geographic", True))
        ]
        if places:
            self.update_source_resource({"spatial": places})
コード例 #35
0
 def map_extent(self):
     """Map the distinct physicalDescription extents to "extent".

     Values are collected in a set, so duplicates collapse and the order
     of the stored list is not tied to the order in the source record.
     """
     descriptions = iterify(
         getprop(self.provider_data, "physicalDescription", True))
     unique_extents = set()
     for description in descriptions:
         if not exists(description, "extent"):
             continue
         unique_extents.update(
             iterify(getprop(description, "extent", True)))
     if unique_extents:
         self.update_source_resource({"extent": list(unique_extents)})
コード例 #36
0
        def _get_media_type():
            """Return "internetMediaType" from the record's physicalDescription.

            Falsy physicalDescription entries are skipped.  Returns None
            when there is no usable entry.
            """
            description_path = self.root_key + "physicalDescription"
            descriptions = iterify(
                getprop(self.provider_data, description_path, True))
            for description in descriptions:
                if not description:
                    continue
                # NOTE(review): if the third getprop() argument suppresses
                # KeyError, this handler never fires and the first non-empty
                # entry always decides the result -- confirm.
                try:
                    return getprop(description, "internetMediaType", True)
                except KeyError:
                    continue

            return None
コード例 #37
0
 def map_rights(self):
     """Set "rights" according to the record's "header/setSpec".

     The 'cna' set uses the record's own accessCondition; the 'eda' set
     and every other set get a fixed statement.
     """
     set_spec = getprop(self.provider_data, "header/setSpec", True)

     if set_spec == 'cna':
         access_path = self.root_key + 'accessCondition'
         rights = getprop(self.provider_data, access_path, True)
     elif set_spec == 'eda':
         rights = 'CC BY-NC-ND 3.0 http://www.edickenson.org/terms'
     else:
         rights = 'Held in the collections of Harvard University.'

     self.update_source_resource({'rights': rights})
コード例 #38
0
ファイル: montana_mapper.py プロジェクト: mlhale7/ingestion
    def map_spatial(self):
        """Map <mods:subject><mods:geographic> values to "spatial".

        textnode() is applied to every geographic entry of every subject
        entry; the list is stored only when it is non-empty.
        """
        subject_path = self.root_key + "subject"
        places = [
            textnode(place)
            for subject in iterify(
                getprop(self.provider_data, subject_path, True))
            for place in iterify(getprop(subject, "geographic", True))
        ]
        if not places:
            return
        self.update_source_resource({"spatial": places})
コード例 #39
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
 def map_is_shown_at(self):
     """Map the primary "object in context" URL to "isShownAt".

     A url entry qualifies when it has both "usage" and "access", its
     usage starts with "primary" and its access equals
     "object in context" (both compared case-insensitively).  Each
     qualifying entry overwrites mapped_data["isShownAt"].
     """
     location_path = "/metadata/mods/location"
     if not exists(self.provider_data, location_path):
         return

     for location in iterify(getprop(self.provider_data, location_path)):
         if not exists(location, "url"):
             continue
         for url_node in iterify(getprop(location, "url")):
             if not (exists(url_node, "usage")
                     and exists(url_node, "access")):
                 continue
             if not url_node["usage"].lower().startswith("primary"):
                 continue
             if url_node["access"].lower() == "object in context":
                 self.mapped_data.update(
                     {"isShownAt": textnode(url_node)})
コード例 #40
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def map_title(self):
        """Map mods/titleInfo titles to "title".

        Only titleInfo entries that have a "title" and no "title/type" are
        used.  Nothing is stored when titleInfo is absent or no entry
        qualifies.
        """
        title_info_path = "/metadata/mods/titleInfo"
        if not exists(self.provider_data, title_info_path):
            return

        collected = []
        for info in iterify(getprop(self.provider_data, title_info_path)):
            if not exists(info, "title"):
                continue
            if exists(info, "title/type"):
                continue
            collected.append(textnode(getprop(info, "title")))

        if collected:
            self.update_source_resource({"title": collected})
コード例 #41
0
ファイル: enrich-format.py プロジェクト: eldios/ingestion
def enrichformat(body, ctype, action="enrich-format",
                 prop="isShownAt/format",
                 alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) setting the format to be all lowercase
    b) running through a set of cleanup regexes (e.g. image/jpg -> image/jpeg)
    c) checking to see if the field is a valid IMT, and moving it to a
       separate field if not.  See http://www.iana.org/assignments/media-types
       for the list of valid media types. We do not require that a subtype be
       defined.
    d) removing any extra text after the IMT

    By default works on the 'isShownAt/format' field, but this can be
    overridden by passing the name of the field to use as the 'prop'
    parameter. Non-IMTs are moved to the field defined by the 'alternate'
    parameter.

    Returns the (possibly modified) document as a JSON string, or a
    plain-text error message with response code 500 when body cannot be
    parsed as JSON.
    """

    # (pattern, replacement) pairs, applied in this order by cleanup()
    REGEXPS = (
        ('image/jpg', 'image/jpeg'),
        ('image/jp$', 'image/jpeg'),
        ('img/jpg', 'image/jpeg'),
        (r'\W$', ''),
    )
    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']
    # Compiled once per request rather than once per candidate value
    imt_regexes = [re.compile('^' + x + r'(/|\Z)') for x in IMT_TYPES]

    def cleanup(s):
        """Lowercase and strip s, then normalise it with REGEXPS."""
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            # Trailing text is trimmed after every substitution (not once
            # at the end) so that later "$"-anchored patterns see the
            # trimmed value.
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        """Tell whether s starts with one of the top-level IMT types."""
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        formats = []
        physical_format = getprop(data, alternate) \
            if exists(data, alternate) else []
        if not isinstance(physical_format, list):
            physical_format = [physical_format]

        for s in (v if not isinstance(v, basestring) else [v]):
            cleaned = cleanup(s)
            if is_imt(cleaned):
                formats.append(cleaned)
            else:
                # Non-IMT values are moved over untouched
                physical_format.append(s)

        # A single value is stored bare, several values as a list
        if formats:
            setprop(data, prop,
                    formats[0] if len(formats) == 1 else formats)
        else:
            setprop(data, prop, None)
        if physical_format:
            setprop(data, alternate,
                    physical_format[0] if len(physical_format) == 1
                    else physical_format)

    return json.dumps(data)
コード例 #42
0
def is_part_of_transform(d, p):
    """Build the "isPartOf" value from the item(s) stored at prop p of d.

    Every item whose "type" is "series" contributes its "titleInfo/title";
    empty titles are dropped.

    Returns {"isPartOf": title} for a single title, {"isPartOf": [titles]}
    for several, and {} when there is none.
    """
    v = getprop(d, p)
    items = v if isinstance(v, list) else [v]

    # The type check has to be made on each item: when v is a list, testing
    # the list itself never matches the dicts it contains.
    titles = [getprop(s, "titleInfo/title", True)
              for s in items
              if isinstance(s, dict) and s.get("type") == "series"]

    # A list (not filter()) so that len() also works where filter() is lazy
    ipo = [title for title in titles if title]
    ipo = ipo[0] if len(ipo) == 1 else ipo

    return {"isPartOf": ipo} if ipo else {}
コード例 #43
0
def origin_info_transform(d, p):
    """Derive "date" and "publisher" from the value stored at prop p of d.

    date:
      Taken from "dateCreated", falling back to "dateOther/#text" when
      "dateOther/keyDate" is "yes".  A list of dates is folded into a dict:
      plain strings become "displayDate", entries carrying a "point"
      attribute become "begin" (point == "start") or "end" (any other
      point).  Entries of neither kind are logged as invalid.  The date is
      omitted when empty or equal to "unknown".

    publisher:
      Present only when the value has a "publisher".  One string is built
      per place whose placeTerm type is "text", formatted as
      "<place>: <publisher>" with ", <dateIssued>" appended when available.
      A single string is stored bare, otherwise the list is stored.

    Returns a dict holding whichever of "date" / "publisher" were produced.
    """
    val = {}
    v = getprop(d, p)

    # date
    date = None
    if "dateCreated" in v:
        date = v["dateCreated"]
    if not date and getprop(v, "dateOther/keyDate", True) == "yes":
        date = getprop(v, "dateOther/#text", True)

    if isinstance(date, list):
        dd = {}
        for i in date:
            if isinstance(i, basestring):
                dd["displayDate"] = i
            elif "point" in i:
                # Store the date text, not the "point" attribute itself
                # (which only ever says "start" or "end").
                key = "begin" if i["point"] == "start" else "end"
                dd[key] = getprop(i, "#text", True)
            else:
                # Odd date? Log error and investigate
                logger.error("Invalid date in record %s" % d["_id"])
        date = dd if dd else None

    if date and date != "unknown":
        val["date"] = date

    # publisher
    if "publisher" in v:
        val["publisher"] = []
        pub = v["publisher"]

        # Only the first dateIssued is used
        di = v.get("dateIssued", None)
        di = di[0] if isinstance(di, list) else di

        # Get all placeTerms of type "text"
        terms = []
        if "place" in v:
            place = v["place"]
            # Named place_item so the parameter p is not clobbered
            for place_item in (place if isinstance(place, list)
                               else [place]):
                if getprop(place_item, "placeTerm/type", True) == "text":
                    terms.append(
                        getprop(place_item, "placeTerm/#text", True))

        for t in terms:
            if not t:
                continue
            if di:
                val["publisher"].append("%s: %s, %s" % (t, pub, di))
            else:
                val["publisher"].append("%s: %s" % (t, pub))
        if len(val["publisher"]) == 1:
            val["publisher"] = val["publisher"][0]

    return val
コード例 #44
0
    def map_is_part_of(self):
        """Map relatedItem titles to "relation".

        The "titleInfo/title" of every relatedItem that has one is
        collected, and the result is passed through clean_dict() before
        being stored.  Nothing is stored when relatedItem is absent.
        """
        related_path = self.root_key + "relatedItem"
        if not exists(self.provider_data, related_path):
            return

        title_path = "titleInfo/title"
        relations = [
            getprop(item, title_path)
            for item in iterify(getprop(self.provider_data, related_path))
            if exists(item, title_path)
        ]
        self.update_source_resource(
            self.clean_dict({"relation": relations}))
コード例 #45
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
    def map_rights(self):
        """Map "local rights statement" accessConditions to "rights".

        textnode() is applied to every mods/accessCondition entry whose
        "type" is "local rights statement".  Nothing is stored when
        accessCondition is absent or no entry qualifies.
        """
        access_path = "/metadata/mods/accessCondition"
        if not exists(self.provider_data, access_path):
            return

        statements = [
            textnode(condition)
            for condition in iterify(getprop(self.provider_data, access_path))
            if getprop(condition, "type", True) == "local rights statement"
        ]
        if statements:
            self.update_source_resource({"rights": statements})
コード例 #46
0
 def map_language(self):
     """Map the distinct language terms to "language".

     self.txt() is applied to every languageTerm of every language entry;
     falsy results are dropped and duplicates collapse through a set, so
     the order of the stored list is not tied to the source order.
     """
     term_texts = (
         self.txt(term)
         for entry in iterify(getprop(self.provider_data, "language", True))
         for term in iterify(getprop(entry, "languageTerm", True))
     )
     found = set(text for text in term_texts if text)
     if found:
         self.update_source_resource({"language": list(found)})
コード例 #47
0
def copyprop(body, ctype, prop=None, to_prop=None, skip_if_exists=None):
    """Copies value in one prop to another prop. For use with string and/or
       list prop value types. If to_prop exists, its value is iterified then
       extended with the iterified value of prop. If the to_prop parent prop
       (ie hasView in hasView/rights) does not exist, the from_prop value is
       not copied and an error is logged.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    skip_if_exists -- set to True to not copy if to_prop exists

    Returns the document as a JSON string.  When either value is neither a
    string nor a list, the error is logged and body is returned unchanged.
    When body cannot be parsed as JSON, a plain-text message is returned
    with response code 500.
    """

    def is_string_or_list(value):
        return isinstance(value, (basestring, list))

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed here.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Nothing is copied when to_prop exists and the caller asked to skip,
    # or when there is no source value.
    if not (exists(data, to_prop) and skip_if_exists) and exists(data, prop):
        if exists(data, to_prop):
            from_value = getprop(data, prop)
            if not is_string_or_list(from_value):
                msg = "Prop %s " % prop + \
                      "is not a string/list for record %s" % data["id"]
                logger.error(msg)
                return body

            to_value = getprop(data, to_prop)
            if not is_string_or_list(to_value):
                msg = "Prop %s " % to_prop + \
                      "is not a string/list for record %s" % data["id"]
                logger.error(msg)
                return body

            to_value = iterify(to_value)
            to_value.extend(iterify(from_value))
            setprop(data, to_prop, to_value)
        else:
            try:
                setprop(data, to_prop, getprop(data, prop))
            except Exception as e:
                logger.error("Could not copy %s to %s: %s" %
                             (prop, to_prop, e))

    # Without this the modified document would be discarded
    return json.dumps(data)
コード例 #48
0
ファイル: montana_mapper.py プロジェクト: mlhale7/ingestion
    def map_title(self):
        """Map <mods:titleInfo><mods:title> values to "title".

        textnode() is applied to every title entry of every titleInfo
        entry; the list is stored only when it is non-empty.
        """
        title_infos = iterify(
            getprop(self.provider_data, self.root_key + "titleInfo", True))
        collected = [
            textnode(title_node)
            for title_info in title_infos
            for title_node in iterify(getprop(title_info, "title", True))
        ]
        if collected:
            self.update_source_resource({"title": collected})
コード例 #49
0
def is_part_of_transform_harvard(d, p):
    """Build the "isPartOf" value from the item(s) stored at prop p of d.

    Every item whose "type" is "series" contributes its "titleInfo/title";
    empty titles are dropped.

    Returns {"isPartOf": title} for a single title, {"isPartOf": [titles]}
    for several, and {} when there is none.
    """
    v = getprop(d, p)
    items = v if isinstance(v, list) else [v]

    # The type check has to be made on each item: when v is a list, testing
    # the list itself never matches the dicts it contains.
    titles = [getprop(s, "titleInfo/title", True)
              for s in items
              if isinstance(s, dict) and s.get("type") == "series"]

    # A list (not filter()) so that len() also works where filter() is lazy
    ipo = [title for title in titles if title]
    ipo = ipo[0] if len(ipo) == 1 else ipo

    return {"isPartOf": ipo} if ipo else {}
コード例 #50
0
def origin_info_transform_harvard(d, p):
    """Derive "date" and "publisher" from the value stored at prop p of d.

    date:
      Taken from "dateCreated", falling back to "dateOther/#text" when
      "dateOther/keyDate" is "yes".  A list of dates is folded into a dict:
      plain strings become "displayDate", entries carrying a "point"
      attribute become "begin" (point == "start") or "end" (any other
      point).  Entries of neither kind are logged as invalid.  The date is
      omitted when empty or equal to "unknown".

    publisher:
      Present only when the value has a "publisher".  One string is built
      per place whose placeTerm type is "text", formatted as
      "<place>: <publisher>" with ", <dateIssued>" appended when available.
      A single string is stored bare, otherwise the list is stored.

    Returns a dict holding whichever of "date" / "publisher" were produced.
    """
    val = {}
    v = getprop(d, p)

    # date
    date = None
    if "dateCreated" in v:
        date = v["dateCreated"]
    if not date and getprop(v, "dateOther/keyDate", True) == "yes":
        date = getprop(v, "dateOther/#text", True)

    if isinstance(date, list):
        dd = {}
        for i in date:
            if isinstance(i, basestring):
                dd["displayDate"] = i
            elif "point" in i:
                # Store the date text, not the "point" attribute itself
                # (which only ever says "start" or "end").
                key = "begin" if i["point"] == "start" else "end"
                dd[key] = getprop(i, "#text", True)
            else:
                # Odd date? Log error and investigate
                logger.error("Invalid date in record %s" % d["_id"])
        date = dd if dd else None

    if date and date != "unknown":
        val["date"] = date

    # publisher
    if "publisher" in v:
        val["publisher"] = []
        pub = v["publisher"]

        # Only the first dateIssued is used
        di = v.get("dateIssued", None)
        di = di[0] if isinstance(di, list) else di

        # Get all placeTerms of type "text"
        terms = []
        if "place" in v:
            place = v["place"]
            # Named place_item so the parameter p is not clobbered
            for place_item in (place if isinstance(place, list)
                               else [place]):
                if getprop(place_item, "placeTerm/type", True) == "text":
                    terms.append(
                        getprop(place_item, "placeTerm/#text", True))

        for t in terms:
            if not t:
                continue
            if di:
                val["publisher"].append("%s: %s, %s" % (t, pub, di))
            else:
                val["publisher"].append("%s: %s" % (t, pub))
        if len(val["publisher"]) == 1:
            val["publisher"] = val["publisher"][0]

    return val
コード例 #51
0
ファイル: montana_mapper.py プロジェクト: dpla/ingestion
    def map_subject(self):
        """Map <mods:subject><mods:topic> values to "subject".

        textnode() is applied to every topic entry of every subject entry;
        the list is stored only when it is non-empty.
        """
        subject_entries = iterify(
            getprop(self.provider_data, self.root_key + "subject", True))
        topics = [
            textnode(topic)
            for entry in subject_entries
            for topic in iterify(getprop(entry, "topic", True))
        ]
        if topics:
            self.update_source_resource({"subject": topics})
コード例 #52
0
ファイル: montana_mapper.py プロジェクト: dpla/ingestion
    def map_title(self):
        """Map <mods:titleInfo><mods:title> values to "title".

        textnode() is applied to every title entry of every titleInfo
        entry; the list is stored only when it is non-empty.
        """
        title_info_path = self.root_key + "titleInfo"
        collected = [
            textnode(title_node)
            for title_info in iterify(
                getprop(self.provider_data, title_info_path, True))
            for title_node in iterify(getprop(title_info, "title", True))
        ]
        if not collected:
            return
        self.update_source_resource({"title": collected})
コード例 #53
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def map_rights(self):
        """Map "local rights statement" accessConditions to "rights".

        textnode() is applied to every mods/accessCondition entry whose
        "type" is "local rights statement".  Nothing is stored when
        accessCondition is absent or no entry qualifies.
        """
        access_path = "/metadata/mods/accessCondition"
        if not exists(self.provider_data, access_path):
            return

        statements = []
        for condition in iterify(getprop(self.provider_data, access_path)):
            if getprop(condition, "type", True) != "local rights statement":
                continue
            statements.append(textnode(condition))

        if statements:
            self.update_source_resource({"rights": statements})