コード例 #1
0
    def map_spatial(self):
        """Map spatial data from ``subject`` or, failing that, ``originInfo/place``.

        Every ``hierarchicalGeographic`` dict found under ``subject`` becomes
        one spatial entry, with a ``name`` built by joining its city, county,
        state and country values (missing/empty parts are skipped).  Only when
        no such entry was found are the ``#text`` values of ``placeTerm``
        items under ``originInfo/place`` used, skipping terms whose ``type``
        is "code".  The source resource is updated only if something was
        collected.
        """
        spatial = []
        prop = "subject"
        if exists(self.provider_data, prop):
            for s in iterify(getprop(self.provider_data, prop)):
                if not isinstance(s, dict) or "hierarchicalGeographic" not in s:
                    continue
                geo = s["hierarchicalGeographic"]
                if not isinstance(geo, dict):
                    continue
                # Work on a copy so the provider record itself is not
                # modified, and accumulate so that earlier entries are not
                # overwritten by later ones.
                entry = dict(geo)
                parts = [
                    geo.get("city"),
                    geo.get("county"),
                    geo.get("state"),
                    geo.get("country"),
                ]
                entry["name"] = ", ".join([part for part in parts if part])
                spatial.append(entry)

        prop = "originInfo/place"
        if not spatial and exists(self.provider_data, prop):
            for s in iterify(getprop(self.provider_data, prop)):
                if not isinstance(s, dict) or "placeTerm" not in s:
                    continue
                for place in iterify(s["placeTerm"]):
                    # Only typed, non-"code" terms that actually carry text
                    if (isinstance(place, dict)
                            and "type" in place
                            and place["type"] != "code"
                            and "#text" in place):
                        spatial.append(place["#text"])

        if spatial:
            self.update_source_resource({"spatial": spatial})
コード例 #2
0
def oaimodstodpladigitalnc(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords;
    it is stored in the module-level GEOPROP. "ctype" is not used here.

    Returns the mapped document as a JSON string. If "body" cannot be parsed,
    response.code is set to 500 and a plain-text error message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Strip out top-level keys whose value is falsy (None, empty, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
コード例 #3
0
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value is moved only when, after cleanup (parentheses, periods and
    question marks removed, whitespace stripped), it consists entirely of a
    single match of one of the date patterns. The matched text is appended to
    "to_prop" (unless already present) and the original value is removed from
    "prop"; "prop" is deleted when every value was moved.

    Returns "body" unchanged when no prop is supplied, a plain-text error
    (with response.code 500) when body is not valid JSON, and otherwise the
    resulting document as a JSON string. "ctype" and "action" are not used.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    # Tried in order; the first pattern that accounts for the whole cleaned
    # value wins. Compiled once per call instead of once per value.
    REGSEARCH = [re.compile(pattern, re.I) for pattern in (
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*",
    )]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # A scalar value is handled as a one-element list throughout, so it
        # is never iterated (or measured) character by character.
        value_list = values if isinstance(values, list) else [values]
        remove = []

        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        if toprop is None:
            toprop = []
        elif not isinstance(toprop, list):
            toprop = [toprop]

        for v in value_list:
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = pattern.findall(c)
                # str.replace, not re.sub: the matched text must be removed
                # literally, not re-interpreted as a regular expression.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(value_list) == len(remove):
                delprop(data, prop)
            elif remove:
                setprop(data, prop,
                        [v for v in value_list if v not in remove])

    return json.dumps(data)
コード例 #4
0
 def map_format(self):
     """Set the source resource "format" from "medium", else from "format".

     The first of the two provider properties that exists is used; when
     neither exists nothing is updated.
     """
     for source_prop in ("medium", "format"):
         if exists(self.provider_data, source_prop):
             value = getprop(self.provider_data, source_prop)
             self.update_source_resource({"format": value})
             break
コード例 #5
0
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value is moved only when, after cleanup (parentheses, periods and
    question marks removed, whitespace stripped), it consists entirely of a
    single match of one of the date patterns. The matched text is appended to
    "to_prop" (unless already present) and the original value is removed from
    "prop"; "prop" is deleted when every value was moved.

    Returns "body" unchanged when no prop is supplied, a plain-text error
    (with response.code 500) when body is not valid JSON, and otherwise the
    resulting document as a JSON string. "ctype" and "action" are not used.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Tried in order; the first pattern that accounts for the whole cleaned
    # value wins. Compiled once per call instead of once per value.
    REGSEARCH = [re.compile(pattern, re.I) for pattern in (
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*",
    )]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # A scalar value is handled as a one-element list throughout, so it
        # is never iterated (or measured) character by character.
        value_list = values if isinstance(values, list) else [values]
        remove = []

        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        if toprop is None:
            toprop = []
        elif not isinstance(toprop, list):
            toprop = [toprop]

        for v in value_list:
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = pattern.findall(c)
                # str.replace, not re.sub: the matched text must be removed
                # literally, not re-interpreted as a regular expression.
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(value_list) == len(remove):
                delprop(data, prop)
            elif remove:
                setprop(data, prop,
                        [v for v in value_list if v not in remove])

    return json.dumps(data)
コード例 #6
0
ファイル: nypl_mapper.py プロジェクト: dpla/ingestion
    def map_subject(self):
        """Build sourceResource.subject from the "subject" and "genre" props.

        From each subject entry: its "topic" (a string, or the "#text" of a
        dict) and its "name/namePart" when present.  From each genre entry:
        the string itself, or the "#text" of a dict.  Values of any other
        type are logged as errors and skipped.
        """
        # Mapped from subject and genre
        #
        # Per discussion with Amy on 10 April 2014, don't worry about
        # checking whether heading maps to authority file. Amy simplified the
        # crosswalk.
        #
        # TODO: When present, we should probably pull in the valueURI and
        # authority values into the sourceResource.subject - this would
        # represent an index/API change, however.
        subject = []

        if exists(self.provider_data, "subject"):
            for v in iterify(getprop(self.provider_data, "subject")):
                if isinstance(v, dict) and "topic" in v:
                    topic = v["topic"]
                    # The type check belongs on the topic, not on the
                    # enclosing subject entry.
                    if isinstance(topic, basestring):
                        subject.append(topic)
                    elif isinstance(topic, dict):
                        subject.append(topic.get("#text"))
                    else:
                        logger.error("Topic is not a string nor a dict; %s" % self.provider_data["_id"])
                if exists(v, "name/namePart"):
                    subject.append(getprop(v, "name/namePart"))

        if exists(self.provider_data, "genre"):
            for v in iterify(getprop(self.provider_data, "genre")):
                if isinstance(v, basestring):
                    subject.append(v)
                elif isinstance(v, dict):
                    subject.append(v.get("#text"))
                else:
                    logger.error("Genre is not a string nor a dict; %s" % self.provider_data["_id"])

        if subject:
            self.update_source_resource({"subject": subject})
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the "field" property of data from "value", according to "mode".

    mode == 'overwrite': replace the current value; when the field does not
        exist yet, it is created along with any missing parent dicts.
    mode == 'append': the new value is the old value(s) followed by "value"
        (both are flattened one level if they are lists).
    any other mode ("fill blanks"): set the field only when it is missing or
        falsy; with multivalue=True a non-list value is wrapped in a list.

    A falsy "value" leaves data untouched. Returns data (modified in place).
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode, value, multivalue))
    if not value:  # no value don't bother
        return data

    if mode == 'overwrite':
        if exists(data, field):
            setprop(data, field, value)
        else:
            # Walk the slash-separated path, creating missing parents, so
            # that single-segment and deeper-than-two paths work as well.
            keys = field.lstrip('/').split('/')
            parent = data
            for key in keys[:-1]:
                if key not in parent:
                    parent[key] = {}
                parent = parent[key]
            parent[keys[-1]] = value
    elif mode == 'append':
        new_value = []
        if exists(data, field):
            old_value = getprop(data, field)
            if isinstance(old_value, list):
                new_value.extend(old_value)
            else:
                new_value.append(old_value)
        if isinstance(value, list):
            new_value.extend(value)
        else:
            new_value.append(value)
        setprop(data, field, new_value)
    else:  # fill blanks
        if not exists(data, field) or not getprop(data,
                field, keyErrorAsNone=True):
            if multivalue and not isinstance(value, list):
                value = [value]
            setprop(data, field, value)
    return data
コード例 #8
0
def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords;
    it is stored in the module-level GEOPROP. Parameter "provider" selects the
    provider-specific transformation rules ("BPL" and "HARVARD" also get extra
    handling below). "ctype" is not used here.

    Returns the mapped document as a JSON string. If "body" cannot be parsed,
    response.code is set to 500 and a plain-text error message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from original document.
    # NOTE(review): dict.update applies the keyword arguments last, so a
    # "common" rule replaces a provider rule with the same key - confirm that
    # this precedence is intended.
    transformer_pipeline = {}
    transformer_pipeline.update(CHO_TRANSFORMER.get(provider, {}),
                                **CHO_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out["sourceResource"].update(transformer_pipeline[p](data, p))
    transformer_pipeline = {}
    transformer_pipeline.update(AGGREGATION_TRANSFORMER.get(provider, {}),
                                **AGGREGATION_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out.update(transformer_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL
    if provider == "BPL":
        try:
            ipo = getprop(out, "dataProvider") + ". " + \
                  getprop(out, "sourceResource/isPartOf")
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except Exception:
            # Best effort: when either value is unavailable the join is
            # simply skipped and isPartOf is left as it was.
            pass

    # Strip out top-level keys whose value is falsy (None, empty, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
コード例 #9
0
 def map_rights_note(self):
     """Collect the rights notice and rights note into "rightsNote".

     Each of the two ucldc_schema keys that exists in provider_data_source
     contributes its value (notice first).  The original record is updated
     only when at least one value was collected.
     """
     source_keys = ('ucldc_schema:rightsnotice', 'ucldc_schema:rightsnote')
     collected = [
         self.provider_data_source.get(key)
         for key in source_keys
         if exists(self.provider_data_source, key)
     ]
     if not collected:
         return
     self.update_original_record({'rightsNote': collected})
コード例 #10
0
 def map_rights(self):
     """Set the source resource "rights" list.

     The list holds the rights status (passed through map_rights_codes)
     followed by the rights statement, each only when present.  The update
     is issued even when the list ends up empty.
     """
     status_key = 'ucldc_schema:rightsstatus'
     statement_key = 'ucldc_schema:rightsstatement'
     collected = []
     if exists(self.provider_data_source, status_key):
         status = self.provider_data_source.get(status_key)
         collected.append(self.map_rights_codes(status))
     if exists(self.provider_data_source, statement_key):
         statement = self.provider_data_source.get(statement_key)
         collected.append(statement)
     self.update_source_resource({'rights': collected})
コード例 #11
0
 def map_rights_holder(self):
     """Collect rights holder names and the rights contact into "rightsHolder".

     The 'name' of every rights holder entry comes first, followed by the
     rights contact value when present.  The original record is updated
     only when something was collected.
     """
     holder_key = 'ucldc_schema:rightsholder'
     contact_key = 'ucldc_schema:rightscontact'
     names = []
     if exists(self.provider_data_source, holder_key):
         for holder in self.provider_data_source.get(holder_key):
             names.append(holder['name'])
     if exists(self.provider_data_source, contact_key):
         names.append(self.provider_data_source.get(contact_key))
     if not names:
         return
     self.update_original_record({'rightsHolder': names})
コード例 #12
0
 def map_date(self):
     """Set the source resource "date" from "date", else from "created".

     The first of the two provider properties that exists is used; when
     neither exists nothing is updated.
     """
     for source_prop in ("date", "created"):
         if not exists(self.provider_data, source_prop):
             continue
         found = getprop(self.provider_data, source_prop)
         self.update_source_resource({"date": found})
         return
コード例 #13
0
 def map_identifier(self):
     """Collect the identifier and local identifiers into "identifier".

     The 'ucldc_schema:identifier' value is added as a single item; the
     'ucldc_schema:localidentifier' value is spliced in element by element
     (presumably it is a list - verify against the source schema).  The
     source resource is updated only when something was collected.
     """
     source = self.provider_data_source
     collected = []
     if exists(source, 'ucldc_schema:identifier'):
         collected.append(source.get('ucldc_schema:identifier'))
     if exists(source, 'ucldc_schema:localidentifier'):
         collected.extend(source.get('ucldc_schema:localidentifier'))
     if not collected:
         return
     self.update_source_resource({'identifier': collected})
コード例 #14
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
 def map_object(self):
     """Set mapped_data["object"] from the location URL with preview access.

     Every url under /metadata/mods/location whose "access" value equals
     "preview" (case-insensitively) is written in turn, so the last
     matching one is the value that remains.
     """
     path = "/metadata/mods/location"
     if not exists(self.provider_data, path):
         return
     for location in iterify(getprop(self.provider_data, path)):
         if not exists(location, "url"):
             continue
         for url in iterify(getprop(location, "url")):
             if not exists(url, "access"):
                 continue
             if url["access"].lower() == "preview":
                 self.mapped_data.update({"object": textnode(url)})
コード例 #15
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
 def map_object(self):
     """Record the preview URL of the item as mapped_data["object"].

     Looks through each url of each /metadata/mods/location entry and
     stores the text node of those whose "access" value, lower-cased, is
     "preview"; a later match replaces an earlier one.
     """
     path = "/metadata/mods/location"
     if not exists(self.provider_data, path):
         return

     def is_preview(candidate):
         # Short-circuits, so "access" is only read when it exists.
         return (exists(candidate, "access")
                 and candidate["access"].lower() == "preview")

     for entry in iterify(getprop(self.provider_data, path)):
         if exists(entry, "url"):
             for candidate in iterify(getprop(entry, "url")):
                 if is_preview(candidate):
                     self.mapped_data.update({"object": textnode(candidate)})
コード例 #16
0
ファイル: ia_to_dpla.py プロジェクト: peterkingalex/ingestion
def ia_to_dpla(body, ctype, geoprop=None):
    """
    Convert output of Internet Archive service into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords;
    it is stored in the module-level GEOPROP. "ctype" is not used here.

    Returns the mapped document as a JSON string. If "body" cannot be parsed,
    response.code is set to 500 and a plain-text error message (including the
    exception text) is returned.
    """

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON" + "\n" + str(e)

    global GEOPROP
    GEOPROP = geoprop

    def multi_path_processor(data, paths, transformation):
        # Runs the rule registered under the tuple "paths" once for every
        # member path that exists, merging the per-path results key by key.
        merged = {}
        for path in paths:
            if not exists(data, path):
                continue
            fetched = transformation[paths](data, path)
            for key in fetched:
                if key not in merged:
                    merged[key] = fetched[key]
                    continue
                current = merged[key]
                if isinstance(current, list):
                    current.append(fetched[key])
                elif isinstance(current, basestring) and current != fetched[key]:
                    # Two different strings become a two-element list
                    merged[key] = [current, fetched[key]]
                elif isinstance(current, dict):
                    current.update(fetched[key])
        return merged

    def apply_rules(rules, target):
        # Tuple keys name several source paths handled by one rule; any
        # other key is a single path that must exist in the document.
        for rule_key in rules:
            if isinstance(rule_key, tuple):
                target.update(multi_path_processor(data, rule_key, rules))
            elif exists(data, rule_key):
                target.update(rules[rule_key](data, rule_key))

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    apply_rules(CHO_TRANSFORMER, out["sourceResource"])
    apply_rules(AGGREGATION_TRANSFORMER, out)

    # Keep only the top-level keys whose value is truthy
    pruned = {}
    for key, val in out.items():
        if val:
            pruned[key] = val

    return json.dumps(pruned)
コード例 #17
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
 def map_is_shown_at(self):
     """Set mapped_data["isShownAt"] from the primary in-context URL.

     A url under /metadata/mods/location qualifies when it has both
     "usage" and "access", its usage (lower-cased) starts with "primary"
     and its access (lower-cased) is "object in context".  Each qualifying
     url is written in turn, so the last one is the value that remains.
     """
     path = "/metadata/mods/location"
     if not exists(self.provider_data, path):
         return
     for location in iterify(getprop(self.provider_data, path)):
         if not exists(location, "url"):
             continue
         for url in iterify(getprop(location, "url")):
             if not (exists(url, "usage") and exists(url, "access")):
                 continue
             if not url["usage"].lower().startswith("primary"):
                 continue
             if url["access"].lower() != "object in context":
                 continue
             self.mapped_data.update({"isShownAt": textnode(url)})
コード例 #18
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
    def map_title(self):
        """Set the source resource "title" from /metadata/mods/titleInfo.

        Collects the text node of the "title" of every titleInfo entry that
        has a title without a "title/type"; the update happens only when at
        least one title was collected.
        """
        path = "/metadata/mods/titleInfo"
        if not exists(self.provider_data, path):
            return

        titles = [
            textnode(getprop(info, "title"))
            for info in iterify(getprop(self.provider_data, path))
            if exists(info, "title") and not exists(info, "title/type")
        ]
        if titles:
            self.update_source_resource({"title": titles})
コード例 #19
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def map_title(self):
        """Map untyped titles from /metadata/mods/titleInfo to "title".

        A titleInfo entry contributes the text node of its "title" unless the
        title is missing or carries a "title/type".  Nothing is updated when
        no entry contributes.
        """
        path = "/metadata/mods/titleInfo"
        if not exists(self.provider_data, path):
            return

        collected = []
        for entry in iterify(getprop(self.provider_data, path)):
            if not exists(entry, "title"):
                continue
            if exists(entry, "title/type"):
                continue
            collected.append(textnode(getprop(entry, "title")))

        if not collected:
            return
        self.update_source_resource({"title": collected})
コード例 #20
0
ファイル: enrich-format.py プロジェクト: eldios/ingestion
def enrichformat(body, ctype, action="enrich-format", prop="isShownAt/format", alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and enriches the "format" field of that document
    by:

    a) setting the format to be all lowercase
    b) running through a set of cleanup regexes (e.g. image/jpg -> image/jpeg)
    c) checking to see if the field is a valid IMT, and moving it to a separate field if not
       See http://www.iana.org/assignments/media-types for list of valid media-types. We do not
       require that a subtype be defined.
    d) removing any extra text after the IMT

    By default works on the 'isShownAt/format' field, but can be overridden by passing the name
    of the field to use as the 'prop' parameter. Non-IMTs are moved (uncleaned) to the field
    defined by the 'alternate' parameter.

    Returns the resulting document as a JSON string, or a plain-text error (with
    response.code 500) when body is not valid JSON. "ctype" and "action" are not used.
    """

    REGEXPS = (
        (r'image/jpg', 'image/jpeg'),
        (r'image/jp$', 'image/jpeg'),
        (r'img/jpg', 'image/jpeg'),
        (r'\W$', ''),
    )
    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model', 'multipart', 'text', 'video']
    # A top-level media type followed by "/" or the end of the string.
    # Built once per call instead of once per value.
    imt_regex = re.compile(r'^(?:' + '|'.join(IMT_TYPES) + r')(?:/|\Z)')

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            # Drop anything after the first whitespace following the type;
            # kept inside the loop so the order of substitutions is unchanged.
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        return imt_regex.match(s) is not None

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        imt_values = []
        physicalFormat = getprop(data, alternate) if exists(data, alternate) else []
        if not isinstance(physicalFormat, list):
            physicalFormat = [physicalFormat]

        for s in (v if not isinstance(v, basestring) else [v]):
            cleaned = cleanup(s)
            if is_imt(cleaned):
                imt_values.append(cleaned)
            else:
                physicalFormat.append(s)

        # A single value is stored as a scalar, several as a list
        if imt_values:
            setprop(data, prop, imt_values[0] if len(imt_values) == 1 else imt_values)
        else:
            setprop(data, prop, None)
        if physicalFormat:
            setprop(data, alternate,
                    physicalFormat[0] if len(physicalFormat) == 1 else physicalFormat)

    return json.dumps(data)
コード例 #21
0
def copyprop(body, ctype, prop=None, to_prop=None, skip_if_exists=None):
    """Copies value in one prop to another prop. For use with string and/or
       list prop value types. If to_prop exists, its value is iterified then
       extended with the iterified value of prop. If the to_prop parent prop
       (ie hasView in hasView/rights) does not exist, the from_prop value is
       not copied and an error is logged.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content (not used here)
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    skip_if_exists -- set to True to not copy if to_prop exists

    Returns the resulting document as a JSON string. Returns body unchanged
    when either value is not a string/list, and a plain-text error (with
    response.code 500) when body is not valid JSON.
    """

    def is_string_or_list(value):
        return isinstance(value, (basestring, list))

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    target_exists = exists(data, to_prop)
    if not (target_exists and skip_if_exists) and exists(data, prop):
        if target_exists:
            from_value = getprop(data, prop)
            if not is_string_or_list(from_value):
                # data.get: a record without "id" must not turn the error
                # report itself into a KeyError.
                msg = "Prop %s " % prop + \
                      "is not a string/list for record %s" % data.get("id")
                logger.error(msg)
                return body

            to_value = getprop(data, to_prop)
            if not is_string_or_list(to_value):
                msg = "Prop %s " % to_prop + \
                      "is not a string/list for record %s" % data.get("id")
                logger.error(msg)
                return body

            to_value = iterify(to_value)
            to_value.extend(iterify(from_value))
            setprop(data, to_prop, to_value)
        else:
            try:
                setprop(data, to_prop, getprop(data, prop))
            except Exception as e:
                logger.error("Could not copy %s to %s: %s" %
                             (prop, to_prop, e))

    # Hand the (possibly modified) document back, like the other services do
    return json.dumps(data)
コード例 #22
0
    def map_is_part_of(self):
        """Map relatedItem titles to the source resource "relation" list.

        When the relatedItem property (under root_key) exists, the
        "titleInfo/title" of every related item that has one is collected and
        the result is passed through clean_dict before updating the source
        resource.
        """
        prop = self.root_key + "relatedItem"
        if not exists(self.provider_data, prop):
            return

        title_prop = "titleInfo/title"
        relations = [
            getprop(item, title_prop)
            for item in iterify(getprop(self.provider_data, prop))
            if exists(item, title_prop)
        ]
        self.update_source_resource(self.clean_dict({"relation": relations}))
コード例 #23
0
ファイル: bhl_mods.py プロジェクト: dpla/ingestion
    def map_is_part_of(self):
        """Collect related item titles into "relation".

        Does nothing unless the relatedItem property (root_key +
        "relatedItem") exists.  Otherwise gathers "titleInfo/title" from the
        related items that have it and updates the source resource with the
        clean_dict-processed result.
        """
        related_prop = self.root_key + "relatedItem"
        title_prop = "titleInfo/title"
        if not exists(self.provider_data, related_prop):
            return

        mapped = {"relation": []}
        for related in iterify(getprop(self.provider_data, related_prop)):
            if not exists(related, title_prop):
                continue
            mapped["relation"].append(getprop(related, title_prop))

        cleaned = self.clean_dict(mapped)
        self.update_source_resource(cleaned)
コード例 #24
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
 def map_is_shown_at(self):
     """Record the primary "object in context" URL as "isShownAt".

     Scans each url of each /metadata/mods/location entry; a url is used
     when it has "usage" and "access", the lower-cased usage starts with
     "primary" and the lower-cased access equals "object in context".
     A later match replaces an earlier one.
     """
     path = "/metadata/mods/location"
     if not exists(self.provider_data, path):
         return

     def is_primary_in_context(candidate):
         # Short-circuits left to right, so the values are only read
         # once both keys are known to exist.
         return (exists(candidate, "usage")
                 and exists(candidate, "access")
                 and candidate["usage"].lower().startswith("primary")
                 and candidate["access"].lower() == "object in context")

     for entry in iterify(getprop(self.provider_data, path)):
         if exists(entry, "url"):
             for candidate in iterify(getprop(entry, "url")):
                 if is_primary_in_context(candidate):
                     self.mapped_data.update(
                         {"isShownAt": textnode(candidate)})
コード例 #25
0
def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords;
    it is stored in the module-level GEOPROP. "ctype" is not used here.

    Returns the mapped document as a JSON string. If "body" cannot be parsed,
    response.code is set to 500 and a plain-text error message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document: the Contributor header
    # is expected to hold base64-encoded JSON; a bad value is only logged.
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: "+request.environ["HTTP_CONTRIBUTOR"]+"---"+repr(e))

    # Strip out top-level keys whose value is falsy (None, empty, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
コード例 #26
0
def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords;
    it is stored in the module-level GEOPROP. "ctype" is not used here.

    Returns the mapped document as a JSON string. If "body" cannot be parsed,
    response.code is set to 500 and a plain-text error message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(
        multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document: the Contributor header
    # is expected to hold base64-encoded JSON; a bad value is only logged.
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out top-level keys whose value is falsy (None, empty, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
コード例 #27
0
def arctodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified ARC (NARA) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords;
    it is stored in the module-level GEOPROP. "ctype" is not used here.

    Returns the mapped document as a JSON string. If "body" cannot be parsed,
    response.code is set to 500 and a plain-text error message is returned.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not reported as a JSON parse failure.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document; here the rules
    # are keyed by top-level document key and receive the whole document.
    for p in data:
        if p in CHO_TRANSFORMER:
            out["sourceResource"].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Apply transformations that are dependent on more than one
    # original document  field
    out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(rights_transform(data))
    out["sourceResource"].update(subject_and_spatial_transform(data))
    out.update(has_view_transform(data))
    out["sourceResource"].update(transform_state_located_in(data))

    if exists(out, "sourceResource/date"):
        logger.debug("OUTTYPE: %s" % getprop(out, "sourceResource/date"))

    if exists(data, "objects/object"):
        out.update(transform_thumbnail(data))

    # Additional content not from original document: the Contributor header
    # is expected to hold base64-encoded JSON; a bad value is only logged.
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out top-level keys whose value is falsy (None, empty, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)
コード例 #28
0
ファイル: enrich-type.py プロジェクト: eldios/ingestion
def enrichtype(body,ctype,action="enrich-type", prop="aggregatedCHO/type", alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and normalizes its "type" field.

    Every value found under `prop` is lowercased, stripped and run through
    the REGEXPS substitutions.  Values whose cleaned form appears in DC_TYPES
    (http://dublincore.org/documents/resource-typelist/) are kept, in cleaned
    form, under `prop`; all other values are appended, unmodified, to the
    values stored under `alternate`.

    `prop` is set to None when no value qualifies.  A field that ends up with
    exactly one value is stored as that value rather than as a list.
    """

    REGEXPS = ('images','image'), ('still image','image')
    DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image', 'interactive resource', 'model', 'party', 'physical object',
                'place', 'service', 'software', 'sound', 'text']

    def cleanup(s):
        cleaned = s.lower().strip()
        for pattern, replace in REGEXPS:
            cleaned = re.sub(pattern, replace, cleaned)
        return cleaned

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    if not exists(data, prop):
        return json.dumps(data)

    raw = getprop(data, prop)
    values = [raw] if isinstance(raw, basestring) else raw

    if exists(data, alternate):
        physical_format = getprop(data, alternate)
    else:
        physical_format = []
    if not isinstance(physical_format, list):
        physical_format = [physical_format]

    dc_types = []
    for value in values:
        cleaned = cleanup(value)
        if cleaned in DC_TYPES:
            dc_types.append(cleaned)
        else:
            # Not a DC type: keep the original spelling in the alternate field.
            physical_format.append(value)

    if not dc_types:
        setprop(data, prop, None)
    elif len(dc_types) == 1:
        setprop(data, prop, dc_types[0])
    else:
        setprop(data, prop, dc_types)

    if physical_format:
        if len(physical_format) == 1:
            setprop(data, alternate, physical_format[0])
        else:
            setprop(data, alternate, physical_format)

    return json.dumps(data)
コード例 #29
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def map_type(self):
        """
        Map the MODS typeOfResource values to the "type" of the source
        resource, falling back to physicalDescription/form when the record
        has no typeOfResource.
        """
        path = "/metadata/mods/typeOfResource"
        if not exists(self.provider_data, path):
            path = "/metadata/mods/physicalDescription/form"

        if not exists(self.provider_data, path):
            return

        types = [textnode(t)
                 for t in iterify(getprop(self.provider_data, path))]
        if types:
            self.update_source_resource({"type": types})
コード例 #30
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
    def name_part(self, role_type):
        """
        Collect the namePart of every MODS name whose role matches role_type.

        A name contributes its "namePart" when it has at least one role whose
        "roleTerm/#text" equals role_type; it contributes once per matching
        role.  Returns a (possibly empty) list.
        """
        prop = "/metadata/mods/name"
        results = []
        if not exists(self.provider_data, prop):
            return results

        # iterify: presumably a record with a single name yields a bare dict
        # here rather than a list; looping over that dict directly would
        # walk its keys instead of the name itself.
        for name in iterify(getprop(self.provider_data, prop)):
            if not isinstance(name, dict):
                continue
            if "role" in name and "namePart" in name:
                for role in iterify(name["role"]):
                    role_prop = "roleTerm/#text"
                    if exists(role, role_prop) \
                            and getprop(role, role_prop) == role_type:
                        results.append(name["namePart"])

        return results
コード例 #31
0
ファイル: tn_mapper.py プロジェクト: mlhale7/ingestion
    def map_type(self):
        """
        Map the MODS typeOfResource values to the "type" of the source
        resource, falling back to physicalDescription/form when the record
        has no typeOfResource.
        """
        path = "/metadata/mods/typeOfResource"
        if not exists(self.provider_data, path):
            path = "/metadata/mods/physicalDescription/form"

        if not exists(self.provider_data, path):
            return

        types = [textnode(t)
                 for t in iterify(getprop(self.provider_data, path))]
        if types:
            self.update_source_resource({"type": types})
コード例 #32
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def name_part(self, role_type):
        """
        Collect the namePart of every MODS name whose role matches role_type.

        A name contributes its "namePart" when it has at least one role whose
        "roleTerm/#text" equals role_type; it contributes once per matching
        role.  Returns a (possibly empty) list.
        """
        prop = "/metadata/mods/name"
        results = []
        if not exists(self.provider_data, prop):
            return results

        # iterify: presumably a record with a single name yields a bare dict
        # here rather than a list; looping over that dict directly would
        # walk its keys instead of the name itself.
        for name in iterify(getprop(self.provider_data, prop)):
            if not isinstance(name, dict):
                continue
            if "role" in name and "namePart" in name:
                for role in iterify(name["role"]):
                    role_prop = "roleTerm/#text"
                    if exists(role, role_prop) \
                            and getprop(role, role_prop) == role_type:
                        results.append(name["namePart"])

        return results
コード例 #33
0
def harvard_enrich_location(body,
                            ctype,
                            action="harvard_enrich_location",
                            prop="sourceResource/spatial"):
    """
    Service that accepts a Harvard JSON document and rebuilds the "spatial"
    field from the originInfo/place elements of the original record.

    Place terms without an authority are concatenated into the place name;
    terms with the "marccountry" authority are translated from their MARC
    country code to a country name, which is appended to the name, for
    better geocoding accuracy.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    place_path = "originalRecord/metadata/mods/originInfo/place"
    if not exists(data, place_path):
        return json.dumps(data)

    places = getprop(data, place_path)
    country = ""
    country_code = ""
    name = ""

    # First pass: everything that is not an authority-controlled term.
    for place in iterify(places):
        logger.info("place: %s" % place)
        term = getprop(place, "placeTerm", True)
        if isinstance(term, basestring):
            name += " " + term
        elif not exists(term, "authority"):
            name += " " + getprop(term, "#text", True)

    # Second pass: the country, so that it ends up last in the name.
    for place in iterify(places):
        term = getprop(place, "placeTerm", True)
        is_marc_country = exists(term, "authority") and \
            "marccountry" == getprop(term, "authority", True)
        if is_marc_country:
            country_code = getprop(term, "#text", True)
            country = get_country_from_marccode(country_code)
            if country:
                name += ", " + country

    # Square brackets are removed from the assembled name.
    spatial = {"name": re.sub("[\[\]]", "", name.strip(", "))}
    code_qualifies = 2 == len(country_code) or country_code.startswith("xx") \
        if country else False
    if code_qualifies:
        spatial["country"] = country

    setprop(data, prop, [spatial])

    return json.dumps(data)
コード例 #34
0
def movedatestotemporal(body,ctype,action="move_dates_to_temporal",prop=None):
    """
    Service that accepts a JSON document and moves any dates found in the
    "name" of the entries under the prop field to the temporal field.

    Recognized dates (optionally wrapped in parentheses) are, in this order:
    three number groups separated by "-" or "/", a four-digit year range, and
    a single four-digit year.  Each match is removed from the entry's name
    and appended to aggregatedCHO/temporal as {"name": <date>} with the
    parentheses stripped.  Entries whose name becomes blank are dropped; if
    no entry is left, prop is deleted from the document.

    Returns the body unchanged when no prop is given, the updated document
    as JSON otherwise, or a plain-text message (response code 500) when the
    body is not valid JSON.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    # Compiled once, from raw strings, instead of once per entry from
    # string literals containing invalid escape sequences.
    date_patterns = [
        re.compile(r"(\( *)?(\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4})( *\))?"),
        re.compile(r"(\( *)?(\d{4} *[-/] *\d{4})( *\))?"),
        re.compile(r"(\( *)?(\d{4})( *\))?"),
    ]

    def cleanup(s):
        # Drop the parentheses that may surround a matched date.
        return s.replace("(", "").replace(")", "").strip()

    try:
        data = json.loads(body)
    except (TypeError, ValueError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        kept = []
        temporal_field = "aggregatedCHO/temporal"
        temporal = getprop(data, temporal_field) if exists(data, temporal_field) else []
        if not isinstance(temporal, list):
            # A lone pre-existing temporal value must not break the appends.
            temporal = [temporal]

        entries = getprop(data, prop)
        if isinstance(entries, dict):
            # Presumably a single entry; iterating the dict would walk keys.
            entries = [entries]

        for d in entries:
            for pattern in date_patterns:
                # findall() yields the three groups of each match; joined
                # they give back the full matched text.
                for match in pattern.findall(d["name"]):
                    m = "".join(match)
                    if m:
                        d["name"] = d["name"].replace(m, "")
                        temporal.append({"name": cleanup(m)})
            if d["name"].strip():
                # Kept entries overwrite data[prop] below.
                kept.append(d)

        if temporal:
            setprop(data, temporal_field, temporal)
        if kept:
            setprop(data, prop, kept)
        else:
            delprop(data, prop)

    return json.dumps(data)
コード例 #35
0
def edantodpla(body, ctype, geoprop=None):
    """
    Map a JSON-ified EDAN (Smithsonian) record onto the DPLA JSON-LD
    structure.

    Parameter "geoprop" names the property containing lat/long coords; it is
    published through the module-level GEOPROP.

    Returns the mapped document serialized as JSON, or a plain-text message
    (with response code 500) when the body cannot be parsed as JSON.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Single-field rules: a rule runs when its path exists in the record.
    for path, rule in CHO_TRANSFORMER.items():
        if exists(data, path):
            out["sourceResource"].update(rule(data))
    for path, rule in AGGREGATION_TRANSFORMER.items():
        if exists(data, path):
            out.update(rule(data))

    # Rules that look at more than one field of the original document.
    # NOTE: a type_transform(data) step was commented out at this point.
    for transform in (transform_rights, transform_subject, transform_spatial):
        out["sourceResource"].update(transform(data))

    for transform in (transform_is_shown_at,
                      transform_object,
                      transform_data_provider):
        out.update(transform(data))

    # Provider information comes from a request header, not from the record.
    if "HTTP_CONTRIBUTOR" in request.environ:
        encoded = request.environ["HTTP_CONTRIBUTOR"]
        try:
            out["provider"] = json.loads(base64.b64decode(encoded))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         encoded + "---" + repr(e))

    # Drop every top-level key whose value is falsy (None, empty, ...).
    out = {k: v for k, v in out.items() if v}

    return json.dumps(out)
コード例 #36
0
def type_transform(d):
    """
    Build the "type" value from the general records types and from the
    media types of the physical occurrences of record d.

    Returns {"type": "<value>; <value>..."} or {} when nothing was found.
    """
    # Named `types` so the builtin `type` is not shadowed.
    types = []

    if "general-records-types" in d:
        types = arc_group_extraction(d, "general-records-types", "general-records-type",
                                     "general-records-type-desc")
    if exists(d, "physical-occurrences/physical-occurrence"):
        phys_occur = getprop(d, "physical-occurrences/physical-occurrence")
        if not isinstance(phys_occur, list):
            # Presumably a record with a single occurrence carries a bare
            # dict here; iterating it directly would walk its keys.
            phys_occur = [phys_occur]
        type_key = "media-occurrences/media-occurrence/media-type"
        for p in phys_occur:
            if exists(p, type_key):
                types.append(getprop(p, type_key))

    return {"type": "; ".join(types)} if types else {}
コード例 #37
0
ファイル: nara_mapper.py プロジェクト: mlhale7/ingestion
    def map_type(self):
        """
        Map the general records types plus the media type of each physical
        occurrence to the "type" of the source resource, joined with "; ".
        """
        values = self.extract_xml_items("general-records-types",
                                        "general-records-type",
                                        "general-records-type-desc")

        occurrences_path = "physical-occurrences/physical-occurrence"
        if exists(self.provider_data, occurrences_path):
            media_type_path = "media-occurrences/media-occurrence/media-type"
            occurrences = iterify(getprop(self.provider_data,
                                          occurrences_path))
            values.extend(getprop(occurrence, media_type_path)
                          for occurrence in occurrences
                          if exists(occurrence, media_type_path))

        if values:
            self.update_source_resource({"type": "; ".join(values)})
コード例 #38
0
ファイル: edan_select_id.py プロジェクト: mlhale7/ingestion
def selid(body,
          ctype,
          prop='descriptiveNonRepeating/record_link',
          alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter.

    When "prop" has no usable value, the value of "alternative_prop" is
    inserted into a collections.si.edu search URL and used instead.  For a
    list value the last absolute entry is chosen, or the first entry when
    none is absolute.  "_id" is built by COUCH_REC_ID_BUILDER from the
    "Source" request header and the chosen value; "id" is the MD5 hex digest
    of "_id".

    Returns the updated document as JSON.  Responds with code 500 and a
    plain-text message when the body is not JSON or no id value is found.
    Returns the body unchanged when "prop" is empty.
    '''
    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    if not prop:
        # Nothing was parsed in this case, so there is no document to
        # serialize; hand the body back as received.
        logger.error("Prop param in None in %s" % __name__)
        return body

    try:
        data = json.loads(body)
    except (TypeError, ValueError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    # Named rec_id so the builtin `id` is not shadowed.
    rec_id = None

    if exists(data, prop) or exists(data, alternative_prop):
        v = getprop(data, prop, True)
        if not v:
            # Third argument: a missing path gives None instead of raising,
            # so an empty prop with no alternative ends in "No id" below.
            alternative = getprop(data, alternative_prop, True)
            v = tmpl % alternative if alternative else None
        if isinstance(v, basestring):
            rec_id = v
        elif v:
            for h in v:
                if is_absolute(h):
                    rec_id = h
            if not rec_id:
                rec_id = v[0]

    if not rec_id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, rec_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
コード例 #39
0
def kentucky_identify_object(body, ctype, rights_field="aggregatedCHO/rights", download="True"):
    """
    Responsible for: adding an "object" field to a document, holding the URL
    where the thumbnail is expected to be found.

    The thumbnail URL is the aggregatedCHO/relation value with "_tb" inserted
    in front of the file extension.  admin/object_status is set to PENDING
    when download is the string "True" and to IGNORE otherwise.  When the
    relation field is missing, the body is returned unchanged.
    """

    LOG_JSON_ON_ERROR = True

    def log_json():
        # Debug helper; nothing in this function calls it.
        if LOG_JSON_ON_ERROR:
            logger.debug(body)

    try:
        data = json.loads(body)
    except Exception as e:
        msg = "Bad JSON: " + e.args[0]
        logger.error(msg)
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return msg

    relation_field = "aggregatedCHO/relation"
    if not exists(data, relation_field):
        msg = "Field %s does not exist" % relation_field
        logger.error(msg)
        return body

    url = getprop(data, relation_field)
    stem, extension = os.path.splitext(url)
    thumb_url = "%s_tb%s" % (stem, extension)

    rights = getprop(data, rights_field) if exists(data, rights_field) else None

    data["object"] = {"@id": thumb_url, "format": "", "rights": rights}

    status = PENDING if download == "True" else IGNORE
    data.setdefault("admin", {})["object_status"] = status

    return json.dumps(data)
コード例 #40
0
    def map_rights(self):
        """
        Map the rights information of the record.

        A "rightsStatementURI" value is stored as "rights" directly on the
        mapped data.  Only when the record has none is the
        "tmp_rights_statement" value used, and that one is handed to
        update_source_resource instead.
        """
        edm_rights_prop = "rightsStatementURI"
        tmp_rights_prop = "tmp_rights_statement"

        if exists(self.provider_data, edm_rights_prop):
            edm_rights = getprop(self.provider_data, edm_rights_prop)
            self.mapped_data.update({"rights": edm_rights})
        elif exists(self.provider_data, tmp_rights_prop):
            tmp_rights = getprop(self.provider_data, tmp_rights_prop)
            self.update_source_resource({"rights": tmp_rights})
コード例 #41
0
def type_transform(d):
    """
    Build the "type" value from the general records types and from the
    media types of the physical occurrences of record d.

    Returns {"type": "<value>; <value>..."} or {} when nothing was found.
    """
    # Named `types` so the builtin `type` is not shadowed.
    types = []

    if "general-records-types" in d:
        types = arc_group_extraction(d, "general-records-types",
                                     "general-records-type",
                                     "general-records-type-desc")
    if exists(d, "physical-occurrences/physical-occurrence"):
        phys_occur = getprop(d, "physical-occurrences/physical-occurrence")
        if not isinstance(phys_occur, list):
            # Presumably a record with a single occurrence carries a bare
            # dict here; iterating it directly would walk its keys.
            phys_occur = [phys_occur]
        type_key = "media-occurrences/media-occurrence/media-type"
        for p in phys_occur:
            if exists(p, type_key):
                types.append(getprop(p, type_key))

    return {"type": "; ".join(types)} if types else {}
コード例 #42
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and removes duplicate entries from
    the list stored under each property named in prop.

    prop is a comma-separated list of property paths.  Two strings are
    duplicates when they are equal after removing whitespace, periods,
    parens, brackets and braces and lowercasing; the first occurrence is
    kept and the original order is preserved.  Non-string entries are
    compared as they are.  Properties that are missing or do not hold a list
    are left alone.

    Responds with code 500 and a plain-text message when prop is None or the
    body is not valid JSON.
    '''

    if prop is None:
        # Without this guard the prop.split() below raises on None.
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Prop param is None"

    try:
        data = json.loads(body)
    except (TypeError, ValueError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Remove whitespace, periods, parens, brackets
    ignored = re.compile(r"[ \.\(\)\[\]\{\}]")

    def normalize(item):
        if isinstance(item, basestring):
            return ignored.sub("", item).lower()
        return item

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # A list (not a set) of seen keys keeps the first-seen order
                # and also copes with unhashable entries such as dicts.
                seen = []
                unique = []
                for item in v:
                    key = normalize(item)
                    if key not in seen:
                        seen.append(key)
                        unique.append(item)

                setprop(data, p, unique)

    return json.dumps(data)
コード例 #43
0
ファイル: uva_mapper.py プロジェクト: dpla/ingestion
    def map_creator(self):
        """
        Build the "creator" of the source resource from the "name" entries.

        The namePart values of an entry are joined, comma separated, from
        three slots: family name (also filled by a plain-string namePart),
        given name (shared with "termsOfAddress", so the later one wins) and
        date.  Personal names take precedence: corporate names are only used
        when no personal name was collected.
        """
        prop = "name"
        if not exists(self.provider_data, prop):
            return

        personal = []
        corporate = []
        for entry in iterify(getprop(self.provider_data, prop)):
            family = second = date = None
            for part in iterify(entry.get("namePart")):
                if isinstance(part, basestring):
                    family = part
                elif isinstance(part, dict):
                    kind = part.get("type")
                    if kind == "family":
                        family = part.get("#text")
                    elif kind in ("given", "termsOfAddress"):
                        second = part.get("#text")
                    elif kind == "date":
                        date = part.get("#text")

            creator = ", ".join([slot for slot in (family, second, date)
                                 if slot])

            entry_type = entry.get("type")
            if entry_type == "personal":
                if creator not in personal:
                    personal.append(creator)
            elif entry_type == "corporate":
                if creator not in corporate:
                    corporate.append(creator)

        if personal:
            self.update_source_resource({"creator": personal})
        elif corporate:
            self.update_source_resource({"creator": corporate})
コード例 #44
0
ファイル: edan_mapper.py プロジェクト: marktriggs/ingestion
    def map_is_shown_at(self):
        """
        Set "isShownAt" on the mapped data to the collections.si.edu search
        URL for the record_ID of the record, when the record has one.
        """
        prop = "descriptiveNonRepeating/record_ID"
        if not exists(self.provider_data, prop):
            return

        record_id = getprop(self.provider_data, prop)
        url_template = ("http://collections.si.edu/search/results.htm?"
                        "q=record_ID%%3A%s&repo=DPLA")
        self.mapped_data.update({"isShownAt": url_template % record_id})
コード例 #45
0
def scdl_enrich_location(body, ctype, action="scdl_enrich_location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document: state abbreviations in each place name are replaced, and
    a county or a city is extracted from the name where possible.

    For use with the scdl profiles
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not exists(data, prop):
        return json.dumps(data)

    for place in iterify(getprop(data, prop)):
        name = replace_state_abbreviations(place["name"].rstrip())
        place["name"] = name

        lowered = name.lower()
        if " county " in lowered:
            # "XXX County (S.C.)" => county: XXX
            place["county"] = name[:lowered.index("county")].strip()
        elif "(S.C.)" in name:
            # "XXX (S.C.)" => city: XXX
            place["city"] = name[:name.index("(S.C.)")].strip()

    return json.dumps(data)
コード例 #46
0
ファイル: uva_mapper.py プロジェクト: dpla/ingestion
    def map_is_show_at_object_has_view_and_dataprovider(self):
        """
        Map the "location" entries of the record to isShownAt, object,
        hasView and dataProvider on the mapped data.

        URLs are dispatched on their "access" value: "object in context"
        gives isShownAt, "preview" gives object and "raw object" gives
        hasView (with the first internetMediaType of the record as format).
        A string physicalLocation becomes dataProvider.  An error while
        walking the locations is logged and whatever was collected up to
        that point is still applied.
        """
        def _get_media_type(d):
            # First internetMediaType among the physical descriptions, or
            # None when there is none.
            for description in iterify(getprop(d, "physicalDescription")):
                if exists(description, "internetMediaType"):
                    return getprop(description, "internetMediaType")

        prop = "location"
        if not exists(self.provider_data, prop):
            return

        locations = iterify(getprop(self.provider_data, prop))
        media_type = _get_media_type(self.provider_data)
        out = {}
        try:
            for entry in locations:
                if "url" in entry:
                    for url in entry["url"]:
                        if not (url and "access" in url):
                            continue
                        access = url["access"]
                        if access == "object in context":
                            out["isShownAt"] = url.get("#text")
                        elif access == "preview":
                            out["object"] = url.get("#text")
                        elif access == "raw object":
                            out["hasView"] = {"@id": url.get("#text"),
                                              "format": media_type}
                if ("physicalLocation" in entry and
                        isinstance(entry["physicalLocation"], basestring)):
                    out["dataProvider"] = entry["physicalLocation"]
        except Exception as e:
            logger.error(e)

        if out:
            self.mapped_data.update(out)
コード例 #47
0
ファイル: michigan_mapper.py プロジェクト: dpla/ingestion
    def map_date(self):
        """
        Map the originInfo dates to the "date" of the source resource.

        dateCreated values take precedence over dateIssued values.  A
        dateIssued that carries a "point" of "start" or "end" is not used
        directly; the two points are combined into a "begin-end" range only
        when no other date was found.
        """
        origin_info_path = self.root_key + "originInfo"
        date_created = []
        date_issued = []
        date_begin, date_end = None, None

        if exists(self.provider_data, origin_info_path):
            for info in iterify(getprop(self.provider_data, origin_info_path)):
                if "dateCreated" in info:
                    date_created.append(textnode(info["dateCreated"]))

                if "dateIssued" in info:
                    issued = info["dateIssued"]
                    try:
                        if "point" not in issued:
                            date_issued.append(textnode(issued))
                        elif issued["point"] == "start":
                            date_begin = textnode(issued)
                        elif issued["point"] == "end":
                            date_end = textnode(issued)
                    except Exception as e:
                        # The format arguments must be a two-item tuple; the
                        # former "record % message" expression raised inside
                        # this handler and hid the original error.
                        logger.error("Exception when trying to map date "
                                     "values. for record %s \n\n%s" %
                                     (self.provider_data, e))

        # If there are no dateIssued or dateCreated properties then construct
        # a date range from begin and end points (if they exist).
        if date_begin and date_end and not date_created and not date_issued:
            date_issued.append(date_begin + "-" + date_end)

        if date_created:
            self.update_source_resource({"date": date_created})
        elif date_issued:
            self.update_source_resource({"date": date_issued})
コード例 #48
0
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the field named by prop
    (sourceResource/creator by default): for every pattern in CLEANUP, a
    case-insensitive match at the beginning of a value is removed, along
    with the surrounding whitespace.

    A field holding a single value is written back as that value, not as a
    one-element list.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        if not isinstance(values, list):
            values = [values]

        for position, value in enumerate(values):
            for prefix in CLEANUP:
                pattern = r"(?i)^{0}".format(prefix)
                value = re.sub(pattern, "", value.strip()).lstrip()
            values[position] = value

        setprop(data, prop, values[0] if len(values) == 1 else values)

    return json.dumps(data)
コード例 #49
0
def capitalize(data, prop):
    """
    Uppercase the first letter of the value found at the given property path.

    A string value is capitalized directly; in a list value every string
    element is capitalized and other elements are kept as they are.  Empty,
    missing or differently typed values are left untouched.
    Modifies the given dictionary (data argument) in place.
    """
    def str_capitalize(s):
        """
        Uppercase only the first character.  Unlike "aaa".capitalize(), the
        rest of the string is not lowercased.
        """
        return s[0].upper() + s[1:] if s else s

    if not exists(data, prop):
        return

    value = getprop(data, prop, keyErrorAsNone=True)
    if not value:
        return

    if isinstance(value, basestring):
        setprop(data, prop, str_capitalize(value))
    elif isinstance(value, list):
        capitalized = [
            str_capitalize(item) if isinstance(item, basestring) else item
            for item in value
        ]
        setprop(data, prop, capitalized)
コード例 #50
0
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that
    document by removing duplicate array elements.

    prop is a comma-separated list of property paths.  Elements are compared
    by their _stripped() form; the first occurrence of each is kept, in the
    original order, and elements whose stripped form is empty are dropped.
    Properties that are missing or do not hold a list are left alone.

    Returns the body unchanged when no prop is given.  Responds with code
    500 and a plain-text message when the body is not valid JSON.
    '''

    if not prop:
        # Nothing to deduplicate and nothing was parsed: hand the body back
        # instead of failing on an unbound document at the final return.
        return body

    try:
        data = json.loads(body)
    except (TypeError, ValueError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Keys and values are walked together: positions taken from
                # a list with the blanks filtered out do not line up with
                # the positions in v.
                seen = []
                unique = []
                for item in v:
                    key = _stripped(item)
                    if key and key not in seen:
                        seen.append(key)
                        unique.append(item)
                setprop(data, p, unique)

    return json.dumps(data)
コード例 #51
0
ファイル: dedup_value.py プロジェクト: chadfennell/ingestion
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and removes duplicate entries from
    the list stored under each property named in prop.

    prop is a comma-separated list of property paths.  Two strings are
    duplicates when they are equal after removing whitespace, periods and
    parens and lowercasing; the first occurrence is kept and the original
    order is preserved.  Non-string entries are compared as they are.
    Properties that are missing or do not hold a list are left alone.

    Responds with code 500 and a plain-text message when prop is None or the
    body is not valid JSON.
    '''

    if prop is None:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        msg = "Prop param is None"
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except (TypeError, ValueError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Remove whitespace, periods, parens
    ignored = re.compile(r"[ \.\(\)]")

    def normalize(item):
        if isinstance(item, basestring):
            return ignored.sub("", item).lower()
        return item

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # A list (not a set) of seen keys keeps the first-seen order
                # and also copes with unhashable entries such as dicts.
                seen = []
                unique = []
                for item in v:
                    key = normalize(item)
                    if key not in seen:
                        seen.append(key)
                        unique.append(item)

                setprop(data, p, unique)

    return json.dumps(data)
コード例 #52
0
ファイル: pa_mapper.py プロジェクト: mlhale7/ingestion
 def map_contributor(self):
     """
     Use the last "contributor" of the record as dataProvider and hand any
     preceding contributors to update_source_resource as "contributor".
     """
     prop = "contributor"
     if not exists(self.provider_data, prop):
         return

     contributors = iterify(self.provider_data.get(prop))
     setprop(self.mapped_data, "dataProvider", contributors[-1])

     remaining = contributors[:-1]
     if len(contributors) > 1:
         self.update_source_resource({"contributor": remaining})
コード例 #53
0
ファイル: tn_mapper.py プロジェクト: dpla/ingestion
    def map_date(self):
        """
        Map a non-empty MODS originInfo/dateCreated text to the "date" of
        the source resource.
        """
        path = "/metadata/mods/originInfo/dateCreated/#text"
        if not exists(self.provider_data, path):
            return

        date_created = getprop(self.provider_data, path)
        if not date_created:
            return

        self.update_source_resource({"date": date_created})
コード例 #54
0
    def map_relation(self):
        """
        Build the "relation" of the source resource from the titles of the
        relatedItem entries of type "host" and "series".

        Values have the form "<host>" or "<host>. <series>".  A single value
        is stored as such rather than as a one-element list.
        """
        prop = self.root_key + "relatedItem"
        if not exists(self.provider_data, prop):
            return

        relation = []
        host = series = None
        for item in iterify(getprop(self.provider_data, prop)):
            title = getprop(item, "titleInfo/title", True)
            if title is not None:
                kind = item.get("type")
                if kind == "host":
                    host = title
                elif kind == "series":
                    series = title

            # NOTE(review): this runs for every item once a host title has
            # been seen, so later items add further values that repeat the
            # same host -- confirm whether one combined value was intended.
            if host:
                relation.append((host + ". " + series) if series else host)

        if len(relation) == 1:
            relation = relation[0]

        if relation:
            self.update_source_resource({"relation": relation})
コード例 #55
0
def mwdlenrichstatelocatedin(body,
                             ctype,
                             action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by replacing every ";"-separated value that is a
    key of STATE_CODES with the corresponding STATE_CODES entry.

    Values are trimmed before the lookup, empty values are dropped, and the
    result is written back joined with "; ".

    For primary use with MWDL documents.
    """

    try:
        data = json.loads(body)
    except (TypeError, ValueError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        sli = []
        values = getprop(data, prop)
        for v in values.split(";"):
            # The field is written back "; "-joined, so incoming values may
            # carry the blank after the separator; without the strip such a
            # value never matches a STATE_CODES key.
            code = v.strip()
            if not code:
                continue
            sli.append(STATE_CODES.get(code) or code)
        setprop(data, prop, "; ".join(sli))

    return json.dumps(data)