Python exists Examples, dplaingestion.selector.exists Python Examples

Example #1

0

Show file

    def map_spatial(self):
        """Populate the "spatial" entry of the source resource.

        Subjects carrying a "hierarchicalGeographic" entry are preferred; the
        place terms under "originInfo/place" are only consulted when no such
        subject was found. Nothing is written when no place is found.
        """
        places = []

        subject_path = "subject"
        if exists(self.provider_data, subject_path):
            for subject in iterify(getprop(self.provider_data, subject_path)):
                if "hierarchicalGeographic" not in subject:
                    continue
                geo = subject["hierarchicalGeographic"]
                parts = [geo.get(key)
                         for key in ("city", "county", "state", "country")]
                # The display name is stored on the provider's own dict.
                geo["name"] = ", ".join(part for part in parts if part)
                # Every match replaces the previous one: the last one wins.
                places = [geo]

        origin_path = "originInfo/place"
        if not places and exists(self.provider_data, origin_path):
            for origin in iterify(getprop(self.provider_data, origin_path)):
                if "placeTerm" not in origin:
                    continue
                for term in iterify(origin["placeTerm"]):
                    # Terms typed as "code" are skipped; untyped ones too.
                    if "type" in term and term["type"] != "code":
                        places.append(term["#text"])

        if places:
            self.update_source_resource({"spatial": places})

Example #2

0

Show file

File: oai_mods_to_dpla_digitalnc.py Project: peterkingalex/ingestion

def oaimodstodpladigitalnc(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Returns the converted document serialized as JSON. When body cannot be
    parsed, the response code is set to 500 and a plain-text error message is
    returned instead.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Not a bare "except:", so SystemExit/KeyboardInterrupt still propagate
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Strip out top-level keys with falsy values (None, empty containers, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

Example #3

0

Show file

def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value is moved only when, after cleanup, it consists of exactly one
    match of one of the date patterns and nothing else. Moved values are
    removed from prop; prop is deleted once all of its values were moved.

    Returns the document serialized as JSON, the unmodified body when no prop
    is supplied, or a plain-text error message (response code 500) when body
    is not valid JSON.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    # Tried in order; the first pattern that accounts for the whole value wins
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
        ]
    # Compile once instead of once per value and pattern
    compiled = [re.compile(pattern, re.I) for pattern in REGSEARCH]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # Normalize a scalar to a list so it is never iterated per character
        value_list = values if isinstance(values, list) else [values]

        existing = getprop(data, to_prop) if exists(data, to_prop) else []
        if existing is None:
            toprop = []
        elif isinstance(existing, list):
            toprop = existing
        else:
            toprop = [existing]

        remove = []
        added = False
        for v in value_list:
            c = cleanup(v)
            for regex in compiled:
                m = regex.findall(c)
                # Remove the match as literal text: it is data, not a pattern,
                # and may contain regex metacharacters
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                        added = True
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if added:
            setprop(data, to_prop, toprop)
        if remove:
            if len(value_list) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop,
                        [v for v in value_list if v not in remove])

    return json.dumps(data)

Example #4

0

Show file

 def map_format(self):
     """Set the "format" of the source resource.

     The provider's "medium" property takes precedence; "format" is only
     used when "medium" is absent. Nothing is written when both are absent.
     """
     for key in ("medium", "format"):
         if exists(self.provider_data, key):
             self.update_source_resource(
                 {"format": getprop(self.provider_data, key)})
             break

Example #5

0

Show file

File: move_date_values.py Project: amber-reichert/ingestion

def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.

    A value is moved only when, after cleanup, it consists of exactly one
    match of one of the date patterns and nothing else. Moved values are
    removed from prop; prop is deleted once all of its values were moved.

    Returns the document serialized as JSON, the unmodified body when no prop
    is supplied, or a plain-text error message (response code 500) when body
    is not valid JSON.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    # Tried in order; the first pattern that accounts for the whole value wins
    REGSEARCH = [
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        r"\d{4}\s*[-/]\s*\d{4}",
        r"\d{1,2}\s*[-/]\s*\d{4}",
        r"\d{4}\s*[-/]\s*\d{1,2}",
        r"\d{4}s?",
        r"\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        r".*circa.*"
        ]
    # Compile once instead of once per value and pattern
    date_regexes = [re.compile(pattern, re.I) for pattern in REGSEARCH]

    def cleanup(s):
        s = re.sub(r"[\(\)\.\?]", "", s)
        return s.strip()

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        # A scalar value is wrapped so it is never iterated per character
        value_list = values if isinstance(values, list) else [values]

        existing = getprop(data, to_prop) if exists(data, to_prop) else []
        if existing is None:
            toprop = []
        elif isinstance(existing, list):
            toprop = existing
        else:
            toprop = [existing]

        remove = []
        added = False
        for v in value_list:
            c = cleanup(v)
            for regex in date_regexes:
                m = regex.findall(c)
                # The match is stripped as literal text; treating it as a
                # pattern breaks on values containing regex metacharacters
                if len(m) == 1 and not c.replace(m[0], "").strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                        added = True
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if added:
            setprop(data, to_prop, toprop)
        if remove:
            if len(value_list) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop,
                        [v for v in value_list if v not in remove])

    return json.dumps(data)

Example #6

0

Show file

File: nypl_mapper.py Project: dpla/ingestion

    def map_subject(self):
        """Collect subjects from the provider's "subject" and "genre" fields.

        Topics and genres may be plain strings or dicts carrying the text
        under "#text"; any other shape is logged as an error and skipped.
        The result is written to the source resource only when non-empty.
        """
        # Mapped from subject and genre
        #
        # Per discussion with Amy on 10 April 2014, don't worry about
        # checking whether heading maps to authority file. Amy simplified the
        # crosswalk.
        #
        # TODO: When present, we should probably pull in the valueURI and
        # authority values into the sourceResource.subject - this would
        # represent an index/API change, however.
        subject = []

        if exists(self.provider_data, "subject"):
            for v in iterify(getprop(self.provider_data, "subject")):
                if "topic" in v:
                    topic = v["topic"]
                    # Inspect the topic itself, not its container
                    if isinstance(topic, basestring):
                        subject.append(topic)
                    elif isinstance(topic, dict):
                        subject.append(topic.get("#text"))
                    else:
                        logger.error("Topic is not a string nor a dict; %s" % self.provider_data["_id"])
                if exists(v, "name/namePart"):
                    subject.append(getprop(v, "name/namePart"))

        if exists(self.provider_data, "genre"):
            for v in iterify(getprop(self.provider_data, "genre")):
                if isinstance(v, basestring):
                    subject.append(v)
                elif isinstance(v, dict):
                    subject.append(v.get("#text"))
                else:
                    logger.error("Genre is not a string nor a dict; %s" % self.provider_data["_id"])

        if subject:
            self.update_source_resource({"subject": subject})

Example #7

0

Show file

File: required_values_from_collection_registry.py Project: calisphere-legacy-harvester/dpla-ingestion

def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection
    ckey field with the value passed in.

    mode selects how an existing value is treated:
      'overwrite' -- replace the field, creating missing parent dicts
      'append'    -- extend the existing value(s) with the new value(s)
      otherwise   -- fill the field only when it is missing or empty; a
                     scalar value is wrapped in a list when multivalue is set

    A falsy value leaves data untouched. Returns data, modified in place.
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode, value, multivalue))
    if not value:  # no value don't bother
        return data

    if mode == 'overwrite':
        if exists(data, field):
            setprop(data, field, value)
        else:
            # Walk the whole path so single-segment fields and paths deeper
            # than two segments work; missing parents are created as dicts
            segments = field.strip('/').split('/')
            node = data
            for segment in segments[:-1]:
                if segment not in node:
                    node[segment] = {}
                node = node[segment]
            node[segments[-1]] = value
    elif mode == 'append':
        new_value = []
        if exists(data, field):
            old_value = getprop(data, field)
            if isinstance(old_value, list):
                new_value.extend(old_value)
            else:
                new_value.append(old_value)
        if isinstance(value, list):
            new_value.extend(value)
        else:
            new_value.append(value)
        setprop(data, field, new_value)
    else:  # fill blanks
        if not exists(data, field) or not getprop(data,
                field, keyErrorAsNone=True):
            if multivalue and not isinstance(value, list):
                value = [value]
            setprop(data, field, value)
    return data

Example #8

0

Show file

File: oai_mods_to_dpla.py Project: peterkingalex/ingestion

def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Parameter "provider" selects the provider-specific transformation rules
    that are combined with the "common" ones; "BPL" and "HARVARD" get
    additional handling.

    Returns the converted document serialized as JSON. When body cannot be
    parsed, the response code is set to 500 and a plain-text error message is
    returned instead.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Not a bare "except:", so SystemExit/KeyboardInterrupt still propagate
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from original document
    transformer_pipeline = {}
    transformer_pipeline.update(CHO_TRANSFORMER.get(provider, {}),
                                **CHO_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out["sourceResource"].update(transformer_pipeline[p](data, p))
    transformer_pipeline = {}
    transformer_pipeline.update(AGGREGATION_TRANSFORMER.get(provider, {}),
                                **AGGREGATION_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out.update(transformer_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL
    if provider == "BPL":
        try:
            ipo = getprop(out, "dataProvider") + ". " + \
                  getprop(out, "sourceResource/isPartOf")
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except Exception:
            # Best effort: isPartOf is left as-is when either part is
            # unavailable or cannot be joined
            pass

    # Strip out top-level keys with falsy values (None, empty containers, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

Example #9

0

Show file

 def map_rights_note(self):
     """Record the rights notice and rights note on the original record.

     Values are collected in that order; nothing is written when neither
     property is present.
     """
     source = self.provider_data_source
     note_keys = ('ucldc_schema:rightsnotice', 'ucldc_schema:rightsnote')
     notes = [source.get(key) for key in note_keys if exists(source, key)]
     if notes:
         self.update_original_record({'rightsNote': notes})

Example #10

0

Show file

 def map_rights(self):
     """Set the "rights" of the source resource.

     The rights status is passed through map_rights_codes; the rights
     statement is taken as-is. The entry is always written, even when the
     resulting list is empty.
     """
     source = self.provider_data_source
     status_key = 'ucldc_schema:rightsstatus'
     statement_key = 'ucldc_schema:rightsstatement'

     collected = []
     if exists(source, status_key):
         collected.append(self.map_rights_codes(source.get(status_key)))
     if exists(source, statement_key):
         collected.append(source.get(statement_key))

     self.update_source_resource({'rights': collected})

Example #11

0

Show file

 def map_rights_holder(self):
     """Record rights holder names and the rights contact.

     The "name" of every rights holder entry comes first, followed by the
     rights contact. Nothing is written when the resulting list is empty.
     """
     source = self.provider_data_source
     holder_key = 'ucldc_schema:rightsholder'
     contact_key = 'ucldc_schema:rightscontact'

     names = []
     if exists(source, holder_key):
         for holder in source.get(holder_key):
             names.append(holder['name'])
     if exists(source, contact_key):
         names.append(source.get(contact_key))

     if names:
         self.update_original_record({'rightsHolder': names})

Example #12

0

Show file

 def map_date(self):
     """Set the "date" of the source resource.

     The provider's "date" property takes precedence; "created" is only
     used when "date" is absent. Nothing is written when both are absent.
     """
     for key in ("date", "created"):
         if exists(self.provider_data, key):
             self.update_source_resource(
                 {"date": getprop(self.provider_data, key)})
             break

Example #13

0

Show file

 def map_identifier(self):
     """Set the "identifier" of the source resource.

     The identifier is appended as a single entry, while the local
     identifiers are added element by element (presumably a list; verify
     against the provider schema). Nothing is written when neither is
     present.
     """
     source = self.provider_data_source
     main_key = 'ucldc_schema:identifier'
     local_key = 'ucldc_schema:localidentifier'

     found = []
     if exists(source, main_key):
         found.append(source.get(main_key))
     if exists(source, local_key):
         found.extend(source.get(local_key))

     if found:
         self.update_source_resource({'identifier': found})

Example #14

0

Show file

File: tn_mapper.py Project: dpla/ingestion

 def map_object(self):
     """Map the preview URL of a MODS location to "object".

     Every location URL whose "access" equals "preview" (ignoring case)
     updates the mapped data, so the last such URL wins.
     """
     location_path = "/metadata/mods/location"
     if not exists(self.provider_data, location_path):
         return

     for location in iterify(getprop(self.provider_data, location_path)):
         if not exists(location, "url"):
             continue
         for link in iterify(getprop(location, "url")):
             if not exists(link, "access"):
                 continue
             if link["access"].lower() == "preview":
                 self.mapped_data.update({"object": textnode(link)})

Example #15

0

Show file

File: tn_mapper.py Project: mlhale7/ingestion

 def map_object(self):
     """Store the text of the "preview" location URL under "object".

     All URLs of all MODS locations are inspected; each URL whose "access"
     is "preview" (case-insensitive) overwrites the previous result.
     """
     mods_location = "/metadata/mods/location"
     if not exists(self.provider_data, mods_location):
         return

     for entry in iterify(getprop(self.provider_data, mods_location)):
         if not exists(entry, "url"):
             continue
         for candidate in iterify(getprop(entry, "url")):
             is_preview = (exists(candidate, "access")
                           and candidate["access"].lower() == "preview")
             if is_preview:
                 self.mapped_data.update({"object": textnode(candidate)})

Example #16

0

Show file

File: ia_to_dpla.py Project: peterkingalex/ingestion

def ia_to_dpla(body, ctype, geoprop=None):
    """
    Convert output of Internet Archive service into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Returns the converted document serialized as JSON. When body cannot be
    parsed, the response code is set to 500 and a plain-text message that
    includes the parser error is returned instead.
    """

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON" + "\n" + str(e)

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    def multi_path_processor(data, paths, transformation):
        # Runs the transformation registered under the tuple "paths" once
        # for each member path present in data and merges the results
        merged = {}
        for path in paths:
            if not exists(data, path):
                continue
            result = transformation[paths](data, path)
            for key, incoming in result.items():
                if key not in merged:
                    merged[key] = incoming
                    continue
                current = merged[key]
                if isinstance(current, list):
                    current.append(incoming)
                elif isinstance(current, basestring) and current != incoming:
                    merged[key] = [current, incoming]
                elif isinstance(current, dict):
                    current.update(incoming)
        return merged

    # Apply all transformation rules from original document; tuple keys
    # describe rules that read several paths
    rule_sets = (
        (CHO_TRANSFORMER, out["sourceResource"]),
        (AGGREGATION_TRANSFORMER, out),
    )
    for rules, target in rule_sets:
        for p in rules:
            if isinstance(p, tuple):
                target.update(multi_path_processor(data, p, rules))
            elif exists(data, p):
                target.update(rules[p](data, p))

    # Strip out top-level keys with falsy values
    out = {k: v for k, v in out.items() if v}

    return json.dumps(out)

Example #17

0

Show file

File: tn_mapper.py Project: dpla/ingestion

 def map_is_shown_at(self):
     """Map the primary "object in context" location URL to "isShownAt".

     A URL qualifies when its "usage" starts with "primary" and its
     "access" equals "object in context" (both ignoring case). Every
     qualifying URL updates the mapped data, so the last one wins.
     """
     location_path = "/metadata/mods/location"
     if not exists(self.provider_data, location_path):
         return

     for location in iterify(getprop(self.provider_data, location_path)):
         if not exists(location, "url"):
             continue
         for link in iterify(getprop(location, "url")):
             if not (exists(link, "usage") and exists(link, "access")):
                 continue
             if not link["usage"].lower().startswith("primary"):
                 continue
             if link["access"].lower() == "object in context":
                 self.mapped_data.update({"isShownAt": textnode(link)})

Example #18

0

Show file

File: tn_mapper.py Project: mlhale7/ingestion

    def map_title(self):
        """Collect titles from the MODS titleInfo entries.

        Only entries that have a "title" without a "title/type" are used.
        Nothing is written when no title qualifies.
        """
        title_info_path = "/metadata/mods/titleInfo"
        if not exists(self.provider_data, title_info_path):
            return

        collected = [
            textnode(getprop(info, "title"))
            for info in iterify(getprop(self.provider_data, title_info_path))
            if exists(info, "title") and not exists(info, "title/type")
        ]
        if collected:
            self.update_source_resource({"title": collected})

Example #19

0

Show file

File: tn_mapper.py Project: dpla/ingestion

    def map_title(self):
        """Set the source resource "title" from MODS titleInfo.

        A titleInfo entry contributes its title text when it has a "title"
        and that title carries no "type". An empty result is not written.
        """
        mods_title_info = "/metadata/mods/titleInfo"
        if not exists(self.provider_data, mods_title_info):
            return

        found = []
        for entry in iterify(getprop(self.provider_data, mods_title_info)):
            if not exists(entry, "title"):
                continue
            if exists(entry, "title/type"):
                continue
            found.append(textnode(getprop(entry, "title")))

        if found:
            self.update_source_resource({"title": found})

Example #20

0

Show file

File: enrich-format.py Project: eldios/ingestion

def enrichformat(body,ctype,action="enrich-format",prop="isShownAt/format",alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and enriches the "format" field of that document
    by:

    a) setting the format to be all lowercase
    b) running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) checking to see if the field is a valid IMT, and moving it to a separate field if not
       See http://www.iana.org/assignments/media-types for list of valid media-types. We do not
       require that a subtype be defined.
    d) Remove any extra text after the IMT

    The field to work on is given by the 'prop' parameter. Non-IMT's are moved to the field
    defined by the 'alternate' parameter.

    Returns the document serialized as JSON, or a plain-text error message (response code 500)
    when body is not valid JSON.
    """

    REGEXPS = ('image/jpg','image/jpeg'),('image/jp$', 'image/jpeg'), ('img/jpg', 'image/jpeg'), (r'\W$','')
    IMT_TYPES = ['application','audio','image','message','model','multipart','text','video']
    # Compiled once per request rather than once per is_imt() call
    imt_regexes = [re.compile('^' + x + r'(/|\Z)') for x in IMT_TYPES]

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            # Drop anything after the first whitespace following the type
            s = re.sub(r"^([a-z0-9/]+)\s.*",r"\1",s)
        return s

    def is_imt(s):
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    if exists(data,prop):
        v = getprop(data,prop)
        formats = []
        physicalFormat = getprop(data,alternate) if exists(data,alternate) else []
        if not isinstance(physicalFormat,list):
            physicalFormat = [physicalFormat]

        for s in ([v] if isinstance(v,basestring) else v):
            cleaned = cleanup(s)
            if is_imt(cleaned):
                formats.append(cleaned)
            else:
                # The original, uncleaned value is what gets moved
                physicalFormat.append(s)

        # A single value is stored as a scalar, several values as a list
        if formats:
            setprop(data,prop,formats[0] if len(formats) == 1 else formats)
        else:
            setprop(data,prop,None)
        if physicalFormat:
            setprop(data,alternate,physicalFormat[0] if len(physicalFormat) == 1 else physicalFormat)

    return json.dumps(data)

Example #21

0

Show file

File: copy_prop.py Project: calisphere-legacy-harvester/dpla-ingestion

def copyprop(body, ctype, prop=None, to_prop=None, skip_if_exists=None):
    """Copies value in one prop to another prop. For use with string and/or
       list prop value types. If to_prop exists, its value is iterified then
       extended with the iterified value of prop. If the to_prop parent prop
       (ie hasView in hasView/rights) does not exist, the from_prop value is
       not copied and an error is logged.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    skip_if_exists -- set to True to not copy if to_prop exists

    Returns the document serialized as JSON. The unmodified body is returned
    when either value is neither a string nor a list; a plain-text error
    message (response code 500) is returned when body is not valid JSON.
    """

    def is_string_or_list(value):
        return isinstance(value, (basestring, list))

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    skip = exists(data, to_prop) and skip_if_exists
    if not skip and exists(data, prop):
        if exists(data, to_prop):
            from_value = getprop(data, prop)
            if not is_string_or_list(from_value):
                # .get(): a record without "id" must not turn the error
                # report itself into a KeyError
                msg = "Prop %s " % prop + \
                      "is not a string/list for record %s" % data.get("id")
                logger.error(msg)
                return body

            to_value = getprop(data, to_prop)
            if not is_string_or_list(to_value):
                msg = "Prop %s " % to_prop + \
                      "is not a string/list for record %s" % data.get("id")
                logger.error(msg)
                return body

            to_value = iterify(to_value)
            to_value.extend(iterify(from_value))
            setprop(data, to_prop, to_value)
        else:
            try:
                setprop(data, to_prop, getprop(data, prop))
            except Exception as e:
                logger.error("Could not copy %s to %s: %s" %
                             (prop, to_prop, e))

    # Every path that did not bail out above hands back the document
    return json.dumps(data)

Example #22

0

Show file

    def map_is_part_of(self):
        """Map the titles of related items to "relation".

        Related items are read from root_key + "relatedItem"; only items
        with a "titleInfo/title" contribute. The resulting dict is passed
        through clean_dict before the source resource is updated. Nothing
        happens when the provider data has no related items.
        """
        related_path = self.root_key + "relatedItem"
        if not exists(self.provider_data, related_path):
            return

        title_path = "titleInfo/title"
        relations = [
            getprop(item, title_path)
            for item in iterify(getprop(self.provider_data, related_path))
            if exists(item, title_path)
        ]
        self.update_source_resource(self.clean_dict({"relation": relations}))

Example #23

0

Show file

File: bhl_mods.py Project: dpla/ingestion

    def map_is_part_of(self):
        """Collect related item titles into the "relation" field.

        Looks up root_key + "relatedItem" in the provider data and gathers
        the "titleInfo/title" of every related item that has one. The
        source resource is updated with the clean_dict'ed result whenever
        the related item property exists.
        """
        items_path = self.root_key + "relatedItem"
        mapped = {"relation": []}

        if not exists(self.provider_data, items_path):
            return

        for related in iterify(getprop(self.provider_data, items_path)):
            if exists(related, "titleInfo/title"):
                mapped["relation"].append(
                    getprop(related, "titleInfo/title"))

        self.update_source_resource(self.clean_dict(mapped))

Example #24

0

Show file

File: tn_mapper.py Project: mlhale7/ingestion

 def map_is_shown_at(self):
     """Store the primary "object in context" URL under "isShownAt".

     All URLs of all MODS locations are inspected. A URL is used when its
     "usage" starts with "primary" and its "access" is "object in context"
     (case-insensitive); later matches overwrite earlier ones.
     """
     mods_location = "/metadata/mods/location"
     if not exists(self.provider_data, mods_location):
         return

     for entry in iterify(getprop(self.provider_data, mods_location)):
         if not exists(entry, "url"):
             continue
         for candidate in iterify(getprop(entry, "url")):
             if not exists(candidate, "usage"):
                 continue
             if not exists(candidate, "access"):
                 continue
             if not candidate["usage"].lower().startswith("primary"):
                 continue
             if candidate["access"].lower() == "object in context":
                 self.mapped_data.update(
                     {"isShownAt": textnode(candidate)})

Example #25

0

Show file

File: primo-to-dpla.py Project: chadfennell/ingestion

def primotodpla(body,ctype,geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Returns the converted document serialized as JSON. When body cannot be
    parsed, the response code is set to 500 and a plain-text error message is
    returned instead.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Not a bare "except:", so SystemExit/KeyboardInterrupt still propagate
        response.code = 500
        response.add_header("content-type","text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document: the provider is read
    # from the base64-encoded JSON in the Contributor header, when present
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: "+request.environ["HTTP_CONTRIBUTOR"]+"---"+repr(e))

    # Strip out top-level keys with falsy values (None, empty containers, ...)
    out = dict((k,v) for (k,v) in out.items() if v)

    return json.dumps(out)

Example #26

0

Show file

def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Returns the converted document serialized as JSON. When body cannot be
    parsed, the response code is set to 500 and a plain-text error message is
    returned instead.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Not a bare "except:", so SystemExit/KeyboardInterrupt still propagate
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(
        multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document: the provider is read
    # from the base64-encoded JSON in the Contributor header, when present
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out top-level keys with falsy values (None, empty containers, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

Example #27

0

Show file

def arctodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified ARC (NARA) format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords

    Returns the converted document serialized as JSON. When body cannot be
    parsed, the response code is set to 500 and a plain-text error message is
    returned instead.
    """

    try:
        data = json.loads(body)
    except Exception:
        # Not a bare "except:", so SystemExit/KeyboardInterrupt still propagate
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document; rules are
    # looked up by the document's top-level keys
    for p in data:
        if p in CHO_TRANSFORMER:
            out["sourceResource"].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Apply transformations that are dependent on more than one
    # original document  field
    out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(rights_transform(data))
    out["sourceResource"].update(subject_and_spatial_transform(data))
    out.update(has_view_transform(data))
    out["sourceResource"].update(transform_state_located_in(data))

    if exists(out, "sourceResource/date"):
        logger.debug("OUTTYPE: %s" % getprop(out, "sourceResource/date"))

    if exists(data, "objects/object"):
        out.update(transform_thumbnail(data))

    # Additional content not from original document: the provider is read
    # from the base64-encoded JSON in the Contributor header, when present
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out top-level keys with falsy values (None, empty containers, ...)
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

Example #28

0

Show file

File: enrich-type.py Project: eldios/ingestion

def enrichtype(body, ctype, action="enrich-type",
               prop="aggregatedCHO/type",
               alternate="aggregatedCHO/physicalMedium"):
    """
    Service that accepts a JSON document and normalizes its "type" field.

    Each value found at prop is lowercased, stripped, and has "images" and
    "still image" rewritten to "image".  Values whose cleaned form is a DC
    type (http://dublincore.org/documents/resource-typelist/) are kept at
    prop in cleaned form; every other value is appended, unmodified, to the
    field named by alternate.

    A field left with exactly one value is stored as a scalar, otherwise as
    a list.  prop is set to None when no value qualifies as a DC type.

    Returns the document as JSON, or a plain-text error message (response
    code 500) when body is not valid JSON.
    """
    replacements = (('images', 'image'), ('still image', 'image'))
    dc_types = ('collection', 'dataset', 'event', 'image', 'still image',
                'interactive resource', 'model', 'party', 'physical object',
                'place', 'service', 'software', 'sound', 'text')

    def cleanup(s):
        cleaned = s.lower().strip()
        for pattern, replace in replacements:
            cleaned = re.sub(pattern, replace, cleaned)
        return cleaned

    def store(path, values):
        # A lone value is written as a scalar rather than a one-item list.
        setprop(data, path, values[0] if len(values) == 1 else values)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        candidates = [v] if isinstance(v, basestring) else v

        if exists(data, alternate):
            physical_format = getprop(data, alternate)
        else:
            physical_format = []
        if not isinstance(physical_format, list):
            physical_format = [physical_format]

        dctype = []
        for raw in candidates:
            cleaned = cleanup(raw)
            if cleaned in dc_types:
                dctype.append(cleaned)
            else:
                physical_format.append(raw)

        if dctype:
            store(prop, dctype)
        else:
            setprop(data, prop, None)
        if physical_format:
            store(alternate, physical_format)

    return json.dumps(data)

Example #29

0

Show file

File: tn_mapper.py Project: dpla/ingestion

    def map_type(self):
        """
        Map the MODS typeOfResource to sourceResource "type".

        When the record has no typeOfResource, physicalDescription/form is
        read instead.  Nothing is mapped when neither path exists or when
        no values are produced.
        """
        type_path = "/metadata/mods/typeOfResource"
        if not exists(self.provider_data, type_path):
            type_path = "/metadata/mods/physicalDescription/form"

        if not exists(self.provider_data, type_path):
            return

        values = iterify(getprop(self.provider_data, type_path))
        types = [textnode(value) for value in values]
        if types:
            self.update_source_resource({"type": types})

Example #30

0

Show file

File: tn_mapper.py Project: mlhale7/ingestion

    def name_part(self, role_type):
        """
        Collect the namePart of every MODS name carrying the given role.

        role_type -- value compared against each role's "roleTerm/#text"

        Returns a list holding one namePart entry per matching role (a name
        that lists the same role twice contributes twice).  The list is
        empty when the record has no "/metadata/mods/name" property.
        """
        prop = "/metadata/mods/name"
        results = []
        if not exists(self.provider_data, prop):
            return results

        role_prop = "roleTerm/#text"
        # The name property is wrapped with iterify(), as the role loop
        # below already does for roles, so that a record holding a single
        # name (one dict rather than a list of dicts) is processed as a
        # name instead of being walked key by key.
        for name in iterify(getprop(self.provider_data, prop)):
            if "role" not in name or "namePart" not in name:
                continue
            for role in iterify(name["role"]):
                if exists(role, role_prop) \
                        and getprop(role, role_prop) == role_type:
                    results.append(name["namePart"])

        return results

Example #31

0

Show file

File: tn_mapper.py Project: mlhale7/ingestion

    def map_type(self):
        """
        Map the record's resource type to sourceResource "type".

        Reads typeOfResource, or physicalDescription/form when the former
        is absent; does nothing when neither exists or no values result.
        """
        source_path = "/metadata/mods/typeOfResource"
        fallback_path = "/metadata/mods/physicalDescription/form"

        if not exists(self.provider_data, source_path):
            source_path = fallback_path

        if not exists(self.provider_data, source_path):
            return

        types = [textnode(node)
                 for node in iterify(getprop(self.provider_data, source_path))]
        if types:
            self.update_source_resource({"type": types})

Example #32

0

Show file

File: tn_mapper.py Project: dpla/ingestion

    def name_part(self, role_type):
        """
        Return the namePart values of MODS names that have the given role.

        role_type -- value compared against each role's "roleTerm/#text"

        One namePart entry is appended per matching role.  An empty list is
        returned when "/metadata/mods/name" is absent from the record.
        """
        prop = "/metadata/mods/name"
        results = []
        if not exists(self.provider_data, prop):
            return results

        role_prop = "roleTerm/#text"
        # iterify() is applied to the names just as it is applied to the
        # roles below: a record with only one name stores a dict here, and
        # iterating that dict directly would visit its keys, not the name.
        for name in iterify(getprop(self.provider_data, prop)):
            if "role" not in name or "namePart" not in name:
                continue
            for role in iterify(name["role"]):
                if exists(role, role_prop) \
                        and getprop(role, role_prop) == role_type:
                    results.append(name["namePart"])

        return results

Example #33

0

Show file

File: harvard_enrich_location.py Project: chadfennell/ingestion

def harvard_enrich_location(body,
                            ctype,
                            action="harvard_enrich_location",
                            prop="sourceResource/spatial"):
    """
    Service that accepts a Harvard JSON document and rebuilds the "spatial"
    field from the originalRecord place elements, translating any MARC
    country code found there into a country name for better geocoding.

    body -- the document as a JSON string
    prop -- path of the field that receives the resulting one-item list

    The spatial name is made of the non-authority place terms followed by
    the translated country names.  "country" is added to the result only
    when a country was resolved and its code has two characters or starts
    with "xx".

    Returns the document as JSON, or a plain-text error message (response
    code 500) when body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    place_prop = "originalRecord/metadata/mods/originInfo/place"
    if exists(data, place_prop):
        places = getprop(data, place_prop)
        country = ""
        countryCode = ""
        name = ""

        # Add non-country terms
        for place in iterify(places):
            logger.info("place: %s" % place)
            placeTerm = getprop(place, "placeTerm", True)
            if isinstance(placeTerm, basestring):
                name += " " + placeTerm
            elif not exists(placeTerm, "authority"):
                # getprop(..., True) yields None for a missing key; only
                # concatenate real text so a placeTerm without "#text"
                # does not raise TypeError.
                text = getprop(placeTerm, "#text", True)
                if isinstance(text, basestring):
                    name += " " + text

        # Add country
        for place in iterify(places):
            placeTerm = getprop(place, "placeTerm", True)
            if exists(placeTerm, "authority") \
                    and "marccountry" == getprop(placeTerm, "authority", True):
                countryCode = getprop(placeTerm, "#text", True)
                country = get_country_from_marccode(countryCode)
                if country:
                    name += ", " + country

        # Square brackets are dropped from the assembled name.
        spatial = {"name": re.sub(r"[\[\]]", "", name.strip(", "))}
        if country \
                and (2 == len(countryCode) or countryCode.startswith("xx")):
            spatial["country"] = country

        setprop(data, prop, [spatial])

    return json.dumps(data)

Example #34

0

Show file

File: move_dates_to_temporal.py Project: eldios/ingestion

def movedatestotemporal(body, ctype, action="move_dates_to_temporal",
                        prop=None):
    """
    Service that accepts a JSON document and moves date-like text found in
    the "name" of each entry of the prop field into the temporal field.

    Three patterns are tried in order against each entry's name: a
    three-part date, a four-digit year range, and a single four-digit year
    (each optionally wrapped in parentheses).  Matched text is removed from
    the name and appended to "aggregatedCHO/temporal" with the parentheses
    stripped.  Entries whose name becomes blank are dropped; prop is deleted
    when no entry remains.

    Returns body unchanged when prop is not supplied, otherwise the document
    as JSON (or a plain-text error message with response code 500 when body
    is not valid JSON).
    """
    if not prop:
        logger.error("No prop supplied")
        return body

    paren_subs = ((r"\(", ""), (r"\)", ""))
    date_patterns = [re.compile(expr) for expr in (
        r"(\( *)?(\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4})( *\))?",
        r"(\( *)?(\d{4} *[-/] *\d{4})( *\))?",
        r"(\( *)?(\d{4})( *\))?",
    )]

    def cleanup(s):
        for pattern, replacement in paren_subs:
            s = re.sub(pattern, replacement, s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        temporal_field = "aggregatedCHO/temporal"
        if exists(data, temporal_field):
            temporal = getprop(data, temporal_field)
        else:
            temporal = []

        kept = []
        for entry in getprop(data, prop):
            for pattern in date_patterns:
                # findall() sees the name as left by the previous pattern.
                for groups in pattern.findall(entry["name"]):
                    found = "".join(groups)
                    # The optional groups let a pattern match nothing,
                    # which shows up here as an empty string.
                    if not found:
                        continue
                    entry["name"] = re.sub(re.escape(found), "",
                                           entry["name"])
                    temporal.append({"name": cleanup(found)})
            if entry["name"].strip():
                kept.append(entry)

        if temporal:
            setprop(data, temporal_field, temporal)
        if kept:
            setprop(data, prop, kept)
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #35

0

Show file

def edantodpla(body, ctype, geoprop=None):
    """
    Map a JSON-ified EDAN (Smithsonian) record onto the DPLA JSON-LD
    structure.

    body    -- the EDAN record as a JSON string
    ctype   -- content type of the request body (not read by this function)
    geoprop -- name of the property holding lat/long coordinates; it is
               published through the module-level GEOPROP

    Returns the mapped record serialized as JSON.  When body cannot be
    parsed, the response code is set to 500 and a plain-text message is
    returned instead.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Path-keyed rules: a rule runs only when its path exists in the record.
    for path, transform in CHO_TRANSFORMER.items():
        if exists(data, path):
            out["sourceResource"].update(transform(data))
    for path, transform in AGGREGATION_TRANSFORMER.items():
        if exists(data, path):
            out.update(transform(data))

    # Rules that read more than one field of the record.
    # (type_transform is currently disabled here.)
    for transform in (transform_rights, transform_subject, transform_spatial):
        out["sourceResource"].update(transform(data))

    for transform in (transform_is_shown_at, transform_object,
                      transform_data_provider):
        out.update(transform(data))

    # The provider comes from the request, not from the record itself.
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Keep only top-level keys whose value is truthy.
    out = {key: value for key, value in out.items() if value}

    return json.dumps(out)

Example #36

0

Show file

File: arc-to-dpla.py Project: amber-reichert/ingestion

def type_transform(d):
    """
    Build the "type" value from the record's general-records-types and from
    the media types of its physical occurrences.

    d -- the source record as a dict

    Returns {"type": <values joined with "; ">}, or {} when no type value
    was found.
    """
    types = []

    if "general-records-types" in d:
        types = arc_group_extraction(d, "general-records-types",
                                     "general-records-type",
                                     "general-records-type-desc")
    if exists(d, "physical-occurrences/physical-occurrence"):
        phys_occur = getprop(d, "physical-occurrences/physical-occurrence")
        # A record with one physical occurrence may carry a single dict
        # here instead of a list; wrap it so the loop visits the occurrence
        # itself rather than the dict's keys.
        if not isinstance(phys_occur, list):
            phys_occur = [phys_occur]
        type_key = "media-occurrences/media-occurrence/media-type"
        for occurrence in phys_occur:
            if exists(occurrence, type_key):
                types.append(getprop(occurrence, type_key))

    return {"type": "; ".join(types)} if types else {}

Example #37

0

Show file

File: nara_mapper.py Project: mlhale7/ingestion

    def map_type(self):
        """
        Map general-records-type descriptions plus the media type of each
        physical occurrence to sourceResource "type", joined with "; ".

        Nothing is mapped when no type value is collected.
        """
        types = self.extract_xml_items("general-records-types",
                                       "general-records-type",
                                       "general-records-type-desc")

        occurrences_prop = "physical-occurrences/physical-occurrence"
        if exists(self.provider_data, occurrences_prop):
            media_type_key = "media-occurrences/media-occurrence/media-type"
            occurrences = iterify(getprop(self.provider_data,
                                          occurrences_prop))
            for occurrence in occurrences:
                if not exists(occurrence, media_type_key):
                    continue
                types.append(getprop(occurrence, media_type_key))

        if not types:
            return
        self.update_source_resource({"type": "; ".join(types)})

Example #38

0

Show file

File: edan_select_id.py Project: mlhale7/ingestion

def selid(body,
          ctype,
          prop='descriptiveNonRepeating/record_link',
          alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter.

    body             -- the document as a JSON string
    prop             -- path of the preferred id source
    alternative_prop -- path used when prop holds no value; its value is
                        inserted into a collections.si.edu search URL

    When the chosen value is not a string, the last element accepted by
    is_absolute() is used, falling back to the first element.

    Returns the document as JSON.  Returns a plain-text message with
    response code 500 when body is not valid JSON or no id value is found.
    When prop is empty the error is logged and body is returned unchanged.
    '''
    if not prop:
        # Nothing was parsed on this path, so hand the input back instead
        # of serializing an unbound variable.
        logger.error("Prop param in None in %s" % __name__)
        return body

    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None

    if exists(data, prop) or exists(data, alternative_prop):
        v = getprop(data, prop, True)
        if not v:
            # prop may exist with an empty value while alternative_prop is
            # absent; the third argument asks getprop for None instead of
            # an error in that case (assumed to be the same keyErrorAsNone
            # flag used for prop above -- confirm against getprop).
            alternative = getprop(data, alternative_prop, True)
            v = (tmpl % alternative) if alternative is not None else None
        if isinstance(v, basestring):
            record_id = v
        elif v:
            for h in v:
                if is_absolute(h):
                    record_id = h
            if not record_id:
                record_id = v[0]

    if not record_id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)

Example #39

0

Show file

File: kentucky_identify_object.py Project: eldios/ingestion

def kentucky_identify_object(body, ctype, rights_field="aggregatedCHO/rights", download="True"):
    """
    Add an "object" entry holding the URL where the thumbnail is expected,
    and record the object status under "admin".

    The thumbnail URL is the "aggregatedCHO/relation" value with "_tb"
    inserted before its extension.  The object's rights are read from
    rights_field when present.  The status is PENDING when download is the
    string "True" and IGNORE otherwise.

    Returns the document as JSON.  Returns body unchanged when the relation
    field is missing, and an error message (response code 500) when body is
    not valid JSON.
    """

    # Debug switch and helper; neither is used further down.
    LOG_JSON_ON_ERROR = True

    def log_json():
        if LOG_JSON_ON_ERROR:
            logger.debug(body)

    try:
        data = json.loads(body)
    except Exception as e:
        msg = "Bad JSON: " + e.args[0]
        logger.error(msg)
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return msg

    relation_field = "aggregatedCHO/relation"
    if not exists(data, relation_field):
        msg = "Field %s does not exist" % relation_field
        logger.error(msg)
        return body

    url = getprop(data, relation_field)
    base_url, ext = os.path.splitext(url)
    thumb_url = "%s_tb%s" % (base_url, ext)

    if exists(data, rights_field):
        rights = getprop(data, rights_field)
    else:
        rights = None

    data["object"] = {"@id": thumb_url, "format": "", "rights": rights}

    status = PENDING if download == "True" else IGNORE
    data.setdefault("admin", {})["object_status"] = status

    return json.dumps(data)

Example #40

0

Show file

    def map_rights(self):
        """
        Map the record's rights.

        A "rightsStatementURI" value is written to the top-level mapped
        data.  Only when that property is absent is "tmp_rights_statement"
        consulted, and its value goes to the source resource instead.
        """
        edm_rights_prop = "rightsStatementURI"
        tmp_rights_prop = "tmp_rights_statement"

        if exists(self.provider_data, edm_rights_prop):
            edm_rights = getprop(self.provider_data, edm_rights_prop)
            self.mapped_data.update({"rights": edm_rights})
        elif exists(self.provider_data, tmp_rights_prop):
            tmp_rights = getprop(self.provider_data, tmp_rights_prop)
            self.update_source_resource({"rights": tmp_rights})

Example #41

0

Show file

def type_transform(d):
    """
    Derive the "type" value from general-records-types and from the media
    type of each physical occurrence in the record.

    d -- the source record as a dict

    Returns {"type": <values joined with "; ">}, or {} when nothing was
    collected.
    """
    types = []

    if "general-records-types" in d:
        types = arc_group_extraction(d, "general-records-types",
                                     "general-records-type",
                                     "general-records-type-desc")
    if exists(d, "physical-occurrences/physical-occurrence"):
        phys_occur = getprop(d, "physical-occurrences/physical-occurrence")
        # The property may hold one dict rather than a list when the record
        # has a single physical occurrence; normalize to a list so the loop
        # does not iterate over that dict's keys.
        if not isinstance(phys_occur, list):
            phys_occur = [phys_occur]
        type_key = "media-occurrences/media-occurrence/media-type"
        for occurrence in phys_occur:
            if exists(occurrence, type_key):
                types.append(getprop(occurrence, type_key))

    return {"type": "; ".join(types)} if types else {}

Example #42

0

Show file

File: dedup_value.py Project: amber-reichert/ingestion

def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and removes duplicate entries from
    the list-valued fields named by prop.

    body -- the document as a JSON string
    prop -- comma-separated field paths; fields that are missing or do not
            hold a list are left untouched

    Two entries are duplicates when they are equal after lowercasing and
    removing spaces, periods, parentheses, brackets and braces.  The first
    occurrence is kept and the original order is preserved.

    Returns the document as JSON (unchanged when prop is empty), or a
    plain-text error message with response code 500 when body is not valid
    JSON.
    '''
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not prop:
        # No field was named, so there is nothing to de-duplicate.
        return json.dumps(data)

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                seen = set()
                unique = []
                for s in v:
                    # Comparison key: whitespace, periods, parens, brackets
                    # and braces removed, case folded.
                    key = re.sub(r"[ \.\(\)\[\]\{\}]", "", s).lower()
                    if key not in seen:
                        seen.add(key)
                        unique.append(s)

                setprop(data, p, unique)

    return json.dumps(data)

Example #43

0

Show file

File: uva_mapper.py Project: dpla/ingestion

    def map_creator(self):
        """
        Map the record's "name" entries to sourceResource "creator".

        Each name is flattened to "<family>, <given>, <date>" (empty parts
        omitted).  A plain-string namePart fills the first slot; a
        "termsOfAddress" part shares the second slot with "given".  Distinct
        personal names are preferred; corporate names are used only when no
        personal name was found.
        """
        prop = "name"
        if not exists(self.provider_data, prop):
            return

        personal = []
        corporate = []
        for entry in iterify(getprop(self.provider_data, prop)):
            parts = [None, None, None]
            for part in iterify(entry.get("namePart")):
                if isinstance(part, basestring):
                    parts[0] = part
                    continue
                if not isinstance(part, dict):
                    continue
                part_type = part.get("type")
                if part_type == "family":
                    parts[0] = part.get("#text")
                elif part_type in ("given", "termsOfAddress"):
                    parts[1] = part.get("#text")
                elif part_type == "date":
                    parts[2] = part.get("#text")

            creator = ", ".join(piece for piece in parts if piece)

            name_type = entry.get("type")
            if name_type == "personal":
                if creator not in personal:
                    personal.append(creator)
            elif name_type == "corporate":
                if creator not in corporate:
                    corporate.append(creator)

        creators = personal or corporate
        if creators:
            self.update_source_resource({"creator": creators})

Example #44

0

Show file

File: edan_mapper.py Project: marktriggs/ingestion

    def map_is_shown_at(self):
        """
        Set "isShownAt" to the collections.si.edu search URL for the
        record's descriptiveNonRepeating/record_ID, when that id exists.
        """
        prop = "descriptiveNonRepeating/record_ID"
        if not exists(self.provider_data, prop):
            return

        url_template = ("http://collections.si.edu/search/results.htm?"
                        "q=record_ID%%3A%s&repo=DPLA")
        record_id = getprop(self.provider_data, prop)
        self.mapped_data.update({"isShownAt": url_template % record_id})

Example #45

0

Show file

File: scdl_enrich_location.py Project: peterkingalex/ingestion

def scdl_enrich_location(body, ctype, action="scdl_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document.  For use with the scdl profiles.

    For every spatial entry the name is right-stripped and passed through
    replace_state_abbreviations().  If the name contains " county " (any
    case), the text before the first "county" becomes the entry's "county";
    otherwise, if it contains "(S.C.)", the text before that marker becomes
    the entry's "city".

    Returns the document as JSON, or a plain-text error message (response
    code 500) when body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        for place in iterify(getprop(data, prop)):
            name = replace_state_abbreviations(place["name"].rstrip())
            place["name"] = name

            lowered = name.lower()
            if " county " in lowered:
                # "XXX County (S.C.)" => county: XXX
                place["county"] = name[:lowered.index("county")].strip()
            elif "(S.C.)" in name:
                # "XXX (S.C.)" => city: XXX
                place["city"] = name[:name.index("(S.C.)")].strip()

    return json.dumps(data)

Example #46

0

Show file

File: uva_mapper.py Project: dpla/ingestion

    def map_is_show_at_object_has_view_and_dataprovider(self):
        """
        Map the record's "location" entries to isShownAt, object, hasView
        and dataProvider.

        URLs are routed by their "access" value: "object in context" gives
        isShownAt, "preview" gives object, and "raw object" gives hasView
        (paired with the first internetMediaType found under
        physicalDescription).  A string physicalLocation becomes the
        dataProvider.  An exception raised while walking the locations is
        logged, and whatever was collected up to that point is still mapped.
        """
        def _get_media_type(d):
            for description in iterify(getprop(d, "physicalDescription")):
                if exists(description, "internetMediaType"):
                    return getprop(description, "internetMediaType")

        prop = "location"
        if not exists(self.provider_data, prop):
            return

        locations = iterify(getprop(self.provider_data, prop))
        media_type = _get_media_type(self.provider_data)
        mapped = {}
        try:
            for location in locations:
                if "url" in location:
                    for url in location["url"]:
                        if not url or "access" not in url:
                            continue
                        access = url["access"]
                        if access == "object in context":
                            mapped["isShownAt"] = url.get("#text")
                        elif access == "preview":
                            mapped["object"] = url.get("#text")
                        elif access == "raw object":
                            mapped["hasView"] = {"@id": url.get("#text"),
                                                 "format": media_type}
                if ("physicalLocation" in location and
                        isinstance(location["physicalLocation"], basestring)):
                    mapped["dataProvider"] = location["physicalLocation"]
        except Exception as e:
            logger.error(e)

        if mapped:
            self.mapped_data.update(mapped)

Example #47

0

Show file

File: michigan_mapper.py Project: dpla/ingestion

    def map_date(self):
        """
        Map the originInfo dates to sourceResource "date".

        dateCreated values take precedence over dateIssued values.  When
        neither is present but dateIssued entries supplied both a "start"
        and an "end" point, the two are combined into "<start>-<end>".
        A failure while reading a dateIssued entry is logged and that entry
        is skipped.
        """
        originInfoPath = self.root_key + "originInfo"
        dateCreated = []
        dateIssued = []
        date_begin, date_end = None, None

        if exists(self.provider_data, originInfoPath):
            for date in iterify(getprop(self.provider_data, originInfoPath)):
                if "dateCreated" in date:
                    dateCreated.append(textnode(date["dateCreated"]))

                if "dateIssued" in date:
                    t = date["dateIssued"]
                    try:
                        if "point" not in t:
                            dateIssued.append(textnode(t))
                        elif t["point"] == "start":
                            date_begin = textnode(t)
                        elif t["point"] == "end":
                            date_end = textnode(t)
                    except Exception as e:
                        # The two placeholders need a two-item tuple;
                        # applying "%" between the record and the message
                        # would itself raise inside this handler.
                        logger.error("Exception when trying to map date "
                                     "values. for record %s \n\n%s" %
                                     (self.provider_data, e.message))

        # If there are no dateIssued or dateCreated properties then construct
        # a date range from begin and end points (if they exist).
        if date_begin and date_end and not dateCreated and not dateIssued:
            dateIssued.append(date_begin + "-" + date_end)

        if dateCreated:
            self.update_source_resource({"date": dateCreated})
        elif dateIssued:
            self.update_source_resource({"date": dateIssued})

Example #48

0

Show file

File: artstor_cleanup_creator.py Project: amber-reichert/ingestion

def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the field named by prop
    (sourceResource/creator by default) by removing each CLEANUP pattern
    from the start of every value, ignoring case.

    A field left with exactly one value is stored as a scalar, otherwise as
    a list.

    Returns the document as JSON, or a plain-text error message (response
    code 500) when body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        creators = getprop(data, prop)
        if not isinstance(creators, list):
            creators = [creators]

        for idx, value in enumerate(creators):
            for prefix in CLEANUP:
                anchored = r"(?i)^{0}".format(prefix)
                value = re.sub(anchored, "", value.strip()).lstrip()
                creators[idx] = value

        if len(creators) == 1:
            setprop(data, prop, creators[0])
        else:
            setprop(data, prop, creators)

    return json.dumps(data)

Example #49

0

Show file

def capitalize(data, prop):
    """
    Uppercase the first letter of the value found at the property path.

    A string value is replaced by its capitalized form; in a list value
    every string element is capitalized and other elements are kept as they
    are.  Missing, empty, or other kinds of values are left alone.
    Modifies the given dictionary (data argument) in place.
    """
    def str_capitalize(s):
        """
        Uppercase only the first character of s.
        Unlike "aaa".capitalize(), the remaining characters are not
        lowercased.
        """
        return s[0].upper() + s[1:] if s else s

    if not exists(data, prop):
        return

    v = getprop(data, prop, keyErrorAsNone=True)
    if not v:
        return

    if isinstance(v, basestring):
        setprop(data, prop, str_capitalize(v))
    elif isinstance(v, list):
        setprop(data, prop,
                [str_capitalize(s) if isinstance(s, basestring) else s
                 for s in v])

Example #50

0

Show file

def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that
    document by removing duplicate array elements.

    body -- the document as a JSON string
    prop -- comma-separated field paths; fields that are missing or do not
            hold a list are left untouched

    Elements are compared through _stripped(); elements whose stripped form
    is empty are dropped.  The first occurrence of each remaining element is
    kept, in its original order.

    Returns the document as JSON (unchanged when prop is empty), or a
    plain-text error message with response code 500 when body is not valid
    JSON.
    '''
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not prop:
        # No field was named, so there is nothing to de-duplicate.
        return json.dumps(data)

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Walk the original list directly so each kept element is
                # the one its comparison key was computed from.
                seen = set()
                unique = []
                for s in v:
                    key = _stripped(s)
                    if key and key not in seen:
                        seen.add(key)
                        unique.append(s)
                setprop(data, p, unique)

    return json.dumps(data)

Example #51

0

Show file

File: dedup_value.py Project: chadfennell/ingestion

def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and removes duplicates from the
    list-valued fields named by prop (comma-separated paths).

    Entries are compared after lowercasing and removing spaces, periods and
    parentheses; the first occurrence of each entry is the one kept.

    Returns the document as JSON.  Returns a plain-text message with
    response code 500 when prop is None or body is not valid JSON.
    '''
    if prop is None:
        msg = "Prop param is None"
        response.code = 500
        response.add_header('content-type', 'text/plain')
        logger.error(msg)
        return msg

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if not exists(data, p):
            continue
        v = getprop(data, p)
        if not isinstance(v, list):
            continue

        # Comparison keys: whitespace, periods and parens removed.
        normalized = [re.sub(r"[ \.\(\)]", "", s).lower() for s in v]
        # Position of the first occurrence of every distinct key.
        first_positions = set([normalized.index(key)
                               for key in set(normalized)])

        setprop(data, p, [v[i] for i in first_positions])

    return json.dumps(data)

Example #52

0

Show file

File: pa_mapper.py Project: mlhale7/ingestion

 def map_contributor(self):
     """
     Split the record's "contributor" values: the last one is written to
     "dataProvider", and any values before it become the source resource's
     "contributor".
     """
     prop = "contributor"
     if not exists(self.provider_data, prop):
         return

     contributors = iterify(self.provider_data.get(prop))
     data_provider = contributors[-1]
     others = contributors[:-1]

     setprop(self.mapped_data, "dataProvider", data_provider)
     if others:
         self.update_source_resource({"contributor": others})

Example #53

0

Show file

File: tn_mapper.py Project: dpla/ingestion

    def map_date(self):
        """
        Map the text of originInfo/dateCreated to sourceResource "date".

        Nothing is mapped when the path is missing or its value is empty.
        """
        path = "/metadata/mods/originInfo/dateCreated/#text"
        if not exists(self.provider_data, path):
            return

        date_created = getprop(self.provider_data, path)
        if not date_created:
            return

        self.update_source_resource({"date": date_created})

Example #54

0

Show file

    def map_relation(self):
        """
        Map relatedItem titles to sourceResource "relation".

        The titles of "host" and "series" items are remembered as the items
        are walked.  After each item, once a host title is known, a value of
        the form "<host>" or "<host>. <series>" is appended.  A single
        resulting value is stored as a scalar, several as a list; nothing is
        mapped when no value was produced.
        """
        prop = self.root_key + "relatedItem"
        if not exists(self.provider_data, prop):
            return

        host = None
        series = None
        relation = []
        for item in iterify(getprop(self.provider_data, prop)):
            title = getprop(item, "titleInfo/title", True)
            if title is not None:
                item_type = item.get("type")
                if item_type == "host":
                    host = title
                elif item_type == "series":
                    series = title

            # Runs for every item, so the same host can be appended more
            # than once.
            if host:
                relation.append((host + ". " + series) if series else host)

        if len(relation) == 1:
            relation = relation[0]

        if relation:
            self.update_source_resource({"relation": relation})

Example #55

0

Show file

File: mwdl_enrich_state_located_in.py Project: mlhale7/ingestion

def mwdlenrichstatelocatedin(body,
                             ctype,
                             action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by splitting its value on ";", replacing every
    piece that has a non-empty STATE_CODES entry with that entry, and
    joining the pieces back together with "; ".

    For primary use with MWDL documents.

    Returns the document as JSON, or a plain-text error message (response
    code 500) when body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        pieces = getprop(data, prop).split(";")
        # Pieces are looked up exactly as split (not stripped); unknown
        # pieces pass through unchanged.
        states = [STATE_CODES.get(piece) or piece for piece in pieces]
        setprop(data, prop, "; ".join(states))

    return json.dumps(data)