Python delprop Examples, dplaingestion.selector.delprop Python Examples

Example #1

0

Show file

File: mwdl_enrich_location.py Project: amber-reichert/ingestion

def mdlenrichlocation(body,ctype,action="mwdl_enrich_location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document. 

    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data,prop):
        spatials = []
        for spatial in iterify(getprop(data,prop)):
            if (is_spatial(spatial)): 
                spatials.append(format_spatial(spatial))

        if (len(spatials) > 0): 
            setprop(data, prop, spatials)
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #2

0

Show file

def geocode_region(spatial):
    setprop(spatial, "coordinates",
            "%s, %s" % REGIONS[getprop(spatial, "name")])
    delprop(spatial, "county")
    setprop(spatial, "state", "South Carolina")
    setprop(spatial, "country", "United States")
    return spatial

Example #3

0

Show file

def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
        ]

    def cleanup(s):
        s = re.sub("[\(\)\.\?]", "",s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        
        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])
            

    return json.dumps(data)

Example #4

0

Show file

File: move_date_values.py Project: amber-reichert/ingestion

def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
        ]

    def cleanup(s):
        s = re.sub("[\(\)\.\?]", "",s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        
        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])
            

    return json.dumps(data)

Example #5

0

Show file

File: mwdl_enrich_location.py Project: mlhale7/ingestion

def mdlenrichlocation(body,
                      ctype,
                      action="mwdl_enrich_location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document. 

    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        spatials = []
        for spatial in iterify(getprop(data, prop)):
            if (is_spatial(spatial)):
                spatials.append(format_spatial(spatial))

        if (len(spatials) > 0):
            setprop(data, prop, spatials)
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #6

0

Show file

 def update_title(self):
     prop = "sourceResource/title"
     title_list = filter(None, getprop(self.mapped_data, prop))
     if title_list:
         title = [" ".join(t) for t in title_list]
         setprop(self.mapped_data, prop, title)
     else:
         delprop(self.mapped_data, prop)

Example #7

0

Show file

File: marc_mapper.py Project: dpla/ingestion

 def update_title(self):
     prop = "sourceResource/title"
     title_list = filter(None, getprop(self.mapped_data, prop))
     if title_list:
         title = [" ".join(t) for t in title_list]
         setprop(self.mapped_data, prop, title)
     else:
         delprop(self.mapped_data, prop)

Example #8

0

Show file

File: remove_object_and_harvest_image_26094.py Project: mredar/harvester

def delete_field_and_queue_image_harvest(doc, field, cdb, enq):
    print 'Delete {} for {}'.format(field, doc['_id'])
    delprop(doc, field, keyErrorAsNone=True)
    cdb.save(doc)
    timeout = 10000
    results = enq.queue_list_of_ids([doc['_id']],
                     timeout,
                     harvest_image_for_doc,
                     )

Example #9

0

Show file

def delete_field_and_queue_image_harvest(doc, field, cdb, enq):
    print 'Delete {} for {}'.format(field, doc['_id'])
    delprop(doc, field, keyErrorAsNone=True)
    cdb.save(doc)
    timeout = 10000
    results = enq.queue_list_of_ids(
        [doc['_id']],
        timeout,
        harvest_image_for_doc,
    )

Example #10

0

Show file

File: set_prop.py Project: dpla/ingestion

def unset_prop(body, ctype, prop=None, condition=None, condition_prop=None):
    """Unsets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to unset
    condition -- the condition to be met (uses prop by default) 
    condition_prop -- the prop(s) to use in the condition (comma-separated if
                      multiple props)
    
    """

    CONDITIONS = {
        "is_digit": lambda v: v[0].isdigit(),
        "mwdl_exclude": lambda v: (v[0] == "collections" or
                                   v[0] == "findingAids"),
        "hathi_exclude": lambda v: "Minnesota Digital Library" in v,
        "finding_aid_title": lambda v: v[0].startswith("Finding Aid"),
        "usc_no_contributor": lambda v: not v[0].get("contributor", False)
    }

    def condition_met(condition_prop, condition):
        values = []
        props = condition_prop.split(",")
        for p in props:
            iterified = iterify(getprop(data, p, True))
            [values.append(i) for i in iterified]

        return CONDITIONS[condition](values)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check if prop exists to avoid key error
    if exists(data, prop):
        if not condition:
            delprop(data, prop)
        else:
            if not condition_prop:
                condition_prop = prop
            try:
                if condition_met(condition_prop, condition):
                    logger.debug("Unsetting prop %s for doc with id %s" % 
                                 (prop, data["_id"]))
                    delprop(data, prop)
            except KeyError:
                logger.error("CONDITIONS does not contain %s" % condition)
                

    return json.dumps(data)

Example #11

0

Show file

File: enrich_date.py Project: dpla/ingestion

def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data Dict - Data for conversion.
    prop Str - Properties dividided with comma.
    earliest Bool - True - the function will set only the earliest date.
    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    for p in prop.split(','):
        dates = []
        if exists(data, p):
            v = getprop(data, p)
            if not isinstance(v, dict):
                if is_year_range_list(v):
                    dates.append( {
                        "begin": v[0],
                        "end": v[-1],
                        "displayDate": "%s-%s" % (v[0], v[-1])
                    })
                else:
                    for s in (v if not isinstance(v, basestring) else [v]):
                        for part in s.split(";"):
                            display_date = remove_single_brackets_and_strip(
                                            part
                                            )
                            stripped = clean_date(
                                        remove_all_brackets_and_strip(part)
                                        )
                            if len(stripped) < 4:
                                continue
                            a, b = parse_date_or_range(stripped)
                            if b != DEFAULT_DATETIME_STR:
                                dates.append( {
                                        "begin": a,
                                        "end": b,
                                        "displayDate": display_date
                                    })
            else:
                # Already filled in, probably by mapper
                continue

            dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
                                                else DEFAULT_DATETIME_STR)
            if dates:
                if earliest:
                    value_to_set = dates[0]
                else:
                    value_to_set = dates
                setprop(data, p, value_to_set)
            else:
                delprop(data, p)

Example #12

0

Show file

File: enrich_date.py Project: marktriggs/ingestion

def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data Dict - Data for conversion.
    prop Str - Properties dividided with comma.
    earliest Bool - True - the function will set only the earliest date.
    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    for p in prop.split(','):
        dates = []
        if exists(data, p):
            v = getprop(data, p)
            if not isinstance(v, dict):
                if is_year_range_list(v):
                    dates.append( {
                        "begin": v[0],
                        "end": v[-1],
                        "displayDate": "%s-%s" % (v[0], v[-1])
                    })
                else:
                    for s in (v if not isinstance(v, basestring) else [v]):
                        for part in s.split(";"):
                            display_date = remove_single_brackets_and_strip(
                                            part
                                            )
                            stripped = clean_date(
                                        remove_all_brackets_and_strip(part)
                                        )
                            if len(stripped) < 4:
                                continue
                            a, b = parse_date_or_range(stripped)
                            if b != DEFAULT_DATETIME_STR:
                                dates.append( {
                                        "begin": a,
                                        "end": b,
                                        "displayDate": display_date
                                    })
            else:
                # Already filled in, probably by mapper
                continue

            dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
                                                else DEFAULT_DATETIME_STR)
            if dates:
                if earliest:
                    value_to_set = dates[0]
                else:
                    value_to_set = dates
                setprop(data, p, value_to_set)
            else:
                delprop(data, p)

Example #13

0

Show file

File: set_prop.py Project: calisphere-legacy-harvester/dpla-ingestion

def unset_prop(body, ctype, prop=None, condition=None, condition_prop=None):
    """Unsets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to unset
    condition -- the condition to be met (uses prop by default) 
    condition_prop -- the prop(s) to use in the condition (comma-separated if
                      multiple props)
    
    """

    CONDITIONS = {
        "is_digit": lambda v: v[0].isdigit(),
        "mwdl_exclude": lambda v:
        (v[0] == "collections" or v[0] == "findingAids"),
        "hathi_exclude": lambda v: "Minnesota Digital Library" in v,
        "finding_aid_title": lambda v: v[0].startswith("Finding Aid"),
        "usc_no_contributor": lambda v: not v[0].get("contributor", False)
    }

    def condition_met(condition_prop, condition):
        values = []
        props = condition_prop.split(",")
        for p in props:
            iterified = iterify(getprop(data, p, True))
            [values.append(i) for i in iterified]

        return CONDITIONS[condition](values)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check if prop exists to avoid key error
    if exists(data, prop):
        if not condition:
            delprop(data, prop)
        else:
            if not condition_prop:
                condition_prop = prop
            try:
                if condition_met(condition_prop, condition):
                    logger.debug("Unsetting prop %s for doc with id %s" %
                                 (prop, data["_id"]))
                    delprop(data, prop)
            except KeyError:
                logger.error("CONDITIONS does not contain %s" % condition)

    return json.dumps(data)

Example #14

0

Show file

File: cdl_json_mapper.py Project: marktriggs/ingestion

 def update_language(self):
     out_languages = []
     for language in iterify(getprop(self.mapped_data, "sourceResource/language", True)):
         if isinstance(language, dict):
             out_languages.append(language)
         elif isinstance(language, basestring):
             out_languages.append({"name": language})
     if out_languages:
         self.update_source_resource({"language": out_languages})
     else:
         delprop(self.mapped_data, "language", True)

Example #15

0

Show file

File: cdl_json_mapper.py Project: calisphere-legacy-harvester/dpla-ingestion

 def update_language(self):
     out_languages = []
     for language in iterify(
             getprop(self.mapped_data, "sourceResource/language", True)):
         if isinstance(language, dict):
             out_languages.append(language)
         elif isinstance(language, basestring):
             out_languages.append({"name": language})
     if out_languages:
         self.update_source_resource({"language": out_languages})
     else:
         delprop(self.mapped_data, "language", True)

Example #16

0

Show file

File: artstor_spatial_to_dataprovider.py Project: mlhale7/ingestion

def artstor_spatial_to_dataprovider(body,
                                    ctype,
                                    prop="sourceResource/spatial"):
    """Sets the dataProvider from sourceResource/spatial by:

       1. Deleting the dataProvider field
       2. Splitting on semicolon if sourceResource/spatial is a string
       3. Moving the first sourceResource/spatial value to dataProvider for
          DPLA* collections
       4. Moving the "Repository: " value to dataProvider for SS* collections
       5. Removing the sourceResource/spatial field for DPLA* collections
       6. Removing any "Accession number: " values from sourceResource/spatial
          for SS* collections
       7. Removing the string "Repository: " from the dataProvider value
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    delprop(data, "dataProvider")
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            v = v.split(";")

        spatial = []
        data_provider = None
        collections = getprop(data, "originalRecord/setSpec", True)
        for coll in iterify(collections):
            if coll.startswith("DPLA"):
                data_provider = v[0]
                break
            elif coll.startswith("SS"):
                spatial = []
                for s in v:
                    if "Repository" in s:
                        data_provider = s
                    elif "Accession" not in s:
                        spatial.append(s)
                break

        delprop(data, prop)
        if spatial:
            setprop(data, prop, spatial)
        if data_provider:
            setprop(data, "dataProvider",
                    data_provider.replace("Repository: ", ""))
    return json.dumps(data)

Example #17

0

Show file

File: move_dates_to_temporal.py Project: eldios/ingestion

def movedatestotemporal(body,ctype,action="move_dates_to_temporal",prop=None):
    """
    Service that accepts a JSON document and moves any dates found in the prop field to the
    temporal field.
    """

    if not prop:
        logger.error("No prop supplied")
        return body

    REGSUB = ("\(", ""), ("\)", "")
    REGSEARCH = ["(\( *)?(\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4})( *\))?", "(\( *)?(\d{4} *[-/] *\d{4})( *\))?", "(\( *)?(\d{4})( *\))?"]

    def cleanup(s):
        for p,r in REGSUB:
            s = re.sub(p,r,s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        p = []
        temporal_field = "aggregatedCHO/temporal"
        temporal = getprop(data, temporal_field) if exists(data, temporal_field) else []

        for d in getprop(data, prop):
            for regsearch in REGSEARCH:
                pattern = re.compile(regsearch)
                for match in pattern.findall(d["name"]):
                    m = "".join(match)
                    #TODO (\( *)? matches 0 and produces '' in m
                    if m:
                        d["name"] = re.sub(re.escape(m),"",d["name"])
                        temporal.append({"name": cleanup(m)})
            if d["name"].strip():
                # Append to p, which will overwrite data[prop]
                p.append(d)

        if temporal:
            setprop(data, temporal_field, temporal)
        if p:
            setprop(data, prop, p)
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #18

0

Show file

File: artstor_spatial_to_dataprovider.py Project: dpla/ingestion

def artstor_spatial_to_dataprovider(body, ctype,
                                    prop="sourceResource/spatial"):
    """Sets the dataProvider from sourceResource/spatial by:

       1. Deleting the dataProvider field
       2. Splitting on semicolon if sourceResource/spatial is a string
       3. Moving the first sourceResource/spatial value to dataProvider for
          DPLA* collections
       4. Moving the "Repository: " value to dataProvider for SS* collections
       5. Removing the sourceResource/spatial field for DPLA* collections
       6. Removing any "Accession number: " values from sourceResource/spatial
          for SS* collections
       7. Removing the string "Repository: " from the dataProvider value
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    delprop(data, "dataProvider")
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            v = v.split(";")

        spatial = []
        data_provider = None
        collections = getprop(data, "originalRecord/setSpec", True)
        for coll in iterify(collections): 
            if coll.startswith("DPLA"):
                data_provider = v[0]
                break
            elif coll.startswith("SS"):
                spatial = []
                for s in v:
                    if "Repository" in s:
                        data_provider = s
                    elif "Accession" not in s:
                        spatial.append(s)
                break

        delprop(data, prop)
        if spatial:
            setprop(data, prop, spatial)
        if data_provider:
            setprop(data, "dataProvider", data_provider.replace("Repository: ",
                                                                ""))
    return json.dumps(data)

Example #19

0

Show file

File: cdl_json_mapper.py Project: marktriggs/ingestion

 def update_subject(self):
     subjects = []
     if exists(self.mapped_data, "sourceResource/subject"):
         for subject in iterify(getprop(self.mapped_data, "sourceResource/subject")):
             if isinstance(subject, basestring):
                 subjects.append(subject)
             elif isinstance(subject, dict):
                 s = getprop(subject, "name", True)
                 if s:
                     subjects.append(s)
             else:
                 pass
         delprop(self.mapped_data, "sourceResource/subject", True)
     if subjects:
         self.update_source_resource({"subject": subjects})

Example #20

0

Show file

File: cdl_json_mapper.py Project: calisphere-legacy-harvester/dpla-ingestion

 def update_subject(self):
     subjects = []
     if exists(self.mapped_data, "sourceResource/subject"):
         for subject in iterify(
                 getprop(self.mapped_data, "sourceResource/subject")):
             if isinstance(subject, basestring):
                 subjects.append(subject)
             elif isinstance(subject, dict):
                 s = getprop(subject, "name", True)
                 if s:
                     subjects.append(s)
             else:
                 pass
         delprop(self.mapped_data, "sourceResource/subject", True)
     if subjects:
         self.update_source_resource({"subject": subjects})

Example #21

0

Show file

File: dc_clean_invalid_dates.py Project: chadfennell/ingestion

def convert(data, prop):
    value = getprop(data, prop)

    if isinstance(value, list):
        values = []
        for v in value:
            if check_date_dict(v):
                values.append(v)

        if values:
            setprop(data, prop, values)
        else:
            delprop(data, prop)

    elif not check_date_dict(value):
        delprop(data, prop)

Example #22

0

Show file

File: cdl_json_mapper.py Project: marktriggs/ingestion

 def update_data_provider(self):
     new_data_provider = getprop(self.mapped_data, "dataProvider", True)
     # if unset or dict or list
     if not isinstance(new_data_provider, basestring): 
         f = getprop(self.provider_data, "doc/originalRecord/facet-institution")
         if isinstance(f, dict):
             new_data_provider = f.pop("text", None)
         elif isinstance(f, list) and len(f) > 0:
             new_data_provider = f[0].pop("text", None)
         if not isinstance(new_data_provider, basestring):
             new_data_provider = None
     if new_data_provider:
         new_data_provider = new_data_provider.replace("::", ", ")
         self.mapped_data.update({"dataProvider": new_data_provider})
     else:
         delprop(self.mapped_data, "dataProvider", True)

Example #23

0

Show file

File: dc_clean_invalid_dates.py Project: calisphere-legacy-harvester/dpla-ingestion

def convert(data, prop):
    value = getprop(data, prop)

    if isinstance(value, list):
        values = []
        for v in value:
            if check_date_dict(v):
                values.append(v)

        if values:
            setprop(data, prop, values)
        else:
            delprop(data, prop)

    elif not check_date_dict(value):
        delprop(data, prop)

Example #24

0

Show file

File: cdl_identify_object.py Project: calisphere-legacy-harvester/dpla-ingestion

def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    url = None
    if exists(data, "object"):
        handle = getprop(data, "object")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break
    if exists(data, "originalRecord/doc/isShownBy"):
        handle = getprop(data, "originalRecord/doc/isShownBy")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break

    if url:
        if 'content.cdlib.org' in url:
            base_url, obj_id, object_type = url.rsplit("/", 2)
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            if obj_id != is_shown_at_id:
                logger.warn(
                    "Object url for %s has ARK value (%s) that does not match isShownAt (%s)"
                    % (data["_id"], obj_id, is_shown_at_id))
                obj_id = is_shown_at_id
            url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')

        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)
    return json.dumps(data)

Example #25

0

Show file

File: cdl_json_mapper.py Project: calisphere-legacy-harvester/dpla-ingestion

 def update_data_provider(self):
     new_data_provider = getprop(self.mapped_data, "dataProvider", True)
     # if unset or dict or list
     if not isinstance(new_data_provider, basestring):
         f = getprop(self.provider_data,
                     "doc/originalRecord/facet-institution")
         if isinstance(f, dict):
             new_data_provider = f.pop("text", None)
         elif isinstance(f, list) and len(f) > 0:
             new_data_provider = f[0].pop("text", None)
         if not isinstance(new_data_provider, basestring):
             new_data_provider = None
     if new_data_provider:
         new_data_provider = new_data_provider.replace("::", ", ")
         self.mapped_data.update({"dataProvider": new_data_provider})
     else:
         delprop(self.mapped_data, "dataProvider", True)

Example #26

0

Show file

File: cdl_identify_object.py Project: dpla/ingestion

def cdl_identify_object(body, ctype):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    url = None
    if exists(data, "object"):
        handle = getprop(data, "object")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break
    if exists(data, "originalRecord/doc/isShownBy"):
        handle = getprop(data, "originalRecord/doc/isShownBy")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break

    if url:
        if 'content.cdlib.org' in url:
            base_url, obj_id, object_type = url.rsplit("/", 2)
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            if obj_id != is_shown_at_id:
                logger.warn("Object url for %s has ARK value (%s) that does not match isShownAt (%s)" % (data["_id"], obj_id, is_shown_at_id))
                obj_id = is_shown_at_id
            url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')

        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)
    return json.dumps(data)

Example #27

0

Show file

File: compare_with_schema.py Project: dpla/ingestion

def comparewithschema(body, ctype):
    """
    Service that accepts a JSON document and removes any fields not listed
    as part of the schema.
    """

    # TODO: Send GET request to API once schema endpoint is created

    try :
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if "_id" not in data or ("sourceResource" not in data and
                             data.get("ingestType") == "item"):
        return body

    type = data.get("ingestType")
    if type:
        props = ["collection/properties"] if type == "collection" else \
                ["item/properties",
                 "item/properties/sourceResource/properties"]
        for prop in props:
            schema_keys = getprop(schema, prop).keys()

            if "sourceResource" in prop:
                data_keys = data["sourceResource"].keys()
                field_prefix = "sourceResource/"
            else:
                data_keys = data.keys()
                data_keys.remove("_id")
                field_prefix = ""

            # Remove any keys in the document that are not found in the schema
            for field in [k for k in data_keys if k not in schema_keys]:
                field = field_prefix + field
                logger.error("Field %s for %s not found in schema; deleting" %
                             (field, data.get("_id")))
                delprop(data, field)
    else:
        logger.error("Unknown type %s for %s" % (type, data.get("_id")))

    return json.dumps(data)

Example #28

0

Show file

def comparewithschema(body, ctype):
    """
    Service that accepts a JSON document and removes any fields not listed
    as part of the schema.
    """

    # TODO: Send GET request to API once schema endpoint is created

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if "_id" not in data or ("sourceResource" not in data
                             and data.get("ingestType") == "item"):
        return body

    type = data.get("ingestType")
    if type:
        props = ["collection/properties"] if type == "collection" else \
                ["item/properties",
                 "item/properties/sourceResource/properties"]
        for prop in props:
            schema_keys = getprop(schema, prop).keys()

            if "sourceResource" in prop:
                data_keys = data["sourceResource"].keys()
                field_prefix = "sourceResource/"
            else:
                data_keys = data.keys()
                data_keys.remove("_id")
                field_prefix = ""

            # Remove any keys in the document that are not found in the schema
            for field in [k for k in data_keys if k not in schema_keys]:
                field = field_prefix + field
                logger.error("Field %s for %s not found in schema; deleting" %
                             (field, data.get("_id")))
                delprop(data, field)
    else:
        logger.error("Unknown type %s for %s" % (type, data.get("_id")))

    return json.dumps(data)

Example #29

0

Show file

File: enrich-date.py Project: chadfennell/ingestion

def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data Dict - Data for conversion.
    prop Str - Properties dividided with comma.
    earliest Bool - True - the function will set only the earliest date.
    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    dates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)

            if not isinstance(v, dict):
                for s in (v if not isinstance(v, basestring) else [v]):
                    for part in s.split(";"):
                        display_date = remove_brackets_and_strip(part)
                        stripped = clean_date(display_date)
                        if len(stripped) < 4:
                            continue
                        a, b = parse_date_or_range(stripped)
                        if b != '3000-01-01':
                            dates.append({
                                "begin": a,
                                "end": b,
                                "displayDate": display_date
                            })

    dates.sort(key=lambda d: d["begin"]
               if d["begin"] is not None else DEFAULT_DATETIME_STR)

    value_to_set = dates
    if earliest and dates:
        value_to_set = dates[0]

    if value_to_set:
        setprop(data, p, value_to_set)
    else:
        if exists(data, p):
            delprop(data, p)

Example #30

0

Show file

File: enrich-date.py Project: chadfennell/ingestion

def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data Dict - Data for conversion.
    prop Str - Properties dividided with comma.
    earliest Bool - True - the function will set only the earliest date.
    False - the function will set all dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    dates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)

            if not isinstance(v, dict):
                for s in (v if not isinstance(v, basestring) else [v]):
                    for part in s.split(";"):
                        display_date = remove_brackets_and_strip(part)
                        stripped = clean_date(display_date)
                        if len(stripped) < 4:
                            continue
                        a, b = parse_date_or_range(stripped)
                        if b != '3000-01-01':
                            dates.append( {
                                    "begin": a,
                                    "end": b,
                                    "displayDate" : display_date
                                })

    dates.sort(key=lambda d: d["begin"] if d["begin"] is not None else DEFAULT_DATETIME_STR)

    value_to_set = dates
    if earliest and dates:
        value_to_set = dates[0]

    if value_to_set:
        setprop(data, p, value_to_set)
    else:
        if exists(data, p):
            delprop(data, p)

Example #31

0

Show file

File: mwdl_cleanup_field.py Project: chadfennell/ingestion

def convert(data, prop):

    value = getprop(data, prop, True)
    if not value:
        return

    if isinstance(value, basestring):
        if value == "creator":
            delprop(data, prop)
        else:
            v = convert_field(value)
            setprop(data, prop, v)

    elif isinstance(value, list):
        values = []
        for item in value:
            if item == "creator":
                continue
            v = convert_field(item)
            values.append(v)
        setprop(data, prop, values)

Example #32

0

Show file

def convert(data, prop):

    value = getprop(data, prop, True)
    if not value:
        return

    if isinstance(value, basestring):
        if value == "creator":
            delprop(data, prop)
        else:
            v = convert_field(value)
            setprop(data, prop, v)

    elif isinstance(value, list):
        values = []
        for item in value:
            if item == "creator":
                continue
            v = convert_field(item)
            values.append(v)
        setprop(data, prop, values)

Example #33

0

Show file

File: artstor_spatial_to_dataprovider.py Project: chadfennell/ingestion

def artstor_spatial_to_dataprovider(body, ctype,
                                    prop="sourceResource/spatial"):
    """ Splits spatial on semicolon and copies the first value to dataProvider
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, list):
            v = v[0]
        if isinstance(v, basestring):
            v = v.split(";")[0]    
            setprop(data, "dataProvider", v)
        delprop(data, prop)

    return json.dumps(data)

Example #34

0

Show file

def remove_list_values(body, ctype, prop=None, values=None):
    """Given a comma-separated string of values, removes any instance of each
       value from the prop.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    v = getprop(data, prop, True)

    if isinstance(v, list) and values is not None:
        values = values.split(",")
        v = [s for s in v if s not in values]
        if v:
            setprop(data, prop, v)
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #35

0

Show file

File: remove_list_values.py Project: amber-reichert/ingestion

def remove_list_values(body, ctype, prop=None, values=None):
    """Given a comma-separated string of values, removes any instance of each
       value from the prop.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    v = getprop(data, prop, True)
    
    if isinstance(v, list) and values is not None:
        values = values.split(",")
        v = [s for s in v if s not in values]
        if v:
            setprop(data, prop, v)
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #36

0

Show file

File: enrich-format.py Project: chadfennell/ingestion

def enrichformat(body,
                 ctype,
                 action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by: 

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and
       its format is not set
    f) Setting type field from format field, if it is not set. The format field
       is taken if it is a string, or the first element if it is a list. It is
        then split and the first part of IMT is taken.

    By default works on the 'sourceResource/format' field but can be overridden
    by passing the name of the field to use as the 'prop' parameter.
    """

    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')
    IMT_TYPES = [
        'application', 'audio', 'image', 'message', 'model', 'multipart',
        'text', 'video'
    ]

    def get_ext(s):
        ext = os.path.splitext(s)[1].split('.')

        return ext[1] if len(ext) == 2 else ""

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        logger.debug("Checking: " + s)
        imt_regexes = [re.compile('^' + x + '(/)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    imt_values = []
    if exists(data, prop):
        v = getprop(data, prop)
        format = []
        hasview_format = []

        for s in (v if not isinstance(v, basestring) else [v]):
            if s.startswith("http") and is_absolute(s):
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                    exists(data, "hasView/format") and \
                                cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            else:
                # Retain non-IMT values in sourceResource/format, non-cleaned
                if s not in format:
                    format.append(s)

        if format:
            if len(format) == 1:
                format = format[0]
            setprop(data, prop, format)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        type = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in type:
                type.append(t)

        if type:
            if len(type) == 1:
                type = type[0]
            setprop(data, type_field, type)

    return json.dumps(data)

Example #37

0

Show file

def enrichtype(body,
               ctype,
               action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format"):
    """   
    Service that accepts a JSON document and enriches the "type" field of that
    document by: 

    a) making the type lowercase
    b) converting "image" to "still image"
      (TODO: Amy to confirm that this is ok)
    c) applying a set of regexps to do data cleanup (remove plural forms)
    d) moving all items that are not standard DC types to the
       sourceResource/format
       (http://dublincore.org/documents/resource-typelist/)
    
    By default works on the 'type' field, but can be overridden by passing the
    name of the field to use as a parameter
    """

    REGEXPS = ('images','image'), ('still image','image'),\
              ('textual records', 'text'),\
              ('photographs and other graphic materials', 'image'),\
              ('texts', 'text')
    DC_TYPES = [
        'collection', 'dataset', 'event', 'image', 'still image',
        'interactive resource', 'moving image', 'physical object', 'service',
        'software', 'sound', 'text'
    ]

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        dctype = []
        f = getprop(data, format_field) if exists(data, format_field) else []
        if not isinstance(f, list):
            f = [f]

        for s in (v if not isinstance(v, basestring) else [v]):
            if is_dc_type(cleanup(s)):
                dctype.append(cleanup(s))
            else:
                f.append(s)

        if dctype:
            if len(dctype) == 1:
                dctype = dctype[0]
            setprop(data, prop, dctype)
        else:
            delprop(data, prop)

        if len(f) > 1:
            setprop(data, format_field, f)
        elif len(f) == 1:
            setprop(data, format_field, f[0])

    return json.dumps(data)

Example #38

0

Show file

def copyprop(body,
             ctype,
             prop=None,
             to_prop=None,
             create=False,
             key=None,
             remove=None,
             no_replace=None,
             no_overwrite=None):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove  -- removes prop if True (default False)
    no_replace -- creates list of to_prop string and appends prop if True
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        pass
    else:
        if exists(data, prop) and create and not exists(data, to_prop):
            val = {} if key else ""
            setprop(data, to_prop, val)

        if exists(data, prop) and exists(data, to_prop):
            val = getprop(data, prop)
            to_element = getprop(data, to_prop)

            if isinstance(to_element, basestring):
                if no_replace:
                    el = [to_element] if to_element else []
                    el.append(val)
                    # Flatten
                    val = [
                        e for s in el
                        for e in (s if not isinstance(s, basestring) else [s])
                    ]
                setprop(data, to_prop, val)
            else:
                # If key is set, assume to_element is dict or list of dicts
                if key:
                    if not isinstance(to_element, list):
                        to_element = [to_element]
                    for dict in to_element:
                        if exists(dict, key) or create:
                            setprop(dict, key, val)
                        else:
                            msg = "Key %s does not exist in %s" % (key,
                                                                   to_prop)
                            logger.debug(msg)
                else:
                    # Handle case where to_element is a list
                    if isinstance(to_element, list):
                        if isinstance(val, list):
                            to_element = to_element + val
                        else:
                            to_element.append(val)
                        setprop(data, to_prop, to_element)
                    else:
                        # to_prop is dictionary but no key was passed.
                        msg = "%s is a dictionary but no key was passed" % to_prop
                        logger.warn(msg)
                        setprop(data, to_prop, val)

            if remove:
                delprop(data, prop)

    return json.dumps(data)

Example #39

0

Show file

    def __init__(self,
                 provider_data,
                 key_prefix=None,
                 datafield_tag='datafield',
                 controlfield_tag='controlfield',
                 pymarc=False):
        super(MARCMapper, self).__init__(provider_data, key_prefix)

        # Fields controlfield, datafield, and leader may be nested within the
        # metadata/record field for DPLA fetcher items
        prop = "metadata/record"
        if exists(self.provider_data, prop):
            self.provider_data.update(getprop(self.provider_data, prop))
            delprop(self.provider_data, prop)

        self.control_001 = ""
        self.control_007_01 = ""
        self.control_008_18 = ""
        self.control_008_21 = ""
        self.control_008_28 = ""
        self.control_format_char = ""
        self.datafield_tag = datafield_tag
        self.controlfield_tag = controlfield_tag
        self.datafield_086_or_087 = False
        self.pymarc = pymarc

        self.identifier_tag_labels = {
            "020": "ISBN:",
            "022": "ISSN:",
            "050": "LC call number:"
        }

        # Mapping dictionary for use with datafield
        # Keys are used to check if there is a tag match. If so, the value
        # provides a list of (property, code) tuples. In the case where certain
        # tags have prominence over others, an index is used and the tuples
        # will be of the form (property, index, code). To exclude a code,
        # prefix it with a "!": [("format", "!cd")] will exclude the "c"
        # "d" codes (see method _get_values).
        self.mapping_dict = {
            lambda t: t == "856": [(self.map_is_shown_at, "u"),
                                   (self.map_is_shown_by, "u")],
            lambda t: t == "041": [(self.map_language, "a")],
            lambda t: t == "260": [(self.map_display_date, "c"),
                                   (self.map_publisher, "ab")],
            lambda t: t == "300": [(self.map_extent, None)],
            lambda t: t in ("337", "338"): [(self.map_format, "a")],
            lambda t: t == "340": [(self.map_extent, "b"),
                                   (self.map_format, "a")],
            lambda t: t == "050": [(self.map_identifier, "ab")],
            lambda t: t in ("020", "022", "035"): [(self.map_identifier, "a")],
            lambda t: t in ("100", "110", "111"): [(self.map_creator, None)],
            lambda t: (760 <= int(t) <= 787): [(self.map_relation, None)],
            lambda t: (t != "538" and t.startswith("5")):
            [(self.map_description, "a")],
            lambda t: t in ("506", "540"): [(self.map_rights, None)],
            lambda t: t == "648": [(self.map_temporal, None)],
            lambda t: t in ("700", "710", "711", "720"):
            [(self.map_contributor, None)],
            lambda t: t == "245": [(self.map_title, 0, "!c")],
            lambda t: t == "242": [(self.map_title, 1, None)],
            lambda t: t == "240": [(self.map_title, 2, None)],
            lambda t: t == "651": [(self.map_spatial, "a")],
            lambda t: (int(t) in set([600, 630, 650, 651] + range(610, 620) + range(
                                         653, 659) + range(690, 700))):
            [(self.map_subject, None), (self.map_format, "v"),
             (self.map_temporal, "y"), (self.map_spatial, "z")],
        }

        self.type_mapping = {
            "datafield":
            OrderedDict([("AJ", ("Journal", "Text")),
                         ("AN", ("Newspaper", "Text")),
                         ("BI", ("Biography", "Text")),
                         ("BK", ("Book", "Text")),
                         ("CF", ("Computer File", "Interactive Resource")),
                         ("CR", ("CDROM", "Interactive Resource")),
                         ("CS", ("Software", "Software")),
                         ("DI", ("Dictionaries", "Text")),
                         ("DR", ("Directories", "Text")),
                         ("EN", ("Encyclopedias", "Text")),
                         ("HT", ("HathiTrust", None)),
                         ("MN", ("Maps-Atlas", "Image")),
                         ("MP", ("Map", "Image")),
                         ("MS", ("Musical Score", "Text")),
                         ("MU", ("Music", "Text")),
                         ("MV", ("Archive", "Collection")),
                         ("MW", ("Manuscript", "Text")),
                         ("MX", ("Mixed Material", "Collection")),
                         ("PP", ("Photograph/Pictorial Works", "Image")),
                         ("RC", ("Audio CD", "Sound")),
                         ("RL", ("Audio LP", "Sound")),
                         ("RM", ("Music", "Sound")),
                         ("RS", ("Spoken word", "Sound")),
                         ("RU", (None, "Sound")), ("SE", ("Serial", "Text")),
                         ("SX", ("Serial", "Text")),
                         ("VB", ("Video (Blu-ray)", "Moving Image")),
                         ("VD", ("Video (DVD)", "Moving Image")),
                         ("VG", ("Video Games", "Moving Image")),
                         ("VH", ("Video (VHS)", "Moving Image")),
                         ("VL", ("Motion Picture", "Moving Image")),
                         ("VM", ("Visual Material", "Image")),
                         ("WM", ("Microform", "Text")),
                         ("XC", ("Conference", "Text")),
                         ("XS", ("Statistics", "Text"))]),
            "leader":
            OrderedDict([("am", ("Book", "Text")),
                         ("asn", ("Newspapers", "Text")),
                         ("as", ("Serial", "Text")), ("aa", ("Book", "Text")),
                         ("a(?![mcs])", ("Serial", "Text")),
                         ("[cd].*", ("Musical Score", "Text")),
                         ("t.*", ("Manuscript", "Text")),
                         ("[ef].*", ("Maps", "Image")),
                         ("g.[st]", ("Photograph/Pictorial Works", "Image")),
                         ("g.[cdfo]", ("Film/Video", "Moving Image")),
                         ("g.*", (None, "Image")),
                         ("k.*", ("Photograph/Pictorial Works", "Image")),
                         ("i.*", ("Nonmusic", "Sound")),
                         ("j.*", ("Music", "Sound")),
                         ("r.*", (None, "Physical object")),
                         ("p[cs].*", (None, "Collection")),
                         ("m.*", (None, "Interactive Resource")),
                         ("o.*", (None, "Collection"))])
        }

Example #40

0

Show file

File: remove_property.py Project: mlhale7/ingestion

from akara import response
from akara.services import simple_service
from amara.thirdparty import json
from dplaingestion.selector import delprop


@simple_service(
    'POST',
    'http://purl.org/la/dp/remove_property',
    'remove_property',
    'application/json')

def remove_property(
        body,
        ctype,
        prop,
        action="remove_property",
        ):

    try:
        data = json.loads(body)
    except Exception, err:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON: " + str(err)

    delprop(data, prop, True)

    return json.dumps(data)

Example #41

0

Show file

File: enrich-format.py Project: dpla/ingestion

def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by: 

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and
       its format is not set
    f) Setting type field from format field, if it is not set. The format field
       is taken if it is a string, or the first element if it is a list. It is
        then split and the first part of IMT is taken.

    By default works on the 'sourceResource/format' field but can be overridden
    by passing the name of the field to use as the 'prop' parameter.
    """

    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')
    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']

    def get_ext(s):
        ext = os.path.splitext(s)[1].split('.')

        return ext[1] if len(ext) == 2 else ""

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            s = re.sub(r"^([a-z0-9/]+)\s.*",r"\1", s)
        return s

    def is_imt(s):
        imt_regexes = [re.compile('^' + x + '(/)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    imt_values = []
    if exists(data, prop):
        v = getprop(data, prop)
        format = []
        hasview_format = []

        for s in (filter(None,v) if not isinstance(v, basestring) else [v]):
            if s is not None and s.startswith("http") and is_absolute(s):
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                    exists(data, "hasView/format") and \
                                cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            else:
                # Retain non-IMT values in sourceResource/format, non-cleaned
                if s not in format:
                    format.append(s)

        if format:
            if len(format) == 1:
                format = format[0]
            setprop(data, prop, format)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        type = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in type:
                type.append(t)

        if type:
            if len(type) == 1:
                type = type[0]
            setprop(data, type_field, type)

    return json.dumps(data)

Example #42

0

Show file

File: copy_prop.py Project: peterkingalex/ingestion

def copyprop(
    body, ctype, prop=None, to_prop=None, create=False, key=None, remove=None, no_replace=None, no_overwrite=None
):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove  -- removes prop if True (default False)
    no_replace -- creates list of to_prop string and appends prop if True
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        pass
    else:
        if exists(data, prop) and create and not exists(data, to_prop):
            val = {} if key else ""
            setprop(data, to_prop, val)

        if exists(data, prop) and exists(data, to_prop):
            val = getprop(data, prop)
            to_element = getprop(data, to_prop)

            if isinstance(to_element, basestring):
                if no_replace:
                    el = [to_element] if to_element else []
                    el.append(val)
                    # Flatten
                    val = [e for s in el for e in (s if not isinstance(s, basestring) else [s])]
                setprop(data, to_prop, val)
            else:
                # If key is set, assume to_element is dict or list of dicts
                if key:
                    if not isinstance(to_element, list):
                        to_element = [to_element]
                    for dict in to_element:
                        if exists(dict, key) or create:
                            setprop(dict, key, val)
                        else:
                            logger.error("Key %s does not exist in %s" % (key, to_prop))
                else:
                    # Handle case where to_element is a list
                    if isinstance(to_element, list):
                        if isinstance(val, list):
                            to_element = to_element + val
                        else:
                            to_element.append(val)
                        setprop(data, to_prop, to_element)
                    else:
                        # to_prop is dictionary but no key was passed.
                        logger.warn("%s is a dict but no key was passed" % to_prop)
                        setprop(data, to_prop, val)

            if remove:
                delprop(data, prop)

    return json.dumps(data)

Example #43

0

Show file

File: enrich_language.py Project: dpla/ingestion

def enrich_language(body, ctype, action="enrich_language",
                      prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    a) Attempting to convert value the value from ISO 639-1 to ISO639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches withing the value
    """

    def iso1_to_iso3(s):
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                        re.sub("[\.\[\]\(\)]", "", lang_string).lower().strip()
                        )
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches
                    for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break

                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items():
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            seen = set()
            language = [{"iso639_3": code, "name": ISO639_3_SUBST[code]} for
                        code in iso_codes if not
                        (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data["_id"]))
            delprop(data, prop)

    return json.dumps(data)

Example #44

0

Show file

def delete_field(doc, field):
    delprop(doc, field, keyErrorAsNone=True)

Example #45

0

Show file

File: enrich-type.py Project: amber-reichert/ingestion

def enrichtype(body,ctype,action="enrich-type", prop="sourceResource/type",
               format_field="sourceResource/format"):
    """   
    Service that accepts a JSON document and enriches the "type" field of that
    document by: 

    a) making the type lowercase
    b) converting "image" to "still image"
      (TODO: Amy to confirm that this is ok)
    c) applying a set of regexps to do data cleanup (remove plural forms)
    d) moving all items that are not standard DC types to the
       sourceResource/format
       (http://dublincore.org/documents/resource-typelist/)
    
    By default works on the 'type' field, but can be overridden by passing the
    name of the field to use as a parameter
    """

    REGEXPS = ('images','image'), ('still image','image'),\
              ('textual records', 'text'),\
              ('photographs and other graphic materials', 'image'),\
              ('texts', 'text')
    DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
                'interactive resource', 'moving image',
                'physical object', 'service', 'software', 'sound',
                'text']

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try :
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    if exists(data,prop):
        v = getprop(data, prop)
        dctype = []
        f = getprop(data, format_field) if exists(data, format_field) else []
        if not isinstance(f, list):
            f = [f]

        for s in (v if not isinstance(v,basestring) else [v]):
            if is_dc_type(cleanup(s)):
                dctype.append(cleanup(s))
            else:
                f.append(s)

        if dctype:
            if len(dctype) == 1:
                dctype = dctype[0]
            setprop(data, prop, dctype)
        else:
            delprop(data, prop)

        if len(f) > 1:
            setprop(data, format_field, f)
        elif len(f) == 1:
            setprop(data, format_field, f[0])

    return json.dumps(data)

Example #46

0

Show file

File: cleanup_language.py Project: amber-reichert/ingestion

def cleanup_language(body, ctype, action="cleanup_language",
                     prop="sourceResource/language"):
    """
    Service that accepts a JSON document and cleans each value of the language
    field of that document by:

    a) stripping periods, brackets and parentheses
    b) convert from ISO 639-1 to ISO 639-3
    c) looking for matches in the value using LANGUAGE_NAME_REGEXES
    """

    def iso1_to_iso3(s):
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        v = [v] if not isinstance(v, list) else v

        languages = []
        for s in v:
            if s not in languages and s in ISO639_3_SUBST:
                languages.append(s)
            else:
                s = re.sub("[\.\[\]]", "", s).lower().strip()
                iso = re.sub("[\(\)]", "", s)
                # First convert iso1 to iso3
                iso = iso1_to_iso3(iso)
                if iso in ISO639_3_SUBST and iso not in languages:
                    languages.append(iso)
                else:
                    for n in iso.split(" "):
                        # Since we split on whitespace, we only want to check
                        # against single word reference names so we use
                        # ISO639_3_1
                        n = n.title()
                        if n in ISO639_3_1.values() and n not in languages:
                            languages.append(n)

                    # Use s (with parentheses intact)
                    match = [r.search(s).group() for r in
                             LANGUAGE_NAME_REGEXES if r.search(s)]
                    if match:
                        languages += list(set([m.strip().title() for m in
                                          match]) - set(languages))

        if languages:
            # Remove duplicates
            lang = []
            [lang.append(l) for l in languages
             if ISO639_3_SUBST.get(l, None) not in languages]
            setprop(data, prop, filter(None, lang))
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #47

0

Show file

File: scdl_geocode_regions.py Project: amber-reichert/ingestion

def geocode_region(spatial):
    setprop(spatial, "coordinates", "%s %s" % REGIONS[getprop(spatial, "name")])
    delprop(spatial, "county")
    setprop(spatial, "state", "South Carolina")
    setprop(spatial, "country", "United States")
    return spatial

Example #48

0

Show file

File: cleanup_language.py Project: chadfennell/ingestion

def cleanup_language(body,
                     ctype,
                     action="cleanup_language",
                     prop="sourceResource/language"):
    """
    Service that accepts a JSON document and cleans each value of the language
    field of that document by:

    a) stripping periods, brackets and parentheses
    b) convert from ISO 639-1 to ISO 639-3
    c) looking for matches in the value using LANGUAGE_NAME_REGEXES
    """
    def iso1_to_iso3(s):
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        v = [v] if not isinstance(v, list) else v

        languages = []
        for s in v:
            if s not in languages and s in ISO639_3_SUBST:
                languages.append(s)
            else:
                s = re.sub("[\.\[\]]", "", s).lower().strip()
                iso = re.sub("[\(\)]", "", s)
                # First convert iso1 to iso3
                iso = iso1_to_iso3(iso)
                if iso in ISO639_3_SUBST and iso not in languages:
                    languages.append(iso)
                else:
                    for n in iso.split(" "):
                        # Since we split on whitespace, we only want to check
                        # against single word reference names so we use
                        # ISO639_3_1
                        n = n.title()
                        if n in ISO639_3_1.values() and n not in languages:
                            languages.append(n)

                    # Use s (with parentheses intact)
                    match = [
                        r.search(s).group() for r in LANGUAGE_NAME_REGEXES
                        if r.search(s)
                    ]
                    if match:
                        languages += list(
                            set([m.strip().title()
                                 for m in match]) - set(languages))

        if languages:
            # Remove duplicates
            lang = []
            [
                lang.append(l) for l in languages
                if ISO639_3_SUBST.get(l, None) not in languages
            ]
            setprop(data, prop, filter(None, lang))
        else:
            delprop(data, prop)

    return json.dumps(data)

Example #49

0

Show file

File: marc_mapper.py Project: dpla/ingestion

    def __init__(self, provider_data, key_prefix=None):
        super(MARCMapper, self).__init__(provider_data, key_prefix)

        # Fields controlfield, datafield, and leader may be nested within the
        # metadata/record field
        prop = "metadata/record"
        if exists(self.provider_data, prop):
            self.provider_data.update(getprop(self.provider_data, prop))
            delprop(self.provider_data, prop)

        self.control_001 = ""
        self.control_007_01 = ""
        self.control_008_18 = ""
        self.control_008_21 = ""
        self.control_008_28 = ""
        self.control_format_char = ""
        self.datafield_086_or_087 = False

        self.identifier_tag_labels = {
            "020": "ISBN:",
            "022": "ISSN:",
            "050": "LC call number:"
        }

        # Mapping dictionary for use with datafield
        # Keys are used to check if there is a tag match. If so, the value
        # provides a list of (property, code) tuples. In the case where certain
        # tags have prominence over others, an index is used and the tuples
        # will be of the form (property, index, code). To exclude a code,
        # prefix it with a "!": [("format", "!cd")] will exclude the "c"
        # "d" codes (see method _get_values).
        self.mapping_dict = {
            lambda t: t == "856":               [(self.map_is_shown_at, "u")],
            lambda t: t == "041":               [(self.map_language, "a")],
            lambda t: t == "260":               [(self.map_display_date, "c"),
                                                 (self.map_publisher, "ab")],

            lambda t: t == "300":               [(self.map_extent, "ac")],
            lambda t: t in ("337", "338"):      [(self.map_format, "a")],

            lambda t: t == "340":               [(self.map_extent, "b"),
                                                 (self.map_format, "a")],

            lambda t: t == "050":               [(self.map_identifier, "ab")],
            lambda t: t in ("020", "022",
                            "035"):             [(self.map_identifier, "a")],

            lambda t: t in ("100", "110",
                            "111"):             [(self.map_creator, None)],
            lambda t: (760 <= int(t) <= 787):   [(self.map_relation, None)],

            lambda t: (t != "538" and
                       t.startswith("5")):      [(self.map_description, "a")],

            lambda t: t in ("506", "540"):      [(self.map_rights, None)],
            lambda t: t == "648":               [(self.map_temporal, None)],

            lambda t: t in ("700", "710",
                            "711", "720"):      [(self.map_contributor, None)],

            lambda t: t == "245":               [(self.map_title, 0, "!c")],
            lambda t: t == "242":               [(self.map_title, 1, None)],
            lambda t: t == "240":               [(self.map_title, 2, None)],
            lambda t: t == "651":               [(self.map_spatial, "a")],

            lambda t: (int(t) in
                       set([600, 630, 650, 651] +
                           range(610, 620) +
                           range(653, 659) +
                           range(690, 700))):   [(self.map_subject, None),
                                                 (self.map_format, "v"),
                                                 (self.map_temporal, "y"),
                                                 (self.map_spatial, "z")],
        }

        self.type_mapping = {
            "datafield": OrderedDict([
                ("AJ", ("Journal", "Text")),
                ("AN", ("Newspaper", "Text")),
                ("BI", ("Biography", "Text")),
                ("BK", ("Book", "Text")),
                ("CF", ("Computer File", "Interactive Resource")),
                ("CR", ("CDROM", "Interactive Resource")),
                ("CS", ("Software", "Software")),
                ("DI", ("Dictionaries", "Text")),
                ("DR", ("Directories", "Text")),
                ("EN", ("Encyclopedias", "Text")),
                ("HT", ("HathiTrust", None)),
                ("MN", ("Maps-Atlas", "Image")),
                ("MP", ("Map", "Image")),
                ("MS", ("Musical Score", "Text")),
                ("MU", ("Music", "Text")),
                ("MV", ("Archive", "Collection")),
                ("MW", ("Manuscript", "Text")),
                ("MX", ("Mixed Material", "Collection")),
                ("PP", ("Photograph/Pictorial Works", "Image")),
                ("RC", ("Audio CD", "Sound")),
                ("RL", ("Audio LP", "Sound")),
                ("RM", ("Music", "Sound")),
                ("RS", ("Spoken word", "Sound")),
                ("RU", (None, "Sound")),
                ("SE", ("Serial", "Text")),
                ("SX", ("Serial", "Text")),
                ("VB", ("Video (Blu-ray)", "Moving Image")),
                ("VD", ("Video (DVD)", "Moving Image")),
                ("VG", ("Video Games", "Moving Image")),
                ("VH", ("Video (VHS)", "Moving Image")),
                ("VL", ("Motion Picture", "Moving Image")),
                ("VM", ("Visual Material", "Image")),
                ("WM", ("Microform", "Text")),
                ("XC", ("Conference", "Text")),
                ("XS", ("Statistics", "Text"))
            ]),
            "leader": OrderedDict([
                ("am", ("Book", "Text")),
                ("asn", ("Newspapers", "Text")),
                ("as", ("Serial", "Text")),
                ("aa", ("Book", "Text")),
                ("a(?![mcs])", ("Serial", "Text")),
                ("[cd].*", ("Musical Score", "Text")),
                ("t.*", ("Manuscript", "Text")),
                ("[ef].*", ("Maps", "Image")),
                ("g.[st]", ("Photograph/Pictorial Works", "Image")),
                ("g.[cdfo]", ("Film/Video", "Moving Image")),
                ("g.*", (None, "Image")),
                ("k.*", ("Photograph/Pictorial Works", "Image")),
                ("i.*", ("Nonmusic", "Sound")),
                ("j.*", ("Music", "Sound")),
                ("r.*", (None, "Physical object")),
                ("p[cs].*", (None, "Collection")),
                ("m.*", (None, "Interactive Resource")),
                ("o.*", (None, "Collection"))
            ])
        }

Example #50

0

Show file

File: enrich_language.py Project: calisphere-legacy-harvester/dpla-ingestion

def enrich_language(body,
                    ctype,
                    action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    a) Attempting to convert value the value from ISO 639-1 to ISO639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches withing the value
    """
    def iso1_to_iso3(s):
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                    re.sub("[\.\[\]\(\)]", "", lang_string).lower().strip())
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches
                    for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break

                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items(
                        ):
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            seen = set()
            language = [{
                "iso639_3": code,
                "name": ISO639_3_SUBST[code]
            } for code in iso_codes if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data["_id"]))
            delprop(data, prop)

    return json.dumps(data)