Ejemplo n.º 1
0
    def map_type(self):
        """Set sourceResource "type" from the record's type-bearing elements.

        Text is gathered from freetext/physicalDescription, from
        freetext/objectType entries whose "@label" is "Type", and from
        indexedStructured/object_type.  Every value is stripped, skipped
        when empty, and lower-cased, then handed (physical-description
        strings first) to itemtype.type_for_strings_and_mappings.  If that
        raises itemtype.NoTypeError a warning is logged and the type
        defaults to "image".
        """
        labelled_items = self.extract_xml_items("freetext", "objectType")
        description_items = self.extract_xml_items("freetext",
                                                   "physicalDescription")
        indexed_items = self.extract_xml_items("indexedStructured",
                                               "object_type")

        def normalized(raw_values):
            # Strip each value, discard the empty ones, lower-case the rest.
            stripped = (raw.strip() for raw in raw_values)
            return [text.lower() for text in stripped if text]

        physical_strings = normalized(
            item.get("#text", "") for item in description_items)
        object_strings = normalized(
            item.get("#text", "") for item in labelled_items
            if item.get("@label", "") == "Type")
        object_strings.extend(normalized(indexed_items))

        keyword_sources = [
            (physical_strings, self.type_for_phys_keyword),
            (object_strings, self.type_for_ot_keyword),
        ]
        try:
            resolved = itemtype.type_for_strings_and_mappings(keyword_sources)
        except itemtype.NoTypeError:
            record_id = self.provider_data.get("_id", "[no _id]")
            logger.warning("Can not deduce type for item with _id: %s" %
                           record_id)
            resolved = "image"

        self.update_source_resource({"type": resolved})
 def map_rights(self, _dict, tag, codes):
     """Map a rights code to sourceResource/rights.

     The first value returned by self._get_values(_dict, codes) is looked
     up in self.rights_desc.  The description found gets the HathiTrust
     "Learn more" pointer appended and is stored in self.mapped_data with
     setprop.  Nothing is raised for bad input: a code missing from
     rights_desc is logged as a warning, and any other failure (including
     no values at all) is logged as an error.

     _dict -- the field data handed to self._get_values
     tag   -- accepted for the mapper's calling convention; not used here
     codes -- the subfield codes handed to self._get_values
     """
     values = self._get_values(_dict, codes)
     if not values:
         # Previously values[0] raised an uncaught IndexError here.
         logger.error("Could not get rights from %s" %
                      self.provider_data.get("_id", "[no _id]"))
         return
     code = values[0]
     try:
         rights = self.rights_desc[code]
         rights += ". Learn more at http://www.hathitrust.org/access_use"
         setprop(self.mapped_data, "sourceResource/rights", rights)
     except KeyError as e:
         # KeyError.message only exists on Python 2; args[0] carries the
         # same value on both Python 2 and 3.
         missing = e.args[0] if e.args else code
         logger.warning("Unacceptable rights code for %s: %s" %
                        (self.provider_data.get("_id", "[no _id]"), missing))
     except Exception:
         logger.error("Could not get rights from %s" %
                      self.provider_data.get("_id", "[no _id]"))
Ejemplo n.º 3
0
 def map_rights(self, _dict, tag, codes):
     """Store the rights statement matching the record's rights code.

     self._get_values(_dict, codes) supplies candidate codes; only the
     first one is used.  Its description is read from self.rights_desc,
     suffixed with the HathiTrust access-and-use link, and written to
     "sourceResource/rights" in self.mapped_data via setprop.

     Failures are logged rather than raised: a warning when the code is
     not in rights_desc, an error for anything else, including the case
     where no values were found.  The tag argument is not used.
     """
     def record_id():
         # Looked up lazily so the success path never touches provider_data.
         return self.provider_data.get("_id", "[no _id]")

     values = self._get_values(_dict, codes)
     if not values:
         # An empty result used to escape as an IndexError from values[0].
         logger.error("Could not get rights from %s" % record_id())
         return

     code = values[0]
     try:
         rights = self.rights_desc[code]
         rights += ". Learn more at http://www.hathitrust.org/access_use"
         setprop(self.mapped_data, "sourceResource/rights", rights)
     except KeyError as e:
         # e.message is Python 2 only; e.args[0] is the portable spelling.
         detail = e.args[0] if e.args else code
         logger.warning("Unacceptable rights code for %s: %s" %
                        (record_id(), detail))
     except Exception:
         logger.error("Could not get rights from %s" % record_id())
def validatemapv3(body, ctype):
    """
    Service that accepts a JSON document and validates it against the
    DPLA Metadata Application Profile v3 JSON Schema.

    The schema is chosen from MAPV3_SCHEMAS by the document's "ingestType".
    The outcome is recorded under the document's "admin" key:

      valid_after_enrich -- True if validation passed, False if it failed,
                            None if no validation was attempted (missing
                            or unknown ingestType)
      validation_message -- the ValidationError message on failure,
                            otherwise None

    Returns the updated document serialized as JSON.  If the body cannot
    be read as a JSON object, the response code is set to 500 and a
    plain-text error message is returned instead.  The ctype argument is
    not used.
    """

    # TODO: Send GET request to API once schema endpoint is created

    try:
        data = json.loads(body)
        id_for_msg = data.get('_id', '[no id]')
    except Exception:
        # Also reached when the body is valid JSON but not an object
        # (data.get fails); unlike a bare except this lets
        # KeyboardInterrupt / SystemExit through.
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    valid = None
    validation_message = None

    ingest_type = data.get('ingestType', None)
    if ingest_type is None:
        logger.warning(
            'Could not get ingestType for record with id %s; refusing to validate.'
            % id_for_msg)
    else:
        try:
            schema = MAPV3_SCHEMAS[ingest_type]
        except (KeyError, TypeError):
            # Unknown (or unhashable) ingestType: treat it like a missing
            # one instead of letting the lookup error abort the request.
            logger.warning(
                'No MAPv3 schema for ingestType %s (record with id %s); refusing to validate.'
                % (ingest_type, id_for_msg))
        else:
            try:
                validate(data, schema)
            except ValidationError as err:
                valid = False
                logger.warning('Could not validate %s record with id %s: %s' %
                               (ingest_type, id_for_msg, err.message))
                validation_message = err.message
            else:
                valid = True

    # Reuse an existing "admin" section, or create one.
    admin = data.setdefault("admin", {})
    admin["valid_after_enrich"] = valid
    admin["validation_message"] = validation_message
    return json.dumps(data)
Ejemplo n.º 5
0
def validatemapv3(body, ctype):
    """
    Service that accepts a JSON document and validates it against the
    DPLA Metadata Application Profile v3 JSON Schema.

    The document's "ingestType" selects the schema from MAPV3_SCHEMAS.
    The result is written to the document's "admin" section as
    "valid_after_enrich" (True, False, or None when validation was not
    attempted) and "validation_message" (the ValidationError message, or
    None), and the document is returned re-serialized as JSON.

    A body that cannot be read as a JSON object sets the response code to
    500 and returns a plain-text error message.  The ctype argument is not
    used.
    """

    # TODO: Send GET request to API once schema endpoint is created

    try:
        data = json.loads(body)
        id_for_msg = data.get('_id', '[no id]')
    except Exception:
        # Narrowed from a bare except so interpreter-exit signals are not
        # swallowed; still covers non-object JSON (data.get fails).
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    valid = None
    validation_message = None
    schema = None

    ingest_type = data.get('ingestType', None)
    if ingest_type is None:
        logger.warning('Could not get ingestType for record with id %s; refusing to validate.' % id_for_msg)
    else:
        try:
            schema = MAPV3_SCHEMAS[ingest_type]
        except (KeyError, TypeError):
            # An ingestType with no registered schema used to raise an
            # uncaught KeyError; handle it like a missing ingestType.
            ingest_type = None
            logger.warning('No MAPv3 schema for record with id %s; refusing to validate.' % id_for_msg)

    if ingest_type is not None:
        try:
            validate(data, schema)
            valid = True
        except ValidationError as err:
            valid = False
            logger.warning('Could not validate %s record with id %s: %s' % (ingest_type, id_for_msg, err.message))
            validation_message = err.message

    # Update the existing "admin" section, creating it when absent.
    admin = data.setdefault("admin", {})
    admin["valid_after_enrich"] = valid
    admin["validation_message"] = validation_message
    return json.dumps(data)
Ejemplo n.º 6
0
    def map_type(self):
        """Work out the item type and store it as sourceResource "type".

        Keyword candidates come from three places: the text of
        freetext/physicalDescription, the text of freetext/objectType
        entries whose "@label" equals "Type", and the values of
        indexedStructured/object_type.  Candidates are stripped, dropped
        when blank, and lower-cased.  itemtype.type_for_strings_and_mappings
        picks the type; when it raises itemtype.NoTypeError the failure is
        logged as a warning and "image" is used.
        """
        camel_entries = self.extract_xml_items("freetext", "objectType")
        phys_entries = self.extract_xml_items("freetext",
                                              "physicalDescription")
        snake_entries = self.extract_xml_items("indexedStructured",
                                               "object_type")

        stripped_phys = [entry.get("#text", "").strip()
                         for entry in phys_entries]
        phys_keywords = [text.lower() for text in stripped_phys if text]

        # Labelled objectType text first, then the plain object_type values.
        stripped_types = [entry.get("#text", "").strip()
                          for entry in camel_entries
                          if entry.get("@label", "") == "Type"]
        stripped_types += [entry.strip() for entry in snake_entries]
        type_keywords = [text.lower() for text in stripped_types if text]

        try:
            chosen = itemtype.type_for_strings_and_mappings([
                (phys_keywords, self.type_for_phys_keyword),
                (type_keywords, self.type_for_ot_keyword),
            ])
        except itemtype.NoTypeError:
            logger.warning("Can not deduce type for item with _id: %s" %
                           self.provider_data.get("_id", "[no _id]"))
            chosen = 'image'

        self.update_source_resource({"type": chosen})
def filter_path(_dict, path):
    """
    Runs filter_dict (with filter_fields) on the value at the given path.

    Works on a deep copy of _dict.  The path is split at its last
    PATH_DELIM into the enclosing path and the final key.

    Arguments:
     _dict - dictionary to clean;
     path - a xpath-like path to the value, that must be checked
    Returns:
     the cleaned copy; the original _dict, untouched, when the enclosing
     path does not exist (a warning is logged); or, when the path has no
     final key, the bare result of filter_dict for the enclosing value
    """
    working_copy = copy.deepcopy(_dict)
    parent_path, _, leaf_key = path.rpartition(PATH_DELIM)

    try:
        target = getprop(working_copy, parent_path)
    except KeyError:
        logger.warning("Attempt to clean non existent path \"%s\"", parent_path)
        return _dict

    if not leaf_key:
        # Nothing after the last delimiter: filter the enclosing value itself.
        return filter_dict(target, filter_fields, parent_path)

    setprop(working_copy, parent_path,
            filter_dict(target, filter_fields, leaf_key))
    return working_copy
Ejemplo n.º 8
0
def filter_path(_dict, path):
    """
    Applies filter_dict / filter_fields to one location inside a dictionary.

    Arguments:
     _dict - dictionary to clean; it is deep-copied, never modified
     path - a xpath-like path to the value, that must be checked; it is
            split at the last PATH_DELIM into enclosing path and value key
    Returns:
     - the copy with the filtered value stored back at the enclosing path,
       when the path ends in a value key;
     - the result of filter_dict itself, when the value key is empty;
     - the original _dict, when the enclosing path cannot be resolved
       (getprop raises KeyError; a warning is logged)
    """
    result = copy.deepcopy(_dict)
    pieces = path.rpartition(PATH_DELIM)
    container_path = pieces[0]
    key_to_check = pieces[2]

    try:
        container = getprop(result, container_path)
    except KeyError:
        logger.warning("Attempt to clean non existent path \"%s\"",
                       container_path)
        return _dict

    if key_to_check:
        filtered = filter_dict(container, filter_fields, key_to_check)
        setprop(result, container_path, filtered)
        return result
    return filter_dict(container, filter_fields, container_path)
Ejemplo n.º 9
0
def enrichtype(body,
               ctype,
               action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format",
               default=None,
               send_rejects_to_format=False):
    """
    Service that accepts a JSON document and enriches the "type" field of that
    document by matching its existing sourceResource type strings, and then
    its format strings, against the type / format keyword mappings (via
    itemtype.type_for_strings_and_mappings).

    A default type, if none can be determined, may be specified with the
    "default" querystring parameter.  If no default is given and no type can
    be determined, the type field is removed from the result.

    If send_rejects_to_format is true, whatever itemtype.rejects reports for
    the type strings is appended to sourceResource/format.

    Type and format values may be a single value or a list; type entries may
    also be dicts carrying their text under "#text" or "text".  Null and other
    non-string entries are skipped.

    Returns the document serialized as JSON; the original body when the
    document has no sourceResource; or a plain-text error (response code 500)
    when the body is not JSON.

    NOTE: ctype, action, prop and format_field are accepted but not used by
    this function; the sourceResource type and format fields are always the
    ones read and written.
    """
    def lowered(values):
        # Lower-case the usable entries of a scalar-or-list field value.
        if not values:
            return []
        strings = []
        for value in values if isinstance(values, list) else [values]:
            if isinstance(value, dict):
                value = value.get('#text') or value.get('text', '')
            try:
                strings.append(value.lower())
            except AttributeError:
                # None, numbers, nested lists...: not usable as keywords.
                continue
        return strings

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    try:
        sr_type = data['sourceResource'].get('type', [])
        sr_format = data['sourceResource'].get('format', [])
    except KeyError:
        # In this case, sourceResource is not present, so give up and return
        # the original data unmodified.
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('enrich-type lacks sourceResource for _id %s' %
                       id_for_msg)
        return body

    type_strings = lowered(sr_type)
    format_strings = lowered(sr_format)

    try:
        data['sourceResource']['type'] = \
                itemtype.type_for_strings_and_mappings([
                    (type_strings, type_for_type_keyword),
                    (format_strings, type_for_format_keyword),
                ])
    except itemtype.NoTypeError:
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('Can not deduce type for item with _id: %s' %
                       id_for_msg)
        if default:
            data['sourceResource']['type'] = default
        else:
            data['sourceResource'].pop('type', None)

    # Runs after the try statement rather than in a "finally" clause, so an
    # unexpected error above is never masked by this step.
    if send_rejects_to_format and type_strings:
        rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
        if rej:
            if not isinstance(sr_format, list):
                # Wrap a scalar format; an empty / null one adds nothing.
                sr_format = [sr_format] if sr_format else []
            sr_format.extend(rej)
            data['sourceResource']['format'] = sr_format

    return json.dumps(data)
Ejemplo n.º 10
0
def enrichtype(body, ctype,
               action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format",
               default=None,
               send_rejects_to_format=False):
    """
    Service that accepts a JSON document and sets its sourceResource "type"
    from the document's existing format and type strings.

    The lower-cased format strings, then the lower-cased type strings, are
    passed to itemtype.type_for_strings_and_mappings together with their
    keyword mappings.  If that raises NoTypeError, a warning is logged and
    the type becomes "default" when one is given; otherwise the type field is
    removed.

    When send_rejects_to_format is true and there are type strings, the
    values returned by itemtype.rejects are appended to the format field.

    Returns the document as JSON, the untouched body when sourceResource is
    missing, or a plain-text error (response code 500) for unparseable JSON.
    """
    global type_for_type_keyword, type_for_format_keyword

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    try:
        sr_type = data['sourceResource'].get('type', [])
        sr_format = data['sourceResource'].get('format', [])
    except KeyError:
        # Without a sourceResource there is nothing to enrich: hand the
        # body back exactly as received.
        logger.warning('enrich-type lacks sourceResource for _id %s' %
                       data.get('_id', '[no id]'))
        return body

    type_strings = []
    if sr_type:
        type_entries = sr_type if isinstance(sr_type, list) else [sr_type]
        for entry in type_entries:
            if isinstance(entry, dict):
                entry = entry.get('#text', '')
            if entry is None:
                continue
            type_strings.append(entry.lower())

    format_strings = []
    if sr_format:
        format_entries = sr_format if isinstance(sr_format, list) \
                else [sr_format]
        format_strings = [entry.lower() for entry in format_entries
                          if entry is not None]

    keyword_sources = [
        (format_strings, type_for_format_keyword),
        (type_strings, type_for_type_keyword)
    ]
    try:
        data['sourceResource']['type'] = \
                itemtype.type_for_strings_and_mappings(keyword_sources)
    except itemtype.NoTypeError:
        logger.warning('Can not deduce type for item with _id: %s' %
                       data.get('_id', '[no id]'))
        if default:
            data['sourceResource']['type'] = default
        else:
            # Drop the type if present; a missing key is fine.
            data['sourceResource'].pop('type', None)
    finally:
        if send_rejects_to_format and type_strings:
            rejected = itemtype.rejects(
                [(type_strings, type_for_type_keyword)])
            if rejected:
                if not isinstance(sr_format, list):
                    sr_format = [sr_format]
                sr_format.extend(rejected)
                data['sourceResource']['format'] = sr_format

    return json.dumps(data)
Ejemplo n.º 11
0
def enrich_language(body, ctype, action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value

    The value at prop is replaced by a list of
    {"iso639_3": code, "name": name} dicts, one per distinct code in the
    order found.  If no code is found, a warning is logged and prop is
    deleted.  A document without prop is returned unchanged.

    Returns the document serialized as JSON, or a plain-text error (response
    code 500) when the body is not JSON.  ctype and action are not used.
    """

    def iso1_to_iso3(s):
        # Drop any region/script suffix ("en-US" -> "en") before the lookup.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = v if isinstance(v, list) else [v]

        iso_codes = []
        for lang_string in language_strings:
            if not hasattr(lang_string, "strip"):
                # null / dict / number entries cannot be matched; skip them
                # rather than crash in the lookups below.
                continue

            # Check if raw value is a code.  Repeats are recorded too (they
            # are de-duplicated below) so that an already-seen code never
            # falls through to the name matching.
            if lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
                continue

            # If lang_string is an ISO 639-1 code, convert to ISO 639-3
            iso3 = iso1_to_iso3(
                re.sub(r"[\.\[\]\(\)]", "", lang_string).lower().strip())
            if iso3 in ISO639_3_SUBST:
                iso_codes.append(iso3)
                continue

            # First check for exact language name matches
            for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                if regex.match(lang_string.strip()):
                    iso_codes.append(iso_code)
                    break
            else:
                # No exact match (also when the exact-name table is empty):
                # check for language names with word boundary regex
                for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items():
                    if regex.search(lang_string):
                        iso_codes.append(iso_code)

        if iso_codes:
            # Keep the first occurrence of each code, preserving order.
            seen = set()
            language = []
            for code in iso_codes:
                if code not in seen:
                    seen.add(code)
                    language.append({"iso639_3": code,
                                     "name": ISO639_3_SUBST[code]})
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data.get("_id", "[no id]")))
            delprop(data, prop)

    return json.dumps(data)
def enrich_language(body,
                    ctype,
                    action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value

    On success the value at prop becomes a list of
    {"iso639_3": code, "name": name} dicts with duplicates removed; when no
    code can be found a warning is logged and prop is deleted.  Documents
    that lack prop pass through unchanged.

    Returns the document as JSON, or a plain-text error (response code 500)
    if the body is not JSON.  ctype and action are not used.
    """
    def iso1_to_iso3(s):
        # Anything from the first "-", "_" or "/" onward is discarded.
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    def codes_for(lang_string):
        # Return the ISO 639-3 codes found for one language string.
        # Check if raw value is a code
        if lang_string in ISO639_3_SUBST:
            return [lang_string]

        # If lang_string is an ISO 639-1 code, convert to ISO 639-3
        iso3 = iso1_to_iso3(
            re.sub(r"[\.\[\]\(\)]", "", lang_string).lower().strip())
        if iso3 in ISO639_3_SUBST:
            return [iso3]

        # First check for exact language name matches
        for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
            if regex.match(lang_string.strip()):
                return [iso_code]

        # Check for language names with word boundary regex.  This is now
        # reached whenever no exact match exists, including when the
        # exact-name table is empty (the old "match" flag was unbound then).
        return [iso_code
                for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items()
                if regex.search(lang_string)]

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Null or non-string entries cannot be matched; skip them
            # instead of crashing inside the lookups.
            if hasattr(lang_string, "strip"):
                iso_codes.extend(codes_for(lang_string))

        if iso_codes:
            # De-duplicate while keeping first-seen order.
            seen = set()
            language = []
            for code in iso_codes:
                if code in seen:
                    continue
                seen.add(code)
                language.append({
                    "iso639_3": code,
                    "name": ISO639_3_SUBST[code]
                })
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data.get("_id", "[no id]")))
            delprop(data, prop)

    return json.dumps(data)