Beispiel #1
0
    def map_type(self):
        """Set sourceResource "type" from the record's descriptive text.

        Gathers non-empty, lower-cased strings from three places:
        freetext/physicalDescription, freetext/objectType entries whose
        "@label" is "Type", and indexedStructured/object_type.  The
        physical-description strings (paired with self.type_for_phys_keyword)
        and the object-type strings (paired with self.type_for_ot_keyword)
        are passed, in that order, to itemtype.type_for_strings_and_mappings.
        If that raises itemtype.NoTypeError, a warning is logged and the type
        falls back to "image".
        """
        labelled_types = self.extract_xml_items("freetext", "objectType")
        descriptions = self.extract_xml_items("freetext", "physicalDescription")
        indexed_types = self.extract_xml_items("indexedStructured", "object_type")

        # Physical descriptions: every entry's "#text" counts.
        stripped_descriptions = [d.get("#text", "").strip() for d in descriptions]
        phys_type_strings = [text.lower() for text in stripped_descriptions if text]

        # Object types: only freetext entries labelled "Type" count, followed
        # by the indexedStructured values, which are used as plain strings.
        stripped_labelled = [
            entry.get("#text", "").strip()
            for entry in labelled_types
            if entry.get("@label", "") == "Type"
        ]
        stripped_indexed = [value.strip() for value in indexed_types]
        object_type_strings = [
            text.lower() for text in stripped_labelled + stripped_indexed if text
        ]

        candidates = [
            (phys_type_strings, self.type_for_phys_keyword),
            (object_type_strings, self.type_for_ot_keyword),
        ]
        try:
            new_type = itemtype.type_for_strings_and_mappings(candidates)
        except itemtype.NoTypeError:
            id_for_msg = self.provider_data.get("_id", "[no _id]")
            logger.warning("Can not deduce type for item with _id: %s" % id_for_msg)
            new_type = "image"

        self.update_source_resource({"type": new_type})
Beispiel #2
0
    def map_type(self):
        """Work out the item's type and store it in sourceResource.

        Candidate strings come from freetext/physicalDescription,
        from freetext/objectType elements whose "@label" equals "Type",
        and from indexedStructured/object_type.  Each string is stripped,
        dropped if empty, and lower-cased before being handed to
        itemtype.type_for_strings_and_mappings together with the matching
        keyword mapping.  When no type can be deduced (NoTypeError), a
        warning is logged and 'image' is used.
        """
        def normalized(raw_values):
            # Trim each value, discard empties, lower-case what is left.
            trimmed = (raw.strip() for raw in raw_values)
            return [value.lower() for value in trimmed if value]

        ot_ccase = self.extract_xml_items("freetext", "objectType")
        phys_desc = self.extract_xml_items("freetext",
                                           "physicalDescription")
        ot_uscore = self.extract_xml_items("indexedStructured",
                                           "object_type")

        phys_type_strings = normalized(
            [pd.get("#text", "") for pd in phys_desc])

        # Labelled freetext entries first, then the indexedStructured
        # values, which are already plain strings.
        object_type_strings = normalized(
            [ot.get("#text", "") for ot in ot_ccase
             if ot.get("@label", "") == "Type"])
        object_type_strings.extend(normalized(ot_uscore))

        try:
            new_type = itemtype.type_for_strings_and_mappings([
                (phys_type_strings, self.type_for_phys_keyword),
                (object_type_strings, self.type_for_ot_keyword),
            ])
        except itemtype.NoTypeError:
            item_id = self.provider_data.get("_id", "[no _id]")
            message = "Can not deduce type for item with _id: %s" % item_id
            logger.warning(message)
            new_type = 'image'

        self.update_source_resource({"type": new_type})
Beispiel #3
0
def enrichtype(body,
               ctype,
               action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format",
               default=None,
               send_rejects_to_format=False):
    """
    Service that accepts a JSON document and enriches the "type" field of
    its sourceResource.

    The existing sourceResource "type" and "format" values are lower-cased
    and passed to itemtype.type_for_strings_and_mappings (type strings
    first, then format strings); the result replaces sourceResource "type".

    If no type can be deduced, a warning is logged and the type is set to
    "default" when one is given; otherwise the "type" field is removed.

    If "send_rejects_to_format" is truthy, whatever itemtype.rejects returns
    for the type strings is appended to sourceResource "format".

    Returns the modified document serialized as JSON.  If the document has
    no sourceResource, the original body is returned unchanged.  If the body
    is not valid JSON, the response code is set to 500 and a plain-text
    error message is returned.

    NOTE: "ctype", "action", "prop" and "format_field" are accepted but not
    used by this function body; the fields worked on are always
    sourceResource/type and sourceResource/format.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    try:
        source_resource = data['sourceResource']
    except KeyError:
        # In this case, sourceResource is not present, so give up and return
        # the original data unmodified.
        logger.warning('enrich-type lacks sourceResource for _id %s',
                       data.get('_id', '[no id]'))
        return body
    sr_type = source_resource.get('type', [])
    sr_format = source_resource.get('format', [])

    type_strings = []
    if sr_type:
        for t in sr_type if isinstance(sr_type, list) else [sr_type]:
            if isinstance(t, dict):
                # Prefer "#text"; fall back to "text" when it is missing or
                # empty.
                t = t.get('#text') or t.get('text', '')
            if t is None:
                # JSON null carries no type information; skip it rather
                # than fail on .lower().
                continue
            type_strings.append(t.lower())

    format_strings = []
    if sr_format:
        for f in sr_format if isinstance(sr_format, list) else [sr_format]:
            if f is None:
                continue
            format_strings.append(f.lower())

    try:
        source_resource['type'] = itemtype.type_for_strings_and_mappings([
            (type_strings, type_for_type_keyword),
            (format_strings, type_for_format_keyword),
        ])
    except itemtype.NoTypeError:
        logger.warning('Can not deduce type for item with _id: %s',
                       data.get('_id', '[no id]'))
        if default:
            source_resource['type'] = default
        else:
            # Drop the type if there is one; nothing to do otherwise.
            source_resource.pop('type', None)
    finally:
        # Runs whether or not a type was deduced.
        if send_rejects_to_format and type_strings:
            rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
            if rej:
                if not isinstance(sr_format, list):
                    # Wrap a scalar format; a null format contributes
                    # nothing.
                    sr_format = [] if sr_format is None else [sr_format]
                sr_format.extend(rej)
                source_resource['format'] = sr_format

    return json.dumps(data)
Beispiel #4
0
def enrichtype(body, ctype,
               action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format",
               default=None,
               send_rejects_to_format=False):
    """
    Service that accepts a JSON document and enriches the "type" field of
    its sourceResource.

    The existing sourceResource "format" and "type" values are lower-cased
    and passed to itemtype.type_for_strings_and_mappings (format strings
    first, then type strings); the result replaces sourceResource "type".

    If no type can be deduced, a warning is logged and the type is set to
    "default" when one is given; otherwise the "type" field is removed.

    If "send_rejects_to_format" is truthy, whatever itemtype.rejects returns
    for the type strings is appended to sourceResource "format".

    Returns the modified document serialized as JSON.  If the document has
    no sourceResource, the original body is returned unchanged.  If the body
    is not valid JSON, the response code is set to 500 and a plain-text
    error message is returned.

    NOTE: "ctype", "action", "prop" and "format_field" are accepted but not
    used by this function body; the fields worked on are always
    sourceResource/type and sourceResource/format.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    try:
        source_resource = data['sourceResource']
    except KeyError:
        # In this case, sourceResource is not present, so give up and return
        # the original data unmodified.
        logger.warning('enrich-type lacks sourceResource for _id %s',
                       data.get('_id', '[no id]'))
        return body
    sr_type = source_resource.get('type', [])
    sr_format = source_resource.get('format', [])

    type_strings = []
    if sr_type:
        for t in sr_type if isinstance(sr_type, list) else [sr_type]:
            if isinstance(t, dict):
                t = t.get('#text', '')
            # A JSON null (bare, or as the "#text" value) has nothing to
            # lower-case.
            if t is not None:
                type_strings.append(t.lower())

    format_strings = []
    if sr_format:
        for f in sr_format if isinstance(sr_format, list) else [sr_format]:
            if f is not None:
                format_strings.append(f.lower())

    try:
        source_resource['type'] = itemtype.type_for_strings_and_mappings([
            (format_strings, type_for_format_keyword),
            (type_strings, type_for_type_keyword),
        ])
    except itemtype.NoTypeError:
        logger.warning('Can not deduce type for item with _id: %s',
                       data.get('_id', '[no id]'))
        if default:
            source_resource['type'] = default
        else:
            # Drop the type if there is one; nothing to do otherwise.
            source_resource.pop('type', None)
    finally:
        # Runs whether or not a type was deduced.
        if send_rejects_to_format and type_strings:
            rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
            if rej:
                if not isinstance(sr_format, list):
                    # Wrap a scalar format; a null format contributes
                    # nothing.
                    sr_format = [] if sr_format is None else [sr_format]
                sr_format.extend(rej)
                source_resource['format'] = sr_format

    return json.dumps(data)