Example #1
0
def normalize_person(cls, node, graph):
    """Rebuild a person node's name attributes from its best available name.

    The candidate name is the longest of: the node's individual name-part
    attributes joined with spaces, its ``name`` attribute, and the empty
    string.  A node whose candidate is empty or matches ``NULL_RE`` is
    removed from ``graph``.  Otherwise the name is parsed with
    ``nameparser.HumanName`` and ``node.attrs`` is replaced with the
    title-cased parts plus a ``name`` built from them.  A non-empty
    ``location`` attribute is carried over with its whitespace cleaned.

    Args:
        cls: Unused by this function.
        node: Graph node exposing a dict-like ``attrs``.
        graph: Graph the node belongs to; only ``graph.remove`` is called.

    Returns:
        The result of ``graph.remove(node)`` when the node is discarded,
        otherwise ``None``.
    """
    attrs = node.attrs
    from_parts = strip_whitespace(' '.join(
        attrs[field] for field in NAME_PARTS.values() if attrs.get(field)
    ))
    from_name = strip_whitespace(attrs.get('name', ''))
    # On equal length max() keeps the first candidate, so the parts win ties.
    name = max(from_parts, from_name, '', key=len)

    if not name or NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', attrs.get('name', ''))
        return graph.remove(node)

    human = nameparser.HumanName(name)
    parts = {}
    for part_key, field in NAME_PARTS.items():
        value = strip_whitespace(human[part_key])
        if value:
            parts[field] = value.title()

    # Read the location before attrs is replaced: the replacement dict holds
    # only name fields, so looking it up afterwards would never find it.
    location = attrs.get('location')

    node.attrs = {
        'name': ' '.join(
            parts[field] for field in NAME_PARTS.values() if field in parts
        ),
        **parts,
    }

    if location:
        node.attrs['location'] = strip_whitespace(location)
Example #2
0
    def normalize(cls, node, graph):
        """Split a delimited tag name into separate lowercase tag nodes.

        The node's ``name`` is split on commas and semicolons; every
        non-empty piece is whitespace-cleaned and lowercased.  When that
        yields exactly the original name, the node is left untouched.
        Otherwise one ``tag`` node is created per piece (in sorted order),
        each new tag is related, via a freshly created ``throughtags`` node,
        to ``edge.subject.related('creative_work').related`` for every edge
        in ``node.related('work_relations')``, and the original node is
        removed from ``graph``.
        """
        raw_name = node.attrs['name']

        names = []
        for piece in re.split(',|;', raw_name):
            cleaned = strip_whitespace(piece)
            if cleaned:
                names.append(cleaned.lower())

        # A single piece identical to the stored name is already normalized.
        if names == [raw_name]:
            return

        logger.debug('Normalized "%s" to %s', raw_name, names)

        # Sorting keeps the creation order deterministic.
        new_tags = []
        for tag_name in sorted(names):
            new_tags.append(graph.create(None, 'tag', {'name': tag_name}))

        for new_tag in new_tags:
            for edge in node.related('work_relations'):
                through = graph.create(None, 'throughtags', {})
                graph.relate(through, new_tag)
                work = edge.subject.related('creative_work').related
                graph.relate(through, work)

        graph.remove(node)
    def _normalize_non_person(self, node):
        """Clean up the name of a non-person agent and split out a location.

        Nodes without a name, or whose name matches ``self.NULL_RE``, are
        deleted.  Otherwise lowercase-initial words of three or more
        characters are title-cased (skipping words that start with "for",
        "and" or "the"), and the node type is replaced by the guessed type
        when that type's model reports fewer types than the node's current
        model.  Two name layouts are then recognised:

        * ``<...Department/Institute...>;[ <middle>; ]<location>`` -- the
          last piece becomes ``location`` and the remaining pieces, in
          reverse order, are joined with `` - `` to form ``name``.
        * ``<name>, <a>, <b>`` -- the first piece becomes ``name`` and the
          rest are joined with ``, `` to form ``location``.

        Any other name is stored as-is after title casing.
        """
        # TODO reevaluate everything in this method

        def cleaned_groups(found):
            # Whitespace-cleaned regex groups, dropping empty/unmatched ones.
            return [
                strip_whitespace(group)
                for group in found.groups()
                if group and strip_whitespace(group)
            ]

        name = node.attrs().get('name')

        if not name or self.NULL_RE.match(name):
            self.info('Discarding unnamed agent', node.id)
            node.delete()
            return

        # Slightly more intelligent title casing
        name = re.sub(
            r'(?!for|and|the)\b[a-z]\w{2,}',
            lambda found: found.group().title(),
            name,
        )

        guessed_type = GuessAgentTypeLink(default=node.type).execute(name)
        # Upgrade only when the guess is MORE specific, i.e. covers FEWER types.
        guessed_count = len(apps.get_model('share', guessed_type).get_types())
        current_count = len(node.model.get_types())
        if guessed_count < current_count:
            node.type = guessed_type

        department = re.match(
            r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$',
            name,
            re.I,
        )
        if department:
            *parts, location = cleaned_groups(department)
            node['name'] = ' - '.join(reversed(parts))
            node['location'] = location
            return

        listing = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
        if listing:
            name, *location = cleaned_groups(listing)
            node['name'] = name
            node['location'] = ', '.join(location)

        node['name'] = name
    def regulate_node(self, node):
        """Tokenize a tag node whose name holds several delimited tags.

        The node's ``name`` is split on commas and semicolons; non-empty
        pieces are whitespace-cleaned and lowercased.

        * No pieces: the node is deleted.
        * One piece: the node's ``name`` is overwritten with it.
        * Several pieces: for each piece (in sorted order) a ``tag`` node is
          added and, for every entry in the node's ``work_relations``, a
          ``throughtags`` node pairing the new tag with that entry's
          ``creative_work``; the original node is then deleted.
        """
        tag_names = []
        for piece in re.split(',|;', node['name']):
            cleaned = strip_whitespace(piece)
            if cleaned:
                tag_names.append(cleaned.lower())

        if not tag_names:
            self.info('Discarding nameless tag', node.id)
            node.delete()
            return

        if len(tag_names) == 1:
            node['name'] = tag_names[0]
            return

        work_relations = node['work_relations']
        for tag_name in sorted(tag_names):
            tag_node = node.graph.add_node(None, 'tag', {'name': tag_name})
            self.info('Added tokenized tag', tag_node.id)
            for relation in work_relations:
                node.graph.add_node(None, 'throughtags', {
                    'tag': tag_node,
                    'creative_work': relation['creative_work'],
                })

        self.info('Discarded tag with multiple names', node.id)
        node.delete()
Example #5
0
    def regulate_node(self, node):
        """Tokenize a tag node whose name holds several delimited tags.

        The node's ``name`` is split on commas and semicolons; non-empty
        pieces are whitespace-cleaned and lowercased.

        * No pieces: the node is deleted.
        * One piece: the node's ``name`` is overwritten with it.
        * Several pieces: for each piece (in sorted order) a ``tag`` node is
          added and, for every entry in the node's ``work_relations``, a
          ``throughtags`` node pairing the new tag with that entry's
          ``creative_work``; the original node is then deleted.
        """
        tag_names = []
        for piece in re.split(',|;', node['name']):
            cleaned = strip_whitespace(piece)
            if cleaned:
                tag_names.append(cleaned.lower())

        if not tag_names:
            self.info('Discarding nameless tag', node.id)
            node.delete()
            return

        if len(tag_names) == 1:
            node['name'] = tag_names[0]
            return

        work_relations = node['work_relations']
        for tag_name in sorted(tag_names):
            tag_node = node.graph.add_node(None, 'tag', {'name': tag_name})
            self.info('Added tokenized tag', tag_node.id)
            for relation in work_relations:
                node.graph.add_node(None, 'throughtags', {
                    'tag': tag_node,
                    'creative_work': relation['creative_work'],
                })

        self.info('Discarded tag with multiple names', node.id)
        node.delete()
Example #6
0
 def regulate_node(self, node):
     """Whitespace-clean every string attribute of ``node``.

     Each string value is passed through ``strip_whitespace``; a cleaned
     value matching ``self.NULL_RE`` is stored as the empty string.
     Non-string values are left alone.
     """
     for key, value in node.attrs().items():
         if not isinstance(value, str):
             continue
         cleaned = strip_whitespace(value)
         node[key] = '' if self.NULL_RE.match(cleaned) else cleaned
Example #7
0
def normalize_person(cls, node, graph):
    """Rebuild a person node's name attributes from its best available name.

    The candidate name is the longest of: the node's individual name-part
    attributes joined with spaces, its ``name`` attribute, and the empty
    string.  A node whose candidate is empty or matches ``NULL_RE`` is
    removed from ``graph``.  Otherwise the name is parsed with
    ``nameparser.HumanName`` and ``node.attrs`` is replaced with the
    title-cased parts plus a ``name`` built from them.  A non-empty
    ``location`` attribute is carried over with its whitespace cleaned.

    Args:
        cls: Unused by this function.
        node: Graph node exposing a dict-like ``attrs``.
        graph: Graph the node belongs to; only ``graph.remove`` is called.

    Returns:
        The result of ``graph.remove(node)`` when the node is discarded,
        otherwise ``None``.
    """
    attrs = node.attrs
    from_parts = strip_whitespace(' '.join(
        attrs[field] for field in NAME_PARTS.values() if attrs.get(field)
    ))
    from_name = strip_whitespace(attrs.get('name', ''))
    # On equal length max() keeps the first candidate, so the parts win ties.
    name = max(from_parts, from_name, '', key=len)

    if not name or NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', attrs.get('name', ''))
        return graph.remove(node)

    human = nameparser.HumanName(name)
    parts = {}
    for part_key, field in NAME_PARTS.items():
        value = strip_whitespace(human[part_key])
        if value:
            parts[field] = value.title()

    # Read the location before attrs is replaced: the replacement dict holds
    # only name fields, so looking it up afterwards would never find it.
    location = attrs.get('location')

    node.attrs = {
        'name': ' '.join(
            parts[field] for field in NAME_PARTS.values() if field in parts
        ),
        **parts,
    }

    if location:
        node.attrs['location'] = strip_whitespace(location)
Example #8
0
    def _normalize_non_person(self, node):
        """Clean up the name of a non-person agent and split out a location.

        Nodes without a name, or whose name matches ``self.NULL_RE``, are
        deleted.  Otherwise lowercase-initial words of three or more
        characters are title-cased (skipping words that start with "for",
        "and" or "the"), and the node type is replaced by the guessed type
        when that type's model reports fewer types than the node's current
        model.  Two name layouts are then recognised:

        * ``<...Department/Institute...>;[ <middle>; ]<location>`` -- the
          last piece becomes ``location`` and the remaining pieces, in
          reverse order, are joined with `` - `` to form ``name``.
        * ``<name>, <a>, <b>`` -- the first piece becomes ``name`` and the
          rest are joined with ``, `` to form ``location``.

        Any other name is stored as-is after title casing.
        """
        # TODO reevaluate everything in this method

        def cleaned_groups(found):
            # Whitespace-cleaned regex groups, dropping empty/unmatched ones.
            return [
                strip_whitespace(group)
                for group in found.groups()
                if group and strip_whitespace(group)
            ]

        name = node.attrs().get('name')

        if not name or self.NULL_RE.match(name):
            self.info('Discarding unnamed agent', node.id)
            node.delete()
            return

        # Slightly more intelligent title casing
        name = re.sub(
            r'(?!for|and|the)\b[a-z]\w{2,}',
            lambda found: found.group().title(),
            name,
        )

        guessed_type = GuessAgentTypeLink(default=node.type).execute(name)
        # Upgrade only when the guess is MORE specific, i.e. covers FEWER types.
        guessed_count = len(apps.get_model('share', guessed_type).get_types())
        current_count = len(node.model.get_types())
        if guessed_count < current_count:
            node.type = guessed_type

        department = re.match(
            r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$',
            name,
            re.I,
        )
        if department:
            *parts, location = cleaned_groups(department)
            node['name'] = ' - '.join(reversed(parts))
            node['location'] = location
            return

        listing = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
        if listing:
            name, *location = cleaned_groups(listing)
            node['name'] = name
            node['location'] = ', '.join(location)

        node['name'] = name
Example #9
0
    def normalize(cls, node, graph):
        """Clean up an agent node's name and split out a location.

        Nodes that have no ``name`` attribute and are not blank are skipped.
        The name is whitespace-cleaned and title-cased (lowercase-initial
        words of three or more characters, skipping words that start with
        "for", "and" or "the"); a result matching ``NULL_RE`` causes the node
        to be removed from ``graph``.  The node's ``_type`` is replaced by
        the guessed type when that type's model reports fewer types than the
        node's current model.  Two name layouts are then recognised:

        * ``<...Department/Institute...>;[ <middle>; ]<location>`` -- the
          last piece becomes ``location`` and the remaining pieces, in
          reverse order, are joined with `` - `` to form ``name``.
        * ``<name>, <a>, <b>`` -- the first piece becomes ``name`` and the
          rest are joined with ``, `` to form ``location``.

        Returns:
            The result of ``graph.remove(node)`` when the node is discarded,
            otherwise ``None``.
        """
        def cleaned_groups(found):
            # Whitespace-cleaned regex groups, dropping empty/unmatched ones.
            return [
                strip_whitespace(group)
                for group in found.groups()
                if group and strip_whitespace(group)
            ]

        # NOTE(review): a blank node without a 'name' passes this guard and
        # then fails on the node.attrs['name'] lookup below -- confirm intent.
        if not ('name' in node.attrs or node.is_blank):
            return

        name = strip_whitespace(node.attrs['name'])

        # Slightly more intelligent title casing
        name = re.sub(
            r'(?!for|and|the)\b[a-z]\w{2,}',
            lambda found: found.group().title(),
            name,
        )

        if NULL_RE.match(name):
            logger.debug('Discarding unnamed agent "%s"', name)
            return graph.remove(node)

        # The type guess is made from the stored name, not the cleaned one.
        guesser = GuessAgentTypeLink(default=node.type)
        guessed_type = guesser.execute(node.attrs['name'])
        # Upgrade only when the guess is MORE specific, i.e. covers FEWER types.
        guessed_count = len(apps.get_model('share', guessed_type).get_types())
        current_count = len(node.model.get_types())
        if guessed_count < current_count:
            node._type = guessed_type

        department = re.match(
            r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$',
            name,
            re.I,
        )
        if department:
            *parts, location = cleaned_groups(department)
            node.attrs['name'] = ' - '.join(reversed(parts))
            node.attrs['location'] = location
            return

        listing = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
        if listing:
            name, *location = cleaned_groups(listing)
            node.attrs['name'] = name
            node.attrs['location'] = ', '.join(location)

        node.attrs['name'] = name
        if node.attrs.get('location'):
            node.attrs['location'] = strip_whitespace(node.attrs['location'])
Example #10
0
    def normalize(cls, node, graph):
        """Split a delimited tag name into separate lowercase tag nodes.

        The node's ``name`` is split on commas and semicolons; every
        non-empty piece is whitespace-cleaned and lowercased.  When that
        yields exactly the original name, the node is left untouched.
        Otherwise one ``tag`` node is created per piece (in sorted order),
        each new tag is related, via a freshly created ``throughtags`` node,
        to ``edge.subject.related('creative_work').related`` for every edge
        in ``node.related('work_relations')``, and the original node is
        removed from ``graph``.
        """
        raw_name = node.attrs['name']

        names = []
        for piece in re.split(',|;', raw_name):
            cleaned = strip_whitespace(piece)
            if cleaned:
                names.append(cleaned.lower())

        # A single piece identical to the stored name is already normalized.
        if names == [raw_name]:
            return

        logger.debug('Normalized "%s" to %s', raw_name, names)

        # Sorting keeps the creation order deterministic.
        new_tags = []
        for tag_name in sorted(names):
            new_tags.append(graph.create(None, 'tag', {'name': tag_name}))

        for new_tag in new_tags:
            for edge in node.related('work_relations'):
                through = graph.create(None, 'throughtags', {})
                graph.relate(through, new_tag)
                work = edge.subject.related('creative_work').related
                graph.relate(through, work)

        graph.remove(node)
Example #11
0
    def normalize(cls, node, graph):
        """Clean up an agent node's name and split out a location.

        Nodes that have no ``name`` attribute and are not blank are skipped.
        The name is whitespace-cleaned and title-cased (lowercase-initial
        words of three or more characters, skipping words that start with
        "for", "and" or "the"); a result matching ``NULL_RE`` causes the node
        to be removed from ``graph``.  The node's ``_type`` is replaced by
        the guessed type when that type's model reports fewer types than the
        node's current model.  Two name layouts are then recognised:

        * ``<...Department/Institute...>;[ <middle>; ]<location>`` -- the
          last piece becomes ``location`` and the remaining pieces, in
          reverse order, are joined with `` - `` to form ``name``.
        * ``<name>, <a>, <b>`` -- the first piece becomes ``name`` and the
          rest are joined with ``, `` to form ``location``.

        Returns:
            The result of ``graph.remove(node)`` when the node is discarded,
            otherwise ``None``.
        """
        def cleaned_groups(found):
            # Whitespace-cleaned regex groups, dropping empty/unmatched ones.
            return [
                strip_whitespace(group)
                for group in found.groups()
                if group and strip_whitespace(group)
            ]

        # NOTE(review): a blank node without a 'name' passes this guard and
        # then fails on the node.attrs['name'] lookup below -- confirm intent.
        if not ('name' in node.attrs or node.is_blank):
            return

        name = strip_whitespace(node.attrs['name'])

        # Slightly more intelligent title casing
        name = re.sub(
            r'(?!for|and|the)\b[a-z]\w{2,}',
            lambda found: found.group().title(),
            name,
        )

        if NULL_RE.match(name):
            logger.debug('Discarding unnamed agent "%s"', name)
            return graph.remove(node)

        # The type guess is made from the stored name, not the cleaned one.
        guesser = GuessAgentTypeLink(default=node.type)
        guessed_type = guesser.execute(node.attrs['name'])
        # Upgrade only when the guess is MORE specific, i.e. covers FEWER types.
        guessed_count = len(apps.get_model('share', guessed_type).get_types())
        current_count = len(node.model.get_types())
        if guessed_count < current_count:
            node._type = guessed_type

        department = re.match(
            r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$',
            name,
            re.I,
        )
        if department:
            *parts, location = cleaned_groups(department)
            node.attrs['name'] = ' - '.join(reversed(parts))
            node.attrs['location'] = location
            return

        listing = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
        if listing:
            name, *location = cleaned_groups(listing)
            node.attrs['name'] = name
            node.attrs['location'] = ', '.join(location)

        node.attrs['name'] = name
        if node.attrs.get('location'):
            node.attrs['location'] = strip_whitespace(node.attrs['location'])
Example #12
0
 def normalize(self, node, graph):
     """Whitespace-clean every string attribute of ``node``.

     Each string value in ``node.attrs`` is passed through
     ``strip_whitespace``; a cleaned value equal to ``'null'`` is stored as
     the empty string.  Non-string values are left alone.  ``graph`` is not
     used.
     """
     # Iterate over a snapshot so attrs can be assigned to during the loop.
     for key, value in tuple(node.attrs.items()):
         if not isinstance(value, str):
             continue
         cleaned = strip_whitespace(value)
         node.attrs[key] = '' if cleaned == 'null' else cleaned
Example #13
0
def normalize_contributor(cls, node, graph):
    """Ensure a contributor node has a whitespace-cleaned ``cited_as``.

    When ``cited_as`` is missing or empty it is taken from the ``name``
    attribute of ``node.related('agent').related``; the value is then passed
    through ``strip_whitespace`` and stored back on the node.  ``cls`` and
    ``graph`` are not used.
    """
    cited_as = node.attrs.get('cited_as')
    if not cited_as:
        cited_as = node.related('agent').related.attrs['name']
    node.attrs['cited_as'] = strip_whitespace(cited_as)
Example #14
0
 def normalize(self, node, graph):
     """Whitespace-clean every string attribute of ``node``.

     Each string value in ``node.attrs`` is passed through
     ``strip_whitespace``; a cleaned value equal to ``'null'`` is stored as
     the empty string.  Non-string values are left alone.  ``graph`` is not
     used.
     """
     # Iterate over a snapshot so attrs can be assigned to during the loop.
     for key, value in tuple(node.attrs.items()):
         if not isinstance(value, str):
             continue
         cleaned = strip_whitespace(value)
         node.attrs[key] = '' if cleaned == 'null' else cleaned