def normalize_person(cls, node, graph):
    """Normalize a person agent node's name attributes in place.

    Chooses the longest candidate name (the joined individual name parts vs
    the raw ``name`` attribute), discards the node entirely when that name is
    null-ish, and otherwise re-parses it with ``nameparser`` to rebuild
    ``node.attrs`` from title-cased parts plus a reconstructed ``name``.

    Args:
        node: the person agent node being normalized.
        graph: the graph the node belongs to; used to remove discarded nodes.

    Returns:
        The result of ``graph.remove(node)`` when the node is discarded,
        otherwise ``None``.
    """
    # Prefer whichever candidate carries the most information (longest wins).
    name = max(
        strip_whitespace(' '.join(
            node.attrs[x] for x in NAME_PARTS.values() if node.attrs.get(x)
        )),
        strip_whitespace(node.attrs.get('name', '')),
        '',
        key=len,
    )

    if NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', node.attrs.get('name', ''))
        return graph.remove(node)

    human = nameparser.HumanName(name)

    # Hoisted: the original comprehension called strip_whitespace(human[k])
    # twice per part (once in the condition, once in the value). Clean each
    # part exactly once instead.
    parts = {}
    for part_key, attr_name in NAME_PARTS.items():
        cleaned = strip_whitespace(human[part_key])
        if cleaned:
            parts[attr_name] = cleaned.title()

    node.attrs = {
        'name': ' '.join(parts[k] for k in NAME_PARTS.values() if k in parts),
        **parts,
    }

    # NOTE(review): 'location' is read back from the freshly reassigned attrs;
    # this only has effect if the attrs setter merges/preserves it -- confirm.
    if node.attrs.get('location'):
        node.attrs['location'] = strip_whitespace(node.attrs['location'])
def normalize(cls, node, graph):
    """Split a tag node whose name holds multiple comma/semicolon separated
    values into one tag node per value, re-linking its work relations.
    """
    tags = []
    for raw_part in re.split(',|;', node.attrs['name']):
        cleaned = strip_whitespace(raw_part)
        if cleaned:
            tags.append(cleaned.lower())

    # Already a single, fully-normalized tag -- leave the node untouched.
    if len(tags) == 1 and tags[0] == node.attrs['name']:
        return

    logger.debug('Normalized "%s" to %s', node.attrs['name'], tags)

    # ensure tags are always created in the same order
    tag_nodes = [graph.create(None, 'tag', {'name': tag}) for tag in sorted(tags)]

    for tag_node in tag_nodes:
        for edge in node.related('work_relations'):
            through = graph.create(None, 'throughtags', {})
            graph.relate(through, tag_node)
            graph.relate(through, edge.subject.related('creative_work').related)

    graph.remove(node)
def _normalize_non_person(self, node):
    """Tidy a non-person agent node: discard it if unnamed, title-case its
    name, upgrade its type when a more specific guess exists, and split any
    embedded location information out of the name.
    """
    # TODO reevaluate everything in this method
    name = node.attrs().get('name')
    if not name or self.NULL_RE.match(name):
        self.info('Discarding unnamed agent', node.id)
        node.delete()
        return

    # Slightly more intelligent title casing (skips short/stop words).
    name = re.sub(
        r'(?!for|and|the)\b[a-z]\w{2,}',
        lambda m: m.group().title(),
        name,
    )

    maybe_type = GuessAgentTypeLink(default=node.type).execute(name)
    # If the new type is MORE specific, IE encompasses FEWER types, upgrade. Otherwise ignore
    if len(apps.get_model('share', maybe_type).get_types()) < len(node.model.get_types()):
        node.type = maybe_type

    dept_match = re.match(
        r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$', name, re.I)
    if dept_match:
        pieces = [strip_whitespace(g) for g in dept_match.groups() if g and strip_whitespace(g)]
        *name_parts, location = pieces
        node['name'] = ' - '.join(reversed(name_parts))
        node['location'] = location
        return

    comma_match = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
    if comma_match:
        pieces = [strip_whitespace(g) for g in comma_match.groups() if g and strip_whitespace(g)]
        name, *location = pieces
        node['name'] = name
        node['location'] = ', '.join(location)

    node['name'] = name
def regulate_node(self, node):
    """Tokenize a tag node's name on commas/semicolons: drop nameless tags,
    lowercase single tags, and explode multi-valued tags into new nodes.
    """
    cleaned_parts = (strip_whitespace(p) for p in re.split(',|;', node['name']))
    tags = [part.lower() for part in cleaned_parts if part]

    if not tags:
        self.info('Discarding nameless tag', node.id)
        node.delete()
        return

    if len(tags) == 1:
        node['name'] = tags[0]
        return

    through_tags = node['work_relations']
    # sorted() keeps creation order deterministic across runs.
    for tag_name in sorted(tags):
        new_tag = node.graph.add_node(None, 'tag', {'name': tag_name})
        self.info('Added tokenized tag', new_tag.id)
        for through_tag in through_tags:
            node.graph.add_node(None, 'throughtags', {
                'tag': new_tag,
                'creative_work': through_tag['creative_work'],
            })

    self.info('Discarded tag with multiple names', node.id)
    node.delete()
def regulate_node(self, node):
    """Split a multi-valued tag name into individual lowercase tag nodes.

    Nameless tags are deleted; a single-valued tag is lowercased in place;
    anything else is exploded into new tag/throughtags nodes and deleted.
    """
    tags = []
    for piece in re.split(',|;', node['name']):
        stripped = strip_whitespace(piece)
        if stripped:
            tags.append(stripped.lower())

    if not tags:
        self.info('Discarding nameless tag', node.id)
        node.delete()
        return

    if len(tags) == 1:
        node['name'] = tags[0]
        return

    relations = node['work_relations']
    for tag_name in sorted(tags):  # deterministic creation order
        created = node.graph.add_node(None, 'tag', {'name': tag_name})
        self.info('Added tokenized tag', created.id)
        for relation in relations:
            node.graph.add_node(
                None,
                'throughtags',
                {'tag': created, 'creative_work': relation['creative_work']},
            )

    self.info('Discarded tag with multiple names', node.id)
    node.delete()
def regulate_node(self, node):
    """Strip surrounding whitespace from every string attribute of *node*,
    blanking out values that match the null pattern. Non-string values are
    left untouched.
    """
    for key, value in node.attrs().items():
        if not isinstance(value, str):
            continue
        stripped = strip_whitespace(value)
        node[key] = '' if self.NULL_RE.match(stripped) else stripped
def normalize_person(cls, node, graph):
    """Rebuild a person node's name attributes via nameparser.

    The longest of (joined name parts, raw ``name`` attr) wins; null-ish
    names cause the node to be removed from the graph.
    """
    joined_parts = ' '.join(
        node.attrs[key] for key in NAME_PARTS.values() if node.attrs.get(key)
    )
    candidates = (
        strip_whitespace(joined_parts),
        strip_whitespace(node.attrs.get('name', '')),
        '',
    )
    name = max(candidates, key=len)

    if NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', node.attrs.get('name', ''))
        return graph.remove(node)

    human = nameparser.HumanName(name)
    parts = {
        attr: strip_whitespace(human[part]).title()
        for part, attr in NAME_PARTS.items()
        if strip_whitespace(human[part])
    }
    rebuilt_name = ' '.join(parts[attr] for attr in NAME_PARTS.values() if attr in parts)
    node.attrs = {'name': rebuilt_name, **parts}

    if node.attrs.get('location'):
        node.attrs['location'] = strip_whitespace(node.attrs['location'])
def _normalize_non_person(self, node):
    """Normalize a non-person agent in place.

    Deletes unnamed agents, applies selective title casing, upgrades the
    node's type when a strictly more specific guess is available, and pulls
    location fragments out of semicolon/comma structured names.
    """
    # TODO reevaluate everything in this method
    attrs = node.attrs()
    name = attrs.get('name')
    if not name or self.NULL_RE.match(name):
        self.info('Discarding unnamed agent', node.id)
        node.delete()
        return

    # Slightly more intelligent title casing
    titled = re.sub(
        r'(?!for|and|the)\b[a-z]\w{2,}', lambda m: m.group().title(), name)
    name = titled

    maybe_type = GuessAgentTypeLink(default=node.type).execute(name)
    # If the new type is MORE specific, IE encompasses FEWER types, upgrade. Otherwise ignore
    current_breadth = len(node.model.get_types())
    guessed_breadth = len(apps.get_model('share', maybe_type).get_types())
    if guessed_breadth < current_breadth:
        node.type = maybe_type

    semi_pattern = r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$'
    match = re.match(semi_pattern, name, re.I)
    if match:
        groups = [strip_whitespace(g) for g in match.groups() if g and strip_whitespace(g)]
        *head, location = groups
        node['name'] = ' - '.join(reversed(head))
        node['location'] = location
        return

    match = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
    if match:
        groups = [strip_whitespace(g) for g in match.groups() if g and strip_whitespace(g)]
        name, *location = groups
        node['name'] = name
        node['location'] = ', '.join(location)

    node['name'] = name
def normalize(cls, node, graph):
    """Normalize a non-person agent node: title-case its name, discard
    null-named agents, refine its type, and extract location info embedded
    in the name.
    """
    if 'name' not in node.attrs and not node.is_blank:
        return

    name = strip_whitespace(node.attrs['name'])
    # Slightly more intelligent title casing (leaves short/stop words alone).
    name = re.sub(r'(?!for|and|the)\b[a-z]\w{2,}', lambda m: m.group().title(), name)

    if NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', name)
        return graph.remove(node)

    maybe_type = GuessAgentTypeLink(default=node.type).execute(node.attrs['name'])
    # If the new type is MORE specific, IE encompasses FEWER types, upgrade. Otherwise ignore
    if len(apps.get_model('share', maybe_type).get_types()) < len(node.model.get_types()):
        node._type = maybe_type

    dept_match = re.match(
        r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$', name, re.I)
    if dept_match:
        cleaned = [strip_whitespace(g) for g in dept_match.groups() if g and strip_whitespace(g)]
        *head, location = cleaned
        node.attrs['name'] = ' - '.join(reversed(head))
        node.attrs['location'] = location
        return

    comma_match = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
    if comma_match:
        cleaned = [strip_whitespace(g) for g in comma_match.groups() if g and strip_whitespace(g)]
        name, *location = cleaned
        node.attrs['name'] = name
        node.attrs['location'] = ', '.join(location)

    node.attrs['name'] = name

    if node.attrs.get('location'):
        node.attrs['location'] = strip_whitespace(node.attrs['location'])
def normalize(cls, node, graph):
    """Explode a comma/semicolon separated tag name into one tag node per
    value, wiring each new tag to the original node's work relations.
    """
    stripped = (strip_whitespace(p) for p in re.split(',|;', node.attrs['name']))
    tags = [p.lower() for p in stripped if p]

    # Equivalent to: exactly one tag AND it equals the original name.
    if tags == [node.attrs['name']]:
        return

    logger.debug('Normalized "%s" to %s', node.attrs['name'], tags)

    # ensure tags are always created in the same order
    created = []
    for tag_name in sorted(tags):
        created.append(graph.create(None, 'tag', {'name': tag_name}))

    for created_tag in created:
        for edge in node.related('work_relations'):
            through = graph.create(None, 'throughtags', {})
            graph.relate(through, created_tag)
            graph.relate(through, edge.subject.related('creative_work').related)

    graph.remove(node)
def normalize(cls, node, graph):
    """Clean up a non-person agent node.

    Discards null-named agents, selectively title-cases the name, upgrades
    the node type when a strictly narrower guess exists, and splits location
    fragments out of structured names.
    """
    if 'name' not in node.attrs and not node.is_blank:
        return

    name = strip_whitespace(node.attrs['name'])
    # Slightly more intelligent title casing
    name = re.sub(
        r'(?!for|and|the)\b[a-z]\w{2,}',
        lambda m: m.group().title(),
        name,
    )

    if NULL_RE.match(name):
        logger.debug('Discarding unnamed agent "%s"', name)
        return graph.remove(node)

    maybe_type = GuessAgentTypeLink(default=node.type).execute(node.attrs['name'])
    # If the new type is MORE specific, IE encompasses FEWER types, upgrade. Otherwise ignore
    narrower = len(apps.get_model('share', maybe_type).get_types())
    if narrower < len(node.model.get_types()):
        node._type = maybe_type

    match = re.match(
        r'^(.*(?:Departa?ment|Institute).+?);(?: (.+?); )?([^;]+)$', name, re.I)
    if match:
        pieces = [strip_whitespace(g) for g in match.groups() if g and strip_whitespace(g)]
        *segments, location = pieces
        node.attrs['name'] = ' - '.join(reversed(segments))
        node.attrs['location'] = location
        return

    match = re.match(r'^(.+?), ([^,]+), ([^,]+)$', name, re.I)
    if match:
        pieces = [strip_whitespace(g) for g in match.groups() if g and strip_whitespace(g)]
        name, *location = pieces
        node.attrs['name'] = name
        node.attrs['location'] = ', '.join(location)

    node.attrs['name'] = name

    if node.attrs.get('location'):
        node.attrs['location'] = strip_whitespace(node.attrs['location'])
def normalize(self, node, graph):
    """Strip whitespace from every string attribute of *node*, replacing the
    literal value 'null' with an empty string. Non-strings are left alone.
    """
    for key, value in tuple(node.attrs.items()):
        if not isinstance(value, str):
            continue
        stripped = strip_whitespace(value)
        node.attrs[key] = '' if stripped == 'null' else stripped
def normalize_contributor(cls, node, graph):
    """Ensure the contributor's 'cited_as' attribute is populated (falling
    back to the related agent's name) and whitespace-trimmed.
    """
    cited = node.attrs.get('cited_as')
    if not cited:
        cited = node.related('agent').related.attrs['name']
        node.attrs['cited_as'] = cited
    node.attrs['cited_as'] = strip_whitespace(node.attrs['cited_as'])