def test_fix_name(self):
    self.assertEqual(text.fix_name(' Colin Fraser '), ('colin fraser', []))
    self.assertEqual(text.fix_name('Fraser, Colin'), ('colin fraser', []))
    self.assertEqual(text.fix_name('Sir Colin Fraser'), ('colin fraser', ['sir']))
    self.assertEqual(text.fix_name('Fraser, Sir Colin'), ('colin fraser', ['sir']))
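# For reference, a minimal sketch of the behaviour the assertions above expect
# from text.fix_name: strip and lowercase the input, reorder "Last, First"
# forms, and split off honorifics. This is an illustration reconstructed from
# the test cases, not the actual implementation; the honorifics tuple below is
# a hypothetical placeholder.
def fix_name_sketch(name):
    honorifics = ('sir', 'lady', 'dr', 'prof')  # assumed subset, for illustration only
    if ',' in name:
        last, first = name.split(',', 1)
        name = '%s %s' % (first.strip(), last.strip())
    tokens = name.strip().lower().split()
    found = [t for t in tokens if t in honorifics]
    cleaned = ' '.join(t for t in tokens if t not in honorifics)
    return cleaned, found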
def get_subjects(self, data):
    """ Finds all subjects of the frame assigned to the sentence

        :param dict data: classification results
        :return: all subjects as tuples (chunk, wikidata id)
        :rtype: generator of tuples
    """
    if data['lu'] not in self.frame_data:
        logger.debug('sentence with a LU not contained in the lexical database')
        logger.debug(data)
        subjects = []
    else:
        frame = self.frame_data[data['lu']]
        subjects = [fe for fe in data['fes'] if fe['fe'] in frame['core_fes']]

    if subjects:
        for each in subjects:
            name = each['chunk']
            wid = wikidata.resolver_with_hints(
                'P1559', text.fix_name(name)[0], self.language
            )
            yield name, wid
    else:
        # if this fails, assume the subject is the main subject of the
        # article from which this sentence was extracted
        if data['url'] in self.url_to_wid:
            name = None
            wid = self.url_to_wid[data['url']]
        else:
            name = data.get('name')
            wid = (
                wikidata.resolver_with_hints(
                    'P1559', text.fix_name(name)[0], self.language,
                    type_=5  # Q5 = human
                ) or None
            ) if name else None
        yield name, wid
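# Illustrative usage only: a classification result with the fields that
# get_subjects reads ('lu', 'fes', 'url', 'name'). The concrete values are
# invented, `classifier` stands for an instance of the enclosing class, and
# whether 'Agent' is a core frame element depends on the lexical database.
sample = {
    'lu': 'play.v',
    'fes': [{'fe': 'Agent', 'chunk': 'Sir Colin Fraser'}],
    'url': 'http://example.org/colin-fraser',
    'name': 'Colin Fraser',
}
for chunk, wid in classifier.get_subjects(sample):
    # chunk is the raw text span, wid the resolved Wikidata id (or None)
    print chunk, wid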
def serialize_item(self, item):
    """ Converts an item to quick statements.

        :param item: Scraped item, either str (json) or dict
        :returns: tuples <success, item> where item is an entity which could
         not be resolved if success is false, otherwise it is a
         <subject, property, object, source> tuple
        :rtype: generator
    """
    if isinstance(item, basestring):
        item = json.loads(item)

    name = item.pop('name', '')
    other = item.pop('other', {})
    url = item.pop('url', '')

    if self.sourced_only and not url:
        logger.debug('item %s has no url, skipping it', item)
        return

    if not name:
        logger.debug('item %s has no name, skipping it', item)
        return

    data = {}
    try:
        data = json.loads(other)
    except ValueError:
        pass
    except TypeError:
        if isinstance(other, dict):
            data = other
        else:
            return

    name, honorifics = text.fix_name(name)
    data.update(item)
    data.pop('bio', None)

    # the name will be the last one to be resolved, because it is the hardest
    # one to get right, so we will use all the other statements to help the resolver
    statements = defaultdict(list)
    for key, value in data.iteritems():
        if not isinstance(value, list):
            value = [value]

        strings = []
        for val in value:
            if isinstance(val, basestring):
                strings.append(val)
            elif isinstance(val, dict):
                strings.extend(val.keys())
                strings.extend(val.values())

        for val in strings:
            if not val:
                continue
            elif not isinstance(val, basestring):
                logger.debug('skipping value "%s" because it is not a string', val)
                continue

            property = wikidata.PROPERTY_TO_WIKIDATA.get(key)
            if not property:
                logger.debug('cannot resolve property %s, skipping', key)
                continue

            info = dict(data, **statements)  # provide all available info to the resolver
            resolved = wikidata.resolve(property, val, self.language, **info)
            if not resolved:
                logger.debug('cannot resolve value %s of property %s, skipping', val, property)
                yield False, {'chunk': val, 'additional': {'property': property, 'url': url}}
                continue

            statements[property].append(resolved)

    info = dict(data, **statements)  # provide all available info to the resolver
    info['type_'] = 5  # Q5 = human
    wid = wikidata.resolver_with_hints('P1559', name, self.language, **info)
    if not wid:
        logger.debug('cannot find wikidata id of "%s" with properties %s, skipping',
                     name, repr(info))
        yield False, {'chunk': name, 'additional': {'property': 'P1559', 'url': url}}
        return

    # now that we are sure about the subject we can produce the actual statements
    yield True, (wid, 'P1559', '%s:"%s"' % (self.language, name.title()), url)

    for property, values in statements.iteritems():
        for val in values:
            yield True, (wid, property, val, url)

    for each in honorifics:
        hon = wikidata.resolve('P1035', each, self.language)
        if hon:
            yield True, (wid, 'P1035', hon, url)
        else:
            yield False, {'chunk': each, 'additional': {'property': 'P1035', 'url': url}}
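# Illustrative usage only: the shape of the input serialize_item accepts and
# of the tuples it yields. `serializer` stands for an instance of the
# enclosing class, the item values are invented, the language is assumed to
# be 'en', and the real output depends on what Wikidata resolution returns.
item = {
    'name': 'Colin Fraser',
    'url': 'http://example.org/colin-fraser',
    'other': '{}',
}
for success, statement in serializer.serialize_item(item):
    if success:
        # a quick statement tuple, e.g.
        # (wid, 'P1559', 'en:"Colin Fraser"', 'http://example.org/colin-fraser')
        print statement
    else:
        # an unresolved entity, reported with the property and source url,
        # e.g. {'chunk': ..., 'additional': {'property': ..., 'url': ...}}
        print statement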