def process_row(data):
    """ Turns one CSV row into wikidata statements.

        Resolves every mapped column into a wikidata value, uses the
        resolved values as hints to find the subject's item ID, then
        finalizes one statement per resolved property.

        :param dict data: a row mapping column names to raw cell values
        :returns: the finalized statements, or None when the subject
         cannot be resolved
    """
    subject_label = data['emergenza']

    # collect resolved values keyed by their wikidata property
    resolved = defaultdict(list)
    for column, cell in data.iteritems():
        prop = COLUMN_TO_PROPERTY.get(column)
        if not prop:
            continue
        value = wikidata.resolve(prop, cell.decode('utf8'), 'it')
        if value:
            resolved[prop].append(value)

    # every resolved property/value pair doubles as a resolver hint
    hints = dict(resolved)
    subject = wikidata.resolver_with_hints('ddd', subject_label, 'it', **hints)
    if not subject:
        logger.warn('could not find the wikidata id of "%s"' % data['emergenza'])
        return None

    statements = []
    for prop, values in resolved.iteritems():
        stmt = wikidata.finalize_statement(subject, prop, values, 'it',
                                           resolve_property=False,
                                           resolve_value=False)
        if stmt is not None:
            statements.append(stmt)
    return statements
def find_qualifiers(self, fes):
    """ Finds all FEs that could serve as qualifiers instead of full statements

        :param list fes: frame elements extracted from a sentence; each is a
         dict with at least a 'fe' key and, depending on the type, 'literal',
         'link' or 'chunk'
        :returns: mapping from qualifier property ID to the list of values found
        :rtype: defaultdict
    """
    qualifiers = defaultdict(list)
    for fe in fes:
        if fe['fe'] == 'Time':
            # P585 = point in time
            value = wikidata.format_date(**fe['literal'])
            qualifiers['P585'].append(value)
        elif fe['fe'] == 'Duration':
            literal = fe['literal']
            if 'start' in literal:
                # P580 = start time
                value = wikidata.format_date(**literal['start'])
                qualifiers['P580'].append(value)
            if 'end' in literal:
                # bug fix: the end date was appended under P580 (start time);
                # P582 is wikidata's "end time" property
                value = wikidata.format_date(**literal['end'])
                qualifiers['P582'].append(value)
        elif fe['fe'] == 'Place':
            # prefer the linked entity's wikidata ID, fall back to
            # resolving the raw chunk text
            value = None
            if 'link' in fe:
                value = wikidata.wikidata_id_from_wikipedia_url(fe['link']['uri'])
            if not value:
                value = wikidata.resolve('P276', fe['chunk'], self.language)
            if value:
                # P276 = location
                qualifiers['P276'].append(value)
    return qualifiers
def find_qualifiers(self, fes):
    """ Finds all FEs that could serve as qualifiers instead of full statements """
    found = defaultdict(list)
    for element in fes:
        kind = element["fe"]
        if kind == "Time":
            found["P585"].append(wikidata.format_date(**element["literal"]))
        elif kind == "Duration":
            interval = element["literal"]
            # NOTE(review): both endpoints are filed under P580 (start time);
            # P582 (end time) may be intended for "end" -- confirm upstream
            for endpoint in ("start", "end"):
                if endpoint in interval:
                    found["P580"].append(wikidata.format_date(**interval[endpoint]))
        elif kind == "Place":
            # a wikipedia link beats resolving the raw chunk text
            qid = None
            if "link" in element:
                qid = wikidata.wikidata_id_from_wikipedia_url(element["link"]["uri"])
            qid = qid or wikidata.resolve("P276", element["chunk"], self.language)
            if qid:
                found["P276"].append(qid)
    return found
def to_statements(self, data, input_encoded=True):
    """ Converts the classification results into quick statements

        :param data: Data from the classifier. Can be either str or dict
        :param bool input_encoded: Whether data is a str or a dict
        :returns: Tuples <success, item> where item is a statement if success
         is true else it is a named entity which could not be resolved
        :type: generator
    """
    if input_encoded:
        data = json.loads(data)

    url = data.get('url')
    if not url:
        logger.warn('skipping item without url')
        return

    for subject_name, subject_id in self.get_subjects(data):
        if not subject_id:
            logger.warn("Could not resolve Wikidata Item ID of subject '%s'",
                        subject_name)
            yield False, {
                'chunk': subject_name,
                'additional': {'sentence': data['text'], 'url': url},
            }
            continue

        for frame_element in data['fes']:
            # do not add a statement for the current subject
            if frame_element['chunk'] == subject_name:
                continue

            # temporal FEs get their own serialization path
            if frame_element['fe'] in ['Time', 'Duration']:
                for statement in self.serialize_numerical(subject_id,
                                                          frame_element, url):
                    yield True, statement
                continue

            prop = self.fe_to_wid.get(frame_element['fe'])
            if not prop:
                logger.debug('unknown fe type %s, skipping', frame_element['fe'])
                continue

            value = wikidata.resolve(prop, frame_element['chunk'], self.language)
            if value:
                yield True, wikidata.finalize_statement(
                    subject_id, prop, value, self.language, url,
                    resolve_property=False, resolve_value=False)
            else:
                logger.debug('could not resolve chunk "%s" of fe %s (property is %s)',
                             frame_element['chunk'], frame_element['fe'], prop)
                yield False, {
                    'chunk': frame_element['chunk'],
                    'additional': {'fe': frame_element,
                                   'sentence': data['text'],
                                   'url': url},
                }
def test_resolvers(self):
    """ Checks wikidata.resolve on an item, a gendered item and a date. """
    cases = [
        (('P1035', 'prof', 'en'), 'Q121594'),
        (('P21', 'male', 'en'), 'Q6581097'),
        (('P570', 'Feb 24, 2016', 'en'), '+00000002016-02-24T00:00:00Z/11'),
    ]
    for args, expected in cases:
        self.assertEqual(wikidata.resolve(*args), expected)
def serialize_item(self, item):
    """ Converts an item to quick statements.

        :param item: Scraped item, either str (json) or dict
        :returns: tuples <success, item> where item is an entity which
         could not be resolved if success is false, otherwise it is a
         <subject, property, object, source> tuple
        :rtype: generator
    """
    if isinstance(item, basestring):
        item = json.loads(item)

    name = item.pop('name', '')
    other = item.pop('other', {})
    url = item.pop('url', '')

    if self.sourced_only and not url:
        # bug fix: the format string had no argument, so the literal
        # '%s' was logged; pass the item's name
        logger.debug('item %s has no url, skipping it', name)
        return

    if not name:
        # bug fix: same missing-argument issue; log the whole item since
        # the name is empty here
        logger.debug('item %s has no name, skipping it', item)
        return

    # 'other' may be a JSON-encoded string, an already-parsed dict,
    # or something unusable (in which case we bail out)
    data = {}
    try:
        data = json.loads(other)
    except ValueError:
        pass
    except TypeError:
        if isinstance(other, dict):
            data = other
        else:
            return

    name, honorifics = text.fix_name(name)
    data.update(item)
    data.pop('bio', None)

    # the name will be the last one to be resolved because it is the hardest
    # one to get right, so we will use all the other statements to help
    statements = defaultdict(list)
    for key, value in data.iteritems():
        # hoisted out of the value loop: the property only depends on the key
        # (also avoids shadowing the builtin `property`)
        prop = wikidata.PROPERTY_TO_WIKIDATA.get(key)
        if not prop:
            logger.debug('cannot resolve property %s, skipping', key)
            continue

        if not isinstance(value, list):
            value = [value]

        # flatten the values: keep plain strings, and for dicts take
        # both keys and values
        strings = []
        for val in value:
            if isinstance(val, basestring):
                strings.append(val)
            elif isinstance(val, dict):
                strings.extend(val.keys())
                strings.extend(val.values())

        for val in strings:
            if not val:
                continue
            elif not isinstance(val, basestring):
                logger.debug('skipping value "%s" because it is not a string', val)
                continue

            # provide all available info to the resolver; recomputed each
            # time because `statements` grows as values get resolved
            info = dict(data, **statements)
            resolved = wikidata.resolve(prop, val, self.language, **info)
            if not resolved:
                logger.debug('cannot resolve value %s of property %s, skipping',
                             val, prop)
                yield False, {'chunk': val,
                              'additional': {'property': prop, 'url': url}}
                continue

            statements[prop].append(resolved)

    info = dict(data, **statements)  # provide all available info to the resolver
    info['type_'] = 5  # Q5 = human
    wid = wikidata.resolver_with_hints('P1559', name, self.language, **info)
    if not wid:
        logger.debug('cannot find wikidata id of "%s" with properties %s, skipping',
                     name, repr(info))
        yield False, {'chunk': name,
                      'additional': {'property': 'P1559', 'url': url}}
        return

    # now that we are sure about the subject we can produce the actual statements
    yield True, (wid, 'P1559', '%s:"%s"' % (self.language, name.title()), url)
    for prop, values in statements.iteritems():
        for val in values:
            yield True, (wid, prop, val, url)

    for each in honorifics:
        hon = wikidata.resolve('P1035', each, self.language)
        if hon:
            yield True, (wid, 'P1035', hon, url)
        else:
            yield False, {'chunk': each,
                          'additional': {'property': 'P1035', 'url': url}}
def to_statements(self, data, input_encoded=True):
    """ Converts the classification results into quick statements

        :param data: Data from the classifier. Can be either str or dict
        :param bool input_encoded: Whether data is a str or a dict
        :returns: Tuples <success, item> where item is a statement if success
         is true else it is a named entity which could not be resolved
        :type: generator
    """
    data = json.loads(data) if input_encoded else data

    url = data.get('url')
    if not url:
        logger.warn('skipping item without url')
        return

    for entity_name, wid in self.get_subjects(data):
        if wid:
            for produced in self._statements_for_subject(data, entity_name,
                                                         wid, url):
                yield produced
        else:
            logger.warn("Could not resolve Wikidata Item ID of subject '%s'",
                        entity_name)
            yield False, {'chunk': entity_name,
                          'additional': {'sentence': data['text'], 'url': url}}

def _statements_for_subject(self, data, entity_name, wid, url):
    """ Yields the <success, item> tuples for one resolved subject. """
    for frame in data['fes']:
        if frame['chunk'] == entity_name:
            # do not add a statement for the current subject
            continue

        kind = frame['fe']
        if kind in ['Time', 'Duration']:
            # temporal FEs have a dedicated serializer
            for each in self.serialize_numerical(wid, frame, url):
                yield True, each
            continue

        prop = self.fe_to_wid.get(kind)
        if not prop:
            logger.debug('unknown fe type %s, skipping', kind)
            continue

        val = wikidata.resolve(prop, frame['chunk'], self.language)
        if not val:
            logger.debug('could not resolve chunk "%s" of fe %s (property is %s)',
                         frame['chunk'], kind, prop)
            yield False, {'chunk': frame['chunk'],
                          'additional': {'fe': frame,
                                         'sentence': data['text'],
                                         'url': url}}
            continue

        yield True, wikidata.finalize_statement(
            wid, prop, val, self.language, url,
            resolve_property=False, resolve_value=False)
def to_statements(self, data, input_encoded=True):
    """ Converts the classification results into quick statements

        :param data: Data from the classifier. Can be either str or dict
        :param bool input_encoded: Whether data is a str or a dict
        :returns: Tuples <success, item> where item is a statement if success
         is true else it is a named entity which could not be resolved
        :type: generator
    """
    data = json.loads(data) if input_encoded else data

    url = data.get('url')
    if not url:
        logger.warn('skipping item without url')
        return

    for name, subj in self.get_subjects(data):
        if not subj:
            logger.warn("Could not resolve Wikidata Item ID of subject '%s'", name)
            yield False, {'chunk': name,
                          'additional': {'sentence': data['text'], 'url': url}}
            continue

        all_qualifiers = self.find_qualifiers(data['fes'])
        for fe in data['fes']:
            if fe['chunk'] == name:
                # do not add a statement for the current subject
                continue

            if fe['fe'] in ['Time', 'Duration']:
                for each in self.serialize_numerical(subj, fe, data):
                    yield True, each
                continue

            # hoisted: this mapping was looked up three separate times
            mapping = self.lu_fe_map.get((data['lu'], fe['fe']), {})
            prop = mapping.get('wid')
            if not prop:
                logger.debug('unknown fe type %s for LU %s, skipping',
                             fe['fe'], data['lu'])
                continue

            # bug fix: fe.get('link', {}).get('types') returns None when the
            # FE has no link, so set(...) raised TypeError; default to ()
            chunk_types = set(t[len('http://dbpedia.org/ontology/'):]
                              for t in fe.get('link', {}).get('types', ()))
            fe_types = mapping.get('types', set())
            if fe_types and chunk_types and not fe_types & chunk_types:
                logger.debug('skipping chunk "%s" of fe %s because types do not match, '
                             'expected: %s actual %s',
                             fe['chunk'], fe['fe'], fe_types, chunk_types)
                continue

            # prefer the linked entity's wikidata ID, then label resolution,
            # then a fixed fallback item
            val = None
            if 'link' in fe:
                val = wikidata.wikidata_id_from_wikipedia_url(fe['link']['uri'])
            if not val:
                val = wikidata.resolve(prop, fe['chunk'], self.language)
            if not val:
                val = 'Q19798648'  # fallback item used when resolution fails
                logger.debug('could not resolve chunk "%s" of fe %s (property is %s), '
                             'using default value of %s',
                             fe['chunk'], fe['fe'], prop, val)

            stmt_qualifiers = []
            for qualifier_property in mapping.get('qualifiers', []):
                for qualifier_value in all_qualifiers.get(qualifier_property, []):
                    # NOTE(review): extend flattens the pair into the list —
                    # confirm finalize_statement expects a flat sequence rather
                    # than (property, value) tuples
                    stmt_qualifiers.extend((qualifier_property, qualifier_value))

            yield True, wikidata.finalize_statement(
                subj, prop, val, self.language, url,
                qualifiers=stmt_qualifiers,
                resolve_property=False, resolve_value=False)
def to_statements(self, data, input_encoded=True):
    """ Converts the classification results into quick statements

        :param data: Data from the classifier. Can be either str or dict
        :param bool input_encoded: Whether data is a str or a dict
        :returns: Tuples <success, item> where item is a statement if success
         is true else it is a named entity which could not be resolved
        :type: generator
    """
    data = json.loads(data) if input_encoded else data

    url = data.get("url")
    if not url:
        logger.warn("skipping item without url")
        return

    for name, subj in self.get_subjects(data):
        if not subj:
            logger.warn("Could not resolve Wikidata Item ID of subject '%s'", name)
            yield False, {"chunk": name,
                          "additional": {"sentence": data["text"], "url": url}}
            continue

        all_qualifiers = self.find_qualifiers(data["fes"])
        for fe in data["fes"]:
            if fe["chunk"] == name:
                # do not add a statement for the current subject
                continue

            if fe["fe"] in ["Time", "Duration"]:
                for each in self.serialize_numerical(subj, fe, data):
                    yield True, each
                continue

            # hoisted: this mapping was looked up three separate times
            mapping = self.lu_fe_map.get((data["lu"], fe["fe"]), {})
            prop = mapping.get("wid")
            if not prop:
                logger.debug("unknown fe type %s for LU %s, skipping",
                             fe["fe"], data["lu"])
                continue

            # bug fix: fe.get("link", {}).get("types") returns None when the
            # FE has no link, so set(...) raised TypeError; default to ()
            chunk_types = set(t[len("http://dbpedia.org/ontology/"):]
                              for t in fe.get("link", {}).get("types", ()))
            fe_types = mapping.get("types", set())
            if fe_types and chunk_types and not fe_types & chunk_types:
                logger.debug(
                    'skipping chunk "%s" of fe %s because types do not match, '
                    "expected: %s actual %s",
                    fe["chunk"], fe["fe"], fe_types, chunk_types,
                )
                continue

            # prefer the linked entity's wikidata ID, then label resolution,
            # then a fixed fallback item
            val = None
            if "link" in fe:
                val = wikidata.wikidata_id_from_wikipedia_url(fe["link"]["uri"])
            if not val:
                val = wikidata.resolve(prop, fe["chunk"], self.language)
            if not val:
                val = "Q19798648"  # fallback item used when resolution fails
                logger.debug(
                    'could not resolve chunk "%s" of fe %s (property is %s), '
                    "using default value of %s",
                    fe["chunk"], fe["fe"], prop, val,
                )

            stmt_qualifiers = []
            for qualifier_property in mapping.get("qualifiers", []):
                for qualifier_value in all_qualifiers.get(qualifier_property, []):
                    # NOTE(review): extend flattens the pair into the list —
                    # confirm finalize_statement expects a flat sequence rather
                    # than (property, value) tuples
                    stmt_qualifiers.extend((qualifier_property, qualifier_value))

            yield True, wikidata.finalize_statement(
                subj, prop, val, self.language, url,
                qualifiers=stmt_qualifiers,
                resolve_property=False,
                resolve_value=False,
            )