Exemple #1
0
    def find_qualifiers(self, fes):
        """ Finds all FEs that could serve as qualifiers instead of full statements

            Values are grouped by the Wikidata property they qualify:
            'Time' FEs go under P585 (point in time), the 'start' and 'end' of
            'Duration' FEs go under P580 (start time) and P582 (end time)
            respectively, and 'Place' FEs go under P276 (location).

            :param fes: Iterable of frame element dicts. Each one is read through
             its 'fe' key and, depending on the FE type, its 'literal', 'link'
             and 'chunk' keys
            :returns: Mapping from qualifier property ID to the list of values
             found for it; properties without any value are absent
            :rtype: collections.defaultdict
        """

        qualifiers = defaultdict(list)
        for fe in fes:
            fe_type = fe['fe']
            if fe_type == 'Time':
                value = wikidata.format_date(**fe['literal'])
                qualifiers['P585'].append(value)
            elif fe_type == 'Duration':
                literal = fe['literal']
                if 'start' in literal:
                    value = wikidata.format_date(**literal['start'])
                    qualifiers['P580'].append(value)

                if 'end' in literal:
                    # The end of an interval belongs to P582 (end time); filing
                    # it under P580 would report it as a second start time.
                    value = wikidata.format_date(**literal['end'])
                    qualifiers['P582'].append(value)
            elif fe_type == 'Place':
                # Prefer the linked entity, fall back to resolving the raw chunk
                value = None
                if 'link' in fe:
                    value = wikidata.wikidata_id_from_wikipedia_url(fe['link']['uri'])

                if not value:
                    value = wikidata.resolve('P276', fe['chunk'], self.language)

                # Unresolvable places are dropped rather than stored as empty values
                if value:
                    qualifiers['P276'].append(value)

        return qualifiers
Exemple #2
0
    def find_qualifiers(self, fes):
        """ Finds all FEs that could serve as qualifiers instead of full statements

            Values are grouped by the Wikidata property they qualify:
            "Time" FEs go under P585 (point in time), the "start" and "end" of
            "Duration" FEs go under P580 (start time) and P582 (end time)
            respectively, and "Place" FEs go under P276 (location).

            :param fes: Iterable of frame element dicts. Each one is read through
             its "fe" key and, depending on the FE type, its "literal", "link"
             and "chunk" keys
            :returns: Mapping from qualifier property ID to the list of values
             found for it; properties without any value are absent
            :rtype: collections.defaultdict
        """

        qualifiers = defaultdict(list)
        for fe in fes:
            fe_type = fe["fe"]
            if fe_type == "Time":
                value = wikidata.format_date(**fe["literal"])
                qualifiers["P585"].append(value)
            elif fe_type == "Duration":
                literal = fe["literal"]
                if "start" in literal:
                    value = wikidata.format_date(**literal["start"])
                    qualifiers["P580"].append(value)

                if "end" in literal:
                    # The end of an interval belongs to P582 (end time); filing
                    # it under P580 would report it as a second start time.
                    value = wikidata.format_date(**literal["end"])
                    qualifiers["P582"].append(value)
            elif fe_type == "Place":
                # Prefer the linked entity, fall back to resolving the raw chunk
                value = None
                if "link" in fe:
                    value = wikidata.wikidata_id_from_wikipedia_url(fe["link"]["uri"])

                if not value:
                    value = wikidata.resolve("P276", fe["chunk"], self.language)

                # Unresolvable places are dropped rather than stored as empty values
                if value:
                    qualifiers["P276"].append(value)

        return qualifiers
Exemple #3
0
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            For every subject returned by `get_subjects`, each frame element
            (FE) other than the subject itself is turned into a statement:
            'Time' and 'Duration' FEs are delegated to `serialize_numerical`,
            all other FEs are mapped to a property through `lu_fe_map` and
            their chunk is resolved to a Wikidata item.

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a str (JSON) or a dict
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :rtype: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get('url')
        if not url:
            logger.warning('skipping item without url')
            return

        ontology_prefix_len = len('http://dbpedia.org/ontology/')

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {'chunk': name, 'additional': {'sentence': data['text'], 'url': url}}
                continue

            all_qualifiers = self.find_qualifiers(data['fes'])
            for fe in data['fes']:
                if fe['chunk'] == name:  # do not add a statement for the current subject
                    continue

                if fe['fe'] in ['Time', 'Duration']:
                    for each in self.serialize_numerical(subj, fe, data):
                        yield True, each
                    continue

                # Single lookup of the mapping entry for this (LU, FE) pair
                fe_config = self.lu_fe_map.get((data['lu'], fe['fe']), {})
                prop = fe_config.get('wid')
                if not prop:
                    logger.debug('unknown fe type %s for LU %s, skipping', fe['fe'], data['lu'])
                    continue

                # An FE may have no link, or a link without 'types'; treat both
                # as "no type information" instead of iterating over None.
                linked_types = fe.get('link', {}).get('types') or []
                chunk_types = set(t[ontology_prefix_len:] for t in linked_types)
                fe_types = fe_config.get('types', set())
                # Only reject when both sides carry types and none is shared
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug('skipping chunk "%s" of fe %s because types do not match, '
                                 'expected: %s actual %s', fe['chunk'], fe['fe'], fe_types, chunk_types)
                    continue

                # Prefer the linked entity, then fall back to resolving the chunk
                val = None
                if 'link' in fe:
                    uri = fe['link']['uri']
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)

                if not val:
                    val = wikidata.resolve(prop, fe['chunk'], self.language)

                if not val:
                    # Hard-coded fallback item used when nothing could be resolved
                    val = 'Q19798648'
                    logger.debug('could not resolve chunk "%s" of fe %s (property is %s), '
                                 'using default value of %s',
                                 fe['chunk'], fe['fe'], prop, val)

                # NOTE(review): extend() flattens each (property, value) pair into
                # the list; presumably finalize_statement expects a flat sequence -
                # confirm against its implementation.
                stmt_qualifiers = []
                for qualifier_property in fe_config.get('qualifiers', []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj, prop, val, self.language, url, qualifiers=stmt_qualifiers,
                    resolve_property=False, resolve_value=False
                )
Exemple #4
0
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            For every subject returned by `get_subjects`, each frame element
            (FE) other than the subject itself is turned into a statement:
            "Time" and "Duration" FEs are delegated to `serialize_numerical`,
            all other FEs are mapped to a property through `lu_fe_map` and
            their chunk is resolved to a Wikidata item.

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a str (JSON) or a dict
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :rtype: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get("url")
        if not url:
            logger.warning("skipping item without url")
            return

        ontology_prefix_len = len("http://dbpedia.org/ontology/")

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {"chunk": name, "additional": {"sentence": data["text"], "url": url}}
                continue

            all_qualifiers = self.find_qualifiers(data["fes"])
            for fe in data["fes"]:
                if fe["chunk"] == name:  # do not add a statement for the current subject
                    continue

                if fe["fe"] in ["Time", "Duration"]:
                    for each in self.serialize_numerical(subj, fe, data):
                        yield True, each
                    continue

                # Single lookup of the mapping entry for this (LU, FE) pair
                fe_config = self.lu_fe_map.get((data["lu"], fe["fe"]), {})
                prop = fe_config.get("wid")
                if not prop:
                    logger.debug("unknown fe type %s for LU %s, skipping", fe["fe"], data["lu"])
                    continue

                # An FE may have no link, or a link without "types"; treat both
                # as "no type information" instead of iterating over None.
                linked_types = fe.get("link", {}).get("types") or []
                chunk_types = set(t[ontology_prefix_len:] for t in linked_types)
                fe_types = fe_config.get("types", set())
                # Only reject when both sides carry types and none is shared
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug(
                        'skipping chunk "%s" of fe %s because types do not match, ' "expected: %s actual %s",
                        fe["chunk"],
                        fe["fe"],
                        fe_types,
                        chunk_types,
                    )
                    continue

                # Prefer the linked entity, then fall back to resolving the chunk
                val = None
                if "link" in fe:
                    uri = fe["link"]["uri"]
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)

                if not val:
                    val = wikidata.resolve(prop, fe["chunk"], self.language)

                if not val:
                    # Hard-coded fallback item used when nothing could be resolved
                    val = "Q19798648"
                    logger.debug(
                        'could not resolve chunk "%s" of fe %s (property is %s), ' "using default value of %s",
                        fe["chunk"],
                        fe["fe"],
                        prop,
                        val,
                    )

                # NOTE(review): extend() flattens each (property, value) pair into
                # the list; presumably finalize_statement expects a flat sequence -
                # confirm against its implementation.
                stmt_qualifiers = []
                for qualifier_property in fe_config.get("qualifiers", []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj,
                    prop,
                    val,
                    self.language,
                    url,
                    qualifiers=stmt_qualifiers,
                    resolve_property=False,
                    resolve_value=False,
                )