コード例 #1
0
ファイル: wlm.py プロジェクト: rpatil524/StrepHit
def process_row(data):
    """Turn one row of the dataset into Wikidata statements.

    Every column that has an entry in ``COLUMN_TO_PROPERTY`` is resolved with
    ``wikidata.resolve``. The resolved values are then passed as hints to
    ``wikidata.resolver_with_hints`` in order to find the Wikidata item of the
    row's subject (the ``'emergenza'`` column).

    :param dict data: Row of the dataset, indexed by column name. The values
     of the mapped columns are decoded from utf8 before being resolved
    :returns: The non-None results of ``wikidata.finalize_statement`` for
     every resolved property, or None if the subject could not be resolved
    :rtype: list or None
    """
    name = data['emergenza']

    resolved = defaultdict(list)
    for column, raw in data.iteritems():
        prop = COLUMN_TO_PROPERTY.get(column)
        if not prop:
            continue

        value = wikidata.resolve(prop, raw.decode('utf8'), 'it')
        if value:
            resolved[prop].append(value)

    # everything resolved so far is given to the resolver as a hint
    subject = wikidata.resolver_with_hints('ddd', name, 'it', **resolved)
    if not subject:
        logger.warning('could not find the wikidata id of "%s"', name)
        return None

    statements = []
    for prop, values in resolved.iteritems():
        # NOTE(review): `values` is the whole list of values resolved for the
        # property, not a single value - confirm finalize_statement expects it
        stmt = wikidata.finalize_statement(subject, prop, values,
                                           'it', resolve_property=False,
                                           resolve_value=False)
        if stmt is not None:
            statements.append(stmt)
    return statements
コード例 #2
0
ファイル: serialize.py プロジェクト: rpatil524/StrepHit
    def find_qualifiers(self, fes):
        """ Finds all FEs that could serve as qualifiers instead of full statements

            :param fes: Frame elements to inspect, each one a dict with at
             least the key 'fe' holding the type of the frame element
            :returns: The values found, grouped by the Wikidata property of
             the qualifier they can be used for
            :rtype: defaultdict
        """

        qualifiers = defaultdict(list)
        for fe in fes:
            fe_type = fe['fe']
            if fe_type == 'Time':
                # P585 = point in time
                qualifiers['P585'].append(wikidata.format_date(**fe['literal']))
            elif fe_type == 'Duration':
                literal = fe['literal']
                if 'start' in literal:
                    # P580 = start time
                    qualifiers['P580'].append(wikidata.format_date(**literal['start']))

                if 'end' in literal:
                    # P582 = end time; must not end up among the start times
                    qualifiers['P582'].append(wikidata.format_date(**literal['end']))
            elif fe_type == 'Place':
                # P276 = location; prefer the linked entity, if any, and fall
                # back to resolving the text of the chunk
                value = None
                if 'link' in fe:
                    value = wikidata.wikidata_id_from_wikipedia_url(fe['link']['uri'])

                if not value:
                    value = wikidata.resolve('P276', fe['chunk'], self.language)

                if value:
                    qualifiers['P276'].append(value)

        return qualifiers
コード例 #3
0
ファイル: serialize.py プロジェクト: Wikidata/StrepHit
    def find_qualifiers(self, fes):
        """ Finds all FEs that could serve as qualifiers instead of full statements

            :param fes: Frame elements to inspect, each one a dict with at
             least the key "fe" holding the type of the frame element
            :returns: The values found, grouped by the Wikidata property of
             the qualifier they can be used for
            :rtype: defaultdict
        """

        qualifiers = defaultdict(list)
        for fe in fes:
            fe_type = fe["fe"]
            if fe_type == "Time":
                # P585 = point in time
                qualifiers["P585"].append(wikidata.format_date(**fe["literal"]))
            elif fe_type == "Duration":
                literal = fe["literal"]
                if "start" in literal:
                    # P580 = start time
                    qualifiers["P580"].append(wikidata.format_date(**literal["start"]))

                if "end" in literal:
                    # P582 = end time; must not end up among the start times
                    qualifiers["P582"].append(wikidata.format_date(**literal["end"]))
            elif fe_type == "Place":
                # P276 = location; prefer the linked entity, if any, and fall
                # back to resolving the text of the chunk
                value = None
                if "link" in fe:
                    value = wikidata.wikidata_id_from_wikipedia_url(fe["link"]["uri"])

                if not value:
                    value = wikidata.resolve("P276", fe["chunk"], self.language)

                if value:
                    qualifiers["P276"].append(value)

        return qualifiers
コード例 #4
0
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a json-encoded str
             that has to be decoded (True) or already a dict (False)
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :rtype: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get('url')
        if not url:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning('skipping item without url')
            return

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {'chunk': name, 'additional': {'sentence': data['text'], 'url': url}}
                continue

            for fe in data['fes']:
                if fe['chunk'] == name:  # do not add a statement for the current subject
                    continue

                if fe['fe'] in ('Time', 'Duration'):
                    # these FEs are handled by serialize_numerical instead of
                    # being resolved to an item
                    for each in self.serialize_numerical(subj, fe, url):
                        yield True, each
                    continue

                prop = self.fe_to_wid.get(fe['fe'])
                if not prop:
                    logger.debug('unknown fe type %s, skipping', fe['fe'])
                    continue

                val = wikidata.resolve(prop, fe['chunk'], self.language)
                if val:
                    yield True, wikidata.finalize_statement(
                        subj, prop, val, self.language, url,
                        resolve_property=False, resolve_value=False
                    )
                else:
                    logger.debug('could not resolve chunk "%s" of fe %s (property is %s)',
                                 fe['chunk'], fe['fe'], prop)
                    # report the unresolved chunk together with its context
                    yield False, {
                        'chunk': fe['chunk'],
                        'additional': {'fe': fe, 'sentence': data['text'], 'url': url}
                    }
コード例 #5
0
ファイル: test_commons.py プロジェクト: rpatil524/StrepHit
 def test_resolvers(self):
     """ Checks the value resolved for a few known property/text pairs """
     # (property, text to resolve, expected result), all resolved in english
     expectations = [
         ('P1035', 'prof', 'Q121594'),
         ('P21', 'male', 'Q6581097'),
         ('P570', 'Feb 24, 2016', '+00000002016-02-24T00:00:00Z/11'),
     ]
     for prop, raw, expected in expectations:
         self.assertEqual(wikidata.resolve(prop, raw, 'en'), expected)
コード例 #6
0
    def serialize_item(self, item):
        """ Converts an item to quick statements.

            The properties of the item are resolved first, then they are used
            as hints to resolve the Wikidata ID of the item itself.

            :param item: Scraped item, either str (json) or dict. It is not
             modified
            :returns: tuples <success, item> where item is an entity which
             could not be resolved if success is false, otherwise it is a
             <subject, property, object, source> tuple
            :rtype: generator
        """

        if isinstance(item, basestring):
            item = json.loads(item)
        else:
            # shallow copy: the fields popped below must not disappear from
            # the item of the caller
            item = dict(item)

        name = item.pop('name', '')
        other = item.pop('other', {})
        url = item.pop('url', '')

        if self.sourced_only and not url:
            logger.debug('item %s has no url, skipping it', name)
            return

        if not name:
            logger.debug('item %s has no name, skipping it', item)
            return

        if isinstance(other, dict):
            # copied because it is updated with the other fields of the item
            data = dict(other)
        else:
            try:
                data = json.loads(other)
            except ValueError:
                # best effort: additional data which is not valid json is ignored
                logger.debug('cannot parse the "other" field of item "%s", ignoring it', name)
                data = {}
            except TypeError:
                # neither a dict nor a json string
                return

            if not isinstance(data, dict):
                # valid json, but not an object: nothing usable in there
                logger.debug('the "other" field of item "%s" is not an object, ignoring it', name)
                data = {}

        name, honorifics = text.fix_name(name)
        data.update(item)
        data.pop('bio', None)

        # the name will be the last one to be resolved because it is the hardest
        # one to get right, so we will use all the other statements to help
        statements = defaultdict(list)

        for key, value in data.iteritems():
            # the property depends only on the key, look it up once
            prop = wikidata.PROPERTY_TO_WIKIDATA.get(key)
            if not prop:
                logger.debug('cannot resolve property %s, skipping', key)
                continue

            values = value if isinstance(value, list) else [value]

            # strings are taken as they are, both keys and values of dicts
            # are candidates, anything else is discarded
            strings = []
            for val in values:
                if isinstance(val, basestring):
                    strings.append(val)
                elif isinstance(val, dict):
                    strings.extend(val.keys())
                    strings.extend(val.values())

            for val in strings:
                if not val:
                    continue
                elif not isinstance(val, basestring):
                    logger.debug('skipping value "%s" because it is not a string', val)
                    continue

                info = dict(data, **statements)  # provide all available info to the resolver
                resolved = wikidata.resolve(prop, val, self.language, **info)
                if not resolved:
                    logger.debug('cannot resolve value %s of property %s, skipping', val, prop)
                    yield False, {'chunk': val, 'additional': {'property': prop, 'url': url}}
                    continue

                statements[prop].append(resolved)

        info = dict(data, **statements)  # provide all available info to the resolver
        info['type_'] = 5  # Q5 = human
        wid = wikidata.resolver_with_hints('P1559', name, self.language, **info)

        if not wid:
            logger.debug('cannot find wikidata id of "%s" with properties %s, skipping',
                         name, repr(info))
            yield False, {'chunk': name, 'additional': {'property': 'P1559', 'url': url}}
            return

        # now that we are sure about the subject we can produce the actual statements
        yield True, (wid, 'P1559', '%s:"%s"' % (self.language, name.title()), url)
        for prop, values in statements.iteritems():
            for val in values:
                yield True, (wid, prop, val, url)

        for each in honorifics:
            hon = wikidata.resolve('P1035', each, self.language)
            if hon:
                yield True, (wid, 'P1035', hon, url)
            else:
                yield False, {'chunk': each, 'additional': {'property': 'P1035', 'url': url}}
コード例 #7
0
ファイル: serialize.py プロジェクト: nooralahzadeh/StrepHit
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a json-encoded str
             that has to be decoded (True) or already a dict (False)
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :rtype: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get('url')
        if not url:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning('skipping item without url')
            return

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warning(
                    "Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {
                    'chunk': name,
                    'additional': {
                        'sentence': data['text'],
                        'url': url
                    }
                }
                continue

            for fe in data['fes']:
                if fe['chunk'] == name:  # do not add a statement for the current subject
                    continue

                if fe['fe'] in ('Time', 'Duration'):
                    # these FEs are handled by serialize_numerical instead of
                    # being resolved to an item
                    for each in self.serialize_numerical(subj, fe, url):
                        yield True, each
                    continue

                prop = self.fe_to_wid.get(fe['fe'])
                if not prop:
                    logger.debug('unknown fe type %s, skipping', fe['fe'])
                    continue

                val = wikidata.resolve(prop, fe['chunk'], self.language)
                if val:
                    yield True, wikidata.finalize_statement(
                        subj,
                        prop,
                        val,
                        self.language,
                        url,
                        resolve_property=False,
                        resolve_value=False)
                else:
                    logger.debug(
                        'could not resolve chunk "%s" of fe %s (property is %s)',
                        fe['chunk'], fe['fe'], prop)
                    # report the unresolved chunk together with its context
                    yield False, {
                        'chunk': fe['chunk'],
                        'additional': {
                            'fe': fe,
                            'sentence': data['text'],
                            'url': url
                        }
                    }
コード例 #8
0
ファイル: serialize.py プロジェクト: rpatil524/StrepHit
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a json-encoded str
             that has to be decoded (True) or already a dict (False)
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :rtype: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get('url')
        if not url:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning('skipping item without url')
            return

        ontology_prefix_length = len('http://dbpedia.org/ontology/')

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {'chunk': name, 'additional': {'sentence': data['text'], 'url': url}}
                continue

            all_qualifiers = self.find_qualifiers(data['fes'])
            for fe in data['fes']:
                if fe['chunk'] == name:  # do not add a statement for the current subject
                    continue

                if fe['fe'] in ('Time', 'Duration'):
                    for each in self.serialize_numerical(subj, fe, data):
                        yield True, each
                    continue

                # everything known about this FE in the context of the current LU
                mapping = self.lu_fe_map.get((data['lu'], fe['fe']), {})

                prop = mapping.get('wid')
                if not prop:
                    logger.debug('unknown fe type %s for LU %s, skipping', fe['fe'], data['lu'])
                    continue

                # both the link and its types are optional, a FE without them
                # simply has no types to check
                link_types = fe.get('link', {}).get('types') or []
                chunk_types = set(t[ontology_prefix_length:] for t in link_types)
                fe_types = mapping.get('types', set())
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug('skipping chunk "%s" of fe %s because types do not match, '
                                 'expected: %s actual %s', fe['chunk'], fe['fe'], fe_types, chunk_types)
                    continue

                # prefer the linked entity, fall back to resolving the chunk
                val = None
                if 'link' in fe:
                    uri = fe['link']['uri']
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)

                if not val:
                    val = wikidata.resolve(prop, fe['chunk'], self.language)

                if not val:
                    val = 'Q19798648'
                    logger.debug('could not resolve chunk "%s" of fe %s (property is %s), '
                                 'using default value of %s',
                                 fe['chunk'], fe['fe'], prop, val)

                # NOTE(review): extend() flattens the pairs, the result is
                # [property, value, property, value, ...] - presumably what
                # finalize_statement expects, confirm
                stmt_qualifiers = []
                for qualifier_property in mapping.get('qualifiers', []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj, prop, val, self.language, url, qualifiers=stmt_qualifiers,
                    resolve_property=False, resolve_value=False
                )
コード例 #9
0
ファイル: serialize.py プロジェクト: Wikidata/StrepHit
    def to_statements(self, data, input_encoded=True):
        """ Converts the classification results into quick statements

            :param data: Data from the classifier. Can be either str or dict
            :param bool input_encoded: Whether data is a json-encoded str
             that has to be decoded (True) or already a dict (False)
            :returns: Tuples <success, item> where item is a statement if success
             is true else it is a named entity which could not be resolved
            :rtype: generator
        """
        data = json.loads(data) if input_encoded else data

        url = data.get("url")
        if not url:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning("skipping item without url")
            return

        ontology_prefix_length = len("http://dbpedia.org/ontology/")

        for name, subj in self.get_subjects(data):
            if not subj:
                logger.warning("Could not resolve Wikidata Item ID of subject '%s'", name)
                yield False, {"chunk": name, "additional": {"sentence": data["text"], "url": url}}
                continue

            all_qualifiers = self.find_qualifiers(data["fes"])
            for fe in data["fes"]:
                if fe["chunk"] == name:  # do not add a statement for the current subject
                    continue

                if fe["fe"] in ("Time", "Duration"):
                    for each in self.serialize_numerical(subj, fe, data):
                        yield True, each
                    continue

                # everything known about this FE in the context of the current LU
                mapping = self.lu_fe_map.get((data["lu"], fe["fe"]), {})

                prop = mapping.get("wid")
                if not prop:
                    logger.debug("unknown fe type %s for LU %s, skipping", fe["fe"], data["lu"])
                    continue

                # both the link and its types are optional, a FE without them
                # simply has no types to check
                link_types = fe.get("link", {}).get("types") or []
                chunk_types = set(t[ontology_prefix_length:] for t in link_types)
                fe_types = mapping.get("types", set())
                if fe_types and chunk_types and not fe_types & chunk_types:
                    logger.debug(
                        'skipping chunk "%s" of fe %s because types do not match, ' "expected: %s actual %s",
                        fe["chunk"],
                        fe["fe"],
                        fe_types,
                        chunk_types,
                    )
                    continue

                # prefer the linked entity, fall back to resolving the chunk
                val = None
                if "link" in fe:
                    uri = fe["link"]["uri"]
                    val = wikidata.wikidata_id_from_wikipedia_url(uri)

                if not val:
                    val = wikidata.resolve(prop, fe["chunk"], self.language)

                if not val:
                    val = "Q19798648"
                    logger.debug(
                        'could not resolve chunk "%s" of fe %s (property is %s), ' "using default value of %s",
                        fe["chunk"],
                        fe["fe"],
                        prop,
                        val,
                    )

                # NOTE(review): extend() flattens the pairs, the result is
                # [property, value, property, value, ...] - presumably what
                # finalize_statement expects, confirm
                stmt_qualifiers = []
                for qualifier_property in mapping.get("qualifiers", []):
                    for qualifier_value in all_qualifiers.get(qualifier_property, []):
                        stmt_qualifiers.extend((qualifier_property, qualifier_value))

                yield True, wikidata.finalize_statement(
                    subj,
                    prop,
                    val,
                    self.language,
                    url,
                    qualifiers=stmt_qualifiers,
                    resolve_property=False,
                    resolve_value=False,
                )