Esempio n. 1
0
    def get_subjects(self, data):
        """ Yields the subjects of the frame assigned to the sentence

            The subjects are the entries of `data['fes']` whose `fe` appears
            among the `core_fes` of the frame stored in `self.frame_data` for
            `data['lu']`. When there is none, exactly one fallback subject is
            produced instead (see the comments in the body).

            :param dict data: classification results
            :return: all subjects as tuples (chunk, wikidata id)
            :rtype: generator of tuples
        """

        if data['lu'] in self.frame_data:
            frame = self.frame_data[data['lu']]
            candidates = [fe for fe in data['fes'] if fe['fe'] in frame['core_fes']]
        else:
            logger.debug('sentence with a LU not contained in the lexical database')
            logger.debug(data)
            candidates = []

        if not candidates:
            # no usable frame element: assume the subject is the main subject
            # of the article from which this sentence was extracted
            url = data['url']
            if url in self.url_to_wid:
                yield None, self.url_to_wid[url]
                return

            # unknown article, the only thing left to try is the name field
            name = data.get('name')
            if name:
                wid = wikidata.resolver_with_hints('P1559', name, self.language) or None
            else:
                wid = None
            yield name, wid
            return

        for fe in candidates:
            chunk = fe['chunk']
            yield chunk, wikidata.resolver_with_hints('P1559', chunk, self.language)
Esempio n. 2
0
def process_row(data):
    """ Converts one row of the dataset into statements about its subject.

        Every column that has an entry in `COLUMN_TO_PROPERTY` is resolved
        through `wikidata.resolve`; the resolved values are then used both as
        hints to find the wikidata id of the subject (the `emergenza` column)
        and as the values of the statements to produce.

        :param dict data: the row, mapping column names to raw values
        :return: the statements that could be finalized, or None when the
         wikidata id of the subject could not be found
        :rtype: list or None
    """
    subject = data['emergenza']

    resolved = defaultdict(list)
    for column, raw in data.iteritems():
        prop = COLUMN_TO_PROPERTY.get(column)
        if not prop:
            continue

        # only byte strings need decoding; calling .decode on a value that is
        # already unicode triggers an implicit ascii encode which fails on
        # non-ascii text
        if isinstance(raw, bytes):
            raw = raw.decode('utf8')

        value = wikidata.resolve(prop, raw, 'it')
        if value:
            resolved[prop].append(value)

    # NOTE(review): 'ddd' does not look like a wikidata property id, confirm
    # that this is what the resolver expects here
    subject = wikidata.resolver_with_hints('ddd', subject, 'it', **dict(resolved))
    if not subject:
        logger.warning('could not find the wikidata id of "%s"', data['emergenza'])
        return None

    statements = []
    for prop, values in resolved.iteritems():
        # properties and values are already resolved, do not resolve them again
        stmt = wikidata.finalize_statement(subject, prop, values,
                                           'it', resolve_property=False,
                                           resolve_value=False)
        if stmt is not None:
            statements.append(stmt)
    return statements
Esempio n. 3
0
    def get_subjects(self, data):
        """ Finds the subjects of the frame assigned to the sentence

            A frame element of the sentence is a subject when its `fe` is
            listed in the `core_fes` of the frame found in `self.frame_data`
            under `data['lu']`. If no subject is found this way, a single
            fallback subject is yielded, taken from `self.url_to_wid` or from
            the `name` field of the classification results.

            :param dict data: classification results
            :return: all subjects as tuples (chunk, wikidata id)
            :rtype: generator of tuples
        """

        subjects = []
        if data['lu'] in self.frame_data:
            frame = self.frame_data[data['lu']]
            for fe in data['fes']:
                if fe['fe'] in frame['core_fes']:
                    subjects.append(fe)
        else:
            logger.debug(
                'sentence with a LU not contained in the lexical database')
            logger.debug(data)

        if subjects:
            for subject in subjects:
                chunk = subject['chunk']
                yield chunk, wikidata.resolver_with_hints(
                    'P1559', chunk, self.language)
            return

        # nothing found: assume the subject is the main subject of the
        # article from which this sentence was extracted
        page_url = data['url']
        if page_url not in self.url_to_wid:
            name = data.get('name')
            wid = None
            if name:
                wid = wikidata.resolver_with_hints(
                    'P1559', name, self.language) or None
        else:
            name, wid = None, self.url_to_wid[page_url]

        yield name, wid
Esempio n. 4
0
    def get_subjects(self, data):
        """ Yields the subjects of the frame assigned to the sentence

            Subjects are the entries of `data["fes"]` whose `fe` belongs to the
            `core_fes` of the frame stored in `self.frame_data` for
            `data["lu"]`. Their chunk is passed through `text.fix_name` before
            being resolved, but it is yielded as it was found. Without any
            subject, one fallback subject is yielded instead.

            :param dict data: classification results
            :return: all subjects as tuples (chunk, wikidata id)
            :rtype: generator of tuples
        """

        lexical_unit = data["lu"]
        if lexical_unit in self.frame_data:
            frame = self.frame_data[lexical_unit]
            subjects = [fe for fe in data["fes"] if fe["fe"] in frame["core_fes"]]
        else:
            logger.debug("sentence with a LU not contained in the lexical database")
            logger.debug(data)
            subjects = []

        for subject in subjects:
            chunk = subject["chunk"]
            yield chunk, wikidata.resolver_with_hints("P1559", text.fix_name(chunk)[0], self.language)

        if subjects:
            return

        # if this fails, assume the subject is the main subject of the
        # article from which this sentence was extracted
        name, wid = None, None
        if data["url"] in self.url_to_wid:
            wid = self.url_to_wid[data["url"]]
        else:
            name = data.get("name")
            if name:
                cleaned = text.fix_name(name)[0]
                # Q5 = human
                wid = wikidata.resolver_with_hints("P1559", cleaned, self.language, type_=5) or None

        yield name, wid
Esempio n. 5
0
 def test_name_resolver_with_gender(self):
     # with the P21 hint alone the resolver is expected to give back an
     # empty string for this name
     hints = {'P21': ['Q6581097']}
     resolved = wikidata.resolver_with_hints('P1477', 'colin fraser', 'en', **hints)
     self.assertEqual('', resolved)
Esempio n. 6
0
 def test_name_resolver_with_death(self):
     # the P570 hint is expected to be enough to resolve the name
     wid = wikidata.resolver_with_hints('P1477', 'colin fraser', 'en',
                                        P570=['+1958-08-15T00:00:00Z/11'])
     self.assertEqual(wid, 'Q5145111')
Esempio n. 7
0
 def test_name_resolver_with_birth(self):
     # the P569 hint is expected to be enough to resolve the name
     expected = 'Q5145111'
     hints = dict(P569=['+1893-09-20T00:00:00Z/11'])
     actual = wikidata.resolver_with_hints('P1477', 'colin fraser', 'en', **hints)
     self.assertEqual(actual, expected)
Esempio n. 8
0
    def serialize_item(self, item):
        """ Converts an item to quick statements.

            The values of the item are resolved first; the name is resolved
            last, using everything resolved so far as hints. Nothing positive
            is yielded for an item whose name cannot be resolved.

            Note that `name`, `other` and `url` are popped from `item`, so a
            dict passed by the caller is modified.

            :param item: Scraped item, either str (json) or dict
            :returns: tuples <success, item> where item is an entity which
             could not be resolved if success is false, otherwise it is a
             <subject, property, object, source> tuple
            :rtype: generator
        """

        if isinstance(item, basestring):
            item = json.loads(item)

        name = item.pop('name', '')
        other = item.pop('other', {})
        url = item.pop('url', '')

        if self.sourced_only and not url:
            logger.debug('item %s has no url, skipping it', name)
            return

        if not name:
            # there is no name to show, log what is left of the item instead
            logger.debug('item %s has no name, skipping it', item)
            return

        data = {}
        try:
            data = json.loads(other)
        except ValueError:
            # `other` is not valid json, go on with the remaining fields only
            pass
        except TypeError:
            # `other` is not a string, it is usable only if already a dict
            if isinstance(other, dict):
                data = other
            else:
                return

        name, honorifics = text.fix_name(name)
        data.update(item)
        data.pop('bio', None)

        # the name will be the last one to be resolved because it is the hardest
        # one to get right, so we will use all the other statements to help
        statements = defaultdict(list)

        for key, value in data.iteritems():
            if not isinstance(value, list):
                value = [value]

            # flatten the value: plain strings are kept, dicts contribute both
            # their keys and their values, anything else is dropped
            strings = []
            for val in value:
                if isinstance(val, basestring):
                    strings.append(val)
                elif isinstance(val, dict):
                    strings.extend(val.keys())
                    strings.extend(val.values())

            # the property depends only on the key, look it up once
            prop = wikidata.PROPERTY_TO_WIKIDATA.get(key)

            for val in strings:
                if not val:
                    continue
                elif not isinstance(val, basestring):
                    # can happen with the values of a dict
                    logger.debug('skipping value "%s" because it is not a string', val)
                    continue

                if not prop:
                    logger.debug('cannot resolve property %s, skipping', key)
                    continue

                info = dict(data, **statements)  # provide all available info to the resolver
                resolved = wikidata.resolve(prop, val, self.language, **info)
                if not resolved:
                    logger.debug('cannot resolve value %s of property %s, skipping', val, prop)
                    yield False, {'chunk': val, 'additional': {'property': prop, 'url': url}}
                    continue

                statements[prop].append(resolved)

        info = dict(data, **statements)  # provide all available info to the resolver
        info['type_'] = 5  # Q5 = human
        wid = wikidata.resolver_with_hints('P1559', name, self.language, **info)

        if not wid:
            logger.debug('cannot find wikidata id of "%s" with properties %s, skipping',
                         name, repr(info))
            yield False, {'chunk': name, 'additional': {'property': 'P1559', 'url': url}}
            return

        # now that we are sure about the subject we can produce the actual statements
        yield True, (wid, 'P1559', '%s:"%s"' % (self.language, name.title()), url)
        for prop, values in statements.iteritems():
            for val in values:
                yield True, (wid, prop, val, url)

        for each in honorifics:
            hon = wikidata.resolve('P1035', each, self.language)
            if hon:
                yield True, (wid, 'P1035', hon, url)
            else:
                yield False, {'chunk': each, 'additional': {'property': 'P1035', 'url': url}}