Example #1
0
 def test_fix_name(self):
     """ Checks the (name, honorifics) pair returned by text.fix_name for
         padded, comma-inverted and honorific-prefixed spellings of a name.
     """
     # (raw input, expected result), checked in this order
     cases = (
         ('  Colin Fraser    ', ('colin fraser', [])),
         ('Fraser, Colin', ('colin fraser', [])),
         ('Sir Colin Fraser', ('colin fraser', ['sir'])),
         ('Fraser, Sir Colin', ('colin fraser', ['sir'])),
     )
     for raw_name, expected in cases:
         self.assertEqual(text.fix_name(raw_name), expected)
Example #2
0
    def get_subjects(self, data):
        """ Yields the subjects of the frame assigned to the sentence.

            The subjects are the frame elements of the sentence whose type is
            listed among the core FEs of the frame of its lexical unit. When
            there are none, a single fallback subject is produced instead:
            the entity registered for the sentence's url or, failing that,
            whatever the `name` field of the sentence resolves to.

            :param dict data: classification results
            :return: all subjects as tuples (chunk, wikidata id)
            :rtype: generator of tuples
        """
        lexical_unit = data['lu']

        if lexical_unit in self.frame_data:
            frame = self.frame_data[lexical_unit]
            candidates = [fe for fe in data['fes'] if fe['fe'] in frame['core_fes']]
        else:
            logger.debug('sentence with a LU not contained in the lexical database')
            logger.debug(data)
            candidates = []

        for fe in candidates:
            chunk = fe['chunk']
            resolved = wikidata.resolver_with_hints(
                'P1559', text.fix_name(chunk)[0], self.language
            )
            yield chunk, resolved

        if candidates:
            return

        # no subject found in the sentence: assume it is the main subject of
        # the article from which this sentence was extracted
        url = data['url']
        if url in self.url_to_wid:
            yield None, self.url_to_wid[url]
            return

        name = data.get('name')
        if name:
            wid = wikidata.resolver_with_hints(
                'P1559', text.fix_name(name)[0], self.language,
                type_=5  # Q5 = human
            ) or None
        else:
            # without a name there is nothing to resolve
            wid = None

        yield name, wid
Example #3
0
    def get_subjects(self, data):
        """ Produces the subjects of the frame assigned to the sentence.

            A frame element of the sentence counts as a subject when its type
            is one of the core FEs of the frame looked up through the lexical
            unit. If no such element exists exactly one fallback pair is
            yielded, built from the sentence's url or from its `name` field.

            :param dict data: classification results
            :return: all subjects as tuples (chunk, wikidata id)
            :rtype: generator of tuples
        """

        if data["lu"] in self.frame_data:
            frame = self.frame_data[data["lu"]]
            subjects = [fe for fe in data["fes"] if fe["fe"] in frame["core_fes"]]
        else:
            logger.debug("sentence with a LU not contained in the lexical database")
            logger.debug(data)
            subjects = []

        if not subjects:
            # nothing usable in the sentence itself: assume the subject is the
            # main subject of the article from which the sentence was extracted
            if data["url"] in self.url_to_wid:
                name, wid = None, self.url_to_wid[data["url"]]
            else:
                name = data.get("name")
                wid = None
                if name:
                    # Q5 = human
                    wid = wikidata.resolver_with_hints("P1559", text.fix_name(name)[0], self.language, type_=5) or None

            yield name, wid
            return

        for subject in subjects:
            name = subject["chunk"]
            wid = wikidata.resolver_with_hints("P1559", text.fix_name(name)[0], self.language)
            yield name, wid
Example #4
0
    def serialize_item(self, item):
        """ Converts an item to quick statements.

            The `name`, `other` and `url` fields are taken out of the item;
            `other` (a JSON string or a dict) is merged with the remaining
            fields and every value is resolved against wikidata. The name is
            resolved last, using all the statements found so far as hints.

            The item passed in is left untouched: all the work is done on
            copies, so the same dict can be serialized more than once.

            :param item: Scraped item, either str (json) or dict
            :returns: tuples <success, item> where item is an entity which
             could not be resolved if success is false, otherwise it is a
             <subject, property, object, source> tuple
            :rtype: generator
        """

        # `basestring` only exists on Python 2, fall back to `str` elsewhere
        try:
            string_types = basestring
        except NameError:
            string_types = str

        if isinstance(item, string_types):
            item = json.loads(item)
        else:
            # shallow copy, the pops below must not strip the caller's item
            item = dict(item)

        name = item.pop('name', '')
        other = item.pop('other', {})
        url = item.pop('url', '')

        if self.sourced_only and not url:
            logger.debug('item %s has no url, skipping it', name or item)
            return

        if not name:
            logger.debug('item %s has no name, skipping it', url or item)
            return

        try:
            data = json.loads(other)
        except ValueError:
            # not valid json: best effort, carry on without the extra fields
            data = {}
        except TypeError:
            # not a string at all, possibly an already decoded dict
            data = other

        if not isinstance(data, dict):
            # neither a dict nor json encoding one, nothing can be merged
            return

        name, honorifics = text.fix_name(name)
        data = dict(data)  # copy, so that the caller's `other` is not modified
        data.update(item)
        data.pop('bio', None)

        # the name will be the last one to be resolved because it is the hardest
        # one to get right, so we will use all the other statements to help
        statements = defaultdict(list)

        for key, value in data.items():
            # the property depends on the key only, look it up once per key
            pid = wikidata.PROPERTY_TO_WIKIDATA.get(key)
            if not pid:
                logger.debug('cannot resolve property %s, skipping', key)
                continue

            values = value if isinstance(value, list) else [value]

            # flatten the values: strings as they are, dicts contribute both
            # their keys and their values, anything else is dropped
            candidates = []
            for val in values:
                if isinstance(val, string_types):
                    candidates.append(val)
                elif isinstance(val, dict):
                    candidates.extend(val.keys())
                    candidates.extend(val.values())

            for val in candidates:
                if not val:
                    continue
                elif not isinstance(val, string_types):
                    # keys and values of a dict are not necessarily strings
                    logger.debug('skipping value "%s" because it is not a string', val)
                    continue

                info = dict(data, **statements)  # provide all available info to the resolver
                resolved = wikidata.resolve(pid, val, self.language, **info)
                if not resolved:
                    logger.debug('cannot resolve value %s of property %s, skipping', val, pid)
                    yield False, {'chunk': val, 'additional': {'property': pid, 'url': url}}
                    continue

                statements[pid].append(resolved)

        info = dict(data, **statements)  # provide all available info to the resolver
        info['type_'] = 5  # Q5 = human
        wid = wikidata.resolver_with_hints('P1559', name, self.language, **info)

        if not wid:
            logger.debug('cannot find wikidata id of "%s" with properties %s, skipping',
                         name, repr(info))
            yield False, {'chunk': name, 'additional': {'property': 'P1559', 'url': url}}
            return

        # now that we are sure about the subject we can produce the actual statements
        yield True, (wid, 'P1559', '%s:"%s"' % (self.language, name.title()), url)
        for pid, values in statements.items():
            for val in values:
                yield True, (wid, pid, val, url)

        for each in honorifics:
            hon = wikidata.resolve('P1035', each, self.language)
            if hon:
                yield True, (wid, 'P1035', hon, url)
            else:
                yield False, {'chunk': each, 'additional': {'property': 'P1035', 'url': url}}