コード例 #1
0
    def make_item_keyword_categories(self):
        """
        Add content categories derived from the item's keyword values.

        Keywords from motivord and sokord are stripped of uncertain entries,
        casefolded and looked up in the 'keywords' mapping. For every mapped
        category try_cat_patterns is called against the commonscats of each
        place level; the plain category is added to content_cats only when
        none of those calls succeeded and the category exists.
        """
        keywords = set()
        for raw_keywords in (self.motivord, self.sokord):
            if raw_keywords:
                keywords.update(
                    entry.casefold()
                    for entry in utils.clean_uncertain(raw_keywords))
        mapped_keywords = self.smvk_info.mappings.get('keywords')

        for keyword in keywords:
            if keyword not in mapped_keywords:
                continue
            for cat in mapped_keywords[keyword]:
                matched = False
                # only the first non-empty place level is tried with
                # match_on_first set
                is_first_level = True
                place_levels = self.geo_data.get('commonscats').values()
                for level_cats in place_levels:
                    if not level_cats:
                        continue
                    # built as a full list so that try_cat_patterns is called
                    # for every place category (no short-circuiting)
                    results = [
                        self.try_cat_patterns(cat, place_cat, is_first_level)
                        for place_cat in level_cats
                    ]
                    if any(results):
                        matched = True
                        break
                    is_first_level = False
                if not matched and self.smvk_info.category_exists(cat):
                    self.content_cats.add(cat)
コード例 #2
0
    def get_geo_data(self):
        """
        Find commonscat and wikidata entries for each available place level.

        Place levels are processed in GEO_ORDER. Uncertain entries are
        filtered out from everything except 'raw'.

        As a side effect 'needing categorisation (place)' is added to
        meta_cats when the number of place levels with categories does not
        exceed the number of countries (i.e. little more than the country is
        known).

        :return: dict with the keys
            'wd': per place level, a dict of label -> mapped wikidata value,
            'commonscats': per place level, the mapped categories without
                duplicates, in the order they were first encountered,
            'labels': per place level, the stripped labels,
            'raw': per place level, the unfiltered entries, plus the
                unfiltered other_geo value under 'other_geo',
            'other': other_geo with uncertain entries filtered out.
        """
        wikidata = OrderedDict()
        commonscats = OrderedDict()
        labels = OrderedDict()
        raw = OrderedDict()
        for geo_type in GEO_ORDER:
            # all except country are lists so handle all as lists
            wikidata_type = {}
            commonscats_type = []
            labels_type = []
            geo_entries_raw = []
            geo_value = getattr(self, geo_type)
            if geo_value:  # country otherwise makes ['']
                geo_entries_raw = common.listify(geo_value)
            geo_entries = utils.clean_uncertain(geo_entries_raw)
            for geo_entry in geo_entries:
                label = geo_entry.strip()
                mapping = self.smvk_info.mapped_and_wikidata(
                    geo_entry, self.smvk_info.mappings['places'])
                categories = mapping.get('category')
                if categories:
                    commonscats_type.extend(categories)  # a list
                wikidata_value = mapping.get('wikidata')
                if wikidata_value:
                    wikidata_type[label] = wikidata_value
                labels_type.append(label)
            wikidata[geo_type] = wikidata_type
            # de-duplicate while keeping first-seen order; list(set(...))
            # would give an order that varies between runs
            commonscats[geo_type] = list(
                OrderedDict.fromkeys(commonscats_type))
            labels[geo_type] = labels_type
            raw[geo_type] = geo_entries_raw

        # assume country is always mapped and either land OR depicted_land used
        countries = next((getattr(self, key)
                          for key in GEO_COUNTRIES if getattr(self, key)), [])
        num_countries = len(common.listify(countries))
        levels_with_cats = sum(1 for cats in commonscats.values() if cats)
        if levels_with_cats <= num_countries:
            # just knowing country is pretty bad
            self.meta_cats.add('needing categorisation (place)')

        # add other_geo to raw
        raw['other_geo'] = self.other_geo

        return {
            'wd': wikidata,
            'commonscats': commonscats,
            'labels': labels,
            'raw': raw,
            'other': utils.clean_uncertain(self.other_geo)
        }
コード例 #3
0
    def parse_data(self, data):
        """
        Go through the raw data breaking out data needing mapping.

        For every image the values needing mapping are collected into
        expedition_to_match, museum_to_match, external_to_parse,
        keywords_to_map, people_to_map, ethnic_to_map and places_to_map.
        Uncertain entries are kept (cleaned) rather than discarded.

        :param data: dict whose values are the per-image dicts of raw values;
            the dict keys are not used here.
        """
        def cleaned(image, col):
            """Return the cleaned values of a column, uncertain ones kept."""
            return utils.clean_uncertain(
                common.listify(image.get(col) or []), keep=True)

        keyword_columns = ('motivord', 'sokord')
        people_columns = ('depicted_persons', 'photographer', 'creator')
        ethnic_columns = ('ethnic', 'ethnic_old')
        # (source column, entry in places_to_map);
        # depicted_land is merged with land
        place_columns = (
            ('land', 'land'),
            ('region', 'region'),
            ('ort', 'ort'),
            ('depicted_places', 'depicted_places'),
            ('depicted_land', 'land'),
        )

        for image in data.values():
            self.check_for_unexpected_lists(image, image.get('photo_id'))

            event = image.get('event')
            if event:
                self.expedition_to_match.update(
                    utils.clean_uncertain(common.listify(event), keep=True))
            museum_obj = image.get('museum_obj')
            if museum_obj:
                # the part after the first '/' is empty if there is no '/'
                museum, _, obj_type = museum_obj.partition('/')
                self.museum_to_match.add((museum, obj_type))
            ext_ids = image.get('ext_ids')
            if ext_ids:
                # NOTE(review): assumes ext_ids is already a list; a bare
                # string would be added character by character - confirm
                self.external_to_parse.update(ext_ids)

            # keywords - compare without case
            for col in keyword_columns:
                self.keywords_to_map.update(
                    value.casefold() for value in cleaned(image, col))

            # people
            for col in people_columns:
                self.people_to_map.update(
                    [helpers.flip_name(person)
                     for person in cleaned(image, col)])

            # ethnic groups - compare without case
            for col in ethnic_columns:
                self.ethnic_to_map.update(
                    value.casefold() for value in cleaned(image, col))

            # places
            for col, target in place_columns:
                if target not in self.places_to_map:
                    self.places_to_map[target] = Counter()
                self.places_to_map[target].update(cleaned(image, col))
コード例 #4
0
    def get_depicted_person(self, wrap=False):
        """
        Build the {{depicted person}} statement for the depicted people.

        Each person is represented by the mapped wikidata value when there is
        one and otherwise by the name. Categories mapped for a person are
        added to content_cats. Uncertain entries are kept (cleaned).

        Per the template's own documentation it only supports up to 5
        people; no such limit is enforced here.

        :param wrap: whether to set the 'information field' style, wrapping
            the result in an {{information field}}.
        :return: the template call followed by a single space, or an empty
            string if there are no depicted persons.
        """
        if not self.depicted_persons:
            return ''

        people = utils.clean_uncertain(self.depicted_persons, keep=True)
        entries = []
        for person in people:
            data = self.get_person_data(person)
            categories = data.get('category')
            if categories:
                self.content_cats.update(categories)
            entries.append(data.get('wikidata') or data.get('name'))

        if wrap:
            style = '|style=information field'
        else:
            style = ''
        return '{{depicted person|%s%s}} ' % ('|'.join(entries), style)
コード例 #5
0
    def get_event_data(self, strict=True):
        """
        Look up the 'expeditions' mapping entry for this item's event.

        :param strict: Whether to discard uncertain entries.
        :return: the mapped entry, or an empty dict if the event has none.
        """
        keep_uncertain = not strict
        event = utils.clean_uncertain(self.event, keep=keep_uncertain)
        expeditions = self.smvk_info.mappings.get('expeditions')
        return expeditions.get(event, {})
コード例 #6
0
    def get_creator_data(self, strict=True):
        """
        Return the mapped person data for the creator(s).

        The creator is used when set, otherwise the photographer; the two
        are never combined.

        :param strict: Whether to discard uncertain entries.
        :return: the result of get_person_data, or an empty dict when no
            person remains after cleaning.
        """
        candidate = self.creator or self.photographer  # don't support both
        person = utils.clean_uncertain(candidate, keep=not strict)
        if not person:
            return {}
        return self.get_person_data(person)
コード例 #7
0
    def get_title_description(self):
        """
        Construct an appropriate description for a filename.

        The location part prioritises ort and region over depicted_places and
        other_geo as these are cleaner. Land (or, failing that,
        depicted_land) is appended unless a location is present and the land
        text already occurs in the description. Uncertain entries are
        filtered out.

        :return: description_clean, followed by the location and land parts
            when available.
        """
        txt = self.description_clean
        geo = (utils.clean_uncertain(self.ort)
               or utils.clean_uncertain(self.region)
               or utils.clean_uncertain(self.depicted_places)
               or utils.clean_uncertain(self.other_geo))
        land = (utils.clean_uncertain(self.land)
                or utils.clean_uncertain(self.depicted_land))
        if geo or land:
            # listify (as is done for land below) so that a bare string is
            # not joined character by character
            txt += '. {}'.format(', '.join(common.listify(geo)))
            land_text = '-'.join(common.listify(land))
            if geo and land:
                if land_text in txt:  # avoid duplicated info
                    return txt
                txt += '. '
            txt += land_text
        return txt
コード例 #8
0
    def get_ethnic_data(self, strict=True):
        """
        Return the mapped data for the item's ethnic groups.

        ethnic is used when set, otherwise ethnic_old. Lookups in the
        'ethnic' mapping are done on the casefolded value; groups without a
        (truthy) mapping entry are returned as {'name': <casefolded value>}.

        :param strict: Whether to discard uncertain entries.
        :return: list with one entry per remaining ethnic group, empty if
            there are none.
        """
        source = self.ethnic or common.listify(self.ethnic_old)
        ethnicities = utils.clean_uncertain(source, keep=not strict)
        if not ethnicities:
            return []
        mapping = self.smvk_info.mappings.get('ethnic')
        names = (ethnicity.casefold() for ethnicity in ethnicities)
        return [mapping.get(name) or {'name': name} for name in names]