def make_item_keyword_categories(self):
    """
    Construct categories from the item keyword values.

    Keywords from motivord and sokord are casefolded and looked up in the
    'keywords' mapping. Every mapped category is first tried, through
    try_cat_patterns, against the place categories in ``self.geo_data``,
    one place level at a time. Only when no level gives a match, and the
    category exists, is the plain category added to ``self.content_cats``.
    """
    all_keywords = set()
    for raw_keywords in (self.motivord, self.sokord):
        if raw_keywords:
            all_keywords.update([
                entry.casefold()
                for entry in utils.clean_uncertain(raw_keywords)
            ])

    keyword_map = self.smvk_info.mappings.get('keywords')
    mapped_keywords = (kw for kw in all_keywords if kw in keyword_map)
    for keyword in mapped_keywords:
        for cat in keyword_map[keyword]:
            # only the first non-empty place level is tried with
            # match_on_first set
            first_level = True
            matched = False
            for place_cats in self.geo_data.get('commonscats').values():
                if not place_cats:
                    continue
                # Built as a list so that try_cat_patterns is called for
                # every place category of the level (no short-circuit).
                # NOTE(review): presumably it registers the combined
                # category itself - confirm before making this lazy.
                outcomes = [
                    self.try_cat_patterns(cat, place_cat, first_level)
                    for place_cat in place_cats
                ]
                if any(outcomes):
                    matched = True
                    break
                first_level = False

            if matched:
                continue
            if self.smvk_info.category_exists(cat):
                self.content_cats.add(cat)
def get_geo_data(self):
    """
    Find commonscat and wikidata entries for each available place level.

    Every level in GEO_ORDER is handled as a list of entries. Uncertain
    entries are filtered out from everything except raw.

    Side effect: 'needing categorisation (place)' is added to
    ``self.meta_cats`` when the number of levels with at least one category
    does not exceed the number of countries.

    NOTE(review): the previous docstring said that matched 'other_geo'
    values yield wikidata ids and content_cats. This method only returns
    the cleaned 'other_geo' values, so any such matching happens elsewhere.

    :return: dict with the keys
        'wd': per level, a dict of entry label to mapped wikidata value,
        'commonscats': per level, the de-duplicated mapped categories in
            the order in which they were encountered,
        'labels': per level, the stripped labels of the kept entries,
        'raw': per level, the unfiltered entries, plus 'other_geo',
        'other': other_geo with uncertain entries filtered out.
    """
    wikidata = OrderedDict()
    commonscats = OrderedDict()
    labels = OrderedDict()
    raw = OrderedDict()

    for geo_type in GEO_ORDER:
        # all except country are lists so handle all as lists
        value = getattr(self, geo_type)
        # an empty country would otherwise be listified to ['']
        geo_entries_raw = common.listify(value) if value else []

        wikidata_type = {}
        commonscats_type = []
        labels_type = []
        for geo_entry in utils.clean_uncertain(geo_entries_raw):
            label = geo_entry.strip()
            mapping = self.smvk_info.mapped_and_wikidata(
                geo_entry, self.smvk_info.mappings['places'])
            categories = mapping.get('category')
            if categories:
                commonscats_type += categories  # a list
            wikidata_id = mapping.get('wikidata')
            if wikidata_id:
                wikidata_type[label] = wikidata_id
            labels_type.append(label)

        wikidata[geo_type] = wikidata_type
        # De-duplicate while keeping first-seen order. list(set(...)) gave
        # an order that changes between runs due to string hash
        # randomisation.
        commonscats[geo_type] = list(OrderedDict.fromkeys(commonscats_type))
        labels[geo_type] = labels_type
        raw[geo_type] = geo_entries_raw

    # assume country is always mapped and either land OR depicted_land used
    countries = next(
        (getattr(self, key) for key in GEO_COUNTRIES if getattr(self, key)),
        [])
    num_countries = len(common.listify(countries))
    levels_with_cats = sum(1 for cats in commonscats.values() if cats)
    if levels_with_cats <= num_countries:
        # just knowing country is pretty bad
        self.meta_cats.add('needing categorisation (place)')

    # add other_geo to raw
    raw['other_geo'] = self.other_geo

    return {
        'wd': wikidata,
        'commonscats': commonscats,
        'labels': labels,
        'raw': raw,
        'other': utils.clean_uncertain(self.other_geo)
    }
def parse_data(self, data):
    """
    Go through the raw data breaking out data needing mapping.

    For every image the values needing mapping are collected on the
    instance: expeditions, museum objects, external ids, keywords, people,
    ethnic groups and places. Uncertain values are kept (cleaned) rather
    than discarded.

    :param data: dict of image entries; only the values are used, and each
        must support ``.get()``.
    """
    def cleaned_values(image, column):
        """Return the column's values as a list, keeping uncertain ones."""
        values = common.listify(image.get(column) or [])
        return utils.clean_uncertain(values, keep=True)

    keyword_columns = ('motivord', 'sokord')
    people_columns = ('depicted_persons', 'photographer', 'creator')
    ethnic_columns = ('ethnic', 'ethnic_old')
    place_columns = (
        'land', 'region', 'ort', 'depicted_places',
        ('depicted_land', 'land'))  # depicted_land merged with land

    for image in data.values():
        self.check_for_unexpected_lists(image, image.get('photo_id'))

        if image.get('event'):
            self.expedition_to_match.update(cleaned_values(image, 'event'))

        museum_obj = image.get('museum_obj')
        if museum_obj:
            # the value is split on the first '/' only
            museum, _, obj_type = museum_obj.partition('/')
            self.museum_to_match.add((museum, obj_type))

        ext_ids = image.get('ext_ids')
        if ext_ids:
            self.external_to_parse.update(ext_ids)

        # keywords - compare without case
        for col in keyword_columns:
            self.keywords_to_map.update(
                [val.casefold() for val in cleaned_values(image, col)])

        # people
        for col in people_columns:
            self.people_to_map.update(
                [helpers.flip_name(person)
                 for person in cleaned_values(image, col)])

        # ethnic groups - compare without case
        for col in ethnic_columns:
            self.ethnic_to_map.update(
                [val.casefold() for val in cleaned_values(image, col)])

        # places
        for col in place_columns:
            # a tuple is (column to read, key to count it under)
            if isinstance(col, tuple):
                source, target = col
            else:
                source = target = col
            if target not in self.places_to_map:
                self.places_to_map[target] = Counter()
            self.places_to_map[target].update(cleaned_values(image, source))
def get_depicted_person(self, wrap=False):
    """
    Format a depicted person statement.

    The output is a single {{depicted person}} template (followed by a
    space), or an empty string if there are no depicted persons. Each
    person is given by wikidata id if one is mapped, otherwise by name.
    Any categories mapped to a person are added to ``self.content_cats``.

    Note that the template itself only supports up to 5 people.

    :param wrap: whether to set the 'information field' style, wrapping
        the result in an {{information field}}.
    """
    if not self.depicted_persons:
        return ''

    people = []
    persons = utils.clean_uncertain(self.depicted_persons, keep=True)
    for person in persons:
        data = self.get_person_data(person)
        categories = data.get('category')
        if categories:
            self.content_cats.update(categories)
        identifier = data.get('wikidata') or data.get('name')
        people.append(identifier)

    if wrap:
        style = '|style=information field'
    else:
        style = ''
    joined_people = '|'.join(people)
    return '{{depicted person|%s%s}} ' % (joined_people, style)
def get_event_data(self, strict=True):
    """
    Return the 'expeditions' mapping entry for the event.

    An empty dict is returned when the (cleaned) event is not mapped.

    :param strict: Whether to discard uncertain entries.
    """
    keep_uncertain = not strict
    cleaned_event = utils.clean_uncertain(self.event, keep=keep_uncertain)
    expeditions = self.smvk_info.mappings.get('expeditions')
    return expeditions.get(cleaned_event, {})
def get_creator_data(self, strict=True):
    """
    Return the mapped person data for the creator(s).

    The creator takes precedence over the photographer; the two are never
    combined. An empty dict is returned if nothing remains after cleaning.

    :param strict: Whether to discard uncertain entries.
    """
    # don't support both
    candidate = self.creator or self.photographer
    cleaned = utils.clean_uncertain(candidate, keep=not strict)
    if not cleaned:
        return {}
    return self.get_person_data(cleaned)
def get_title_description(self):
    """
    Construct an appropriate description for a filename.

    The location part prioritises ort and region over depicted_places and
    other_geo as these are cleaner. Land (or else depicted_land) is always
    included, unless it is already present in the text.
    Uncertain entries are filtered out.
    """
    title = self.description_clean

    # first place level with any value left after cleaning; if none has,
    # the (empty) result for the last level is used
    geo = None
    for level in ('ort', 'region', 'depicted_places', 'other_geo'):
        geo = utils.clean_uncertain(getattr(self, level))
        if geo:
            break

    land = utils.clean_uncertain(self.land)
    if not land:
        land = utils.clean_uncertain(self.depicted_land)

    if not (geo or land):
        return title

    # also supplies the '. ' separator when only land is known
    title += '. {}'.format(', '.join(geo))
    land_text = '-'.join(common.listify(land))
    if geo and land:
        if land_text in title:
            # avoid duplicated info
            return title
        title += '. '
    title += land_text
    return title
def get_ethnic_data(self, strict=True):
    """
    Return data about ethnic groups.

    ethnic is used if set, otherwise ethnic_old. Each group is looked up,
    casefolded, in the 'ethnic' mapping; unmapped groups are represented
    by a dict holding only the casefolded name.

    :param strict: Whether to discard uncertain entries.
    :return: list of dicts, empty if no groups remain after cleaning.
    """
    source = self.ethnic or common.listify(self.ethnic_old)
    cleaned = utils.clean_uncertain(source, keep=not strict)
    if not cleaned:
        return []

    lookup = self.smvk_info.mappings.get('ethnic')
    folded_names = (entry.casefold() for entry in cleaned)
    return [lookup.get(name) or {'name': name} for name in folded_names]