def get_geo_data(self):
        """
        Find commonscat and wikidata entries for each available place level.

        Every place level in GEO_ORDER is looked up through the 'places'
        mapping. Uncertain entries are filtered out from everything except
        raw.

        As a side effect 'needing categorisation (place)' is added to
        self.meta_cats when the number of place levels with at least one
        category does not exceed the number of countries, i.e. when little
        more than the country is known.

        :return: dict with the keys
            'wd': per place level, a dict of label -> wikidata value
            'commonscats': per place level, a de-duplicated list of
                categories in first-seen order
            'labels': per place level, the stripped labels of the entries
            'raw': per place level, the unfiltered entries, plus the
                unfiltered 'other_geo' value
            'other': self.other_geo with uncertain entries filtered out
        """
        wikidata = OrderedDict()
        commonscats = OrderedDict()
        labels = OrderedDict()
        raw = OrderedDict()
        for geo_type in GEO_ORDER:
            # all except country are lists so handle all as lists
            wikidata_type = {}
            commonscats_type = []
            labels_type = []
            geo_value = getattr(self, geo_type)
            # an empty country would otherwise be listified to ['']
            geo_entries_raw = common.listify(geo_value) if geo_value else []
            for geo_entry in utils.clean_uncertain(geo_entries_raw):
                label = geo_entry.strip()
                mapping = self.smvk_info.mapped_and_wikidata(
                    geo_entry, self.smvk_info.mappings['places'])
                categories = mapping.get('category')
                if categories:
                    commonscats_type += categories  # a list
                wikidata_id = mapping.get('wikidata')
                if wikidata_id:
                    wikidata_type[label] = wikidata_id
                labels_type.append(label)
            wikidata[geo_type] = wikidata_type
            # De-duplicate while keeping first-seen order. list(set(...))
            # would make the order depend on string hashing and so vary
            # between runs.
            commonscats[geo_type] = list(
                OrderedDict.fromkeys(commonscats_type))
            labels[geo_type] = labels_type
            raw[geo_type] = geo_entries_raw

        # assume country is always mapped and either land OR depicted_land used
        countries = next((getattr(self, key)
                          for key in GEO_COUNTRIES if getattr(self, key)), [])
        num_countries = len(common.listify(countries))
        levels_with_cats = sum(1 for cats in commonscats.values() if cats)
        if levels_with_cats <= num_countries:
            # just knowing country is pretty bad
            self.meta_cats.add('needing categorisation (place)')

        # add other_geo to raw
        raw['other_geo'] = self.other_geo

        return {
            'wd': wikidata,
            'commonscats': commonscats,
            'labels': labels,
            'raw': raw,
            'other': utils.clean_uncertain(self.other_geo)
        }
    def add_identification_data(self, data):
        """
        Extract identification data from an objectIdentificationWrap node.

        Sets self.titles, self.inscriptions and self.descriptions, passes
        the displayed measurements on to self.add_meaurements and warns
        about unexpected structures or an unexpected repository.

        :param data: the parsed lido:objectIdentificationWrap structure
        """
        tag = u'lido:objectIdentificationWrap'
        skipped_tags = ['']
        # every child tag which is dealt with below
        handled_tags = [
            u'lido:titleWrap',
            u'lido:inscriptionsWrap',
            u'lido:objectDescriptionWrap',
            u'lido:objectMeasurementsWrap',
            u'lido:repositoryWrap',
        ]

        # titles
        title_sets = common.listify(
            data[u'lido:titleWrap'][u'lido:titleSet'])
        self.titles = get_lang_values_from_set(
            title_sets, (u'lido:appellationValue', ))

        # inscriptions
        inscription_sets = common.listify(
            data[u'lido:inscriptionsWrap'][u'lido:inscriptions'])
        self.inscriptions = get_lang_values_from_set(
            inscription_sets, (u'lido:inscriptionTranscription', ))

        # descriptions
        description_wrap = data[u'lido:objectDescriptionWrap']
        description_set = description_wrap[u'lido:objectDescriptionSet']
        if not isinstance(description_set, OrderedDict):
            pywikibot.warning(
                "Weird things are happening in description field for %s:\n%s" %
                (self.source_file, description_set))
        note_values = common.listify(
            description_set[u'lido:descriptiveNoteValue'])
        self.descriptions = get_lang_values_from_set(note_values)

        # measurements
        measurement_wrap = data[u'lido:objectMeasurementsWrap']
        measurement_set = measurement_wrap[u'lido:objectMeasurementsSet']
        expected_keys = (u'lido:displayObjectMeasurements',
                         u'lido:objectMeasurements')
        unexpected_keys = set(measurement_set.keys()).difference(
            expected_keys)
        if unexpected_keys:
            pywikibot.warning(
                "Weird things are happening in measurement field for %s:\n%s" %
                (self.source_file, measurement_set))
        displayed = measurement_set.get(u'lido:displayObjectMeasurements')
        self._debug(displayed)
        measurements = common.trim_list(common.listify(displayed))
        self._debug(measurements)
        self.add_meaurements(measurements)

        # ensure location is always Nationalmuseum
        repository_set = data[u'lido:repositoryWrap']['lido:repositorySet']
        legal_body = repository_set['lido:repositoryName']['lido:legalBodyID']
        repository_viaf = legal_body['#text']
        if repository_viaf != u'http://viaf.org/viaf/147742988':
            pywikibot.warning("Unexpected repoitory in %s: %s" %
                              (self.source_file, repository_viaf))

        flag_missed_tags(data, tag, handled_tags, skipped_tags)
    def parse_data(self, data):
        """
        Go through the raw data breaking out data needing mapping.

        For every image the values of the relevant columns are collected
        into the corresponding self.*_to_match / self.*_to_map /
        self.external_to_parse containers. Uncertain values are kept in
        their cleaned form. Keywords and ethnic groups are case-folded so
        that they are compared without case.

        :param data: dict of image entries; only its values are used
        """
        keyword_columns = ('motivord', 'sokord')
        people_columns = ('depicted_persons', 'photographer', 'creator')
        ethnic_columns = ('ethnic', 'ethnic_old')
        # (source column, target key) pairs; depicted_land merged with land
        place_columns = (
            ('land', 'land'),
            ('region', 'region'),
            ('ort', 'ort'),
            ('depicted_places', 'depicted_places'),
            ('depicted_land', 'land'),
        )

        def cleaned_values(image, column):
            """Return the column as a list passed through clean_uncertain."""
            values = common.listify(image.get(column) or [])
            return utils.clean_uncertain(values, keep=True)

        for image in data.values():
            self.check_for_unexpected_lists(image, image.get('photo_id'))

            if image.get('event'):
                self.expedition_to_match.update(
                    cleaned_values(image, 'event'))
            museum_obj = image.get('museum_obj')
            if museum_obj:
                museum, _, obj_type = museum_obj.partition('/')
                self.museum_to_match.add((museum, obj_type))
            ext_ids = image.get('ext_ids')
            if ext_ids:
                self.external_to_parse.update(ext_ids)

            # keywords - compare without case
            for col in keyword_columns:
                self.keywords_to_map.update(
                    v.casefold() for v in cleaned_values(image, col))

            # people
            for col in people_columns:
                self.people_to_map.update(
                    [helpers.flip_name(person)
                     for person in cleaned_values(image, col)])

            # ethnic groups - compare without case
            for col in ethnic_columns:
                self.ethnic_to_map.update(
                    v.casefold() for v in cleaned_values(image, col))

            # places
            for col, place_key in place_columns:
                if place_key not in self.places_to_map:
                    self.places_to_map[place_key] = Counter()
                self.places_to_map[place_key].update(
                    cleaned_values(image, col))
    def get_title_description(self):
        """
        Construct an appropriate description for a filename.

        The location part prioritises ort and region over depicted_places and
        other_geo as these are cleaner. Land is always included, unless it is
        already present in the text. Uncertain entries are filtered out.
        """
        clean = utils.clean_uncertain
        txt = self.description_clean
        # first non-empty place level wins
        geo = (clean(self.ort)
               or clean(self.region)
               or clean(self.depicted_places)
               or clean(self.other_geo))
        land = clean(self.land) or clean(self.depicted_land)
        if not (geo or land):
            return txt

        txt += '. ' + ', '.join(geo)
        land_text = '-'.join(common.listify(land))
        if geo and land:
            if land_text in txt:  # avoid duplicated info
                return txt
            txt += '. '
        return txt + land_text
Beispiel #5
0
def clean_uncertain(value, keep=False):
    """
    Handle uncertain values in the data.

    A value is considered uncertain if it contains the string '[?]'.
    Uncertain values are either dropped or, if keep is set, retained with
    the marker removed, double spaces collapsed once and surrounding
    whitespace stripped.

    :param value: the value or list of values to process
    :param keep: whether to keep the clean value or discard it
    :return: a list if a list was given, otherwise the single remaining
        value or '' if nothing remains
    """
    marker = '[?]'
    single_value = not isinstance(value, list)
    cleaned = []
    for entry in common.listify(value):
        if marker not in entry:
            cleaned.append(entry)
        elif keep:
            cleaned.append(
                entry.replace(marker, '').replace('  ', ' ').strip())

    # return in same format as original
    if single_value:
        return cleaned[0] if cleaned else ''
    return cleaned
    def add_creation(self, event):
        """
        Extract creation data from a lido:event node.

        Resets self.creator before handing the event actors to
        self.handle_creators and sets self.creation_date,
        self.creation_place and self.techniques.

        :param event: the parsed lido:event structure
        """
        tag = u'lido:event'
        skipped_tags = ['lido:eventName', u'lido:eventType']
        # every child tag which is dealt with below
        handled_tags = [
            u'lido:eventActor',
            u'lido:eventDate',
            u'lido:eventPlace',
            u'lido:eventMaterialsTech',
        ]

        # creator(s)
        self.creator = {}
        actors = common.trim_list(common.listify(event[u'lido:eventActor']))
        self.handle_creators(actors)

        # creation date, left empty if the event carries no date
        self.creation_date = {}
        event_date = event.get(u'lido:eventDate')
        if event_date:
            date_range = event_date[u'lido:date']
            self.creation_date['earliest'] = date_range.get(
                u'lido:earliestDate')
            self.creation_date['latest'] = date_range.get(u'lido:latestDate')
            display_dates = common.listify(event_date[u'lido:displayDate'])
            self.creation_date['text'] = get_lang_values_from_set(
                display_dates)

        # creation place
        place = event[u'lido:eventPlace'][u'lido:place']
        place_names = common.listify(place[u'lido:namePlaceSet'])
        self.creation_place = get_lang_values_from_set(
            place_names, (u'lido:appellationValue', ))

        # materials and techniques
        materials = common.listify(event[u'lido:eventMaterialsTech'])
        term_path = (u'lido:materialsTech', u'lido:termMaterialsTech',
                     u'lido:term')
        self.techniques = get_lang_values_from_set(materials, term_path)

        flag_missed_tags(event, tag, handled_tags, skipped_tags)
    def get_ethnic_data(self, strict=True):
        """
        Return data about ethnic groups.

        self.ethnic is used if set, otherwise self.ethnic_old. Each
        ethnicity is looked up case-folded in the 'ethnic' mapping; an
        entry without a (truthy) mapping is represented as a dict holding
        only its case-folded name.

        :param strict: Whether to discard uncertain entries.
        :return: list with one entry per remaining ethnicity
        """
        ethnic = self.ethnic or common.listify(self.ethnic_old)
        ethnicities = utils.clean_uncertain(ethnic, keep=not strict)
        if not ethnicities:
            return []

        mapping = self.smvk_info.mappings.get('ethnic')
        folded_names = (ethnicity.casefold() for ethnicity in ethnicities)
        return [mapping.get(name) or {'name': name} for name in folded_names]
    def add_relation_data(self, data):
        """
        Extract the subject actors from an objectRelationWrap node.

        Sets self.subjects to the handle_actor() result of every subject
        actor, or to an empty list if there is no subject.

        :param data: the parsed lido:objectRelationWrap structure
        """
        tag = u'lido:objectRelationWrap'
        skipped_tags = ['']
        handled_tags = [u'lido:subjectWrap']

        # handle subjects
        self.subjects = []
        subject_node = data[u'lido:subjectWrap'][u'lido:subjectSet'][
            u'lido:subject']
        if subject_node:
            actors = common.listify(subject_node[u'lido:subjectActor'])
            self.subjects.extend(
                handle_actor(entry[u'lido:actor']) for entry in actors)

        flag_missed_tags(data, tag, handled_tags, skipped_tags)
    def add_descriptive_data(self, data):
        """
        Process a descriptiveMetadata node.

        Delegates the identification, event and relation parts to their
        respective handlers and then flags any tags which were neither
        handled nor explicitly skipped.

        :param data: the parsed lido:descriptiveMetadata structure
        """
        tag = u'lido:descriptiveMetadata'
        skipped_tags = ['lido:objectClassificationWrap', u'@xml:lang']
        # every child tag which is dealt with below
        handled_tags = [
            u'lido:objectIdentificationWrap',
            u'lido:eventWrap',
            u'lido:objectRelationWrap',
        ]

        # identification data
        identification = data[u'lido:objectIdentificationWrap']
        self.add_identification_data(identification)

        # event data
        event_sets = common.listify(data[u'lido:eventWrap']['lido:eventSet'])
        self.add_event_data(event_sets)

        # relation data
        relations = data[u'lido:objectRelationWrap']
        self.add_relation_data(relations)

        flag_missed_tags(data, tag, handled_tags, skipped_tags)
Beispiel #10
0
    def add_image_data(self, data):
        """
        Extract local image filenames, their attribution and the license.

        Sets self.images to a dict of image -> attribution, where image is
        each lido:linkResource value not starting with 'http'. Attributions
        are paired with images by position. If there are no attributions
        every image gets None. On a count mismatch a warning is issued and
        only the pairs which exist in both lists are stored.

        Also sets self.image_license.

        :param data: the parsed lido:resourceSet structure
        """
        handled_tags = list()
        skipped_tags = [
            '',
        ]
        tag = u'lido:resourceWrap/lido:resourceSet'

        # identify filenames
        handled_tags.append(u'lido:resourceRepresentation')
        self.images = {}
        # listify since a single representation is otherwise not a list,
        # in which case iterating over it would not yield the links
        links = common.listify(data[u'lido:resourceRepresentation']) or []
        # identify local images
        images = [link[u'lido:linkResource'] for link in links
                  if not link[u'lido:linkResource'].startswith('http')]

        # match image to attributions
        handled_tags.append(u'lido:rightsResource')
        attributions = common.listify(
            data[u'lido:rightsResource'].get(u'lido:rightsHolder'))
        if attributions:
            if len(attributions) != len(images):
                pywikibot.warning("image-attribution missmatch in %s" %
                                  self.source_file)
            # zip stops at the shorter list, so surplus attributions no
            # longer trigger an IndexError after the warning above
            for image, attribution in zip(images, attributions):
                self.images[image] = attribution[u'lido:legalBodyName'][
                    u'lido:appellationValue']
            # TODO: add logic ensuring that images and attributions are
            # not paired up incorrectly
        else:
            # there aren't always photographers
            for image in images:
                self.images[image] = None

        # add license, just in case
        self.image_license = data[u'lido:rightsResource'][u'lido:rightsType'][
            u'lido:term']['#text']

        flag_missed_tags(data, tag, handled_tags, skipped_tags)
    def consume_entries(self, units, key_val, require=None, only=None):
        """
        Clean a scraped mapping list and return as a dict.

        Each entry is passed through self.clean_entry. If the field used as
        dict key is empty or non-unique a warning is raised and that entry
        is skipped, meaning the first entry with a given key wins.

        @param units: a list of entry-dict items as returned by parse_entries
        @param key_val: the name of the field to use as dict key.
        @param require: a field or list of fields where at least one must be
            non-empty for the entry to be presented.
            Default: None = all entries returned.
        @param only: only return the value of this field
        @return: dict of key -> cleaned entry (or its 'only' field)
        """
        # allow both single str and list
        required_fields = common.listify(require) if require else require

        result = {}
        for raw_entry in units:
            cleaned = self.clean_entry(raw_entry)
            if required_fields and not any(
                    cleaned.get(field) for field in required_fields):
                continue

            entry_key = cleaned.get(key_val)
            if not entry_key:
                pywikibot.warning('The field intended as dict key was empty!')
                continue
            if entry_key in result:
                # @todo: this should compare values and keep any with content
                # and only warn if two have differing content
                pywikibot.warning(
                    'The dict key was not unique! - {}'.format(entry_key))
                continue

            result[entry_key] = cleaned.get(only) if only else cleaned

        return result
    def get_original_description(self):
        """
        Get original description incl. motif, keywords and class(es).

        Starts from self.beskrivning and appends one '<br>'-separated row
        each for the motif, the keywords and the class(es), where these are
        set. Row labels are wrapped in wikitext italics markup (''label'').
        A leading '<br>', present when there is no description, is removed.

        :return: the combined, stripped description; '' if nothing is set
        """
        descr = self.beskrivning or ''

        # The row templates are double-quoted on purpose: inside a
        # single-quoted literal '' ends the literal and starts a new one,
        # which drops the italics markup through implicit concatenation.
        if self.motiv:
            descr += "<br>\n''Motiv'': {}".format(self.motiv)

        if self.item_keywords:
            descr += "<br>\n''Nyckelord'': {}".format(
                ', '.join(self.item_keywords))

        if self.item_classes:
            # Output the primary class, if identified, else output all
            classes = self.isolate_primary_class() or self.item_classes
            descr += "<br>\n''Kategori'': {}".format(
                ', '.join(common.listify(classes)))

        descr = descr.strip()
        if descr.startswith('<br>'):
            descr = descr[len('<br>'):]

        return descr.strip()
Beispiel #13
0
def format_description_row(label, value, delimiter=','):
    """
    Format a single description line.

    :param label: the row label, passed through helpers.italicize
    :param value: a value or list of values to output after the label
    :param delimiter: string placed, followed by a space, between values
    :return: the row, prefixed by '<br/>' and a newline
    """
    separator = '{} '.format(delimiter)
    styled_label = helpers.italicize(label)
    joined_values = separator.join(common.listify(value))
    return '<br/>\n{}: {}'.format(styled_label, joined_values)