Esempio n. 1
0
def verify_url_file_extension(url, file_exts, url_protocols=None):
    """
    Verify that a url contains a file extension and that it is allowed.

    Also checks that the protocol is whitelisted.

    The extension is looked for in the part of the url which follows the
    host name. A bare host (e.g. "http://example.com") is therefore reported
    as lacking a file extension instead of as having the extension ".com".

    @param url: the url to check
    @param file_exts: tuple of allowed file extensions (including the
        leading dot, as returned by os.path.splitext)
    @param url_protocols: tuple of allowed url protocols (defaults to
        URL_PROTOCOLS)
    @return: the file extension
    @raises: common.MyError
    """
    url_protocols = url_protocols or URL_PROTOCOLS

    # without a '://' the whole url ends up in protocol and is rejected below
    protocol, _, rest = url.partition('://')
    if protocol not in url_protocols:
        raise common.MyError(
            '{0}: Found url with a disallowed protocol'.format(url))

    # drop the host so that its top-level domain is not mistaken for a file
    # extension. splitext always returns a 2-tuple so no IndexError can occur.
    path = rest.partition('/')[2]
    ext = os.path.splitext(path)[1]
    if not ext:
        raise common.MyError(
            '{0}: Found url without a file extension'.format(url))

    if ext not in file_exts:
        raise common.MyError(
            '{0}: Found url with a disallowed file extension ({1})'.format(
                url, ext))

    return ext
Esempio n. 2
0
def load_settings(args):
    """
    Assemble the run options from command line, settings file and defaults.

    Precedence, highest first:
        Command line > Settings file > DEFAULT_OPTIONS

    A value is only taken from a lower priority source if every higher
    priority source gives a falsy value for that key. Finally every entry of
    the GLAM-specific settings file (SETTINGS_DIR/<glam_code>.json) is copied
    into the result, replacing any value already present.

    @param args: arguments which are passed on to handle_args
    @return: the combined options
    @raises: common.MyError if no GLAM code was found
    """
    defaults = DEFAULT_OPTIONS.copy()
    options = handle_args(args, PARAMETER_HELP.format(**defaults))

    # the settings file must be resolved first since it is needed to load
    # the remaining values. The default is only consumed if it was needed.
    settings_file = options.get('settings_file')
    if not settings_file:
        settings_file = defaults.pop('settings_file')
    options['settings_file'] = settings_file

    # merge the three sources key by key
    file_options = common.open_and_read_file(settings_file, as_json=True)
    for key, fallback in defaults.items():
        options[key] = (
            options.get(key) or file_options.get(key) or fallback)

    # glam-specific settings, like the location of mapping tables
    glam_code = options["glam_code"]
    if not glam_code:
        err_mess = "The batch settings file ({}) is missing a GLAM code."
        raise common.MyError(err_mess.format(options.get('settings_file')))

    glam_path = "{}.json".format(os.path.join(SETTINGS_DIR, glam_code))
    glam_options = common.open_and_read_file(glam_path, as_json=True)
    for key, value in glam_options.items():
        options[key] = value

    return options
Esempio n. 3
0
def load_mapping_lists_mappings(
        mappings_dir, update=True, mappings=None, mapping_root=None):
    """
    Add mapping lists to the loaded mappings.

    The entries are stored under the keys 'places', 'keywords' and 'people'.

    :param mappings_dir: path to directory in which mappings are found. A
        falsy value falls back on MAPPINGS_DIR.
    :param update: whether to first download the latest mappings
    :param mappings: dict to which mappings should be added. If None then a new
        dict is returned. A provided dict (even an empty one) is updated in
        place.
    :param mapping_root: root path for the mappings on wiki (required for an
        update)
    :return: the dict to which the mappings were added
    :raises: common.MyError if an update is requested without a mapping_root
    """
    # explicitly compare to None so that an empty dict supplied by the caller
    # is filled instead of silently being swapped for a new one
    if mappings is None:
        mappings = {}
    mappings_dir = mappings_dir or MAPPINGS_DIR
    if update and not mapping_root:
        raise common.MyError('A mapping root is needed to load new updates.')

    ml = make_places_list(mappings_dir, mapping_root)
    mappings['places'] = ml.consume_entries(
        ml.load_old_mappings(update=update), 'name',
        require=['category', 'wikidata'])

    mk = make_keywords_list(mappings_dir, mapping_root)
    mappings['keywords'] = mk.consume_entries(
        mk.load_old_mappings(update=update), 'name', require='category',
        only='category')

    mp = make_people_list(mappings_dir, mapping_root)
    mappings['people'] = mp.consume_entries(
        mp.load_old_mappings(update=update), 'name',
        require=['creator', 'category', 'wikidata'])
    return mappings
Esempio n. 4
0
def makeHitlist(files, data):
    """
    Given a list of paths to file and target filenames construct a hitlist.

    A file is matched on the extensionless basename of its path. Files whose
    basename is not a key in data are skipped.

    The data file should be a dict where the keys are the (extensionless)
    target filenames.

    @param files: list of file paths
    @param data: dict containing target filenames as keys
    @return: list of dicts, one per matched file and in the order of files,
        each with the keys:
        path: the file path as provided
        ext: the lower case file extension (including the leading dot)
        data: the value stored in data for the matched key
        key: the matched key
    @raises: common.MyError if two files match the same key
    """
    hitlist = []
    seen_keys = set()  # stay paranoid; a set keeps the lookup constant time
    for f in files:
        key, ext = os.path.splitext(os.path.basename(f))
        if key not in data:
            continue
        if key in seen_keys:
            raise common.MyError('non-unique file key: %s' % key)
        seen_keys.add(key)
        hitlist.append({
            'path': f,
            'ext': ext.lower(),
            'data': data[key],
            'key': key
        })
    return hitlist
Esempio n. 5
0
def process_all_files(base_dir=MAIN_DIR, xml_dir=XML_DIR):
    """
    Load every .xml file of a directory and store the processed data.

    The result is written to processed_lido.json in base_dir, keyed by the
    obj_id of each entry. Files which cannot be processed, or which repeat an
    already encountered obj_id, are reported and skipped.

    @param base_dir: the main directory, in which the output is stored
    @param xml_dir: the directory of xml files, relative to base_dir
    @raises: common.MyError if either directory does not exist
    """
    # Check that both directories exist
    xml_dir = os.path.join(base_dir, xml_dir)
    for directory in (base_dir, xml_dir):
        if os.path.isdir(directory):
            continue
        raise common.MyError(
            u'The provided directory was not a valid directory: %s' %
            directory)

    # Find candidate files (without descending into sub-directories)
    xml_files = prep.find_files(
        path=xml_dir, file_exts=('.xml', ), subdir=False)
    pywikibot.output("Found %d .xml files" % len(xml_files))

    processed = {}
    for xml_file in xml_files:
        try:
            entry = InfoEntry(load_xml(xml_file))
        except Exception as e:
            # best effort: one broken file should not abort the whole run
            file_name = os.path.basename(xml_file)
            pywikibot.output(u"Encountered error while processing %s: %s" %
                             (file_name, e))
            continue

        if entry.obj_id in processed:
            # first come, first served
            pywikibot.output(u"Multiple files for same object: %s, %s, %s" %
                             (entry.obj_id, entry.source_file,
                              processed[entry.obj_id]['source_file']))
            continue

        processed[entry.obj_id] = entry.output()

    out_file = os.path.join(base_dir, u'processed_lido.json')
    common.open_and_write_file(out_file, processed, as_json=True)
    pywikibot.output(
        "Created %s with %d entries" % (out_file, len(processed)))
Esempio n. 6
0
    def set_options(self, overriding_options):
        """
        Combine the default options with any overriding ones.

        @param overriding_options: dict of options to use instead of default
            values. May be None (or empty) to use only the defaults.
        @return: dict of the resulting options
        @raises: common.MyError if an overriding option is not recognised
        """
        settings = {
            # the value used to indicate that a mapping is not applicable or
            # not needed (as opposed to being left unmapped).
            'na_value': '-',
            # delimiter used to separate list values.
            'list_delimiter': '/',
            # key in the mapping entry to be used for name and secondary
            # sorting. Cannot be a multi-valued (list) field.
            'name_key': 'name',
            # key in the mapping entry to be used for frequency.
            'freq_key': 'frequency'
        }

        if not overriding_options:
            return settings

        for option, value in overriding_options.items():
            # only known options may be overridden
            if option not in settings:
                raise common.MyError(
                    '{} is not a recognised option'.format(option))
            settings[option] = value

        return settings
    def get_glam_id(self):
        """
        Return the identifier used by the GLAM.

        self.glam_id is searched for the first (glam, idno) pair where glam
        matches the "glam_code" entry of self.glam_data.

        @return: the idno of the first matching pair
        @raises: common.MyError if no pair matches
        """
        matching_ids = (
            idno for glam, idno in self.glam_id
            if glam == self.glam_data.get("glam_code"))
        for idno in matching_ids:
            return idno

        # without a glam_id we have to abort
        raise common.MyError('Could not find an id for this GLAM in the data')
 def load_batch_settings(self, options):
     """
     Load batch-specific settings for categorization.

     @param options: dict where "batch_settings" holds the path to the
         batch settings (json) file
     @return: the loaded batch settings
     @raises: common.MyError if either "batch_cat" or "batch_date" is
         missing from the loaded settings
     """
     fpath = options.get("batch_settings")
     batch_settings = common.open_and_read_file(fpath, as_json=True)

     present = batch_settings.keys()
     required = ("batch_cat", "batch_date")
     if not all(key in present for key in required):
         err = "Batch settings file ({}) is missing base category or date."
         raise common.MyError(err.format(fpath))

     return batch_settings
    def check_for_unexpected_lists(self, data, label):
        """
        Ensure that no value contains the list delimiter.

        The delimiter is the 'list_delimiter' entry of self.settings.

        :param data: a single image or archive card entry
        :param label: label allowing the row to be identified in the csv
        :raises: common.MyError listing every offending column and its value
        """
        delimiter = self.settings.get('list_delimiter')
        offending = [
            (column, value) for column, value in data.items()
            if delimiter in value]
        if not offending:
            return

        listing = '\n'.join(
            ['{}: {}'.format(column, value) for column, value in offending])
        raise common.MyError(
            '{}: One of the columns unexpectedly '
            'contains a list\n{}'.format(label, listing))
    def get_license_text(self):
        """
        Format a license template.

        The license is determined by the 'code' entry of self.copyright or,
        if that is not set, of self.default_copyright. For a code other than
        'by', 'by-sa' or 'pdm' nothing (None) is returned.

        @return: the wikitext of the license template
        @raises: NotImplementedError if both self.copyright and
            self.default_copyright are set
        @raises: common.MyError if a known death year is too late for PD and
            the image is not a photo
        """
        if self.copyright and self.default_copyright:
            # cannot deal with double license info yet
            raise NotImplementedError

        license_info = self.copyright or self.default_copyright
        code = license_info.get('code')

        # CC licenses are used for modern photographs
        if code == 'by':
            return '{{CC-BY-4.0|%s}}' % self.get_byline()
        if code == 'by-sa':
            return '{{CC-BY-SA-4.0|%s}}' % self.get_byline()
        if code != 'pdm':
            return None

        # for PD try to get death date from creator (wikidata) else PD-70
        people_mapping = self.glam_info.mappings.get('people')
        persons = (self.creation.get('related_persons')
                   or license_info.get('persons')
                   or self.photographer.get("name"))
        known_death_years = []
        for person in persons:
            person_data = self.glam_info.mapped_and_wikidata(
                person.get('name'), people_mapping)
            year = person_data.get('death_year')
            if year:  # trim empties
                known_death_years.append(year)

        # the latest death year is the one determining the PD status
        death_year = max(known_death_years) if known_death_years else None

        if death_year and death_year < self.glam_info.pd_year:
            return '{{PD-old-auto|deathyear=%s}}' % death_year
        if death_year and not self.is_photo:
            raise common.MyError(
                'The creator death year is not late enough for PD and '
                'this does not seem to be a photo')
        if self.is_photo:
            return '{{PD-Sweden-photo}}'
        return '{{PD-old-70}}'
Esempio n. 11
0
    def make_item_from_raw(entry, image_file, natmus_info):
        """
        Given the raw metadata for an item, construct an NatmusItem.

        As a side effect the frequency counters in natmus_info.nsid and
        natmus_info.uri_ids are updated for the creators and subjects of the
        item.

        @param entry: the raw metadata entry as a dict (left unchanged, a
            shallow copy is used)
        @param image_file: the image for which the item is made, used to look
            up the photographer among the images of the entry
        @param natmus_info: the parent NatmusInfo instance
        @return: NatmusItem
        @raises: common.MyError if the object lacks a wikidata entry and
            natmus_info.skip_non_wikidata is set
        """
        item_data = entry.copy()

        # skip paintings not in wikidata
        obj_id = item_data['obj_id']
        known_paintings = natmus_info.wd_paintings.keys()
        if obj_id not in known_paintings and natmus_info.skip_non_wikidata:
            raise common.MyError(
                u"skip_4: "
                u"%s did not have any associated wikidata entry" % obj_id)

        # add specific image info, the full images field is then dropped
        item_data['image'] = image_file
        images = item_data.pop('images')
        item_data['photographer'] = images.get(image_file)

        # collect nsid entries
        for creator_id in item_data['creator'].keys():
            helpers.addOrIncrement(natmus_info.nsid, creator_id, key='freq')
        for subject in item_data['subjects']:
            nsid = subject.get('nsid')
            if nsid:
                helpers.addOrIncrement(natmus_info.nsid, nsid, key='freq')

            other_id = subject.get('other_id')
            if other_id:
                helpers.addOrIncrement(
                    natmus_info.uri_ids, other_id, key='freq')
                natmus_info.uri_ids[other_id]['name'] = subject.get('name')

        return NatmusItem(item_data)
Esempio n. 12
0
    def get_license_text(self):
        """
        Format a license template.

        Only the licenses 'cc0' and 'PD' are supported. For PD the applicable
        template depends on the creator and on the type of image:
            creator death year earlier than the pd_year of self.smvk_info
                {{PD-old-auto}}
            photo, creator known and image date < 1969
                {{PD-Sweden-photo}}

        If no PD template can be determined then the reason is added to
        self.problems and nothing (None) is returned.

        @return: the wikitext of the license template, or None
        @raises: common.MyError for a non-supported license
        """
        # CC0 is straight forward
        if self.license == 'cc0':
            return '{{CC0}}'

        if self.license != 'PD':
            raise common.MyError(
                'A non-supported license was encountered: {}'.format(
                    self.license))

        creator = self.get_creator_data()  # skips any uncertain
        if not creator:
            # cannot default to PD-Sweden-photo since creator need not be
            # Swedish. Cannot default to PD-anon-70 since date of first
            # publication is not known.
            self.problems.append(
                'The creator is unknown so PD status cannot be verified')
            return None

        death_year = creator.get('death_year')
        creation_year = utils.get_last_year(self.date_text)

        if death_year and death_year < self.smvk_info.pd_year:
            return '{{PD-old-auto|deathyear=%s}}' % death_year

        if death_year and not self.is_photo():
            self.problems.append(
                'The creator death year ({}) is not late enough for PD '
                'and this does not seem to be a photo.'.format(death_year))
            return None

        if self.is_photo() and creation_year and creation_year < 1969:
            return '{{PD-Sweden-photo}}'

        self.problems.append(
            'Could not determine why this image by {} is PD.'.format(
                creator.get('name')))
        return None
Esempio n. 13
0
    def run(self, in_file, base_name, update_mappings):
        """
        Entry point for outputting info data.

        Loads indata and any mappings to produce a make_info json file
        (<base_name>.json) together with a list of the resulting filenames
        (<base_name>.filenames.txt) with one "key|filename" row per entry.

        @param in_file: filename (or tuple of such) containing the metadata
        @param base_name: base name to use for output
            (defaults to in_file, without its file extension)
        @param update_mappings: if mappings should be updated against online
            sources
        @raises: common.MyError if base_name is not provided and in_file is
            not a single filename
        """
        if not base_name:
            if not common.is_str(in_file):
                raise common.MyError(
                    'A base name must be provided if multiple in_files '
                    'are provided')
            base_name = os.path.splitext(in_file)[0]

        self.cwd_path = os.path.dirname(base_name)
        raw_data = self.load_data(in_file)
        self.load_mappings(update_mappings)
        self.process_data(raw_data)
        out_data = self.make_info()

        # store output
        json_file = '%s.json' % base_name
        common.open_and_write_file(json_file, out_data, as_json=True)
        pywikibot.output('Created %s' % json_file)

        # store filenames, sorted by key
        filenames_file = '%s.filenames.txt' % base_name
        rows = ''.join(
            '%s|%s\n' % (key, out_data[key]['filename'])
            for key in sorted(out_data.keys()))
        common.open_and_write_file(filenames_file, rows)
        pywikibot.output('Created %s' % filenames_file)
Esempio n. 14
0
def run(in_path, out_path, data_path, file_exts=None):
    """
    Prepare an upload.

    The preparation consists of:
        1. Loading the makeInfo output data from data_path,
        2. Finding the files in in_path with an allowed file extension,
        3. Matching these against the keys of the loaded data,
        4. Making info files and renaming the matched files (out_path),
        5. Removing any empty directories in in_path.

    @todo: throw errors on failed file read/write

    @param in_path: path to directory where unprocessed files live
    @param out_path: path to directory where renamed files and info should live
    @param data_path: path to .json containing makeInfo output data
    @param file_exts: tuple of allowed file extensions, defaults to FILE_EXTS
    @raises: common.MyError if in_path is not a directory
    """
    # the data is loaded before in_path is validated
    data = common.open_and_read_file(data_path, codec='utf-8', as_json=True)
    allowed_exts = file_exts or FILE_EXTS

    if not os.path.isdir(in_path):
        raise common.MyError(
            'The provided inPath was not a valid directory: %s' % in_path)

    # find the candidates and keep those matching the data
    candidates = find_files(path=in_path, file_exts=allowed_exts)
    matches = makeHitlist(candidates, data)

    # make info and rename
    makeAndRename(matches, out_path)

    # clean up any empty subdirectories
    removeEmptyDirectories(in_path)
    def get_geo_data(self):
        """
        Find commonscat and wikidata entries for each available place level.

        The place used is self.depicted_place or, if that is not set,
        self.description_place. If both are set but differ then this is
        recorded in self.problems and the depicted place is used.

        Note that the 'role' entry is popped from the place dict, i.e. the
        dict stored on the instance is changed by this call.

        Any 'other' values are matched through the 'places' mapping, the
        remaining levels through the mapping named after the level.

        @return: an empty dict if no place is known, else a dict with
            role: the role popped from the place
            wd: dict of wikidata ids per matched place level
            commonscats: list of the matched commonscats
            labels: OrderedDict of the label of each place level
        @raises: common.MyError if the place holds a level which is not in
            GEO_ORDER
        """
        described = self.description_place
        depicted = self.depicted_place
        if described and depicted and described != depicted:
            self.problems.append(
                'Cannot handle differing depicted_place and description_place:'
                '\nDepicted_place: {0}\nDescription_place: {1}'.format(
                    depicted, described))

        place = depicted or described
        if not place:
            return {}

        country = place.get('country')
        if country and country.get('code') != 'Sverige':
            self.meta_cats.add('needing categorisation (not from Sweden)')

        # set up the geo_types and their corresponding mappings, following
        # the order of GEO_ORDER
        geo_map = OrderedDict(
            (geo_type, self.glam_info.mappings.get(geo_type))
            for geo_type in GEO_ORDER)
        role = place.pop('role')

        # every remaining key of the place must be a known geo_type
        unknown = set(place.keys()) - set(geo_map.keys())
        if unknown:
            raise common.MyError('{} should be added to GEO_ORDER'.format(
                ', '.join(unknown)))

        wikidata = {}
        commonscats = []
        labels = OrderedDict()

        # handle other separately
        geo_map.pop('other')
        other_places = place.get('other')
        if other_places:
            for geo_type, entry in other_places.items():
                mapped = self.glam_info.mapped_and_wikidata(
                    entry.get('code'), self.glam_info.mappings['places'])
                categories = mapped.get('category')  # this is a list
                if categories:
                    commonscats.extend(categories)
                wd_id = mapped.get('wikidata')
                if wd_id:
                    wikidata[geo_type] = wd_id
                labels[geo_type] = entry.get('label')

        for geo_type, mapping in geo_map.items():
            entry = place.get(geo_type)
            if not entry:
                continue
            mapped = mapping.get(entry.get('code'))
            wd_id = mapped.get('wd')
            if wd_id:
                wikidata[geo_type] = wd_id
            commonscat = mapped.get('commonscat')
            if commonscat:
                commonscats.append(commonscat)
            labels[geo_type] = entry.get('label')

        # just knowing country is pretty bad
        if len(commonscats) <= 1:
            self.meta_cats.add('needing categorisation (place)')

        return {
            'role': role,
            'wd': wikidata,
            'commonscats': commonscats,
            'labels': labels
        }