def process_all_files(base_dir=MAIN_DIR, xml_dir=XML_DIR):
    """Identify all xml files in a directory, load the data and process."""
    # Check directories
    xml_dir = os.path.join(base_dir, xml_dir)
    for directory in (base_dir, xml_dir):
        if not os.path.isdir(directory):
            raise common.MyError(
                u'The provided directory was not a valid directory: %s' %
                directory)

    # Find candidate files
    found_files = prep.find_files(path=xml_dir,
                                  file_exts=('.xml', ),
                                  subdir=False)
    pywikibot.output("Found %d .xml files" % len(found_files))

    data = {}
    for xml_file in found_files:
        try:
            test = InfoEntry(load_xml(xml_file))
        except Exception as e:
            pywikibot.output(u"Encountered error while processing %s: %s" %
                             (os.path.split(xml_file)[-1], e))
            continue
        if test.obj_id in data:
            pywikibot.output(u"Multiple files for same object: %s, %s, %s" %
                             (test.obj_id, test.source_file,
                              data[test.obj_id]['source_file']))
            continue
        data[test.obj_id] = test.output()

    out_file = os.path.join(base_dir, u'processed_lido.json')
    common.open_and_write_file(out_file, data, as_json=True)
    pywikibot.output("Created %s with %d entries" % (out_file, len(data)))
def makeAndRename(hitlist, outPath):
    """
    Given a hitlist create the info files and rename the matched file.

    @param hitlist: the output of makeHitlist
    @param outPath: the directory in which to store info + renamed files
    """
    # create outPath if it doesn't exist
    common.create_dir(outPath)

    # logfile
    logfile = os.path.join(outPath, '¤generator.log')
    with open(logfile, 'a', encoding='utf-8') as flog:
        for hit in hitlist:
            base_name = os.path.join(outPath, hit['data']['filename'])

            # output info file
            common.open_and_write_file('%s.info' % base_name,
                                       make_info_page(hit['data']))

            # rename/move matched file
            outfile = '%s%s' % (base_name, hit['ext'])
            os.rename(hit['path'], outfile)
            flog.write('%s|%s\n' %
                       (os.path.basename(hit['path']),
                        os.path.basename(outfile)))
    pywikibot.output('Created %s' % logfile)
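For reference, the hits consumed above only need three fields; a minimal sketch of one hitlist entry, with hypothetical values inferred from the accesses in makeAndRename:

    # Hypothetical hitlist entry; only these keys are read by makeAndRename.
    hit = {
        'path': '/tmp/indir/DSC_0001.tif',  # current location of the matched file
        'ext': '.tif',                      # extension kept on the renamed file
        'data': {
            'filename': 'New_base_name',    # base for the .info and media files
            # ... plus whatever further fields make_info_page() expects
        },
    }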
Example #3
def rename(base_dir, sub_cat, in_filename, log_file='move.log'):
    """
    Identify any files to replace and rename them to their commons names.

    :param base_dir: Path to directory in which replacement image files are
        found.
    :param sub_cat: The name of the subdirectory into which processed files
        should be moved.
    :param in_filename: The photoAll.csv file filtered to only contain the
        files to replace.
    :param log_file: The name of the log file to be created (in base_dir).
    """
    # Load indata
    in_filename = common.modify_path(base_dir, in_filename)
    header_check = u'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                   u'PhoSwdS|MulId|filnamn|AdrVorNameS|AdrNameS|PhoSystematikS'
    data = csv.csv_file_to_dict(in_filename,
                                "filnamn",
                                header_check,
                                keep=('PhoSystematikS', 'filnamn'),
                                delimiter='|',
                                codec='utf-16')

    # reformat the commons filenames
    url_prefix = u'https://commons.wikimedia.org/wiki/File:'
    for k, v in list(data.items()):
        if v['PhoSystematikS'].startswith(url_prefix):
            data[k] = v['PhoSystematikS'][len(url_prefix):]
        else:
            pywikibot.output("error in indatafile: %s, %s" % (k, v))
            # drop the entry so it is not treated as renameable below
            del data[k]

    # find candidate files
    candidates = prep.find_files(base_dir, ('.tif', ), subdir=False)

    # rename the files
    sub_cat = common.modify_path(base_dir, sub_cat)
    log_file = common.modify_path(base_dir, log_file)
    common.create_dir(sub_cat)
    log = []

    for candidate in candidates:
        base_name = os.path.basename(candidate)
        if base_name not in data:
            log.append('%s not found in csv file' % base_name)
            continue

        commons_name = data.pop(base_name)
        commons_name = common.modify_path(sub_cat, commons_name)
        os.rename(candidate, commons_name)

    for k in data:
        log.append('%s not found on disk' % k)

    common.open_and_write_file(log_file, '\n'.join(log), codec='utf-8')
    pywikibot.output(u'Created %s' % log_file)
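The indata for rename is a pipe-delimited, UTF-16 encoded csv whose header must match header_check; a hypothetical example (all values invented) showing how filnamn keys the lookup and PhoSystematikS carries the Commons link:

    PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|MulId|filnamn|AdrVorNameS|AdrNameS|PhoSystematikS
    1|2|...|...|...|3|dm_0001.tif|...|...|https://commons.wikimedia.org/wiki/File:Example.tif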
    def load_mappings(self, update_mappings):
        depicted_file = os.path.join(MAPPINGS_DIR, 'glass_depicted.json')
        depicted_page = 'User:Alicia_Fagerving_(WMSE)/sandbox_gn_depicted'
        if update_mappings:
            print("Updating mappings...")
            self.mappings['depicted'] = self.get_depicted_mapping(
                depicted_page)
            common.open_and_write_file(
                depicted_file, self.mappings['depicted'], as_json=True)
        else:
            self.mappings['depicted'] = common.open_and_read_file(
                depicted_file, as_json=True)
        pywikibot.output('Loaded all mappings')
Example #6
    def load_mappings(self, update_mappings):
        depicted_file = os.path.join(MAPPINGS_DIR, 'depicted.json')
        depicted_page = 'User:Alicia_Fagerving_(WMSE)/sandbox3'
        photographer_file = os.path.join(MAPPINGS_DIR, 'photographers.json')
        photographer_page = 'User:Alicia_Fagerving_(WMSE)/sandbox2'
        play_file = os.path.join(MAPPINGS_DIR, 'plays.json')
        play_page = 'User:Alicia_Fagerving_WMSE/sandbox4'
        theatre_file = os.path.join(MAPPINGS_DIR, 'theatres.json')
        theatre_page = 'User:Alicia_Fagerving_(WMSE)/sandbox'
        helleday_file = os.path.join(MAPPINGS_DIR, 'linked_helleday.json')

        if update_mappings:
            print("Updating mappings...")
            self.mappings['photographers'] = self.get_photographer_mapping(
                photographer_page)
            self.mappings['theatres'] = self.get_theatre_mapping(theatre_page)
            self.mappings['depicted'] = self.get_depicted_mapping(
                depicted_page)
            self.mappings['plays'] = self.get_play_mapping(play_page)
            self.mappings['helleday_files'] = (
                self.get_existing_helleday_files())
            common.open_and_write_file(theatre_file,
                                       self.mappings['theatres'],
                                       as_json=True)
            common.open_and_write_file(photographer_file,
                                       self.mappings['photographers'],
                                       as_json=True)
            common.open_and_write_file(depicted_file,
                                       self.mappings['depicted'],
                                       as_json=True)
            common.open_and_write_file(play_file,
                                       self.mappings['plays'],
                                       as_json=True)
            common.open_and_write_file(helleday_file,
                                       self.mappings['helleday_files'],
                                       as_json=True)
        else:
            self.mappings['photographers'] = common.open_and_read_file(
                photographer_file, as_json=True)
            self.mappings['theatres'] = common.open_and_read_file(theatre_file,
                                                                  as_json=True)
            self.mappings['depicted'] = common.open_and_read_file(
                depicted_file, as_json=True)
            self.mappings['plays'] = common.open_and_read_file(play_file,
                                                               as_json=True)
            self.mappings['helleday_files'] = common.open_and_read_file(
                helleday_file, as_json=True)

        pywikibot.output('Loaded all mappings')
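The five near-identical update/read branches above could also be table-driven; a minimal sketch, assuming the same common helpers and getter methods as in the original:

    def load_mappings(self, update_mappings):
        # (mapping key, json file, update callable); pages as in the original.
        sources = [
            ('depicted', 'depicted.json',
             lambda: self.get_depicted_mapping(
                 'User:Alicia_Fagerving_(WMSE)/sandbox3')),
            ('photographers', 'photographers.json',
             lambda: self.get_photographer_mapping(
                 'User:Alicia_Fagerving_(WMSE)/sandbox2')),
            ('plays', 'plays.json',
             lambda: self.get_play_mapping(
                 'User:Alicia_Fagerving_WMSE/sandbox4')),
            ('theatres', 'theatres.json',
             lambda: self.get_theatre_mapping(
                 'User:Alicia_Fagerving_(WMSE)/sandbox')),
            ('helleday_files', 'linked_helleday.json',
             self.get_existing_helleday_files),
        ]
        if update_mappings:
            print("Updating mappings...")
        for key, filename, fetch in sources:
            mapping_file = os.path.join(MAPPINGS_DIR, filename)
            if update_mappings:
                self.mappings[key] = fetch()
                common.open_and_write_file(
                    mapping_file, self.mappings[key], as_json=True)
            else:
                self.mappings[key] = common.open_and_read_file(
                    mapping_file, as_json=True)
        pywikibot.output('Loaded all mappings')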
    def scrape(self):
        """
        Scrape lists on commons and overwrite local files.

        If the page does not exist a warning is raised and no file is created.
        """
        if not self.page.exists():
            pywikibot.warning('The list page {} does not exist!'.format(
                self.page.title()))
        else:
            parsed_data = self.parse_entries(self.page.get())
            mapping_file = os.path.join(
                self.mapping_dir, 'commons-{}.json'.format(self.page_name))
            common.open_and_write_file(mapping_file, parsed_data, as_json=True)
            pywikibot.output('Created {}'.format(mapping_file))
    def save_as_wikitext(self, new_data, preserved_data=None, intro_text=''):
        """
        Output mapping lists in wiki format.

        @param new_data: the new (non-zero frequency) mapping data as a list of
            (frequency, mapping entry) tuples. Or a dict of such lists where
            the key is used as a section title.
        @param preserved_data: the preserved (zero frequency) mapping data as
            a list of mapping entries.
        @param intro_text: Wikitext to top the page with (may also contain
            categories)
        """
        wiki_file = os.path.join(self.wikitext_dir,
                                 'commons-{}.wiki'.format(self.page_name))
        wiki_text = self.mappings_to_wikipage(new_data, preserved_data,
                                              intro_text)
        common.open_and_write_file(wiki_file, wiki_text)
    def run(self, in_file, base_name=None):
        """Overload run to add log outputting."""
        super(NatmusInfo, self).run(in_file, base_name)

        # add/output connection logs
        self.log(u'--------------------------------------------------nsid---')
        for k, v in self.nsid.items():
            if v.get('wd'):
                self.log(u'%s: %s' % (k, v))
        self.log(u'------------------------------------------------uri_ids---')
        for k, v in self.uri_ids.items():
            if v.get('wd') and not v.get('mapped'):
                self.log(u'%s: %s' % (k, v))
            elif (not v.get('wd') and not v.get('mapped')
                    and (v.get('freq') or 0) > 5):
                self.log(u'%s: %s' % (k, v))

        if base_name:
            logfile = u'%s.log' % base_name
            common.open_and_write_file(logfile, '\n'.join(self.logger))
            pywikibot.output("Created %s" % logfile)
Example #10
def dict_to_csv_file(filename,
                     d,
                     header,
                     delimiter='|',
                     list_delimiter=';',
                     codec='utf-8'):
    """
    Save a dict as csv file given a header string encoding the columns.

    @param filename: the target file
    @param d: the dictionary to convert
    @param header: a string giving parameters to output and their order
    @param delimiter: the used delimiter (defaults to "|")
    @param list_delimiter: the used delimiter when encountering a list
    @param codec: the used encoding (defaults to "utf-8")
    @return: None
    """
    # start the output with the header row
    output = '%s\n' % header

    # find keys to compare with header (from any row)
    cols = list(d[list(d)[0]])
    header = header.split(delimiter)

    # verify all header fields are present
    if any(h not in cols for h in header):
        raise MyError("Header mismatch")

    # output rows
    for k, v in d.items():
        row = []
        for h in header:
            if isinstance(v[h], list):
                row.append(list_delimiter.join(v[h]))
            else:
                row.append(v[h])
        output += '%s\n' % delimiter.join(row)

    # write the assembled csv
    open_and_write_file(filename, output, codec=codec)
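A hedged usage example for dict_to_csv_file (data and filename hypothetical), showing how list values are flattened with list_delimiter:

    rows = {
        'a': {'id': '1', 'name': 'first', 'tags': ['x', 'y']},
        'b': {'id': '2', 'name': 'second', 'tags': ['z']},
    }
    dict_to_csv_file('out.csv', rows, 'id|name|tags')
    # out.csv now contains:
    # id|name|tags
    # 1|first|x;y
    # 2|second|z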
Example #11
    def load_mappings(self, update_mappings):
        concrete_motif_file = os.path.join(MAPPINGS_DIR, 'concrete_motif.json')
        concrete_motif_page = 'Commons:Tekniska museet/Curman/mapping title'
        geo_file = os.path.join(MAPPINGS_DIR, 'geo.json')
        geo_page = 'Commons:Tekniska museet/Curman/mapping location'
        keywords_file = os.path.join(MAPPINGS_DIR, 'keywords.json')
        keywords_page = 'Commons:Tekniska museet/Curman/mapping amnesord'

        if update_mappings:
            print("Updating mappings...")
            self.mappings['concrete_motif'] = self.get_concrete_motif_mapping(
                concrete_motif_page)
            common.open_and_write_file(concrete_motif_file,
                                       self.mappings['concrete_motif'],
                                       as_json=True)
            self.mappings['geo'] = self.get_geo_mapping(geo_page)
            common.open_and_write_file(geo_file,
                                       self.mappings['geo'],
                                       as_json=True)
            self.mappings['keywords'] = self.get_keywords_mapping(
                keywords_page)
            common.open_and_write_file(keywords_file,
                                       self.mappings['keywords'],
                                       as_json=True)
        else:
            self.mappings['concrete_motif'] = common.open_and_read_file(
                concrete_motif_file, as_json=True)
            self.mappings['geo'] = common.open_and_read_file(geo_file,
                                                             as_json=True)
            self.mappings['keywords'] = common.open_and_read_file(
                keywords_file, as_json=True)

        pywikibot.output('Loaded all mappings')
    def load_single_object(self, uuid):
        """
        Load the data for a single object.

        :param uuid: the uuid for the item
        """
        url = 'http://api.dimu.org/artifact/uuid/{}'.format(uuid)

        try:
            filepath = os.path.join(CACHE_DIR, uuid + ".json")
            if self.settings["cache"]:
                print("Loading {} from local cache".format(uuid))
                data = common.open_and_read_file(filepath, as_json=True)
            else:
                data = get_json_from_url(url)
                common.open_and_write_file(filepath, data, as_json=True)
        except requests.HTTPError as e:
            error_message = '{0}: {1}'.format(e, url)
            self.log.write(error_message)
            return None

        return data
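get_json_from_url is not part of this listing; a minimal sketch of what it presumably does, using requests (name and signature assumed from the call site above):

    import requests


    def get_json_from_url(url):
        """Fetch a url and decode the body as json, raising on HTTP errors."""
        response = requests.get(url)
        response.raise_for_status()  # surfaces as requests.HTTPError above
        return response.json()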
Example #13
    def run(self, in_file, base_name, update_mappings):
        """
        Entry point for outputting info data.

        Loads indata and any mappings to produce a make_info json file.

        @param in_file: filename (or tuple of such) containing the metadata
        @param base_name: base name to use for output
            (defaults to same as in_file)
        @param update_mappings: if mappings should be updated against online
            sources
        """
        if not base_name:
            if common.is_str(in_file):
                base_name, ext = os.path.splitext(in_file)
            else:
                raise common.MyError(
                    'A base name must be provided if multiple in_files '
                    'are provided')

        self.cwd_path = os.path.split(base_name)[0]
        raw_data = self.load_data(in_file)
        self.load_mappings(update_mappings)
        self.process_data(raw_data)
        out_data = self.make_info()

        # store output
        out_file = '%s.json' % base_name
        common.open_and_write_file(out_file, out_data, as_json=True)
        pywikibot.output('Created %s' % out_file)

        # store filenames
        out_file = '%s.filenames.txt' % base_name
        out = ''
        for k in sorted(out_data.keys()):
            out += '%s|%s\n' % (k, out_data[k]['filename'])
        common.open_and_write_file(out_file, out)
        pywikibot.output('Created %s' % out_file)
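The .filenames.txt file thus holds one pipe-separated pair per entry, e.g. (hypothetical values):

    obj-0001|Some_descriptive_filename_1
    obj-0002|Some_descriptive_filename_2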
def run(data, selection, log_file, output, media_ext):
    """Compile a preview of the make_info output for the selected entries."""
    # fall back on defaults
    data_dir = os.path.split(data)[0]
    if not selection:
        selection = os.path.join(data_dir, DEFAULTS.get('selection'))
    selection_dir = os.path.split(selection)[0]
    if not output:
        output = os.path.join(selection_dir, DEFAULTS.get('output'))

    data = common.open_and_read_file(data, as_json=True)
    demo = common.open_and_read_file(selection, as_json=True)

    # load log
    log = {}
    if log_file:
        log_text = common.open_and_read_file(log_file)
        for line in log_text.split('\n'):
            if ' -- ' in line:
                idno, reason = line.split(' -- ', 1)
                log[idno] = reason

    out = []
    for idno in sorted(demo.keys()):
        info = ''
        if idno in data:
            info = mi.make_info_page(data[idno], preview=True)
            if media_ext:
                info = info.replace('<ext>', media_ext)
        elif idno in log:
            info = log[idno]
        else:
            info = 'no make_info data found'
        out.append('== {idno} -- {reason} ==\n{info}'.format(
            reason=demo.get(idno), idno=idno, info=info))

    common.open_and_write_file(output, '\n\n'.join(out))
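The optional log file parsed at the top of run is expected to contain one `idno -- reason` pair per line; a hypothetical example:

    obj-0001 -- skipped: no matching media file
    obj-0002 -- skipped: duplicate of obj-0001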
Example #15
def save_data(out_file, text):
    return common.open_and_write_file(out_file, text)
    def save_data(self, filename=None):
        """Dump data as json blob."""
        filename = filename or self.settings.get('harvest_file')
        sorted_data = self.sort_data('glam_id')
        common.open_and_write_file(filename, sorted_data, as_json=True)
        pywikibot.output('{0} created'.format(filename))
Example #17
def save_data(data, filename=None):
    """Dump data as json blob."""
    filename = filename or OUTPUT_FILE
    common.open_and_write_file(filename, data, as_json=True)
    print("Saved file: {}.".format(filename))
    def load_mappings(self, update_mappings):
        """
        Update mapping files, load these and package appropriately.

        :param update_mappings: whether to first download the latest mappings
        """
        socken_file = os.path.join(MAPPINGS_DIR, 'socken.json')
        kommun_file = os.path.join(MAPPINGS_DIR, 'kommun.json')
        countries_file = os.path.join(MAPPINGS_DIR, 'countries_for_cats.json')
        tags_file = os.path.join(MAPPINGS_DIR, 'tags.json')
        primary_classes_file = os.path.join(
            MAPPINGS_DIR, 'primary_classes.json')
        photographer_file = os.path.join(MAPPINGS_DIR, 'photographers.json')
        kmb_files_file = os.path.join(MAPPINGS_DIR, 'kmb_files.json')
        commonscat_file = os.path.join(MAPPINGS_DIR, 'commonscat.json')
        church_file = os.path.join(MAPPINGS_DIR, 'churches.json')
        photographer_page = 'Institution:Riksantikvarieämbetet/KMB/creators'

        if update_mappings:
            query_props = {'P373': 'commonscat'}
            self.mappings['socken'] = KMBInfo.query_to_lookup(
                KMBInfo.build_query('P777', optional_props=query_props.keys()),
                props=query_props)
            self.mappings['kommun'] = KMBInfo.query_to_lookup(
                KMBInfo.build_query('P525', optional_props=query_props.keys()),
                props=query_props)
            self.mappings['photographers'] = self.get_photographer_mapping(
                photographer_page)
            self.mappings['kmb_files'] = self.get_existing_kmb_files()
            self.mappings['commonscat'] = {'bbr': {}, 'fmis': {}}
            KMBInfo.get_commonscat_from_heritage(
                'se-bbr', limit=1000,
                data=self.mappings['commonscat']['bbr'])
            KMBInfo.get_commonscat_from_heritage(
                'se-fornmin', limit=1000,
                data=self.mappings['commonscat']['fmis'])
            self.load_wikidata_bbr_fmis_commonscat()

            # dump to mappings
            common.open_and_write_file(
                socken_file, self.mappings['socken'], as_json=True)
            common.open_and_write_file(
                kommun_file, self.mappings['kommun'], as_json=True)
            common.open_and_write_file(
                photographer_file, self.mappings['photographers'],
                as_json=True)
            common.open_and_write_file(
                kmb_files_file, self.mappings['kmb_files'], as_json=True)
            common.open_and_write_file(
                commonscat_file, self.mappings['commonscat'], as_json=True)
        else:
            self.mappings['socken'] = common.open_and_read_file(
                socken_file, as_json=True)
            self.mappings['kommun'] = common.open_and_read_file(
                kommun_file, as_json=True)
            self.mappings['photographers'] = common.open_and_read_file(
                photographer_file, as_json=True)
            self.mappings['kmb_files'] = common.open_and_read_file(
                kmb_files_file, as_json=True)
            self.mappings['commonscat'] = common.open_and_read_file(
                commonscat_file, as_json=True)

        self.mappings['countries'] = common.open_and_read_file(
            countries_file, as_json=True)
        self.mappings['churches'] = common.open_and_read_file(
            church_file, as_json=True)
        self.mappings['tags'] = common.open_and_read_file(
            tags_file, as_json=True)
        self.mappings['primary_classes'] = common.open_and_read_file(
            primary_classes_file, as_json=True)

        pywikibot.output('Loaded all mappings')
Example #19
def output_blob(data, filename=None):
    """Dump data as json blob."""
    filename = filename or OUTPUT_FILE
    common.open_and_write_file(filename, data, as_json=True)
    pywikibot.output('{0} created'.format(filename))
Example #20
def load_mappings(update_mappings, mappings_dir=None,
                  load_mapping_lists=None):
    """
    Update mapping files, load these and package appropriately.

    :param update_mappings: whether to first download the latest mappings
    :param mappings_dir: path to directory in which mappings are found
    :param load_mapping_lists: the root path to any mapping_lists which should
        be loaded.
    """
    mappings = {}
    mappings_dir = mappings_dir or MAPPINGS_DIR
    common.create_dir(mappings_dir)  # ensure it exists

    parish_file = os.path.join(mappings_dir, 'socken.json')
    muni_file = os.path.join(mappings_dir, 'kommun.json')
    county_file = os.path.join(mappings_dir, 'lan.json')
    province_file = os.path.join(mappings_dir, 'province.json')
    country_file = os.path.join(mappings_dir, 'country.json')

    if update_mappings:
        query_props = {'P373': 'commonscat'}
        lang = 'sv'
        mappings['parish'] = query_to_lookup(
            build_query('P777', optional_props=query_props.keys(), lang=lang),
            props=query_props, lang=lang)
        mappings['municipality'] = query_to_lookup(
            build_query('P525', optional_props=query_props.keys(), lang=lang),
            props=query_props, lang=lang)
        mappings['county'] = query_to_lookup(
            build_query('P507', optional_props=query_props.keys(), lang=lang),
            props=query_props, lang=lang)

        # dump to mappings
        common.open_and_write_file(
            parish_file, mappings['parish'], as_json=True)
        common.open_and_write_file(
            muni_file, mappings['municipality'], as_json=True)
        common.open_and_write_file(
            county_file, mappings['county'], as_json=True)

    else:
        mappings['parish'] = common.open_and_read_file(
            parish_file, as_json=True)
        mappings['municipality'] = common.open_and_read_file(
            muni_file, as_json=True)
        mappings['county'] = common.open_and_read_file(
            county_file, as_json=True)

    # static files
    mappings['province'] = common.open_and_read_file(
        province_file, as_json=True)
    mappings['country'] = common.open_and_read_file(
        country_file, as_json=True)

    if load_mapping_lists:
        load_mapping_lists_mappings(
            mappings_dir, update_mappings, mappings, load_mapping_lists)

    pywikibot.output('Loaded all mappings')
    return mappings
Example #21
def main():
    """Request church categories and output to json."""
    church_cats = get_all_church_cats()
    church_file = os.path.join(MAPPINGS_DIR, 'churches.json')
    common.open_and_write_file(church_file, church_cats, as_json=True)
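If the module is meant to run as a script, the usual entry-point guard applies (a sketch, assuming main takes no CLI arguments):

    if __name__ == '__main__':
        main()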