def makeAndRename(hitlist, outPath):
    """
    Given a hitlist create the info files and rename the matched file.

    @param hitlist: the output of makeHitlist
    @param outPath: the directory in which to store info + renamed files
    """
    # create outPath if it doesn't exist
    common.create_dir(outPath)

    # logfile; the context manager guarantees the handle is closed even if
    # an os.rename or write below raises (the original leaked it on errors)
    logfile = os.path.join(outPath, '¤generator.log')
    with open(logfile, 'a', encoding='utf-8') as flog:
        for hit in hitlist:
            base_name = os.path.join(outPath, hit['data']['filename'])

            # output info file with the full description page wikitext
            common.open_and_write_file(
                '%s.info' % base_name, make_info_page(hit['data']))

            # rename/move matched file, keeping its original extension
            outfile = '%s%s' % (base_name, hit['ext'])
            os.rename(hit['path'], outfile)
            # log the old|new basename pair so renames can be traced back
            flog.write('%s|%s\n' % (os.path.basename(hit['path']),
                                    os.path.basename(outfile)))
    pywikibot.output('Created %s' % logfile)
def load_mappings(update_mappings, mappings_dir=None, load_mapping_lists=None):
    """
    Update mapping files, load these and package appropriately.

    :param update_mappings: whether to first download the latest mappings
    :param mappings_dir: path to directory in which mappings are found
    :param load_mapping_lists: the root path to any mapping_lists which
        should be loaded.
    """
    mappings_dir = mappings_dir or MAPPINGS_DIR
    common.create_dir(mappings_dir)  # ensure it exists

    # static files: each mapping key is backed by a <key>.json file
    mappings = {
        key: common.open_and_read_file(
            path.join(mappings_dir, '{}.json'.format(key)), as_json=True)
        for key in ('expeditions', 'museums')
    }

    if load_mapping_lists:
        load_mapping_lists_mappings(
            mappings_dir, update_mappings, mappings, load_mapping_lists)

    pywikibot.output('Loaded all mappings')
    return mappings
def rename(base_dir, sub_cat, in_filename, log_file='move.log'):
    """
    Identify any files to replace and rename them to their commons names.

    :param base_dir: Path to directory in which replacement image files are
        found.
    :param sub_cat: The name of the subdirectory into which processed files
        should be moved.
    :param in_filename: The photoAll.csv file filtered to only contain the
        files to replace.
    :param log_file: The name of the log file to be created (in base_dir).
    """
    # Load indata (pipe-delimited, utf-16 csv keyed on the local filename)
    in_filename = common.modify_path(base_dir, in_filename)
    header_check = u'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                   u'PhoSwdS|MulId|filnamn|AdrVorNameS|AdrNameS|PhoSystematikS'
    data = csv.csv_file_to_dict(in_filename, "filnamn", header_check,
                                keep=('PhoSystematikS', 'filnamn'),
                                delimiter='|', codec='utf-16')

    # reformat the commons filenames: strip the url prefix so only the
    # target commons filename remains.
    # .items() (rather than py2-only .iteritems()) works on both py2/py3
    url_prefix = u'https://commons.wikimedia.org/wiki/File:'
    bad_keys = []
    for k, v in data.items():
        if v['PhoSystematikS'].startswith(url_prefix):
            data[k] = v['PhoSystematikS'][len(url_prefix):]
        else:
            pywikibot.output("error in indatafile: %s, %s" % (k, v))
            # bugfix: previously the raw dict value was left in data, so a
            # matching file on disk would have been os.rename:d to a dict;
            # collect and drop such entries instead (deleting after the
            # loop keeps iteration safe)
            bad_keys.append(k)
    for k in bad_keys:
        del data[k]

    # find candidate files (.tif only, top directory only)
    candidates = prep.find_files(base_dir, ('.tif', ), subdir=False)

    # rename the files
    sub_cat = common.modify_path(base_dir, sub_cat)
    log_file = common.modify_path(base_dir, log_file)
    common.create_dir(sub_cat)
    log = []
    for candidate in candidates:
        base_name = os.path.basename(candidate)
        if base_name not in data:
            log.append('%s not found in csv file' % base_name)
            continue
        # pop so that leftovers can be reported as missing on disk below
        commons_name = data.pop(base_name)
        commons_name = common.modify_path(sub_cat, commons_name)
        os.rename(candidate, commons_name)

    # any keys still left were never matched by a file on disk
    for k in data.keys():
        log.append('%s not found on disk' % k)

    common.open_and_write_file(log_file, '\n'.join(log), codec='utf-8')
    pywikibot.output(u'Created %s' % log_file)
def rename(base_dir, sub_cat, in_filename, log_file='move.log'): """ Identify any files to replace and rename them to their commons names. :param base_dir: Path to directory in which replacement image files are found. :param sub_cat: The name of the subdirectory into which processed files should be moved. :param in_filename: The photoAll.csv file filtered to only contain the files to replace. :param log_file: The name of the log file to be created (in base_dir). """ # Load indata in_filename = common.modify_path(base_dir, in_filename) header_check = u'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \ u'PhoSwdS|MulId|filnamn|AdrVorNameS|AdrNameS|PhoSystematikS' data = csv.csv_file_to_dict(in_filename, "filnamn", header_check, keep=('PhoSystematikS', 'filnamn'), delimiter='|', codec='utf-16') # reformat the commons filenames url_prefix = u'https://commons.wikimedia.org/wiki/File:' for k, v in data.iteritems(): if v['PhoSystematikS'].startswith(url_prefix): data[k] = v['PhoSystematikS'][len(url_prefix):] else: pywikibot.output("error in indatafile: %s, %s" % (k, v)) # find candidate files candidates = prep.find_files(base_dir, ('.tif', ), subdir=False) # rename the files sub_cat = common.modify_path(base_dir, sub_cat) log_file = common.modify_path(base_dir, log_file) common.create_dir(sub_cat) log = [] for candidate in candidates: base_name = os.path.basename(candidate) if base_name not in data.keys(): log.append('%s not found in csv file' % base_name) continue commons_name = data.pop(base_name) commons_name = common.modify_path(sub_cat, commons_name) os.rename(candidate, commons_name) for k in data.keys(): log.append('%s not found on disk' % k) common.open_and_write_file(log_file, '\n'.join(log), codec='utf-8') pywikibot.output(u'Created %s' % log_file)
def move_matching_files(files, data, out_dir):
    """Rename and move matching files.

    @param files: output from find_all_files
    @param data: the data loaded from the csv
    @param out_dir: path of the directory where output files should be placed
    """
    # create the output directory if it doesn't exist
    common.create_dir(out_dir)
    # .items() (not py2-only .iteritems()) keeps this py2/py3 compatible,
    # matching the py3 idioms used elsewhere in this file;
    # `key in data` avoids the needless data.keys() membership scan
    for key, in_path in files.items():
        if key in data:
            # keep the original file extension on the new Commons name
            ext = os.path.splitext(in_path)[1]
            file_name_out = u'{}{}'.format(data[key][u'Commons'], ext)
            out_path = common.modify_path(out_dir, file_name_out)
            os.rename(in_path, out_path)
def __init__(self, page, parameters, header_template=None,
             row_template_name='User:André Costa (WMSE)/mapping-row',
             site=None, mapping_dir=None, wikitext_dir=None, options=None):
    """
    Initialise a mapping list.

    @param page: name of page (incl. prefixes) where list (should) live.
    @param parameters: a list of mapping keys in data to use as parameters
        or an OrderedDict with {data key: template parameter} pairs.
    @param header_template: the header template (incl. any parameters)
        and "{{ }}".
    @param row_template_name: the name of the row template.
        (Default: User:André Costa (WMSE)/mapping-row)
    @param mapping_dir: Directory in which to save scraped mappings.
        (Default: OUT_PATH)
    @param wikitext_dir: Directory in which to save created wikitext mapping
        lists. (Default: OUT_PATH)
    @param site: pywikibot.Site on which the page lives.
        (Default: Wikimedia Commons)
    @param options: dict of overriding option settings.
    """
    self.options = self.set_options(options)
    # page name is the part after the last "/" of the full page title
    self.page_name = page.rpartition('/')[2]
    self.page = MappingList.construct_page(site, page)
    self.header_template = header_template
    self.row_template = row_template_name
    self.mapping_dir = mapping_dir or OUT_PATH
    self.wikitext_dir = wikitext_dir or OUT_PATH

    # store as dict internally for uniform handling: a plain list becomes
    # an identity mapping of key -> template parameter
    if isinstance(parameters, list):
        parameters = OrderedDict((key, key) for key in parameters)
    self.parameters = parameters

    # create out_paths if they don't exist
    for out_path in (self.mapping_dir, self.wikitext_dir):
        common.create_dir(out_path)
def upload_all(base_dir, sub_dir=u'Uploaded', log_file='upload.log',
               verbose=True):
    """
    Upload the renamed files.

    We cannot just use uploader.up_all since there are no
    corresponding .info files.

    @param base_dir: directory containing the .tif files to upload
    @param sub_dir: name of the subdirectory for successfully uploaded files
    @param log_file: name of the log file (created in base_dir)
    @param verbose: whether to output the per-file upload log entry
    """
    commons = pywikibot.Site('commons', 'commons')
    commons.login()
    upload_comment = u'Source image improved by the institution #LSH'

    # create target directories if they don't exist
    done_dir = common.modify_path(base_dir, sub_dir)
    error_dir = u'%s_errors' % done_dir
    warnings_dir = u'%s_warnings' % done_dir
    common.create_dir(done_dir)
    common.create_dir(error_dir)
    common.create_dir(warnings_dir)

    # logfile (appended, so reruns keep earlier entries); the context
    # manager guarantees the handle is closed even if an upload or rename
    # raises (the original leaked it on errors)
    logfile = common.modify_path(base_dir, log_file)
    with codecs.open(logfile, 'a', 'utf-8') as flog:
        # find candidate files (.tif only, top directory only)
        media_files = prep.find_files(base_dir, ('.tif', ), subdir=False)
        for media_file in media_files:
            file_name = os.path.basename(media_file)
            result = uploader.upload_single_file(
                file_name, media_file, upload_comment, commons,
                overwrite_page_exists=True)
            # sort the processed file into the done/error/warning directory
            if result.get('error'):
                target_dir = error_dir
            elif result.get('warning'):
                target_dir = warnings_dir
            else:
                target_dir = done_dir
            if verbose:
                pywikibot.output(result.get('log'))
            flog.write(u'%s\n' % result.get('log'))
            os.rename(media_file, common.modify_path(target_dir, file_name))
            flog.flush()  # keep the log durable between uploads
    pywikibot.output(u'Created %s' % logfile)
def upload_all(base_dir, sub_dir=u'Uploaded', log_file='upload.log',
               verbose=True):
    """
    Upload the renamed files.

    We cannot just use uploader.up_all since there are no
    corresponding .info files.

    @param base_dir: directory containing the .tif files to upload
    @param sub_dir: name of the subdirectory for successfully uploaded files
    @param log_file: name of the log file (created in base_dir)
    @param verbose: whether to output the per-file upload log entry
    """
    commons = pywikibot.Site('commons', 'commons')
    commons.login()
    upload_comment = u'Source image improved by the institution #LSH'

    # create target directories if they don't exist
    done_dir = common.modify_path(base_dir, sub_dir)
    error_dir = u'%s_errors' % done_dir
    warnings_dir = u'%s_warnings' % done_dir
    common.create_dir(done_dir)
    common.create_dir(error_dir)
    common.create_dir(warnings_dir)

    # logfile (appended, so reruns keep earlier entries)
    # NOTE(review): the handle is not closed if an upload/rename raises
    logfile = common.modify_path(base_dir, log_file)
    flog = codecs.open(logfile, 'a', 'utf-8')

    # find candidate files (.tif only, top directory only)
    media_files = prep.find_files(base_dir, ('.tif', ), subdir=False)
    for media_file in media_files:
        file_name = os.path.basename(media_file)
        target_dir = None
        result = uploader.upload_single_file(
            file_name, media_file, upload_comment, commons,
            overwrite_page_exists=True)
        # sort the processed file into the done/error/warning directory
        if result.get('error'):
            target_dir = error_dir
        elif result.get('warning'):
            target_dir = warnings_dir
        else:
            target_dir = done_dir
        if verbose:
            pywikibot.output(result.get('log'))
        flog.write(u'%s\n' % result.get('log'))
        os.rename(media_file, common.modify_path(target_dir, file_name))
        flog.flush()  # keep the log durable between uploads

    flog.close()
    pywikibot.output(u'Created %s' % logfile)
def load_mappings(update_mappings, mappings_dir=None, load_mapping_lists=None):
    """
    Update mapping files, load these and package appropriately.

    :param update_mappings: whether to first download the latest mappings
    :param mappings_dir: path to directory in which mappings are found
    :param load_mapping_lists: the root path to any mapping_lists which
        should be loaded.
    """
    mappings = {}
    mappings_dir = mappings_dir or MAPPINGS_DIR
    common.create_dir(mappings_dir)  # ensure it exists

    # per-mapping cache files inside mappings_dir
    parish_file = os.path.join(mappings_dir, 'socken.json')
    muni_file = os.path.join(mappings_dir, 'kommun.json')
    county_file = os.path.join(mappings_dir, 'lan.json')
    province_file = os.path.join(mappings_dir, 'province.json')
    country_file = os.path.join(mappings_dir, 'country.json')

    if update_mappings:
        # refresh parish/municipality/county from live queries; P373
        # (commonscat) is requested as an optional extra property
        query_props = {'P373': 'commonscat'}
        lang = 'sv'
        mappings['parish'] = query_to_lookup(
            build_query('P777',
                        optional_props=query_props.keys(), lang=lang),
            props=query_props, lang=lang)
        mappings['municipality'] = query_to_lookup(
            build_query('P525',
                        optional_props=query_props.keys(), lang=lang),
            props=query_props, lang=lang)
        mappings['county'] = query_to_lookup(
            build_query('P507',
                        optional_props=query_props.keys(), lang=lang),
            props=query_props, lang=lang)

        # dump to mappings (cache the fresh results to disk)
        common.open_and_write_file(
            parish_file, mappings['parish'], as_json=True)
        common.open_and_write_file(
            muni_file, mappings['municipality'], as_json=True)
        common.open_and_write_file(
            county_file, mappings['county'], as_json=True)
    else:
        # reuse the cached copies written by a previous update run
        mappings['parish'] = common.open_and_read_file(
            parish_file, as_json=True)
        mappings['municipality'] = common.open_and_read_file(
            muni_file, as_json=True)
        mappings['county'] = common.open_and_read_file(
            county_file, as_json=True)

    # static files (always read from disk, never refreshed here)
    mappings['province'] = common.open_and_read_file(
        province_file, as_json=True)
    mappings['country'] = common.open_and_read_file(
        country_file, as_json=True)

    if load_mapping_lists:
        load_mapping_lists_mappings(
            mappings_dir, update_mappings, mappings,
            load_mapping_lists)

    pywikibot.output('Loaded all mappings')
    return mappings
def up_all_from_url(info_path, cutoff=None, target='upload_logs',
                    file_exts=None, verbose=False, test=False,
                    target_site=None, only=None, skip=None):
    """
    Upload all images provided as urls in a make_info json file.

    Media (image) files and metadata files with the expected extension .info
    should be in the same directory. Metadata files should contain the
    entirety of the desired description page (in wikitext).

    Outputs separate logfiles for files triggering errors, warnings (and
    successful) so that these can be used in latter runs.

    @param info_path: path to the make_info json file
    @param cutoff: number of files to upload (defaults to all)
    @param target: sub-directory for log files (defaults to "upload_logs")
    @param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
    @param verbose: whether to output confirmation after each upload
    @param test: set to True to test but not upload
    @param target_site: pywikibot.Site to which file should be uploaded,
        defaults to Commons.
    @param only: list of urls to upload, if provided all others will be
        skipped
    @param skip: list of urls to skip, all others will be uploaded
    """
    # set defaults unless overridden
    file_exts = file_exts or FILE_EXTS
    target_site = target_site or pywikibot.Site('commons', 'commons')
    target_site.login()

    # load info file
    info_datas = common.open_and_read_file(info_path, as_json=True)

    # create target directory if it doesn't exist
    output_dir = os.path.join(os.path.dirname(info_path), target)
    common.create_dir(output_dir)

    # create all log files
    logs = {
        'success': common.LogFile(output_dir, 'success.log'),
        'warning': common.LogFile(output_dir, 'warnings.log'),
        'error': common.LogFile(output_dir, 'errors.log'),
        'general': common.LogFile(output_dir, 'uploader.log')
    }

    # shortcut to the general/verbose logfile
    flog = logs['general']

    # filtering based on entries in only/skip
    kill_list = set()
    if only:
        kill_list |= set(info_datas.keys()) - set(only)  # difference
    if skip:
        kill_list |= set(info_datas.keys()) & set(skip)  # intersection
    for key in kill_list:
        del info_datas[key]
    flog.write_w_timestamp('{} files remain to upload after filtering'.format(
        len(info_datas)))

    counter = 1
    for url, data in info_datas.items():
        if cutoff and counter > cutoff:
            break

        # verify that the file extension is ok
        try:
            ext = verify_url_file_extension(url, file_exts)
        except common.MyError as e:
            flog.write_w_timestamp(e)
            continue

        # verify that info and output filenames are provided
        if not data['info']:
            flog.write_w_timestamp(
                '{url}: Found url missing the info field (at least)'.format(
                    url=url))
            continue
        elif not data['filename']:
            flog.write_w_timestamp(
                '{url}: Found url missing the output filename'.format(
                    url=url))
            continue

        # prepare upload
        txt = make_info_page(data)
        # bugfix: the format string hard-coded "(unknown)" while still
        # passing filename= (an unused kwarg is silently ignored by
        # str.format), so every file was uploaded as "(unknown)<ext>";
        # use the provided output filename instead
        filename = '{filename}{ext}'.format(filename=data['filename'],
                                            ext=ext)

        if test:
            # bugfix: same hard-coded "(unknown)" in the test-mode output
            pywikibot.output(
                'Test upload "{filename}" from "{url}" with the following '
                'description:\n{txt}\n'.format(filename=filename, url=url,
                                               txt=txt))
            counter += 1
            continue  # stop here if testing

        result = upload_single_file(filename, url, txt, target_site,
                                    upload_if_badprefix=True)
        if result.get('error'):
            logs['error'].write(url)
        elif result.get('warning'):
            logs['warning'].write(url)
        else:
            logs['success'].write(url)
        if verbose:
            pywikibot.output(result.get('log'))
        flog.write_w_timestamp(result.get('log'))
        counter += 1

    for log in logs.values():
        pywikibot.output(log.close_and_confirm())
def up_all(in_path, cutoff=None, target='Uploaded', file_exts=None,
           verbose=False, test=False, target_site=None, chunked=True):
    """
    Upload all matched files in the supplied directory.

    Media (image) files and metadata files with the expected extension .info
    should be in the same directory. Metadata files should contain the
    entirety of the desired description page (in wikitext).

    Moves each file to one the target folders after processing.

    @param in_path: path to directory with files to upload
    @param cutoff: number of files to upload (defaults to all)
    @param target: sub-directory for uploaded files (defaults to "Uploaded")
    @param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
    @param verbose: whether to output confirmation after each upload
    @param test: set to True to test but not upload
    @param target_site: pywikibot.Site to which file should be uploaded,
        defaults to Commons.
    @param chunked: Whether to do chunked uploading or not.
    """
    # set defaults unless overridden
    file_exts = file_exts or FILE_EXTS
    target_site = target_site or pywikibot.Site('commons', 'commons')
    target_site.login()

    # Verify in_path
    if not os.path.isdir(in_path):
        pywikibot.output('The provided in_path was not a valid '
                         'directory: %s' % in_path)
        # NOTE(review): builtin exit() depends on the site module;
        # sys.exit() would be the robust choice — confirm intent
        exit()

    # create target directories if they don't exist
    done_dir = os.path.join(in_path, target)
    error_dir = '%s_errors' % done_dir
    warnings_dir = '%s_warnings' % done_dir
    common.create_dir(done_dir)
    common.create_dir(error_dir)
    common.create_dir(warnings_dir)

    # logfile
    flog = common.LogFile(in_path, '¤uploader.log')

    # find all content files (top directory only)
    found_files = prepUpload.find_files(path=in_path, file_exts=file_exts,
                                        subdir=False)
    counter = 1
    for f in found_files:
        if cutoff and counter > cutoff:
            break
        # verify that there is a matching info file
        info_file = '%s.info' % os.path.splitext(f)[0]
        base_name = os.path.basename(f)
        base_info_name = os.path.basename(info_file)
        if not os.path.exists(info_file):
            flog.write_w_timestamp(
                '{0}: Found multimedia file without info'.format(base_name))
            continue

        # prepare upload: the .info file holds the full description page
        txt = common.open_and_read_file(info_file)

        if test:
            pywikibot.output('Test upload "%s" with the following '
                             'description:\n%s\n' % (base_name, txt))
            counter += 1
            continue
        # stop here if testing

        target_dir = None
        result = upload_single_file(base_name, f, txt, target_site,
                                    upload_if_badprefix=True,
                                    chunked=chunked)
        # sort processed file + info into done/error/warning directories
        if result.get('error'):
            target_dir = error_dir
        elif result.get('warning'):
            target_dir = warnings_dir
        else:
            target_dir = done_dir
        if verbose:
            pywikibot.output(result.get('log'))
        flog.write_w_timestamp(result.get('log'))
        os.rename(f, os.path.join(target_dir, base_name))
        os.rename(info_file, os.path.join(target_dir, base_info_name))
        counter += 1

    pywikibot.output(flog.close_and_confirm())