def __init__(self, **options):
    super(GlassInfo, self).__init__(**options)
    self.batch_cat = "{}: {}".format(BATCH_CAT, BATCH_DATE)
    self.commons = pywikibot.Site('commons', 'commons')
    self.wikidata = pywikibot.Site('wikidata', 'wikidata')
    self.log = common.LogFile('', LOGFILE)
    self.category_cache = []
def __init__(self, options): """Initialise a harvester object for a DigitaltMuseum harvest.""" if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR) # Create directory for cache if needed self.data = {} # data container for harvested info self.settings = options self.log = common.LogFile('', self.settings.get('harvest_log_file')) self.log.write_w_timestamp('Harvester started...') self.exhibition_cache = {} # cache for exhibition dimu-code, as it's
def __init__(self, **options): """Initialise a make_info object.""" batch_date = options.get('batch_label') or BATCH_DATE batch_cat = options.get('base_meta_cat') or BATCH_CAT super(KMBInfo, self).__init__(batch_cat, batch_date, **options) self.commons = pywikibot.Site('commons', 'commons') self.wikidata = pywikibot.Site('wikidata', 'wikidata') self.category_cache = {} # cache for category_exists() self.photographer_cache = {} self.log = common.LogFile('', LOGFILE)
def __init__(self, **options): """Initialise a make_info object.""" batch_date = common.pop(options, 'batch_label') or BATCH_DATE batch_cat = common.pop(options, 'base_meta_cat') or BATCH_CAT super(SMVKInfo, self).__init__(batch_cat, batch_date, **options) self.commons = pywikibot.Site('commons', 'commons') self.wikidata = pywikibot.Site('wikidata', 'wikidata') self.category_cache = {} # cache for category_exists() self.wikidata_cache = {} # cache for Wikidata results self.log = common.LogFile('', LOGFILE) self.log.write_w_timestamp('Make info started...') self.pd_year = datetime.now().year - 70
def __init__(self, **options): """Initialise a make_info object.""" self.b_settings = self.load_batch_settings(options) super(GLAMInfo, self).__init__(self.b_settings["batch_cat"], self.b_settings["batch_date"], **options) self.commons = pywikibot.Site('commons', 'commons') self.wikidata = pywikibot.Site('wikidata', 'wikidata') self.category_cache = {} # cache for category_exists() self.wikidata_cache = {} # cache for Wikidata results self.log = common.LogFile( '', self.b_settings.get("makeinfo_log_file" or LOGFILE)) self.log.write_w_timestamp('Make info started...') self.pd_year = datetime.now().year - 70
def run(start=None, end=None):
    """Get parsed data for whole kmb hitlist and store as json."""
    log = common.LogFile('', LOGFILE)
    hitlist = load_list()
    if start or end:
        hitlist = hitlist[start:end]
    data = {}
    total_count = len(hitlist)
    for count, kmb in enumerate(hitlist):
        data[kmb] = kmb_wrapper(kmb, log)
        time.sleep(THROTTLE)
        if count % 100 == 0:
            pywikibot.output(
                '{time:s} - {count:d} of {total:d} parsed'.format(
                    time=time.strftime('%H:%M:%S'), count=count,
                    total=total_count))
    output_blob(data)
    pywikibot.output(log.close_and_confirm())
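# Usage sketch (assumption, not from the source): restrict a trial run to the
# first 100 hitlist entries via the optional start/end slice; calling run()
# with no arguments parses the whole list.
run(start=0, end=100)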
def __init__(self, options): """Initialise an mapping updater for a SMVK dataset.""" self.settings = options parser = CsvParser(**self.settings) self.log = common.LogFile('', self.settings.get('mapping_log_file')) self.log.write_w_timestamp('Updater started...') self.mappings = load_mappings( update_mappings=True, mappings_dir=self.settings.get('mappings_dir')) data = parser.load_data(self.settings.get('data_file')) # load archive card data to ensure formatting is still valid archive_data = parser.load_archive_data( self.settings.get('archive_file')) self.people_to_map = Counter() self.ethnic_to_map = Counter() self.places_to_map = OrderedDict() self.keywords_to_map = Counter() self.expedition_to_match = set() self.museum_to_match = set() self.external_to_parse = set() self.parse_data(data) self.parse_archive_data(archive_data) # validate hard coded mappings for ext_id in self.external_to_parse: utils.parse_external_id(ext_id) for expedition in self.expedition_to_match: if expedition not in self.mappings.get('expeditions'): pywikibot.warning( '{} must be added to expeditions.json'.format(expedition)) museum_mapping = self.mappings.get('museums') for museum, type in self.museum_to_match: if museum not in museum_mapping: pywikibot.warning( '{} must be added to museum.json'.format(museum)) elif type not in museum_mapping.get(museum).get('known_types'): pywikibot.warning( 'The "{}" type for {} must be added the Wikimedia link ' 'templates and to museum.json'.format(type, museum)) self.dump_to_wikifiles()
def __init__(self, options): """Initialise an mapping updater for a DigitaltMuseum harvest.""" self.settings = options self.log = common.LogFile('', self.settings.get('mapping_log_file')) self.log.write_w_timestamp('Updater started...') self.mappings = load_mappings( update_mappings=True, mappings_dir=self.settings.get('mappings_dir')) harvest_data = load_harvest_data(self.settings.get('harvest_file')) self.kulturnav_hits = load_kulturnav_data() self.people_to_map = {} self.places_to_map = OrderedDict() self.subjects_to_map = Counter() self.parse_harvest_data(harvest_data) self.check_and_remove_code_place_entries() self.dump_to_wikifiles()
def get_data():
    """Get parsed data for given keywords and store as json files."""
    log = common.LogFile('', LOGFILE)
    settings = load_settings()
    keywords = settings["keywords"]
    api_key = settings["api_key"]
    for keyword in keywords:
        print("[{}] : fetching data.".format(keyword))
        filename = "results_{0}.json".format(keyword)
        results = {}
        hits_limit = 500
        start_at = 1
        counter = 0
        while True:
            url = create_url(keyword, hits_limit, start_at, api_key)
            records = get_records_from_url(url)
            total_results = get_total_hits(records)
            records = split_records(records)
            records_on_page = len(records)
            if records_on_page == 0:
                break
            for record in records:
                counter += 1
                id_no = extract_id_number(record)
                processed_dict = {'ID': id_no, 'problem': []}
                processed_record = parse_record(record, processed_dict, log)
                if id_no not in results:
                    results[id_no] = processed_record
                if counter % 100 == 0:
                    print("Processed {} out of {}".format(
                        counter, total_results))
            start_at += hits_limit
            time.sleep(THROTTLE)
        print("[{}] : fetched {} records to {}.".format(
            keyword, len(results), filename))
        save_data(results, filename)
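# Illustrative sketch (assumption) of the settings returned by
# load_settings(), based only on the two keys read above; the real settings
# file may contain more options and these values are placeholders.
example_settings = {
    'keywords': ['kyrka', 'runsten'],   # hypothetical search keywords
    'api_key': 'YOUR-API-KEY'           # placeholder, not a real key
}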
def up_all_from_url(info_path, cutoff=None, target='upload_logs',
                    file_exts=None, verbose=False, test=False,
                    target_site=None, only=None, skip=None):
    """
    Upload all images provided as urls in a make_info json file.

    Media (image) files and metadata files with the expected extension
    .info should be in the same directory. Metadata files should contain
    the entirety of the desired description page (in wikitext).

    Outputs separate logfiles for files triggering errors, warnings (and
    successes) so that these can be used in later runs.

    @param info_path: path to the make_info json file
    @param cutoff: number of files to upload (defaults to all)
    @param target: sub-directory for log files (defaults to "upload_logs")
    @param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
    @param verbose: whether to output confirmation after each upload
    @param test: set to True to test but not upload
    @param target_site: pywikibot.Site to which the file should be uploaded,
        defaults to Commons.
    @param only: list of urls to upload, if provided all others are skipped
    @param skip: list of urls to skip, all others are uploaded
    """
    # set defaults unless overridden
    file_exts = file_exts or FILE_EXTS
    target_site = target_site or pywikibot.Site('commons', 'commons')
    target_site.login()

    # load info file
    info_datas = common.open_and_read_file(info_path, as_json=True)

    # create target directory if it doesn't exist
    output_dir = os.path.join(os.path.dirname(info_path), target)
    common.create_dir(output_dir)

    # create all log files
    logs = {
        'success': common.LogFile(output_dir, 'success.log'),
        'warning': common.LogFile(output_dir, 'warnings.log'),
        'error': common.LogFile(output_dir, 'errors.log'),
        'general': common.LogFile(output_dir, 'uploader.log')
    }
    # shortcut to the general/verbose logfile
    flog = logs['general']

    # filtering based on entries in only/skip
    kill_list = set()
    if only:
        kill_list |= set(info_datas.keys()) - set(only)  # difference
    if skip:
        kill_list |= set(info_datas.keys()) & set(skip)  # intersection
    for key in kill_list:
        del info_datas[key]
    flog.write_w_timestamp(
        '{} files remain to upload after filtering'.format(len(info_datas)))

    counter = 1
    for url, data in info_datas.items():
        if cutoff and counter > cutoff:
            break

        # verify that the file extension is ok
        try:
            ext = verify_url_file_extension(url, file_exts)
        except common.MyError as e:
            flog.write_w_timestamp(e)
            continue

        # verify that info and output filenames are provided
        if not data['info']:
            flog.write_w_timestamp(
                '{url}: Found url missing the info field (at least)'.format(
                    url=url))
            continue
        elif not data['filename']:
            flog.write_w_timestamp(
                '{url}: Found url missing the output filename'.format(
                    url=url))
            continue

        # prepare upload
        txt = make_info_page(data)
        filename = '{filename}{ext}'.format(
            filename=data['filename'], ext=ext)

        if test:
            pywikibot.output(
                'Test upload "{filename}" from "{url}" with the following '
                'description:\n{txt}\n'.format(
                    filename=filename, url=url, txt=txt))
            counter += 1
            continue  # stop here if testing

        result = upload_single_file(
            filename, url, txt, target_site, upload_if_badprefix=True)
        if result.get('error'):
            logs['error'].write(url)
        elif result.get('warning'):
            logs['warning'].write(url)
        else:
            logs['success'].write(url)
        if verbose:
            pywikibot.output(result.get('log'))
        flog.write_w_timestamp(result.get('log'))
        counter += 1

    for log in logs.values():
        pywikibot.output(log.close_and_confirm())
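# Illustrative sketch (assumption) of a make_info json entry as consumed
# above: each key is a source url and each value must provide at least
# 'info' (the full description page in wikitext) and 'filename' (the target
# name without extension; the extension is derived from the url). The url,
# name and wikitext below are hypothetical.
example_info_datas = {
    'http://example.org/images/1234.jpg': {           # hypothetical url
        'filename': 'Example object 1234',             # hypothetical name
        'info': '{{Information|description=...}}'      # wikitext description
    }
}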
def up_all(in_path, cutoff=None, target='Uploaded', file_exts=None,
           verbose=False, test=False, target_site=None, chunked=True):
    """
    Upload all matched files in the supplied directory.

    Media (image) files and metadata files with the expected extension
    .info should be in the same directory. Metadata files should contain
    the entirety of the desired description page (in wikitext).

    Moves each file to one of the target folders after processing.

    @param in_path: path to directory with files to upload
    @param cutoff: number of files to upload (defaults to all)
    @param target: sub-directory for uploaded files (defaults to "Uploaded")
    @param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
    @param verbose: whether to output confirmation after each upload
    @param test: set to True to test but not upload
    @param target_site: pywikibot.Site to which the file should be uploaded,
        defaults to Commons.
    @param chunked: whether to do chunked uploading or not.
    """
    # set defaults unless overridden
    file_exts = file_exts or FILE_EXTS
    target_site = target_site or pywikibot.Site('commons', 'commons')
    target_site.login()

    # verify in_path
    if not os.path.isdir(in_path):
        pywikibot.output('The provided in_path was not a valid '
                         'directory: %s' % in_path)
        exit()

    # create target directories if they don't exist
    done_dir = os.path.join(in_path, target)
    error_dir = '%s_errors' % done_dir
    warnings_dir = '%s_warnings' % done_dir
    common.create_dir(done_dir)
    common.create_dir(error_dir)
    common.create_dir(warnings_dir)

    # logfile
    flog = common.LogFile(in_path, '¤uploader.log')

    # find all content files
    found_files = prepUpload.find_files(
        path=in_path, file_exts=file_exts, subdir=False)

    counter = 1
    for f in found_files:
        if cutoff and counter > cutoff:
            break

        # verify that there is a matching info file
        info_file = '%s.info' % os.path.splitext(f)[0]
        base_name = os.path.basename(f)
        base_info_name = os.path.basename(info_file)
        if not os.path.exists(info_file):
            flog.write_w_timestamp(
                '{0}: Found multimedia file without info'.format(base_name))
            continue

        # prepare upload
        txt = common.open_and_read_file(info_file)

        if test:
            pywikibot.output('Test upload "%s" with the following '
                             'description:\n%s\n' % (base_name, txt))
            counter += 1
            continue  # stop here if testing

        target_dir = None
        result = upload_single_file(
            base_name, f, txt, target_site,
            upload_if_badprefix=True, chunked=chunked)
        if result.get('error'):
            target_dir = error_dir
        elif result.get('warning'):
            target_dir = warnings_dir
        else:
            target_dir = done_dir
        if verbose:
            pywikibot.output(result.get('log'))
        flog.write_w_timestamp(result.get('log'))
        os.rename(f, os.path.join(target_dir, base_name))
        os.rename(info_file, os.path.join(target_dir, base_info_name))
        counter += 1

    pywikibot.output(flog.close_and_confirm())
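# Usage sketch (assumption, not from the source): a dry run over a local
# directory, limited to the first five matched files; the path is
# hypothetical and dropping test=True would perform the actual uploads.
up_all('/path/to/batch_dir', cutoff=5, test=True)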