def process_all_files(base_dir=MAIN_DIR, xml_dir=XML_DIR):
    """Identify all xml files in a directory, load the data and process."""
    # Check directories
    xml_dir = os.path.join(base_dir, xml_dir)
    for directory in (base_dir, xml_dir):
        if not os.path.isdir(directory):
            raise common.MyError(
                u'The provided directory was not a valid directory: %s'
                % directory)

    # Find candidate files
    found_files = prep.find_files(
        path=xml_dir, file_exts=('.xml', ), subdir=False)
    pywikibot.output("Found %d .xml files" % len(found_files))

    data = {}
    for xml_file in found_files:
        try:
            test = InfoEntry(load_xml(xml_file))
        except Exception as e:
            pywikibot.output(u"Encountered error while processing %s: %s"
                             % (os.path.split(xml_file)[-1], e))
            continue
        if test.obj_id in data:
            pywikibot.output(u"Multiple files for same object: %s, %s, %s"
                             % (test.obj_id, test.source_file,
                                data[test.obj_id]['source_file']))
            continue
        data[test.obj_id] = test.output()

    out_file = os.path.join(base_dir, u'processed_lido.json')
    common.open_and_write_file(out_file, data, as_json=True)
    pywikibot.output("Created %s with %d entries" % (out_file, len(data)))
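
# A minimal usage sketch, assuming MAIN_DIR and XML_DIR are module constants
# and that base_dir holds a LIDO XML dump in an 'xml' subdirectory (the
# paths below are hypothetical):
def _demo_process_all_files():
    """Process every .xml file under /data/lido/xml into one json blob."""
    process_all_files(base_dir='/data/lido', xml_dir='xml')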
def makeAndRename(hitlist, outPath):
    """
    Given a hitlist create the info files and rename the matched file.

    @param hitlist: the output of makeHitlist
    @param outPath: the directory in which to store info + renamed files
    """
    # create outPath if it doesn't exist
    common.create_dir(outPath)

    # logfile
    logfile = os.path.join(outPath, '¤generator.log')
    flog = open(logfile, 'a', encoding='utf-8')

    for hit in hitlist:
        base_name = os.path.join(outPath, hit['data']['filename'])

        # output info file
        common.open_and_write_file(
            '%s.info' % base_name, make_info_page(hit['data']))

        # rename/move matched file
        outfile = '%s%s' % (base_name, hit['ext'])
        os.rename(hit['path'], outfile)

        flog.write('%s|%s\n' % (os.path.basename(hit['path']),
                                os.path.basename(outfile)))
    flog.close()
    pywikibot.output('Created %s' % logfile)
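
# Shape of a single hit expected by makeAndRename, inferred from the key
# accesses above; the real makeHitlist output may carry more keys under
# 'data' (whatever make_info_page needs), and the values below are
# hypothetical:
def _demo_make_and_rename():
    """Write one .info file and move the matched image into ./connected."""
    hitlist = [{
        'path': 'indir/IMG_0001.tif',               # current file location
        'ext': '.tif',                              # extension, dot included
        'data': {'filename': 'Chair_nr_1_-_TEKM'}   # feeds make_info_page
    }]
    makeAndRename(hitlist, outPath='connected')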
def rename(base_dir, sub_cat, in_filename, log_file='move.log'):
    """
    Identify any files to replace and rename them to their commons names.

    :param base_dir: Path to directory in which replacement image files are
        found.
    :param sub_cat: The name of the subdirectory into which processed files
        should be moved.
    :param in_filename: The photoAll.csv file filtered to only contain the
        files to replace.
    :param log_file: The name of the log file to be created (in base_dir).
    """
    # Load indata
    in_filename = common.modify_path(base_dir, in_filename)
    header_check = u'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                   u'PhoSwdS|MulId|filnamn|AdrVorNameS|AdrNameS|PhoSystematikS'
    data = csv.csv_file_to_dict(in_filename, "filnamn", header_check,
                                keep=('PhoSystematikS', 'filnamn'),
                                delimiter='|', codec='utf-16')

    # reformat the commons filenames
    url_prefix = u'https://commons.wikimedia.org/wiki/File:'
    for k, v in data.items():
        if v['PhoSystematikS'].startswith(url_prefix):
            data[k] = v['PhoSystematikS'][len(url_prefix):]
        else:
            pywikibot.output("error in indata file: %s, %s" % (k, v))

    # find candidate files
    candidates = prep.find_files(base_dir, ('.tif', ), subdir=False)

    # rename the files
    sub_cat = common.modify_path(base_dir, sub_cat)
    log_file = common.modify_path(base_dir, log_file)
    common.create_dir(sub_cat)
    log = []
    for candidate in candidates:
        base_name = os.path.basename(candidate)
        if base_name not in data:
            log.append('%s not found in csv file' % base_name)
            continue
        commons_name = data.pop(base_name)
        commons_name = common.modify_path(sub_cat, commons_name)
        os.rename(candidate, commons_name)

    for k in data:
        log.append('%s not found on disk' % k)

    common.open_and_write_file(log_file, '\n'.join(log), codec='utf-8')
    pywikibot.output(u'Created %s' % log_file)
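
# A usage sketch: the csv is expected to map local tif names ('filnamn') to
# 'PhoSystematikS' values holding full Commons file URLs. The directory and
# file names below are hypothetical:
def _demo_rename():
    """Move replacement tifs into base_dir/replaced, named per Commons."""
    rename(base_dir='/data/smvk', sub_cat='replaced',
           in_filename='photoAll_filtered.csv')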
def load_mappings(self, update_mappings):
    """Update the depicted mapping from its wiki page or load the local copy."""
    depicted_file = os.path.join(MAPPINGS_DIR, 'glass_depicted.json')
    depicted_page = 'User:Alicia_Fagerving_(WMSE)/sandbox_gn_depicted'

    if update_mappings:
        print("Updating mappings...")
        self.mappings['depicted'] = self.get_depicted_mapping(depicted_page)
        common.open_and_write_file(
            depicted_file, self.mappings['depicted'], as_json=True)
    else:
        self.mappings['depicted'] = common.open_and_read_file(
            depicted_file, as_json=True)

    pywikibot.output('Loaded all mappings')
def load_mappings(self, update_mappings):
    """Update mapping files from their wiki pages or load the local copies."""
    depicted_file = os.path.join(MAPPINGS_DIR, 'depicted.json')
    depicted_page = 'User:Alicia_Fagerving_(WMSE)/sandbox3'
    photographer_file = os.path.join(MAPPINGS_DIR, 'photographers.json')
    photographer_page = 'User:Alicia_Fagerving_(WMSE)/sandbox2'
    play_file = os.path.join(MAPPINGS_DIR, 'plays.json')
    play_page = 'User:Alicia_Fagerving_WMSE/sandbox4'
    theatre_file = os.path.join(MAPPINGS_DIR, 'theatres.json')
    theatre_page = 'User:Alicia_Fagerving_(WMSE)/sandbox'
    helleday_file = os.path.join(MAPPINGS_DIR, 'linked_helleday.json')

    if update_mappings:
        print("Updating mappings...")
        self.mappings['photographers'] = self.get_photographer_mapping(
            photographer_page)
        self.mappings['theatres'] = self.get_theatre_mapping(theatre_page)
        self.mappings['depicted'] = self.get_depicted_mapping(depicted_page)
        self.mappings['plays'] = self.get_play_mapping(play_page)
        self.mappings['helleday_files'] = self.get_existing_helleday_files()
        common.open_and_write_file(
            theatre_file, self.mappings['theatres'], as_json=True)
        common.open_and_write_file(
            photographer_file, self.mappings['photographers'], as_json=True)
        common.open_and_write_file(
            depicted_file, self.mappings['depicted'], as_json=True)
        common.open_and_write_file(
            play_file, self.mappings['plays'], as_json=True)
        common.open_and_write_file(
            helleday_file, self.mappings['helleday_files'], as_json=True)
    else:
        self.mappings['photographers'] = common.open_and_read_file(
            photographer_file, as_json=True)
        self.mappings['theatres'] = common.open_and_read_file(
            theatre_file, as_json=True)
        self.mappings['depicted'] = common.open_and_read_file(
            depicted_file, as_json=True)
        self.mappings['plays'] = common.open_and_read_file(
            play_file, as_json=True)
        self.mappings['helleday_files'] = common.open_and_read_file(
            helleday_file, as_json=True)

    pywikibot.output('Loaded all mappings')
def scrape(self):
    """
    Scrape lists on commons and overwrite local files.

    If the page does not exist a warning is raised and no file is created.
    """
    if not self.page.exists():
        pywikibot.warning('The list page {} does not exist!'.format(
            self.page.title()))
    else:
        parsed_data = self.parse_entries(self.page.get())
        mapping_file = os.path.join(
            self.mapping_dir, 'commons-{}.json'.format(self.page_name))
        common.open_and_write_file(mapping_file, parsed_data, as_json=True)
        pywikibot.output('Created {}'.format(mapping_file))
def save_as_wikitext(self, new_data, preserved_data=None, intro_text=''):
    """
    Output mapping lists in wiki format.

    @param new_data: the new (non-zero frequency) mapping data as a list of
        (frequency, mapping entry) tuples. Or a dict of such lists where the
        key is used as a section title.
    @param preserved_data: the preserved (zero frequency) mapping data as a
        list of mapping entries.
    @param intro_text: Wikitext to top the page with (may also contain
        categories)
    """
    wiki_file = os.path.join(
        self.wikitext_dir, 'commons-{}.wiki'.format(self.page_name))
    wiki_text = self.mappings_to_wikipage(
        new_data, preserved_data, intro_text)
    common.open_and_write_file(wiki_file, wiki_text)
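
# The two methods above appear to form a round trip for on-wiki mapping
# lists: scrape() reads the list page into
# mapping_dir/commons-<page_name>.json, while save_as_wikitext() renders
# updated mappings back to wikitext_dir/commons-<page_name>.wiki for manual
# upload. Per the docstring, new_data is (frequency, mapping entry) tuples;
# the entry contents below are made up for illustration:
#
#     new_data = [(12, {'name': 'Foo', 'category': 'Foo bar'}),
#                 (3, {'name': 'Baz', 'category': ''})]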
def run(self, in_file, base_name=None):
    """Overload run to add log outputting."""
    super(NatmusInfo, self).run(in_file, base_name)

    # add/output connection logs
    self.log(u'--------------------------------------------------nsid---')
    for k, v in self.nsid.items():
        if v.get('wd'):
            self.log(u'%s: %s' % (k, v))

    self.log(u'------------------------------------------------uri_ids---')
    for k, v in self.uri_ids.items():
        if v.get('wd') and not v.get('mapped'):
            self.log(u'%s: %s' % (k, v))
        elif not v.get('wd') and not v.get('mapped') \
                and v.get('freq', 0) > 5:
            self.log(u'%s: %s' % (k, v))

    if base_name:
        logfile = u'%s.log' % base_name
        common.open_and_write_file(logfile, '\n'.join(self.logger))
        pywikibot.output("Created %s" % logfile)
def dict_to_csv_file(filename, d, header, delimiter='|', list_delimiter=';',
                     codec='utf-8'):
    """
    Save a dict as csv file given a header string encoding the columns.

    @param filename: the target file
    @param d: the dictionary to convert
    @param header: a string giving parameters to output and their order
    @param delimiter: the used delimiter (defaults to "|")
    @param list_delimiter: the used delimiter when encountering a list
    @param codec: the used encoding (defaults to "utf-8")
    @return: None
    """
    # build the output, starting with the header row
    output = '%s\n' % header

    # find keys to compare with header (from any row)
    cols = list(d[list(d)[0]])
    header = header.split(delimiter)

    # verify all header fields are present
    if any(h not in cols for h in header):
        raise MyError("Header mismatch")

    # output rows
    for k, v in d.items():
        row = []
        for h in header:
            if isinstance(v[h], list):
                row.append(list_delimiter.join(v[h]))
            else:
                row.append(v[h])
        output += '%s\n' % delimiter.join(row)

    # write the file
    open_and_write_file(filename, output, codec=codec)
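
# A usage sketch for dict_to_csv_file; the rows and header are made up. Note
# that every value must be a string or a list of strings, since rows are
# built with str.join:
def _demo_dict_to_csv_file():
    """Write a two-row, three-column csv with a list-valued column."""
    rows = {
        'a': {'id': 'a', 'name': 'Alpha', 'tags': ['x', 'y']},  # -> 'x;y'
        'b': {'id': 'b', 'name': 'Beta', 'tags': []},           # -> ''
    }
    dict_to_csv_file('demo.csv', rows, 'id|name|tags')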
def load_mappings(self, update_mappings):
    """Update mapping files from their wiki pages or load the local copies."""
    concrete_motif_file = os.path.join(MAPPINGS_DIR, 'concrete_motif.json')
    concrete_motif_page = 'Commons:Tekniska museet/Curman/mapping title'
    geo_file = os.path.join(MAPPINGS_DIR, 'geo.json')
    geo_page = 'Commons:Tekniska museet/Curman/mapping location'
    keywords_file = os.path.join(MAPPINGS_DIR, 'keywords.json')
    keywords_page = 'Commons:Tekniska museet/Curman/mapping amnesord'

    if update_mappings:
        print("Updating mappings...")
        self.mappings['concrete_motif'] = self.get_concrete_motif_mapping(
            concrete_motif_page)
        common.open_and_write_file(
            concrete_motif_file, self.mappings['concrete_motif'],
            as_json=True)
        self.mappings['geo'] = self.get_geo_mapping(geo_page)
        common.open_and_write_file(
            geo_file, self.mappings['geo'], as_json=True)
        self.mappings['keywords'] = self.get_keywords_mapping(keywords_page)
        common.open_and_write_file(
            keywords_file, self.mappings['keywords'], as_json=True)
    else:
        self.mappings['concrete_motif'] = common.open_and_read_file(
            concrete_motif_file, as_json=True)
        self.mappings['geo'] = common.open_and_read_file(
            geo_file, as_json=True)
        self.mappings['keywords'] = common.open_and_read_file(
            keywords_file, as_json=True)

    pywikibot.output('Loaded all mappings')
def load_single_object(self, uuid):
    """
    Load the data for a single object.

    :param uuid: the uuid for the item
    """
    url = 'http://api.dimu.org/artifact/uuid/{}'.format(uuid)
    try:
        filepath = os.path.join(CACHE_DIR, uuid + ".json")
        if self.settings["cache"]:
            print("Loading {} from local cache".format(uuid))
            data = common.open_and_read_file(filepath, as_json=True)
        else:
            data = get_json_from_url(url)
            common.open_and_write_file(filepath, data, as_json=True)
    except requests.HTTPError as e:
        error_message = '{0}: {1}'.format(e, url)
        self.log.write(error_message)
        return None
    return data
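
# Usage sketch; 'harvester' and the uuid are made-up placeholders. Every
# fresh fetch also writes to CACHE_DIR, so with settings['cache'] set,
# repeated runs read the local copy instead of hitting api.dimu.org again:
#
#     data = harvester.load_single_object(
#         '012345AB-CDEF-0123-4567-89ABCDEF0123')
#     if data is None:
#         pass  # the HTTP error has already been logged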
def run(self, in_file, base_name, update_mappings):
    """
    Entry point for outputting info data.

    Loads indata and any mappings to produce a make_info json file.

    @param in_file: filename (or tuple of such) containing the metadata
    @param base_name: base name to use for output (defaults to same as
        in_file)
    @param update_mappings: whether mappings should be updated against
        online sources
    """
    if not base_name:
        if common.is_str(in_file):
            base_name, ext = os.path.splitext(in_file)
        else:
            raise common.MyError(
                'A base name must be provided if multiple in_files '
                'are provided')

    self.cwd_path = os.path.split(base_name)[0]
    raw_data = self.load_data(in_file)
    self.load_mappings(update_mappings)
    self.process_data(raw_data)
    out_data = self.make_info()

    # store output
    out_file = '%s.json' % base_name
    common.open_and_write_file(out_file, out_data, as_json=True)
    pywikibot.output('Created %s' % out_file)

    # store filenames
    out_file = '%s.filenames.txt' % base_name
    out = ''
    for k in sorted(out_data.keys()):
        out += '%s|%s\n' % (k, out_data[k]['filename'])
    common.open_and_write_file(out_file, out)
    pywikibot.output('Created %s' % out_file)
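
# A minimal invocation sketch; 'info_maker' and the file name are
# hypothetical. With base_name=None the output names are derived from
# in_file, giving metadata.json and metadata.filenames.txt next to the
# indata:
#
#     info_maker.run('indata/metadata.csv', None, update_mappings=True)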
def run(data, selection, log_file, output, media_ext):
    """Compile preview info for the selected entries and write to output."""
    # fall back on defaults
    data_dir = os.path.split(data)[0]
    if not selection:
        selection = os.path.join(data_dir, DEFAULTS.get('selection'))
    selection_dir = os.path.split(selection)[0]
    if not output:
        output = os.path.join(selection_dir, DEFAULTS.get('output'))

    data = common.open_and_read_file(data, as_json=True)
    demo = common.open_and_read_file(selection, as_json=True)

    # load log
    log = {}
    if log_file:
        log_text = common.open_and_read_file(log_file)
        for line in log_text.split('\n'):
            if ' -- ' in line:
                idno, reason = line.split(' -- ', 1)
                log[idno] = reason

    out = []
    for idno in sorted(demo.keys()):
        info = ''
        if idno in data:
            info = mi.make_info_page(data[idno], preview=True)
            if media_ext:
                info = info.replace('<ext>', media_ext)
        elif idno in log:
            info = log[idno]
        else:
            info = 'no make_info data found'
        out.append('== {idno} -- {reason} ==\n{info}'.format(
            reason=demo.get(idno), idno=idno, info=info))

    common.open_and_write_file(output, '\n\n'.join(out))
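
# Expected format of the optional log file read above: one entry per line,
# with the id and the skip reason separated by ' -- ' (the example lines
# are made up):
#
#     TEKM.0001 -- no image file found
#     TEKM.0002 -- duplicate of TEKM.0001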
def save_data(out_file, text):
    """Write text to out_file; a thin wrapper around open_and_write_file."""
    return common.open_and_write_file(out_file, text)
def save_data(self, filename=None):
    """Dump data as json blob."""
    filename = filename or self.settings.get('harvest_file')
    sorted_data = self.sort_data('glam_id')
    common.open_and_write_file(filename, sorted_data, as_json=True)
    pywikibot.output('{0} created'.format(filename))
def save_data(data, filename=None):
    """Dump data as json blob."""
    filename = filename or OUTPUT_FILE
    common.open_and_write_file(filename, data, as_json=True)
    print("Saved file: {}.".format(filename))
def load_mappings(self, update_mappings): """ Update mapping files, load these and package appropriately. :param update_mappings: whether to first download the latest mappings """ socken_file = os.path.join(MAPPINGS_DIR, 'socken.json') kommun_file = os.path.join(MAPPINGS_DIR, 'kommun.json') countries_file = os.path.join(MAPPINGS_DIR, 'countries_for_cats.json') tags_file = os.path.join(MAPPINGS_DIR, 'tags.json') primary_classes_file = os.path.join( MAPPINGS_DIR, 'primary_classes.json') photographer_file = os.path.join(MAPPINGS_DIR, 'photographers.json') kmb_files_file = os.path.join(MAPPINGS_DIR, 'kmb_files.json') commonscat_file = os.path.join(MAPPINGS_DIR, 'commonscat.json') church_file = os.path.join(MAPPINGS_DIR, 'churches.json') photographer_page = 'Institution:Riksantikvarieämbetet/KMB/creators' if update_mappings: query_props = {'P373': 'commonscat'} self.mappings['socken'] = KMBInfo.query_to_lookup( KMBInfo.build_query('P777', optional_props=query_props.keys()), props=query_props) self.mappings['kommun'] = KMBInfo.query_to_lookup( KMBInfo.build_query('P525', optional_props=query_props.keys()), props=query_props) self.mappings['photographers'] = self.get_photographer_mapping( photographer_page) self.mappings['kmb_files'] = self.get_existing_kmb_files() self.mappings['commonscat'] = {'bbr': {}, 'fmis': {}} KMBInfo.get_commonscat_from_heritage( 'se-bbr', limit=1000, data=self.mappings['commonscat']['bbr']) KMBInfo.get_commonscat_from_heritage( 'se-fornmin', limit=1000, data=self.mappings['commonscat']['fmis']) self.load_wikidata_bbr_fmis_commonscat() # dump to mappings common.open_and_write_file( socken_file, self.mappings['socken'], as_json=True) common.open_and_write_file( kommun_file, self.mappings['kommun'], as_json=True) common.open_and_write_file( photographer_file, self.mappings['photographers'], as_json=True) common.open_and_write_file( kmb_files_file, self.mappings['kmb_files'], as_json=True) common.open_and_write_file( commonscat_file, self.mappings['commonscat'], as_json=True) else: self.mappings['socken'] = common.open_and_read_file( socken_file, as_json=True) self.mappings['kommun'] = common.open_and_read_file( kommun_file, as_json=True) self.mappings['photographers'] = common.open_and_read_file( photographer_file, as_json=True) self.mappings['kmb_files'] = common.open_and_read_file( kmb_files_file, as_json=True) self.mappings['commonscat'] = common.open_and_read_file( commonscat_file, as_json=True) self.mappings['countries'] = common.open_and_read_file( countries_file, as_json=True) self.mappings['churches'] = common.open_and_read_file( church_file, as_json=True) self.mappings['tags'] = common.open_and_read_file( tags_file, as_json=True) self.mappings['primary_classes'] = common.open_and_read_file( primary_classes_file, as_json=True) pywikibot.output('Loaded all mappings')
def output_blob(data, filename=None):
    """Dump data as json blob."""
    filename = filename or OUTPUT_FILE
    common.open_and_write_file(filename, data, as_json=True)
    pywikibot.output('{0} created'.format(filename))
def load_mappings(update_mappings, mappings_dir=None,
                  load_mapping_lists=None):
    """
    Update mapping files, load these and package appropriately.

    :param update_mappings: whether to first download the latest mappings
    :param mappings_dir: path to directory in which mappings are found
    :param load_mapping_lists: the root path to any mapping_lists which
        should be loaded.
    """
    mappings = {}
    mappings_dir = mappings_dir or MAPPINGS_DIR
    common.create_dir(mappings_dir)  # ensure it exists

    parish_file = os.path.join(mappings_dir, 'socken.json')
    muni_file = os.path.join(mappings_dir, 'kommun.json')
    county_file = os.path.join(mappings_dir, 'lan.json')
    province_file = os.path.join(mappings_dir, 'province.json')
    country_file = os.path.join(mappings_dir, 'country.json')

    if update_mappings:
        query_props = {'P373': 'commonscat'}
        lang = 'sv'
        mappings['parish'] = query_to_lookup(
            build_query('P777', optional_props=query_props.keys(),
                        lang=lang),
            props=query_props, lang=lang)
        mappings['municipality'] = query_to_lookup(
            build_query('P525', optional_props=query_props.keys(),
                        lang=lang),
            props=query_props, lang=lang)
        mappings['county'] = query_to_lookup(
            build_query('P507', optional_props=query_props.keys(),
                        lang=lang),
            props=query_props, lang=lang)

        # dump to mappings
        common.open_and_write_file(
            parish_file, mappings['parish'], as_json=True)
        common.open_and_write_file(
            muni_file, mappings['municipality'], as_json=True)
        common.open_and_write_file(
            county_file, mappings['county'], as_json=True)
    else:
        mappings['parish'] = common.open_and_read_file(
            parish_file, as_json=True)
        mappings['municipality'] = common.open_and_read_file(
            muni_file, as_json=True)
        mappings['county'] = common.open_and_read_file(
            county_file, as_json=True)

    # static files
    mappings['province'] = common.open_and_read_file(
        province_file, as_json=True)
    mappings['country'] = common.open_and_read_file(
        country_file, as_json=True)

    if load_mapping_lists:
        load_mapping_lists_mappings(
            mappings_dir, update_mappings, mappings, load_mapping_lists)

    pywikibot.output('Loaded all mappings')
    return mappings
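
# Usage sketch: refresh and cache the Wikidata-backed lookups, then also
# load any on-wiki mapping lists stored under the given root (the page root
# below is hypothetical):
def _demo_load_mappings():
    """Refresh parish/municipality/county lookups plus the static files."""
    return load_mappings(
        update_mappings=True,
        load_mapping_lists='Commons:Some_project/mappings')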
def main():
    """Request church categories and output to json."""
    church_cats = get_all_church_cats()
    church_file = os.path.join(MAPPINGS_DIR, 'churches.json')
    common.open_and_write_file(church_file, church_cats, as_json=True)