def makeHitlist(filenames_file=None): """ Goes through the allowed filenames and builds up a tree structure {directory: [filenames]} as well as a look-up dictionary for filenames to phoId {MulDateiS: {phoMull, filename, ext}} :param filenames_file: filenames data file :return: dict, dict """ # set defaults unless overridden filenames_file = filenames_file or FILENAMES # load filenames file filenames_header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext' filenames = helpers.csvFileToDict(filenames_file, 'PhoId', filenames_header) tree = {} name_to_pho = {} for pho_id, v in filenames.iteritems(): old_name = v['MulDateiS'] path = v['MulPfadS'].replace('\\', os.sep) # windows -> current os if path not in tree.keys(): tree[path] = [] tree[path].append(old_name) name_to_pho[old_name] = {'phoMull': u'%s:%s' % (pho_id, v['MulId']), 'filename': v['filename'], 'ext': v['ext']} return (tree, name_to_pho)
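# Not part of the original module: a minimal sketch of the two structures
# makeHitlist() builds. The sample row and pho_id below are hypothetical; the
# real data comes from the pipe-separated filenames file loaded via
# helpers.csvFileToDict.
import os

sample_row = {
    'MulId': u'123',
    'MulPfadS': u'FOTO\\HIGHRES',   # stored with Windows separators
    'MulDateiS': u'ABC001',
    'filename': u'Some object - LSH - 456',
    'ext': u'tif',
}
pho_id = u'456'

tree = {}
name_to_pho = {}
path = sample_row['MulPfadS'].replace('\\', os.sep)   # windows -> current os
tree.setdefault(path, []).append(sample_row['MulDateiS'])
name_to_pho[sample_row['MulDateiS']] = {
    'phoMull': u'%s:%s' % (pho_id, sample_row['MulId']),
    'filename': sample_row['filename'],
    'ext': sample_row['ext'],
}

assert tree == {os.sep.join([u'FOTO', u'HIGHRES']): [u'ABC001']}
assert name_to_pho[u'ABC001']['phoMull'] == u'456:123'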
def makePhotoAll(photoAllFile, photo_multi, logFile): """ @toDO: if dupes are found then prompt manual cleanup then re-run makePhotoAll(), That way crash isn't complete. Given the photoAll data file read it and drop any entries without a commons connection. Also Simplify the data :param photoAllFile: path to photoAll data file :param photo_multi: photo_multi dict :param logFile: path to logfile :return: dict """ # often requires manual fixing prior to crunch helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll " u"analysis log have been corrected and the updated " u"photoAll file saved...\n" u"...by pressing enter when done") # setup flog = codecs.open(logFile, 'w', 'utf-8') # logfile output(u"Loading photoAll...") photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \ 'MulId|AdrVorNameS|AdrNameS|PhoSystematikS' photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'), photoAllHeader) originalSize = len(photoAll) for k, v in photoAll.items(): link = v['PhoSystematikS'] # drop any entries without files if not link: del photoAll[k] continue # simplify link if '%' in link: link = helpers.urldecode_utf8(link) link = helpers.external_2_internal_link(link, project='wikimedia') link = link[len('[[:commons:File:'):-len(']]')] v['PhoSystematikS'] = link output('PhotoAll reduced from %d to %d entries' % (originalSize, len(photoAll))) # check that none of PhoId from photo_multi occur in photo dupes = [] for phoId in photo_multi.keys(): phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId']) if phoMul in photoAll.keys(): dupes.append(phoMul) if dupes: output(u'Found duplicates between photoAll and photo_multi. ' u'This will most likely mess things up. Check the log at ' u'%s for details.' % logFile) flog.write(u'* duplicates found in photo and photo_all\n' u'phoId:MulId|commonsFile\n') for d in dupes: flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS'])) flog.close() return photoAll
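# Not part of the original module: a rough, self-contained illustration of how
# makePhotoAll() reduces PhoSystematikS to a bare Commons filename. The sample
# URL is hypothetical, and helpers.urldecode_utf8 / helpers.external_2_internal_link
# are only approximated here; the real helpers may differ in detail.
try:
    from urllib import unquote          # Python 2
except ImportError:
    from urllib.parse import unquote    # Python 3

raw = u'http://commons.wikimedia.org/wiki/File:Example%20photo.jpg'

link = unquote(raw)  # approximates helpers.urldecode_utf8
# approximates helpers.external_2_internal_link(link, project='wikimedia')
link = link.replace(u'http://commons.wikimedia.org/wiki/', u'[[:commons:')
link = u'%s]]' % link
# the slicing step used verbatim in makePhotoAll()
link = link[len('[[:commons:File:'):-len(']]')]

assert link == u'Example photo.jpg'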
def test_read_write_roundtrip(self): key_col = self.test_header.split('|')[1] read_data = helpers.csvFileToDict(self.test_infile.name, key_col, self.test_header) helpers.dictToCsvFile(self.test_outfile.name, read_data, self.test_header) self.assertEquals(self.test_outfile.read(), self.test_infile.read())
def objDaten_sam(objDatenSamFile, objDaten): """ Adds objDatenSam field to ObjDaten * adding a std_year field * combining all objIds for the same ausId * dropping AobId :param objDatenSamFile: path to ObjDaten-samhörande data file :param objDaten: objDaten dict :return: None (but updates objDaten) """ # setup output(u"Adding ObjDaten-samhörande to ObjDaten") # handle objDatenSam output('\treading ObjDaten_-_samhörande_nr into dictionary... (slow)') objDatenSamHeader = 'OobId|OobObj1ID|OobObj2ID' objDatenSam = helpers.csvFileToDict(objDatenSamFile, 'OobId', objDatenSamHeader) # map object connections output('\tmapping object connections...') objIdConnection = {} for k, v in objDatenSam.iteritems(): objId1 = v['OobObj1ID'] objId2 = v['OobObj2ID'] if objId1 not in objIdConnection.keys(): objIdConnection[objId1] = [] if objId2 not in objIdConnection.keys(): objIdConnection[objId2] = [] objIdConnection[objId1].append(objId2) objIdConnection[objId2].append(objId1) output('\tfound %d connected objIds in %d entries' % (len(objIdConnection), len(objDatenSam))) # clean up connections output('\tremoving dupes, invalids and self...') for objId, connectedIds in objIdConnection.items(): connectedIds = list(set(connectedIds)) # remove dupe if objId in connectedIds: connectedIds.remove(objId) # remove self for conId in connectedIds[:]: # slice allows changes from within loop if conId not in objDaten.keys(): connectedIds.remove(conId) # remove invalid # delete or update if not connectedIds: del objIdConnection[objId] else: objIdConnection[objId] = connectedIds # add to objDaten output('\tadding connections to objDaten...') for k, v in objDaten.iteritems(): objId = v['ObjId'] v['related'] = [] if objId in objIdConnection.keys(): v['related'] = objIdConnection.pop(objId) output(u"...done")
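# Not part of the original module: a minimal sketch of the connection clean-up
# in objDaten_sam(). Build a symmetric objId -> [connected objIds] map from the
# OobObj1ID/OobObj2ID pairs, then drop duplicates, self-references and ids that
# are not present in objDaten. All ids below are hypothetical.
objDaten = {u'1': {}, u'2': {}, u'3': {}}                       # keyed by ObjId
pairs = [(u'1', u'2'), (u'2', u'1'), (u'3', u'3'), (u'1', u'99')]

objIdConnection = {}
for a, b in pairs:
    objIdConnection.setdefault(a, []).append(b)
    objIdConnection.setdefault(b, []).append(a)

for objId, connectedIds in list(objIdConnection.items()):
    connectedIds = list(set(connectedIds))                      # remove dupes
    if objId in connectedIds:
        connectedIds.remove(objId)                              # remove self
    connectedIds = [c for c in connectedIds if c in objDaten]   # remove invalid
    if connectedIds:
        objIdConnection[objId] = connectedIds
    else:
        del objIdConnection[objId]

# u'99' keeps an entry but is never consumed later, since it is not an ObjId
# in objDaten; u'3' is dropped because only a self-reference remained.
assert objIdConnection == {u'1': [u'2'], u'2': [u'1'], u'99': [u'1']}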
def test_read_data(self): key_col = self.test_header.split('|')[1] expected = { u'2': { u'ett': u'1', u'lista': u'1;2;3;4;5', u'fem': u'5', u'tre': u'3', u'tv\xe5': u'2', u'fyra': u'4'}, u'a2': { u'lista': u'a1;a2;a3;a4;a5', u'ett': u'a1', u'fem': u'a5', u'tre': u'a3', u'tv\xe5': u'a2', u'fyra': u'a4'}} result = helpers.csvFileToDict(self.test_infile.name, key_col, self.test_header) self.assertEquals(result, expected)
def findAllMissing(filenamesFile=FILENAME_FILE, configPath=u'config.json'): """ Goes through the filenames file and checks each name for existence. Missing files are outputted to MISSING_FILES_FILE Existing files are outputted to LSH_EXPORT_FILE :param filenamesFile: path to filenames data file :param configPath: path to config.json file :return: None """ # create targetdirectory if it doesn't exist if not os.path.isdir(POST_DIR): os.mkdir(POST_DIR) # load filenames file filenamesHeader = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext' filenames = helpers.csvFileToDict(filenamesFile, 'PhoId', filenamesHeader) # identify all Commons filenames files = {} for k, v in filenames.iteritems(): commonsFile = u'File:%s.%s' % (v['filename'], v['ext']) files[commonsFile] = v print u'Found %d filenames' % len(files) # get extra info from Commons comApi = helpers.openConnection(configPath) fileInfos = comApi.getPageInfo(files.keys()) # determine which are present and which are missing missing = {} found = {} prefix = u'https://commons.wikimedia.org/wiki/' for name, info in fileInfos.iteritems(): if name not in files.keys(): print name continue if 'missing' in info.keys(): missing[name] = files[name] else: found[name] = { 'PhoId': files[name]['PhoId'], 'MulId': files[name]['MulId'], 'CommonsFile': '%s%s' % (prefix, name.replace(' ', '_')) } # output files foundHeader = u'PhoId|MulId|CommonsFile' helpers.dictToCsvFile(LSH_EXPORT_FILE, found, foundHeader) helpers.dictToCsvFile(MISSING_FILES_FILE, missing, filenamesHeader) print u'Created %s and %s' % (LSH_EXPORT_FILE, MISSING_FILES_FILE)
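# Not part of the original module: a sketch of how findAllMissing() splits the
# Commons page info into missing and found files. The fileInfos dict mimics the
# assumed shape of comApi.getPageInfo() output (a 'missing' key flags absent
# pages); all names and ids are hypothetical.
files = {
    u'File:Foo - LSH - 1.tif': {'PhoId': u'1', 'MulId': u'10'},
    u'File:Bar - LSH - 2.tif': {'PhoId': u'2', 'MulId': u'20'},
}
fileInfos = {
    u'File:Foo - LSH - 1.tif': {u'missing': u''},
    u'File:Bar - LSH - 2.tif': {u'pageid': 12345},
}

prefix = u'https://commons.wikimedia.org/wiki/'
missing, found = {}, {}
for name, info in fileInfos.items():
    if 'missing' in info:
        missing[name] = files[name]
    else:
        found[name] = {
            'PhoId': files[name]['PhoId'],
            'MulId': files[name]['MulId'],
            'CommonsFile': '%s%s' % (prefix, name.replace(' ', '_')),
        }

assert list(missing) == [u'File:Foo - LSH - 1.tif']
assert found[u'File:Bar - LSH - 2.tif']['CommonsFile'].endswith(u'Bar_-_LSH_-_2.tif')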
def test_read_list_data(self): key_col = self.test_header.split('|')[1] lists = ('lista', ) expected = { u'2': { u'ett': u'1', u'lista': [u'1', u'2', u'3', u'4', u'5'], u'fem': u'5', u'tre': u'3', u'tv\xe5': u'2', u'fyra': u'4'}, u'a2': { u'lista': [u'a1', u'a2', u'a3', u'a4', u'a5'], u'ett': u'a1', u'fem': u'a5', u'tre': u'a3', u'tv\xe5': u'a2', u'fyra': u'a4'}} result = helpers.csvFileToDict(self.test_infile.name, key_col, self.test_header, lists=lists) self.assertEquals(result, expected)
def moveHits(path, filenamesFile=None): """ Goes through the root export directory to find any matching file and moves these to a lower case version of the directory. This flattens out the directory structure whilst making it easy to identify any non-matched files. :param path: path to directory with image file structures :param filenamesFile: filenames data file :return: None """ # set defaults unless overridden filenamesFile = filenamesFile or FILENAMES # Find and move all relevant files tree, name_to_pho = makeHitlist(filenamesFile) subdirs = [] for filename in os.listdir(path): # for LSH all files are in upper case directories filename_path = os.path.join(path, filename) if os.path.isdir(filename_path) and filename.isupper(): subdirs.append(filename_path) for subdir in subdirs: # make a subdir path where (only the) last directory is lower case tmp_path, tmp_dir = os.path.split(subdir) lower_subdir = os.path.join(tmp_path, tmp_dir.lower()) counter, file_num = moveFiles(lower_subdir, tree, name_to_pho, path=subdir) output(u'%s: %d out of %d were hits' % (subdir, counter, file_num)) # load filenames file filenames_header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext' old_filenames = helpers.csvFileToDict(filenamesFile, 'PhoId', filenames_header) # Add found extentions to filenames file for phoId, v in old_filenames.iteritems(): old_filename = v['MulDateiS'] if old_filename in name_to_pho.keys(): v['ext'] = name_to_pho[old_filename]['ext'] # overwrite extention # output updated file helpers.dictToCsvFile(filenamesFile, old_filenames, filenames_header) # delete all emptied directories for subdir in subdirs: removeEmptyDirectories(subdir, top=False)
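# Not part of the original module: a sketch of the directory renaming done in
# moveHits(). Only the last path component is lower-cased, so "<path>/FOTO"
# becomes "<path>/foto"; the example path is hypothetical.
import os

subdir = os.path.join(u'export', u'FOTO')
tmp_path, tmp_dir = os.path.split(subdir)
lower_subdir = os.path.join(tmp_path, tmp_dir.lower())

assert lower_subdir == os.path.join(u'export', u'foto')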
def stichworth_photo(stichwortFile, photo_multi): """ Given the photo-multi data and the stichwort data file add a stichwort id field to photo-multi. Also returns the stichwort data after trimming away any unused info :param stichwortFile: path to stichwort data file :param photo_multi: photo_multi dict :return: dict (and updates photo_multi) """ # setup output(u"Adding stichworth to photo") # handle stichwort output(u'\treading in stichwort...') stichwortHeader = 'PstId|PhoId|StiBezeichnungS|StiSynonymS' stichwort = helpers.csvFileToDict(stichwortFile, 'PstId', stichwortHeader) originalSize = len(stichwort) # match each phoId to several stichId # removing any entries with invalid phoIds photoStichConnection = {} for k, v in stichwort.items(): phoId = v['PhoId'] pstId = v['PstId'] if phoId in photo_multi.keys(): if phoId not in photoStichConnection.keys(): photoStichConnection[phoId] = set([]) photoStichConnection[phoId].add(pstId) else: del stichwort[k] output('\tstichwort trimmed from %d to %d, found %d phoId' % (originalSize, len(stichwort), len(photoStichConnection))) # add stichId to photo_multi for k, v in photo_multi.iteritems(): phoId = v['PhoId'] v['PstId'] = [] if phoId in photoStichConnection.keys(): v['PstId'] = list(photoStichConnection.pop(phoId)) # confirm and return output(u"...done") return stichwort
def photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile, objDatenFile, logFile): """ Given the photo_multi data and the phoObjDaten + objDaten data files any additional relevant ObjIds are added to the PhoObjId field of the photo_multi dict; this field is also converted to a list. Also returns objDaten for later use :param photo_multi: photo_multi dict :param photoAll: photoAll dict :param photoObjDatenFile: path to phoObjDaten data file :param objDatenFile: path to objDaten data file :param logFile: path to logfile :return: dict (and updates photo_multi) """ # setup flog = codecs.open(logFile, 'w', 'utf-8') # logfile output(u"Combining all ObjId into the photo file...") # handle objDaten output(u'\treading in objDaten.. (takes a while)') objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \ 'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \ 'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \ 'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|' \ 'ObjFeld06M|ObjReserve01M' objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader) # match each objInvNr to several objId objInvNr2ObjId = {} # old oDict output(u'\tfinding objInvNr connections...') for k, v in objDaten.iteritems(): objId = v['ObjId'] objInvNr = v['ObjInventarNrS'] if not objInvNr: continue if objInvNr not in objInvNr2ObjId.keys(): objInvNr2ObjId[objInvNr] = [] objInvNr2ObjId[objInvNr].append(objId) output('\tFound %d objInvNr connections in %d objects' % (len(objInvNr2ObjId), len(objDaten))) # handle photoObjDaten photoObjDatenHeader = 'PhmId|AufId|AufAufgabeS|MulId|PhoId|ObjInvNrS' photoObjDaten = helpers.csvFileToDict(photoObjDatenFile, 'PhmId', photoObjDatenHeader, keep=('PhoId', 'ObjInvNrS')) # match each phoId to several objId via the ObjInvNr output(u'\tfinding photo-object connections...') photoObjConnections = {} skipped = [] # ObjInvNr not in ObjDaten for k, v in photoObjDaten.iteritems(): objInvNr = v['ObjInvNrS'] phoId = v['PhoId'] if not objInvNr: continue if objInvNr not in objInvNr2ObjId.keys(): skipped.append(objInvNr) continue if phoId not in photoObjConnections.keys(): photoObjConnections[phoId] = [] photoObjConnections[phoId] += objInvNr2ObjId[objInvNr] output('\tFound %d connected photos in %d photoObjDaten entries' % (len(photoObjConnections), len(photoObjDaten))) # add to photo_multi and photoAll photoDicts = (photo_multi, photoAll) allBadObjId = [] for pDict in photoDicts: for k, v in pDict.iteritems(): phoId = v['PhoId'] objIds = [] if phoId not in photoObjConnections.keys(): if v['PhoObjId']: objIds.append(v['PhoObjId']) else: # combine relevant objIds objIds = photoObjConnections.pop(phoId) # new connections if v['PhoObjId']: objIds.append(v['PhoObjId']) # old connection objIds = list(set(objIds)) # remove dupes # check that all of these actually exist (old realObjOnly()) # and remove otherwise badObjId = [] for objId in objIds: if objId not in objDaten.keys(): badObjId.append(objId) if badObjId: allBadObjId += badObjId for badId in badObjId: objIds.remove(badId) # set new value v['PhoObjId'] = objIds # log any skipped ObjInvNr if skipped: skipped = list(set(skipped)) # remove dupes output(u"\tthere were %d skipped ObjInvNr, see log (%s)" % (len(skipped), logFile)) flog.write(u'*Unknown objInvs, i.e. ObjInvNrS in photoObjDaten ' u'without a match in ObjDaten\n') flog.write(u'%s\n' % ', '.join(skipped)) # log any bad objId if allBadObjId: output('\tI found some bad objIds. Check the log (%s)' % logFile) allBadObjId = list(set(allBadObjId)) # remove dupes flog.write(u'* objIds in photo but not in objDaten\n') flog.write(u'%s\n' % ', '.join(allBadObjId)) # trim objDaten trimObjDaten(objDaten, photo_multi, photoAll) # confirm and return output(u"...done") flog.close() return objDaten
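# Not part of the original module: a sketch (hypothetical rows) of the two
# look-ups built in photo_ObjDaten(): ObjInventarNrS -> [ObjId] from objDaten,
# then PhoId -> [ObjId] via the inventory numbers referenced in photoObjDaten.
objDaten = {
    u'1': {'ObjId': u'1', 'ObjInventarNrS': u'INV-1'},
    u'2': {'ObjId': u'2', 'ObjInventarNrS': u'INV-1'},
}
photoObjDaten = {u'p1': {'PhoId': u'9', 'ObjInvNrS': u'INV-1'}}

objInvNr2ObjId = {}
for v in objDaten.values():
    if v['ObjInventarNrS']:
        objInvNr2ObjId.setdefault(v['ObjInventarNrS'], []).append(v['ObjId'])

photoObjConnections = {}
for v in photoObjDaten.values():
    invNr = v['ObjInvNrS']
    if invNr in objInvNr2ObjId:
        photoObjConnections.setdefault(v['PhoId'], []).extend(objInvNr2ObjId[invNr])

# phoId u'9' picks up both objects sharing the inventory number INV-1
assert sorted(photoObjConnections[u'9']) == [u'1', u'2']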
def makePhoto_multi(photoFile, multiFile, logFile, tmpFile): """ Given the photo and multimedia data this combines the two into one dict :param photoFile: path to photo data file :param multiFile: path to multimedia data file :param logFile: path to logfile :param tmpFile: path to temporary file :return: dict """ # setup flog = codecs.open(logFile, 'w', 'utf-8') # logfile output(u"Combining photo and multimedia file for unique files...") pathToTrim = u'R:\web\hires\\' tmpHeader = 'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \ 'PhoSwdS|AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|' \ 'MulDateiS|MulExtentS' # handle multimedia multiHeader = 'MulId|MulPhoId|MulPfadS|MulDateiS|MulExtentS' multi = helpers.csvFileToDict(multiFile, 'MulId', multiHeader) # check that filename is unique flog.write('* Same files used by different PhoId, format is PhoId/MulId\n') logged = False namelist = [] mulPhoIdList = [] for k, v in multi.iteritems(): name = u'%s\\%s.%s' % (v['MulPfadS'], v['MulDateiS'], v['MulExtentS']) if name in namelist: logged = True flog.write('%s/%s\n' % (v['MulPhoId'], v['MulId'])) else: mulPhoIdList.append(v['MulPhoId']) namelist.append(name) output(u'\tmultimedia: %d' % len(multi)) if not logged: flog.write(u'None =)\n') # handle photo # @toDO add duplicate check to cleanup script photoHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \ 'MulId|AdrVorNameS|AdrNameS|PhoSystematikS' photo = helpers.csvFileToDict(photoFile, 'PhoId', photoHeader) output(u'\tphoto: %d' % len(photo)) # combine combined = {} flog.write(u'* unused rows in multimedia\n') logged = False for k, v in multi.iteritems(): phoId = v['MulPhoId'] mulId = v['MulId'] v['MulPfadS'] = v['MulPfadS'].replace(pathToTrim, u'') # trim filepath v['MulExtentS'] = u'' # MulExtentS is always wrong if phoId not in photo.keys(): logged = True flog.write(u'%s\n' % v) elif not photo[phoId]['MulId'] == v['MulId']: raise MyError("phoId matched but to wrong mulId: p:%s m_found:%s, " "m_expected %s" % (phoId, photo[phoId]['MulId'], mulId)) else: del v['MulPhoId'], v['MulId'] combo = photo.pop(phoId) # move out of photo combo.update(v) # add contents from multi combined[phoId] = combo if not logged: flog.write(u'None =)\n') # log any unused rows in photo flog.write(u'* unused rows in photo\n') logged = False for k, v in photo.iteritems(): logged = True flog.write(u'%s\n' % v) if not logged: flog.write(u'None =)\n') flog.close() output(u"...done") # check if anything needs to be manually handled output(u"Read the log (%s)" % logFile) combined = helpers.promptManualUpdate(combined, tmpFile, tmpHeader, 'PhoId') return combined
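# Not part of the original module: a small sketch (sample rows are
# hypothetical) of the merge step in makePhoto_multi(). The multimedia row is
# folded into its photo row after dropping the redundant keys, giving one
# combined record per PhoId.
photo = {u'7': {'PhoId': u'7', 'MulId': u'70', 'PhoBeschreibungM': u'desc'}}
multi_row = {'MulId': u'70', 'MulPhoId': u'7',
             'MulPfadS': u'FOTO', 'MulDateiS': u'IMG7', 'MulExtentS': u''}

combined = {}
phoId = multi_row['MulPhoId']
if photo[phoId]['MulId'] == multi_row['MulId']:
    del multi_row['MulPhoId'], multi_row['MulId']
    combo = photo.pop(phoId)        # move out of photo
    combo.update(multi_row)         # add contents from multi
    combined[phoId] = combo

assert combined[u'7']['MulDateiS'] == u'IMG7' and not photo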
def mulMass_add(objMassFile, objMultipleFile, objDaten): """ Given the objMass and the objMultiple data file and the objDaten data add an objMass and an objMultiple field to objDaten. Also returns the objMass and objMultiple data after * removing any entries where objId not in objDaten :param objMassFile: path to objMass data file :param objMultipleFile: path to objMultiple data file :param objDaten: objDaten dict :return: dict, dict (and updates objDaten) """ # setup output(u"Putting objMultiple and objMass into objDaten...") # handle objMass output('\treading ObjMass into dictionary... (yes this takes some time)') objMassHeader = 'ObmId|ObmObjId|ObmTypMasseS|ObmMasseS|' \ 'ObjAufId|AufAufgabeS' objMass = helpers.csvFileToDict(objMassFile, 'ObmId', objMassHeader) originalSize = len(objMass) # invert to get per objId connections # and remove any entries where objId not in objDaten output('\tinverting objMass...') objIdMassConnection = {} for k, v in objMass.items(): obmId = v['ObmId'] objId = v['ObmObjId'] if objId not in objDaten.keys(): del objMass[k] continue if objId not in objIdMassConnection.keys(): objIdMassConnection[objId] = set([]) objIdMassConnection[objId].add(obmId) output('\tobjMass: reduced from %d to %d' % (originalSize, len(objMass))) # handle objMultiple output('\treading ObjMultiple into dictionary... (as does this)') objMultipleHeader = 'OmuId|OmuObjId|OmuTypS|OmuBemerkungM|OmuInhalt01M|' \ 'ObjInventarNrS|ObjAufId|AufAufgabeS' objMultiple = helpers.csvFileToDict(objMultipleFile, 'OmuId', objMultipleHeader) originalSize = len(objMultiple) # invert to get per objId connections # and remove any entries where objId not in objDaten output('\tinverting objIdMultiple...') objIdMultipleConnection = {} for k, v in objMultiple.items(): omulId = v['OmuId'] objId = v['OmuObjId'] if objId not in objDaten.keys(): del objMultiple[k] continue if objId not in objIdMultipleConnection.keys(): objIdMultipleConnection[objId] = set([]) objIdMultipleConnection[objId].add(omulId) output('\tobjMultiple: reduced from %d to %d' % (originalSize, len(objMultiple))) # adding ObjMul and ObjMass id to objDaten output('\tadding ObjMul and ObjMass id to objDaten... (and this)') for k, v in objDaten.iteritems(): objId = v['ObjId'] v['massId'] = [] v['mulId'] = [] if objId in objIdMassConnection.keys(): v['massId'] = list(objIdMassConnection.pop(objId)) if objId in objIdMultipleConnection.keys(): v['mulId'] = list(objIdMultipleConnection.pop(objId)) output(u"...done") return objMass, objMultiple
def kuenstler_objDaten(kuenstlerFile, objDaten, logFile): """ Given the kuenstler data file and the objDaten data add a kuenstler id field to objDaten. Also returns the kuenstler data after * removing certain irrelevant roles and dummy entries * combining all objIds for the same kueId * standardising years * dropping a lot of unneeded fields :param kuenstlerFile: path to kuenstler data file :param objDaten: objDaten dict :param logFile: path to logfile :return: dict (and updates objDaten) """ # setup flog = codecs.open(logFile, 'w', 'utf-8') # logfile output(u"Crunching kuenstler...") dummyNames = (u'ingen uppgift', ) badRoles = (u'Leverantör', u'Auktion', u'Förmedlare', u'Givare', u'Återförsäljare', u'Konservator') badRoleCmts = (u'Förpaktare, kontrollör', u'av kopia') droppedFields = ('OkuId', 'ObjAufId', 'AufAufgabeS', 'OkuArtS', 'OkuFunktionS', 'OkuValidierungS', 'KudArtS', 'MulId', 'PhoId') # handle kuenstler kuenstlerHeader = 'OkuId|ObjId|ObjAufId|AufAufgabeS|KueId|KueVorNameS|' \ 'KueNameS|OkuArtS|OkuFunktionS|OkuValidierungS|KudArtS|' \ 'KudDatierungS|KudJahrVonL|KudJahrBisL|KudOrtS|KudLandS|' \ 'KueFunktionS|MulId|PhoId' kuenstler = helpers.csvFileToDict(kuenstlerFile, ('OkuId', 'MulId'), kuenstlerHeader) originalSize = len(kuenstler) # collect all kueId and drop any with invalid title or role # also invert to get per objId connections # @toDO: Is keeping objId in kuenstler really needed? # Otherwise populate objIdConnection here foundKueId = {} objIdConnection = {} for k, v in kuenstler.items(): # allow removing entries from within loop kueId = v['KueId'] objId = v['ObjId'] fName = v['KueVorNameS'] lName = v['KueNameS'] role = v['OkuArtS'] roleCmt = v['OkuFunktionS'] # filter out any undesired entries if role in badRoles or \ roleCmt in badRoleCmts or \ len(fName) + len(lName) == 0 or \ lName in dummyNames: del kuenstler[k] continue # send unique role/kueId combo for objid kueCombo = u'%s:%s:%s' % (role, roleCmt, kueId) if objId not in objIdConnection.keys(): objIdConnection[objId] = set([]) objIdConnection[objId].add(kueCombo) # keep only one entry per unique kueId if kueId not in foundKueId.keys(): # keep this entry foundKueId[kueId] = k kuenstler[k]['ObjId'] = set([objId, ]) else: # keep only objId part of this entry kuenstler[foundKueId[kueId]]['ObjId'].add(objId) del kuenstler[k] output('\tkueIds: reduced from %d to %d' % (originalSize, len(kuenstler))) # add to objDaten output('\tadding kueId to objDaten...') for k, v in objDaten.iteritems(): objId = v['ObjId'] v['role:roleCmt:kueId'] = [] if objId in objIdConnection.keys(): v['role:roleCmt:kueId'] = list(objIdConnection.pop(objId)) # further cleanup of kuenstler # correcting ort/land entries # stripping years from name # dropping a bunch of fields output('\tfurther cleanup of kuenstler...') for k, v in kuenstler.iteritems(): land = v['KudOrtS'] # missnamed in original database ort = v['KudLandS'] # missnamed in original database lName = v['KueNameS'] bYear = v['KudJahrVonL'] dYear = v['KudJahrBisL'] objIds = v['ObjId'] # correct missnaming in original database v['KudOrtS'] = ort v['KudLandS'] = land # convert set to list v['ObjId'] = list(objIds) # take yearinfo out of name, and store in year lName, bYear, dYear, log = extractKuenstlerYear(lName, bYear, dYear) if log: flog.write(log) v['KueNameS'] = lName v['KudJahrVonL'] = bYear v['KudJahrBisL'] = dYear for field in droppedFields: del v[field] flog.close() output(u"...done") return kuenstler
def makeDescriptions(photoFile, objDatenFile, logFile): """ Given the photo and objDaten data this uses the two to generate descriptions. Also returns photo for later use :param photoFile: path to photo data file :param objDatenFile: path to objDaten data file :param logFile: path to logfile :return: dict, dict """ # setup flog = codecs.open(logFile, 'w', 'utf-8') # logfile # load input files photoHeader = 'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \ 'PhoSwdS|AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|' \ 'MulDateiS|MulExtentS|PstId|same_PhoId|same_object' photo = helpers.csvFileToDict(photoFile, 'PhoId', photoHeader, lists=('PhoObjId', )) objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \ 'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \ 'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \ 'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|ObjFeld06M|' \ 'ObjReserve01M|ausId|related|ergId|role:roleCmt:kueId|' \ 'mulId|massId' objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader) # start process skipLog = [] # no photoDescr, no objectDescr manyLog = [] # no photoDescr, many objects noHopeLog = [] # no photoDescr, no objects descriptions = {} uniques = set([]) # unique filenames for k, v in photo.iteritems(): phoId = v['PhoId'] objIds = v['PhoObjId'] museum = v['PhoSwdS'] if not museum: museum = u'LSH' phoBes = getDescFromPhoBes(v['PhoBeschreibungM']) if not phoBes: # try to get description from object if len(objIds) == 1 and objIds[0]: # exactly one object phoBes = getDescFromObj(objDaten[objIds[0]]) if not phoBes: # failed to make a description from the object skipLog.append(phoId) elif len(objIds) > 1: # multiple objects manyLog.append(phoId) else: # no objects noHopeLog.append(phoId) if phoBes: filename = u'%s - %s - %s' % (phoBes, museum, phoId) descriptions[phoId] = {'descr': phoBes, 'filename': filename} uniques.add(filename) # check uniqueness if len(uniques) != len(descriptions): output(u'Descriptions are not unique: %d were duplicates' % (len(descriptions) - len(uniques))) # output logs if skipLog: flog.write('* No-objectDescr and No-photoDescr (phoIds)\n') flog.write('%s\n' % '\n'.join(skipLog)) if manyLog: flog.write('* Many objects and No-photoDescr (phoIds)\n') flog.write('%s\n' % '\n'.join(manyLog)) if noHopeLog: flog.write('* No-objects and No-photoDescr (phoIds)\n') flog.write('%s\n' % '\n'.join(noHopeLog)) # wrap up output(u'Processed %d images out of which %d have some type of problem. ' u'See log (%s) for more info.' % (len(photo), len(photo) - len(descriptions), logFile)) flog.close() return descriptions, photo
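# Not part of the original module: a sketch (hypothetical values) of how
# makeDescriptions() builds the target filename and why a set of filenames is
# used to detect collisions.
phoBes = u'Portrait of Karl XI'     # description derived from PhoBeschreibungM
museum = u'LSH'                     # falls back to u'LSH' when PhoSwdS is empty
phoId = u'12345'

filename = u'%s - %s - %s' % (phoBes, museum, phoId)
descriptions = {phoId: {'descr': phoBes, 'filename': filename}}
uniques = set([filename])

# any filename collision would make len(uniques) < len(descriptions)
assert len(uniques) == len(descriptions)
assert filename == u'Portrait of Karl XI - LSH - 12345'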
def ereignis_objDaten(ereignisFile, objDaten, logFile): """ Given the ereignis data file and the objDaten data add a ereignis id field to objDaten. Also returns the ereignis data after * combining all objIds for the same ergId * dropping EroId :param ereignisFile: path to eregnis data file :param objDaten: objDaten dict :param logFile: path to logfile :return: dict (and updates objDaten) """ # setup flog = codecs.open(logFile, 'w', 'utf-8') # logfile output(u"Trimming eregnis and adding eregnis to ObjDaten...") # handle eregnis ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS' ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader) originalSize = len(ereignis) # collect all ergId and drop any with invalid title # @toDO: Is keeping objId in eregnis really needed? # Otherwise populate objIdConnection here foundErgId = {} for k, v in ereignis.items(): # allow removing entries from within loop ergId = v['ErgId'] objId = v['EroObjId'] title = v['ErgKurztitelS'] if not title: # remove empty del ereignis[k] elif ergId not in foundErgId.keys(): # keep this entry foundErgId[ergId] = k ereignis[k]['EroObjId'] = set([ objId, ]) ereignis[k].pop('EroId') # drop unnecessary id else: # keep only objId part of this entry ereignis[foundErgId[ergId]]['EroObjId'].add(objId) del ereignis[k] output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis))) # handle urls in ereignis and convert set to list for k, v in ereignis.iteritems(): objIds = v['EroObjId'] url = v['ErgArtS'] # convert set to list v['EroObjId'] = list(objIds) # handle urls if u'%' in url: url = helpers.urldecode_utf8(url) # convert external links to internal if 'wikipedia' in url: url = helpers.external_2_internal_link(url) elif url: flog.write(u'weird url: %s\n' % url) v['ErgArtS'] = url # invert to get per objId connections objIdConnection = {} for k, v in ereignis.iteritems(): ergId = v['ErgId'] objIds = v['EroObjId'] for objId in objIds: if objId not in objIdConnection.keys(): objIdConnection[objId] = [] objIdConnection[objId].append(ergId) # add to objDaten output('\tadding ergId to objDaten...') for k, v in objDaten.iteritems(): objId = v['ObjId'] v['ergId'] = [] if objId in objIdConnection.keys(): v['ergId'] = objIdConnection.pop(objId) flog.close() output(u"...done") return ereignis
def ausstellung_objDaten(austellungFile, objDaten): """ Given the austellung data file and the objDaten data add a austellung id field to objDaten. Also returns the austellung data after * adding a std_year field * combining all objIds for the same ausId * dropping AobId :param austellungFile: path to austellung data file :param objDaten: objDaten dict :return: dict (and updates objDaten) """ # often requires manual fixing prior to crunch helpers.verboseInput(u"Confirm that any year formatting issues mentioned " u"in the analysis log have been corrected and the " u"updated ausstellung file saved...\n" u"...by pressing enter when done") # setup dummyTitles = ( u'reparation', u'utställning', u'lån för undersökning', u'OBS! Testpost för admin - utställning, export wikimedia commons', u'lån till Frankrike 1947', u'test karin 20100520', u'test 20100629 (en post skapad för administrativa tester)', u'tennföremål 8 st till Strömsholm', u'utlån f justering av urverk') output(u"Trimming ausstellung and adding ausstellung to ObjDaten...") # handle ausstellung austellungHeader = 'AobId|AusId|AusTitelS|AusOrtS|AusJahrS|AusDatumVonD|' \ 'AusDatumBisD|AobObjId|AufAufgabeS' austellung = helpers.csvFileToDict(austellungFile, 'AobId', austellungHeader) originalSize = len(austellung) # collect all ausId and drop any with invalid title # @toDO: Is keeping objId in austellung really needed? # Otherwise populate objIdConnection here foundAusId = {} for k, v in austellung.items(): # allow removing entries from within loop ausId = v['AusId'] objId = v['AobObjId'] title = v['AusTitelS'] if not title or title in dummyTitles: # remove empty/dummy del austellung[k] elif ausId not in foundAusId: # keep this entry foundAusId[ausId] = k austellung[k]['AobObjId'] = set([ objId, ]) austellung[k].pop('AobId') # drop unnecessary id else: # keep only objId part of this entry austellung[foundAusId[ausId]]['AobObjId'].add(objId) del austellung[k] output('\taustellung reduced from %d to %d entries' % (originalSize, len(austellung))) # populate std_year output('\tstandardising years...') for k, v in austellung.iteritems(): year = v['AusJahrS'] yfrom = v['AusDatumVonD'].replace(u' 00:00:00', u'').strip() ytil = v['AusDatumBisD'].replace(u' 00:00:00', u'').strip() v['std_year'] = stdAustellungYear(year, yfrom, ytil) # to match with pre-redux results. Could possibly be dropped instead? v['AusDatumVonD'] = yfrom v['AusDatumBisD'] = ytil # invert to get per objId connections # and convert set to list objIdConnection = {} for k, v in austellung.iteritems(): ausId = v['AusId'] objIds = v['AobObjId'] v['AobObjId'] = list(objIds) for objId in objIds: if objId not in objIdConnection.keys(): objIdConnection[objId] = [] objIdConnection[objId].append(ausId) output('\tadding ausId to objDaten...') for k, v in objDaten.iteritems(): objId = v['ObjId'] v['ausId'] = [] if objId in objIdConnection.keys(): v['ausId'] = objIdConnection.pop(objId) output(u"...done") return austellung
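# Not part of the original module: a minimal sketch (hypothetical rows) of the
# de-duplication pattern shared by ausstellung_objDaten() and
# ereignis_objDaten(): keep one entry per AusId, drop rows with empty or dummy
# titles, and merge the object ids of later rows into the kept entry's
# AobObjId set.
austellung = {
    u'a1': {'AusId': u'100', 'AusTitelS': u'Utst.', 'AobObjId': u'1'},
    u'a2': {'AusId': u'100', 'AusTitelS': u'Utst.', 'AobObjId': u'2'},
    u'a3': {'AusId': u'200', 'AusTitelS': u'',      'AobObjId': u'3'},
}

foundAusId = {}
for k, v in list(austellung.items()):
    ausId, objId, title = v['AusId'], v['AobObjId'], v['AusTitelS']
    if not title:                       # remove empty/dummy titles
        del austellung[k]
    elif ausId not in foundAusId:       # first occurrence: keep this entry
        foundAusId[ausId] = k
        austellung[k]['AobObjId'] = set([objId])
    else:                               # later occurrence: merge objId, drop row
        austellung[foundAusId[ausId]]['AobObjId'].add(objId)
        del austellung[k]

assert len(austellung) == 1
assert austellung[foundAusId[u'100']]['AobObjId'] == set([u'1', u'2'])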
def run(outPath=None, dataPath=None, mappingsPath=None, commonsPrefix=None, configPath=u'config.json'): """ Define a list of pages and output files where page has the format Commons:Batch uploading/LSH/* and outputfile the format: commons-*.csv """ # set defaults unless overridden outPath = outPath or OUT_PATH dataPath = dataPath or DATA_PATH mappingsPath = mappingsPath or MAPPING_FOLDER commonsPrefix = commonsPrefix or COMMONS_PREFIX pages = {u'People': u'People', u'Events': u'Events', u'ObjKeywords': u'ObjKeywords', u'Keywords': u'Keywords', # stichwort u'Materials': u'Materials', u'Places': u'Places', u'Photographers': u'Photographers' } # create out_path if it doesn't exist if not os.path.isdir(outPath): os.mkdir(outPath) # fetch, parse and save each page comApi = helpers.openConnection(configPath) for k, v in pages.iteritems(): comPage = u'%s/%s' % (commonsPrefix, k) contents = comApi.getPage(comPage) units = parseEntries(contents[comPage]) outdata = formatOutput(units, k) outFile = os.path.join(outPath, u'commons-%s.csv' % v) out = codecs.open(outFile, 'w', 'utf8') out.write(outdata) out.close() output(u'Created %s' % outFile) # need to do filenames differently mappingFile = os.path.join(mappingsPath, u'Filenames.txt') comPage = u'%s/Filenames' % commonsPrefix contents = comApi.getPage(comPage) # identify changes units, allEntries = parseFilenameEntries(contents[comPage]) if units: # load old filenames filenamesHeader = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext' filenamesFile = os.path.join(dataPath, u'filenames.csv') oldFilenames = helpers.csvFileToDict(filenamesFile, 'PhoId', filenamesHeader) for unit in units: pho_id = unit[u'phoId'] if pho_id not in oldFilenames.keys(): print u'could not find id in old: %s, %s' % \ (pho_id, unit[u'generated']) exit(1) old_desc = oldFilenames[pho_id][u'filename'] # newDesc = oldDesc.replace(unit[u'generated'], unit[u'improved']) # a safer implementation where new description is appended to # old ending. I.e. "- Museum - idNo" new_desc = u'%s %s' % (unit[u'improved'], splitFilename(old_desc)[1]) if old_desc == new_desc: # indicator that commons file may not having been updated which # may cause more complex problems which are hard to test for print u'did you run the updater a second time without ' \ u'first updating the filenames table on Commons?' exit(1) oldFilenames[pho_id][u'filename'] = new_desc # overwrite old filenames and old mapping # new filename.csv file w. header helpers.dictToCsvFile(filenamesFile, oldFilenames, filenamesHeader) # new Commons mapping file needs a dict with all descriptions mapping_dict = {} for phoId, v in oldFilenames.iteritems(): descr = splitFilename(v[u'filename'])[0] mapping_dict[phoId] = {'descr': descr} Filenames.commonsOutput(mapping_dict, mappingFile, allEntries) output(u'Updated %s and produced a new mappingfile %s. Please upload ' u'the new one to Commons.' % (filenamesFile, mappingFile))
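# Not part of the original module: a sketch of the filename update in run().
# splitFilename() is project code; split_filename_approx() below is an
# assumption that it splits "descr - Museum - idNo" into the description and
# the trailing "- Museum - idNo" part. Sample strings are hypothetical.
def split_filename_approx(name):
    descr, museum, id_no = name.rsplit(u' - ', 2)
    return descr, u'- %s - %s' % (museum, id_no)

old_desc = u'Old generated description - LSH - 12345'
improved = u'Hand-improved description'

# the new description is appended to the old "- Museum - idNo" ending
new_desc = u'%s %s' % (improved, split_filename_approx(old_desc)[1])
assert new_desc == u'Hand-improved description - LSH - 12345'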