Esempio n. 1
def makeHitlist(filenames_file=None):
    """
    Build the look-up structures for the allowed filenames.

    Goes through the allowed filenames and builds up a tree structure
    {directory: [filenames]} as well as a look-up dictionary for filenames
    to phoId {MulDateiS: {phoMull, filename, ext}}

    :param filenames_file: filenames data file (defaults to FILENAMES)
    :return: dict, dict
    """
    # set defaults unless overridden
    filenames_file = filenames_file or FILENAMES

    # load filenames file
    filenames_header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
    filenames = helpers.csvFileToDict(filenames_file, 'PhoId',
                                      filenames_header)

    tree = {}
    name_to_pho = {}
    for pho_id, v in filenames.iteritems():
        old_name = v['MulDateiS']
        path = v['MulPfadS'].replace('\\', os.sep)  # windows -> current os
        # NOTE(review): the statement filling the per-directory list was
        # missing; appending the old filename matches the documented
        # {directory: [filenames]} shape - confirm against the callers.
        tree.setdefault(path, []).append(old_name)
        name_to_pho[old_name] = {
            'phoMull': u'%s:%s' % (pho_id, v['MulId']),
            'filename': v['filename'],
            'ext': v['ext']}
    return (tree, name_to_pho)
Esempio n. 2
# Loads the photoAll data file, deletes entries whose PhoSystematikS link is
# empty, url-decodes and shortens the remaining links, and reports
# PhoId:MulId keys of photo_multi that also occur in photoAll.
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the right-hand side of `flog =`, the tail of photoAllHeader and
# of the csvFileToDict() call, and the body of the dupes check are missing.
# Restore them from the original module before running this code.
def makePhotoAll(photoAllFile, photo_multi, logFile):
    @toDO: if dupes are found then prompt manual cleanup then re-run
           makePhotoAll(), That way crash isn't complete.
    Given the photoAll data file read it and drop any entries without a
    commons connection. Also Simplify the data
    :param photoAllFile: path to photoAll data file
    :param photo_multi: photo_multi dict
    :param logFile: path to logfile
    :return: dict
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll "
                         u"analysis log have been corrected and the updated "
                         u"photoAll file saved...\n"
                         u" pressing enter when done")

    # setup
    # NOTE(review): the call that opens logFile is truncated on the next line
    flog =, 'w', 'utf-8')  # logfile
    output(u"Loading photoAll...")
    # NOTE(review): the header continuation line(s) and the final argument(s)
    # of csvFileToDict() are missing below
    photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
    photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'),
    originalSize = len(photoAll)

    # items() (not iteritems()) so that entries can be deleted while looping
    for k, v in photoAll.items():
        link = v['PhoSystematikS']

        # drop any entries without files
        # NOTE(review): nothing skips the rest of the iteration after the
        # delete as shown - a `continue` was probably lost here
        if not link:
            del photoAll[k]

        # simplify link
        if '%' in link:
            link = helpers.urldecode_utf8(link)
        link = helpers.external_2_internal_link(link, project='wikimedia')
        # cut as many characters as '[[:commons:File:' and ']]' are long
        link = link[len('[[:commons:File:'):-len(']]')]
        v['PhoSystematikS'] = link
    output('PhotoAll reduced from %d to %d entries' %
           (originalSize, len(photoAll)))

    # check that none of PhoId from photo_multi occur in photo
    dupes = []
    for phoId in photo_multi.keys():
        phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId'])
        # NOTE(review): the body of this `if` is missing, so `dupes` is never
        # filled as shown
        if phoMul in photoAll.keys():
    if dupes:
        output(u'Found duplicates between photoAll and photo_multi. '
               u'This will most likely mess things up. Check the log at '
               u'%s for details.' % logFile)
        # NOTE(review): the closing parenthesis of this write() is missing
        flog.write(u'* duplicates found in photo and photo_all\n'
        for d in dupes:
            flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS']))

    # NOTE(review): flog is never closed in the code shown
    return photoAll
Esempio n. 3
 # Round-trip test for the csv helpers, keyed on the second header column.
 # NOTE(review): fragment only - the first argument and the end of the
 # csvFileToDict() call, the start of the call that the last line belongs
 # to, and any assertion are missing.
 def test_read_write_roundtrip(self):
     key_col = self.test_header.split('|')[1]
     read_data = helpers.csvFileToDict(, key_col,
                           read_data, self.test_header)
Esempio n. 4
Esempio n. 5
def objDaten_sam(objDatenSamFile, objDaten):
    """
    Add a 'related' field to every entry in objDaten.

    The ObjDaten-samhörande file pairs object ids (OobObj1ID, OobObj2ID).
    Each pair is registered in both directions, after which
    * duplicate connections are removed
    * connections from an object to itself are removed
    * connections to objIds which are not in objDaten are removed

    :param objDatenSamFile: path to ObjDaten-samhörande data file
    :param objDaten: objDaten dict
    :return: None (but updates objDaten)
    """
    # setup
    output(u"Adding ObjDaten-samhörande to ObjDaten")

    # handle objDatenSam
    output('\treading ObjDaten_-_samhörande_nr into dictionary... (slow)')
    objDatenSamHeader = 'OobId|OobObj1ID|OobObj2ID'
    objDatenSam = helpers.csvFileToDict(objDatenSamFile, 'OobId',
                                        objDatenSamHeader)

    # map object connections
    # NOTE(review): the statements filling the lists were missing; storing
    # each pair in both directions is assumed from the clean-up step below
    # (which removes dupes and self-references) - confirm.
    output('\tmapping object connections...')
    objIdConnection = {}
    for v in objDatenSam.itervalues():
        objId1 = v['OobObj1ID']
        objId2 = v['OobObj2ID']
        objIdConnection.setdefault(objId1, []).append(objId2)
        objIdConnection.setdefault(objId2, []).append(objId1)
    output('\tfound %d connected objIds in %d entries' %
           (len(objIdConnection), len(objDatenSam)))

    # clean up connections
    output('\tremoving dupes, invalids and self...')
    # iterate over a copy since entries are deleted from within the loop
    for objId, connectedIds in list(objIdConnection.items()):
        # set() removes dupes; membership is tested on the dict itself
        # rather than on .keys(), which is a list (linear search) in Python 2
        connectedIds = [conId for conId in set(connectedIds)
                        if conId != objId and conId in objDaten]

        # delete or update
        if not connectedIds:
            del objIdConnection[objId]
        else:
            objIdConnection[objId] = connectedIds

    # add to objDaten
    output('\tadding connections to objDaten...')
    for v in objDaten.itervalues():
        # every entry gets a list of its own, empty when nothing is connected
        v['related'] = objIdConnection.pop(v['ObjId'], [])

Esempio n. 6
Esempio n. 7
 # Checks that csvFileToDict() returns the expected dict of rows when keyed
 # on the second header column.
 # NOTE(review): fragment only - the first argument and the final
 # argument(s) of the csvFileToDict() call are missing.
 def test_read_data(self):
     key_col = self.test_header.split('|')[1]
     expected = {
         u'2': {
             u'ett': u'1', u'lista': u'1;2;3;4;5', u'fem': u'5',
             u'tre': u'3', u'tv\xe5': u'2', u'fyra': u'4'},
         u'a2': {
             u'lista': u'a1;a2;a3;a4;a5',
             u'ett': u'a1', u'fem': u'a5', u'tre': u'a3',
             u'tv\xe5': u'a2', u'fyra': u'a4'}}
     result = helpers.csvFileToDict(, key_col,
     self.assertEquals(result, expected)
Esempio n. 8
def findAllMissing(filenamesFile=FILENAME_FILE, configPath=u'config.json'):
    """
    Check every name in the filenames file for existence on Commons.

    Missing files are outputted to MISSING_FILES_FILE
    Existing files are outputted to LSH_EXPORT_FILE

    :param filenamesFile: path to filenames data file
    :param configPath: path to config.json file
    :return: None
    """
    # create targetdirectory if it doesn't exist
    # NOTE(review): the body of this check was missing; os.mkdir is assumed
    # from the comment above - confirm (os.makedirs if POST_DIR is nested).
    if not os.path.isdir(POST_DIR):
        os.mkdir(POST_DIR)

    # load filenames file
    filenamesHeader = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
    filenames = helpers.csvFileToDict(filenamesFile, 'PhoId', filenamesHeader)

    # identify all Commons filenames
    files = {}
    for v in filenames.itervalues():
        commonsFile = u'File:%s.%s' % (v['filename'], v['ext'])
        files[commonsFile] = v
    # single-argument print(...) behaves the same as the print statement
    print(u'Found %d filenames' % len(files))

    # get extra info from Commons
    comApi = helpers.openConnection(configPath)
    fileInfos = comApi.getPageInfo(files.keys())

    # determine which are present and which are missing
    missing = {}
    found = {}
    prefix = u''
    for name, info in fileInfos.iteritems():
        if name not in files:
            # a name we never asked for: report it and skip it, since
            # files[name] below would otherwise raise a KeyError
            print(name)
            continue
        if 'missing' in info.keys():
            missing[name] = files[name]
        else:
            found[name] = {
                'PhoId': files[name]['PhoId'],
                'MulId': files[name]['MulId'],
                'CommonsFile': '%s%s' % (prefix, name.replace(' ', '_'))
            }

    # output files
    foundHeader = u'PhoId|MulId|CommonsFile'
    helpers.dictToCsvFile(LSH_EXPORT_FILE, found, foundHeader)
    helpers.dictToCsvFile(MISSING_FILES_FILE, missing, filenamesHeader)
    print(u'Created %s and %s' % (LSH_EXPORT_FILE, MISSING_FILES_FILE))
Esempio n. 9
 # Checks that csvFileToDict() returns the 'lista' column as a list of
 # values when that column is named in `lists`.
 # NOTE(review): the first argument of the csvFileToDict() call is missing.
 def test_read_list_data(self):
     key_col = self.test_header.split('|')[1]
     lists = ('lista', )
     expected = {
         u'2': {
             u'ett': u'1',
             u'lista': [u'1', u'2', u'3', u'4', u'5'],
             u'fem': u'5', u'tre': u'3', u'tv\xe5': u'2', u'fyra': u'4'},
         u'a2': {
             u'lista': [u'a1', u'a2', u'a3', u'a4', u'a5'],
             u'ett': u'a1', u'fem': u'a5', u'tre': u'a3',
             u'tv\xe5': u'a2', u'fyra': u'a4'}}
     result = helpers.csvFileToDict(, key_col,
                                    self.test_header, lists=lists)
     self.assertEquals(result, expected)
Esempio n. 10
# Looks for upper-case subdirectories of `path`, calls moveFiles() for each
# of them with a lower-cased target directory, then rewrites the 'ext' column
# of the filenames file from name_to_pho and removes emptied directories.
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the statement collecting `subdirs`, the tail of the moveFiles()
# call and the tail of the csvFileToDict() call are missing.
def moveHits(path, filenamesFile=None):
    Goes through the root export directory to find any matching file and
    moves these to a lower case version of the directory. This flattens
    out the directory structure whilst making it easy to identify any
    non-matched files.
    :param path: path to directory with image file structures
    :param filenamesFile: filenames data file
    :return: None
    # set defaults unless overridden
    filenamesFile = filenamesFile or FILENAMES

    # Find and move all relevant files
    tree, name_to_pho = makeHitlist(filenamesFile)
    subdirs = []
    for filename in os.listdir(path):
        # for LSH all files are in upper case directories
        filename_path = os.path.join(path, filename)
        # NOTE(review): the body of this `if` is missing, so `subdirs` stays
        # empty as shown
        if os.path.isdir(filename_path) and filename.isupper():
    for subdir in subdirs:
        # make a subdir path where (only the) last directory is lower case
        tmp_path, tmp_dir = os.path.split(subdir)
        lower_subdir = os.path.join(tmp_path, tmp_dir.lower())

        # NOTE(review): the remaining argument(s) of moveFiles() are missing
        counter, file_num = moveFiles(lower_subdir, tree, name_to_pho,
        output(u'%s: %d out of %d were hits' % (subdir, counter, file_num))

    # load filenames file
    filenames_header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
    # NOTE(review): the final argument(s) of csvFileToDict() are missing
    old_filenames = helpers.csvFileToDict(filenamesFile, 'PhoId',

    # Add found extentions to filenames file
    for phoId, v in old_filenames.iteritems():
        old_filename = v['MulDateiS']
        if old_filename in name_to_pho.keys():
            v['ext'] = name_to_pho[old_filename]['ext']  # overwrite extention

    # output updated file
    # the input file itself is overwritten with the updated rows
    helpers.dictToCsvFile(filenamesFile, old_filenames, filenames_header)

    # delete all emptied directories
    for subdir in subdirs:
        removeEmptyDirectories(subdir, top=False)
Esempio n. 11
def stichworth_photo(stichwortFile, photo_multi):
    """
    Add a stichwort id field (PstId) to photo_multi.

    Given the photo-multi data and the stichwort data file add a stichwort id
    field to photo-multi. Every photo_multi entry gets a 'PstId' list, which
    is empty when no stichwort refers to its PhoId.
    Also returns the stichwort data after removing all entries whose PhoId
    is not in photo_multi.

    :param stichwortFile: path to stichwort data file
    :param photo_multi: photo_multi dict
    :return: dict (and updates photo_multi)
    """
    # setup
    output(u"Adding stichworth to photo")

    # handle stichwort
    output(u'\treading in stichwort...')
    stichwortHeader = 'PstId|PhoId|StiBezeichnungS|StiSynonymS'
    stichwort = helpers.csvFileToDict(stichwortFile, 'PstId', stichwortHeader)
    originalSize = len(stichwort)

    # match each phoId to several stichId
    # removing any entries with invalid phoIds
    # NOTE(review): the `.add` and the `else:` were missing; they are
    # reconstructed from the two comment lines above - confirm.
    photoStichConnection = {}
    # iterate over a copy since entries are deleted from within the loop
    for k, v in list(stichwort.items()):
        phoId = v['PhoId']
        pstId = v['PstId']
        if phoId in photo_multi:
            photoStichConnection.setdefault(phoId, set()).add(pstId)
        else:
            del stichwort[k]
    output('\tstichwort trimmed from %d to %d, found %d phoId' %
           (originalSize, len(stichwort), len(photoStichConnection)))

    # add stichId to photo_multi
    for v in photo_multi.itervalues():
        # pop() so that each connection set is handed out only once
        v['PstId'] = list(photoStichConnection.pop(v['PhoId'], ()))

    # confirm and return
    return stichwort
Esempio n. 12
Esempio n. 13
# Connects photos to objects: maps ObjInventarNrS to ObjIds from objDaten,
# maps PhoId to ObjIds via photoObjDaten, stores the resulting list in the
# PhoObjId field of photo_multi and photoAll, logs unknown ids, trims
# objDaten via trimObjDaten() and returns it.
# NOTE(review): this copy of the function is heavily truncated. The end of
# the signature (the docstring mentions a logFile parameter), the docstring
# delimiters, the right-hand side of `flog =`, the tail of objDatenHeader,
# part of the photoObjDaten csvFileToDict() call and the bodies of several
# `if`/`for` statements are missing. Restore them from the original module.
def photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile, objDatenFile,
    Given the photo_multi data and the phoObjDaten + objDaten data files
    any additional relevant ObjIds are added to the PhoObjId field of the
    photo_multi dict, this field is also converted to a list.
    Also returns objDaten for later use
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :param photoObjDatenFile: path to phoObjDaten data file
    :param objDatenFile: path to objDaten data file
    :param logFile: path to logfile
    :return: dict (and updates photo_multi)
    # setup
    # NOTE(review): the call that opens logFile is truncated on the next line
    flog =, 'w', 'utf-8')  # logfile
    output(u"Combining all ObjId into the photo file...")

    # handle objDaten
    output(u'\treading in objDaten.. (takes a while)')
    # NOTE(review): the last line(s) of this header string are missing
    objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \
                     'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \
                     'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \
                     'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|' \
    objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader)

    # match each objInvNr to several objId
    objInvNr2ObjId = {}  # old oDict
    output(u'\tfinding objInvNr connections...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        objInvNr = v['ObjInventarNrS']
        # NOTE(review): the body of the first `if` and the statement that
        # stores objId in the list are missing
        if not objInvNr:
        if objInvNr not in objInvNr2ObjId.keys():
            objInvNr2ObjId[objInvNr] = []
    output('\tFound %d objInvNr connections in %d objects' %
           (len(objInvNr2ObjId), len(objDaten)))

    # handle photoObjDaten
    photoObjDatenHeader = 'PhmId|AufId|AufAufgabeS|MulId|PhoId|ObjInvNrS'
    # NOTE(review): the argument line(s) between the file and keep= are
    # missing (key column and header are not passed as shown)
    photoObjDaten = helpers.csvFileToDict(photoObjDatenFile,
                                          keep=('PhoId', 'ObjInvNrS'))

    # match each phoId to several objId via the ObjInvNr
    output(u'\tfinding photo-object connections...')
    photoObjConnections = {}
    skipped = []  # ObjInvNr not in ObjDaten
    for k, v in photoObjDaten.iteritems():
        objInvNr = v['ObjInvNrS']
        phoId = v['PhoId']
        # NOTE(review): the bodies of the next two `if` statements are
        # missing; `skipped` is never filled as shown
        if not objInvNr:
        if objInvNr not in objInvNr2ObjId.keys():
        if phoId not in photoObjConnections.keys():
            photoObjConnections[phoId] = []
        photoObjConnections[phoId] += objInvNr2ObjId[objInvNr]
    output('\tFound %d connected photos in %d photoObjDaten entries' %
           (len(photoObjConnections), len(photoObjDaten)))

    # add to photo_multi and photoAll
    photoDicts = (photo_multi, photoAll)
    allBadObjId = []
    for pDict in photoDicts:
        for k, v in pDict.iteritems():
            phoId = v['PhoId']
            objIds = []
            # NOTE(review): the body of the inner `if` and the `else:` that
            # introduces the "combine" branch are missing
            if phoId not in photoObjConnections.keys():
                if v['PhoObjId']:
                # combine relevant objIds
                objIds = photoObjConnections.pop(phoId)  # new connections
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])  # old connection
                objIds = list(set(objIds))  # remove dupes

            # check that all of these actually exists (old realObjOnly())
            # and remove otherwise
            badObjId = []
            for objId in objIds:
                # NOTE(review): the body of this `if` is missing
                if objId not in objDaten.keys():
            if badObjId:
                allBadObjId += badObjId
                # NOTE(review): the body of this `for` is missing
                for badId in badObjId:

            # set new value
            v['PhoObjId'] = objIds

    # log any skipped ObjInvNr
    if skipped:
        skipped = list(set(skipped))  # remove dupes
        output(u"\tthere were %d skipped ObjInvNr, see log (%s)" %
               (len(skipped), logFile))
        flog.write(u'*Unknown objInvs, i.e. ObjInvNrS in photoObjDaten '
                   u'without a match in ObjDaten\n')
        flog.write(u'%s\n' % ', '.join(skipped))

    # log any bad objId
    if allBadObjId:
        output('\tI found some bad objIds. Check the %s' % logFile)
        allBadObjId = list(set(allBadObjId))  # remove dupes
        flog.write(u'* objIds in photo but not in objDaten\n')
        flog.write(u'%s\n' % ', '.join(allBadObjId))

    # trim objDaten
    trimObjDaten(objDaten, photo_multi, photoAll)

    # confirm and return
    # NOTE(review): flog is never closed in the code shown
    return objDaten
Esempio n. 14
# Merges the rows of the multimedia file into the matching rows of the photo
# file (matched on MulPhoId -> PhoId), logs unused rows from either file and
# passes the result through helpers.promptManualUpdate().
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the right-hand side of `flog =`, the tails of tmpHeader and
# photoHeader, the statements filling `namelist`, an `else:` in the combine
# loop and the tail of the promptManualUpdate() call are missing.
def makePhoto_multi(photoFile, multiFile, logFile, tmpFile):
    Given the photo and multimedia data this combines the two into one dict
    :param photoFile: path to photo data file
    :param multiFile: path to multimedia data file
    :param logFile: path to logfile
    :param tmpFile: path to temporary file
    :return: dict
    # setup
    # NOTE(review): the call that opens logFile is truncated on the next line
    flog =, 'w', 'utf-8')  # logfile
    output(u"Combining photo and multimedia file for unique files...")
    # \w and \h are not escape sequences, so those backslashes stay literal;
    # the final \\ is a single backslash
    pathToTrim = u'R:\web\hires\\'
    # NOTE(review): the last line(s) of this header string are missing
    tmpHeader = 'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                'PhoSwdS|AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|' \

    # handle multimedia
    multiHeader = 'MulId|MulPhoId|MulPfadS|MulDateiS|MulExtentS'
    multi = helpers.csvFileToDict(multiFile, 'MulId', multiHeader)

    # check that filename is unique
    flog.write('* Same files used by different PhoId, format is PhoId/MulId\n')
    logged = False
    # NOTE(review): neither list is appended to in the code shown, so the
    # duplicate check below can never trigger
    namelist = []
    mulPhoIdList = []
    for k, v in multi.iteritems():
        name = u'%s\\%s.%s' % (v['MulPfadS'], v['MulDateiS'], v['MulExtentS'])
        if name in namelist:
            logged = True
            # NOTE(review): 'MullId' is not a column of multiHeader ('MulId'
            # is) - looks like a typo that would raise KeyError; confirm
            flog.write('%s/%s\n' % (v['MulPhoId'], v['MullId']))
    output(u'\tmultimedia: %d' % len(multi))
    if not logged:
        flog.write(u'None =)\n')

    # handle photo
    # @toDO add duplicate check to cleanup script
    # NOTE(review): the last line(s) of this header string are missing
    photoHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
    photo = helpers.csvFileToDict(photoFile, 'PhoId', photoHeader)
    output(u'\tphoto: %d' % len(photo))

    # combine
    combined = {}
    flog.write(u'* unused rows in multimedia\n')
    logged = False
    for k, v in multi.iteritems():
        phoId = v['MulPhoId']
        mulId = v['MulId']
        v['MulPfadS'] = v['MulPfadS'].replace(pathToTrim, u'')  # trim filepath
        v['MulExtentS'] = u''  # MulExtentS is always wrong
        if phoId not in photo.keys():
            logged = True
            flog.write(u'%s\n' % v)
        elif not photo[phoId]['MulId'] == v['MulId']:
            raise MyError("phoId matched but to wrong mulId: p:%s m_found:%s, "
                          "m_expected %s" %
                          (phoId, photo[phoId]['MulId'], mulId))
            # NOTE(review): an `else:` was probably lost above; as shown the
            # four statements below are unreachable after the raise
            del v['MulPhoId'], v['MulId']
            combo = photo.pop(phoId)  # move out of photo
            combo.update(v)  # add contents from multi
            combined[phoId] = combo
    if not logged:
        flog.write(u'None =)\n')

    # log any unused rows in photo
    # whatever is still in `photo` was not popped in the combine loop
    flog.write(u'* unused rows in photo\n')
    logged = False
    for k, v in photo.iteritems():
        logged = True
        flog.write(u'%s\n' % v)
    if not logged:
        flog.write(u'None =)\n')

    # check if anything needs to be manually handled
    output(u"Read the log (%s)" % logFile)
    # NOTE(review): the remaining argument(s) of promptManualUpdate() are
    # missing; flog is never closed in the code shown
    combined = helpers.promptManualUpdate(combined, tmpFile, tmpHeader,

    return combined
Esempio n. 15
# Reads the objMass and objMultiple files, deletes rows whose object id is
# not in objDaten, and gives every objDaten entry a 'massId' and a 'mulId'
# list taken from the per-object connection dicts.
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the tails of both header strings, the tail of the objMultiple
# csvFileToDict() call and the statements that skip deleted rows and fill
# the connection sets are missing.
def mulMass_add(objMassFile, objMultipleFile, objDaten):
    Given the objMass and the objMultiple data file and the objDaten data
    add an objMass and an objMultiple field to objDaten.
    Also returns the objMass and objMultiple data after
    * removing any entries where objId not in objDaten
    :param objMassFile: path to objMass data file
    :param objMultipleFile: path to objMultiple data file
    :param objDaten: objDaten dict
    :return: dict, dict (and updates objDaten)
    # setup
    output(u"Putting objMultiple and objMass into objDaten...")

    # handle objMass
    output('\treading ObjMass into dictionary... (yes this takes some time)')
    # NOTE(review): the last line(s) of this header string are missing
    objMassHeader = 'ObmId|ObmObjId|ObmTypMasseS|ObmMasseS|' \
    objMass = helpers.csvFileToDict(objMassFile, 'ObmId', objMassHeader)
    originalSize = len(objMass)

    # invert to get per objId connections
    # and remove any entries where objId not in objDaten
    output('\tinverting objMass...')
    objIdMassConnection = {}
    # items() (not iteritems()) so that entries can be deleted while looping
    for k, v in objMass.items():
        obmId = v['ObmId']
        objId = v['ObmObjId']
        # NOTE(review): nothing skips the rest of the iteration after the
        # delete, and obmId is never added to the set, in the code shown
        if objId not in objDaten.keys():
            del objMass[k]
        if objId not in objIdMassConnection.keys():
            objIdMassConnection[objId] = set([])
    output('\tobjMass: reduced from %d to %d' % (originalSize, len(objMass)))

    # handle objMultiple
    output('\treading ObjMultiple into dictionary... (as does this)')
    # NOTE(review): the tail of this header string and the final argument(s)
    # of csvFileToDict() are missing
    objMultipleHeader = 'OmuId|OmuObjId|OmuTypS|OmuBemerkungM|OmuInhalt01M|' \
    objMultiple = helpers.csvFileToDict(objMultipleFile, 'OmuId',
    originalSize = len(objMultiple)

    # invert to get per objId connections
    # and remove any entries where objId not in objDaten
    output('\tinverting objIdMultiple...')
    objIdMultipleConnection = {}
    for k, v in objMultiple.items():
        omulId = v['OmuId']
        objId = v['OmuObjId']
        # NOTE(review): same gaps as in the objMass loop above
        if objId not in objDaten.keys():
            del objMultiple[k]
        if objId not in objIdMultipleConnection.keys():
            objIdMultipleConnection[objId] = set([])
    output('\tobjMultiple: reduced from %d to %d' %
           (originalSize, len(objMultiple)))

    # adding ObjMul and ObjMass id to objDaten
    output('\tadding ObjMul and ObjMass id to objDaten... (and this)')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        # defaults, overwritten below when a connection exists
        v['massId'] = []
        v['mulId'] = []
        if objId in objIdMassConnection.keys():
            v['massId'] = list(objIdMassConnection.pop(objId))
        if objId in objIdMultipleConnection.keys():
            v['mulId'] = list(objIdMultipleConnection.pop(objId))

    return objMass, objMultiple
Esempio n. 16
# Reads the kuenstler file, drops entries with unwanted roles or names, keeps
# one entry per KueId, gives every objDaten entry a 'role:roleCmt:kueId'
# list, then swaps the KudOrtS/KudLandS values, runs the name and years
# through extractKuenstlerYear() and deletes the fields in droppedFields.
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the right-hand side of `flog =`, the tails of droppedFields and
# kuenstlerHeader, the tail of the csvFileToDict() call and several
# statements inside the loops are missing.
def kuenstler_objDaten(kuenstlerFile, objDaten, logFile):
    Given the kuenstler data file and the objDaten data add a kuenstler id
    field to objDaten.
    Also returns the kuenstler data after
    * removing certain irrelevant roles and dummy entries
    * combining all objIds for the same kueId
    * standardising years
    * dropping a lot of unneeded fields
    :param kuenstlerFile: path to kuenstler data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    # setup
    # NOTE(review): the call that opens logFile is truncated on the next line
    flog =, 'w', 'utf-8')  # logfile
    output(u"Crunching kuenstler...")
    dummyNames = (u'ingen uppgift', )
    badRoles = (u'Leverantör', u'Auktion', u'Förmedlare', u'Givare',
                u'Återförsäljare', u'Konservator')
    badRoleCmts = (u'Förpaktare, kontrollör', u'av kopia')
    # NOTE(review): the end of this tuple is missing
    droppedFields = ('OkuId', 'ObjAufId', 'AufAufgabeS', 'OkuArtS',
                     'OkuFunktionS', 'OkuValidierungS', 'KudArtS', 'MulId',

    # handle kuenstler
    # NOTE(review): the tail of this header string and the final argument(s)
    # of csvFileToDict() are missing
    kuenstlerHeader = 'OkuId|ObjId|ObjAufId|AufAufgabeS|KueId|KueVorNameS|' \
                      'KueNameS|OkuArtS|OkuFunktionS|OkuValidierungS|KudArtS|' \
                      'KudDatierungS|KudJahrVonL|KudJahrBisL|KudOrtS|KudLandS|' \
    kuenstler = helpers.csvFileToDict(kuenstlerFile, ('OkuId', 'MulId'),
    originalSize = len(kuenstler)

    # collect all kueId and drop any with invalid title or role
    # also invert to get per objId connections
    # @toDO: Is keeping objId in kuenstler really needed?
    #        Otherwise populate objIdConnection here
    foundKueId = {}
    objIdConnection = {}
    for k, v in kuenstler.items():  # allow removing entries from within loop
        kueId = v['KueId']
        objId = v['ObjId']
        fName = v['KueVorNameS']
        lName = v['KueNameS']
        role = v['OkuArtS']
        roleCmt = v['OkuFunktionS']

        # filter out any undesired entries
        # NOTE(review): nothing skips the rest of the iteration after the
        # delete; as shown, kuenstler[k] further down would then raise
        # KeyError - a `continue` was probably lost here
        if role in badRoles or \
                roleCmt in badRoleCmts or \
                len(fName) + len(lName) == 0 or \
                lName in dummyNames:
            del kuenstler[k]

        # send unique role/kueId combo for objid
        kueCombo = u'%s:%s:%s' % (role, roleCmt, kueId)
        # NOTE(review): kueCombo is never added to the set in the code shown
        if objId not in objIdConnection.keys():
            objIdConnection[objId] = set([])

        # keep only one entry per unique kueId
        if kueId not in foundKueId.keys():  # keep this entry
            foundKueId[kueId] = k
            kuenstler[k]['ObjId'] = set([objId, ])
        else:  # keep only objId part of this entry
            # NOTE(review): the statement that hands objId over to the kept
            # entry appears to be missing before the delete
            del kuenstler[k]
    output('\tkueIds: reduced from %d to %d' % (originalSize, len(kuenstler)))

    # add to objDaten
    output('\tadding kueId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['role:roleCmt:kueId'] = []
        if objId in objIdConnection.keys():
            v['role:roleCmt:kueId'] = list(objIdConnection.pop(objId))

    # further cleanup of kuenstler
    # correcting ort/land entries
    # stripping years from name
    # dropping a bunch of fields
    output('\tfurther cleanup of kuenstler...')
    for k, v in kuenstler.iteritems():
        # the two reads are deliberately crossed, see the trailing comments
        land = v['KudOrtS']  # missnamed in original database
        ort = v['KudLandS']  # missnamed in original database
        lName = v['KueNameS']
        bYear = v['KudJahrVonL']
        dYear = v['KudJahrBisL']
        objIds = v['ObjId']

        # correct missnaming in original database
        v['KudOrtS'] = ort
        v['KudLandS'] = land

        # convert set to list
        v['ObjId'] = list(objIds)

        # take yearinfo out of name, and store in year
        lName, bYear, dYear, log = extractKuenstlerYear(lName, bYear, dYear)
        # NOTE(review): the body of this `if` is missing
        if log:
        v['KueNameS'] = lName
        v['KudJahrVonL'] = bYear
        v['KudJahrBisL'] = dYear

        for field in droppedFields:
            del v[field]

    # NOTE(review): flog is never closed in the code shown
    return kuenstler
Esempio n. 17
Esempio n. 18
# Builds a description and a filename ('<descr> - <museum> - <phoId>') for
# every photo, taking the description from PhoBeschreibungM or, failing
# that, from the single connected object. Problem cases are written to the
# log. Returns the descriptions together with the photo dict.
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the right-hand side of `flog =`, the tails of both header
# strings and the bodies of the fallback branches are missing. The docstring
# documents `multiFile`, but the parameter is named `objDatenFile`.
def makeDescriptions(photoFile, objDatenFile, logFile):
    Given the photo and objDaten data this uses the two generate descriptions.
    Also returns photo for later use
    :param photoFile: path to photo data file
    :param multiFile: path to multimedia data file
    :param logFile: path to logfile
    :return: dict, dict
    # setup
    # NOTE(review): the call that opens logFile is truncated on the next line
    flog =, 'w', 'utf-8')  # logfile

    # load input files
    # NOTE(review): the last line(s) of this header string are missing
    photoHeader = 'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                  'PhoSwdS|AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|' \
    photo = helpers.csvFileToDict(photoFile, 'PhoId', photoHeader,
                                  lists=('PhoObjId', ))

    # NOTE(review): the last line(s) of this header string are missing
    objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \
                     'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \
                     'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \
                     'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|ObjFeld06M|' \
                     'ObjReserve01M|ausId|related|ergId|role:roleCmt:kueId|' \
    objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader)

    # start process
    skipLog = []  # no photoDescr, no objectDescr
    manyLog = []  # no photoDescr, many objects
    noHopeLog = []  # no photoDescr, no objects
    descriptions = {}
    uniques = set([])  # unique filenames
    for k, v in photo.iteritems():
        phoId = v['PhoId']
        objIds = v['PhoObjId']
        museum = v['PhoSwdS']
        if not museum:
            museum = u'LSH'
        phoBes = getDescFromPhoBes(v['PhoBeschreibungM'])

        if not phoBes:  # try to get description from object
            if len(objIds) == 1 and objIds[0]:
                # exactly one object
                phoBes = getDescFromObj(objDaten[objIds[0]])
                # NOTE(review): the bodies of the branches below are missing,
                # as is the `else:` for the "no objects" case; none of the
                # three log lists is filled in the code shown
                if not phoBes:
                    # failed to make a description from the object
            elif len(objIds) > 1:
                # multiple objects
                # no objects

        if phoBes:
            filename = u'%s - %s - %s' % (phoBes, museum, phoId)
            # NOTE(review): `uniques` is never added to in the code shown
            descriptions[phoId] = {'descr': phoBes,
                                   'filename': filename}

    # check uniqueness
    if len(uniques) != len(descriptions):
        # NOTE(review): the count is computed as uniques minus descriptions;
        # confirm that this is the intended order of the operands
        output(u'Descriptions are not unique!!!!: %d were duplicate' %
               (len(uniques) - len(descriptions)))

    # output logs
    if skipLog:
        flog.write('* No-objectDescr and No-photoDescr (phoIds)\n')
        flog.write('%s\n' % '\n'.join(skipLog))
    if manyLog:
        flog.write('* Many objects and No-photoDescr (phoIds)\n')
        flog.write('%s\n' % '\n'.join(manyLog))
    if noHopeLog:
        flog.write('* No-objects and No-photoDescr (phoIds)\n')
        flog.write('%s\n' % '\n'.join(noHopeLog))

    # wrap up
    # photos without a description count as having a problem
    output(u'Processed %d images out of which %d has some type of problem. '
           u'See log (%s) for more info.' %
           (len(photo), len(photo) - len(descriptions), logFile))
    # NOTE(review): flog is never closed in the code shown
    return descriptions, photo
Esempio n. 19
# Reads the ereignis file, drops entries without a title, keeps one entry per
# ErgId, decodes/converts the url in ErgArtS, and gives every objDaten entry
# an 'ergId' list taken from the per-object connection dict.
# NOTE(review): this copy of the function is incomplete. The docstring
# delimiters, the right-hand side of `flog =`, the end of the `set([`
# expression and the statements that fill the connection lists are missing.
def ereignis_objDaten(ereignisFile, objDaten, logFile):
    Given the ereignis data file and the objDaten data add a ereignis id
    field to objDaten.
    Also returns the ereignis data after
    * combining all objIds for the same ergId
    * dropping EroId
    :param ereignisFile: path to eregnis data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    # setup
    # NOTE(review): the call that opens logFile is truncated on the next line
    flog =, 'w', 'utf-8')  # logfile
    output(u"Trimming eregnis and adding eregnis to ObjDaten...")

    # handle eregnis
    ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS'
    ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader)
    originalSize = len(ereignis)

    # collect all ergId and drop any with invalid title
    # @toDO: Is keeping objId in eregnis really needed?
    #        Otherwise populate objIdConnection here
    foundErgId = {}
    for k, v in ereignis.items():  # allow removing entries from within loop
        ergId = v['ErgId']
        objId = v['EroObjId']
        title = v['ErgKurztitelS']
        if not title:  # remove empty
            del ereignis[k]
        elif ergId not in foundErgId.keys():  # keep this entry
            foundErgId[ergId] = k
            # NOTE(review): the rest of this set([...]) expression is missing
            ereignis[k]['EroObjId'] = set([
            ereignis[k].pop('EroId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            # NOTE(review): the statement that hands objId over to the kept
            # entry appears to be missing before the delete
            del ereignis[k]
    output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis)))

    # handle urls in ereignis and convert set to list
    for k, v in ereignis.iteritems():
        objIds = v['EroObjId']
        url = v['ErgArtS']

        # convert set to list
        v['EroObjId'] = list(objIds)

        # handle urls
        if u'%' in url:
            url = helpers.urldecode_utf8(url)
        # convert external links to internal
        if 'wikipedia' in url:
            url = helpers.external_2_internal_link(url)
        elif url:
            # non-empty urls without 'wikipedia' are only logged, not changed
            flog.write(u'weird url: %s\n' % url)
        v['ErgArtS'] = url

    # invert to get per objId connections
    objIdConnection = {}
    for k, v in ereignis.iteritems():
        ergId = v['ErgId']
        objIds = v['EroObjId']
        for objId in objIds:
            # NOTE(review): ergId is never appended to the list in the code
            # shown
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []

    # add to objDaten
    output('\tadding ergId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ergId'] = []
        if objId in objIdConnection.keys():
            v['ergId'] = objIdConnection.pop(objId)

    # NOTE(review): flog is never closed in the code shown
    return ereignis
Esempio n. 20
Esempio n. 21
Esempio n. 22
def ausstellung_objDaten(austellungFile, objDaten):
    # NOTE(review): the description lines below are not wrapped in docstring
    # quotes in this copy; the delimiters look like they were lost and have
    # to be restored before this function can be parsed.
    Given the austellung data file and the objDaten data add a austellung id
    field to objDaten.
    Also returns the austellung data after
    * adding a std_year field
    * combining all objIds for the same ausId
    * dropping AobId
    :param austellungFile: path to austellung data file
    :param objDaten: objDaten dict
    :return: dict (and updates objDaten)
    # often requires manual fixing prior to crunch
    # Blocks on the operator (via helpers.verboseInput) before any data is
    # read, so the file on disk can still be corrected at this point.
    helpers.verboseInput(u"Confirm that any year formatting issues mentioned "
                         u"in the analysis log have been corrected and the "
                         u"updated ausstellung file saved...\n"
                         u" pressing enter when done")

    # setup
    # Titles listed here are treated like empty titles further down: rows
    # carrying them are removed from the result.
    dummyTitles = (
        u'reparation', u'utställning', u'lån för undersökning',
        u'OBS! Testpost för admin - utställning, export wikimedia commons',
        u'lån till Frankrike 1947', u'test karin 20100520',
        u'test 20100629 (en post skapad för administrativa tester)',
        u'tennföremål 8 st till Strömsholm', u'utlån f justering av urverk')
    output(u"Trimming ausstellung and adding ausstellung to ObjDaten...")

    # handle ausstellung
    # NOTE(review): the header literal ends in a line continuation whose
    # second half is absent, and the csvFileToDict call is left unclosed --
    # both look truncated. The code below also reads 'AusDatumBisD' and
    # 'AobObjId', so the full header presumably lists them; confirm against
    # the upstream file.
    austellungHeader = 'AobId|AusId|AusTitelS|AusOrtS|AusJahrS|AusDatumVonD|' \
    austellung = helpers.csvFileToDict(austellungFile, 'AobId',
    originalSize = len(austellung)

    # collect all ausId and drop any with invalid title
    # @toDO: Is keeping objId in austellung really needed?
    #        Otherwise populate objIdConnection here
    # foundAusId maps each ausId to the key of the first row seen for it.
    foundAusId = {}
    for k, v in austellung.items():  # allow removing entries from within loop
        ausId = v['AusId']
        objId = v['AobObjId']
        title = v['AusTitelS']
        if not title or title in dummyTitles:  # remove empty/dummy
            del austellung[k]
        elif ausId not in foundAusId:  # keep this entry
            foundAusId[ausId] = k
            # NOTE(review): the set literal below is never closed; presumably
            # it wrapped this row's objId -- confirm.
            austellung[k]['AobObjId'] = set([
            austellung[k].pop('AobId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            # NOTE(review): nothing visible stores objId before the row is
            # deleted, although the comment above says it is kept. A line
            # adding it to the entry recorded in foundAusId appears to be
            # missing -- confirm.
            del austellung[k]
    output('\taustellung reduced from %d to %d entries' %
           (originalSize, len(austellung)))

    # populate std_year
    output('\tstandardising years...')
    for k, v in austellung.iteritems():
        year = v['AusJahrS']
        # strip the midnight time component so only the date part remains
        yfrom = v['AusDatumVonD'].replace(u' 00:00:00', u'').strip()
        ytil = v['AusDatumBisD'].replace(u' 00:00:00', u'').strip()
        v['std_year'] = stdAustellungYear(year, yfrom, ytil)
        # to match with pre-redux results. Could possibly be dropped instead?
        # (the trimmed date strings are written back over the raw values)
        v['AusDatumVonD'] = yfrom
        v['AusDatumBisD'] = ytil

    # invert to get per objId connections
    # and convert set to list
    objIdConnection = {}
    for k, v in austellung.iteritems():
        ausId = v['AusId']
        objIds = v['AobObjId']
        v['AobObjId'] = list(objIds)
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            # NOTE(review): ausId is read above but never stored in the
            # visible code, so every list stays empty. Presumably an append
            # of ausId to objIdConnection[objId] was lost here -- confirm.

    output('\tadding ausId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        # every objDaten entry gets an 'ausId' list, empty when unconnected
        v['ausId'] = []
        if objId in objIdConnection.keys():
            # pop() hands the list over and removes it from objIdConnection
            v['ausId'] = objIdConnection.pop(objId)

    return austellung
Esempio n. 23
Esempio n. 24
def run(outPath=None, dataPath=None, mappingsPath=None,
        commonsPrefix=None, configPath=u'config.json'):
    # NOTE(review): the description lines below are not wrapped in docstring
    # quotes in this copy; the delimiters look like they were lost and have
    # to be restored before this function can be parsed.
    Define a list of pages and output files
    where page has the format Commons:Batch uploading/LSH/*
    and outputfile the format: commons-*.csv
    # set defaults unless overridden
    # (any falsy argument falls back to the module-level constant)
    outPath = outPath or OUT_PATH
    dataPath = dataPath or DATA_PATH
    mappingsPath = mappingsPath or MAPPING_FOLDER
    commonsPrefix = commonsPrefix or COMMONS_PREFIX

    # Key: page name appended to commonsPrefix; value: suffix used in the
    # 'commons-<value>.csv' output file name.
    # NOTE(review): the dict literal is never closed in this copy.
    pages = {u'People': u'People',
             u'Events': u'Events',
             u'ObjKeywords': u'ObjKeywords',
             u'Keywords': u'Keywords',  # stichwort
             u'Materials': u'Materials',
             u'Places': u'Places',
             u'Photographers': u'Photographers'
    # create out_path if it doesn't exist
    # NOTE(review): the condition below has no body here; the statement that
    # creates the directory appears to be missing.
    if not os.path.isdir(outPath):

    # fetch, parse and save each page
    comApi = helpers.openConnection(configPath)
    for k, v in pages.iteritems():
        comPage = u'%s/%s' % (commonsPrefix, k)
        contents = comApi.getPage(comPage)
        # getPage's result is indexed by the page title that was requested
        units = parseEntries(contents[comPage])
        outdata = formatOutput(units, k)
        outFile = os.path.join(outPath, u'commons-%s.csv' % v)
        # NOTE(review): the assignment below is malformed (the call that
        # opens outFile is gone) and no visible line writes outdata or closes
        # the handle -- restore from the upstream file.
        out =, 'w', 'utf8')
        output(u'Created %s' % outFile)

    # need to do filenames differently
    mappingFile = os.path.join(mappingsPath, u'Filenames.txt')
    comPage = u'%s/Filenames' % commonsPrefix
    contents = comApi.getPage(comPage)

    # identify changes
    units, allEntries = parseFilenameEntries(contents[comPage])
    # the local files are only rewritten when at least one unit was returned
    if units:
        # load old filenames
        filenamesHeader = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
        filenamesFile = os.path.join(dataPath, u'filenames.csv')
        # NOTE(review): the call below is left unclosed; presumably its last
        # argument was filenamesHeader -- confirm.
        oldFilenames = helpers.csvFileToDict(filenamesFile, 'PhoId',
        for unit in units:
            pho_id = unit[u'phoId']
            # NOTE(review): nothing visible skips the unit after this
            # message, so the lookup that follows would raise KeyError for
            # an unknown id -- confirm whether a skip/abort line was lost.
            if pho_id not in oldFilenames.keys():
                print u'could not find id in old: %s, %s' % \
                      (pho_id, unit[u'generated'])
            old_desc = oldFilenames[pho_id][u'filename']
            # newDesc = oldDesc.replace(unit[u'generated'], unit[u'improved'])
            # a safer implementation where new description is appended to
            # old ending. I.e. "- Museum - idNo"
            # NOTE(review): the second format argument (the part taken from
            # old_desc) is missing and the tuple is unclosed.
            new_desc = u'%s %s' % (unit[u'improved'],
            if old_desc == new_desc:
                # indicator that commons file may not having been updated which
                # may cause more complex problems which are hard to test for
                print u'did you run the updater a second time without ' \
                      u'first updating the filenames table on Commons?'
            # the warning above is informational; the entry is updated anyway
            oldFilenames[pho_id][u'filename'] = new_desc

        # overwrite old filenames and old mapping
        # new filename.csv file w. header
        helpers.dictToCsvFile(filenamesFile, oldFilenames, filenamesHeader)
        # new Commons mapping file needs a dict with all descriptions
        mapping_dict = {}
        for phoId, v in oldFilenames.iteritems():
            # only the first element returned by splitFilename is kept
            descr = splitFilename(v[u'filename'])[0]
            mapping_dict[phoId] = {'descr': descr}

        Filenames.commonsOutput(mapping_dict, mappingFile, allEntries)
        output(u'Updated %s and produced a new mappingfile %s. Please upload '
               u'the new one to Commons.' % (filenamesFile, mappingFile))
Esempio n. 25
Esempio n. 26
