def makePhotoAll(photoAllFile, photo_multi, logFile):
    """
    Given the photoAll data file, read it and drop any entries without a
    Commons connection. Also simplify the data for the remaining entries.

    @todo: if dupes are found, prompt for manual cleanup, then re-run
           makePhotoAll() so that the crash isn't complete.

    :param photoAllFile: path to photoAll data file
    :param photo_multi: photo_multi dict
    :param logFile: path to logfile
    :return: dict
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll "
                         u"analysis log have been corrected and the updated "
                         u"photoAll file saved...\n"
                         u"...by pressing enter when done")

    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Loading photoAll...")
    photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                     'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'),
                                     photoAllHeader)
    originalSize = len(photoAll)

    for k, v in photoAll.items():  # allow removing entries from within loop
        link = v['PhoSystematikS']
        # drop any entries without files
        if not link:
            del photoAll[k]
            continue
        # simplify link down to the bare Commons filename
        if '%' in link:
            link = helpers.urldecode_utf8(link)
        link = helpers.external_2_internal_link(link, project='wikimedia')
        link = link[len('[[:commons:File:'):-len(']]')]
        v['PhoSystematikS'] = link
    output('PhotoAll reduced from %d to %d entries'
           % (originalSize, len(photoAll)))

    # check that no PhoId:MulId from photo_multi occurs in photoAll
    dupes = []
    for phoId in photo_multi.keys():
        phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId'])
        if phoMul in photoAll.keys():
            dupes.append(phoMul)
    if dupes:
        output(u'Found duplicates between photoAll and photo_multi. '
               u'This will most likely mess things up. Check the log at '
               u'%s for details.' % logFile)
        flog.write(u'* duplicates found in photo and photo_all\n'
                   u'phoId:MulId|commonsFile\n')
        for d in dupes:
            flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS']))

    flog.close()
    return photoAll
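# A minimal usage sketch for makePhotoAll(), assuming hypothetical file paths
# and a hypothetical photo_multi entry. The real photo_multi dict comes from
# an earlier crunch step; all makePhotoAll() relies on is that it is keyed on
# PhoId and that each value carries a 'MulId' field.
#
#     photo_multi = {u'12345': {u'MulId': u'67890'}}  # hypothetical entry
#     photoAll = makePhotoAll(u'data/photoAll.csv', photo_multi,
#                             u'logs/photoAll_crunch.log')
#     # photoAll is now keyed on PhoId:MulId and 'PhoSystematikS' holds a
#     # bare Commons filename instead of a url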
def analysePhotoAll(f, file_in):
    """Check that all PhoSystematikS are Commons files and each is unique."""
    header, lines = helpers.open_csv_file(file_in)
    badUrls = []
    dupes = []
    sources = {}
    for l in lines:
        if not l:
            continue
        col = l.split('|')
        source = col[8].strip()  # PhoSystematikS
        phoId = col[0]  # PhoId
        mulId = col[5]  # MulId
        phoMul = u'%s:%s' % (phoId, mulId)
        if source:
            if '%' in source:
                source = helpers.urldecode_utf8(source)
            internal = helpers.external_2_internal_link(source,
                                                        project='wikimedia')
            if not internal.startswith('[[:commons:File:'):
                badUrls.append((phoMul, source))
            else:
                internal = internal[len('[[:commons:File:'):-len(']]')]
                if internal in sources.keys():
                    dupes.append((phoMul, sources[internal],
                                  internal.replace(' ', '_')))
                sources[internal] = phoMul

    f.write(u'\n\n<!--From: %s -->\n' % file_in)
    if badUrls:
        f.write(u'===BadUrls===\n')
        for b in badUrls:
            f.write(u'%s: %s\n' % b)
    if dupes:
        f.write(u'===DuplicateUrls===\n')
        f.write(u'phoId:mulId|phoId:mulId|Filename\n')
        for b in dupes:
            f.write(u'%s|%s|%s\n' % b)
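# A minimal usage sketch for analysePhotoAll(), assuming hypothetical paths.
# The function expects an already opened, writable unicode file object for
# its report and appends wiki-style '===...===' sections for any bad or
# duplicated Commons urls it finds.
#
#     report = codecs.open(u'logs/photoAll_analysis.log', 'w', 'utf-8')
#     analysePhotoAll(report, u'data/photoAll.csv')
#     report.close()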
def ereignis_objDaten(ereignisFile, objDaten, logFile):
    """
    Given the ereignis data file and the objDaten data, add an ereignis
    id field to objDaten. Also returns the ereignis data after
    * combining all objIds for the same ergId
    * dropping EroId

    :param ereignisFile: path to ereignis data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Trimming ereignis and adding ereignis to ObjDaten...")

    # handle ereignis
    ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS'
    ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader)
    originalSize = len(ereignis)

    # collect all ergId and drop any with invalid title
    # @todo: Is keeping objId in ereignis really needed?
    #        Otherwise populate objIdConnection here
    foundErgId = {}
    for k, v in ereignis.items():  # allow removing entries from within loop
        ergId = v['ErgId']
        objId = v['EroObjId']
        title = v['ErgKurztitelS']
        if not title:
            # remove empty
            del ereignis[k]
        elif ergId not in foundErgId.keys():
            # keep this entry
            foundErgId[ergId] = k
            ereignis[k]['EroObjId'] = set([objId, ])
            ereignis[k].pop('EroId')  # drop unnecessary id
        else:
            # keep only the objId part of this entry
            ereignis[foundErgId[ergId]]['EroObjId'].add(objId)
            del ereignis[k]
    output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis)))

    # handle urls in ereignis and convert set to list
    for k, v in ereignis.iteritems():
        objIds = v['EroObjId']
        url = v['ErgArtS']
        # convert set to list
        v['EroObjId'] = list(objIds)
        # handle urls
        if u'%' in url:
            url = helpers.urldecode_utf8(url)
        # convert external links to internal
        if 'wikipedia' in url:
            url = helpers.external_2_internal_link(url)
        elif url:
            flog.write(u'weird url: %s\n' % url)
        v['ErgArtS'] = url

    # invert to get per objId connections
    objIdConnection = {}
    for k, v in ereignis.iteritems():
        ergId = v['ErgId']
        objIds = v['EroObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ergId)

    # add to objDaten
    output('\tadding ergId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ergId'] = []
        if objId in objIdConnection.keys():
            v['ergId'] = objIdConnection.pop(objId)

    flog.close()
    output(u"...done")
    return ereignis
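# A minimal usage sketch for ereignis_objDaten(), assuming hypothetical paths
# and a hypothetical objDaten entry. All the function relies on is that each
# objDaten value carries an 'ObjId' field, since that is what it joins on.
#
#     objDaten = {u'obj1': {u'ObjId': u'123'}}  # hypothetical entry
#     ereignis = ereignis_objDaten(u'data/ereignis.csv', objDaten,
#                                  u'logs/ereignis_crunch.log')
#     # each objDaten entry has gained an 'ergId' list (possibly empty) and
#     # ereignis now holds one entry per ergId with 'EroObjId' as a list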
def test_external_2_internal_link_non_wikipedia_string_with_param(self):
    input_value = u'http://commons.wikimedia.org/wiki/Some_title'
    expected = u'[[:commons:Some title]]'
    result = helpers.external_2_internal_link(input_value,
                                              project='wikimedia')
    self.assertEqual(result, expected)

def test_external_2_internal_link_non_wikipedia_string(self):
    input_value = u'http://se.wikimedia.org/wiki/Some_title'
    expected = u'http://se.wikimedia.org/wiki/Some_title'
    self.assertEqual(helpers.external_2_internal_link(input_value),
                     expected)

def test_external_2_internal_link_non_wiki_url_string(self):
    input_value = u'http://not.a.wiki/Some_title'
    expected = u'http://not.a.wiki/Some_title'
    self.assertEqual(helpers.external_2_internal_link(input_value),
                     expected)

def test_external_2_internal_link_https_svwiki_string(self):
    input_value = u'https://sv.wikipedia.org/wiki/Some_title'
    expected = u'[[:sv:Some title]]'
    self.assertEqual(helpers.external_2_internal_link(input_value),
                     expected)

def test_external_2_internal_link_on_empty_string(self):
    self.assertEqual(helpers.external_2_internal_link(''), '')
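# A hedged sketch of a possible additional test, mirroring how the crunchers
# above decode percent-encoded urls with helpers.urldecode_utf8() before
# calling helpers.external_2_internal_link(). The expected value here is an
# assumption based on that pattern, not an existing test.
#
#     def test_external_2_internal_link_percent_encoded_string(self):
#         input_value = u'http://commons.wikimedia.org/wiki/Some%20title'
#         decoded = helpers.urldecode_utf8(input_value)  # assumed decoding
#         expected = u'[[:commons:Some title]]'
#         self.assertEqual(
#             helpers.external_2_internal_link(decoded, project='wikimedia'),
#             expected)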