def makeEvents(A, oDict):
    '''
    Populate mapping-tables for Events
    Analysis of Ereignis
    '''
    # oDict gives frequency of each objId
    # A.objD[k][u'ergId'] gives the exhibit id(s) of each object
    # A.ereignisD[k][u'ErgKurztitelS'] gives the title
    # A.ereignisD[k][u'ErgArtS'] gives the wikilink

    # get frequency for each exhibit
    eventFreq = {}
    for k, v in oDict.iteritems():
        ergIds = A.objD[k][u'ergId']
        if ergIds:
            ergIds = ergIds.split(';')
            for e in ergIds:
                if e in eventFreq.keys():
                    eventFreq[e] += v
                else:
                    eventFreq[e] = v

    # get frequency for each event title
    events = {}
    for k, v in eventFreq.iteritems():
        title = A.ereignisD[k][u'ErgKurztitelS']
        link = A.ereignisD[k][u'ErgArtS']
        if title in events.keys():
            events[title][u'freq'] += v
            if link != events[title][u'link']:
                output(u'Found two events with the same title but different '
                       u'links %s' % k)
        else:
            events[title] = {u'link': link, u'freq': v}
    return events

def makeFilenames(descriptions, photo, filenamesFile):
    """
    Given file descriptions, output them as csv for later import.

    :param descriptions: dict of descriptions with phoId as key
    :param photo: the photo data
    :param filenamesFile: the target file for output
    :return: None
    """
    # setup
    filenamesHeader = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'

    # make a dict to be able to reuse helpers.dictToCsvFile()
    filenames = {}
    for phoId, v in descriptions.iteritems():
        filenames[phoId] = {
            'PhoId': phoId,
            'MulId': photo[phoId]['MulId'],
            'MulPfadS': photo[phoId]['MulPfadS'],
            'MulDateiS': photo[phoId]['MulDateiS'],
            'filename': v['filename'],
            'ext': ''
        }

    # output
    helpers.dictToCsvFile(filenamesFile, filenames, filenamesHeader)
    output(u'Created %s' % filenamesFile)

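# helpers.dictToCsvFile() is defined elsewhere in the project. The sketch
# below is only an assumption of what it roughly does (one pipe-separated row
# per entry, with the columns taken from the header string), kept here to
# clarify the csv format used throughout these scripts. Name and signature of
# the sketch are hypothetical.
def dictToCsvFile_sketch(filename, d, header, delimiter='|'):
    """Sketch: write a dict of dicts as a pipe-separated csv file."""
    cols = header.split(delimiter)
    f = codecs.open(filename, 'w', 'utf-8')
    f.write(u'%s\n' % header)
    for key, row in d.iteritems():
        f.write(u'%s\n' % delimiter.join(unicode(row[c]) for c in cols))
    f.close()
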
def writePhotographers(filename, dDict):
    '''
    output photographers in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Photographer|' \
             u'creator=|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          u'|name = %s\n' \
          u'|frequency = %d\n' \
          u'|creator = %s\n' \
          u'|category = %s\n' \
          u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n\n' % CSV_FILES[u'photo'] \
            + u'===Photographers===\n'

    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], val[u'creator'], val[u'cat']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

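# helpers.sortedBy() is used by all of the write* functions but not defined in
# this file. A minimal sketch of the assumed behaviour: yield (key, value)
# pairs ordered by decreasing u'freq', so that unused entries with freq == 0
# come last and end up under "Preserved mappings". The implementation below is
# a guess; only the call signature is taken from the code above.
def sortedBy_sketch(dDict):
    """Sketch: iterate over a dict sorted by decreasing u'freq' value."""
    return sorted(dDict.iteritems(),
                  key=lambda kv: kv[1][u'freq'],
                  reverse=True)
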
def writeMaterials(filename, dDict):
    '''
    output materials in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Technique/material|technique=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          + u'|name = %s\n' \
          + u'|frequency = %d\n' \
          + u'|technique = %s\n' \
          + u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'objMultiple'] \
            + u'commonsconnection is the relevant parameter for ' \
            + u'{{tl|technique}}. Don\'t forget to add a translation in ' \
            + u'Swedish at [[Template:Technique/sv]]\n\n' \
            + u'Set commonsconnection of irrelevant technique/material ' \
            + u'to "-".\n\n' \
            + u'===technique/material|frequency|commonsconnection===\n'

    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

def writeKeywords(filename, dDict):
    '''
    output keywords in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          u'|name = %s\n' \
          u'|more = %s\n' \
          u'|frequency = %d\n' \
          u'|category = %s\n' \
          u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'stichwort'] \
            + u'Set commonsconnection of irrelevant keywords to "-"\n\n' \
            + u'Multiple categories are separated by "/"\n' \
            + u'===Keyword|frequency|description|commonsconnection===\n'

    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, '/'.join(val[u'descr']), val[u'freq'],
                       '/'.join(val[u'cat'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

def makeKeywords(A):
    '''
    Populate mapping-tables for Keywords
    Analysis of stichwords
    '''
    # Create a dict of depicted StichId with frequency as value
    # Working from the trimmed file means each phoId has already been
    # verified to exist.
    keywords = {}
    phoIds = []  # to make sure all phoIds really are present
    for k, v in A.stichD.iteritems():
        descr = v[u'StiSynonymS']
        key = v[u'StiBezeichnungS'].lower()
        if descr == u'':
            descr = u'-'
        if key not in keywords.keys():
            keywords[key] = {u'descr': [], u'freq': 0}
        if descr not in keywords[key][u'descr']:
            keywords[key][u'descr'].append(descr)
        keywords[key][u'freq'] += 1
        # for debugging
        if v[u'PhoId'] not in phoIds:
            phoIds.append(v[u'PhoId'])

    # debug
    for k in A.photoD.keys():
        k = k.split(':')[0]
        if k in phoIds:
            phoIds.remove(k)
    if phoIds:
        output(u'Stichwort_trim still contains unused phoIds')

    return keywords

def negatives(path, ext=u'.tif'):
    """
    Identify and invert all files at the given location.

    * moves file to filename-NEGATIVE_PATTERN.ext
    * creates an inverted file at filename.ext
    * creates an info file for the negative and modifies the info file for
      the positive

    :param path: relative path to the directory in which to process the files
    :param ext: image file extension (only .tif are ever negatives?)
    """
    negative_appendix = NEGATIVE_PATTERN % ext
    count = 0
    skipcount = 0
    for filename in os.listdir(path):
        if filename.endswith(ext) and \
                not filename.endswith(negative_appendix):
            negative = u'%s%s' % (filename[:-len(ext)], negative_appendix)
            if os.path.isfile(os.path.join(path, negative)):
                output(u'%s was already inverted, skipping...' % filename)
                skipcount += 1
                continue
            invert_file_and_info(path, filename, negative, ext)
            count += 1
            if count % 10 == 0:
                output(u'%d files inverted (%d)' % (count, count + skipcount))

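# NEGATIVE_PATTERN is defined elsewhere in the module; judging from the use
# above it is a format string taking the extension, e.g. u'-negative%s'
# (an assumption). Under that assumption the two filenames relate like this:
#
#   ext      = u'.tif'
#   filename = u'portrait_01.tif'            # inverted result (the positive)
#   negative = u'portrait_01' + (u'-negative%s' % ext)
#            = u'portrait_01-negative.tif'   # the original negative, moved aside
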
def get(self):
    last_metric_at = PostInstallActivityMetric.gql(
        "ORDER BY updated_at DESC").get()
    if last_metric_at is None:
        last_metric_at = "[Never]"
    else:
        last_metric_at = last_metric_at.updated_at

    h.output(self, '<html><head><link href="/public/css/admin/admin.css" type="text/css" rel="stylesheet" /></head><body>'
             '<div id="loading-msg">Loading...</div>'
             '<div id="auto-fetch-post-install-activity-metrics"></div>'
             '<div>Post Install Activity Re-Calculation Status: <span id="status">Last Run at: ' + str(last_metric_at) + '</span> '
             '<a id="calculate-post-install-activity-metrics-button" href="#">RE-CALCULATE NOW!</a></div>'
             '<div>Total Users: <span id="total-users">...</span></div>'
             '<div>Histogram for Post Install Activity: <div id="histogram-text">...</div></div>'
             '<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script>'
             '<script src="/public/js/admin/install-metrics.js" type="text/javascript"></script>'
             '<script src="/public/js/admin/post-install-activity-metrics.js" type="text/javascript"></script>'
             '</body></html>')

def get(self):
    span_in_days = self.request.get('span_in_days')
    last_metric_at = KValueMetric.gql(
        "WHERE span_in_days = :1 ORDER BY updated_at DESC",
        span_in_days).get()
    if last_metric_at is None:
        last_metric_at = "[Never]"
    else:
        last_metric_at = last_metric_at.updated_at

    h.output(self, '<html><head><link href="/public/css/admin/admin.css" type="text/css" rel="stylesheet" /></head><body>'
             '<div id="loading-msg">Loading...</div>'
             '<div id="auto-fetch-k-value-metrics"></div>'
             '<div>K-Value Re-Calculation Status: <span id="status">Last Run at: ' + str(last_metric_at) + '</span> '
             '<a id="calculate-k-value-metrics-button" href="#">RE-CALCULATE NOW!</a></div>'
             '<div>Chart for K-Value: <div id="graph">...</div></div>'
             '<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script>'
             '<script type="text/javascript" src="http://www.google.com/jsapi"></script>'
             '<script src="/public/js/admin/install-metrics.js" type="text/javascript"></script>'
             '<script src="/public/js/admin/k-value-metrics.js" type="text/javascript"></script>'
             '</body></html>')

def main():
    B, L, D, scores, libraries = helpers.read('./data/d_tough_choices.txt')
    ansLibs = []
    # libraries = [[id, [book1,book2,...]] , [id, [book1,book2,...]]]
    for i in range(0, (L - 1) / 2):
        lib = libraries[2 * i]
        ansLibs.append([lib[0], lib[4]])
    helpers.output(ansLibs, "outputs/D_v1.txt")

def main():
    input_fs = [
        "input/a_example.in",
        "input/b_small.in",
        "input/c_medium.in",
        "input/d_big.in"
    ]
    for input_f in input_fs:
        output_f = input_f.replace(".in", ".out").replace("input", "output")
        inp = parse(input_f)
        rows, cols, min_ing, max_area, pizza = inp
        slices = cut(inp)
        # print(slices)
        output(output_f, slices)
        print('.', end='')

def main():
    input_fs = [
        "./input/0_submission_example.in",
        "./input/1_me_at_the_zoo.in",
        "./input/2_videos_worth_spreading.in",
        "./input/3_trending_today.in",
        "./input/4_kittens.in",
    ]
    for input_f in tqdm.tqdm(input_fs):
        output_f = input_f.replace(".in", ".out").replace("input", "output")
        inp = parse(input_f)
        cs_by_vid = solve(inp)
        output(output_f, cs_by_vid)

def moveHits(path, filenamesFile=None):
    """
    Goes through the root export directory to find any matching file and
    moves these to a lower case version of the directory. This flattens out
    the directory structure whilst making it easy to identify any non-matched
    files.

    :param path: path to directory with image file structures
    :param filenamesFile: filenames data file
    :return: None
    """
    # set defaults unless overridden
    filenamesFile = filenamesFile or FILENAMES

    # Find and move all relevant files
    tree, name_to_pho = makeHitlist(filenamesFile)
    subdirs = []
    for filename in os.listdir(path):
        # for LSH all files are in upper case directories
        filename_path = os.path.join(path, filename)
        if os.path.isdir(filename_path) and filename.isupper():
            subdirs.append(filename_path)
    for subdir in subdirs:
        # make a subdir path where (only the) last directory is lower case
        tmp_path, tmp_dir = os.path.split(subdir)
        lower_subdir = os.path.join(tmp_path, tmp_dir.lower())
        counter, file_num = moveFiles(lower_subdir, tree, name_to_pho,
                                      path=subdir)
        output(u'%s: %d out of %d were hits' % (subdir, counter, file_num))

    # load filenames file
    filenames_header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
    old_filenames = helpers.csvFileToDict(filenamesFile, 'PhoId',
                                          filenames_header)

    # Add found extensions to filenames file
    for phoId, v in old_filenames.iteritems():
        old_filename = v['MulDateiS']
        if old_filename in name_to_pho.keys():
            v['ext'] = name_to_pho[old_filename]['ext']  # overwrite extension

    # output updated file
    helpers.dictToCsvFile(filenamesFile, old_filenames, filenames_header)

    # delete all emptied directories
    for subdir in subdirs:
        removeEmptyDirectories(subdir, top=False)

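# makeHitlist() and moveFiles() are defined elsewhere. From the calls above,
# makeHitlist() appears to return (tree, name_to_pho) where name_to_pho maps a
# bare filename (MulDateiS) to a dict that at least carries the found file
# extension. The sketch below is an assumption meant only to document that
# shape; every field other than 'ext' is hypothetical.
def makeHitlist_sketch(filenamesFile):
    """Sketch: build lookup structures from the filenames csv."""
    header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
    data = helpers.csvFileToDict(filenamesFile, 'PhoId', header)
    tree = {}         # e.g. directory -> list of expected filenames
    name_to_pho = {}  # bare filename -> {'phoId': ..., 'ext': ...}
    for phoId, v in data.iteritems():
        tree.setdefault(v['MulPfadS'], []).append(v['MulDateiS'])
        name_to_pho[v['MulDateiS']] = {'phoId': phoId, 'ext': v['ext']}
    return tree, name_to_pho
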
def writePlaces(filename, exhibitPlaces, landDict, ortDict, emptyPlaces):
    '''
    output Places in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Place|' \
             u'other=Commons connection}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          u'|name = %s\n' \
          u'|frequency = %d\n' \
          u'|other = %s\n' \
          u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s - col: ausOrt-->\n' % CSV_FILES[u'ausstellung'] \
            + u'<!--From: %s for OmuTypS = Tillverkningsland -->\n' % CSV_FILES[u'objMultiple'] \
            + u'<!--From: %s for OmuTypS = Tillverkningsort-->\n' % CSV_FILES[u'objMultiple'] \
            + u'The preferred order of making connections is: Institution, page, category ' \
            + u'(where the category is prefixed by a ":").\n\n' \
            + u'Set commonsconnection of irrelevant places to "-"\n\n' \
            + u'===Place|Frequency|Commonsconnection===\n'

    # output
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(u'\n====exhibit places====\n')
    f.write(header)
    for key, val in helpers.sortedBy(exhibitPlaces):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====origin-Countries====\n')
    f.write(header)
    for key, val in helpers.sortedBy(landDict):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====origin-cities====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ortDict):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====Preserved mappings====\n')
    f.write(header)
    for key, val in helpers.sortedBy(emptyPlaces):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

def writeObjKeywords(filename, ord1Dict, ord2Dict, gruppDict, emptyObjCats):
    '''
    output ObjKeywords in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          u'|name = %s\n' \
          u'|frequency = %d\n' \
          u'|category = %s\n' \
          u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'objDaten'] \
            + u'These are the keywords used to describe the objects ' \
            + u'themselves. Classification is used for all items whereas ' \
            + u'group is only used at HWY.\n\n' \
            + u'When possible ord1 will be used instead of the more ' \
            + u'generic ord2.\n\n' \
            + u'Multiple categories are separated by a "/"\n' \
            + u'===Keyword|frequency|commonscategory===\n'

    # output
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(u'\n====class: ord1====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ord1Dict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====class: ord2====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ord2Dict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====class: HWY-grupp====\n')
    f.write(header)
    for key, val in helpers.sortedBy(gruppDict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====Preserved mappings====\n')
    f.write(header)
    for key, val in helpers.sortedBy(emptyObjCats):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

def commonsOutput(descriptions, mappingFile, allEntries=None):
    """
    Given file descriptions, output them in a Commons export format.

    :param descriptions: dict of descriptions with phoId as key
    :param mappingFile: the target file for output
    :param allEntries: optional, a list of phoIds giving the order in which to
                       output the entries. This allows for easier diff
                       comparison.
    :return: None
    """
    # setup
    fOut = codecs.open(mappingFile, 'w', 'utf-8')
    chunkSize = 250
    chunkStart = u"====%d-%d====\n" \
                 u"{| class=\"wikitable sortable\"\n|-\n! PhoId !! generated " \
                 u"<descr> !! improved <descr>\n"
    rowFormat = u"|-\n| %s || %s || \n"

    # write intro
    fOut.write(
        u'Final filename becomes: <descr> - <museum> - <photoId>.<ext>\n\n'
        u'Attempts have been made to keep descriptions under %d characters '
        u'with a hard limit at %d characters\n\n'
        u'You are free to improve the descriptions by adding an alternative '
        u'in the last column.\n'
        u'===phoId | description | new description===\n\n'
        u'%s' % (GOODLENGTH, MAXLENGTH, chunkStart % (0, chunkSize)))

    if allEntries is None:
        allEntries = descriptions.keys()

    counter = 0
    for phoId in allEntries:
        # Add regular breaks
        counter += 1
        if counter % chunkSize == 0:
            fOut.write(u'|}\n\n' + chunkStart % (counter, counter + chunkSize))

        # write row
        descr = descriptions[phoId]['descr']
        fOut.write(rowFormat % (phoId, insufficient(descr)))

    # write outro
    fOut.write(u'|}')
    fOut.write(u'\n\n[[%s]]' % LIST_CAT)
    fOut.close()
    output(u'Created %s' % mappingFile)

def combineEvents(oldCatDict, oldLinkDict, newDict):
    '''
    Enrich mapping by previously done mapping

    newDict has freq and link parameters
    oldDict is split into
        oldCatDict: a list of categories
        oldLinkDict: a link written ":sv:A link"
    Note that a link can exist both in new and old but new uses "sv:A_link"
    '''
    for k, v in newDict.iteritems():
        newDict[k][u'cat'] = u''
        newDict[k][u'link'] = newDict[k][u'link'].strip(u'[]')
        if k in oldCatDict.keys():  # assume key list is same in both
            if oldCatDict[k] is not None:
                newDict[k][u'cat'] = oldCatDict[k]
            if oldLinkDict[k] is not None:
                oldlink = oldLinkDict[k]
                newlink = newDict[k][u'link'].replace('_', ' ')
                if oldlink != newlink:  # check if the same, otherwise use old
                    if newlink:
                        output(u'Ereignis: replaced %s by %s'
                               % (newlink, oldlink))
                    newlink = oldlink
                newDict[k][u'link'] = newlink  # reformatted and possibly replaced
            del oldCatDict[k]  # no need to delete oldLinkDict if we iterate over cat

    # add any previous mapping
    for k, v in oldCatDict.iteritems():
        cat = v
        link = oldLinkDict[k]
        if (cat is not None) or (link is not None):
            if cat is None:
                cat = u''
            if link is None:
                link = u''
            newDict[k] = {u'freq': 0, u'cat': cat, u'link': link}

    return newDict

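# Toy example (hypothetical data, wrapped in an uncalled demo function) of how
# combineEvents() merges a fresh harvest with a previously curated mapping:
# the old category is kept, an old link wins over a differing new one, and
# old-only entries are re-added with freq 0 so they end up under
# "Preserved mappings".
def _demo_combineEvents():
    new = {u'kröning 1800': {u'freq': 3, u'link': u'[[sv:Kröningen_1800]]'}}
    old_cats = {u'kröning 1800': u'Coronation of X of Sweden',
                u'bröllop 1797': u'Wedding of X and Y of Sweden'}
    old_links = {u'kröning 1800': u':sv:Kröningen 1800',
                 u'bröllop 1797': None}
    combined = combineEvents(old_cats, old_links, new)
    # combined[u'kröning 1800'] == {u'freq': 3,
    #                               u'cat': u'Coronation of X of Sweden',
    #                               u'link': u':sv:Kröningen 1800'}
    # combined[u'bröllop 1797'] == {u'freq': 0,
    #                               u'cat': u'Wedding of X and Y of Sweden',
    #                               u'link': u''}
    return combined
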
def main():
    B, L, D, scores, libraries = h.read(
        "../data/e_so_many_books.txt"
    )  # libraries is [id,NBooks,TDays,MShipsperday,[books]]
    # TODO Call get_points
    book_scores = get_book_point_lib(libraries, scores)
    # list.sort(libraries, key=lambda library: get_points(library, book_scores), reverse=True)

    tot_points = 0
    # sort books by value and add total points to calculate average
    for lib in libraries:
        list.sort(lib[4], key=lambda book: book_scores[book], reverse=True)
        tot_points += get_points(lib, book_scores)
    average_points = tot_points / L
    list.sort(
        libraries,
        key=lambda library: get_points2(library, book_scores, average_points),
        reverse=True)

    ansLibs = []
    day = 0
    new_libraries = []
    for lib in libraries:
        day_local = day + lib[2]  # Add time to set up
        books_to_scan = []
        while day_local < D:
            # sort remaining books by value
            list.sort(lib[4], key=lambda book: book_scores[book],
                      reverse=True)
            # books_to_scan.append(lib[4][0:lib[2]])
            for i in range(lib[2]):
                if i < len(lib[4]):
                    books_to_scan.append(lib[4][i])
                    book_scores[lib[4][i]] = 0
            day_local += lib[2]  # iterate over days
        new_libraries.append([lib[0], books_to_scan])

    # print("Days total are: " + str(D))
    for i in range(int((L - 1) / 2)):
        lib = new_libraries[2 * i]
        ansLibs.append([lib[0], lib[1]])
    h.output(ansLibs, "../outputs/E_v1.txt")

def writeEvents(filename, dDict):
    '''
    output events in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Event|' \
             u'link=Wikipedia-link|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          u'|name = %s\n' \
          u'|frequency = %d\n' \
          u'|link = %s\n' \
          u'|category = %s\n' \
          u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->' % CSV_FILES[u'ereignis'] \
            + u'\'\'wikipedia-link\'\' are used for descriptive texts ' \
            + u'whereas \'\'commonsconnection\'\' is a relevant category ' \
            + u'on commons.\n\n' \
            + u'Set commonsconnection of irrelevant events to "-"\n\n' \
            + u'Multiple categories are separated by "/"\n\n' \
            + u'*död/begravning: [[:Category:Funeral of X of Sweden]]\n' \
            + u'*kröning: [[:Category:Coronation of X of Sweden]]\n' \
            + u'*bröllop: [[:Category:Wedding of X and Y of Sweden]]\n' \
            + u'===Event|Frequency|wikipedia-link|Commonsconnection===\n'

    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], val[u'link'],
                       '/'.join(val[u'cat'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

def writePeople(filename, dDict):
    '''
    output People in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Name <small>(kueId)</small>' \
             u'|link=Wikipedia-link|creator=|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
          u'|name = %s\n' \
          u'|more = %s\n' \
          u'|frequency = %d\n' \
          u'|link = %s\n' \
          u'|creator = %s\n' \
          u'|category = %s\n' \
          u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'kuenstler'] \
            + u'\'\'wikipedia-link\'\' is used for descriptive texts whereas ' \
            + u'creator is a creator template on commons and ' \
            + u'\'\'commoncat\'\' is a relevant category on commons.\n\n' \
            + u'Set commonsconnection of irrelevant people to "-". ' \
            + u'Note that creator is only relevant for artists.\n\n' \
            + u'===kueId|frequency|name|wikipedia-link|creator|commoncat===\n'

    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (val[u'descr'], key, val[u'freq'], val[u'link'],
                       val[u'creator'], val[u'cat']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)

def makePhotoAll(photoAllFile, photo_multi, logFile):
    """
    Given the photoAll data file, read it and drop any entries without a
    commons connection. Also simplify the data.

    @toDO: if dupes are found then prompt manual cleanup and re-run
           makePhotoAll(); that way the crash isn't complete.

    :param photoAllFile: path to photoAll data file
    :param photo_multi: photo_multi dict
    :param logFile: path to logfile
    :return: dict
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll "
                         u"analysis log have been corrected and the updated "
                         u"photoAll file saved...\n"
                         u"...by pressing enter when done")

    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Loading photoAll...")
    photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                     'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'),
                                     photoAllHeader)
    originalSize = len(photoAll)

    for k, v in photoAll.items():
        link = v['PhoSystematikS']

        # drop any entries without files
        if not link:
            del photoAll[k]
            continue

        # simplify link
        if '%' in link:
            link = helpers.urldecode_utf8(link)
        link = helpers.external_2_internal_link(link, project='wikimedia')
        link = link[len('[[:commons:File:'):-len(']]')]
        v['PhoSystematikS'] = link
    output('PhotoAll reduced from %d to %d entries'
           % (originalSize, len(photoAll)))

    # check that none of PhoId from photo_multi occur in photo
    dupes = []
    for phoId in photo_multi.keys():
        phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId'])
        if phoMul in photoAll.keys():
            dupes.append(phoMul)
    if dupes:
        output(u'Found duplicates between photoAll and photo_multi. '
               u'This will most likely mess things up. Check the log at '
               u'%s for details.' % logFile)
        flog.write(u'* duplicates found in photo and photo_all\n'
                   u'phoId:MulId|commonsFile\n')
        for d in dupes:
            flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS']))

    flog.close()
    return photoAll

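# helpers.urldecode_utf8() and helpers.external_2_internal_link() are project
# helpers not shown here. Judging from the slicing on
# link[len('[[:commons:File:'):-len(']]')], the simplification is assumed to
# run roughly like this (hypothetical example URL):
#
#   'http://commons.wikimedia.org/wiki/File:Example%20image.jpg'
#     --urldecode_utf8-->            '.../wiki/File:Example image.jpg'
#     --external_2_internal_link-->  '[[:commons:File:Example image.jpg]]'
#     --slicing-->                   'Example image.jpg'
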
def main():
    """
    Map where each defined function (& method) is called or referenced.
    """
    res = {}
    defs = all_defs()
    n_defs = len(defs)
    n_calls = 0
    n_refs = 0
    called_or_refed = set()

    helpers.output('\n'.join(defs), DEFS)
    print("Listing of functions (& methods) defined sent to '{}'".format(DEFS))

    for mod_name in MODULE_NAMES:
        with open("{}.py".format(mod_name), 'r') as module:
            line_n = 0
            for line in module:
                line_n += 1
                for def_ in defs:
                    # searching for function def_
                    mod, func_w_n = def_.split('.')
                    func, n = func_w_n.split('@')
                    refed = []
                    if call_found(func, line, refed):
                        n_calls += 1
                        called_or_refed.add(def_)
                        _ = res.setdefault(def_, {})
                        _ = res[def_].setdefault("calls", [])
                        res[def_]["calls"].append("{} @ {}".format(
                            mod_name, line_n))
                    if refed:
                        n_refs += 1
                        called_or_refed.add(def_)
                        _ = res.setdefault(def_, {})
                        _ = res[def_].setdefault("references", [])
                        res[def_]["references"].append("{} @ {}".format(
                            mod_name, line_n))

    n_no_refs = len(defs) - len(called_or_refed)
    no_refs = set(defs) - called_or_refed

    helpers.output(pformat(res), CALLS)
    print("Listing of function (& method) calls sent to '{}'".format(CALLS))
    helpers.output(pprint.pformat(res, compact=True, width=70), CALLSf)
    print("Formatted listing of function (& method) calls sent to '{}'".format(
        CALLSf))
    helpers.output('\n'.join(sorted(no_refs)), UNUSED)
    print(
        "Listing of unused functions (& methods) sent to '{}'".format(UNUSED))
    print("Found {} defs, {} calls, {} refs & {} without either".format(
        n_defs, n_calls, n_refs, n_no_refs))

def trimObjDaten(objDaten, photo_multi, photoAll):
    """
    Removes any unused objects in objDaten, because it is huge!

    :param objDaten: objDaten dict
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :return: None
    """
    output(u"\tTrimming objDaten...")
    originalSize = len(objDaten)

    # collect all objIds not mentioned in photo_multi or photoAll
    unusedObjIds = set(objDaten.keys())
    for k, v in photo_multi.iteritems():
        unusedObjIds = unusedObjIds - set(v['PhoObjId'])
    for k, v in photoAll.iteritems():
        unusedObjIds = unusedObjIds - set(v['PhoObjId'])

    # remove any which should be trimmed
    for objId in unusedObjIds:
        del objDaten[objId]

    output('\tobjDaten reduced from: %d to %d'
           % (originalSize, len(objDaten)))

def get(self):
    h.output(self, """
        Admin:
        <a href='/admin/pageviews'>Page Views</a> |
        <a href='/admin/users'>Users</a> |
        <a href='/admin/usergraphs'>User Graphs</a> |
        <a href='/admin/querys'>Searches</a> |
        <a href='/admin/resultviews'>Result Views</a>
        <br/>
        Calculated Metrics:
        <a href='/admin/installmetrics'>User Install Metrics</a> |
        <a href='/admin/installmetrics/summary'>Summary Install Metrics</a>
        <br/>
        Calculated Metrics:
        <a href='/admin/organicsearchmetrics'>Organic Search Metrics</a> |
        <a href='/admin/organicsearchmetrics/summary'>Summary Organic Search Metrics</a>
        <br/>
        Calculated Metrics:
        <a href='/admin/postinstallactivitymetrics'>Post-Install Activity Metrics</a> |
        <a href='/admin/postinstallactivitymetrics/summary'>Summary Post-Install Activity Metrics</a>
        <br/>
        Calculated Metrics:
        <a href='/admin/kvaluemetrics'>7-Day K Value Metrics</a> |
        <a href='/admin/kvaluemetrics/summary'>Summary 7-Day K Value Metrics</a>
        <br/>
        Beta:
        <a href='/admin/paths'>Navigation Paths</a> |
        <a href='/admin/url-analyzer'>URL Analyzer</a> |
        <a href='/admin/pageviews/normalizer'>Page View URL Normalizer</a>""")

def make_neg_and_pos_info(info_file, filename, ext):
    """
    Generate a negative and positive version of the given info file.

    The two refer to each other using the negative/positive parameters.
    The negative file gets categories removed.

    :param info_file: the contents of the info file
    :param filename: the (positive) image filename
    :param ext: the file extension
    """
    negative_appendix = NEGATIVE_PATTERN % ext
    ov_position = info_file.find(u'|other_versions=')

    # for negative we need to identify end position of the template
    end_position = -1
    end_patterns = [u'</gallery>\n}}', u'|other_versions= \n}}']
    for end_pattern in end_patterns:
        end_position = info_file.find(end_pattern)
        if end_position > 0:
            end_position += len(end_pattern)
            break
    if not end_position > 0:
        # if all else fails just keep it all
        output('%s: could not find end of template' % filename)
        end_position = len(info_file)

    # make new infos
    pos_info = u'%s|negative= %s\n%s' % (
        info_file[:ov_position],
        u'%s%s' % (filename[:-len(ext)], negative_appendix),
        info_file[ov_position:])
    neg_info = u'%s|positive= %s\n%s' % (
        info_file[:ov_position],
        filename,
        info_file[ov_position:end_position])

    return (neg_info, pos_info)

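# Worked example (hypothetical info page text, wrapped in an uncalled demo
# function) of what make_neg_and_pos_info() produces, assuming the
# NEGATIVE_PATTERN guessed at earlier: the positive gains a |negative=
# parameter, the negative gains a |positive= parameter and is cut off at the
# end of the {{Information}} template so trailing categories are dropped.
def _demo_make_neg_and_pos_info():
    info = (u'{{Information\n'
            u'|description= A portrait\n'
            u'|other_versions= \n'
            u'}}\n'
            u'[[Category:Some category]]\n')
    neg_info, pos_info = make_neg_and_pos_info(info, u'portrait_01.tif',
                                               u'.tif')
    # pos_info gets u'|negative= portrait_01-negative.tif\n' inserted before
    # |other_versions= and keeps the category line.
    # neg_info gets u'|positive= portrait_01.tif\n' inserted before
    # |other_versions= and ends at the closing u'}}', dropping the category.
    return neg_info, pos_info
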
def removeEmptyDirectories(path, top=True):
    """
    Remove any empty directories and subdirectories.

    :param path: path to directory to start deleting from
    :param top: set to True to not delete the current directory
    :return: None
    """
    if not os.path.isdir(path):
        return

    # remove empty sub-directories
    files = os.listdir(path)
    for f in files:
        fullpath = os.path.join(path, f)
        if os.path.isdir(fullpath):
            removeEmptyDirectories(fullpath, top=False)

    # re-read and delete directory if empty
    files = os.listdir(path)
    if not top:
        if not files:
            os.rmdir(path)
        else:
            output('Not removing non-empty directory: %s' % path)

def stichworth_photo(stichwortFile, photo_multi):
    """
    Given the photo-multi data and the stichwort data file add a stichwort id
    field to photo-multi.

    Also returns the stichwort data after trimming away any unused info.

    :param stichwortFile: path to stichwort data file
    :param photo_multi: photo_multi dict
    :return: dict (and updates photo_multi)
    """
    # setup
    output(u"Adding stichworth to photo")

    # handle stichwort
    output(u'\treading in stichwort...')
    stichwortHeader = 'PstId|PhoId|StiBezeichnungS|StiSynonymS'
    stichwort = helpers.csvFileToDict(stichwortFile, 'PstId', stichwortHeader)
    originalSize = len(stichwort)

    # match each phoId to several stichId
    # removing any entries with invalid phoIds
    photoStichConnection = {}
    for k, v in stichwort.items():
        phoId = v['PhoId']
        pstId = v['PstId']
        if phoId in photo_multi.keys():
            if phoId not in photoStichConnection.keys():
                photoStichConnection[phoId] = set([])
            photoStichConnection[phoId].add(pstId)
        else:
            del stichwort[k]
    output('\tstichwort trimmed from %d to %d, found %d phoId'
           % (originalSize, len(stichwort), len(photoStichConnection)))

    # add stichId to photo_multi
    for k, v in photo_multi.iteritems():
        phoId = v['PhoId']
        v['PstId'] = []
        if phoId in photoStichConnection.keys():
            v['PstId'] = list(photoStichConnection.pop(phoId))

    # confirm and return
    output(u"...done")
    return stichwort

def photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile, objDatenFile,
                   logFile):
    """
    Given the photo_multi data and the phoObjDaten + objDaten data files any
    additional relevant ObjIds are added to the PhoObjId field of the
    photo_multi dict; this field is also converted to a list.

    Also returns objDaten for later use.

    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :param photoObjDatenFile: path to phoObjDaten data file
    :param objDatenFile: path to objDaten data file
    :param logFile: path to logfile
    :return: dict (and updates photo_multi)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Combining all ObjId into the photo file...")

    # handle objDaten
    output(u'\treading in objDaten.. (takes a while)')
    objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \
                     'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \
                     'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \
                     'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|' \
                     'ObjFeld06M|ObjReserve01M'
    objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader)

    # match each objInvNr to several objId
    objInvNr2ObjId = {}  # old oDict
    output(u'\tfinding objInvNr connections...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        objInvNr = v['ObjInventarNrS']
        if not objInvNr:
            continue
        if objInvNr not in objInvNr2ObjId.keys():
            objInvNr2ObjId[objInvNr] = []
        objInvNr2ObjId[objInvNr].append(objId)
    output('\tFound %d objInvNr connections in %d objects'
           % (len(objInvNr2ObjId), len(objDaten)))

    # handle photoObjDaten
    photoObjDatenHeader = 'PhmId|AufId|AufAufgabeS|MulId|PhoId|ObjInvNrS'
    photoObjDaten = helpers.csvFileToDict(photoObjDatenFile, 'PhmId',
                                          photoObjDatenHeader,
                                          keep=('PhoId', 'ObjInvNrS'))

    # match each phoId to several objId via the ObjInvNr
    output(u'\tfinding photo-object connections...')
    photoObjConnections = {}
    skipped = []  # ObjInvNr not in ObjDaten
    for k, v in photoObjDaten.iteritems():
        objInvNr = v['ObjInvNrS']
        phoId = v['PhoId']
        if not objInvNr:
            continue
        if objInvNr not in objInvNr2ObjId.keys():
            skipped.append(objInvNr)
            continue
        if phoId not in photoObjConnections.keys():
            photoObjConnections[phoId] = []
        photoObjConnections[phoId] += objInvNr2ObjId[objInvNr]
    output('\tFound %d connected photos in %d photoObjDaten entries'
           % (len(photoObjConnections), len(photoObjDaten)))

    # add to photo_multi and photoAll
    photoDicts = (photo_multi, photoAll)
    allBadObjId = []
    for pDict in photoDicts:
        for k, v in pDict.iteritems():
            phoId = v['PhoId']
            objIds = []
            if phoId not in photoObjConnections.keys():
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])
            else:
                # combine relevant objIds
                objIds = photoObjConnections.pop(phoId)  # new connections
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])  # old connection
                objIds = list(set(objIds))  # remove dupes

            # check that all of these actually exist (old realObjOnly())
            # and remove otherwise
            badObjId = []
            for objId in objIds:
                if objId not in objDaten.keys():
                    badObjId.append(objId)
            if badObjId:
                allBadObjId += badObjId
                for badId in badObjId:
                    objIds.remove(badId)

            # set new value
            v['PhoObjId'] = objIds

    # log any skipped ObjInvNr
    if skipped:
        skipped = list(set(skipped))  # remove dupes
        output(u"\tthere were %d skipped ObjInvNr, see log (%s)"
               % (len(skipped), logFile))
        flog.write(u'*Unknown objInvs, i.e. ObjInvNrS in photoObjDaten '
                   u'without a match in ObjDaten\n')
        flog.write(u'%s\n' % ', '.join(skipped))

    # log any bad objId
    if allBadObjId:
        output('\tI found some bad objIds. Check the %s' % logFile)
        allBadObjId = list(set(allBadObjId))  # remove dupes
        flog.write(u'* objIds in photo but not in objDaten\n')
        flog.write(u'%s\n' % ', '.join(allBadObjId))

    # trim objDaten
    trimObjDaten(objDaten, photo_multi, photoAll)

    # confirm and return
    output(u"...done")
    flog.close()
    return objDaten

def kuenstler_objDaten(kuenstlerFile, objDaten, logFile):
    """
    Given the kuenstler data file and the objDaten data add a kuenstler id
    field to objDaten.

    Also returns the kuenstler data after
    * removing certain irrelevant roles and dummy entries
    * combining all objIds for the same kueId
    * standardising years
    * dropping a lot of unneeded fields

    :param kuenstlerFile: path to kuenstler data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Crunching kuenstler...")
    dummyNames = (u'ingen uppgift', )
    badRoles = (u'Leverantör', u'Auktion', u'Förmedlare', u'Givare',
                u'Återförsäljare', u'Konservator')
    badRoleCmts = (u'Förpaktare, kontrollör', u'av kopia')
    droppedFields = ('OkuId', 'ObjAufId', 'AufAufgabeS', 'OkuArtS',
                     'OkuFunktionS', 'OkuValidierungS', 'KudArtS', 'MulId',
                     'PhoId')

    # handle kuenstler
    kuenstlerHeader = 'OkuId|ObjId|ObjAufId|AufAufgabeS|KueId|KueVorNameS|' \
                      'KueNameS|OkuArtS|OkuFunktionS|OkuValidierungS|KudArtS|' \
                      'KudDatierungS|KudJahrVonL|KudJahrBisL|KudOrtS|KudLandS|' \
                      'KueFunktionS|MulId|PhoId'
    kuenstler = helpers.csvFileToDict(kuenstlerFile, ('OkuId', 'MulId'),
                                      kuenstlerHeader)
    originalSize = len(kuenstler)

    # collect all kueId and drop any with invalid title or role
    # also invert to get per objId connections
    # @toDO: Is keeping objId in kuenstler really needed?
    #        Otherwise populate objIdConnection here
    foundKueId = {}
    objIdConnection = {}
    for k, v in kuenstler.items():  # allow removing entries from within loop
        kueId = v['KueId']
        objId = v['ObjId']
        fName = v['KueVorNameS']
        lName = v['KueNameS']
        role = v['OkuArtS']
        roleCmt = v['OkuFunktionS']

        # filter out any undesired entries
        if role in badRoles or \
                roleCmt in badRoleCmts or \
                len(fName) + len(lName) == 0 or \
                lName in dummyNames:
            del kuenstler[k]
            continue

        # send unique role/kueId combo for objid
        kueCombo = u'%s:%s:%s' % (role, roleCmt, kueId)
        if objId not in objIdConnection.keys():
            objIdConnection[objId] = set([])
        objIdConnection[objId].add(kueCombo)

        # keep only one entry per unique kueId
        if kueId not in foundKueId.keys():
            # keep this entry
            foundKueId[kueId] = k
            kuenstler[k]['ObjId'] = set([objId, ])
        else:
            # keep only objId part of this entry
            kuenstler[foundKueId[kueId]]['ObjId'].add(objId)
            del kuenstler[k]
    output('\tkueIds: reduced from %d to %d' % (originalSize, len(kuenstler)))

    # add to objDaten
    output('\tadding kueId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['role:roleCmt:kueId'] = []
        if objId in objIdConnection.keys():
            v['role:roleCmt:kueId'] = list(objIdConnection.pop(objId))

    # further cleanup of kuenstler
    # correcting ort/land entries
    # stripping years from name
    # dropping a bunch of fields
    output('\tfurther cleanup of kuenstler...')
    for k, v in kuenstler.iteritems():
        land = v['KudOrtS']  # misnamed in original database
        ort = v['KudLandS']  # misnamed in original database
        lName = v['KueNameS']
        bYear = v['KudJahrVonL']
        dYear = v['KudJahrBisL']
        objIds = v['ObjId']

        # correct misnaming in original database
        v['KudOrtS'] = ort
        v['KudLandS'] = land

        # convert set to list
        v['ObjId'] = list(objIds)

        # take year info out of name, and store in year
        lName, bYear, dYear, log = extractKuenstlerYear(lName, bYear, dYear)
        if log:
            flog.write(log)
        v['KueNameS'] = lName
        v['KudJahrVonL'] = bYear
        v['KudJahrBisL'] = dYear
        for field in droppedFields:
            del v[field]

    flog.close()
    output(u"...done")
    return kuenstler

def ereignis_objDaten(ereignisFile, objDaten, logFile):
    """
    Given the ereignis data file and the objDaten data add an ereignis id
    field to objDaten.

    Also returns the ereignis data after
    * combining all objIds for the same ergId
    * dropping EroId

    :param ereignisFile: path to ereignis data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Trimming ereignis and adding ereignis to ObjDaten...")

    # handle ereignis
    ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS'
    ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader)
    originalSize = len(ereignis)

    # collect all ergId and drop any with invalid title
    # @toDO: Is keeping objId in ereignis really needed?
    #        Otherwise populate objIdConnection here
    foundErgId = {}
    for k, v in ereignis.items():  # allow removing entries from within loop
        ergId = v['ErgId']
        objId = v['EroObjId']
        title = v['ErgKurztitelS']
        if not title:
            # remove empty
            del ereignis[k]
        elif ergId not in foundErgId.keys():
            # keep this entry
            foundErgId[ergId] = k
            ereignis[k]['EroObjId'] = set([objId, ])
            ereignis[k].pop('EroId')  # drop unnecessary id
        else:
            # keep only objId part of this entry
            ereignis[foundErgId[ergId]]['EroObjId'].add(objId)
            del ereignis[k]
    output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis)))

    # handle urls in ereignis and convert set to list
    for k, v in ereignis.iteritems():
        objIds = v['EroObjId']
        url = v['ErgArtS']

        # convert set to list
        v['EroObjId'] = list(objIds)

        # handle urls
        if u'%' in url:
            url = helpers.urldecode_utf8(url)
        # convert external links to internal
        if 'wikipedia' in url:
            url = helpers.external_2_internal_link(url)
        elif url:
            flog.write(u'weird url: %s\n' % url)
        v['ErgArtS'] = url

    # invert to get per objId connections
    objIdConnection = {}
    for k, v in ereignis.iteritems():
        ergId = v['ErgId']
        objIds = v['EroObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ergId)

    # add to objDaten
    output('\tadding ergId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ergId'] = []
        if objId in objIdConnection.keys():
            v['ergId'] = objIdConnection.pop(objId)

    flog.close()
    output(u"...done")
    return ereignis

def ausstellung_objDaten(austellungFile, objDaten):
    """
    Given the austellung data file and the objDaten data add an austellung id
    field to objDaten.

    Also returns the austellung data after
    * adding a std_year field
    * combining all objIds for the same ausId
    * dropping AobId

    :param austellungFile: path to austellung data file
    :param objDaten: objDaten dict
    :return: dict (and updates objDaten)
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any year formatting issues mentioned "
                         u"in the analysis log have been corrected and the "
                         u"updated ausstellung file saved...\n"
                         u"...by pressing enter when done")

    # setup
    dummyTitles = (
        u'reparation', u'utställning', u'lån för undersökning',
        u'OBS! Testpost för admin - utställning, export wikimedia commons',
        u'lån till Frankrike 1947', u'test karin 20100520',
        u'test 20100629 (en post skapad för administrativa tester)',
        u'tennföremål 8 st till Strömsholm', u'utlån f justering av urverk')
    output(u"Trimming ausstellung and adding ausstellung to ObjDaten...")

    # handle ausstellung
    austellungHeader = 'AobId|AusId|AusTitelS|AusOrtS|AusJahrS|AusDatumVonD|' \
                       'AusDatumBisD|AobObjId|AufAufgabeS'
    austellung = helpers.csvFileToDict(austellungFile, 'AobId',
                                       austellungHeader)
    originalSize = len(austellung)

    # collect all ausId and drop any with invalid title
    # @toDO: Is keeping objId in austellung really needed?
    #        Otherwise populate objIdConnection here
    foundAusId = {}
    for k, v in austellung.items():  # allow removing entries from within loop
        ausId = v['AusId']
        objId = v['AobObjId']
        title = v['AusTitelS']
        if not title or title in dummyTitles:
            # remove empty/dummy
            del austellung[k]
        elif ausId not in foundAusId:
            # keep this entry
            foundAusId[ausId] = k
            austellung[k]['AobObjId'] = set([objId, ])
            austellung[k].pop('AobId')  # drop unnecessary id
        else:
            # keep only objId part of this entry
            austellung[foundAusId[ausId]]['AobObjId'].add(objId)
            del austellung[k]
    output('\taustellung reduced from %d to %d entries'
           % (originalSize, len(austellung)))

    # populate std_year
    output('\tstandardising years...')
    for k, v in austellung.iteritems():
        year = v['AusJahrS']
        yfrom = v['AusDatumVonD'].replace(u' 00:00:00', u'').strip()
        ytil = v['AusDatumBisD'].replace(u' 00:00:00', u'').strip()
        v['std_year'] = stdAustellungYear(year, yfrom, ytil)
        # to match with pre-redux results. Could possibly be dropped instead?
        v['AusDatumVonD'] = yfrom
        v['AusDatumBisD'] = ytil

    # invert to get per objId connections
    # and convert set to list
    objIdConnection = {}
    for k, v in austellung.iteritems():
        ausId = v['AusId']
        objIds = v['AobObjId']
        v['AobObjId'] = list(objIds)
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ausId)

    output('\tadding ausId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ausId'] = []
        if objId in objIdConnection.keys():
            v['ausId'] = objIdConnection.pop(objId)

    output(u"...done")
    return austellung

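# stdAustellungYear() is defined elsewhere; from the call above it takes the
# free-text AusJahrS plus the stripped from/til dates and returns some
# standardised year value. The sketch below is only an assumption of a
# plausible behaviour (prefer the explicit dates, fall back to the free-text
# year, collapse identical endpoints); the real rules may well differ.
def stdAustellungYear_sketch(year, yfrom, ytil):
    """Sketch: derive 'YYYY' or 'YYYY-YYYY' from exhibition date fields."""
    start = yfrom[:4] if yfrom else None
    end = ytil[:4] if ytil else None
    if start and end:
        return start if start == end else u'%s-%s' % (start, end)
    return start or end or year.strip()
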
def makePhoto_multi(photoFile, multiFile, logFile, tmpFile):
    """
    Given the photo and multimedia data this combines the two into one dict.

    :param photoFile: path to photo data file
    :param multiFile: path to multimedia data file
    :param logFile: path to logfile
    :param tmpFile: path to temporary file
    :return: dict
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Combining photo and multimedia file for unique files...")
    pathToTrim = u'R:\web\hires\\'
    tmpHeader = 'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                'PhoSwdS|AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|' \
                'MulDateiS|MulExtentS'

    # handle multimedia
    multiHeader = 'MulId|MulPhoId|MulPfadS|MulDateiS|MulExtentS'
    multi = helpers.csvFileToDict(multiFile, 'MulId', multiHeader)

    # check that filename is unique
    flog.write('* Same files used by different PhoId, format is PhoId/MulId\n')
    logged = False
    namelist = []
    mulPhoIdList = []
    for k, v in multi.iteritems():
        name = u'%s\\%s.%s' % (v['MulPfadS'], v['MulDateiS'], v['MulExtentS'])
        if name in namelist:
            logged = True
            flog.write('%s/%s\n' % (v['MulPhoId'], v['MulId']))
        else:
            mulPhoIdList.append(v['MulPhoId'])
            namelist.append(name)
    output(u'\tmultimedia: %d' % len(multi))
    if not logged:
        flog.write(u'None =)\n')

    # handle photo
    # @toDO add duplicate check to cleanup script
    photoHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                  'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photo = helpers.csvFileToDict(photoFile, 'PhoId', photoHeader)
    output(u'\tphoto: %d' % len(photo))

    # combine
    combined = {}
    flog.write(u'* unused rows in multimedia\n')
    logged = False
    for k, v in multi.iteritems():
        phoId = v['MulPhoId']
        mulId = v['MulId']
        v['MulPfadS'] = v['MulPfadS'].replace(pathToTrim, u'')  # trim filepath
        v['MulExtentS'] = u''  # MulExtentS is always wrong
        if phoId not in photo.keys():
            logged = True
            flog.write(u'%s\n' % v)
        elif not photo[phoId]['MulId'] == v['MulId']:
            raise MyError("phoId matched but to wrong mulId: p:%s m_found:%s, "
                          "m_expected %s"
                          % (phoId, photo[phoId]['MulId'], mulId))
        else:
            del v['MulPhoId'], v['MulId']
            combo = photo.pop(phoId)  # move out of photo
            combo.update(v)  # add contents from multi
            combined[phoId] = combo
    if not logged:
        flog.write(u'None =)\n')

    # log any unused rows in photo
    flog.write(u'* unused rows in photo\n')
    logged = False
    for k, v in photo.iteritems():
        logged = True
        flog.write(u'%s\n' % v)
    if not logged:
        flog.write(u'None =)\n')
    flog.close()
    output(u"...done")

    # check if anything needs to be manually handled
    output(u"Read the log (%s)" % logFile)
    combined = helpers.promptManualUpdate(combined, tmpFile, tmpHeader,
                                          'PhoId')

    return combined

def run(in_path=None, out_path=None):
    """
    main process for crunching all of the files

    :param in_path: path to directory containing clean csv files
    :param out_path: path to directory in which to store output
    """
    # set defaults unless overridden
    in_path = in_path or CSV_DIR_CLEAN
    out_path = out_path or CSV_DIR_CRUNCH

    # convert to unicode if not the case
    if type(in_path) == str:
        in_path = unicode(in_path)
    if type(out_path) == str:
        out_path = unicode(out_path)

    # create target paths if they don't exist
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    log_path = os.path.join(out_path, u'logs')
    if not os.path.isdir(log_path):
        os.mkdir(log_path)

    # start crunching
    # combine photo and multi
    photoFile = os.path.join(in_path, u'photo.csv')
    multiFile = os.path.join(in_path, u'multimedia.csv')
    logFile = os.path.join(log_path, u'photo_multimedia.log')
    tmpFile = os.path.join(out_path, u'tmp.csv')
    photo_multi = makePhoto_multi(photoFile, multiFile, logFile, tmpFile)

    # load photoAll and drop any entries without a commons connection
    photoAllFile = os.path.join(in_path, u'photoAll.csv')
    logFile = os.path.join(log_path, u'photoAll.log')
    photoAll = makePhotoAll(photoAllFile, photo_multi, logFile)

    # combine photo and Photo-ObjDaten (and photoAll)
    # populates the objId field in photo_multi with ALL of the RELEVANT
    # ObjIds and removes unused Objects from objDaten to make it smaller
    photoObjDatenFile = os.path.join(in_path, u'photoObjDaten.csv')
    objDatenFile = os.path.join(in_path, u'objDaten.csv')
    logFile = os.path.join(log_path, u'photo_objDaten.log')
    objDaten = photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile,
                              objDatenFile, logFile)

    # Adds the stichwort id field to photo and
    # removes unused entries from stichworth to make it smaller
    stichFile = os.path.join(in_path, u'stichwort.csv')
    stichwort = stichworth_photo(stichFile, photo_multi)

    # Add two fields to photo_multi:
    # * same photoId-different file
    # * same objId-different photoID
    samesame(photo_multi, photoAll)

    # Adds the Ausstellung_id field to ObjDaten and
    # trims Ausstellung to unique ids
    ausstellungFile = os.path.join(in_path, u'ausstellung.csv')
    ausstellung = ausstellung_objDaten(ausstellungFile, objDaten)

    # Adds ObjDaten-samhörande field to ObjDaten
    objDatenSamFile = os.path.join(in_path, u'objDatenSam.csv')
    objDaten_sam(objDatenSamFile, objDaten)

    # Adds the Ereignis field to ObjDaten and
    # trims Ereignis to unique ids
    ereignisFile = os.path.join(in_path, u'ereignis.csv')
    logFile = os.path.join(log_path, u'ereignis.log')
    ereignis = ereignis_objDaten(ereignisFile, objDaten, logFile)

    # Adds the kuenstler field to ObjDaten and
    # trims kuenstler
    kuenstlerFile = os.path.join(in_path, u'kuenstler.csv')
    logFile = os.path.join(log_path, u'kuenstler.log')
    kuenstler = kuenstler_objDaten(kuenstlerFile, objDaten, logFile)

    # Adds objMul and objMass fields to ObjDaten
    # then trims objMul and objMass
    objMassFile = os.path.join(in_path, u'objMass.csv')
    objMultipleFile = os.path.join(in_path, u'objMultiple.csv')
    objMass, objMultiple = mulMass_add(objMassFile, objMultipleFile, objDaten)

    # output all the above
    # @toDO: simplify names (once downstream is checked)
    out_csv = {
        u'photo_multimedia_etc': photo_multi,
        u'stichwort_trim': stichwort,
        u'objMass_trim': objMass,
        u'objMultiple_trim': objMultiple,
        u'objDaten_etc': objDaten,
        u'ausstellung_trim': ausstellung,
        u'ereignis_trim': ereignis,
        u'kuenstler_trim': kuenstler,
        u'photoAll': photoAll
    }
    # @toDO: Not needed once downstream reads json
    out_headers = {
        u'photo_multimedia_etc':
            'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|'
            'AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|MulDateiS|'
            'MulExtentS|PstId|same_PhoId|same_object',
        u'stichwort_trim':
            'PstId|PhoId|StiBezeichnungS|StiSynonymS',
        u'objMass_trim':
            'ObmId|ObmObjId|ObmTypMasseS|ObmMasseS|ObjAufId|AufAufgabeS',
        u'objMultiple_trim':
            'OmuId|OmuObjId|OmuTypS|OmuBemerkungM|OmuInhalt01M|ObjInventarNrS|'
            'ObjAufId|AufAufgabeS',
        u'objDaten_etc':
            'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|'
            'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|'
            'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|'
            'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|ObjFeld06M|'
            'ObjReserve01M|ausId|related|ergId|role:roleCmt:kueId|mulId|'
            'massId',
        u'ausstellung_trim':
            'AusId|AusTitelS|AusOrtS|std_year|AusJahrS|AusDatumVonD|'
            'AusDatumBisD|AufAufgabeS|AobObjId',
        u'ereignis_trim':
            'ErgId|ErgKurztitelS|ErgArtS|EroObjId',
        u'kuenstler_trim':
            'KueId|KueVorNameS|KueNameS|KudDatierungS|KudJahrVonL|KudJahrBisL|'
            'KudOrtS|KudLandS|KueFunktionS|ObjId',
        u'photoAll':
            'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|'
            'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    }
    for k, v in out_csv.iteritems():
        outFile = os.path.join(out_path, u'%s.csv' % k)
        helpers.dictToCsvFile(outFile, v, out_headers[k])
        output(u'\tOutputted %s' % outFile)
    output(u'Done!')

def samesame(photo_multi, photoAll):
    """
    @toDo (after redux)
    * samePhoId no longer needed (but need to make sure it is not expected
      later since removing changes order)
    * base on photo_all + photo_multi to get connections to old

    Adds two fields to the photo_multi dict
    * same_PhoId: same phoId different file
    * same_object: same objId different phoId

    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :return: None (but updates photo_multi)
    """
    output(u"Samesame()")

    # load all objId connections from photo_multi
    objIdConnection = {}
    output(u'\tloading objects from photo_multi...')
    for k, v in photo_multi.iteritems():
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        objIds = v['PhoObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append((phoId, phoMullId))

    # load only objId connections from photoAll where object in photo_multi
    output(u'\tloading objects from photoAll...')
    for k, v in photoAll.iteritems():
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        objIds = v['PhoObjId']
        for objId in objIds:
            if objId in objIdConnection.keys():
                objIdConnection[objId].append((phoId, phoMullId))

    # remove any with only one associated phoId
    for k, v in objIdConnection.items():
        if len(v) < 2:
            del objIdConnection[k]
    output(u'\tfound %d objects in multiple photos' % len(objIdConnection))

    # invert objIdConnection to get per phoId connection
    output('\tinverting objIdConnection...')
    phoIdConnection = {}
    for objId, v in objIdConnection.iteritems():
        allPhoMull = [entry[1] for entry in v]
        for phoId, phoMullId in v:
            if phoId not in phoIdConnection.keys():
                phoIdConnection[phoId] = []
            phoIdConnection[phoId] += allPhoMull

    output('\tadding samesame to photo...')
    for k, v in photo_multi.iteritems():
        v['same_PhoId'] = ''  # @toDo remove once safe
        v['same_object'] = []
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        if phoId in phoIdConnection.keys():
            ll = list(set(phoIdConnection[phoId]))  # clone and remove dupes
            ll.remove(phoMullId)  # remove self
            v['same_object'] = ll
    output(u"...done")

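# Toy example (hypothetical ids, wrapped in an uncalled demo function) of what
# samesame() adds: two photos of the same object end up listing each other's
# "PhoId:MulId" in same_object.
def _demo_samesame():
    photo_multi_demo = {
        '1': {'PhoId': '1', 'MulId': '10', 'PhoObjId': ['100']},
        '2': {'PhoId': '2', 'MulId': '20', 'PhoObjId': ['100']},
        '3': {'PhoId': '3', 'MulId': '30', 'PhoObjId': ['200']},
    }
    samesame(photo_multi_demo, {})
    # photo_multi_demo['1']['same_object'] == ['2:20']
    # photo_multi_demo['2']['same_object'] == ['1:10']
    # photo_multi_demo['3']['same_object'] == []
    return photo_multi_demo
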
def objDaten_sam(objDatenSamFile, objDaten):
    """
    Adds an ObjDaten-samhörande based 'related' field to ObjDaten, connecting
    each object to any related objects.

    :param objDatenSamFile: path to ObjDaten-samhörande data file
    :param objDaten: objDaten dict
    :return: None (but updates objDaten)
    """
    # setup
    output(u"Adding ObjDaten-samhörande to ObjDaten")

    # handle objDatenSam
    output('\treading ObjDaten_-_samhörande_nr into dictionary... (slow)')
    objDatenSamHeader = 'OobId|OobObj1ID|OobObj2ID'
    objDatenSam = helpers.csvFileToDict(objDatenSamFile, 'OobId',
                                        objDatenSamHeader)

    # map object connections
    output('\tmapping object connections...')
    objIdConnection = {}
    for k, v in objDatenSam.iteritems():
        objId1 = v['OobObj1ID']
        objId2 = v['OobObj2ID']
        if objId1 not in objIdConnection.keys():
            objIdConnection[objId1] = []
        if objId2 not in objIdConnection.keys():
            objIdConnection[objId2] = []
        objIdConnection[objId1].append(objId2)
        objIdConnection[objId2].append(objId1)
    output('\tfound %d connected objIds in %d entries'
           % (len(objIdConnection), len(objDatenSam)))

    # clean up connections
    output('\tremoving dupes, invalids and self...')
    for objId, connectedIds in objIdConnection.items():
        connectedIds = list(set(connectedIds))  # remove dupes
        if objId in connectedIds:
            connectedIds.remove(objId)  # remove self
        for conId in connectedIds[:]:  # slice allows changes from within loop
            if conId not in objDaten.keys():
                connectedIds.remove(conId)  # remove invalid

        # delete or update
        if not connectedIds:
            del objIdConnection[objId]
        else:
            objIdConnection[objId] = connectedIds

    # add to objDaten
    output('\tadding connections to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['related'] = []
        if objId in objIdConnection.keys():
            v['related'] = objIdConnection.pop(objId)

    output(u"...done")

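# Toy example (hypothetical ids) of the 'related' field added above: the
# samhörande pairs are made symmetric, deduplicated and restricted to objects
# that actually exist in objDaten.
#
#   objDaten_demo = {'1': {'ObjId': '1'}, '2': {'ObjId': '2'}}
#   given a samhörande file containing the single pair
#   (OobObj1ID=1, OobObj2ID=2), objDaten_sam() would leave:
#       objDaten_demo['1']['related'] == ['2']
#       objDaten_demo['2']['related'] == ['1']
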
def ausstellung_objDaten(austellungFile, objDaten):
    """
    Given the ausstellung data file and the objDaten data add an
    ausstellung id field to objDaten. Also returns the ausstellung data
    after
        * adding a std_year field
        * combining all objIds for the same ausId
        * dropping AobId

    :param austellungFile: path to ausstellung data file
    :param objDaten: objDaten dict
    :return: dict (and updates objDaten)
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any year formatting issues mentioned "
                         u"in the analysis log have been corrected and the "
                         u"updated ausstellung file saved...\n"
                         u"...by pressing enter when done")

    # setup
    dummyTitles = (
        u'reparation', u'utställning', u'lån för undersökning',
        u'OBS! Testpost för admin - utställning, export wikimedia commons',
        u'lån till Frankrike 1947', u'test karin 20100520',
        u'test 20100629 (en post skapad för administrativa tester)',
        u'tennföremål 8 st till Strömsholm', u'utlån f justering av urverk')
    output(u"Trimming ausstellung and adding ausstellung to ObjDaten...")

    # handle ausstellung
    austellungHeader = 'AobId|AusId|AusTitelS|AusOrtS|AusJahrS|AusDatumVonD|' \
                       'AusDatumBisD|AobObjId|AufAufgabeS'
    austellung = helpers.csvFileToDict(austellungFile, 'AobId',
                                       austellungHeader)
    originalSize = len(austellung)

    # collect all ausId and drop any with invalid title
    # @toDo: Is keeping objId in austellung really needed?
    #        Otherwise populate objIdConnection here
    foundAusId = {}
    for k, v in austellung.items():  # items() allows removing entries from within loop
        ausId = v['AusId']
        objId = v['AobObjId']
        title = v['AusTitelS']
        if not title or title in dummyTitles:
            # remove empty/dummy entries
            del austellung[k]
        elif ausId not in foundAusId:
            # keep this entry
            foundAusId[ausId] = k
            austellung[k]['AobObjId'] = set([objId, ])
            austellung[k].pop('AobId')  # drop unnecessary id
        else:
            # keep only the objId part of this entry
            austellung[foundAusId[ausId]]['AobObjId'].add(objId)
            del austellung[k]
    output(u'\taustellung reduced from %d to %d entries' %
           (originalSize, len(austellung)))

    # populate std_year
    output(u'\tstandardising years...')
    for k, v in austellung.iteritems():
        year = v['AusJahrS']
        yfrom = v['AusDatumVonD'].replace(u' 00:00:00', u'').strip()
        ytil = v['AusDatumBisD'].replace(u' 00:00:00', u'').strip()
        v['std_year'] = stdAustellungYear(year, yfrom, ytil)
        # to match with pre-redux results. Could possibly be dropped instead?
        v['AusDatumVonD'] = yfrom
        v['AusDatumBisD'] = ytil

    # invert to get per objId connections and convert set to list
    objIdConnection = {}
    for k, v in austellung.iteritems():
        ausId = v['AusId']
        objIds = v['AobObjId']
        v['AobObjId'] = list(objIds)
        for objId in objIds:
            if objId not in objIdConnection:
                objIdConnection[objId] = []
            objIdConnection[objId].append(ausId)

    output(u'\tadding ausId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ausId'] = []
        if objId in objIdConnection:
            v['ausId'] = objIdConnection.pop(objId)
    output(u"...done")
    return austellung
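

# Illustrative sketch of how ausstellung_objDaten() collapses the per-AobId
# rows into one entry per AusId, collecting all connected objIds. Not part
# of the original pipeline; the rows below are toy data.
def _demo_ausstellung_grouping():
    rows = [
        {'AusId': u'5', 'AobObjId': u'100'},
        {'AusId': u'5', 'AobObjId': u'101'},
        {'AusId': u'6', 'AobObjId': u'100'},
    ]
    grouped = {}
    for row in rows:
        grouped.setdefault(row['AusId'], set()).add(row['AobObjId'])
    # grouped == {u'5': set([u'100', u'101']), u'6': set([u'100'])}
    return grouped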
def kuenstler_objDaten(kuenstlerFile, objDaten, logFile):
    """
    Given the kuenstler data file and the objDaten data add a kuenstler id
    field to objDaten. Also returns the kuenstler data after
        * removing certain irrelevant roles and dummy entries
        * combining all objIds for the same kueId
        * standardising years
        * dropping a lot of unneeded fields

    :param kuenstlerFile: path to kuenstler data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Crunching kuenstler...")
    dummyNames = (u'ingen uppgift', )
    badRoles = (u'Leverantör', u'Auktion', u'Förmedlare', u'Givare',
                u'Återförsäljare', u'Konservator')
    badRoleCmts = (u'Förpaktare, kontrollör', u'av kopia')
    droppedFields = ('OkuId', 'ObjAufId', 'AufAufgabeS', 'OkuArtS',
                     'OkuFunktionS', 'OkuValidierungS', 'KudArtS',
                     'MulId', 'PhoId')

    # handle kuenstler
    kuenstlerHeader = 'OkuId|ObjId|ObjAufId|AufAufgabeS|KueId|KueVorNameS|' \
                      'KueNameS|OkuArtS|OkuFunktionS|OkuValidierungS|KudArtS|' \
                      'KudDatierungS|KudJahrVonL|KudJahrBisL|KudOrtS|KudLandS|' \
                      'KueFunktionS|MulId|PhoId'
    kuenstler = helpers.csvFileToDict(kuenstlerFile, ('OkuId', 'MulId'),
                                      kuenstlerHeader)
    originalSize = len(kuenstler)

    # collect all kueId and drop any with invalid name or role
    # also invert to get per objId connections
    # @toDo: Is keeping objId in kuenstler really needed?
    #        Otherwise populate objIdConnection here
    foundKueId = {}
    objIdConnection = {}
    for k, v in kuenstler.items():  # items() allows removing entries from within loop
        kueId = v['KueId']
        objId = v['ObjId']
        fName = v['KueVorNameS']
        lName = v['KueNameS']
        role = v['OkuArtS']
        roleCmt = v['OkuFunktionS']

        # filter out any undesired entries
        if role in badRoles or \
                roleCmt in badRoleCmts or \
                len(fName) + len(lName) == 0 or \
                lName in dummyNames:
            del kuenstler[k]
            continue

        # store the unique role/kueId combo for the objId
        kueCombo = u'%s:%s:%s' % (role, roleCmt, kueId)
        if objId not in objIdConnection:
            objIdConnection[objId] = set([])
        objIdConnection[objId].add(kueCombo)

        # keep only one entry per unique kueId
        if kueId not in foundKueId:
            # keep this entry
            foundKueId[kueId] = k
            kuenstler[k]['ObjId'] = set([objId, ])
        else:
            # keep only the objId part of this entry
            kuenstler[foundKueId[kueId]]['ObjId'].add(objId)
            del kuenstler[k]
    output(u'\tkueIds: reduced from %d to %d' %
           (originalSize, len(kuenstler)))

    # add to objDaten
    output(u'\tadding kueId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['role:roleCmt:kueId'] = []
        if objId in objIdConnection:
            v['role:roleCmt:kueId'] = list(objIdConnection.pop(objId))

    # further cleanup of kuenstler
    # * correcting ort/land entries
    # * stripping years from name
    # * dropping a bunch of fields
    output(u'\tfurther cleanup of kuenstler...')
    for k, v in kuenstler.iteritems():
        land = v['KudOrtS']  # misnamed in original database
        ort = v['KudLandS']  # misnamed in original database
        lName = v['KueNameS']
        bYear = v['KudJahrVonL']
        dYear = v['KudJahrBisL']
        objIds = v['ObjId']

        # correct misnaming in original database
        v['KudOrtS'] = ort
        v['KudLandS'] = land

        # convert set to list
        v['ObjId'] = list(objIds)

        # take year info out of name, and store it in the year fields
        lName, bYear, dYear, log = extractKuenstlerYear(lName, bYear, dYear)
        if log:
            flog.write(log)
        v['KueNameS'] = lName
        v['KudJahrVonL'] = bYear
        v['KudJahrBisL'] = dYear
        for field in droppedFields:
            del v[field]

    flog.close()
    output(u"...done")
    return kuenstler
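

# Illustrative sketch of the per-kueId deduplication in kuenstler_objDaten():
# the first entry seen for a kueId is kept and the objIds of later duplicates
# are merged into it. Not part of the original pipeline; the rows below are
# toy data keyed by a stand-in for OkuId.
def _demo_kuenstler_dedup():
    rows = {
        u'a': {'KueId': u'7', 'ObjId': u'100'},
        u'b': {'KueId': u'7', 'ObjId': u'101'},
    }
    foundKueId = {}
    for k, v in rows.items():  # items() allows deleting from within the loop
        kueId = v['KueId']
        objId = v['ObjId']
        if kueId not in foundKueId:
            # keep this entry, converting its ObjId to a set
            foundKueId[kueId] = k
            v['ObjId'] = set([objId])
        else:
            # merge the objId into the kept entry and drop the duplicate
            rows[foundKueId[kueId]]['ObjId'].add(objId)
            del rows[k]
    # rows now holds a single entry whose ObjId is set([u'100', u'101'])
    return rows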