Example #1
def makeEvents(A, oDict):
    '''
    Populate mapping-tables for Events
    Analysis of Ereignis
    '''
    # oDict gives frequency of each objId
    # A.objD[k][u'ergId'] gives the exhibit id(s) of each object
    # A.ereignisD[k][u'ErgKurztitelS'] gives the title
    # A.ereignisD[k][u'ErgArtS'] gives the wikilink

    # get frequency for each exhibit
    eventFreq = {}
    for k, v in oDict.iteritems():
        ergIds = A.objD[k][u'ergId']
        if ergIds:
            ergIds = ergIds.split(';')
            for e in ergIds:
                if e in eventFreq.keys():
                    eventFreq[e] += v
                else:
                    eventFreq[e] = v

    # get frequency for each event
    events = {}
    for k, v in eventFreq.iteritems():
        title = A.ereignisD[k][u'ErgKurztitelS']
        link = A.ereignisD[k][u'ErgArtS']
        if title in events.keys():
            events[title][u'freq'] += v
            if link != events[title][u'link']:
                output(u'Found two events with the same title but '
                       u'different links: %s' % k)
        else:
            events[title] = {u'link': link, u'freq': v}
    return events
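The function above is just two frequency roll-ups: each object's frequency is spread over its semicolon-separated event ids, and the per-event counts are then merged by title. A minimal standalone sketch of that logic, using invented placeholder dicts instead of the real A.objD / A.ereignisD tables:

# Standalone sketch of the two-step roll-up in makeEvents().
# All data below is invented for illustration.
oDict = {'obj1': 3, 'obj2': 1}
objD = {'obj1': {'ergId': 'e1;e2'}, 'obj2': {'ergId': 'e1'}}
ereignisD = {
    'e1': {'ErgKurztitelS': 'Coronation', 'ErgArtS': '[[sv:Kroning]]'},
    'e2': {'ErgKurztitelS': 'Wedding', 'ErgArtS': '[[sv:Brollop]]'},
}

eventFreq = {}
for objId, freq in oDict.items():
    for ergId in objD[objId]['ergId'].split(';'):
        eventFreq[ergId] = eventFreq.get(ergId, 0) + freq

events = {}
for ergId, freq in eventFreq.items():
    entry = events.setdefault(
        ereignisD[ergId]['ErgKurztitelS'],
        {'link': ereignisD[ergId]['ErgArtS'], 'freq': 0})
    entry['freq'] += freq

print(events)  # 'Coronation' ends up with freq 4, 'Wedding' with freq 3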
Example #2
def makeFilenames(descriptions, photo, filenamesFile):
    """
    Given file descriptions, output them as CSV for later import
    :param descriptions: dict of descriptions with phoId as key
    :param photo: the photo data
    :param filenamesFile: the target file for output
    :return: None
    """
    # setup
    filenamesHeader = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'

    # make a dict to be able to reuse helpers.dictToCsvFile()
    filenames = {}
    for phoId, v in descriptions.iteritems():
        filenames[phoId] = {
            'PhoId': phoId,
            'MulId': photo[phoId]['MulId'],
            'MulPfadS': photo[phoId]['MulPfadS'],
            'MulDateiS': photo[phoId]['MulDateiS'],
            'filename': v['filename'],
            'ext': ''
        }

    # output
    helpers.dictToCsvFile(filenamesFile, filenames, filenamesHeader)
    output(u'Created %s' % filenamesFile)
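A hedged usage sketch, assuming descriptions and photo are keyed by phoId as the docstring says, and assuming helpers.dictToCsvFile writes one pipe-separated row per entry (behaviour inferred from the header string; the module's helpers and output() must be importable):

# Hypothetical input shapes; keys and field names mirror the code above.
descriptions = {'123': {'filename': 'Portrait of a man - LSH - 123'}}
photo = {'123': {'MulId': '9', 'MulPfadS': 'R:\\web', 'MulDateiS': 'IMG_0001'}}

makeFilenames(descriptions, photo, 'filenames.csv')
# expected to produce a CSV whose header row is
# PhoId|MulId|MulPfadS|MulDateiS|filename|ext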
Example #3
def writePhotographers(filename, dDict):
    '''
    output photographers in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Photographer|' \
        u'creator=|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|creator   = %s\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n\n' % CSV_FILES[u'photo'] \
        + u'===Photographers===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], val[u'creator'], val[u'cat']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
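helpers.sortedBy is assumed here to yield (key, value) pairs in order of decreasing value[u'freq'], so once the first zero-frequency entry is reached the main table is closed and a "Preserved mappings" section is opened. A usage sketch with an invented mapping dict (the module constants such as LIST_CAT and the helpers/output imports must be in scope):

# Hypothetical photographer mapping; helpers.sortedBy is assumed to sort
# by decreasing u'freq'.
dDict = {
    u'Jens Mohr': {u'freq': 12, u'creator': u'', u'cat': u''},
    u'Okand': {u'freq': 0, u'creator': u'', u'cat': u''},  # lands under "Preserved mappings"
}
writePhotographers(u'photographers.wiki', dDict)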
Example #4
def writeMaterials(filename, dDict):
    '''
    output materials in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Technique/material|technique=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        + u'|name      = %s\n' \
        + u'|frequency = %d\n' \
        + u'|technique = %s\n' \
        + u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'objMultiple'] \
        + u'commonsconnection is the relevant parameter for ' \
        + u'{{tl|technique}}. Don\'t forget to add a translation in ' \
        + u'Swedish at [[Template:Technique/sv]]\n\n' \
        + u'Set commonsconnection of irrelevant technique/material ' \
        + u'to "-".\n\n' \
        + u'===technique/material|frequency|commonsconnection===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #5
def writeKeywords(filename, dDict):
    '''
    output keywords in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|more      = %s\n' \
        u'|frequency = %d\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'stichwort'] \
        + u'Set commonsconnection of irrelevant keywords to "-"\n\n' \
        + u'Multiple categories are separated by "/"\n' \
        + u'===Keyword|frequency|description|commonsconnection===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, '/'.join(val[u'descr']),
                       val[u'freq'], '/'.join(val[u'cat'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #6
def writeKeywords(filename, dDict):
    '''
    output keywords in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|more      = %s\n' \
        u'|frequency = %d\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'stichwort'] \
        + u'Set commonsconnection of irrelevant keywords to "-"\n\n' \
        + u'Multiple categories are separated by "/"\n' \
        + u'===Keyword|frequency|description|commonsconnection===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, '/'.join(val[u'descr']), val[u'freq'], '/'.join(
            val[u'cat'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #7
def writePhotographers(filename, dDict):
    '''
    output photographers in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Photographer|' \
        u'creator=|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|creator   = %s\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n\n' % CSV_FILES[u'photo'] \
        + u'===Photographers===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], val[u'creator'], val[u'cat']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #8
def makeEvents(A, oDict):
    '''
    Populate mapping-tables for Events
    Analysis of Ereignis
    '''
    # oDict gives frequency of each objId
    # A.objD[k][u'ergId'] gives the exhibit id(s) of each object
    # A.ereignisD[k][u'ErgKurztitelS'] gives the title
    # A.ereignisD[k][u'ErgArtS'] gives the wikilink

    # get frequency for each exhibit
    eventFreq = {}
    for k, v in oDict.iteritems():
        ergIds = A.objD[k][u'ergId']
        if ergIds:
            ergIds = ergIds.split(';')
            for e in ergIds:
                if e in eventFreq.keys():
                    eventFreq[e] += v
                else:
                    eventFreq[e] = v

    # get frequency for each event
    events = {}
    for k, v in eventFreq.iteritems():
        title = A.ereignisD[k][u'ErgKurztitelS']
        link = A.ereignisD[k][u'ErgArtS']
        if title in events.keys():
            events[title][u'freq'] += v
            if link != events[title][u'link']:
                output(u'Found two events with the same title but '
                       u'different links: %s' % k)
        else:
            events[title] = {u'link': link, u'freq': v}
    return events
Example #9
def writeMaterials(filename, dDict):
    '''
    output materials in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Technique/material|technique=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        + u'|name      = %s\n' \
        + u'|frequency = %d\n' \
        + u'|technique = %s\n' \
        + u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'objMultiple'] \
        + u'commonsconnection is the relevant parameter for ' \
        + u'{{tl|technique}}. Don\'t forget to add a translation in ' \
        + u'Swedish at [[Template:Technique/sv]]\n\n' \
        + u'Set commonsconnection of irrelevant technique/material ' \
        + u'to "-".\n\n' \
        + u'===technique/material|frequency|commonsconnection===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #10
def makeKeywords(A):
    '''
    Populate mapping-tables for Keywords
    Analysis of stichwords
    '''
    # Create a dict of depicted StichId with frequency as value
    # Working from the trimmed file means each phoId has already been
    # verified to exist.
    keywords = {}
    phoIds = []  # to make sure all phoIds really are present
    for k, v in A.stichD.iteritems():
        descr = v[u'StiSynonymS']
        key = v[u'StiBezeichnungS'].lower()
        if descr == u'':
            descr = u'-'
        if key not in keywords.keys():
            keywords[key] = {u'descr': [], u'freq': 0}
        if descr not in keywords[key][u'descr']:
            keywords[key][u'descr'].append(descr)
        keywords[key][u'freq'] += 1
        # for debugging
        if v[u'PhoId'] not in phoIds:
            phoIds.append(v[u'PhoId'])
    # debug
    for k in A.photoD.keys():
        k = k.split(':')[0]
        if k in phoIds:
            phoIds.remove(k)
    if phoIds:
        output(u'Stichwort_trim still contains unused phoIds')
    return keywords
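The loop above only aggregates: one frequency count per lower-cased keyword plus the list of distinct descriptions seen for it. A toy version of that aggregation, with an invented stichD:

# Toy version of the keyword aggregation; the rows are invented.
stichD = {
    '1': {u'StiBezeichnungS': u'Hund', u'StiSynonymS': u'', u'PhoId': 'p1'},
    '2': {u'StiBezeichnungS': u'hund', u'StiSynonymS': u'dog', u'PhoId': 'p2'},
}
keywords = {}
for v in stichD.values():
    key = v[u'StiBezeichnungS'].lower()
    descr = v[u'StiSynonymS'] or u'-'
    entry = keywords.setdefault(key, {u'descr': [], u'freq': 0})
    if descr not in entry[u'descr']:
        entry[u'descr'].append(descr)
    entry[u'freq'] += 1
print(keywords)  # {u'hund': {u'descr': [u'-', u'dog'], u'freq': 2}}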
Example #11
def negatives(path, ext=u'.tif'):
    """
    Identify and invert all files at the given location.

    * moves file to filename-NEGATIVE_PATTERN.ext
    * creates an inverted file at filename.ext
    * creates an info file for the negative and modifies the info file for the positive
    :param path: relative path to the directory in which to process the files
    :param ext: image file extension (only .tif are ever negatives?)
    """
    negative_appendix = NEGATIVE_PATTERN % ext
    count = 0
    skipcount = 0
    for filename in os.listdir(path):
        if filename.endswith(ext) and \
                not filename.endswith(negative_appendix):
            negative = u'%s%s' % (filename[:-len(ext)], negative_appendix)
            if os.path.isfile(os.path.join(path, negative)):
                output(u'%s was already inverted, skipping...' % filename)
                skipcount += 1
                continue
            invert_file_and_info(path, filename, negative, ext)
            count += 1
            if count % 10 == 0:
                output(u'%d files inverted (%d)' % (count, count + skipcount))
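The only filename arithmetic is splicing NEGATIVE_PATTERN into the name just before the extension; NEGATIVE_PATTERN itself is defined elsewhere in the module, so the value below is an assumption for illustration:

# Sketch of the negative-filename derivation; NEGATIVE_PATTERN is assumed.
NEGATIVE_PATTERN = u'-negative%s'
ext = u'.tif'
negative_appendix = NEGATIVE_PATTERN % ext          # u'-negative.tif'
filename = u'LSH_12345.tif'
negative = u'%s%s' % (filename[:-len(ext)], negative_appendix)
print(negative)  # LSH_12345-negative.tif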
Example #12
def makeKeywords(A):
    '''
    Populate mapping-tables for Keywords
    Analysis of stichwords
    '''
    # Create a dict of depicted StichId with frequency as value
    # Working from the trimmed file means each phoId has already been
    # verified to exist.
    keywords = {}
    phoIds = []  # to make sure all phoIds really are present
    for k, v in A.stichD.iteritems():
        descr = v[u'StiSynonymS']
        key = v[u'StiBezeichnungS'].lower()
        if descr == u'':
            descr = u'-'
        if key not in keywords.keys():
            keywords[key] = {u'descr': [], u'freq': 0}
        if descr not in keywords[key][u'descr']:
            keywords[key][u'descr'].append(descr)
        keywords[key][u'freq'] += 1
        # for debugging
        if v[u'PhoId'] not in phoIds:
            phoIds.append(v[u'PhoId'])
    # debug
    for k in A.photoD.keys():
        k = k.split(':')[0]
        if k in phoIds:
            phoIds.remove(k)
    if phoIds:
        output(u'Stichwort_trim still contains unused phoIds')
    return keywords
Example #13
    def get(self):
        last_metric_at = PostInstallActivityMetric.gql("ORDER BY updated_at DESC").get()
        if last_metric_at is None:
            last_metric_at = "[Never]"
        else:
            last_metric_at = last_metric_at.updated_at

        h.output(self, '<html><head><link href="/public/css/admin/admin.css" type="text/css" rel="stylesheet" /></head><body><div id="loading-msg">Loading...</div><div id="auto-fetch-post-install-activity-metrics"></div><div>Post Install Activity Re-Calculation Status: <span id="status">Last Run at: '+str(last_metric_at)+'</span> <a id="calculate-post-install-activity-metrics-button" href="#">RE-CALCULATE NOW!</a></div><div>Total Users: <span id="total-users">...</span></div><div>Histogram for Post Install Activity: <div id="histogram-text">...</div></div><script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script><script src="/public/js/admin/install-metrics.js" type="text/javascript"></script><script src="/public/js/admin/post-install-activity-metrics.js" type="text/javascript"></script></body></html>')
Example #14
    def get(self):
        span_in_days = self.request.get('span_in_days')
        last_metric_at = KValueMetric.gql("WHERE span_in_days = :1 ORDER BY updated_at DESC", span_in_days).get()
        if last_metric_at is None:
            last_metric_at = "[Never]"
        else:
            last_metric_at = last_metric_at.updated_at

        h.output(self, '<html><head><link href="/public/css/admin/admin.css" type="text/css" rel="stylesheet" /></head><body><div id="loading-msg">Loading...</div><div id="auto-fetch-k-value-metrics"></div><div>K-Value Re-Calculation Status: <span id="status">Last Run at: '+str(last_metric_at)+'</span> <a id="calculate-k-value-metrics-button" href="#">RE-CALCULATE NOW!</a></div><div>Chart for K-Value: <div id="graph">...</div></div><script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script><script type="text/javascript" src="http://www.google.com/jsapi"></script><script src="/public/js/admin/install-metrics.js" type="text/javascript"></script><script src="/public/js/admin/k-value-metrics.js" type="text/javascript"></script></body></html>')
Example #15
def main():
    B, L, D, scores, libraries = helpers.read('./data/d_tough_choices.txt')

    ansLibs = []
    # libraries = [[id, nBooks, tDays, shipsPerDay, [book1, book2, ...]], ...]

    for i in range(0, (L - 1) / 2):

        lib = libraries[2 * i]
        ansLibs.append([lib[0], lib[4]])

    helpers.output(ansLibs, "outputs/D_v1.txt")
Example #16
def main():
    input_fs = [
        "input/a_example.in", "input/b_small.in", "input/c_medium.in",
        "input/d_big.in"
    ]
    for input_f in input_fs:
        output_f = input_f.replace(".in", ".out").replace("input", "output")
        inp = parse(input_f)
        rows, cols, min_ing, max_area, pizza = inp
        slices = cut(inp)
        # print(slices)
        output(output_f, slices)
        print('.', end='')
Example #17
def main():
    input_fs = [
        "./input/0_submission_example.in",
        "./input/1_me_at_the_zoo.in",
        "./input/2_videos_worth_spreading.in",
        "./input/3_trending_today.in",
        "./input/4_kittens.in",
    ]
    for input_f in tqdm.tqdm(input_fs):
        output_f = input_f.replace(".in", ".out").replace("input", "output")
        inp = parse(input_f)
        cs_by_vid = solve(inp)
        output(output_f, cs_by_vid)
Example #18
def moveHits(path, filenamesFile=None):
    """
    Goes through the root export directory to find any matching file and
    moves these to a lower case version of the directory. This flattens
    out the directory structure whilst making it easy to identify any
    non-matched files.
    :param path: path to directory with image file structures
    :param filenamesFile: filenames data file
    :return: None
    """
    # set defaults unless overridden
    filenamesFile = filenamesFile or FILENAMES

    # Find and move all relevant files
    tree, name_to_pho = makeHitlist(filenamesFile)
    subdirs = []
    for filename in os.listdir(path):
        # for LSH all files are in upper case directories
        filename_path = os.path.join(path, filename)
        if os.path.isdir(filename_path) and filename.isupper():
            subdirs.append(filename_path)
    for subdir in subdirs:
        # make a subdir path where (only the) last directory is lower case
        tmp_path, tmp_dir = os.path.split(subdir)
        lower_subdir = os.path.join(tmp_path, tmp_dir.lower())

        counter, file_num = moveFiles(lower_subdir, tree, name_to_pho,
                                      path=subdir)
        output(u'%s: %d out of %d were hits' % (subdir, counter, file_num))

    # load filenames file
    filenames_header = 'PhoId|MulId|MulPfadS|MulDateiS|filename|ext'
    old_filenames = helpers.csvFileToDict(filenamesFile, 'PhoId',
                                          filenames_header)

    # Add found extensions to filenames file
    for phoId, v in old_filenames.iteritems():
        old_filename = v['MulDateiS']
        if old_filename in name_to_pho.keys():
            v['ext'] = name_to_pho[old_filename]['ext']  # overwrite extension

    # output updated file
    helpers.dictToCsvFile(filenamesFile, old_filenames, filenames_header)

    # delete all emptied directories
    for subdir in subdirs:
        removeEmptyDirectories(subdir, top=False)
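The extension back-fill at the end only needs name_to_pho to map the original file name to a dict carrying the found extension; a toy illustration of that step (all rows invented):

# Toy illustration of the 'ext' back-fill; rows are invented.
name_to_pho = {'IMG_0001': {'ext': 'tif'}}
old_filenames = {'123': {'MulDateiS': 'IMG_0001', 'ext': ''}}
for phoId, row in old_filenames.items():
    hit = name_to_pho.get(row['MulDateiS'])
    if hit:
        row['ext'] = hit['ext']
print(old_filenames)  # {'123': {'MulDateiS': 'IMG_0001', 'ext': 'tif'}}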
Example #19
def writePlaces(filename, exhibitPlaces, landDict, ortDict, emptyPlaces):
    '''
    output Places in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Place|' \
        u'other=Commons connection}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|other     = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s - col: ausOrt-->\n' % CSV_FILES[u'ausstellung'] \
        + u'<!--From: %s for OmuTypS = Tillverkningsland -->\n' % CSV_FILES[u'objMultiple'] \
        + u'<!--From: %s for OmuTypS = Tillverkningsort-->\n' % CSV_FILES[u'objMultiple'] \
        + u'The preferred order of making connections is: Institution, page, category ' \
        + u'(where the category is prefixed by a ":").\n\n' \
        + u'Set commonsconnection of irrelevant places to "-"\n\n' \
        + u'===Place|Frequency|Commonsconnection===\n'
    # output
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(u'\n====exhibit places====\n')
    f.write(header)
    for key, val in helpers.sortedBy(exhibitPlaces):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====origin-Countries====\n')
    f.write(header)
    for key, val in helpers.sortedBy(landDict):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====origin-cities====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ortDict):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====Preserved mappings====\n')
    f.write(header)
    for key, val in helpers.sortedBy(emptyPlaces):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #20
def writeObjKeywords(filename, ord1Dict, ord2Dict, gruppDict, emptyObjCats):
    '''
    output ObjKeywords in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'objDaten'] \
        + u'These are the keywords used to describe the objects ' \
        + u'themselves. Classification is used for all items whereas ' \
        + u'group is only used at HWY.\n\n' \
        + u'When possible, ord1 will be used instead of the more ' \
        + u'generic ord2.\n\n' \
        + u'Multiple categories are separated by a "/"\n' \
        + u'===Keyword|frequency|commonscategory===\n'
    # output
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(u'\n====class: ord1====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ord1Dict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====class: ord2====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ord2Dict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====class: HWY-grupp====\n')
    f.write(header)
    for key, val in helpers.sortedBy(gruppDict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====Preserved mappings====\n')
    f.write(header)
    for key, val in helpers.sortedBy(emptyObjCats):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #21
def writeObjKeywords(filename, ord1Dict, ord2Dict, gruppDict, emptyObjCats):
    '''
    output ObjKeywords in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'objDaten'] \
        + u'These are the keywords used to describe the objects ' \
        + u'themselves. Classification is used for all items whereas ' \
        + u'group is only used at HWY.\n\n' \
        + u'When possible, ord1 will be used instead of the more ' \
        + u'generic ord2.\n\n' \
        + u'Multiple categories are separated by a "/"\n' \
        + u'===Keyword|frequency|commonscategory===\n'
    # output
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(u'\n====class: ord1====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ord1Dict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====class: ord2====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ord2Dict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====class: HWY-grupp====\n')
    f.write(header)
    for key, val in helpers.sortedBy(gruppDict):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n====Preserved mappings====\n')
    f.write(header)
    for key, val in helpers.sortedBy(emptyObjCats):
        f.write(row % (key, val[u'freq'], '/'.join(val[u'connect'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #22
def commonsOutput(descriptions, mappingFile, allEntries=None):
    """
    Given file descriptions, output them in a Commons export format
    :param descriptions: dict of descriptions with phoId as key
    :param mappingFile: the target file for output
    :param allEntries: optional, a list of phoIds giving the order in which
                       to output the entries. This allows for easier
                       diff comparison
    :return: None
    """
    # setup
    fOut = codecs.open(mappingFile, 'w', 'utf-8')
    chunkSize = 250
    chunkStart = u"====%d-%d====\n" \
                 u"{| class=\"wikitable sortable\"\n|-\n! PhoId !! generated " \
                 u"<descr> !! improved <descr>\n"
    rowFormat = u"|-\n| %s || %s || \n"

    # write intro
    fOut.write(
        u'Final filename becomes: <descr> - <museum> - <photoId>.<ext>\n\n'
        u'Attempts have been made to keep descriptions under %d characters '
        u'with a hard limit at %d characters\n\n'
        u'You are free to improve the descriptions by adding an alternative '
        u'in the last column.\n'
        u'===phoId | description | new description===\n\n'
        u'%s' % (GOODLENGTH, MAXLENGTH, chunkStart % (0, chunkSize)))

    if allEntries is None:
        allEntries = descriptions.keys()
    counter = 0
    for phoId in allEntries:
        # Add regular breaks
        counter += 1
        if counter % chunkSize == 0:
            fOut.write(u'|}\n\n' + chunkStart % (counter, counter + chunkSize))

        # write row
        descr = descriptions[phoId]['descr']
        fOut.write(rowFormat % (phoId, insufficient(descr)))

    # write outro
    fOut.write(u'|}')
    fOut.write(u'\n\n[[%s]]' % LIST_CAT)
    fOut.close()
    output(u'Created %s' % mappingFile)
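The table is broken into chunks purely by counting rows: every chunkSize-th row a new "====start-end====" heading and table header are written before that row. A reduced sketch of the counter arithmetic:

# Sketch of the chunking arithmetic, with a small chunkSize for illustration.
chunkSize = 3
counter = 0
for phoId in ['a', 'b', 'c', 'd', 'e', 'f', 'g']:
    counter += 1
    if counter % chunkSize == 0:
        print('close table, start chunk %d-%d' % (counter, counter + chunkSize))
    # ... the row for phoId is written here ...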
Example #23
def writePlaces(filename, exhibitPlaces, landDict, ortDict, emptyPlaces):
    '''
    output Places in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Place|' \
        u'other=Commons connection}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|other     = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s - col: ausOrt-->\n' % CSV_FILES[u'ausstellung'] \
        + u'<!--From: %s for OmuTypS = Tillverkningsland -->\n' % CSV_FILES[u'objMultiple'] \
        + u'<!--From: %s for OmuTypS = Tillverkningsort-->\n' % CSV_FILES[u'objMultiple'] \
        + u'The preferred order of making connections is: Institution, page, category ' \
        + u'(where the category is prefixed by a ":").\n\n' \
        + u'Set commonsconnection of irrelevant places to "-"\n\n' \
        + u'===Place|Frequency|Commonsconnection===\n'
    # output
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(u'\n====exhibit places====\n')
    f.write(header)
    for key, val in helpers.sortedBy(exhibitPlaces):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====origin-Countries====\n')
    f.write(header)
    for key, val in helpers.sortedBy(landDict):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====origin-cities====\n')
    f.write(header)
    for key, val in helpers.sortedBy(ortDict):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n====Preserved mappings====\n')
    f.write(header)
    for key, val in helpers.sortedBy(emptyPlaces):
        f.write(row % (key, val[u'freq'], val[u'connect']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #24
def combineEvents(oldCatDict, oldLinkDict, newDict):
    '''
    Enrich the new mapping with a previously done mapping.
    newDict has freq and link parameters.
    The old mapping is split into:
    * oldCatDict: list of categories
    * oldLinkDict: a link written ":sv:A link"
    Note that a link can exist in both new and old, but new uses "sv:A_link".
    '''
    for k, v in newDict.iteritems():
        newDict[k][u'cat'] = u''
        newDict[k][u'link'] = newDict[k][u'link'].strip(u'[]')
        if k in oldCatDict.keys():  # assume key list is same in both
            if oldCatDict[k] is not None:
                newDict[k][u'cat'] = oldCatDict[k]
            if oldLinkDict[k] is not None:
                oldlink = oldLinkDict[k]
                newlink = newDict[k][u'link'].replace('_', ' ')
                if oldlink != newlink:
                    # check if the same, otherwise use old
                    if newlink:
                        output(u'Ereignis: replaced %s by %s' %
                               (newlink, oldlink))
                    newlink = oldlink
                newDict[k][u'link'] = newlink  # reformatted and possibly replaced
            del oldCatDict[k]  # no need to delete oldLinkDict if we iterate over cat

    # add any previous mapping
    for k, v in oldCatDict.iteritems():
        cat = v
        link = oldLinkDict[k]
        if (cat is not None) or (link is not None):
            if cat is None:
                cat = u''
            if link is None:
                link = u''
            newDict[k] = {u'freq': 0, u'cat': cat, u'link': link}

    return newDict
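A usage sketch of the merge, with invented keys and values; note that combineEvents() mutates oldCatDict and calls output(), so both need to be in scope:

# Invented example: the old link (':sv:...') wins over the reformatted new one
# and the old category is carried across.
newDict = {u'Kroningen 1650': {u'freq': 5, u'link': u'[[sv:Kroningen_1650]]'}}
oldCatDict = {u'Kroningen 1650': u'Category:Coronation of Karl X Gustav'}
oldLinkDict = {u'Kroningen 1650': u':sv:Kroningen 1650'}

combined = combineEvents(oldCatDict, oldLinkDict, newDict)
print(combined[u'Kroningen 1650'])
# {u'freq': 5, u'cat': u'Category:...', u'link': u':sv:Kroningen 1650'}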
Example #25
def main():
    B, L, D, scores, libraries = h.read(
        "../data/e_so_many_books.txt"
    )  # libraries is [id,NBooks,TDays,MShipsperday,[books]]
    # TODO Call get_points
    book_scores = get_book_point_lib(libraries, scores)

    #list.sort(libraries, key=lambda library:get_points(library,book_scores), reverse=True)
    tot_points = 0
    # sort books by value and add total points to calculate the average
    for lib in libraries:
        list.sort(lib[4], key=lambda book: book_scores[book], reverse=True)
        tot_points += get_points(lib, book_scores)
    average_points = tot_points / L
    list.sort(
        libraries,
        key=lambda library: get_points2(library, book_scores, average_points),
        reverse=True)
    ansLibs = []

    day = 0
    new_libraries = []
    for lib in libraries:
        day_local = day + lib[2]  # Add time to set up
        books_to_scan = []
        while day_local < D:
            list.sort(lib[4], key=lambda book: book_scores[book],
                      reverse=True)  #sort
            books_to_scan.append(lib[4][0:lib[2]])
            for i in range(lib[2]):
                if i < len(lib[4]):
                    books_to_scan.append(lib[4][i])
                    book_scores[lib[4][i]] = 0
            day_local += lib[2]  #iterate over days
        new_libraries.append([lib[0], books_to_scan])

    #print("Days total are: " + str(D))
    for i in range(int((L - 1) / 2)):
        lib = new_libraries[2 * i]
        ansLibs.append([lib[0], lib[1]])  # new_libraries entries are [id, books_to_scan]
    h.output(ansLibs, "../outputs/E_v1.txt")
Example #26
def writeEvents(filename, dDict):
    '''
    output events in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Event|' \
        u'link=Wikipedia-link|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|link      = %s\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->' % CSV_FILES[u'ereignis'] \
        + u'\'\'wikipedia-link\'\' is used for descriptive texts ' \
        + u'whereas \'\'commonsconnection\'\' is a relevant category ' \
        + u'on commons.\n\n' \
        + u'Set commonsconnection of irrelevant events to "-"\n\n' \
        + u'Multiple categories are separated by "/"\n\n' \
        + u'*död/begravning: [[:Category:Funeral of X of Sweden]]\n' \
        + u'*kröning: [[:Category:Coronation of X of Sweden]]\n' \
        + u'*bröllop: [[:Category:Wedding of X and Y of Sweden]]\n' \
        + u'===Event|Frequency|wikipedia-link|Commonsconnection===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], val[u'link'], '/'.join(val[u'cat'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #27
def writeEvents(filename, dDict):
    '''
    output events in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Event|' \
        u'link=Wikipedia-link|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|frequency = %d\n' \
        u'|link      = %s\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->' % CSV_FILES[u'ereignis'] \
        + u'\'\'wikipedia-link\'\' is used for descriptive texts ' \
        + u'whereas \'\'commonsconnection\'\' is a relevant category ' \
        + u'on commons.\n\n' \
        + u'Set commonsconnection of irrelevant events to "-"\n\n' \
        + u'Multiple categories are separated by "/"\n\n' \
        + u'*död/begravning: [[:Category:Funeral of X of Sweden]]\n' \
        + u'*kröning: [[:Category:Coronation of X of Sweden]]\n' \
        + u'*bröllop: [[:Category:Wedding of X and Y of Sweden]]\n' \
        + u'===Event|Frequency|wikipedia-link|Commonsconnection===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (key, val[u'freq'], val[u'link'], '/'.join(val[u'cat'])))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #28
def writePeople(filename, dDict):
    '''
    output People in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Name <small>(kueId)</small>' \
        u'|link=Wikipedia-link|creator=|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|more      = %s\n' \
        u'|frequency = %d\n' \
        u'|link      = %s\n' \
        u'|creator   = %s\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'kuenstler'] \
        + u'\'\'wikipedia-link\'\' is used for descriptive texts whereas ' \
        + u'creator is a creator template on commons and ' \
        + u'\'\'commoncat\'\' is a relevant category on commons.\n\n' \
        + u'Set commonsconnection of irrelevant people to "-". ' \
        + u'Note that creator is only relevant for artists.\n\n' \
        + u'===kueId|frequency|name|wikipedia-link|creator|commoncat===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (val[u'descr'], key, val[u'freq'], val[u'link'],
                       val[u'creator'], val[u'cat']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #29
def writePeople(filename, dDict):
    '''
    output People in Commons format
    '''
    # set-up
    header = u'{{user:Lokal Profil/LSH2|name=Name <small>(kueId)</small>' \
        u'|link=Wikipedia-link|creator=|category=}}\n'
    row = u'{{User:Lokal Profil/LSH3\n' \
        u'|name      = %s\n' \
        u'|more      = %s\n' \
        u'|frequency = %d\n' \
        u'|link      = %s\n' \
        u'|creator   = %s\n' \
        u'|category  = %s\n' \
        u'}}\n'
    footer = u'|}\n'
    intro = u'<!--From: %s -->\n' % CSV_FILES[u'kuenstler'] \
        + u'\'\'wikipedia-link\'\' is used for descriptive texts whereas ' \
        + u'creator is a creator template on commons and ' \
        + u'\'\'commoncat\'\' is a relevant category on commons.\n\n' \
        + u'Set commonsconnection of irrelevant people to "-". ' \
        + u'Note that creator is only relevant for artists.\n\n' \
        + u'===kueId|frequency|name|wikipedia-link|creator|commoncat===\n'
    # output
    once = True
    f = codecs.open(filename, 'w', 'utf8')
    f.write(intro)
    f.write(header)
    for key, val in helpers.sortedBy(dDict):
        if once and val[u'freq'] == 0:
            once = False
            f.write(footer)
            f.write(u'\n===Preserved mappings===\n')
            f.write(header)
        f.write(row % (val[u'descr'], key, val[u'freq'], val[u'link'],
                       val[u'creator'], val[u'cat']))
    f.write(footer)
    f.write(u'\n\n[[%s]]' % LIST_CAT)
    f.close()
    output(u'Created %s' % filename)
Example #30
def makePhotoAll(photoAllFile, photo_multi, logFile):
    """
    @todo: if dupes are found, prompt manual cleanup and then re-run
           makePhotoAll(); that way the crash isn't complete.
    Given the photoAll data file, read it and drop any entries without a
    Commons connection. Also simplify the data.
    :param photoAllFile: path to photoAll data file
    :param photo_multi: photo_multi dict
    :param logFile: path to logfile
    :return: dict
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll "
                         u"analysis log have been corrected and the updated "
                         u"photoAll file saved...\n"
                         u"...by pressing enter when done")

    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Loading photoAll...")
    photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                     'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'),
                                     photoAllHeader)
    originalSize = len(photoAll)

    for k, v in photoAll.items():
        link = v['PhoSystematikS']

        # drop any entries without files
        if not link:
            del photoAll[k]
            continue

        # simplify link
        if '%' in link:
            link = helpers.urldecode_utf8(link)
        link = helpers.external_2_internal_link(link, project='wikimedia')
        link = link[len('[[:commons:File:'):-len(']]')]
        v['PhoSystematikS'] = link
    output('PhotoAll reduced from %d to %d entries' % (originalSize,
                                                       len(photoAll)))

    # check that none of PhoId from photo_multi occur in photo
    dupes = []
    for phoId in photo_multi.keys():
        phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId'])
        if phoMul in photoAll.keys():
            dupes.append(phoMul)
    if dupes:
        output(u'Found duplicates between photoAll and photo_multi. '
               u'This will most likely mess things up. Check the log at '
               u'%s for details.' % logFile)
        flog.write(u'* duplicates found in photo and photo_all\n'
                   u'phoId:MulId|commonsFile\n')
        for d in dupes:
            flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS']))

    flog.close()
    return photoAll
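The link simplification ends with plain string slicing that strips the internal-link wrapper; a sketch of just that step (the file name is invented, and helpers.urldecode_utf8 / helpers.external_2_internal_link are assumed to have already produced the internal form):

# Sketch of the final slicing step; the link below is invented.
link = u'[[:commons:File:Some painting - LSH - 12345.tif]]'
link = link[len(u'[[:commons:File:'):-len(u']]')]
print(link)  # Some painting - LSH - 12345.tif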
Example #31
def makePhotoAll(photoAllFile, photo_multi, logFile):
    """
    @todo: if dupes are found, prompt manual cleanup and then re-run
           makePhotoAll(); that way the crash isn't complete.
    Given the photoAll data file, read it and drop any entries without a
    Commons connection. Also simplify the data.
    :param photoAllFile: path to photoAll data file
    :param photo_multi: photo_multi dict
    :param logFile: path to logfile
    :return: dict
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll "
                         u"analysis log have been corrected and the updated "
                         u"photoAll file saved...\n"
                         u"...by pressing enter when done")

    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Loading photoAll...")
    photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                     'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'),
                                     photoAllHeader)
    originalSize = len(photoAll)

    for k, v in photoAll.items():
        link = v['PhoSystematikS']

        # drop any entries without files
        if not link:
            del photoAll[k]
            continue

        # simplify link
        if '%' in link:
            link = helpers.urldecode_utf8(link)
        link = helpers.external_2_internal_link(link, project='wikimedia')
        link = link[len('[[:commons:File:'):-len(']]')]
        v['PhoSystematikS'] = link
    output('PhotoAll reduced from %d to %d entries' %
           (originalSize, len(photoAll)))

    # check that none of PhoId from photo_multi occur in photo
    dupes = []
    for phoId in photo_multi.keys():
        phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId'])
        if phoMul in photoAll.keys():
            dupes.append(phoMul)
    if dupes:
        output(u'Found duplicates between photoAll and photo_multi. '
               u'This will most likely mess things up. Check the log at '
               u'%s for details.' % logFile)
        flog.write(u'* duplicates found in photo and photo_all\n'
                   u'phoId:MulId|commonsFile\n')
        for d in dupes:
            flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS']))

    flog.close()
    return photoAll
Example #32
def combineEvents(oldCatDict, oldLinkDict, newDict):
    '''
    Enrich the new mapping with a previously done mapping.
    newDict has freq and link parameters.
    The old mapping is split into:
    * oldCatDict: list of categories
    * oldLinkDict: a link written ":sv:A link"
    Note that a link can exist in both new and old, but new uses "sv:A_link".
    '''
    for k, v in newDict.iteritems():
        newDict[k][u'cat'] = u''
        newDict[k][u'link'] = newDict[k][u'link'].strip(u'[]')
        if k in oldCatDict.keys():  # assume key list is same in both
            if oldCatDict[k] is not None:
                newDict[k][u'cat'] = oldCatDict[k]
            if oldLinkDict[k] is not None:
                oldlink = oldLinkDict[k]
                newlink = newDict[k][u'link'].replace('_', ' ')
                if oldlink != newlink:
                    # check if the same, otherwise use old
                    if newlink:
                        output(u'Ereignis: replaced %s by %s' %
                               (newlink, oldlink))
                    newlink = oldlink
                newDict[k][u'link'] = newlink  # reformatted and possibly replaced
            del oldCatDict[k]  # no need to delete oldLinkDict if we iterate over cat

    # add any previous mapping
    for k, v in oldCatDict.iteritems():
        cat = v
        link = oldLinkDict[k]
        if (cat is not None) or (link is not None):
            if cat is None:
                cat = u''
            if link is None:
                link = u''
            newDict[k] = {u'freq': 0, u'cat': cat, u'link': link}

    return newDict
Example #33
def main():
    """
    """
    res = {}
    defs = all_defs()
    n_defs = len(defs)
    n_calls = 0
    n_refs = 0
    called_or_refed = set()
    helpers.output('\n'.join(defs), DEFS)
    print("Listing of functions (& methods) defined sent to '{}'".format(DEFS))
    for mod_name in MODULE_NAMES:
        with open("{}.py".format(mod_name), 'r') as module:
            line_n = 0
            for line in module:
                line_n += 1
                for def_ in defs:
                    # searching for function def_
                    mod, func_w_n = def_.split('.')
                    func, n = func_w_n.split('@')
                    refed = []
                    if call_found(func, line, refed):
                        n_calls += 1
                        called_or_refed.add(def_)
                        _ = res.setdefault(def_, {})
                        _ = res[def_].setdefault("calls", [])
                        res[def_]["calls"].append("{} @ {}".format(
                            mod_name, line_n))
                    if refed:
                        n_refs += 1
                        called_or_refed.add(def_)
                        _ = res.setdefault(def_, {})
                        _ = res[def_].setdefault("references", [])
                        res[def_]["references"].append("{} @ {}".format(
                            mod_name, line_n))

    n_no_refs = len(defs) - len(called_or_refed)
    no_refs = set(defs) - called_or_refed
    helpers.output(pformat(res), CALLS)
    print("Listing of function (& method) calls sent to '{}'".format(CALLS))
    helpers.output(pprint.pformat(res, compact=True, width=70), CALLSf)
    print("Formatted listing of function (& method) calls sent to '{}'".format(
        CALLSf))
    helpers.output('\n'.join(sorted(no_refs)), UNUSED)
    print(
        "Listing of unused functions (& methods) sent to '{}'".format(UNUSED))
    print("Found {} defs, {} calls, {} refs & {} without either".format(
        n_defs, n_calls, n_refs, n_no_refs))
Example #34
def trimObjDaten(objDaten, photo_multi, photoAll):
    """
    Removes any unused objects in objDaten, because it is huge!
    :param objDaten: objDaten dict
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :return: None
    """
    output(u"\tTrimming objDaten...")
    originalSize = len(objDaten)

    # collect all objIds not mentioned in photo_multi or photoAll
    unusedObjIds = set(objDaten.keys())
    for k, v in photo_multi.iteritems():
        unusedObjIds = unusedObjIds - set(v['PhoObjId'])
    for k, v in photoAll.iteritems():
        unusedObjIds = unusedObjIds - set(v['PhoObjId'])

    # remove any which should be trimmed
    for objId in unusedObjIds:
        del objDaten[objId]

    output('\tobjDaten reduced from: %d to %d' % (originalSize, len(objDaten)))
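The trimming is a plain set difference between all ObjIds and the ones referenced from either photo dict; a toy version with invented data:

# Toy version of the set-difference trim; all data is invented.
objDaten = {'1': {}, '2': {}, '3': {}}
photo_multi = {'p1': {'PhoObjId': ['1']}}
photoAll = {'p2': {'PhoObjId': ['3']}}

unused = set(objDaten)
for v in list(photo_multi.values()) + list(photoAll.values()):
    unused -= set(v['PhoObjId'])
for objId in unused:
    del objDaten[objId]
print(sorted(objDaten))  # ['1', '3']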
Example #35
def trimObjDaten(objDaten, photo_multi, photoAll):
    """
    Removes any unused objects in objDaten, because it is huge!
    :param objDaten: objDaten dict
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :return: None
    """
    output(u"\tTrimming objDaten...")
    originalSize = len(objDaten)

    # collect all objIds not mentioned in photo_multi or photoAll
    unusedObjIds = set(objDaten.keys())
    for k, v in photo_multi.iteritems():
        unusedObjIds = unusedObjIds - set(v['PhoObjId'])
    for k, v in photoAll.iteritems():
        unusedObjIds = unusedObjIds - set(v['PhoObjId'])

    # remove any which should be trimmed
    for objId in unusedObjIds:
        del objDaten[objId]

    output('\tobjDaten reduced from: %d to %d' % (originalSize, len(objDaten)))
Example #36
 def get(self):
     h.output(self, """
         Admin: 
             <a href='/admin/pageviews'>Page Views</a> | 
             <a href='/admin/users'>Users</a> | 
             <a href='/admin/usergraphs'>User Graphs</a> | 
             <a href='/admin/querys'>Searches</a> | 
             <a href='/admin/resultviews'>Result Views</a> 
         <br/> Calculated Metrics: 
             <a href='/admin/installmetrics'>User Install Metrics</a> | 
             <a href='/admin/installmetrics/summary'>Summary Install Metrics</a> 
         <br/> Calculated Metrics: 
             <a href='/admin/organicsearchmetrics'>Organic Search Metrics</a> | 
             <a href='/admin/organicsearchmetrics/summary'>Summary Organic Search Metrics</a> 
         <br/> Calculated Metrics: 
             <a href='/admin/postinstallactivitymetrics'>Post-Install Activity Metrics</a> | 
             <a href='/admin/postinstallactivitymetrics/summary'>Summary Post-Install Activity Metrics</a> 
         <br/> Calculated Metrics: 
             <a href='/admin/kvaluemetrics'>7-Day K Value Metrics</a> | 
             <a href='/admin/kvaluemetrics/summary'>7-Day K Value Metrics</a>
         <br/> Beta: 
             <a href='/admin/paths'>Navigation Paths</a> | 
             <a href='/admin/url-analyzer'>URL Analyzer</a> |  
             <a href='/admin/pageviews/normalizer'>Page View URL Normalizer</a>""")
Example #37
def make_neg_and_pos_info(info_file, filename, ext):
    """
    Generate a negative and positive version of the given info file.

    The two refer to each other using the negative/positive parameters. The
    negative file gets categories removed.
    :param info_file: the contents of the info file
    :param filename: the (positive) image filename
    :param ext: the file extension
    """
    negative_appendix = NEGATIVE_PATTERN % ext
    ov_position = info_file.find(u'|other_versions=')

    # for negative we need to identify end position of the template
    end_position = -1
    end_patterns = [u'</gallery>\n}}', u'|other_versions= \n}}']
    for end_pattern in end_patterns:
        end_position = info_file.find(end_pattern)
        if end_position > 0:
            end_position += len(end_pattern)
            break
    if not end_position > 0:
        # if all else fails just keep it all
        output('%s: could not find end of template' % filename)
        end_position = len(info_file)

    # make new infos
    pos_info = u'%s|negative= %s\n%s' % (
        info_file[:ov_position],
        u'%s%s' % (filename[:-len(ext)], negative_appendix),
        info_file[ov_position:])
    neg_info = u'%s|positive= %s\n%s' % (
        info_file[:ov_position],
        filename,
        info_file[ov_position:end_position])
    return (neg_info, pos_info)
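Both new texts are built by splicing at the '|other_versions=' marker; a sketch with a stripped-down, invented info text showing where the positive's '|negative=' line ends up:

# Stripped-down, invented info text to show the splice point.
info_file = (u'{{Information\n'
             u'|description= A drawing\n'
             u'|other_versions= \n}}\n'
             u'[[Category:LSH]]')
ov_position = info_file.find(u'|other_versions=')
pos_info = u'%s|negative= Drawing-negative.tif\n%s' % (
    info_file[:ov_position], info_file[ov_position:])
print(pos_info)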
Example #38
def removeEmptyDirectories(path, top=True):
    """
    Remove any empty directories and subdirectories
    :param path: path to the directory to start deleting from
    :param top: set to True to not delete the current directory
    :return: None
    """
    if not os.path.isdir(path):
        return

    # remove empty sub-directory
    files = os.listdir(path)
    for f in files:
        fullpath = os.path.join(path, f)
        if os.path.isdir(fullpath):
            removeEmptyDirectories(fullpath, top=False)

    # re-read and delete the directory if empty
    files = os.listdir(path)
    if not top:
        if not files:
            os.rmdir(path)
        else:
            output('Not removing non-empty directory: %s' % path)
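A usage sketch: prune empty subdirectories under an export directory while keeping the top-level directory itself (the path is hypothetical; os and output() must be in scope):

# Keep './export/HWY' itself but drop any empty directories below it.
removeEmptyDirectories(u'./export/HWY')
# Pass top=False to also remove './export/HWY' if it ends up empty:
# removeEmptyDirectories(u'./export/HWY', top=False)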
Example #39
def stichworth_photo(stichwortFile, photo_multi):
    """
    Given the photo-multi data and the stichwort data file add a stichwort id
    field to photo-multi.
    Also returns the stichwort data after trimming away any unused info
    :param stichwortFile: path to stichwort data file
    :param photo_multi: photo_multi dict
    :return: dict (and updates photo_multi)
    """
    # setup
    output(u"Adding stichworth to photo")

    # handle stichwort
    output(u'\treading in stichwort...')
    stichwortHeader = 'PstId|PhoId|StiBezeichnungS|StiSynonymS'
    stichwort = helpers.csvFileToDict(stichwortFile, 'PstId', stichwortHeader)
    originalSize = len(stichwort)

    # match each phoId to several stichId
    # removing any entries with invalid phoIds
    photoStichConnection = {}
    for k, v in stichwort.items():
        phoId = v['PhoId']
        pstId = v['PstId']
        if phoId in photo_multi.keys():
            if phoId not in photoStichConnection.keys():
                photoStichConnection[phoId] = set([])
            photoStichConnection[phoId].add(pstId)
        else:
            del stichwort[k]
    output('\tstichwort trimmed from %d to %d, found %d phoId' %
           (originalSize, len(stichwort), len(photoStichConnection)))

    # add stichId to photo_multi
    for k, v in photo_multi.iteritems():
        phoId = v['PhoId']
        v['PstId'] = []
        if phoId in photoStichConnection.keys():
            v['PstId'] = list(photoStichConnection.pop(phoId))

    # confirm and return
    output(u"...done")
    return stichwort
Example #40
def stichworth_photo(stichwortFile, photo_multi):
    """
    Given the photo-multi data and the stichwort data file add a stichwort id
    field to photo-multi.
    Also returns the stichwort data after trimming away any unused info
    :param stichwortFile: path to stichwort data file
    :param photo_multi: photo_multi dict
    :return: dict (and updates photo_multi)
    """
    # setup
    output(u"Adding stichworth to photo")

    # handle stichwort
    output(u'\treading in stichwort...')
    stichwortHeader = 'PstId|PhoId|StiBezeichnungS|StiSynonymS'
    stichwort = helpers.csvFileToDict(stichwortFile, 'PstId', stichwortHeader)
    originalSize = len(stichwort)

    # match each phoId to several stichId
    # removing any entries with invalid phoIds
    photoStichConnection = {}
    for k, v in stichwort.items():
        phoId = v['PhoId']
        pstId = v['PstId']
        if phoId in photo_multi.keys():
            if phoId not in photoStichConnection.keys():
                photoStichConnection[phoId] = set([])
            photoStichConnection[phoId].add(pstId)
        else:
            del stichwort[k]
    output('\tstichwort trimmed from %d to %d, found %d phoId' %
           (originalSize, len(stichwort), len(photoStichConnection)))

    # add stichId to photo_multi
    for k, v in photo_multi.iteritems():
        phoId = v['PhoId']
        v['PstId'] = []
        if phoId in photoStichConnection.keys():
            v['PstId'] = list(photoStichConnection.pop(phoId))

    # confirm and return
    output(u"...done")
    return stichwort
Example #41
def photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile, objDatenFile,
                   logFile):
    """
    Given the photo_multi data and the phoObjDaten + objDaten data files
    any additional relevant ObjIds are added to the PhoObjId field of the
    photo_multi dict, this field is also converted to a list.
    Also returns objDaten for later use
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :param photoObjDatenFile: path to phoObjDaten data file
    :param objDatenFile: path to objDaten data file
    :param logFile: path to logfile
    :return: dict (and updates photo_multi)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Combining all ObjId into the photo file...")

    # handle objDaten
    output(u'\treading in objDaten.. (takes a while)')
    objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \
                     'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \
                     'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \
                     'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|' \
                     'ObjFeld06M|ObjReserve01M'
    objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader)

    # match each objInvNr to several objId
    objInvNr2ObjId = {}  # old oDict
    output(u'\tfinding objInvNr connections...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        objInvNr = v['ObjInventarNrS']
        if not objInvNr:
            continue
        if objInvNr not in objInvNr2ObjId.keys():
            objInvNr2ObjId[objInvNr] = []
        objInvNr2ObjId[objInvNr].append(objId)
    output('\tFound %d objInvNr connections in %d objects' %
           (len(objInvNr2ObjId), len(objDaten)))

    # handle photoObjDaten
    photoObjDatenHeader = 'PhmId|AufId|AufAufgabeS|MulId|PhoId|ObjInvNrS'
    photoObjDaten = helpers.csvFileToDict(photoObjDatenFile,
                                          'PhmId',
                                          photoObjDatenHeader,
                                          keep=('PhoId', 'ObjInvNrS'))

    # match each phoId to several objId via the ObjInvNr
    output(u'\tfinding photo-object connections...')
    photoObjConnections = {}
    skipped = []  # ObjInvNr not in ObjDaten
    for k, v in photoObjDaten.iteritems():
        objInvNr = v['ObjInvNrS']
        phoId = v['PhoId']
        if not objInvNr:
            continue
        if objInvNr not in objInvNr2ObjId.keys():
            skipped.append(objInvNr)
            continue
        if phoId not in photoObjConnections.keys():
            photoObjConnections[phoId] = []
        photoObjConnections[phoId] += objInvNr2ObjId[objInvNr]
    output('\tFound %d connected photos in %d photoObjDaten entries' %
           (len(photoObjConnections), len(photoObjDaten)))

    # add to photo_multi and photoAll
    photoDicts = (photo_multi, photoAll)
    allBadObjId = []
    for pDict in photoDicts:
        for k, v in pDict.iteritems():
            phoId = v['PhoId']
            objIds = []
            if phoId not in photoObjConnections.keys():
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])
            else:
                # combine relevant objIds
                objIds = photoObjConnections.pop(phoId)  # new connections
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])  # old connection
                objIds = list(set(objIds))  # remove dupes

            # check that all of these actually exist (old realObjOnly())
            # and remove otherwise
            badObjId = []
            for objId in objIds:
                if objId not in objDaten.keys():
                    badObjId.append(objId)
            if badObjId:
                allBadObjId += badObjId
                for badId in badObjId:
                    objIds.remove(badId)

            # set new value
            v['PhoObjId'] = objIds

    # log any skipped ObjInvNr
    if skipped:
        skipped = list(set(skipped))  # remove dupes
        output(u"\tthere were %d skipped ObjInvNr, see log (%s)" %
               (len(skipped), logFile))
        flog.write(u'*Unknown objInvs, i.e. ObjInvNrS in photoObjDaten '
                   u'without a match in ObjDaten\n')
        flog.write(u'%s\n' % ', '.join(skipped))

    # log any bad objId
    if allBadObjId:
        output('\tI found some bad objIds. Check the %s' % logFile)
        allBadObjId = list(set(allBadObjId))  # remove dupes
        flog.write(u'* objIds in photo but not in objDaten\n')
        flog.write(u'%s\n' % ', '.join(allBadObjId))

    # trim objDaten
    trimObjDaten(objDaten, photo_multi, photoAll)

    # confirm and return
    output(u"...done")
    flog.close()
    return objDaten
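
# Minimal sketch of the two look-ups built above, with invented inline rows:
# first ObjInventarNrS -> [ObjId], then PhoId -> [ObjId] via that inventory
# number; inventory numbers that never occur in objDaten are collected in
# skipped. Illustration only, not the real data.
def sketch_photo_obj_connections():
    objDaten = {'o1': {'ObjId': 'o1', 'ObjInventarNrS': 'INV-1'},
                'o2': {'ObjId': 'o2', 'ObjInventarNrS': 'INV-1'}}
    photoObjDaten = {'pm1': {'PhoId': 'p1', 'ObjInvNrS': 'INV-1'},
                     'pm2': {'PhoId': 'p2', 'ObjInvNrS': 'INV-9'}}
    objInvNr2ObjId = {}
    for row in objDaten.values():
        objInvNr2ObjId.setdefault(row['ObjInventarNrS'], []).append(row['ObjId'])
    photoObjConnections, skipped = {}, []
    for row in photoObjDaten.values():
        if row['ObjInvNrS'] not in objInvNr2ObjId:
            skipped.append(row['ObjInvNrS'])
            continue
        photoObjConnections.setdefault(row['PhoId'], []).extend(
            objInvNr2ObjId[row['ObjInvNrS']])
    # photoObjConnections == {'p1': ['o1', 'o2']}, skipped == ['INV-9']
    return photoObjConnections, skipped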
Ejemplo n.º 42
0
def kuenstler_objDaten(kuenstlerFile, objDaten, logFile):
    """
    Given the kuenstler data file and the objDaten data, add a kuenstler id
    field to objDaten.
    Also returns the kuenstler data after
    * removing certain irrelevant roles and dummy entries
    * combining all objIds for the same kueId
    * standardising years
    * dropping a lot of unneeded fields
    :param kuenstlerFile: path to kuenstler data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Crunching kuenstler...")
    dummyNames = (u'ingen uppgift', )
    badRoles = (u'Leverantör', u'Auktion', u'Förmedlare', u'Givare',
                u'Återförsäljare', u'Konservator')
    badRoleCmts = (u'Förpaktare, kontrollör', u'av kopia')
    droppedFields = ('OkuId', 'ObjAufId', 'AufAufgabeS', 'OkuArtS',
                     'OkuFunktionS', 'OkuValidierungS', 'KudArtS', 'MulId',
                     'PhoId')

    # handle kuenstler
    kuenstlerHeader = 'OkuId|ObjId|ObjAufId|AufAufgabeS|KueId|KueVorNameS|' \
                      'KueNameS|OkuArtS|OkuFunktionS|OkuValidierungS|KudArtS|' \
                      'KudDatierungS|KudJahrVonL|KudJahrBisL|KudOrtS|KudLandS|' \
                      'KueFunktionS|MulId|PhoId'
    kuenstler = helpers.csvFileToDict(kuenstlerFile, ('OkuId', 'MulId'),
                                      kuenstlerHeader)
    originalSize = len(kuenstler)

    # collect all kueId and drop any with invalid title or role
    # also invert to get per objId connections
    # @toDO: Is keeping objId in kuenstler really needed?
    #        Otherwise populate objIdConnection here
    foundKueId = {}
    objIdConnection = {}
    for k, v in kuenstler.items():  # allow removing entries from within loop
        kueId = v['KueId']
        objId = v['ObjId']
        fName = v['KueVorNameS']
        lName = v['KueNameS']
        role = v['OkuArtS']
        roleCmt = v['OkuFunktionS']

        # filter out any undesired entries
        if role in badRoles or \
                roleCmt in badRoleCmts or \
                len(fName) + len(lName) == 0 or \
                lName in dummyNames:
            del kuenstler[k]
            continue

        # store the unique role/roleCmt/kueId combo for this objId
        kueCombo = u'%s:%s:%s' % (role, roleCmt, kueId)
        if objId not in objIdConnection.keys():
            objIdConnection[objId] = set([])
        objIdConnection[objId].add(kueCombo)

        # keep only one entry per unique kueId
        if kueId not in foundKueId.keys():  # keep this entry
            foundKueId[kueId] = k
            kuenstler[k]['ObjId'] = set([objId, ])
        else:  # keep only objId part of this entry
            kuenstler[foundKueId[kueId]]['ObjId'].add(objId)
            del kuenstler[k]
    output('\tkueIds: reduced from %d to %d' % (originalSize, len(kuenstler)))

    # add to objDaten
    output('\tadding kueId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['role:roleCmt:kueId'] = []
        if objId in objIdConnection.keys():
            v['role:roleCmt:kueId'] = list(objIdConnection.pop(objId))

    # further cleanup of kuenstler
    # correcting ort/land entries
    # stripping years from name
    # dropping a bunch of fields
    output('\tfurther cleanup of kuenstler...')
    for k, v in kuenstler.iteritems():
        land = v['KudOrtS']  # misnamed in original database
        ort = v['KudLandS']  # misnamed in original database
        lName = v['KueNameS']
        bYear = v['KudJahrVonL']
        dYear = v['KudJahrBisL']
        objIds = v['ObjId']

        # correct misnaming in original database
        v['KudOrtS'] = ort
        v['KudLandS'] = land

        # convert set to list
        v['ObjId'] = list(objIds)

        # take year info out of the name and store it in the year fields
        lName, bYear, dYear, log = extractKuenstlerYear(lName, bYear, dYear)
        if log:
            flog.write(log)
        v['KueNameS'] = lName
        v['KudJahrVonL'] = bYear
        v['KudJahrBisL'] = dYear

        for field in droppedFields:
            del v[field]

    flog.close()
    output(u"...done")
    return kuenstler
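
# Sketch of the deduplication above on invented rows: entries with a bad role
# or a dummy name are dropped, and only the first entry per KueId is kept
# while its ObjId set keeps collecting the objIds of the discarded ones.
# Illustration only; only the fields needed for the logic are included.
def sketch_dedupe_kuenstler():
    badRoles = (u'Auktion', )
    dummyNames = (u'ingen uppgift', )
    kuenstler = {
        'a': {'KueId': 'k1', 'ObjId': 'o1', 'KueNameS': u'Doe', 'OkuArtS': u''},
        'b': {'KueId': 'k1', 'ObjId': 'o2', 'KueNameS': u'Doe', 'OkuArtS': u''},
        'c': {'KueId': 'k2', 'ObjId': 'o3', 'KueNameS': u'Smith',
              'OkuArtS': u'Auktion'},  # dropped: bad role
    }
    foundKueId = {}
    for key in sorted(kuenstler):  # sorted copy allows deleting inside loop
        row = kuenstler[key]
        if row['OkuArtS'] in badRoles or row['KueNameS'] in dummyNames:
            del kuenstler[key]
        elif row['KueId'] not in foundKueId:
            foundKueId[row['KueId']] = key
            row['ObjId'] = set([row['ObjId']])
        else:
            kuenstler[foundKueId[row['KueId']]]['ObjId'].add(row['ObjId'])
            del kuenstler[key]
    # kuenstler == {'a': {..., 'ObjId': set(['o1', 'o2'])}}
    return kuenstler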
Ejemplo n.º 43
0
def ereignis_objDaten(ereignisFile, objDaten, logFile):
    """
    Given the ereignis data file and the objDaten data, add an ereignis id
    field to objDaten.
    Also returns the ereignis data after
    * combining all objIds for the same ergId
    * dropping EroId
    :param ereignisFile: path to ereignis data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Trimming eregnis and adding eregnis to ObjDaten...")

    # handle eregnis
    ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS'
    ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader)
    originalSize = len(ereignis)

    # collect all ergId and drop any with invalid title
    # @toDO: Is keeping objId in ereignis really needed?
    #        Otherwise populate objIdConnection here
    foundErgId = {}
    for k, v in ereignis.items():  # allow removing entries from within loop
        ergId = v['ErgId']
        objId = v['EroObjId']
        title = v['ErgKurztitelS']
        if not title:  # remove empty
            del ereignis[k]
        elif ergId not in foundErgId.keys():  # keep this entry
            foundErgId[ergId] = k
            ereignis[k]['EroObjId'] = set([objId, ])
            ereignis[k].pop('EroId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            ereignis[foundErgId[ergId]]['EroObjId'].add(objId)
            del ereignis[k]
    output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis)))

    # handle urls in ereignis and convert set to list
    for k, v in ereignis.iteritems():
        objIds = v['EroObjId']
        url = v['ErgArtS']

        # convert set to list
        v['EroObjId'] = list(objIds)

        # handle urls
        if u'%' in url:
            url = helpers.urldecode_utf8(url)
        # convert external links to internal
        if 'wikipedia' in url:
            url = helpers.external_2_internal_link(url)
        elif url:
            flog.write(u'weird url: %s\n' % url)
        v['ErgArtS'] = url

    # invert to get per objId connections
    objIdConnection = {}
    for k, v in ereignis.iteritems():
        ergId = v['ErgId']
        objIds = v['EroObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ergId)

    # add to objDaten
    output('\tadding ergId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ergId'] = []
        if objId in objIdConnection.keys():
            v['ergId'] = objIdConnection.pop(objId)

    flog.close()
    output(u"...done")
    return ereignis
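
# helpers.urldecode_utf8() and helpers.external_2_internal_link() are project
# helpers that are not shown in these examples. As a rough, assumed
# illustration of what they do (percent-decoding, and turning a Wikipedia url
# into an internal [[:lang:Title]] style link), something along these lines
# could be used; the real implementations may differ.
import re
import urllib


def sketch_urldecode_utf8(url):
    # assumed equivalent: percent-decode a unicode url as utf-8 (Python 2)
    return urllib.unquote(url.encode('utf-8')).decode('utf-8')


def sketch_external_2_internal_link(url):
    # assumed equivalent: e.g. https://sv.wikipedia.org/wiki/Stockholm
    # becomes u'[[:sv:Stockholm]]'; anything else is returned unchanged
    match = re.match(r'https?://(\w+)\.wikipedia\.org/wiki/(.+)', url)
    if not match:
        return url
    return u'[[:%s:%s]]' % (match.group(1), match.group(2).replace(u'_', u' '))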
Ejemplo n.º 44
0
def ereignis_objDaten(ereignisFile, objDaten, logFile):
    """
    Given the ereignis data file and the objDaten data, add an ereignis id
    field to objDaten.
    Also returns the ereignis data after
    * combining all objIds for the same ergId
    * dropping EroId
    :param ereignisFile: path to ereignis data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Trimming eregnis and adding eregnis to ObjDaten...")

    # handle eregnis
    ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS'
    ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader)
    originalSize = len(ereignis)

    # collect all ergId and drop any with invalid title
    # @toDO: Is keeping objId in ereignis really needed?
    #        Otherwise populate objIdConnection here
    foundErgId = {}
    for k, v in ereignis.items():  # allow removing entries from within loop
        ergId = v['ErgId']
        objId = v['EroObjId']
        title = v['ErgKurztitelS']
        if not title:  # remove empty
            del ereignis[k]
        elif ergId not in foundErgId.keys():  # keep this entry
            foundErgId[ergId] = k
            ereignis[k]['EroObjId'] = set([
                objId,
            ])
            ereignis[k].pop('EroId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            ereignis[foundErgId[ergId]]['EroObjId'].add(objId)
            del ereignis[k]
    output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis)))

    # handle urls in ereignis and convert set to list
    for k, v in ereignis.iteritems():
        objIds = v['EroObjId']
        url = v['ErgArtS']

        # convert set to list
        v['EroObjId'] = list(objIds)

        # handle urls
        if u'%' in url:
            url = helpers.urldecode_utf8(url)
        # convert external links to internal
        if 'wikipedia' in url:
            url = helpers.external_2_internal_link(url)
        elif url:
            flog.write(u'weird url: %s\n' % url)
        v['ErgArtS'] = url

    # invert to get per objId connections
    objIdConnection = {}
    for k, v in ereignis.iteritems():
        ergId = v['ErgId']
        objIds = v['EroObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ergId)

    # add to objDaten
    output('\tadding ergId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ergId'] = []
        if objId in objIdConnection.keys():
            v['ergId'] = objIdConnection.pop(objId)

    flog.close()
    output(u"...done")
    return ereignis
Ejemplo n.º 45
0
def ausstellung_objDaten(austellungFile, objDaten):
    """
    Given the ausstellung data file and the objDaten data, add an ausstellung
    id field to objDaten.
    Also returns the ausstellung data after
    * adding a std_year field
    * combining all objIds for the same ausId
    * dropping AobId
    :param austellungFile: path to ausstellung data file
    :param objDaten: objDaten dict
    :return: dict (and updates objDaten)
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any year formatting issues mentioned "
                         u"in the analysis log have been corrected and the "
                         u"updated ausstellung file saved...\n"
                         u"...by pressing enter when done")

    # setup
    dummyTitles = (
        u'reparation', u'utställning', u'lån för undersökning',
        u'OBS! Testpost för admin - utställning, export wikimedia commons',
        u'lån till Frankrike 1947', u'test karin 20100520',
        u'test 20100629 (en post skapad för administrativa tester)',
        u'tennföremål 8 st till Strömsholm', u'utlån f justering av urverk')
    output(u"Trimming ausstellung and adding ausstellung to ObjDaten...")

    # handle ausstellung
    austellungHeader = 'AobId|AusId|AusTitelS|AusOrtS|AusJahrS|AusDatumVonD|' \
                       'AusDatumBisD|AobObjId|AufAufgabeS'
    austellung = helpers.csvFileToDict(austellungFile, 'AobId',
                                       austellungHeader)
    originalSize = len(austellung)

    # collect all ausId and drop any with invalid title
    # @toDO: Is keeping objId in austellung really needed?
    #        Otherwise populate objIdConnection here
    foundAusId = {}
    for k, v in austellung.items():  # allow removing entries from within loop
        ausId = v['AusId']
        objId = v['AobObjId']
        title = v['AusTitelS']
        if not title or title in dummyTitles:  # remove empty/dummy
            del austellung[k]
        elif ausId not in foundAusId:  # keep this entry
            foundAusId[ausId] = k
            austellung[k]['AobObjId'] = set([
                objId,
            ])
            austellung[k].pop('AobId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            austellung[foundAusId[ausId]]['AobObjId'].add(objId)
            del austellung[k]
    output('\tausstellung reduced from %d to %d entries' %
           (originalSize, len(austellung)))

    # populate std_year
    output('\tstandardising years...')
    for k, v in austellung.iteritems():
        year = v['AusJahrS']
        yfrom = v['AusDatumVonD'].replace(u' 00:00:00', u'').strip()
        ytil = v['AusDatumBisD'].replace(u' 00:00:00', u'').strip()
        v['std_year'] = stdAustellungYear(year, yfrom, ytil)
        # to match with pre-redux results. Could possibly be dropped instead?
        v['AusDatumVonD'] = yfrom
        v['AusDatumBisD'] = ytil

    # invert to get per objId connections
    # and convert set to list
    objIdConnection = {}
    for k, v in austellung.iteritems():
        ausId = v['AusId']
        objIds = v['AobObjId']
        v['AobObjId'] = list(objIds)
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ausId)

    output('\tadding ausId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ausId'] = []
        if objId in objIdConnection.keys():
            v['ausId'] = objIdConnection.pop(objId)

    output(u"...done")
    return austellung
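
# stdAustellungYear() is used above but not shown in these examples. As a
# rough, assumed illustration of the kind of normalisation it performs, the
# sketch below prefers an explicit AusJahrS value and otherwise builds a year
# or year range from the (already time-stripped) from/until dates; the real
# implementation may differ.
def sketch_std_austellung_year(year, yfrom, ytil):
    if year:
        return year.strip()
    y1 = yfrom[:4] if yfrom else u''
    y2 = ytil[:4] if ytil else u''
    if y1 and y2 and y1 != y2:
        return u'%s-%s' % (y1, y2)
    return y1 or y2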
Ejemplo n.º 46
0
def photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile,
                   objDatenFile, logFile):
    """
    Given the photo_multi data and the photoObjDaten + objDaten data files,
    any additional relevant ObjIds are added to the PhoObjId field of the
    photo_multi and photoAll dicts; this field is also converted to a list.
    Also returns objDaten for later use.
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :param photoObjDatenFile: path to photoObjDaten data file
    :param objDatenFile: path to objDaten data file
    :param logFile: path to logfile
    :return: dict (and updates photo_multi and photoAll)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Combining all ObjId into the photo file...")

    # handle objDaten
    output(u'\treading in objDaten.. (takes a while)')
    objDatenHeader = 'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|' \
                     'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|' \
                     'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|' \
                     'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|' \
                     'ObjFeld06M|ObjReserve01M'
    objDaten = helpers.csvFileToDict(objDatenFile, 'ObjId', objDatenHeader)

    # match each objInvNr to several objId
    objInvNr2ObjId = {}  # old oDict
    output(u'\tfinding objInvNr connections...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        objInvNr = v['ObjInventarNrS']
        if not objInvNr:
            continue
        if objInvNr not in objInvNr2ObjId.keys():
            objInvNr2ObjId[objInvNr] = []
        objInvNr2ObjId[objInvNr].append(objId)
    output('\tFound %d objInvNr connections in %d objects' %
           (len(objInvNr2ObjId), len(objDaten)))

    # handle photoObjDaten
    photoObjDatenHeader = 'PhmId|AufId|AufAufgabeS|MulId|PhoId|ObjInvNrS'
    photoObjDaten = helpers.csvFileToDict(photoObjDatenFile,
                                          'PhmId',
                                          photoObjDatenHeader,
                                          keep=('PhoId', 'ObjInvNrS'))

    # match each phoId to several objId via the ObjInvNr
    output(u'\tfinding photo-object connections...')
    photoObjConnections = {}
    skipped = []  # ObjInvNr not in ObjDaten
    for k, v in photoObjDaten.iteritems():
        objInvNr = v['ObjInvNrS']
        phoId = v['PhoId']
        if not objInvNr:
            continue
        if objInvNr not in objInvNr2ObjId.keys():
            skipped.append(objInvNr)
            continue
        if phoId not in photoObjConnections.keys():
            photoObjConnections[phoId] = []
        photoObjConnections[phoId] += objInvNr2ObjId[objInvNr]
    output('\tFound %d connected photos in %d photoObjDaten entries' %
           (len(photoObjConnections), len(photoObjDaten)))

    # add to photo_multi and photoAll
    photoDicts = (photo_multi, photoAll)
    allBadObjId = []
    for pDict in photoDicts:
        for k, v in pDict.iteritems():
            phoId = v['PhoId']
            objIds = []
            if phoId not in photoObjConnections.keys():
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])
            else:
                # combine relevant objIds
                objIds = photoObjConnections.pop(phoId)  # new connections
                if v['PhoObjId']:
                    objIds.append(v['PhoObjId'])  # old connection
                objIds = list(set(objIds))  # remove dupes

            # check that all of these actually exist (old realObjOnly())
            # and remove otherwise
            badObjId = []
            for objId in objIds:
                if objId not in objDaten.keys():
                    badObjId.append(objId)
            if badObjId:
                allBadObjId += badObjId
                for badId in badObjId:
                    objIds.remove(badId)

            # set new value
            v['PhoObjId'] = objIds

    # log any skipped ObjInvNr
    if skipped:
        skipped = list(set(skipped))  # remove dupes
        output(u"\tthere were %d skipped ObjInvNr, see log (%s)" %
               (len(skipped), logFile))
        flog.write(u'*Unknown objInvs, i.e. ObjInvNrS in photoObjDaten '
                   u'without a match in ObjDaten\n')
        flog.write(u'%s\n' % ', '.join(skipped))

    # log any bad objId
    if allBadObjId:
        output('\tI found some bad objIds. Check the %s' % logFile)
        allBadObjId = list(set(allBadObjId))  # remove dupes
        flog.write(u'* objIds in photo but not in objDaten\n')
        flog.write(u'%s\n' % ', '.join(allBadObjId))

    # trim objDaten
    trimObjDaten(objDaten, photo_multi, photoAll)

    # confirm and return
    output(u"...done")
    flog.close()
    return objDaten
Ejemplo n.º 47
0
def makePhoto_multi(photoFile, multiFile, logFile, tmpFile):
    """
    Given the photo and multimedia data this combines the two into one dict
    :param photoFile: path to photo data file
    :param multiFile: path to multimedia data file
    :param logFile: path to logfile
    :param tmpFile: path to temporary file
    :return: dict
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Combining photo and multimedia file for unique files...")
    pathToTrim = u'R:\\web\\hires\\'
    tmpHeader = 'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|' \
                'PhoSwdS|AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|' \
                'MulDateiS|MulExtentS'

    # handle multimedia
    multiHeader = 'MulId|MulPhoId|MulPfadS|MulDateiS|MulExtentS'
    multi = helpers.csvFileToDict(multiFile, 'MulId', multiHeader)

    # check that filename is unique
    flog.write('* Same files used by different PhoId, format is PhoId/MulId\n')
    logged = False
    namelist = []
    mulPhoIdList = []
    for k, v in multi.iteritems():
        name = u'%s\\%s.%s' % (v['MulPfadS'], v['MulDateiS'], v['MulExtentS'])
        if name in namelist:
            logged = True
            flog.write('%s/%s\n' % (v['MulPhoId'], v['MulId']))
        else:
            mulPhoIdList.append(v['MulPhoId'])
            namelist.append(name)
    output(u'\tmultimedia: %d' % len(multi))
    if not logged:
        flog.write(u'None =)\n')

    # handle photo
    # @toDO add duplicate check to cleanup script
    photoHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                  'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photo = helpers.csvFileToDict(photoFile, 'PhoId', photoHeader)
    output(u'\tphoto: %d' % len(photo))

    # combine
    combined = {}
    flog.write(u'* unused rows in multimedia\n')
    logged = False
    for k, v in multi.iteritems():
        phoId = v['MulPhoId']
        mulId = v['MulId']
        v['MulPfadS'] = v['MulPfadS'].replace(pathToTrim, u'')  # trim filepath
        v['MulExtentS'] = u''  # MulExtentS is always wrong
        if phoId not in photo.keys():
            logged = True
            flog.write(u'%s\n' % v)
        elif not photo[phoId]['MulId'] == v['MulId']:
            raise MyError("phoId matched but to wrong mulId: p:%s m_found:%s, "
                          "m_expected %s" % (phoId, photo[phoId]['MulId'],
                                             mulId))
        else:
            del v['MulPhoId'], v['MulId']
            combo = photo.pop(phoId)  # move out of photo
            combo.update(v)  # add contents from multi
            combined[phoId] = combo
    if not logged:
        flog.write(u'None =)\n')

    # log any unused rows in photo
    flog.write(u'* unused rows in photo\n')
    logged = False
    for k, v in photo.iteritems():
        logged = True
        flog.write(u'%s\n' % v)
    if not logged:
        flog.write(u'None =)\n')
    flog.close()
    output(u"...done")

    # check if anything needs to be manually handled
    output(u"Read the log (%s)" % logFile)
    combined = helpers.promptManualUpdate(combined, tmpFile,
                                          tmpHeader, 'PhoId')

    return combined
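
# Self-contained sketch of the merge step above, with invented rows: each
# multimedia row pulls its matching photo row out of 'photo' and is merged
# into it, so whatever remains in 'photo' afterwards is an unused row to be
# logged. Illustration only.
def sketch_combine_photo_multi():
    photo = {'p1': {'PhoId': 'p1', 'MulId': 'm1', 'PhoBeschreibungM': u'desc'},
             'p2': {'PhoId': 'p2', 'MulId': 'm2', 'PhoBeschreibungM': u'orphan'}}
    multi = {'m1': {'MulId': 'm1', 'MulPhoId': 'p1',
                    'MulPfadS': u'dir', 'MulDateiS': u'file.tif'}}
    combined = {}
    for row in multi.values():
        phoId = row.pop('MulPhoId')
        row.pop('MulId')
        combo = photo.pop(phoId)  # move out of photo
        combo.update(row)         # add the multimedia columns
        combined[phoId] = combo
    # combined holds the merged 'p1' row; photo still holds the unused 'p2'
    return combined, photo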
Ejemplo n.º 48
0
def run(in_path=None, out_path=None):
    """
    main process for crunching all of the files
    :param in_path: path to directory containing clean csv files
    :param out_path: path to directory in which to store output
    """
    # set defaults unless overridden
    in_path = in_path or CSV_DIR_CLEAN
    out_path = out_path or CSV_DIR_CRUNCH

    # convert to unicode if not the case
    if type(in_path) == str:
        in_path = unicode(in_path)
    if type(out_path) == str:
        out_path = unicode(out_path)

    # create target paths if they don't exist
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    log_path = os.path.join(out_path, u'logs')
    if not os.path.isdir(log_path):
        os.mkdir(log_path)

    # start crunching
    # combine photo and multi
    photoFile = os.path.join(in_path, u'photo.csv')
    multiFile = os.path.join(in_path, u'multimedia.csv')
    logFile = os.path.join(log_path, u'photo_multimedia.log')
    tmpFile = os.path.join(out_path, u'tmp.csv')
    photo_multi = makePhoto_multi(photoFile, multiFile, logFile, tmpFile)

    # load photoAll and drop any entries without a commons connection
    photoAllFile = os.path.join(in_path, u'photoAll.csv')
    logFile = os.path.join(log_path, u'photoAll.log')
    photoAll = makePhotoAll(photoAllFile, photo_multi, logFile)

    # combine photo and Photo-ObjDaten (and photoAll)
    # populates the PhoObjId field in photo_multi with ALL of the RELEVANT
    # ObjIds and removes unused objects from objDaten to make it smaller
    photoObjDatenFile = os.path.join(in_path, u'photoObjDaten.csv')
    objDatenFile = os.path.join(in_path, u'objDaten.csv')
    logFile = os.path.join(log_path, u'photo_objDaten.log')
    objDaten = photo_ObjDaten(photo_multi, photoAll, photoObjDatenFile,
                              objDatenFile, logFile)

    # Adds the stichwort id field to photo and
    # removes unused entries from stichwort to make it smaller
    stichFile = os.path.join(in_path, u'stichwort.csv')
    stichwort = stichworth_photo(stichFile, photo_multi)

    # Add two fields to photo_multi:
    # * same photoId-different file
    # * same objId-different photoID
    samesame(photo_multi, photoAll)

    # Adds the Ausstellung_id field to ObjDaten and
    # trims Ausstellung to unique ids
    ausstellungFile = os.path.join(in_path, u'ausstellung.csv')
    ausstellung = ausstellung_objDaten(ausstellungFile, objDaten)

    # Adds ObjDaten-samhörande field to ObjDaten
    objDatenSamFile = os.path.join(in_path, u'objDatenSam.csv')
    objDaten_sam(objDatenSamFile, objDaten)

    # Adds the Ereignis field to ObjDaten and
    # trims Ereignis to unique ids
    ereignisFile = os.path.join(in_path, u'ereignis.csv')
    logFile = os.path.join(log_path, u'ereignis.log')
    ereignis = ereignis_objDaten(ereignisFile, objDaten, logFile)

    # Adds the kuenstler field to ObjDaten and
    # trims kuenstler
    kuenstlerFile = os.path.join(in_path, u'kuenstler.csv')
    logFile = os.path.join(log_path, u'kuenstler.log')
    kuenstler = kuenstler_objDaten(kuenstlerFile, objDaten, logFile)

    # Adds objMultiple and objMass fields to ObjDaten
    # then trims objMultiple and objMass
    objMassFile = os.path.join(in_path, u'objMass.csv')
    objMultipleFile = os.path.join(in_path, u'objMultiple.csv')
    objMass, objMultiple = mulMass_add(objMassFile, objMultipleFile, objDaten)

    # output all the above
    # @toDO: simplify names (once downstream is checked)
    out_csv = {
        u'photo_multimedia_etc': photo_multi,
        u'stichwort_trim': stichwort,
        u'objMass_trim': objMass,
        u'objMultiple_trim': objMultiple,
        u'objDaten_etc': objDaten,
        u'ausstellung_trim': ausstellung,
        u'ereignis_trim': ereignis,
        u'kuenstler_trim': kuenstler,
        u'photoAll': photoAll
    }
    # @toDO: Not needed once downstream reads json
    out_headers = {
        u'photo_multimedia_etc':
            'PhoId|MulId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|'
            'AdrVorNameS|AdrNameS|PhoSystematikS|MulPfadS|MulDateiS|'
            'MulExtentS|PstId|same_PhoId|same_object',
        u'stichwort_trim':
            'PstId|PhoId|StiBezeichnungS|StiSynonymS',
        u'objMass_trim':
            'ObmId|ObmObjId|ObmTypMasseS|ObmMasseS|ObjAufId|AufAufgabeS',
        u'objMultiple_trim':
            'OmuId|OmuObjId|OmuTypS|OmuBemerkungM|OmuInhalt01M|ObjInventarNrS|'
            'ObjAufId|AufAufgabeS',
        u'objDaten_etc':
            'ObjId|ObjKueId|AufId|AufAufgabeS|ObjTitelOriginalS|'
            'ObjTitelWeitereM|ObjInventarNrS|ObjInventarNrSortiertS|'
            'ObjReferenzNrS|ObjDatierungS|ObjJahrVonL|ObjJahrBisL|'
            'ObjSystematikS|ObjFeld01M|ObjFeld02M|ObjFeld03M|ObjFeld06M|'
            'ObjReserve01M|ausId|related|ergId|role:roleCmt:kueId|mulId|'
            'massId',
        u'ausstellung_trim':
            'AusId|AusTitelS|AusOrtS|std_year|AusJahrS|AusDatumVonD|'
            'AusDatumBisD|AufAufgabeS|AobObjId',
        u'ereignis_trim':
            'ErgId|ErgKurztitelS|ErgArtS|EroObjId',
        u'kuenstler_trim':
            'KueId|KueVorNameS|KueNameS|KudDatierungS|KudJahrVonL|KudJahrBisL|'
            'KudOrtS|KudLandS|KueFunktionS|ObjId',
        u'photoAll':
            'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|'
            'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    }
    for k, v in out_csv.iteritems():
        outFile = os.path.join(out_path, u'%s.csv' % k)
        helpers.dictToCsvFile(outFile, v, out_headers[k])
        output(u'\tOutputted %s' % outFile)
    output(u'Done!')
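
# Hedged usage sketch: run() above is normally called with the module
# defaults (CSV_DIR_CLEAN / CSV_DIR_CRUNCH), but explicit unicode paths can
# be passed; the directory names below are placeholders, not the project's
# real layout.
if __name__ == '__main__':
    # run()  # use the default directories
    run(in_path=u'clean_csv', out_path=u'crunched_csv')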
Ejemplo n.º 49
0
def samesame(photo_multi, photoAll):
    """
    @toDo (after redux)
        * samePhoId no longer needed (but need to make sure it is not
          expected later since removing changes order)
        * base on photo_all + photo_multi to get connections to old
    Adds two fields to the photo_multi dict
    * same_PhoId: same phoId different file
    * same_object: same objId different phoId
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :return: None (but updates photo_multi)
    """
    output(u"Samesame()")
    # load all objId connections from photo_multi
    objIdConnection = {}
    output(u'\tloading objects from photo_multi...')
    for k, v in photo_multi.iteritems():
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        objIds = v['PhoObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append((phoId, phoMullId))

    # load only objId connections from photoAll where object in photo_multi
    output(u'\tloading objects from photoAll...')
    for k, v in photoAll.iteritems():
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        objIds = v['PhoObjId']
        for objId in objIds:
            if objId in objIdConnection.keys():
                objIdConnection[objId].append((phoId, phoMullId))

    # remove any with only one associated phoId
    for k, v in objIdConnection.items():
        if len(v) < 2:
            del objIdConnection[k]
    output(u'\tfound %d objects in multiple photos' % len(objIdConnection))

    # invert objIdConnection to get per phoId connection
    output('\tinverting objIdConnection...')
    phoIdConnection = {}
    for objId, v in objIdConnection.iteritems():
        allPhoMull = [entry[1] for entry in v]
        for phoId, phoMullId in v:
            if phoId not in phoIdConnection.keys():
                phoIdConnection[phoId] = []
            phoIdConnection[phoId] += allPhoMull

    output('\tadding samesame to photo...')
    for k, v in photo_multi.iteritems():
        v['same_PhoId'] = ''  # @toDo remove once safe
        v['same_object'] = []
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        if phoId in phoIdConnection.keys():
            ll = list(set(phoIdConnection[phoId]))  # clone and remove dupes
            ll.remove(phoMullId)  # remove self
            v['same_object'] = ll

    output(u"...done")
Ejemplo n.º 50
0
def samesame(photo_multi, photoAll):
    """
    @toDo (after redux)
        * samePhoId no longer needed (but need to make sure it is not
          expected later since removing changes order)
        * base on photo_all + photo_multi to get connections to old
    Adds two fields to the photo_multi dict
    * same_PhoId: same phoId different file
    * same_object: same objId different phoId
    :param photo_multi: photo_multi dict
    :param photoAll: photoAll dict
    :return: None (but updates photo_multi)
    """
    output(u"Samesame()")
    # load all objId connections from photo_multi
    objIdConnection = {}
    output(u'\tloading objects from photo_multi...')
    for k, v in photo_multi.iteritems():
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        objIds = v['PhoObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append((phoId, phoMullId))

    # load only objId connections from photoAll where object in photo_multi
    output(u'\tloading objects from photoAll...')
    for k, v in photoAll.iteritems():
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        objIds = v['PhoObjId']
        for objId in objIds:
            if objId in objIdConnection.keys():
                objIdConnection[objId].append((phoId, phoMullId))

    # remove any with only one associated phoId
    for k, v in objIdConnection.items():
        if len(v) < 2:
            del objIdConnection[k]
    output(u'\tfound %d objects in multiple photos' % len(objIdConnection))

    # invert objIdConnection to get per phoId connection
    output('\tinverting objIdConnection...')
    phoIdConnection = {}
    for objId, v in objIdConnection.iteritems():
        allPhoMull = [entry[1] for entry in v]
        for phoId, phoMullId in v:
            if phoId not in phoIdConnection.keys():
                phoIdConnection[phoId] = []
            phoIdConnection[phoId] += allPhoMull

    output('\tadding samesame to photo...')
    for k, v in photo_multi.iteritems():
        v['same_PhoId'] = ''  # @toDo remove once safe
        v['same_object'] = []
        phoId = v['PhoId']
        mullId = v['MulId']
        phoMullId = '%s:%s' % (phoId, mullId)
        if phoId in phoIdConnection.keys():
            ll = list(set(phoIdConnection[phoId]))  # clone and remove dupes
            ll.remove(phoMullId)  # remove self
            v['same_object'] = ll

    output(u"...done")
Ejemplo n.º 51
0
def objDaten_sam(objDatenSamFile, objDaten):
    """
    Adds an ObjDaten-samhörande based 'related' field to objDaten by
    * mapping each objId to all objIds it is connected to
    * removing duplicates, self references and ids not present in objDaten
    :param objDatenSamFile: path to ObjDaten-samhörande data file
    :param objDaten: objDaten dict
    :return: None (but updates objDaten)
    """
    # setup
    output(u"Adding ObjDaten-samhörande to ObjDaten")

    # handle objDatenSam
    output('\treading ObjDaten_-_samhörande_nr into dictionary... (slow)')
    objDatenSamHeader = 'OobId|OobObj1ID|OobObj2ID'
    objDatenSam = helpers.csvFileToDict(objDatenSamFile, 'OobId',
                                        objDatenSamHeader)

    # map object connections
    output('\tmapping object connections...')
    objIdConnection = {}
    for k, v in objDatenSam.iteritems():
        objId1 = v['OobObj1ID']
        objId2 = v['OobObj2ID']
        if objId1 not in objIdConnection.keys():
            objIdConnection[objId1] = []
        if objId2 not in objIdConnection.keys():
            objIdConnection[objId2] = []
        objIdConnection[objId1].append(objId2)
        objIdConnection[objId2].append(objId1)
    output('\tfound %d connected objIds in %d entries' %
           (len(objIdConnection), len(objDatenSam)))

    # clean up connections
    output('\tremoving dupes, invalids and self...')
    for objId, connectedIds in objIdConnection.items():
        connectedIds = list(set(connectedIds))  # remove dupe
        if objId in connectedIds:
            connectedIds.remove(objId)  # remove self
        for conId in connectedIds[:]:  # slice allows changes from within loop
            if conId not in objDaten.keys():
                connectedIds.remove(conId)  # remove invalid

        # delete or update
        if not connectedIds:
            del objIdConnection[objId]
        else:
            objIdConnection[objId] = connectedIds

    # add to objDaten
    output('\tadding connections to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['related'] = []
        if objId in objIdConnection.keys():
            v['related'] = objIdConnection.pop(objId)

    output(u"...done")
Ejemplo n.º 52
0
def ausstellung_objDaten(austellungFile, objDaten):
    """
    Given the ausstellung data file and the objDaten data, add an ausstellung
    id field to objDaten.
    Also returns the ausstellung data after
    * adding a std_year field
    * combining all objIds for the same ausId
    * dropping AobId
    :param austellungFile: path to ausstellung data file
    :param objDaten: objDaten dict
    :return: dict (and updates objDaten)
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any year formatting issues mentioned "
                         u"in the analysis log have been corrected and the "
                         u"updated ausstellung file saved...\n"
                         u"...by pressing enter when done")

    # setup
    dummyTitles = (
        u'reparation', u'utställning', u'lån för undersökning',
        u'OBS! Testpost för admin - utställning, export wikimedia commons',
        u'lån till Frankrike 1947', u'test karin 20100520',
        u'test 20100629 (en post skapad för administrativa tester)',
        u'tennföremål 8 st till Strömsholm', u'utlån f justering av urverk')
    output(u"Trimming ausstellung and adding ausstellung to ObjDaten...")

    # handle ausstellung
    austellungHeader = 'AobId|AusId|AusTitelS|AusOrtS|AusJahrS|AusDatumVonD|' \
                       'AusDatumBisD|AobObjId|AufAufgabeS'
    austellung = helpers.csvFileToDict(austellungFile, 'AobId',
                                       austellungHeader)
    originalSize = len(austellung)

    # collect all ausId and drop any with invalid title
    # @toDO: Is keeping objId in austellung really needed?
    #        Otherwise populate objIdConnection here
    foundAusId = {}
    for k, v in austellung.items():  # allow removing entries from within loop
        ausId = v['AusId']
        objId = v['AobObjId']
        title = v['AusTitelS']
        if not title or title in dummyTitles:  # remove empty/dummy
            del austellung[k]
        elif ausId not in foundAusId:  # keep this entry
            foundAusId[ausId] = k
            austellung[k]['AobObjId'] = set([objId, ])
            austellung[k].pop('AobId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            austellung[foundAusId[ausId]]['AobObjId'].add(objId)
            del austellung[k]
    output('\tausstellung reduced from %d to %d entries' %
           (originalSize, len(austellung)))

    # populate std_year
    output('\tstandardising years...')
    for k, v in austellung.iteritems():
        year = v['AusJahrS']
        yfrom = v['AusDatumVonD'].replace(u' 00:00:00', u'').strip()
        ytil = v['AusDatumBisD'].replace(u' 00:00:00', u'').strip()
        v['std_year'] = stdAustellungYear(year, yfrom, ytil)
        # to match with pre-redux results. Could possibly be dropped instead?
        v['AusDatumVonD'] = yfrom
        v['AusDatumBisD'] = ytil

    # invert to get per objId connections
    # and convert set to list
    objIdConnection = {}
    for k, v in austellung.iteritems():
        ausId = v['AusId']
        objIds = v['AobObjId']
        v['AobObjId'] = list(objIds)
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ausId)

    output('\tadding ausId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ausId'] = []
        if objId in objIdConnection.keys():
            v['ausId'] = objIdConnection.pop(objId)

    output(u"...done")
    return austellung
Ejemplo n.º 53
0
def kuenstler_objDaten(kuenstlerFile, objDaten, logFile):
    """
    Given the kuenstler data file and the objDaten data, add a kuenstler id
    field to objDaten.
    Also returns the kuenstler data after
    * removing certain irrelevant roles and dummy entries
    * combining all objIds for the same kueId
    * standardising years
    * dropping a lot of unneeded fields
    :param kuenstlerFile: path to kuenstler data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Crunching kuenstler...")
    dummyNames = (u'ingen uppgift', )
    badRoles = (u'Leverantör', u'Auktion', u'Förmedlare', u'Givare',
                u'Återförsäljare', u'Konservator')
    badRoleCmts = (u'Förpaktare, kontrollör', u'av kopia')
    droppedFields = ('OkuId', 'ObjAufId', 'AufAufgabeS', 'OkuArtS',
                     'OkuFunktionS', 'OkuValidierungS', 'KudArtS', 'MulId',
                     'PhoId')

    # handle kuenstler
    kuenstlerHeader = 'OkuId|ObjId|ObjAufId|AufAufgabeS|KueId|KueVorNameS|' \
                      'KueNameS|OkuArtS|OkuFunktionS|OkuValidierungS|KudArtS|' \
                      'KudDatierungS|KudJahrVonL|KudJahrBisL|KudOrtS|KudLandS|' \
                      'KueFunktionS|MulId|PhoId'
    kuenstler = helpers.csvFileToDict(kuenstlerFile, ('OkuId', 'MulId'),
                                      kuenstlerHeader)
    originalSize = len(kuenstler)

    # collect all kueId and drop any with invalid title or role
    # also invert to get per objId connections
    # @toDO: Is keeping objId in kuenstler really needed?
    #        Otherwise populate objIdConnection here
    foundKueId = {}
    objIdConnection = {}
    for k, v in kuenstler.items():  # allow removing entries from within loop
        kueId = v['KueId']
        objId = v['ObjId']
        fName = v['KueVorNameS']
        lName = v['KueNameS']
        role = v['OkuArtS']
        roleCmt = v['OkuFunktionS']

        # filter out any undesired entries
        if role in badRoles or \
                roleCmt in badRoleCmts or \
                len(fName) + len(lName) == 0 or \
                lName in dummyNames:
            del kuenstler[k]
            continue

        # store the unique role/roleCmt/kueId combo for this objId
        kueCombo = u'%s:%s:%s' % (role, roleCmt, kueId)
        if objId not in objIdConnection.keys():
            objIdConnection[objId] = set([])
        objIdConnection[objId].add(kueCombo)

        # keep only one entry per unique kueId
        if kueId not in foundKueId.keys():  # keep this entry
            foundKueId[kueId] = k
            kuenstler[k]['ObjId'] = set([
                objId,
            ])
        else:  # keep only objId part of this entry
            kuenstler[foundKueId[kueId]]['ObjId'].add(objId)
            del kuenstler[k]
    output('\tkueIds: reduced from %d to %d' % (originalSize, len(kuenstler)))

    # add to objDaten
    output('\tadding kueId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['role:roleCmt:kueId'] = []
        if objId in objIdConnection.keys():
            v['role:roleCmt:kueId'] = list(objIdConnection.pop(objId))

    # further cleanup of kuenstler
    # correcting ort/land entries
    # stripping years from name
    # dropping a bunch of fields
    output('\tfurther cleanup of kuenstler...')
    for k, v in kuenstler.iteritems():
        land = v['KudOrtS']  # misnamed in original database
        ort = v['KudLandS']  # misnamed in original database
        lName = v['KueNameS']
        bYear = v['KudJahrVonL']
        dYear = v['KudJahrBisL']
        objIds = v['ObjId']

        # correct misnaming in original database
        v['KudOrtS'] = ort
        v['KudLandS'] = land

        # convert set to list
        v['ObjId'] = list(objIds)

        # take year info out of the name and store it in the year fields
        lName, bYear, dYear, log = extractKuenstlerYear(lName, bYear, dYear)
        if log:
            flog.write(log)
        v['KueNameS'] = lName
        v['KudJahrVonL'] = bYear
        v['KudJahrBisL'] = dYear

        for field in droppedFields:
            del v[field]

    flog.close()
    output(u"...done")
    return kuenstler
Ejemplo n.º 54
0
def objDaten_sam(objDatenSamFile, objDaten):
    """
    Adds an ObjDaten-samhörande based 'related' field to objDaten by
    * mapping each objId to all objIds it is connected to
    * removing duplicates, self references and ids not present in objDaten
    :param objDatenSamFile: path to ObjDaten-samhörande data file
    :param objDaten: objDaten dict
    :return: None (but updates objDaten)
    """
    # setup
    output(u"Adding ObjDaten-samhörande to ObjDaten")

    # handle objDatenSam
    output('\treading ObjDaten_-_samhörande_nr into dictionary... (slow)')
    objDatenSamHeader = 'OobId|OobObj1ID|OobObj2ID'
    objDatenSam = helpers.csvFileToDict(objDatenSamFile, 'OobId',
                                        objDatenSamHeader)

    # map object connections
    output('\tmapping object connections...')
    objIdConnection = {}
    for k, v in objDatenSam.iteritems():
        objId1 = v['OobObj1ID']
        objId2 = v['OobObj2ID']
        if objId1 not in objIdConnection.keys():
            objIdConnection[objId1] = []
        if objId2 not in objIdConnection.keys():
            objIdConnection[objId2] = []
        objIdConnection[objId1].append(objId2)
        objIdConnection[objId2].append(objId1)
    output('\tfound %d connected objIds in %d entries' %
           (len(objIdConnection), len(objDatenSam)))

    # clean up connections
    output('\tremoving dupes, invalids and self...')
    for objId, connectedIds in objIdConnection.items():
        connectedIds = list(set(connectedIds))  # remove dupe
        if objId in connectedIds:
            connectedIds.remove(objId)  # remove self
        for conId in connectedIds[:]:  # slice allows changes from within loop
            if conId not in objDaten.keys():
                connectedIds.remove(conId)  # remove invalid

        # delete or update
        if not connectedIds:
            del objIdConnection[objId]
        else:
            objIdConnection[objId] = connectedIds

    # add to objDaten
    output('\tadding connections to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['related'] = []
        if objId in objIdConnection.keys():
            v['related'] = objIdConnection.pop(objId)

    output(u"...done")