Example 1
def makePhotoAll(photoAllFile, photo_multi, logFile):
    """
    @toDO: if dupes are found, prompt manual cleanup and then re-run
           makePhotoAll() so that the crash isn't total.
    Read the photoAll data file and drop any entries without a Commons
    connection. Also simplify the remaining data.
    :param photoAllFile: path to photoAll data file
    :param photo_multi: photo_multi dict
    :param logFile: path to logfile
    :return: dict
    """
    # often requires manual fixing prior to crunch
    helpers.verboseInput(u"Confirm that any issues mentioned in the photoAll "
                         u"analysis log have been corrected and the updated "
                         u"photoAll file saved...\n"
                         u"...by pressing enter when done")

    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Loading photoAll...")
    photoAllHeader = 'PhoId|PhoObjId|PhoBeschreibungM|PhoAufnahmeortS|PhoSwdS|' \
                     'MulId|AdrVorNameS|AdrNameS|PhoSystematikS'
    photoAll = helpers.csvFileToDict(photoAllFile, ('PhoId', 'MulId'),
                                     photoAllHeader)
    originalSize = len(photoAll)

    for k, v in photoAll.items():  # allow removing entries from within loop
        link = v['PhoSystematikS']

        # drop any entries without files
        if not link:
            del photoAll[k]
            continue

        # simplify link
        if '%' in link:
            link = helpers.urldecode_utf8(link)
        link = helpers.external_2_internal_link(link, project='wikimedia')
        link = link[len('[[:commons:File:'):-len(']]')]
        v['PhoSystematikS'] = link
    output('PhotoAll reduced from %d to %d entries' %
           (originalSize, len(photoAll)))

    # check that no PhoId from photo_multi also occurs in photoAll
    dupes = []
    for phoId in photo_multi.keys():
        phoMul = u'%s:%s' % (phoId, photo_multi[phoId]['MulId'])
        if phoMul in photoAll.keys():
            dupes.append(phoMul)
    if dupes:
        output(u'Found duplicates between photoAll and photo_multi. '
               u'This will most likely mess things up. Check the log at '
               u'%s for details.' % logFile)
        flog.write(u'* duplicates found in photo and photo_all\n'
                   u'phoId:MulId|commonsFile\n')
        for d in dupes:
            flog.write('%s|%s\n' % (d, photoAll[d]['PhoSystematikS']))

    flog.close()
    return photoAll
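The slicing inside the loop strips the wiki-link markup so that only the bare Commons filename is stored back into v['PhoSystematikS']. A minimal sketch of that step, using a made-up link value:

# Sketch only: illustrates the prefix/suffix slicing used in makePhotoAll().
link = u'[[:commons:File:Some photo.jpg]]'  # hypothetical simplified link
filename = link[len('[[:commons:File:'):-len(']]')]
assert filename == u'Some photo.jpg'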
Example 2
def analysePhotoAll(f, file_in):
    """
    Check that all PhoSystematikS values are Commons files and that each is unique.
    """
    header, lines = helpers.open_csv_file(file_in)
    badUrls = []
    dupes = []
    sources = {}

    for line in lines:
        if not line:
            continue
        col = line.split('|')
        source = col[8].strip()  # PhoSystematikS
        phoId = col[0]  # PhoId
        mulId = col[5]  # MulId
        phoMul = u'%s:%s' % (phoId, mulId)
        if source:
            if '%' in source:
                source = helpers.urldecode_utf8(source)
            internal = helpers.external_2_internal_link(source,
                                                        project='wikimedia')
            if not internal.startswith('[[:commons:File:'):
                badUrls.append((phoMul, source))
            else:
                internal = internal[len('[[:commons:File:'):-len(']]')]
                if internal in sources.keys():
                    dupes.append(
                        (phoMul, sources[internal],
                         internal.replace(' ', '_')))
                sources[internal] = phoMul

    f.write(u'\n\n<!--From: %s -->\n' % file_in)
    if badUrls:
        f.write(u'===BadUrls===\n')
        for b in badUrls:
            f.write(u'%s: %s\n' % b)
    if dupes:
        f.write(u'===DuplicateUrls===\n')
        f.write(u'phoId:mulId|phoId:mulId|Filename\n')
        for b in dupes:
            f.write(u'%s|%s|%s\n' % b)
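analysePhotoAll() writes to an already open, unicode-aware file handle rather than opening its own log. A hypothetical driver, assuming helpers.open_csv_file() accepts a plain path; the file names shown here are made up:

# Sketch only: file names are hypothetical.
import codecs

log = codecs.open(u'photoAll_analysis.log', 'w', 'utf-8')
analysePhotoAll(log, u'photoAll.csv')
log.close()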
Example 3
def ereignis_objDaten(ereignisFile, objDaten, logFile):
    """
    Given the ereignis data file and the objDaten data, add an ereignis id
    field to objDaten.
    Also returns the ereignis data after
    * combining all objIds for the same ergId
    * dropping EroId
    :param ereignisFile: path to ereignis data file
    :param objDaten: objDaten dict
    :param logFile: path to logfile
    :return: dict (and updates objDaten)
    """
    # setup
    flog = codecs.open(logFile, 'w', 'utf-8')  # logfile
    output(u"Trimming eregnis and adding eregnis to ObjDaten...")

    # handle ereignis
    ereignisHeader = 'EroId|ErgId|EroObjId|ErgKurztitelS|ErgArtS'
    ereignis = helpers.csvFileToDict(ereignisFile, 'EroId', ereignisHeader)
    originalSize = len(ereignis)

    # collect all ergId and drop any with invalid title
    # @toDO: Is keeping objId in ereignis really needed?
    #        Otherwise populate objIdConnection here
    foundErgId = {}
    for k, v in ereignis.items():  # allow removing entries from within loop
        ergId = v['ErgId']
        objId = v['EroObjId']
        title = v['ErgKurztitelS']
        if not title:  # remove empty
            del ereignis[k]
        elif ergId not in foundErgId.keys():  # keep this entry
            foundErgId[ergId] = k
            ereignis[k]['EroObjId'] = set([objId])
            ereignis[k].pop('EroId')  # drop unnecessary id
        else:  # keep only objId part of this entry
            ereignis[foundErgId[ergId]]['EroObjId'].add(objId)
            del ereignis[k]
    output('\tergIds: reduced from %d to %d' % (originalSize, len(ereignis)))

    # handle urls in ereignis and convert set to list
    for k, v in ereignis.iteritems():
        objIds = v['EroObjId']
        url = v['ErgArtS']

        # convert set to list
        v['EroObjId'] = list(objIds)

        # handle urls
        if u'%' in url:
            url = helpers.urldecode_utf8(url)
        # convert external links to internal
        if 'wikipedia' in url:
            url = helpers.external_2_internal_link(url)
        elif url:
            flog.write(u'weird url: %s\n' % url)
        v['ErgArtS'] = url

    # invert to get per objId connections
    objIdConnection = {}
    for k, v in ereignis.iteritems():
        ergId = v['ErgId']
        objIds = v['EroObjId']
        for objId in objIds:
            if objId not in objIdConnection.keys():
                objIdConnection[objId] = []
            objIdConnection[objId].append(ergId)

    # add to objDaten
    output('\tadding ergId to objDaten...')
    for k, v in objDaten.iteritems():
        objId = v['ObjId']
        v['ergId'] = []
        if objId in objIdConnection.keys():
            v['ergId'] = objIdConnection.pop(objId)

    flog.close()
    output(u"...done")
    return ereignis
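The inversion step is easiest to see with toy data: ereignis keys each entry by ergId and holds a list of connected objIds, while objIdConnection turns that around so every objId lists its ergIds. A minimal sketch with made-up ids (the real dict entries carry more fields):

# Sketch only: toy data illustrating the ergId -> objId inversion above.
ereignis_toy = {
    'e1': {'ErgId': 'e1', 'EroObjId': ['o1', 'o2']},
    'e2': {'ErgId': 'e2', 'EroObjId': ['o2']},
}
objIdConnection = {}
for key in sorted(ereignis_toy):  # sorted only to make the result predictable
    v = ereignis_toy[key]
    for objId in v['EroObjId']:
        objIdConnection.setdefault(objId, []).append(v['ErgId'])
# objIdConnection == {'o1': ['e1'], 'o2': ['e1', 'e2']}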
Example 4
 def test_external_2_internal_link_non_wikipedia_string_with_param(self):
     input_value = u'http://commons.wikimedia.org/wiki/Some_title'
     expected = u'[[:commons:Some title]]'
     result = helpers.external_2_internal_link(input_value,
                                               project='wikimedia')
     self.assertEquals(result, expected)
Example 5
 def test_external_2_internal_link_non_wikipedia_string(self):
     input_value = u'http://se.wikimedia.org/wiki/Some_title'
     expected = u'http://se.wikimedia.org/wiki/Some_title'
     self.assertEquals(helpers.external_2_internal_link(input_value),
                       expected)
Example 6
 def test_external_2_internal_link_non_wiki_url_string(self):
     input_value = u'http://not.a.wiki/Some_title'
     expected = u'http://not.a.wiki/Some_title'
     self.assertEquals(helpers.external_2_internal_link(input_value),
                       expected)
Example 7
 def test_external_2_internal_link_https_svwiki_string(self):
     input_value = u'https://sv.wikipedia.org/wiki/Some_title'
     expected = u'[[:sv:Some title]]'
     self.assertEquals(helpers.external_2_internal_link(input_value),
                       expected)
Example 8
 def test_external_2_internal_link_on_empty_string(self):
     self.assertEquals(helpers.external_2_internal_link(''), '')
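Taken together, the tests pin down the observable behaviour of helpers.external_2_internal_link(): URLs of the form http(s)://<prefix>.<project>.org/wiki/<title> become [[:<prefix>:<title>]] with underscores turned into spaces, project defaults to 'wikipedia', and anything else (including the empty string) is returned unchanged. Below is only a sketch that satisfies these five tests, not the actual implementation in helpers:

# -*- coding: utf-8 -*-
# Sketch only: a minimal external_2_internal_link() consistent with the tests
# above; the real helpers.external_2_internal_link() may cover more cases.
import re


def external_2_internal_link(url, project='wikipedia'):
    """Turn an external wiki url into an internal link, when possible."""
    pattern = r'https?://([^./]+)\.%s\.org/wiki/(.+)' % re.escape(project)
    match = re.match(pattern, url)
    if match:
        prefix, title = match.groups()
        return u'[[:%s:%s]]' % (prefix, title.replace('_', ' '))
    return url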