Esempio n. 1
0
    def openFileAsDictList(filename,
                           delimiter='|',
                           codec='utf-8',
                           headerCheck=None):
        '''
        opens a given pipe-separated csv file (utf-8)
        and returns a list of dicts, using header row for keys)
        '''
        header, lines = helpers.open_csv_file(filename,
                                              delimiter=delimiter,
                                              codec=codec)

        # verify header works
        if headerCheck is not None and headerCheck != header:
            print 'Header not same as comparison string!'
            exit()

        entryList = []
        for l in lines:
            if not l:  # empty line
                continue
            entry = {}
            parts = l.split(delimiter)
            for i, e in enumerate(header):
                entry[e] = parts[i]
            entryList.append(entry)
        return entryList
Esempio n. 2
0
def analysePhotoAll(f, file_in):
    """
    Check that all PhoSystematikS values are Commons files and that each
    is unique.

    @param f: open output file the report is written to
    @param file_in: the photo csv file to analyse
    """
    header, lines = helpers.open_csv_file(file_in)
    badUrls = []  # (phoId:mulId, source) pairs which are not Commons files
    dupes = []    # Commons filenames referenced by more than one row
    sources = {}  # Commons filename -> latest phoId:mulId seen with it

    for l in lines:
        if not l:  # skip blank rows
            continue
        col = l.split('|')
        source = col[8].strip()  # PhoSystematikS
        phoId = col[0]  # PhoId
        mulId = col[5]  # MulId
        phoMul = u'%s:%s' % (phoId, mulId)
        if source:
            # percent-encoded urls must be decoded before parsing
            if '%' in source:
                source = helpers.urldecode_utf8(source)
            internal = helpers.external_2_internal_link(source,
                                                        project='wikimedia')
            prefix = '[[:commons:File:'
            if not internal.startswith(prefix):
                badUrls.append((phoMul, source))
            else:
                internal = internal[len(prefix):-len(']]')]
                # O(1) membership test; 'in sources.keys()' would build
                # a list on every row
                if internal in sources:
                    dupes.append(
                        (phoMul, sources[internal],
                         internal.replace(' ', '_')))
                sources[internal] = phoMul

    f.write(u'\n\n<!--From: %s -->\n' % file_in)
    if badUrls:
        f.write(u'===BadUrls===\n')
        for b in badUrls:
            f.write(u'%s: %s\n' % b)
    if dupes:
        f.write(u'===DuplicateUrls===\n')
        f.write(u'phoId:mulId|phoId:mulId|Filename\n')
        for b in dupes:
            f.write(u'%s|%s|%s\n' % b)
Esempio n. 3
0
 def file_to_dict(filename, idcol=0, verbose=False, careful=False):
     '''
     reads in a file and passes it to a dict where each row is in
     turn a dict
     '''
     listcols = isinstance(idcol, list)
     header, lines = helpers.open_csv_file(filename)
     dDict = {}
     unique = True
     for l in lines:
         if not l:  # empty line
             continue
         col = l.split('|')
         # id can either be one column or a combination of several
         if listcols:
             idno = u''
             for ic in idcol:
                 idno = u'%s:%s' % (idno, col[ic])
             idno = idno[1:]  # trim leading :
         else:
             idno = col[idcol]
         wDict = {}
         for i in range(0, len(col)):
             wDict[header[i]] = col[i]
         # test for uniqueness
         if careful:
             if idno in dDict.keys():
                 unique = False
         dDict[idno] = wDict
     if verbose:
         if careful:
             print 'read %s: %r items of length %r. Uniqueness is %s' % \
                 (filename, len(dDict), len(dDict.itervalues().next()),
                  unique)
         else:
             print 'read %s: %r items of length %r.' % \
                 (filename, len(dDict), len(dDict.itervalues().next()))
     return dDict
Esempio n. 4
0
 def file_to_dict(filename, idcol=0, verbose=False, careful=False):
     '''
     reads in a file and passes it to a dict where each row is in
     turn a dict
     '''
     listcols = isinstance(idcol, list)
     header, lines = helpers.open_csv_file(filename)
     dDict = {}
     unique = True
     for l in lines:
         if not l:  # empty line
             continue
         col = l.split('|')
         # id can either be one column or a combination of several
         if listcols:
             idno = u''
             for ic in idcol:
                 idno = u'%s:%s' % (idno, col[ic])
             idno = idno[1:]  # trim leading :
         else:
             idno = col[idcol]
         wDict = {}
         for i in range(0, len(col)):
             wDict[header[i]] = col[i]
         # test for uniqueness
         if careful:
             if idno in dDict.keys():
                 unique = False
         dDict[idno] = wDict
     if verbose:
         if careful:
             print 'read %s: %r items of length %r. Uniqueness is %s' % \
                 (filename, len(dDict), len(dDict.itervalues().next()),
                  unique)
         else:
             print 'read %s: %r items of length %r.' % \
                 (filename, len(dDict), len(dDict.itervalues().next()))
     return dDict
Esempio n. 5
0
    def openFileAsDictList(filename, delimiter='|', codec='utf-8',
                           headerCheck=None):
        '''
        opens a given pipe-separated csv file (utf-8)
        and returns a list of dicts, using header row for keys)
        '''
        header, lines = helpers.open_csv_file(
            filename, delimiter=delimiter, codec=codec)

        # verify header works
        if headerCheck is not None and headerCheck != header:
            print 'Header not same as comparison string!'
            exit()

        entryList = []
        for l in lines:
            if not l:  # empty line
                continue
            entry = {}
            parts = l.split(delimiter)
            for i, e in enumerate(header):
                entry[e] = parts[i]
            entryList.append(entry)
        return entryList
Esempio n. 6
0
def analyseYear(f, file_in):
    '''
    Exhibitanalyser:
    verifies that the year column (AusJahrS) can be interpreted as
    YYYY, YYYY-YYYY or YYYY-YY, and that a YYYY-YYYY span agrees with
    the Von/Bis date range; problems are written to the open file f.
    @todo: become stricter. Disallow space as year separator
    '''
    header, lines = helpers.open_csv_file(file_in)
    data = []      # collected problems, each u'error <code>|<details>'
    exhibits = []  # AusId values already handled (each analysed once)
    # Expected columns in file_in:
    # AobId
    # AusId
    # AusTitelS
    # AusOrtS
    # AusJahrS
    # AusDatumVonD
    # AusDatumBisD
    # AobObjId
    # AufAufgabeS
    for l in lines:
        if not l:  # skip blank rows
            continue
        col = l.split('|')
        if col[2].strip() == '':  # ignore exhibits without names
            continue
        ExhibId = col[1]  # AusId
        if ExhibId in exhibits:  # already seen this exhibit
            continue
        exhibits.append(ExhibId)
        year = col[4].strip()  # AusJahrS
        lyear = len(year)
        # dates carry a redundant midnight timestamp; strip it
        yfrom = col[5].replace(u' 00:00:00', u'').strip()  # AusDatumVonD
        lyfrom = len(yfrom)
        ytil = col[6].replace(u' 00:00:00', u'').strip()  # AusDatumBisD
        lytil = len(ytil)
        lout = u'%s|%s|%s|%s' % (ExhibId, year, yfrom, ytil)
        # identify weird year formatting (empty years are accepted)
        if lyear != 0:
            if lyear not in [4, 9, 7]:
                # if not YYYY or YYYY-YYYY or YYYY-YY
                data.append(u'error y1|%s' % lout)
            elif lyear == 9 and (year[4:5] != '-' and year[4:5] != ' '):
                # if not YYYY-YYYY or YYYY YYYY
                data.append(u'error y1|%s' % lout)  # y5
            elif lyear == 7 and (year[4:5] != '-' and year[4:5] != ' '):
                # if not YYYY-YY or YYYY YY
                data.append(u'error y1|%s' % lout)  # y6
            elif (lyear == 9) and (lyfrom != 0 or lytil != 0):
                # well-formed YYYY-YYYY span: check it matches the
                # Von/Bis range where either end is present
                if lyfrom != 0 and int(year[:4]) != int(yfrom[:4]):
                    data.append(u'error y3|%s' % lout)
                elif lytil != 0 and int(year[-4:]) != int(ytil[:4]):
                    data.append(u'error y3|%s' % lout)
            # elif (lyear == 4) and (lyfrom != 0):
            #    if int(year) != int(yfrom[:4]):
            #        data.append(u'error y2|%s' % lout)
            # elif lytil != 0 and int(ytil[:4]) != int(year):
            #    data.append(u'error y7|%s' % lout)
    # loop done; write the report
    f.write(u'\n\n<!--From: %s -->\n' % file_in)
    f.write(u'===year problems===\n')
    f.write(
        u'y1:\t Could not match JahrS to any YYYY or YYYY-YYYY or YYYY-YY\n')
    # f.write(u'y2:\t JahrS is not the same as starting year in Von-Bis range')
    # f.write(u'- unless amended Von will be used\n')
    f.write(u'y3:\t JahrS is span which doesn\'t match in Von-Bis range')
    f.write(u'- please amend as appropriate\n')
    f.write(u'#error\tAusId\tAusJahrS\tAusDatumVonD\tAusDatumBisD\n')
    for d in data:
        splits = d.split('|')
        txt = ''
        for s in splits:
            txt = u'%s\t%s' % (txt, s)
        # txt begins with u'\terror ' (7 chars); slice that off so each
        # output line starts with the bare error code
        f.write(u'%s\n' % txt[7:])
Esempio n. 7
0
def analyseMulti(f, file_in):
    '''
    Analyse the multimedia file:
    * identifies MulPhoId duplicates (and whether the duplicated rows
      carry the same or different path/file info)
    * identifies images with a filetype in the filename (MulDateiS)

    @param f: open output file the report is written to
    @param file_in: the multimedia csv file to analyse
    '''
    header, lines = helpers.open_csv_file(file_in)
    mults = []      # one entry per extra occurrence of a MulPhoId
    bad = []        # filenames which seem to include a file extension
    difftest = {}   # MulPhoId -> path|file|extension of first occurrence
    sameCount = 0   # duplicates with identical path/file info
    diffCount = 0   # duplicates with differing path/file info
    ccount = 0      # total number of duplicated rows
    ndiffCount = 0  # filenames mapped to more than one MulPhoId
    ndict = {}      # full filename -> first MulPhoId seen with it
    # Expected columns in file_in:
    # MulId
    # MulPhoId
    # MulPfadS
    # MulDateiS
    # MulExtentS
    f.write(u'\n\n<!--From: %s -->\n' % file_in)
    for l in lines:
        if not l:  # skip blank rows
            continue
        col = l.split('|')
        idd = col[1]  # MulPhoId
        fullname = ''.join([col[2], col[3], col[4]])
        # test if each filename has only one photoid
        # ('in ndict' is O(1); 'in ndict.keys()' built a list per row)
        if fullname in ndict:
            if idd != ndict[fullname]:
                ndiffCount += 1
        else:
            ndict[fullname] = idd
        # testing mullId/phoId duplication; the difftest keys are
        # exactly the ids seen so far, so dict membership replaces the
        # original O(n) scan of a separate id list
        if idd in difftest:
            ccount += 1
            mults.append(idd)
            tt = '|'.join([col[2], col[3], col[4]])
            if tt != difftest[idd]:
                diffCount += 1
            else:
                sameCount += 1
        else:
            difftest[idd] = '|'.join([col[2], col[3], col[4]])
        name = col[3]  # MulDateiS
        if name[-4:-3] == '.':
            # If filetype in MulDateiS
            bad.append(name)
    # loop done: tally how often each duplicated id occurred
    mm = {}
    tot = 0
    for m in mults:
        if m in mm:
            mm[m] += 1
            tot += 1
        else:
            mm[m] = 2  # one extra occurrence means the id exists twice
            tot += 2
    if mm:
        f.write(u'===duplicates===\n')
        f.write(u'#Total: %r\n' % tot)
        f.write(u'#MulPhoId|antal\n')
        sortMults = Common.sortedDict(mm)
        for s in sortMults:
            f.write(u'%s|%r\n' % (s[0], s[1]))
    if bad:
        f.write(u'===BadNames===\n')
        for b in bad:
            f.write(u'%s\n' % b)
    if not bad and not mm:
        f.write(u'there are no problems with multimedia file =)')
Esempio n. 8
0
def analysePhoto(A, f, file_in):
    '''
    Verifies that all licenses and sources can be parsed correctly and
    that there are no duplicates.

    @param A: object holding the known license (A.lic) and source
        (A.source) mappings
    @param f: open output file the report is written to
    @param file_in: the photo csv file to analyse
    '''
    header, lines = helpers.open_csv_file(file_in)
    licenses = []   # distinct PhoAufnahmeortS values, in input order
    sources = []    # distinct PhoSwdS values, in input order
    mulls = set()   # MulId values seen so far (set gives O(1) lookup)
    phids = {}      # PhoId -> content of its first row
    nodupes = True
    dupePhoid = {}  # duplicated PhoIds whose row content differs
    # Expected columns in file_in:
    # PhoId
    # PhoObjId
    # PhoBeschreibungM
    # PhoAufnahmeortS
    # PhoSwdS
    # MulId
    # AdrVorNameS
    # AdrNameS
    # PhoSystematikS
    f.write(u'<!--From: %s -->\n' % file_in)
    for l in lines:
        if not l:  # skip blank rows
            continue
        col = l.split('|')
        lic = col[3]  # PhoAufnahmeortS
        source = col[4]  # PhoSwdS
        phid = col[0]  # PhoId
        if lic not in licenses:
            licenses.append(lic)
        if source not in sources:
            sources.append(source)
        mull = col[5]  # MulId
        if mull in mulls:
            # warn on the very first duplicate only; the original also
            # re-appended later duplicates to its id list, which had no
            # effect on the output
            if nodupes:
                f.write(u'there are dupes in MullId\n')
                nodupes = False
        else:
            mulls.add(mull)
        # for comparing content; 'in phids' avoids building a key list
        if phid in phids:
            # Duplicate photoids...
            tt = '|'.join(col[1:4] + col[6:])
            if tt != phids[phid]:
                # with different content
                dupePhoid[tt] = (phid, phids[phid])
        else:
            phids[phid] = '|'.join(col[1:4] + col[6:])
    # loop done
    # find incompatible sources
    for s in sources:
        if s not in A.source:
            f.write(u'Found an incompatible source: %s\n' % s)
    # find incompatible licenses
    for l in licenses:
        if l not in A.lic:
            f.write(u'Found an incompatible license: %s\n' % l)
    if nodupes:
        f.write(u'there are NO dupes in MullId =)\n')
    if dupePhoid:
        f.write(u'---Duplicate phoIds with different info---\n')
        for k, v in dupePhoid.iteritems():
            f.write(u'%s: %s <> %s\n' % (v[0], k, v[1]))