def get_artist(self, item):
        """Get formatted artist info based on item and wikidata.

        Combines the artists found in the lido data with any previously
        harvested wikidata painting artists.

        @param item: the metadata item to extract artist info from
        @return: str, the formatted artist string ('' if no artists)
        """
        artists = []

        wd_painting_artists = self.get_wd_painting_artists(item)
        lido_artists = item.get_artists()
        # .items() instead of the Python 2-only .iteritems() so the code
        # also runs under Python 3 (iteration behavior is identical).
        for nsid, artist_data in lido_artists.items():
            artists.append(
                self.get_single_artist(nsid, artist_data, len(lido_artists),
                                       wd_painting_artists, item))

        if not artists:
            return ''
        elif len(artists) == 1:
            return NatmusInfo.format_artist_name(artists[0])
        else:
            # drop empty (anonymous) entries before formatting
            non_anons = common.trim_list(artists)
            if not non_anons:
                # multiple anons, simply output one
                return NatmusInfo.format_artist_name(artists[0])
            elif len(non_anons) == 1 and non_anons[0].get('qualifier'):
                # anons + one named artist with qualifier
                return NatmusInfo.format_artist_name(artists[0])
            else:
                # multiple named artists, just ignore any anons
                formatted_artists = [
                    NatmusInfo.format_artist_name(artist)
                    for artist in non_anons]
                return '\n '.join(formatted_artists)
Beispiel #2
0
def secondaryKeywordTest(lines):
    """
    Report how many files would get keywords if limited by frequency.

    For each frequency threshold in [3, 10] counts the files where at
    least one keyword reaches that frequency (per the global keywordList),
    then formats the number of files which would be left without any
    category at each threshold.

    @param lines: list of pipe-separated data lines to analyse
    @return: formatted (Swedish) summary text
    """
    offset = 3
    num = 8  # thresholds 3, 4, 5, 6, 7, 8, 9, 10
    passNo = [0] * num
    for l in lines:
        passed = [False] * num
        params = l.split('|')
        # columns 20 and 22 hold comma separated keyword lists
        keywords = params[20].strip().split(',')
        keywords += params[22].strip().split(',')
        keywords = common.trim_list(keywords)
        for k in keywords:
            k = k.lower()
            for i in range(offset, num + offset):
                # count each file at most once per threshold
                if not passed[i - offset] and keywordList[k] >= i:
                    passNo[i - offset] += 1
                    # plain assignment; the original '+= True' only
                    # worked via bool/int coercion
                    passed[i - offset] = True
    txt = u'frekvens: bilder utan kategori\n'
    for i in range(offset, num + offset):
        txt += u'%d: %d\n' % (i, len(lines) - passNo[i - offset])
    txt += u'(utav %d filer)' % len(lines)
    return txt
    def clean_sparql_output(data, key):
        """
        Takes the sparql output and outputs it as a dict with lists.

        Also converts any entity_urls to Qids.

        @param data: data to clean (list of dicts)
        @param key: data value to use as key in the new dict
        @return: dict
        """
        entity_url = u'http://www.wikidata.org/entity/'
        if key not in data[0].keys():
            # logs an error but deliberately continues; a later d[key]
            # lookup will raise KeyError if the key really is missing
            pywikibot.error(
                u"The expected key '%s' was not present in the sparql output "
                u"keys: %s" % (key, ', '.join(data[0].keys())))
        new_data = {}
        for d in data:
            # strip the entity url prefix to get a bare Qid
            k = d[key].replace(entity_url, '')
            new_data[k] = {}
            # .items() instead of the Python 2-only .iteritems() so the
            # code also runs under Python 3
            for kk, value in d.items():
                value = value.split('|')
                for i, v in enumerate(value):
                    value[i] = v.replace(entity_url, '')
                new_data[k][kk] = common.trim_list(value)
        return new_data
Beispiel #4
0
def testKeywords(amnesord, motiv_amnesord, benamning):
    """Register all keywords in keywordList, or report that none exist.

    @param amnesord: list of subject keywords
    @param motiv_amnesord: list of motif keywords
    @param benamning: the designation, treated as one more keyword
    @return: error text if no keywords were found, else None
    """
    candidates = common.trim_list(amnesord + motiv_amnesord + [benamning, ])
    if not candidates:
        return u'Inga ämnesord'

    for keyword in candidates:
        helpers.addOrIncrement(keywordList, keyword.lower())
    def add_identification_data(self, data):
        """
        Parse the lido:objectIdentificationWrap contents of the data.

        Populates self.titles, self.inscriptions, self.descriptions and
        the measurements, and checks that the repository matches the
        expected VIAF identifier.

        @param data: the lido:objectIdentificationWrap contents as a dict
        """
        handled_tags = list()
        skipped_tags = [
            '',
        ]
        tag = u'lido:objectIdentificationWrap'

        # add title
        handled_tags.append(u'lido:titleWrap')
        titles = common.listify(data[u'lido:titleWrap'][u'lido:titleSet'])
        self.titles = get_lang_values_from_set(titles,
                                               (u'lido:appellationValue', ))

        # add inscription
        handled_tags.append(u'lido:inscriptionsWrap')
        inscriptions = common.listify(
            data[u'lido:inscriptionsWrap'][u'lido:inscriptions'])
        self.inscriptions = get_lang_values_from_set(
            inscriptions, (u'lido:inscriptionTranscription', ))

        # add description
        handled_tags.append(u'lido:objectDescriptionWrap')
        description_set = data[u'lido:objectDescriptionWrap'][
            u'lido:objectDescriptionSet']
        if not isinstance(description_set, OrderedDict):
            # warn (but continue) on an unexpected structure
            pywikibot.warning(
                "Weird things are happening in description field for %s:\n%s" %
                (self.source_file, description_set))
        descriptions = common.listify(
            description_set[u'lido:descriptiveNoteValue'])
        self.descriptions = get_lang_values_from_set(descriptions)

        # add measurements
        handled_tags.append(u'lido:objectMeasurementsWrap')
        measurement_set = data[u'lido:objectMeasurementsWrap'][
            u'lido:objectMeasurementsSet']
        # warn if the measurement set contains any unexpected keys
        if set(measurement_set.keys()) - set(
            [u'lido:displayObjectMeasurements', u'lido:objectMeasurements']):
            pywikibot.warning(
                "Weird things are happening in measurement field for %s:\n%s" %
                (self.source_file, measurement_set))
        self._debug(measurement_set.get(u'lido:displayObjectMeasurements'))
        measurements = common.trim_list(
            common.listify(
                measurement_set.get(u'lido:displayObjectMeasurements')))
        self._debug(measurements)
        # NOTE(review): the method name is spelled 'add_meaurements'
        # elsewhere in the class; kept as-is to match that definition.
        self.add_meaurements(measurements)

        # ensure location is always Nationalmuseum (by its VIAF id)
        handled_tags.append(u'lido:repositoryWrap')
        repository_viaf = data[u'lido:repositoryWrap']['lido:repositorySet'][
            'lido:repositoryName']['lido:legalBodyID']['#text']
        if repository_viaf != u'http://viaf.org/viaf/147742988':
            pywikibot.warning("Unexpected repoitory in %s: %s" %
                              (self.source_file, repository_viaf))

        # warn about any tags neither handled nor deliberately skipped
        flag_missed_tags(data, tag, handled_tags, skipped_tags)
    def load_data(self, in_file):
        """
        Load the provided data files.

        Outputs a tuple with lido data as a dict and image filenames as a
        list.

        @param in_file: the path to the metadata file
        @return: (dict, list)
        """
        metadata_path, image_list_path = in_file[0], in_file[1]
        lido_data = common.open_and_read_file(metadata_path, as_json=True)
        # one filename per line; drop empty entries
        raw_names = common.open_and_read_file(image_list_path).split('\n')
        image_files = common.trim_list(raw_names)
        return (lido_data, image_files)
    def get_type(self, item):
        """
        Get the object type of an item.

        Maps the types found in previously harvested wikidata info
        through self.type_mappings. Exactly one mapped type is returned;
        multiple matches trigger a warning and are discarded.

        @param item: the item to get the type of
        @return: str, the mapped type ('' if none or ambiguous)
        """
        typ = ''

        # get previous wikidata info
        wd_data = self.wd_paintings.get(item.get_obj_id())
        if wd_data:
            # guard against a missing 'types' entry (.get() may return
            # None, which the original would have crashed iterating)
            types = [self.type_mappings.get(t)
                     for t in wd_data.get('types') or []]
            types = common.trim_list(types)  # drop unmapped (None) entries
            if len(types) == 1:
                typ = types[0]
            elif len(types) > 1:
                pywikibot.warning("Found %d matching types for %s" %
                                  (len(types), item.get_obj_id()))
        return typ
    def generate_content_cats(self, item):
        """
        Produce categories related to the media file contents.

        @param item: the metadata for the media file in question
        @return: list of categories (without "Category:" prefix)
        """
        cats = []

        wd_painting = self.wd_paintings.get(item.get_obj_id())
        if wd_painting and wd_painting.get('commons_cats'):
            # if commonscat(s) for the image then add no other content cats
            cats += common.trim_list(wd_painting.get('commons_cats'))
        else:
            # otherwise drain the depicted/artist trackers until empty
            for tracker in ('depicted', 'artist'):
                entry = item.get_from_tracker(tracker)
                while entry:
                    cats.append(entry)
                    entry = item.get_from_tracker(tracker)

        # remove any duplicates
        return list(set(cats))
    def add_creation(self, event):
        """
        Parse a creation lido:event and store the extracted data.

        Populates self.creator, self.creation_date, self.creation_place
        and self.techniques.

        @param event: the lido:event contents as a dict
        """
        handled_tags = list()
        skipped_tags = ['lido:eventName', u'lido:eventType']
        tag = u'lido:event'

        # add creator(s)
        handled_tags.append(u'lido:eventActor')
        self.creator = {}
        self.handle_creators(
            common.trim_list(common.listify(event[u'lido:eventActor'])))

        # add creation_date; earliest/latest may be None when absent
        handled_tags.append(u'lido:eventDate')
        self.creation_date = {}
        if event.get(u'lido:eventDate'):
            self.creation_date['earliest'] = event[u'lido:eventDate'][
                u'lido:date'].get(u'lido:earliestDate')
            self.creation_date['latest'] = event[u'lido:eventDate'][
                u'lido:date'].get(u'lido:latestDate')
            self.creation_date['text'] = get_lang_values_from_set(
                common.listify(event[u'lido:eventDate'][u'lido:displayDate']))

        # add creation place
        handled_tags.append(u'lido:eventPlace')
        self.creation_place = get_lang_values_from_set(
            common.listify(event[u'lido:eventPlace'][u'lido:place']
                           [u'lido:namePlaceSet']),
            (u'lido:appellationValue', ))

        # add materialtech (materials/techniques used)
        handled_tags.append(u'lido:eventMaterialsTech')
        self.techniques = get_lang_values_from_set(
            common.listify(event[u'lido:eventMaterialsTech']),
            (u'lido:materialsTech', u'lido:termMaterialsTech', u'lido:term'))

        # warn about any tags neither handled nor deliberately skipped
        flag_missed_tags(event, tag, handled_tags, skipped_tags)
Beispiel #10
0
def csv_file_to_dict(filename,
                     key_col,
                     header_check,
                     non_unique=False,
                     keep=None,
                     lists=None,
                     delimiter='|',
                     list_delimiter=';',
                     codec='utf-8'):
    """
    Open a csv file and returns a dict of dicts, using the header row for keys.

    Non-unique columns are concatenated into lists. Note that this structure
    does not survive the csv_file_to_dict -> dict_to_csv_file roundtrip.

    @param filename: the file to open
    @param key_col: the (label of the) column to use as a key in the dict
        str or tuple of strs to combine (with a ":")
    @param header_check: a string to check against the header row
    @param non_unique: whether non-unique column headings are expected
        (defaults to False)
    @param keep: tuple of columns to keep (defaults to None=all)
    @param lists: tuple of columns to treat as lists (defaults to None=none)
    @param delimiter: the used delimiter (defaults to "|")
    @param list_delimiter: the used delimiter when encountering a list
    @param codec: the used encoding (defaults to "utf-8")
    @return: dict
    @raise MyError: on header mismatch, unknown key_col or duplicate keys
    """
    # load and parse file
    header, lines = open_csv_file(filename, delimiter=delimiter, codec=codec)

    # verify header == header_check (including order)
    if header_check.split(delimiter) != header:
        # fixed typo ("missmatch") and missing space in the error message
        raise MyError("Header mismatch.\nExpected: %s\nFound: %s" %
                      (header_check, delimiter.join(header)))

    # convert txt key to numeric key
    try:
        if isinstance(key_col, tuple):
            key_col_num = tuple(header.index(key) for key in key_col)
        else:
            key_col_num = header.index(key_col)
    except ValueError:
        raise MyError("key_col not found in header")

    # set up columns to keep and listify columns
    cols = find_cols(keep, 'keep', header, default_all=True)
    non_unique_cols = find_non_unique_cols(header, list(cols), non_unique)
    listify = find_cols(lists, 'lists', header, default_all=False)

    # verify key_col is valid
    validate_key_col(key_col, lists, non_unique_cols, list(cols), header)

    # load to dict
    d = dict()
    for l in lines:
        if not l:
            continue
        parts = l.split(delimiter)

        # set key; combined keys are joined with ":"
        if isinstance(key_col_num, tuple):
            key = ':'.join(parts[key_num].strip()
                           for key_num in key_col_num)
        else:
            key = parts[key_col_num].strip()

        # check uniqueness
        if key in d:
            raise MyError("Non-unique key found: %s" % key)

        d[key] = dict()
        for k, v in cols.items():
            if k in non_unique_cols:
                # merge all same-labelled columns into a single list
                d[key][k] = []
                for nv in non_unique_cols[k]:
                    if k in listify:
                        d[key][k] += parts[nv].strip().split(list_delimiter)
                    else:
                        d[key][k].append(parts[nv].strip())
                d[key][k] = trim_list(d[key][k])
            else:
                if k in listify:
                    d[key][k] = trim_list(
                        parts[v].strip().split(list_delimiter))
                else:
                    d[key][k] = parts[v].strip()

    return d
Beispiel #11
0
def checkLine(line, idnos):
    """
    Validate one pipe separated data line and compile any issues.

    Runs the per-field test functions, registers kulturNav mappings and
    updates the global ort/material/benamning frequency counters.

    @param line: the raw data line to check
    @param idnos: previously encountered id numbers (for uniqueness test)
    @return: (idno, logtext) tuple; both '' for an empty line
    """
    if len(line) == 0:
        return '', ''

    log = []
    params = line.split('|')

    idno = params[0].strip()
    typ = params[1].strip()
    benamning = params[2].strip()
    material = params[3].strip().split(',')
    namn_konstnar = helpers.flip_name(params[4].strip())
    namn_konstnar_knav = params[5].strip()
    namn_konstruktor = [params[6].strip(), ]
    namn_konstruktor_knav = params[7].strip()
    namn_konstruktor.append(params[8].strip())
    namn_fotograf = params[9].strip()
    namn_tillverkare = [params[10].strip(), ]
    namn_tillverkare.append(params[11].strip())
    namn_tillverkare.append(params[12].strip())
    date_foto = params[13].strip()
    date_produktion = params[14].strip()
    avbildad_namn = [helpers.flip_name(params[15].strip()), ]
    avbildad_namn_knav = params[16].strip()
    avbildad_namn.append(params[17].strip())
    avbildad_namn.append(params[18].strip())
    avbildad_ort = params[19].strip()
    amnesord = params[20].strip().split(',')
    beskrivning = params[21].strip()
    motiv_amnesord = params[22].strip().split(',')
    motiv_beskrivning = params[23].strip()
    rattighet = params[24].strip()
    samling = params[25].strip()
    dimukod = params[26].strip()

    # cleanup lists
    material = common.trim_list(material)
    namn_tillverkare = common.trim_list(namn_tillverkare)
    avbildad_namn = common.trim_list(avbildad_namn)
    namn_konstruktor = common.trim_list(namn_konstruktor)
    amnesord = common.trim_list(amnesord)
    motiv_amnesord = common.trim_list(motiv_amnesord)

    # kNav
    if len(namn_konstnar_knav) > 0:
        addTokNavList(namn_konstnar_knav, namn_konstnar)
    if len(avbildad_namn_knav) > 0:
        addTokNavList(avbildad_namn_knav, avbildad_namn[0])
    if len(namn_konstruktor_knav) > 0:
        # bugfix: this previously passed avbildad_namn_knav (copy-paste
        # slip); the konstruktor id belongs with the konstruktor name
        addTokNavList(namn_konstruktor_knav,
                      helpers.flip_name(namn_konstruktor[0]))

    log.append(testId(idno, idnos))
    log.append(checkType(typ))
    log.append(testRight(rattighet))
    log.append(testCollection(samling))
    log.append(testKeywords(amnesord, motiv_amnesord, benamning))
    log.append(testDescription(beskrivning, motiv_beskrivning))
    for namn in namn_tillverkare:
        log.append(testName(namn))
    for namn in avbildad_namn:
        log.append(testName(namn))
    for namn in namn_konstruktor:
        log.append(testName(namn))
    # bugfix: namn_fotograf was previously tested twice, duplicating any
    # log entry for it
    log.append(testName(namn_fotograf))
    log.append(testName(namn_konstnar))
    log.append(testDateRange(date_foto))
    log.append(testDateRange(date_produktion))

    # test filenames
    log.append(
        testNameGeneration(idno, typ, benamning, motiv_beskrivning,
                           avbildad_namn, avbildad_ort, date_foto,
                           date_produktion))

    # some counters
    if len(avbildad_ort) > 0:
        helpers.addOrIncrement(ortList, avbildad_ort)
    for m in material:
        helpers.addOrIncrement(materialList, m.lower())
    if len(benamning) > 0:
        helpers.addOrIncrement(benamningList, benamning.lower())

    # compile and return
    logtext = ''
    for l in log:
        if l:
            logtext += u'%s. ' % l
    return idno, logtext.strip()
Beispiel #12
0
def main(*args):
    """Command line entry-point."""
    usage = (
        'Usage:'
        '\tpython uploader.py -in_path:PATH -dir:PATH -cutoff:NUM\n'
        '\t-in_path:PATH path to the directory containing the media files or '
        'to the make_info output file if "-type" is set to url\n'
        '\t-type:STRING the type of upload to make. Must be either "FILES" '
        'or "URL". Defaults to FILES (optional)\n'
        '\t-dir:PATH specifies the path to the directory containing a '
        'user_config.py file (optional)\n'
        '\t-cutoff:NUM stop the upload after the specified number of files '
        '(optional)\n'
        '\t-confirm Whether to output a confirmation after each upload '
        'attempt (optional)\n'
        '\t-test Whether to do mock upload, simply outputting to commandline. '
        '(optional)\n'
        '\t-nochunk Whether to turn off chunked uploading, this is slow '
        'and does not support files > 100Mb (optional, type:FILES only)\n'
        '\t-only:PATH to file containing list of urls to upload, skipping all '
        'others. One entry per line. (optional, type:URL only)\n'
        '\t-skip:PATH to file containing list of urls to skip, uploading all '
        'others. Can be combined with "-only" for further filtering, e.g '
        '"-only:<list of vase images> -skip:<list of blue images>" to get '
        'non-blue vase images. One entry per line. (optional, type:URL only)\n'
        '\tExample:\n'
        '\tpython uploader.py -in_path:../diskkopia -cutoff:100\n')
    # option defaults
    cutoff = None
    in_path = None
    test = False
    confirm = False
    chunked = True
    typ = 'files'
    only = None
    skip = None

    # Load pywikibot args and handle local args
    for arg in pywikibot.handle_args(args):
        option, _, value = arg.partition(':')
        if option == '-usage':
            pywikibot.output(usage)
            return
        elif option == '-cutoff' and common.is_pos_int(value):
            cutoff = int(value)
        elif option == '-in_path':
            in_path = value
        elif option == '-test':
            test = True
        elif option == '-confirm':
            confirm = True
        elif option == '-nochunk':
            chunked = False
        elif option == '-type':
            requested = value.lower()
            if requested == 'url':
                typ = 'url'
            elif requested != 'files':
                # unrecognised type: show usage and bail out
                pywikibot.output(usage)
                return
        elif option == '-only':
            only = common.trim_list(
                common.open_and_read_file(value).split('\n'))
        elif option == '-skip':
            skip = common.trim_list(
                common.open_and_read_file(value).split('\n'))

    if not in_path:
        pywikibot.output(usage)
        return
    if typ == 'files':
        up_all(in_path,
               cutoff=cutoff,
               test=test,
               verbose=confirm,
               chunked=chunked)
    elif typ == 'url':
        up_all_from_url(in_path,
                        cutoff=cutoff,
                        only=only,
                        skip=skip,
                        test=test,
                        verbose=confirm)
0
    def make_item_from_raw(entry, smm_info):
        """
        Given the raw metadata for an item, construct an SMMItem.

        Maps the raw (Swedish) column labels to internal labels, flips
        "Last, First" names, registers kulturNav mappings on smm_info and
        blanks out blacklisted date/name values.

        @param entry: the raw metadata entry as a dict
        @param smm_info: the parent smm_info instance
        @return: SMMItem
        """
        d = {}
        # map to internal labels and flip names
        d['idno'] = entry[u'Identifikationsnr']
        d['typ'] = entry[u'Typ av objekt']
        d['benamning'] = entry[u'Benämning']
        d['material'] = entry[u'Material']
        d['namn_konstnar'] = helpers.flip_name(entry[u'Namn-Konstnär'])
        namn_konstnar_knav = entry[u'Konstnär-KulturNav']
        d['namn_konstruktor'] = helpers.flip_names(entry[u'Namn-Konstruktör'])
        namn_konstruktor_knav = entry[u'Konstruktör-KulturNav']
        d['namn_fotograf'] = helpers.flip_name(entry[u'Namn-Fotograf'])
        d['namn_tillverkare'] = helpers.flip_names(entry[u'Namn-Tillverkare'])
        d['date_foto'] = entry[u'Datering-Fotografering']
        d['date_produktion'] = entry[u'Datering-Produktion']
        avbildad_namn = entry[u'Avbildade namn']
        avbildad_namn_knav = entry[u'Avbildade-KulturNav']
        d['avbildad_ort'] = entry[u'Avbildade - orter']
        d['amnesord'] = entry[u'Ämnesord']
        d['beskrivning'] = entry[u'Beskrivning']
        d['motiv_amnesord'] = entry[u'Motiv-ämnesord']
        d['motiv_beskrivning'] = entry[u'Motiv-beskrivning']
        d['rattighet'] = entry[u'Rättigheter']
        d['samling'] = entry[u'Samling']
        # NOTE(review): key spelled 'Dimukode' (not 'Dimukod') — assumed
        # to match the source data; verify against the input files.
        d['dimukod'] = entry[u'Dimukode']

        # handle kulturNav
        # NOTE(review): the [0] lookups assume the corresponding name
        # list is non-empty whenever the kulturNav id is set — would
        # raise IndexError otherwise; confirm against the source data.
        if namn_konstnar_knav:
            smm_info.add_to_k_nav_list(namn_konstnar_knav, d['namn_konstnar'])
        if namn_konstruktor_knav:
            smm_info.add_to_k_nav_list(namn_konstruktor_knav,
                                       d['namn_konstruktor'][0])
        if avbildad_namn_knav:
            smm_info.add_to_k_nav_list(avbildad_namn_knav,
                                       helpers.flip_name(avbildad_namn[0]))

        # split avbildad_namn into people and ships/boat types
        # a person is anyone with a name like Last, First
        d['avbildad_person'] = []
        d['avbildat_fartyg'] = []
        for a in avbildad_namn:
            # flip_name changing the value implies a "Last, First" form
            if a != helpers.flip_name(a):
                d['avbildad_person'].append(helpers.flip_name(a))
            else:
                d['avbildat_fartyg'].append(a)
        # add to dict, now with flipped names
        d['avbildad_namn'] = d['avbildad_person'] + d['avbildat_fartyg']

        # cleanup lists
        d['avbildad_person'] = common.trim_list(d['avbildad_person'])
        d['avbildat_fartyg'] = common.trim_list(d['avbildat_fartyg'])
        d['avbildad_namn'] = common.trim_list(d['avbildad_namn'])

        # cleanup blacklisted date/name values by blanking them
        if d['date_foto'].strip('.').lower() in smm_info.bad_date:
            d['date_foto'] = ''
        if d['date_produktion'].strip('.').lower() in smm_info.bad_date:
            d['date_produktion'] = ''
        if d['namn_konstnar'].lower() in smm_info.bad_namn:
            d['namn_konstnar'] = ''
        if d['namn_fotograf'].lower() in smm_info.bad_namn:
            d['namn_fotograf'] = ''

        return SMMItem(d)