def get_artist(self, item):
    """
    Get formatted artist info based on item and wikidata.

    Returns an empty string when no artists are found, a single
    formatted name when there is one artist (or only anonymous ones),
    and a newline separated list of names for multiple named artists.

    @param item: the metadata item to process
    @return: str
    """
    artists = []
    wd_painting_artists = self.get_wd_painting_artists(item)
    lido_artists = item.get_artists()
    # items() instead of the py2-only iteritems(); same behavior on py2
    for nsid, artist_data in lido_artists.items():
        artists.append(
            self.get_single_artist(nsid, artist_data, len(lido_artists),
                                   wd_painting_artists, item))

    if not artists:
        return ''
    elif len(artists) == 1:
        return NatmusInfo.format_artist_name(artists[0])
    else:
        # trim_list presumably drops empty (anonymous) entries — the
        # surviving entries are the named artists
        non_anons = common.trim_list(artists)
        if not non_anons:
            # multiple anons, simply output one
            return NatmusInfo.format_artist_name(artists[0])
        elif len(non_anons) == 1 and non_anons[0].get('qualifier'):
            # anons + one named artist with qualifier
            return NatmusInfo.format_artist_name(artists[0])
        else:
            # multiple named artists, just ignore any anons
            formatted_artists = [
                NatmusInfo.format_artist_name(artist)
                for artist in non_anons]
            return '\n '.join(formatted_artists)
def secondaryKeywordTest(lines):
    """
    How many files get keywords if we limit them by frequency.

    For each frequency threshold in 3..10, count the files having at
    least one keyword which occurs at least that many times (per the
    global keywordList counter), then report how many files are left
    without any keyword at each threshold.

    @param lines: the raw pipe-separated metadata lines
    @return: str summary (in Swedish)
    """
    offset = 3
    num = 8  # thresholds 3, 4, 5, 6, 7, 8, 9, 10
    passNo = [0] * num
    for l in lines:
        passed = [False] * num
        params = l.split('|')
        keywords = params[20].strip().split(',')
        keywords += params[22].strip().split(',')
        keywords = common.trim_list(keywords)
        for k in keywords:
            k = k.lower()
            for i in range(offset, num + offset):
                # .get() guards against keywords that were never
                # registered in keywordList (plain indexing raised
                # KeyError)
                if not passed[i - offset] and keywordList.get(k, 0) >= i:
                    passNo[i - offset] += 1
                    # was "+= True", which only worked via bool/int
                    # coercion; a plain assignment is what is meant
                    passed[i - offset] = True
    txt = u'frekvens: bilder utan kategori\n'
    for i in range(offset, num + offset):
        txt += u'%d: %d\n' % (i, len(lines) - passNo[i - offset])
    txt += u'(utav %d filer)' % len(lines)
    return txt
def clean_sparql_output(data, key):
    """
    Take the sparql output and return it as a dict with lists.

    Also converts any entity_urls to Qids.

    @param data: data to clean (a list of dicts)
    @param key: data value to use as key in the new dict
    @return: dict
    """
    entity_url = u'http://www.wikidata.org/entity/'
    if not data:
        # nothing to clean; also avoids an IndexError on data[0] below
        return {}
    if key not in data[0].keys():
        pywikibot.error(
            u"The expected key '%s' was not present in the sparql output "
            u"keys: %s" % (key, ', '.join(data[0].keys())))
    new_data = {}
    for d in data:
        k = d[key].replace(entity_url, '')
        new_data[k] = {}
        # items() instead of the py2-only iteritems(); same behavior
        for kk, value in d.items():
            # each value is a |-delimited list of entity urls/literals
            cleaned = [v.replace(entity_url, '')
                       for v in value.split('|')]
            new_data[k][kk] = common.trim_list(cleaned)
    return new_data
def testKeywords(amnesord, motiv_amnesord, benamning):
    """
    Register every keyword of a record in the global keywordList counter.

    Keywords are the union of the two keyword fields plus the benamning,
    counted case-insensitively after empty entries have been dropped.

    @param amnesord: list of keywords
    @param motiv_amnesord: list of motif keywords
    @param benamning: the name/designation field
    @return: str error message when no keywords exist, else None
    """
    candidates = common.trim_list(
        amnesord + motiv_amnesord + [benamning, ])
    if not candidates:
        return u'Inga ämnesord'
    for keyword in candidates:
        helpers.addOrIncrement(keywordList, keyword.lower())
def add_identification_data(self, data):
    """
    Parse the lido:objectIdentificationWrap block of a record.

    Populates titles, inscriptions, descriptions and measurements on
    self and validates that the repository is the expected institution
    (VIAF 147742988, i.e. Nationalmuseum). Any tag that is neither
    handled nor explicitly skipped is reported via flag_missed_tags.

    @param data: the lido:objectIdentificationWrap contents as a dict
    """
    handled_tags = list()
    skipped_tags = [
        '',
    ]
    tag = u'lido:objectIdentificationWrap'

    # add title
    handled_tags.append(u'lido:titleWrap')
    titles = common.listify(data[u'lido:titleWrap'][u'lido:titleSet'])
    self.titles = get_lang_values_from_set(
        titles, (u'lido:appellationValue', ))

    # add inscription
    handled_tags.append(u'lido:inscriptionsWrap')
    inscriptions = common.listify(
        data[u'lido:inscriptionsWrap'][u'lido:inscriptions'])
    self.inscriptions = get_lang_values_from_set(
        inscriptions, (u'lido:inscriptionTranscription', ))

    # add description
    handled_tags.append(u'lido:objectDescriptionWrap')
    description_set = data[u'lido:objectDescriptionWrap'][
        u'lido:objectDescriptionSet']
    # multiple description sets (a list) are unexpected; warn but
    # continue — the indexing below would then fail loudly
    if not isinstance(description_set, OrderedDict):
        pywikibot.warning(
            "Weird things are happening in description field for %s:\n%s"
            % (self.source_file, description_set))
    descriptions = common.listify(
        description_set[u'lido:descriptiveNoteValue'])
    self.descriptions = get_lang_values_from_set(descriptions)

    # add measurements
    handled_tags.append(u'lido:objectMeasurementsWrap')
    measurement_set = data[u'lido:objectMeasurementsWrap'][
        u'lido:objectMeasurementsSet']
    # warn about any measurement sub-tags beyond the two we know of
    if set(measurement_set.keys()) - set(
            [u'lido:displayObjectMeasurements', u'lido:objectMeasurements']):
        pywikibot.warning(
            "Weird things are happening in measurement field for %s:\n%s"
            % (self.source_file, measurement_set))
    self._debug(measurement_set.get(u'lido:displayObjectMeasurements'))
    measurements = common.trim_list(
        common.listify(
            measurement_set.get(u'lido:displayObjectMeasurements')))
    self._debug(measurements)
    # NOTE(review): method name carries a typo ("meaurements") — defined
    # elsewhere in this class, left as-is
    self.add_meaurements(measurements)

    # ensure location is always Nationalmuseum
    handled_tags.append(u'lido:repositoryWrap')
    repository_viaf = data[u'lido:repositoryWrap']['lido:repositorySet'][
        'lido:repositoryName']['lido:legalBodyID']['#text']
    if repository_viaf != u'http://viaf.org/viaf/147742988':
        pywikibot.warning("Unexpected repoitory in %s: %s"
                          % (self.source_file, repository_viaf))

    flag_missed_tags(data, tag, handled_tags, skipped_tags)
def load_data(self, in_file):
    """
    Load the provided data files.

    Outputs a tuple with lido data as a dict and image filenames as a
    list.

    @param in_file: the path to the metadata file
    @return: (dict, list)
    """
    metadata_path, image_list_path = in_file[0], in_file[1]
    lido_data = common.open_and_read_file(metadata_path, as_json=True)
    # one filename per line; trim_list drops empty entries
    raw_lines = common.open_and_read_file(image_list_path).split('\n')
    return (lido_data, common.trim_list(raw_lines))
def get_type(self, item):
    """
    Get the object type of an item.

    Looks up the item in the previously harvested wikidata paintings and
    maps its wikidata types via self.type_mappings.

    @param item: the metadata item to process
    @return: str the mapped type, or '' when none (or several) matched
    """
    typ = ''
    # get previous wikidata info
    wd_data = self.wd_paintings.get(item.get_obj_id())
    if wd_data:
        # 'types' may be missing from the wikidata entry; "or []" guards
        # against iterating None (previously a TypeError)
        mapped = [self.type_mappings.get(t)
                  for t in wd_data.get('types') or []]
        # drop unmapped (None) entries
        types = common.trim_list(mapped)
        if len(types) == 1:
            typ = types[0]
        elif len(types) > 1:
            pywikibot.warning("Found %d matching types for %s"
                              % (len(types), item.get_obj_id()))
    return typ
def generate_content_cats(self, item):
    """
    Produce categories related to the media file contents.

    @param item: the metadata for the media file in question
    @return: list of categories (without "Category:" prefix)
    """
    categories = []
    wd_painting = self.wd_paintings.get(item.get_obj_id())
    commons_cats = wd_painting.get('commons_cats') if wd_painting else None
    if commons_cats:
        # a known commonscat overrides all other content categories
        categories += common.trim_list(commons_cats)
    else:
        # drain the depicted/artist trackers until they are empty
        for tracker in ('depicted', 'artist'):
            entry = item.get_from_tracker(tracker)
            while entry:
                categories.append(entry)
                entry = item.get_from_tracker(tracker)
    # remove any duplicates
    return list(set(categories))
def add_creation(self, event):
    """
    Parse a lido:event block describing the creation of the object.

    Populates creator, creation_date, creation_place and techniques on
    self. Any tag that is neither handled nor explicitly skipped is
    reported via flag_missed_tags.

    @param event: the lido:event contents as a dict
    """
    handled_tags = list()
    skipped_tags = ['lido:eventName', u'lido:eventType']
    tag = u'lido:event'

    # add creator(s)
    handled_tags.append(u'lido:eventActor')
    self.creator = {}
    self.handle_creators(
        common.trim_list(common.listify(event[u'lido:eventActor'])))

    # add creation_date; earliest/latest may each be absent (None)
    handled_tags.append(u'lido:eventDate')
    self.creation_date = {}
    if event.get(u'lido:eventDate'):
        self.creation_date['earliest'] = event[u'lido:eventDate'][
            u'lido:date'].get(u'lido:earliestDate')
        self.creation_date['latest'] = event[u'lido:eventDate'][
            u'lido:date'].get(u'lido:latestDate')
        self.creation_date['text'] = get_lang_values_from_set(
            common.listify(event[u'lido:eventDate'][u'lido:displayDate']))

    # add creation place
    # NOTE(review): unlike eventDate this is not guarded — presumably
    # lido:eventPlace is always present; a KeyError here would flag a
    # record violating that assumption
    handled_tags.append(u'lido:eventPlace')
    self.creation_place = get_lang_values_from_set(
        common.listify(event[u'lido:eventPlace'][u'lido:place']
                       [u'lido:namePlaceSet']),
        (u'lido:appellationValue', ))

    # add materialtech
    handled_tags.append(u'lido:eventMaterialsTech')
    self.techniques = get_lang_values_from_set(
        common.listify(event[u'lido:eventMaterialsTech']),
        (u'lido:materialsTech', u'lido:termMaterialsTech', u'lido:term'))

    flag_missed_tags(event, tag, handled_tags, skipped_tags)
def csv_file_to_dict(filename, key_col, header_check, non_unique=False, keep=None, lists=None, delimiter='|', list_delimiter=';', codec='utf-8'): """ Open a csv file and returns a dict of dicts, using the header row for keys. Non-unique columns are concatenated into lists. Note that this structure does not survive the csv_file_to_dict -> dict_to_csv_file roundtrip. @param filename: the file to open @param key_col: the (label of the) column to use as a key in the dict str or tuple of strs to combine (with a ":") @param header_check: a string to check against the header row @param non_unique: whether non-unique column headings are expected (defaults to False) @param keep: tuple of columns to keep (defaults to None=all) @param lists: tuple of columns to treat as lists (defaults to None=none) @param delimiter: the used delimiter (defaults to "|") @param list_delimiter: the used delimiter when encountering a list @param codec: the used encoding (defaults to "utf-8") @return: dict """ # load and parse file header, lines = open_csv_file(filename, delimiter=delimiter, codec=codec) # verify header == headerCheck (including order) if header_check.split(delimiter) != header: raise MyError("Header missmatch.\nExpected: %s\nFound:%s" % (header_check, delimiter.join(header))) # convert txt key to numeric key try: key_col_num = None if isinstance(key_col, tuple): key_col_num = [] for key in key_col: key_col_num.append(header.index(key)) key_col_num = tuple(key_col_num) else: key_col_num = header.index(key_col) except ValueError: raise MyError("key_col not found in header") # set up columns to keep and listify columns cols = find_cols(keep, 'keep', header, default_all=True) non_unique_cols = find_non_unique_cols(header, list(cols), non_unique) listify = find_cols(lists, 'lists', header, default_all=False) # verify key_col is valid validate_key_col(key_col, lists, non_unique_cols, list(cols), header) # load to dict d = dict() for l in lines: if not l: continue parts = 
l.split(delimiter) # set key key = None if isinstance(key_col_num, tuple): keys = [] for key_num in key_col_num: keys.append(parts[key_num].strip()) key = ':'.join(keys) else: key = parts[key_col_num].strip() # check uniqueness if key in d: raise MyError("Non-unique key found: %s" % key) d[key] = dict() for k, v in cols.items(): if k in non_unique_cols: d[key][k] = [] for nv in non_unique_cols[k]: if k in listify: d[key][k] += parts[nv].strip().split(list_delimiter) else: d[key][k].append(parts[nv].strip()) d[key][k] = trim_list(d[key][k]) else: if k in listify: d[key][k] = trim_list( parts[v].strip().split(list_delimiter)) else: d[key][k] = parts[v].strip() return d
def checkLine(line, idnos):
    """
    Validate a single pipe-separated metadata line.

    Runs all field validators, updates the kNav/ort/material/benamning
    trackers and compiles the individual log messages.

    @param line: the raw metadata line to check
    @param idnos: previously seen identifiers (for the uniqueness test)
    @return: (idno, logtext) tuple; both empty strings for an empty line
    """
    if len(line) == 0:
        return '', ''
    log = []
    params = line.split('|')
    idno = params[0].strip()
    typ = params[1].strip()
    benamning = params[2].strip()
    material = params[3].strip().split(',')
    namn_konstnar = helpers.flip_name(params[4].strip())
    namn_konstnar_knav = params[5].strip()
    namn_konstruktor = [params[6].strip(), ]
    namn_konstruktor_knav = params[7].strip()
    namn_konstruktor.append(params[8].strip())
    namn_fotograf = params[9].strip()
    namn_tillverkare = [params[10].strip(), ]
    namn_tillverkare.append(params[11].strip())
    namn_tillverkare.append(params[12].strip())
    date_foto = params[13].strip()
    date_produktion = params[14].strip()
    avbildad_namn = [helpers.flip_name(params[15].strip()), ]
    avbildad_namn_knav = params[16].strip()
    avbildad_namn.append(params[17].strip())
    avbildad_namn.append(params[18].strip())
    avbildad_ort = params[19].strip()
    amnesord = params[20].strip().split(',')
    beskrivning = params[21].strip()
    motiv_amnesord = params[22].strip().split(',')
    motiv_beskrivning = params[23].strip()
    rattighet = params[24].strip()
    samling = params[25].strip()
    dimukod = params[26].strip()  # column 26 (dimukod) is not validated

    # cleanup lists (drop empty entries)
    material = common.trim_list(material)
    namn_tillverkare = common.trim_list(namn_tillverkare)
    avbildad_namn = common.trim_list(avbildad_namn)
    namn_konstruktor = common.trim_list(namn_konstruktor)
    amnesord = common.trim_list(amnesord)
    motiv_amnesord = common.trim_list(motiv_amnesord)

    # kNav; the extra list checks guard against indexing a list emptied
    # by trim_list above
    if len(namn_konstnar_knav) > 0:
        addTokNavList(namn_konstnar_knav, namn_konstnar)
    if len(avbildad_namn_knav) > 0 and avbildad_namn:
        addTokNavList(avbildad_namn_knav, avbildad_namn[0])
    if len(namn_konstruktor_knav) > 0 and namn_konstruktor:
        # bug fix: was registered under avbildad_namn_knav (copy-paste);
        # the konstruktor uuid belongs with the konstruktor name
        addTokNavList(namn_konstruktor_knav,
                      helpers.flip_name(namn_konstruktor[0]))

    log.append(testId(idno, idnos))
    log.append(checkType(typ))
    log.append(testRight(rattighet))
    log.append(testCollection(samling))
    log.append(testKeywords(amnesord, motiv_amnesord, benamning))
    log.append(testDescription(beskrivning, motiv_beskrivning))
    for namn in namn_tillverkare:
        log.append(testName(namn))
    for namn in avbildad_namn:
        log.append(testName(namn))
    for namn in namn_konstruktor:
        log.append(testName(namn))
    # bug fix: namn_fotograf was tested (and logged) twice
    log.append(testName(namn_fotograf))
    log.append(testName(namn_konstnar))
    log.append(testDateRange(date_foto))
    log.append(testDateRange(date_produktion))

    # test filenames
    log.append(
        testNameGeneration(idno, typ, benamning, motiv_beskrivning,
                           avbildad_namn, avbildad_ort, date_foto,
                           date_produktion))

    # some counters
    if len(avbildad_ort) > 0:
        helpers.addOrIncrement(ortList, avbildad_ort)
    for m in material:
        helpers.addOrIncrement(materialList, m.lower())
    if len(benamning) > 0:
        helpers.addOrIncrement(benamningList, benamning.lower())

    # compile and return; empty/None log entries are skipped
    logtext = ''
    for l in log:
        if l:
            logtext += u'%s. ' % l
    return idno, logtext.strip()
def main(*args):
    """Command line entry-point."""
    usage = (
        'Usage:'
        '\tpython uploader.py -in_path:PATH -dir:PATH -cutoff:NUM\n'
        '\t-in_path:PATH path to the directory containing the media files or '
        'to the make_info output file if "-type" is set to url\n'
        '\t-type:STRING the type of upload to make. Must be either "FILES" '
        'or "URL". Defaults to FILES (optional)\n'
        '\t-dir:PATH specifies the path to the directory containing a '
        'user_config.py file (optional)\n'
        '\t-cutoff:NUM stop the upload after the specified number of files '
        '(optional)\n'
        '\t-confirm Whether to output a confirmation after each upload '
        'attempt (optional)\n'
        '\t-test Whether to do mock upload, simply outputting to commandline. '
        '(optional)\n'
        '\t-nochunk Whether to turn off chunked uploading, this is slow '
        'and does not support files > 100Mb (optional, type:FILES only)\n'
        '\t-only:PATH to file containing list of urls to upload, skipping all '
        'others. One entry per line. (optional, type:URL only)\n'
        '\t-skip:PATH to file containing list of urls to skip, uploading all '
        'others. Can be combined with "-only" for further filtering, e.g '
        '"-only:<list of vase images> -skip:<list of blue images>" to get '
        'non-blue vase images. One entry per line. (optional, type:URL only)\n'
        '\tExample:\n'
        '\tpython uploader.py -in_path:../diskkopia -cutoff:100\n')

    # defaults
    cutoff = None
    in_path = None
    test = False
    confirm = False
    chunked = True
    typ = 'files'
    only = None
    skip = None

    # Load pywikibot args and handle local args
    for arg in pywikibot.handle_args(args):
        option, sep, value = arg.partition(':')
        if option == '-cutoff':
            # NOTE(review): a non-positive or non-numeric value is
            # silently ignored rather than reported
            if common.is_pos_int(value):
                cutoff = int(value)
        elif option == '-in_path':
            in_path = value
        elif option == '-test':
            test = True
        elif option == '-confirm':
            confirm = True
        elif option == '-nochunk':
            chunked = False
        elif option == '-type':
            # only 'url' and 'files' (the default) are accepted; any
            # other value prints usage and aborts
            if value.lower() == 'url':
                typ = 'url'
            elif value.lower() not in ('url', 'files'):
                pywikibot.output(usage)
                return
        elif option == '-only':
            # one url per line; trim_list drops empty entries
            only = common.trim_list(
                common.open_and_read_file(value).split('\n'))
        elif option == '-skip':
            skip = common.trim_list(
                common.open_and_read_file(value).split('\n'))
        elif option == '-usage':
            pywikibot.output(usage)
            return

    # dispatch to the selected upload mode
    if in_path:
        if typ == 'files':
            up_all(in_path, cutoff=cutoff, test=test, verbose=confirm,
                   chunked=chunked)
        elif typ == 'url':
            up_all_from_url(in_path, cutoff=cutoff, only=only, skip=skip,
                            test=test, verbose=confirm)
    else:
        pywikibot.output(usage)
def make_item_from_raw(entry, smm_info):
    """
    Given the raw metadata for an item, construct an SMMItem.

    @param entry: the raw metadata entry as a dict
    @param smm_info: the parent smm_info instance
    @return: SMMItem
    """
    d = {}
    # map to internal labels and flip names
    d['idno'] = entry[u'Identifikationsnr']
    d['typ'] = entry[u'Typ av objekt']
    d['benamning'] = entry[u'Benämning']
    d['material'] = entry[u'Material']
    d['namn_konstnar'] = helpers.flip_name(entry[u'Namn-Konstnär'])
    namn_konstnar_knav = entry[u'Konstnär-KulturNav']
    d['namn_konstruktor'] = helpers.flip_names(entry[u'Namn-Konstruktör'])
    namn_konstruktor_knav = entry[u'Konstruktör-KulturNav']
    d['namn_fotograf'] = helpers.flip_name(entry[u'Namn-Fotograf'])
    d['namn_tillverkare'] = helpers.flip_names(entry[u'Namn-Tillverkare'])
    d['date_foto'] = entry[u'Datering-Fotografering']
    d['date_produktion'] = entry[u'Datering-Produktion']
    # assumes this entry value is a list of names (it is iterated and
    # indexed below) — TODO confirm against the csv loader's lists config
    avbildad_namn = entry[u'Avbildade namn']
    avbildad_namn_knav = entry[u'Avbildade-KulturNav']
    d['avbildad_ort'] = entry[u'Avbildade - orter']
    d['amnesord'] = entry[u'Ämnesord']
    d['beskrivning'] = entry[u'Beskrivning']
    d['motiv_amnesord'] = entry[u'Motiv-ämnesord']
    d['motiv_beskrivning'] = entry[u'Motiv-beskrivning']
    d['rattighet'] = entry[u'Rättigheter']
    d['samling'] = entry[u'Samling']
    d['dimukod'] = entry[u'Dimukode']

    # handle kulturNav: register each uuid together with its primary
    # (flipped) name
    if namn_konstnar_knav:
        smm_info.add_to_k_nav_list(namn_konstnar_knav, d['namn_konstnar'])
    if namn_konstruktor_knav:
        smm_info.add_to_k_nav_list(namn_konstruktor_knav,
                                   d['namn_konstruktor'][0])
    if avbildad_namn_knav:
        smm_info.add_to_k_nav_list(avbildad_namn_knav,
                                   helpers.flip_name(avbildad_namn[0]))

    # split avbildad_namn into people and ships/boat types
    # a person is anyone with a name like Last, First
    d['avbildad_person'] = []
    d['avbildat_fartyg'] = []
    for a in avbildad_namn:
        if a != helpers.flip_name(a):
            d['avbildad_person'].append(helpers.flip_name(a))
        else:
            d['avbildat_fartyg'].append(a)

    # add to dict, now with flipped names
    d['avbildad_namn'] = d['avbildad_person'] + d['avbildat_fartyg']

    # cleanup lists (drop empty entries)
    d['avbildad_person'] = common.trim_list(d['avbildad_person'])
    d['avbildat_fartyg'] = common.trim_list(d['avbildat_fartyg'])
    d['avbildad_namn'] = common.trim_list(d['avbildad_namn'])

    # cleanup blacklisted: blank out dates/names matching the bad_date
    # and bad_namn blocklists on smm_info
    if d['date_foto'].strip('.').lower() in smm_info.bad_date:
        d['date_foto'] = ''
    if d['date_produktion'].strip('.').lower() in smm_info.bad_date:
        d['date_produktion'] = ''
    if d['namn_konstnar'].lower() in smm_info.bad_namn:
        d['namn_konstnar'] = ''
    if d['namn_fotograf'].lower() in smm_info.bad_namn:
        d['namn_fotograf'] = ''

    return SMMItem(d)