def parseArtwork(contents, pagename):
    '''
    Collect the artworks listed in the sculpture tables of a wikipage.

    Every table opened by {{Skulpturlista-huvud}} is scanned for
    {{Skulpturlista ...}} rows, and each row is turned into a dict of its
    named parameters.

    input:
        contents: wikicode of the page
        pagename: name of the page, stored under 'page' in every artwork
    output: list of artwork-dict items, in the order they were found
    '''
    artworks = []
    while True:
        # NOTE(review): findUnit is unpacked as (unit, remainder, lead-in);
        # the exact contract lives in the common module - confirm there.
        table, contents, lead_in = common.findUnit(
            contents, u'{{Skulpturlista-huvud}}', u'|}')
        if not table:
            break

        # a '== heading ==' line directly before the table names the section
        last_lead_row = lead_in.strip(' \n').split('\n')[-1]
        if last_lead_row.startswith(u'=='):
            header = last_lead_row.strip(u' =')
        else:
            header = ''

        while True:
            unit, table, dummy = common.findUnit(
                table, u'{{Skulpturlista', u'}}',
                brackets={u'{{': u'}}'})
            if not unit:
                break

            # every known parameter starts out empty
            artwork = {
                u'id': '',
                u'namn': '',
                u'skulptör': '',
                u'årtal': '',
                u'material': '',
                u'plats': '',
                u'koordinater': '',
                u'bild': '',
                u'namn_link': '',
                u'skulptör_link': '',
                u'plats_link': '',
                u'lat': '',
                u'lon': '',
                u'header': header,
                u'page': pagename,
            }
            while True:
                part, unit, dummy = common.findUnit(
                    unit, u'|', u'\n',
                    brackets={u'[[': u']]', u'{{': u'}}'})
                if not part:
                    break
                if u'=' not in part:
                    # not a named parameter
                    continue

                cleaned = part.replace(u'<small>', '')
                cleaned = cleaned.replace(u'</small>', '')
                cleaned = cleaned.strip(' \n\t')
                # cut at the first = only, the coord value holds a second one
                key, dummy, value = cleaned.partition(u'=')
                key = key.strip()
                value = value.strip()
                if not value:
                    continue
                if key in artwork:
                    artwork[key] = value
                else:
                    print(u'Unrecognised parameter: %s = %s' % (key, value))
            artworks.append(artwork.copy())
        # end units
    # end tables
    return artworks
def parseArtwork(contents, pagename):
    '''
    Given the contents of a wikipage this returns the artworks listed in it.

    The page is searched for tables starting with {{Skulpturlista-huvud}}
    and ending with |}. Each {{Skulpturlista ...}} row inside such a table
    becomes one dict holding the row's recognised, non-empty parameters.
    Unrecognised parameters are reported on stdout and otherwise ignored.

    input:
        contents: wikicode of the page
        pagename: name of the page, stored under 'page' in every item
    output: list of artwork-dict items
    '''
    units = []
    while True:
        # NOTE(review): the three return values are used here as
        # (found unit, remaining text, text before the unit) - confirm
        # against common.findUnit.
        table, contents, lead_in = common.findUnit(
            contents, u'{{Skulpturlista-huvud}}', u'|}')
        if not table:
            break

        # try to isolate a header row: the last line before the table,
        # if it is a '== heading ==' line
        header = ''
        lead_rows = lead_in.strip(' \n').split('\n')
        if lead_rows[-1].startswith(u'=='):
            header = lead_rows[-1].strip(u' =')

        while True:
            unit, table, dummy = common.findUnit(
                table, u'{{Skulpturlista', u'}}',
                brackets={u'{{': u'}}'})
            if not unit:
                break
            u = {
                u'id': '',
                u'namn': '',
                u'skulptör': '',
                u'årtal': '',
                u'material': '',
                u'plats': '',
                u'koordinater': '',
                u'bild': '',
                u'namn_link': '',
                u'skulptör_link': '',
                u'plats_link': '',
                u'lat': '',
                u'lon': '',
                u'header': header,
                u'page': pagename
            }
            while True:
                part, unit, dummy = common.findUnit(
                    unit, u'|', u'\n',
                    brackets={u'[[': u']]', u'{{': u'}}'})
                if not part:
                    break
                if u'=' in part:
                    part = part.replace(u'<small>', '')
                    part = part.replace(u'</small>', '')
                    part = part.strip(' \n\t')
                    # can't use split as coord uses second equality sign
                    pos = part.find(u'=')
                    key = part[:pos].strip()
                    value = part[pos + 1:].strip()
                    # empty values keep the default ''
                    if value:
                        if key in u:
                            u[key] = value
                        else:
                            # single-argument call form prints the same
                            # under Python 2 and Python 3
                            print(u'Unrecognised parameter: %s = %s'
                                  % (key, value))
            units.append(u.copy())
        # end units
    # end tables
    return units
def listToObjects(objects, pagename, list_wd, contents):
    '''
    Parses a wikilist into relevant objects.

    Every table starting with {{Offentligkonstlista-huvud and ending with
    |} is read: its first line supplies the header parameters and each
    {{Offentligkonstlista ...}} row becomes a parameter dict which is
    stored in objects under the row's id. Rows without an id are skipped.
    If an object with the same id already exists it is kept and only its
    'clash' entry is set to pagename.
    Does not deal with wikitext/linking etc.

    input:
        objects: dict of id -> parameter dict, updated in place
        pagename: name of the page; stored under 'page' and used in the log
        list_wd: stored unchanged under 'list' in every new object
        contents: wikicode of the page; a falsy value is reported as a
            missing or invalid page
    output: log (string) of the problems encountered, None if there were
        none
    '''
    if not contents:
        return u'The page %s is missing or invalid\n' % pagename

    log = ''
    # the following should be treated as booleans: their presence sets
    # True, whatever value they were given
    boolParams = [u'döljKommun', u'döljStadsdel', u'tidigare', u'visaId']

    while True:
        # NOTE(review): findUnit is unpacked as (unit, remainder, lead-in);
        # confirm the exact contract in the common module.
        table, contents, dummy = common.findUnit(
            contents, u'{{Offentligkonstlista-huvud', u'|}')
        if not table:
            break

        # the header parameters are on the first line of the table
        header_end = table.find('\n')
        if header_end == -1:
            # no newline at all: the whole table is the header line
            # (slicing with -1 would drop its last character)
            header_end = len(table)
        header = table[:header_end]
        table = table[header_end:]

        # read in header parameters
        headerDict = {
            u'län': None,
            u'kommun': None,
            u'stadsdel': None,
            u'tidigare': None,
            u'visaId': None
        }
        for p in header.split('|'):
            if '=' not in p:
                continue
            # split on the first = only so that a value containing = is
            # kept whole, same as for the row parameters below
            key, dummy, value = p.partition('=')
            key = key.strip(' \n\t}')
            value = value.strip(' \n\t}')
            if key not in headerDict:
                log += u'Unrecognised header parameter: %s = %s (%s)\n' % (
                    key, value, pagename)
            elif key in boolParams:
                headerDict[key] = True
            else:
                headerDict[key] = value

        while True:
            row, table, dummy = common.findUnit(
                table, u'{{Offentligkonstlista', u'}}',
                brackets={u'{{': u'}}'})
            if not row:
                break
            params = {
                u'id': '',
                u'id-länk': '',
                u'titel': '',
                u'aka': '',
                u'artikel': '',
                u'konstnär': '',
                u'konstnär2': '',
                u'konstnär3': '',
                u'konstnär4': '',
                u'konstnär5': '',
                u'konstnär6': '',
                u'konstnär7': '',
                u'konstnär8': '',
                u'konstnär9': '',
                u'årtal': '',
                u'beskrivning': '',
                u'typ': '',
                u'material': '',
                u'fri': '',
                u'plats': '',
                u'inomhus': '',
                u'län': '',
                u'kommun': '',
                u'stadsdel': '',
                u'lat': '',
                u'lon': '',
                u'bild': '',
                u'commonscat': '',
                u'fotnot': '',
                u'fotnot-namn': '',
                u'döljKommun': False,
                u'döljStadsdel': False,
                u'visaId': False,
                u'fotnot2': '',
                u'fotnot2-namn': '',
                u'fotnot3': '',
                u'fotnot3-namn': '',
                u'page': pagename,
                u'list': list_wd,
                'clash': None,
                # the same headerDict instance is shared by all rows of
                # this table
                'header': headerDict
            }
            while True:
                part, row, dummy = common.findUnit(
                    row, u'|', None,
                    brackets={u'[[': u']]', u'{{': u'}}'})
                if not part:
                    break
                if u'=' not in part:
                    continue
                # split on the first = only,
                # to avoid problems with = signs in urls etc.
                key, dummy, value = part.partition(u'=')
                key = key.strip(' \n\t')
                value = value.strip(' \n\t')
                if key not in params:
                    log += u'Unrecognised parameter: %s = %s (%s)\n' % (
                        key, value, pagename)
                elif key in boolParams:
                    params[key] = True
                else:
                    params[key] = value

            if params['id']:
                # membership is tested on the dict itself; going through
                # keys() is a linear list scan per row under Python 2
                if params['id'] in objects:
                    # until I can deal with multiple entries
                    objects[params['id']]['clash'] = pagename
                else:
                    objects[params['id']] = params.copy()

    # callers get None rather than an empty log
    return log or None