import sys

import requests
from ksamsok import KSamsok
from flask import Flask, request, Response
from flask_restful import Api, Resource, abort

# http://www.ianbicking.org/illusive-setdefaultencoding.html
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding('utf-8')

app = Flask(__name__)
api = Api(app)
soch = KSamsok('test')

# CORS headers
@app.after_request
def after_request(resp):
    resp.headers.add('Access-Control-Allow-Origin', '*')
    resp.headers.add('Access-Control-Allow-Methods', 'GET, HEAD, OPTIONS')
    resp.headers.add('X-Powered-By', 'K-Samsök REST Version 1.2.0')
    return resp

class Record(Resource):
    def get(self, uri):
        if 'application/json+ld' in request.headers.get('Accept', ''):
            url = soch.formatUri(uri, 'jsonldurl')
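# Hedged sketch, not part of the original file: a Flask-RESTful resource such as
# Record above is normally registered and served roughly like this. The route
# '/record/<path:uri>' and the port are illustrative placeholders, not taken
# from the project.
api.add_resource(Record, '/record/<path:uri>')

if __name__ == '__main__':
    app.run(port=5000)

# Example request against the running service (placeholder identifier):
#   curl -H 'Accept: application/json+ld' http://localhost:5000/record/raa/bbr/21300000003265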
import re
from typing import Dict, List, Union

import requests
from ksamsok import KSamsok


class UGC:
    def __init__(self, endpoint: str = 'https://ugc.kulturarvsdata.se/', key: str = None) -> None:
        self.endpoint = endpoint + 'UGC-hub/'
        self.key = key
        self.headers = {
            'User-Agent': 'SOCH-UGC.py'
        }
        self.soch = KSamsok()
        self.relation_types = [
            'sameAs', 'isDescribedBy', 'describes', 'visualizes', 'hasPart',
            'isPartOf', 'isVisualizedBy', 'isContainedIn', 'author', 'authorOf',
            'hasBeenUsedIn', 'isRelatedTo', 'architectOf', 'architect', 'user',
            'userOf', 'child', 'mother', 'father', 'photographerOf',
            'photographer', 'isMentionedBy', 'mentions'
        ]

    def get_total_items_count(self) -> str:
        url = '{}api?method=retrieve&scope=count&objectUri=all&format=json'.format(self.endpoint)
        data = self.make_get_request(url)

        return data['response']['relations']['numberOfRelations']

    def get_item(self, item_id: Union[int, str]) -> Union[Dict, bool]:
        url = '{}api?method=retrieve&objectUri=all&contentId={}&scope=single&format=json'.format(self.endpoint, item_id)
        data = self.make_get_request(url)

        if data['response']['relations'][0]['id'] == 0:
            return False

        return data['response']['relations'][0]

    def search_items(self, uri: str = 'all', offset: int = 0, limit: int = 50) -> List:
        url = '{}api?method=retrieve&scope=all&objectUri={}&selectFrom={}&maxCount={}&format=json'.format(self.endpoint, uri, offset, limit)
        data = self.make_get_request(url)

        return data['response']['relations']

    def delete_item(self, item_id: Union[int, str]) -> bool:
        # check for the key before making the request, so we never call the API without one
        if not self.key:
            raise ValueError('This action requires an API key.')

        url = '{}api?x-api={}&method=delete&objectId={}&format=json'.format(self.endpoint, self.key, item_id)
        data = self.make_get_request(url)

        if data['response']['result'] == 'SUCCESS':
            return True

        return False

    def create_item_relation(self, kulturarvsdata_uri: str, relation: str, target: str, user: str, comment: str = None) -> bool:
        kulturarvsdata_uri = self.soch.formatUri(kulturarvsdata_uri, 'rawurl')
        if not kulturarvsdata_uri:
            raise ValueError('{} is not a valid Kulturarvsdata URI.'.format(kulturarvsdata_uri))

        if relation not in self.relation_types:
            raise ValueError('{} is not a valid relation type.'.format(relation))

        if not self.valid_relation_target(target):
            raise ValueError('{} is not a valid target.'.format(target))

        if not self.key:
            raise ValueError('This action requires an API key.')

        url = '{}api?x-api={}&method=insert&scope=relationAll&objectUri={}&user={}&relationType={}&relatedTo={}&format=json'.format(self.endpoint, self.key, kulturarvsdata_uri, user, relation, target)
        if comment:
            url = '{}&comment={}'.format(url, comment)

        data = self.make_get_request(url)
        if data['response']['result'] == 'SUCCESS':
            return True

        return False

    def valid_relation_target(self, target: str) -> bool:
        if target.startswith('http://kulturarvsdata.se/'):
            if not self.soch.formatUri(target, 'rawurl'):
                return False
            return True
        if target.startswith('https://commons.wikimedia.org/wiki/File:'):
            return True
        if target.startswith('https://commons.wikimedia.org/wiki/Category:'):
            return True
        if target.startswith('http://www.wikidata.org/entity/Q'):
            return True
        if target.startswith('http://commons.wikimedia.org/entity/M'):
            return True
        if target.startswith('http://kulturnav.org/'):
            return True
        if target.startswith('http://viaf.org/viaf/'):
            return True
        if target.startswith('http://vocab.getty.edu/ulan/'):
            return True
        if target.startswith('http://iconclass.org/'):
            return True
        if target.startswith('http://data.europeana.eu/'):
            return True
        if re.match(r'^https:\/\/libris\.kb\.se\/.{14,17}$', target):
            return True
        if re.match(r'^https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+', target):
            return True

        return False

    def make_get_request(self, url: str) -> Dict:
        r = requests.get(url, headers=self.headers)

        # note: UGC always returns 200 codes for now.
        if r.status_code == 401:
            raise PermissionError('Bad API key.')

        data = r.json()
        if 'error' in data['response']:
            raise Exception('Unknown error: {}'.format(data['response']['error']))

        return data
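# Hedged usage sketch, not part of the library itself: how the UGC client above
# might be called, assuming the class is in scope. 'my-api-key', the user name
# and the URIs below are placeholders, and the calls go to the live UGC-hub
# endpoint, so treat this as illustration only.
ugc = UGC(key='my-api-key')

# read-only calls work without a key
print(ugc.get_total_items_count())
for relation in ugc.search_items(uri='all', offset=0, limit=5):
    print(relation)

# writing a relation requires a key; invalid input raises ValueError
ugc.create_item_relation(
    'http://kulturarvsdata.se/raa/bbr/21300000003265',  # placeholder URI
    'isDescribedBy',
    'https://sv.wikipedia.org/wiki/Exempel',             # placeholder target
    user='ExampleUser',
    comment='added via SOCH-UGC.py')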
import re

import wikibot
from ksamsok import KSamsok

#
# This bot is used to fix broken BBR links by guessing
# alternative URIs and checking whether the new ones validate in KSamsok-py.
#
# user-config.py:
#     family = 'wikidata'
#     mylang = 'wikidata'
#

sparql = 'SELECT ?item ?c WHERE { ?item wdt:P1260 ?c . FILTER(CONTAINS(?c, "bbr/")) }'

culturalSearch = KSamsok('test')
bot = wikibot.Bot(True)
generator = wikibot.Generator.newSparQLGenerator(bot, sparql)

for p in generator:
    item = wikibot.Item(p)
    uri = item.item.claims['P1260'][0].getTarget()

    if not culturalSearch.formatUri(uri, 'raw', True):
        # the URI was not valid, create the other possible BBR URIs
        uri_a = re.sub('bbr', 'bbra', uri)
        uri_b = re.sub('bbr', 'bbrb', uri)
        uri_m = re.sub('bbr', 'bbrm', uri)

        if culturalSearch.formatUri(uri_a, 'raw', True):
            print('bbra')
            target = uri_a
            item.item.claims['P1260'][0].changeTarget(target)
def index(self):
    # setup pywikibot and initialize generator
    pywikibot.handle_args(sys.argv[1:])
    site = pywikibot.Site()
    generator = pagegenerators.WikidataSPARQLPageGenerator(self.sparql, site)

    # setup instance of the KSamsok class
    # the api key is never used so no need to use another one
    soch = KSamsok('test')

    for i in generator:
        item = i.get()
        data = {}

        # get the raw wikidata uri without Q
        data['wikidata'] = re.sub(r'(?!\d).', '', str(i))

        # make sure the item does not exist in our database
        if not self.primary_key_exists(data['wikidata']):
            # parse the kulturarvsdata uri or set to false if invalid
            data['kulturarvsdata'] = soch.formatUri(
                item['claims']['P1260'][0].getTarget(), 'raw', True)

            # TODO make a log of items with broken kulturarvsdata uris
            if data['kulturarvsdata']:
                # fetch stuff from the wikidata item
                data['wikipedia'] = item['sitelinks']['svwiki']

                try:
                    data['commons'] = item['claims']['P373'][0].getTarget()
                except KeyError:
                    data['commons'] = ''

                try:
                    data['image'] = re.sub(
                        r'\]\]', '',
                        re.sub(r'\[\[commons:', '',
                               str(item['claims']['P18'][0].getTarget())))
                except KeyError:
                    data['image'] = ''

                coord_pair = item['claims']['P625'][0].getTarget()
                data['lat'] = coord_pair.lat
                data['lon'] = coord_pair.lon

                data['label'] = item['labels']['sv']

                # fetch stuff from kulturarvsdata
                record = soch.getObject(data['kulturarvsdata'])
                if record['presentation']['description']:
                    # if the string is too short to be useful drop it
                    if len(record['presentation']['description']) > 30:
                        data['description'] = record['presentation']['description']
                    else:
                        data['description'] = ''
                else:
                    data['description'] = ''

                # fetch intro paragraphs from wikipedia
                # TODO if the connection to wikipedia fails then
                # the item should be dropped (may need to be refactored)
                try:
                    r = requests.get(
                        'https://sv.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro=&explaintext=&titles='
                        + data['wikipedia'])
                    result = r.json()

                    # the page id is the unknown key, the loop is for figuring it out
                    # dictionaries do not support indexing as they are unsorted...
                    for key in result['query']['pages']:
                        data['wp_description'] = result['query']['pages'][key]['extract']
                except KeyError:
                    data['wp_description'] = ''

                try:
                    if data['image'] != '':
                        r = requests.get(
                            'https://commons.wikimedia.org/w/api.php?action=query&format=json&prop=pageimages&piprop=thumbnail|name|original&pithumbsize=110&titles=File:'
                            + data['image'])
                        result = r.json()

                        for key in result['query']['pages']:
                            data['image_thumbnail'] = result['query']['pages'][key]['thumbnail']['source']
                            data['image_original'] = result['query']['pages'][key]['thumbnail']['original']
                    else:
                        data['image_thumbnail'] = ''
                        data['image_original'] = ''
                except KeyError:
                    data['image_thumbnail'] = ''
                    data['image_original'] = ''

                # write and commit church to db
                self.c.execute(
                    '''INSERT INTO `churches` (
                        `wikidata`,
                        `label`,
                        `kulturarvsdata`,
                        `description`,
                        `lat`,
                        `lon`,
                        `wikipedia`,
                        `wp_description`,
                        `commons`,
                        `image`,
                        `image_thumbnail`,
                        `image_original`
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                    (data['wikidata'], data['label'], data['kulturarvsdata'],
                     data['description'], data['lat'], data['lon'],
                     data['wikipedia'], data['wp_description'], data['commons'],
                     data['image'], data['image_thumbnail'], data['image_original']))
                self.db_connection.commit()
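# Hedged sketch, an assumption not shown in the original: index() above relies on
# self.primary_key_exists(), self.c and self.db_connection, which suggests a
# sqlite3-backed class. A minimal helper compatible with that usage could look
# like this; the table and column names are taken from the INSERT statement above.
def primary_key_exists(self, wikidata_id):
    # return True if a church row with this wikidata id is already stored
    self.c.execute(
        'SELECT 1 FROM `churches` WHERE `wikidata` = ? LIMIT 1',
        (wikidata_id,))
    return self.c.fetchone() is not None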
import re

import wikibot
from ksamsok import KSamsok

#
# This bot is used to detect broken/invalid kulturarvsdata statements.
# THIS BOT MAKES THOUSANDS OF HTTP CALLS TO THE Swedish National Heritage Board,
# BE NICE.
#
# user-config.py:
#     family = 'wikidata'
#     mylang = 'wikidata'
#

bot = wikibot.Bot()
soch = KSamsok('test')

sparql = 'SELECT ?item ?value WHERE { ?item wdt:P1260 ?value . }'
generator = wikibot.Generator.newSparQLGenerator(bot, sparql)

for item in generator:
    item.get()

    if item.claims:
        if 'P1260' in item.claims:
            claims = []
            for claim in item.claims['P1260']:
                target = claim.getTarget()

                if not soch.formatUri(target, 'raw', True):
                    print(item)
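# Hedged illustration, based only on how formatUri is used above: both bots treat
# a falsy return value from KSamsok.formatUri(uri, 'raw', True) as "this
# Kulturarvsdata URI is invalid". The URIs below are placeholders.
checker = KSamsok('test')

for candidate in ('http://kulturarvsdata.se/raa/bbr/21300000003265',
                  'http://example.com/not-a-kulturarvsdata-uri'):
    if checker.formatUri(candidate, 'raw', True):
        print('valid: {}'.format(candidate))
    else:
        print('invalid: {}'.format(candidate))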