Ejemplo n.º 1
0
    def __init__(self, endpoint: str = 'https://ugc.kulturarvsdata.se/', key: str = None) -> None:
        """Store the connection settings and build the helper objects.

        Args:
            endpoint: Base URL of the service; ``'UGC-hub/'`` is appended to it.
            key: API key, kept as given on ``self.key``.
        """
        self.key = key
        self.endpoint = ''.join((endpoint, 'UGC-hub/'))

        # default headers kept on the instance
        self.headers = {'User-Agent': 'SOCH-UGC.py'}

        self.soch = KSamsok()

        # relation type names known to this client
        self.relation_types = [
            'sameAs', 'isDescribedBy', 'describes', 'visualizes',
            'hasPart', 'isPartOf', 'isVisualizedBy', 'isContainedIn',
            'author', 'authorOf', 'hasBeenUsedIn', 'isRelatedTo',
            'architectOf', 'architect', 'user', 'userOf',
            'child', 'mother', 'father',
            'photographerOf', 'photographer',
            'isMentionedBy', 'mentions',
        ]
Ejemplo n.º 2
0
import requests

from ksamsok import KSamsok

from flask import Flask, request, Response
from flask_restful import Api, Resource, abort

# http://www.ianbicking.org/illusive-setdefaultencoding.html
# Python 2 only: reload `sys` and switch the default string encoding to
# UTF-8 (see the link above for why the reload is needed).
# NOTE(review): `sys` is not imported in the visible import block -- confirm
# it is imported earlier in the file.
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Flask application and the flask_restful Api bound to it.
app = Flask(__name__)
api = Api(app)
# Module-wide KSamsok client shared by the resources below.
# NOTE(review): 'test' is presumably a placeholder API key -- confirm.
soch = KSamsok('test')


# CORS headers
@app.after_request
def after_request(resp):
    """Add the CORS and version headers to a response and return it.

    Registered through ``app.after_request``; the response object it is
    given is modified in place and handed back.
    """
    extra_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Methods', 'GET, HEAD, OPTIONS'),
        ('X-Powered-By', 'K-Samsök REST Version 1.2.0'),
    )
    for header_name, header_value in extra_headers:
        resp.headers.add(header_name, header_value)
    return resp


class Record(Resource):
    def get(self, uri):
        if 'application/json+ld' in request.headers.get('Accept'):
            url = soch.formatUri(uri, 'jsonldurl')
Ejemplo n.º 3
0
class UGC:
    """Client for the UGC-hub HTTP API.

    Every operation is sent as a GET request to ``<endpoint>UGC-hub/api`` and
    the JSON reply is decoded by :meth:`make_get_request`. The write
    operations (:meth:`delete_item` and :meth:`create_item_relation`) require
    an API key and raise ``ValueError`` without one.
    """

    # Targets starting with one of these prefixes are accepted as they are.
    _TARGET_PREFIXES = (
        'https://commons.wikimedia.org/wiki/File:',
        'https://commons.wikimedia.org/wiki/Category:',
        'http://www.wikidata.org/entity/Q',
        'http://commons.wikimedia.org/entity/M',
        'http://kulturnav.org/',
        'http://viaf.org/viaf/',
        'http://vocab.getty.edu/ulan/',
        'http://iconclass.org/',
        'http://data.europeana.eu/',
    )

    # Targets matching one of these patterns are accepted as well.
    _TARGET_PATTERNS = (
        r'^https:\/\/libris\.kb\.se\/.{14,17}$',
        r'^https:\/\/\w{2}\.wikipedia\.org\/wiki\/.+',
    )

    def __init__(self, endpoint: str = 'https://ugc.kulturarvsdata.se/', key: Union[str, None] = None) -> None:
        """Store the connection settings and build the helper objects.

        Args:
            endpoint: Base URL of the service; ``'UGC-hub/'`` is appended to it.
            key: API key. Optional, but needed for deleting and creating items.
        """
        self.endpoint = endpoint + 'UGC-hub/'
        self.key = key

        self.headers = {
            'User-Agent': 'SOCH-UGC.py'
        }

        # used to validate and normalise Kulturarvsdata URIs
        self.soch = KSamsok()

        # relation names accepted by create_item_relation()
        self.relation_types = [
            'sameAs',
            'isDescribedBy',
            'describes',
            'visualizes',
            'hasPart',
            'isPartOf',
            'isVisualizedBy',
            'isContainedIn',
            'author',
            'authorOf',
            'hasBeenUsedIn',
            'isRelatedTo',
            'architectOf',
            'architect',
            'user',
            'userOf',
            'child',
            'mother',
            'father',
            'photographerOf',
            'photographer',
            'isMentionedBy',
            'mentions',
        ]

    def get_total_items_count(self) -> str:
        """Return the ``numberOfRelations`` value reported by the count query."""
        url = '{}api?method=retrieve&scope=count&objectUri=all&format=json'.format(self.endpoint)
        data = self.make_get_request(url)

        return data['response']['relations']['numberOfRelations']

    def get_item(self, item_id: Union[int, str]) -> Union[Dict, bool]:
        """Fetch a single item by its content id.

        Returns:
            The first entry of the returned relations, or ``False`` when that
            entry carries the id ``0``.
        """
        url = '{}api?method=retrieve&objectUri=all&contentId={}&scope=single&format=json'.format(self.endpoint, item_id)
        data = self.make_get_request(url)

        relation = data['response']['relations'][0]
        # compare by value: `is 0` tested object identity, not equality
        if relation['id'] == 0:
            return False

        return relation

    def search_items(self, uri: str = 'all', offset: int = 0, limit: int = 50) -> List:
        """Return the relations for ``uri``.

        ``offset`` is sent as ``selectFrom`` and ``limit`` as ``maxCount``.
        """
        url = '{}api?method=retrieve&scope=all&objectUri={}&selectFrom={}&maxCount={}&format=json'.format(self.endpoint, uri, offset, limit)
        data = self.make_get_request(url)

        return data['response']['relations']

    def delete_item(self, item_id: Union[int, str]) -> bool:
        """Delete the item with the given id.

        Returns:
            ``True`` when the service answers ``SUCCESS``, otherwise ``False``.

        Raises:
            ValueError: If no API key was configured.
        """
        # check the key before any request is made; previously the request
        # was sent first (with ``x-api=None``) and the check came afterwards
        if not self.key:
            raise ValueError('This action requires an API key.')

        url = '{}api?x-api={}&method=delete&objectId={}&format=json'.format(self.endpoint, self.key, item_id)
        data = self.make_get_request(url)

        return data['response']['result'] == 'SUCCESS'

    def create_item_relation(self, kulturarvsdata_uri: str, relation: str, target: str, user: str, comment: Union[str, None] = None) -> bool:
        """Create a relation from a Kulturarvsdata object to ``target``.

        Args:
            kulturarvsdata_uri: URI of the source object; normalised with
                ``KSamsok.formatUri(..., 'rawurl')``.
            relation: One of ``self.relation_types``.
            target: URI accepted by :meth:`valid_relation_target`.
            user: Name sent as the ``user`` parameter.
            comment: Optional comment sent with the relation.

        Returns:
            ``True`` when the service answers ``SUCCESS``, otherwise ``False``.

        Raises:
            ValueError: If the URI, relation type or target is invalid, or if
                no API key was configured.
        """
        # local import keeps this block self-contained
        from urllib.parse import quote

        # keep the caller's value so the error message can show it
        formatted_uri = self.soch.formatUri(kulturarvsdata_uri, 'rawurl')
        if not formatted_uri:
            raise ValueError('{} is not an valid Kulturarvsdata URI.'.format(kulturarvsdata_uri))

        if relation not in self.relation_types:
            raise ValueError('{} is not a valid relation type.'.format(relation))

        if not self.valid_relation_target(target):
            raise ValueError('{} is not a valid target.'.format(target))

        if not self.key:
            raise ValueError('This action requires an API key.')

        # user and comment are free text: percent-encode them so characters
        # such as '&', '#' or '+' cannot break or extend the query string
        url = '{}api?x-api={}&method=insert&scope=relationAll&objectUri={}&user={}&relationType={}&relatedTo={}&format=json'.format(
            self.endpoint, self.key, formatted_uri, quote(str(user), safe=''), relation, target)

        if comment:
            url = '{}&comment={}'.format(url, quote(str(comment), safe=''))

        data = self.make_get_request(url)

        return data['response']['result'] == 'SUCCESS'

    def valid_relation_target(self, target: str) -> bool:
        """Tell whether ``target`` is an accepted relation target.

        Kulturarvsdata URIs must pass ``KSamsok.formatUri``; any other target
        must start with a known prefix or match a known pattern.
        """
        if target.startswith('http://kulturarvsdata.se/'):
            return bool(self.soch.formatUri(target, 'rawurl'))

        if target.startswith(self._TARGET_PREFIXES):
            return True

        return any(re.match(pattern, target) for pattern in self._TARGET_PATTERNS)

    def make_get_request(self, url: str) -> Dict:
        """GET ``url`` with the client's headers and return the decoded JSON.

        Raises:
            PermissionError: If the service answers with HTTP 401.
            Exception: If the JSON response contains an ``error`` entry.
        """
        r = requests.get(url, headers=self.headers)

        # note UGC always returns 200 codes for now.
        # compare by value: `is 401` tested object identity, not equality
        if r.status_code == 401:
            raise PermissionError('Bad API key.')

        data = r.json()
        if 'error' in data['response']:
            raise Exception('Unknown error: {}'.format(data['response']['error']))

        return data
Ejemplo n.º 4
0
import wikibot, re
from ksamsok import KSamsok

#
# This bot is used to fix broken BBR links by guessing
# the URIs and checking whether the new one validates in KSamsok-py
#
# user-config.py:
# family = 'wikidata'
# mylang = 'wikidata'
#

# SPARQL query: every item whose P1260 value contains "bbr/".
sparql = 'SELECT ?item  ?c WHERE { ?item wdt:P1260 ?c . FILTER(CONTAINS(?c, "bbr/")) }'
# KSamsok client; used below only through formatUri() to validate URIs.
culturalSerach = KSamsok('test')

bot = wikibot.Bot(True)
generator = wikibot.Generator.newSparQLGenerator(bot, sparql)
for p in generator:
    item = wikibot.Item(p)
    # only the first P1260 claim of the item is inspected
    uri = item.item.claims['P1260'][0].getTarget()
    if not culturalSerach.formatUri(uri, 'raw', True):
        # the uri was not valid
        # create other bbr links
        # NOTE(review): re.sub replaces every occurrence of 'bbr' in the
        # URI, not only the first one -- confirm that is intended.
        uri_a = re.sub('bbr', 'bbra', uri)
        uri_b = re.sub('bbr', 'bbrb', uri)
        uri_m = re.sub('bbr', 'bbrm', uri)

        # NOTE(review): only the 'bbra' candidate is checked in the visible
        # code; uri_b and uri_m are computed but not used here.
        if culturalSerach.formatUri(uri_a, 'raw', True):
            print('bbra')
            target = uri_a
            item.item.claims['P1260'][0].changeTarget(target)
Ejemplo n.º 5
0
    def index(self):
        """Fill the ``churches`` table with one row per new Wikidata item.

        Iterates over the items returned by ``self.sparql`` and skips those
        whose numeric Wikidata id is already stored or whose P1260 value is
        rejected by ``KSamsok.formatUri``. For the rest it collects data from
        the Wikidata item, the Kulturarvsdata record, Swedish Wikipedia and
        Wikimedia Commons, then inserts and commits one row per item.

        A missing ``svwiki`` sitelink, P625 claim or ``sv`` label is not
        handled here and raises ``KeyError``.
        """
        # setup pywikibot and initialize generator
        pywikibot.handle_args(sys.argv[1:])
        site = pywikibot.Site()
        generator = pagegenerators.WikidataSPARQLPageGenerator(
            self.sparql, site)

        # setup instance of the KSamsok class
        # the api key is never used so no need to use another one
        soch = KSamsok('test')

        for i in generator:
            item = i.get()
            data = {}

            # get the raw wikidata uri without Q
            data['wikidata'] = re.sub(r'(?!\d).', '', str(i))

            # make sure the item does not exist in our database
            if self.primary_key_exists(data['wikidata']):
                continue

            # parse the kulturarvsdata uri or set to false if invalid
            data['kulturarvsdata'] = soch.formatUri(
                item['claims']['P1260'][0].getTarget(), 'raw', True)
            #TODO make a log of items with broken kulturarvsdata uris
            if not data['kulturarvsdata']:
                continue

            # fetch stuff from the wikidata item
            data['wikipedia'] = item['sitelinks']['svwiki']

            try:
                data['commons'] = item['claims']['P373'][0].getTarget()
            except KeyError:
                data['commons'] = ''

            try:
                # strip the "[[commons:" prefix and "]]" suffix from the
                # string form of the P18 target
                data['image'] = re.sub(
                    r'\]\]', '',
                    re.sub(r'\[\[commons:', '',
                           str(item['claims']['P18'][0].getTarget())))
            except KeyError:
                data['image'] = ''

            coord_pair = item['claims']['P625'][0].getTarget()
            data['lat'] = coord_pair.lat
            data['lon'] = coord_pair.lon

            data['label'] = item['labels']['sv']

            # fetch stuff from kulturarvsdata
            record = soch.getObject(data['kulturarvsdata'])

            # if the string is missing or too short to be useful drop it
            description = record['presentation']['description']
            if description and len(description) > 30:
                data['description'] = description
            else:
                data['description'] = ''

            # fetch intro paragraphs from wikipedia
            #TODO if the connection to wikipedia fails then
            # the item should be dropped(may need to be refactored)
            # default first, so an empty "pages" object cannot leave the
            # key unset for the INSERT below
            data['wp_description'] = ''
            try:
                r = requests.get(
                    'https://sv.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro=&explaintext=&titles='
                    + data['wikipedia'])
                result = r.json()

                # the page id is the unknown key the loop is for figuring it out
                for key in result['query']['pages']:
                    data['wp_description'] = result['query']['pages'][
                        key]['extract']
            except KeyError:
                data['wp_description'] = ''

            data['image_thumbnail'] = ''
            data['image_original'] = ''
            if data['image'] != '':
                try:
                    r = requests.get(
                        'https://commons.wikimedia.org/w/api.php?action=query&format=json&prop=pageimages&piprop=thumbnail|name|original&pithumbsize=110&titles=File:'
                        + data['image'])
                    result = r.json()

                    for key in result['query']['pages']:
                        page = result['query']['pages'][key]
                        data['image_thumbnail'] = page['thumbnail']['source']
                        # the original image URL lives under
                        # page['original']['source']; the former lookup
                        # page['thumbnail']['original'] always raised
                        # KeyError, which blanked both image fields
                        data['image_original'] = page['original']['source']
                except KeyError:
                    # keep the two fields consistent: both or neither
                    data['image_thumbnail'] = ''
                    data['image_original'] = ''

            # write and commit church to db
            self.c.execute(
                '''INSERT INTO `churches` (
                                        `wikidata`,
                                        `label`,
                                        `kulturarvsdata`,
                                        `description`,
                                        `lat`,
                                        `lon`,
                                        `wikipedia`,
                                        `wp_description`,
                                        `commons`,
                                        `image`,
                                        `image_thumbnail`,
                                        `image_original`
                                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                (data['wikidata'], data['label'],
                 data['kulturarvsdata'], data['description'],
                 data['lat'], data['lon'], data['wikipedia'],
                 data['wp_description'], data['commons'],
                 data['image'], data['image_thumbnail'],
                 data['image_original']))
            self.db_connection.commit()
import wikibot, re
from ksamsok import KSamsok

#
# This bot is used to detect broken/invalid kulturarvsdata statements
# THIS BOT MAKES THOUSANDS OF HTTP CALLS TO THE Swedish National Heritage Board
# BE NICE.
#
# user-config.py:
# family = 'wikidata'
# mylang = 'wikidata'
#

bot = wikibot.Bot()
# KSamsok client; used below only through formatUri() to validate URIs.
soch = KSamsok('test')
# SPARQL query: every item that has a P1260 statement, with its value.
sparql = 'SELECT ?item ?value WHERE { ?item wdt:P1260 ?value . }'
generator = wikibot.Generator.newSparQLGenerator(bot, sparql)

# Print each item once per P1260 value that formatUri() rejects.
for item in generator:
    item.get()
    if item.claims:
        if 'P1260' in item.claims:
            # NOTE(review): `claims` is never read in the visible code.
            claims = []
            for claim in item.claims['P1260']:
                target = claim.getTarget()
                # a falsy formatUri() result is treated as an invalid URI
                if not soch.formatUri(target, 'raw', True):
                    print(item)