def prepare(self, records):
    sparql = Sparql(SPARQL_ENDPOINT)

    # Get BCP 47 language code map
    self.language_code_map = {}
    raw_language_code_map = sparql.request(LANGUAGE_QUERY)
    for line in raw_language_code_map:
        self.language_code_map[sparql.format_value(
            line, "item")] = sparql.format_value(line, "code")

    # Extract all different locations
    locations = set()
    for record in records:
        if record["speaker"]["residence"] is not None:
            locations.add(record["speaker"]["residence"])

    self.location_map = {}
    raw_location_map = sparql.request(
        LOCATION_QUERY.replace("$1", " wd:".join(locations)))
    for line in raw_location_map:
        country = sparql.format_value(line, "countryLabel")
        location = sparql.format_value(line, "locationLabel")
        self.location_map[sparql.format_value(line, "location")] = country
        if country != location:
            self.location_map[sparql.format_value(
                line, "location")] += (" (" + location + ")")

    return records

def feature_generator(self, data):
    data_name = (data.split('/')[-1]).split('.')[0]
    with codecs.open('features/ceccarelli/%s.svm' % data_name, 'w', encoding='utf-8') as data_write:
        with codecs.open(data, 'r', encoding='utf-8') as data_read:
            for i, line in enumerate(data_read):
                wiki_id_query, qid, wiki_id_candidate, relevance, doc_id = self.parse_ceccarelli_line(line)
                # print(wiki_id_query)
                uri_query = Sparql.get_uri_from_wiki_id(wiki_id_query)
                uri_candidate = Sparql.get_uri_from_wiki_id(wiki_id_candidate)
                self.write_line(uri_query, qid, uri_candidate, relevance, data_write, doc_id)
    print('finished writing features')
    print("--- %s seconds ---" % (time.time() - start_time))

def run(self):
    if self.sparql:
        sparql_query = Sparql(self.entities, self.config_file, self.dataset,
                              self.sparql, self.default_graph, self.entity_class)
        sparql_query.get_property_graphs()
    self.e2v_walks_learn()  # run node2vec for each property-specific graph

def menu_add_consumer():
    getMoviesObject()
    for index, item in enumerate(moviesList):
        moviePrefix = item['movieprefix'].split('#')[1]
        print(f'{index+1} - {moviePrefix} ({item["moviename"]})')
    movieIndexId = int(input('Seleccione o index do filme: ')) - 1
    selectedMovie = moviesList[movieIndexId]['movieprefix'].split('#')[1]
    consumerName = input('Nome do cliente: ')
    consumerNameId = 'mov:consumer_' + remove_characters_except_number_letter(consumerName)
    consumerInsert = Sparql()
    consumerExists = consumerInsert.ask(
        f'ASK {{ {consumerNameId} rdf:type mov:Consumer }}')
    if not consumerExists:
        consumerInsert.insertTriple(f'{consumerNameId} rdf:type mov:Consumer')
        consumerInsert.insertTriple(f'{consumerNameId} foaf:name "{consumerName}"')
    consumerInsert.insertTriple(f'mov:{selectedMovie} mov:alugadoPor {consumerNameId}')

def game():
    logged_in = session.get('logged_in')
    if logged_in:
        visited = dbHandler.getUserVisitedList(session.get("userid"))
        if len(visited) < 5:
            flash("You have to learn first!")
            return redirect(url_for('home'))

        question = {}
        question["question"] = "start"
        question["correctAnswer"] = ""
        question["answers"] = []
        scores = dbHandler.getUserScores(session.get("userid"))

        if request.method == "POST":
            answer = request.form['ans']
            if answer == session["currentQuestion"]["correctAnswer"]:
                session["correctAnswers"] = session["correctAnswers"] + 1
            session["numberOfQuestions"] = session["numberOfQuestions"] + 1
            if session["numberOfQuestions"] < 10:
                s = Sparql()
                question = s.generate_question(session.get("userid"))
                session["currentQuestion"] = question
                return render_template("quiz.html",
                                       username=session.get("username"),
                                       question=question,
                                       scores=scores)
            else:
                flash("Your score is " + str(session["correctAnswers"]) + " correct answers out of 10!")
                dbHandler.addScore(session.get("userid"), session["correctAnswers"])
                s = Sparql()
                question = s.generate_question(session.get("userid"))
                session["currentQuestion"] = question
                session["numberOfQuestions"] = 0
                session["correctAnswers"] = 0
                return render_template("quiz.html",
                                       username=session.get("username"),
                                       question=question,
                                       scores=scores)

        if request.method == "GET":
            session["currentQuestion"] = question
            session["numberOfQuestions"] = 0
            session["correctAnswers"] = 0
            s = Sparql()
            question = s.generate_question(session.get("userid"))
            session["currentQuestion"] = question
            return render_template("quiz.html",
                                   username=session.get("username"),
                                   question=question,
                                   scores=scores)
    else:
        return render_template("index.html")

def getMoviesObject():
    sparql = Sparql()
    moviesSelect = sparql.select("""SELECT ?movie ?name WHERE {
        ?movie rdf:type mov:Movie .
        ?movie foaf:name ?name
    }
    """)
    movies = moviesSelect['results']['bindings']
    for movie in movies:
        idMovie = movie['movie']['value']
        name = movie['name']['value']
        moviesList.append({'movieprefix': idMovie, 'moviename': name})

def getFilmesAlugados():
    sparql = Sparql()
    moviesAlugados = sparql.select(
        """SELECT ?movieid ?moviename
                  (GROUP_CONCAT(?cid;SEPARATOR=', ') AS ?consumerid)
                  (GROUP_CONCAT(?cname; SEPARATOR=', ') as ?consumername)
           WHERE {
               ?movieid rdf:type mov:Movie .
               ?movieid foaf:name ?moviename .
               ?movieid mov:alugadoPor ?cid .
               ?cid foaf:name ?cname
           }
           GROUP BY ?movieid ?moviename
        """)
    movies = moviesAlugados['results']['bindings']
    for movie in movies:
        moviename = movie['moviename']['value']
        consumers = (movie['consumername']['value']).split(', ')
        filmesAlugados.append({'moviename': moviename, 'consumers': consumers})

def main():
    sparql = Sparql()
    moviesSelect = sparql.select("""SELECT ?movie ?name WHERE {
        ?movie rdf:type mov:Movie .
        ?movie foaf:name ?name
    }
    """)
    movies = moviesSelect['results']['bindings']
    for movie in movies:
        # print(movie)
        idMovie = movie['movie']['value']
        name = movie['name']['value']
        moviesList.append({'id': idMovie, 'name': name})
    print(moviesList)

def feature_generator(self, data):
    data_name = (data.split('/')[-1]).split('.')[0]
    with codecs.open('features/ceccarelli/%s.svm' % data_name, 'w', encoding='utf-8') as data_write:
        with codecs.open(data, 'r', encoding='utf-8') as data_read:
            for i, line in enumerate(data_read):
                wiki_id_query, qid, wiki_id_candidate, relevance, doc_id = self.parse_ceccarelli_line(line)
                print(wiki_id_query)
                uri_query = Sparql.get_uri_from_wiki_id(wiki_id_query)
                uri_candidate = Sparql.get_uri_from_wiki_id(wiki_id_candidate)
                self.write_line(uri_query, qid, uri_candidate, relevance, data_write, doc_id)
    print('finished writing features')
    print("--- %s seconds ---" % (time.time() - start_time))

def _define_properties(self):
    with codecs.open(self.config_file, 'r', encoding='utf-8') as config_read:
        property_file = json.loads(config_read.read())

    try:
        self.properties = [i for i in property_file[self.dataset]]
        self.properties.append('feedback')

    except KeyError:  # if no list of properties is specified, take them all
        if self.sparql:  # get all the properties from the sparql endpoint
            sparql_query = Sparql(self.entities, self.config_file, self.dataset,
                                  self.sparql, self.default_graph, self.entity_class)
            self.properties = sparql_query.properties
            self.properties.append('feedback')  # add the feedback property that is not defined in the graph
        else:  # get everything you have in the folder
            path_to_graphs = 'datasets/%s/graphs' % self.dataset
            onlyfiles = [
                f for f in listdir(path_to_graphs)
                if isfile(join(path_to_graphs, f))
            ]
            self.properties = [
                file.replace('.edgelist', '') for file in onlyfiles
            ]
            if 'feedback' in self.properties:
                # feedback property always the last one of the list
                self.properties.remove('feedback')
                self.properties.append('feedback')

class SparqlMaintainer(object):
    def __init__(self):
        self.log = logging.getLogger('osm2rdf')
        self.log.setLevel(logging.INFO)
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        self.log.addHandler(ch)

        # create the top-level parser
        parser = argparse.ArgumentParser(
            description='Download and update stats',
            usage='python3 %(prog)s [options]')
        parser.add_argument(
            '--host', action='store', dest='rdf_url',
            default='http://localhost:9999/bigdata/namespace/wdq/sparql',
            help='Host URL to upload data. Default: %(default)s')
        parser.add_argument(
            '-d', '--queries-dir', action='store', dest='queries_dir',
            default=str(Path(os.path.dirname(__file__)) / 'maintenance'),
            help='Directory containing the maintenance SPARQL queries. Default: %(default)s')
        parser.add_argument('-n', '--dry-run', action='store_true',
                            dest='dry_run', default=False,
                            help='Do not modify RDF database.')

        opts = parser.parse_args()

        self.options = opts
        self.rdf_server = Sparql(opts.rdf_url, opts.dry_run)

    def run(self):
        dir = Path(self.options.queries_dir)
        self.log.info(f'Running scripts from {dir}')
        while True:
            queries = {}
            for file in dir.glob('*.sparql'):
                with file.open() as f:
                    queries[file.stem] = f.read()

            suffix = '-test'
            for filename in sorted(queries.keys()):
                if filename.endswith(suffix):
                    if filename[:-len(suffix)] not in queries:
                        self.log.warning(
                            f'File {filename} has no matching query (without the "{suffix}" suffix)')
                    continue
                testfile = filename + suffix
                if testfile in queries:
                    if not self.rdf_server.run('query', queries[testfile]):
                        self.log.info(f'Skipping {filename} (test is negative)')
                        continue
                self.log.info(f'Executing {filename}')
                self.rdf_server.run('update', queries[filename])
                self.log.info(f'Done running {filename}')

            time.sleep(600)

def on_search_released(self):  # Auto connected
    # ----------------------------
    query = Sparql.create_from_XML(self.as_XML())
    results = query.execute(self.config.rdfstore)

def execute(self, record):
    # Normalize the record using ocwiktionary's titles conventions
    transcription = self.normalize(record["transcription"])

    # Fetch the content of the page having the transcription for title
    (is_already_present, wikicode, basetimestamp) = self.get_entry(transcription, record["file"])

    # Whether there is no entry for this record on ocwiktionary
    if wikicode == False:
        return False

    # Whether the record is already inside the entry
    if is_already_present == True:
        print(record["id"] + ": already on ocwiktionary")
        return False

    # Check if the record's language has a BCP 47 code, stop here if not
    if record["language"]["qid"] not in self.language_code_map:
        print(record["id"] + ": language code not found")
        return False

    lang = self.language_code_map[record["language"]["qid"]]
    motvar = re.search(r"^oc\-([^\-]*?)(\-|$)", lang)
    labelvar = False
    if motvar:
        codevar = motvar.group(1)
        if record["language"]["qid"] in self.language_label_map:
            labelvar = self.language_label_map[record["language"]["qid"]]
        lang = "oc"

    # Whether there is no section for the current language
    if "{=" + lang + "=}" not in wikicode:
        print(record["id"] + ": language section not found")
        return False

    motif = ""
    stringlg = "{=" + lang + "=}"
    for i in range(0, len(stringlg)):
        lettre = stringlg[i]
        if i > 0:
            motif = motif + "|"
        motif = motif + stringlg[0:i].replace("{", "\{")
        motif = motif + "[^" + stringlg[i].replace("{", "\{") + "]"

    motif = re.search(
        r"{{=" + lang + "=}}(([^{]|{[^{]|{{[^\-=]|{{-[^p]|{{-p[^r]|{{-pr[^o]|{{-pro[^n]|{{-pron[^-]|{{-pron-[^}]|{{-pron-}[^}])*?)({{=([^\=]*?)=}}|$)",
        wikicode,
    )
    if motif:
        wikicode = re.sub(
            r"{{=" + lang + "=}}(([^{]|{[^{]|{{[^\-=]|{{-[^p]|{{-p[^r]|{{-pr[^o]|{{-pro[^n]|{{-pron[^-]|{{-pron-[^}]|{{-pron-}[^}])*?)({{=([^\=]*?)=}}|{{-sil-}}|{{-([^\-]*?)\-\|([a-z]+)}}|$)",
            "{{=" + lang + "=}}\g<1>{{-pron-}}\g<3>",
            wikicode,
        )

    loccode = ""
    if record["speaker"]["residence"]:
        sparql = Sparql(SPARQL_ENDPOINT)
        self.location_map = {}
        raw_location_map = sparql.request(
            LOCATION_QUERY.replace("$1", " wd:" + record["speaker"]["residence"]))
        if len(raw_location_map) > 0:
            country = sparql.format_value(raw_location_map[0], "countryLabel")
            location = sparql.format_value(raw_location_map[0], "locationLabel")
            if country:
                loccode = country
                if location and location != country:
                    loccode = loccode + " (" + location + ")"
            elif location:
                loccode = location
            else:
                loccode = ""
    if labelvar:
        loccode = loccode + " - " + labelvar
    if loccode != "":
        loccode = loccode + " : "
    codefichier = (loccode + "escotar « " + record["transcription"] +
                   " » [[Fichièr:" + record["file"] + "]]")

    wikicode = re.sub(
        r"\{=" + lang + "=\}(([^\{]|\{[^=])*?)\{\{-pron-\}\}(([^\{]|\{[^\{]|\{\{[^\-])*?)(\{\{-|\{\{=|$)",
        "{=" + lang + "=}\g<1>{{-pron-}}\g<3>" + codefichier + "\n\g<5>",
        wikicode,
    )

    # Save the result
    try:
        result = self.do_edit(transcription, wtp.parse(wikicode), basetimestamp)
    except Exception as e:
        # If we got an editconflict, just restart from the beginning
        if str(e).find("editconflict") > -1:
            self.execute(record)
        else:
            raise e

    if result == True:
        print(record["id"] + ": added to ocwiktionary - https://oc.wiktionary.org/wiki/" + transcription)

    return result

class RdfUpdateHandler(RdfHandler):
    def __init__(self, options):
        super(RdfUpdateHandler, self).__init__(options)
        self.pending = {}
        self.pendingCounter = 0
        self.rdf_server = Sparql(self.options.rdf_url, self.options.dry_run)

    def finalize_object(self, obj, statements, obj_type):
        super(RdfUpdateHandler, self).finalize_object(obj, statements, obj_type)

        prefixed_id = osmutils.types[obj_type] + str(obj.id)
        if prefixed_id in self.pending:
            # Not very efficient, but if the same object is updated more than once within
            # the same batch, it does not get deleted because all deletes happen first
            self.flush()

        if statements:
            self.pending[prefixed_id] = [
                prefixed_id + ' ' + s + '.'
                for s in osmutils.toStrings(statements)
            ]
            self.pendingCounter += len(statements)
        else:
            self.pending[prefixed_id] = False
            self.pendingCounter += 1

        if self.pendingCounter > 5000:
            self.flush()

    def flush(self, seqid=0):
        sparql = ''

        if self.pending:
            # Remove all statements with these subjects
            sparql += f'''
DELETE {{ ?s ?p ?o . }}
WHERE {{
  VALUES ?s {{ {' '.join(self.pending.keys())} }}
  ?s ?p ?o .
  FILTER (osmm:task != ?p)
}};'''

            # flatten list of lists, and if sublist is truthy, use it
            insertSparql = '\n'.join([
                v for sublist in self.pending.values() if sublist
                for v in sublist
            ])
            if insertSparql:
                sparql += f'INSERT {{ {insertSparql} }} WHERE {{}};\n'

        if seqid > 0:
            if self.last_timestamp.year < 2000:  # Something majorly wrong
                raise Exception('last_timestamp was not updated')
            sparql += set_status_query('osmroot:', self.last_timestamp, 'version', seqid)

        if sparql:
            sparql = '\n'.join(osmutils.prefixes) + '\n\n' + sparql
            self.rdf_server.run('update', sparql)
            self.pendingCounter = 0
            self.pending = {}
        elif self.pendingCounter != 0:
            # Safety check
            raise Exception(f'pendingCounter={self.pendingCounter}')

    def get_osm_schema_ver(self, repserv):
        result = query_status(self.rdf_server, '<https://www.openstreetmap.org>', 'version')
        ver = result['version']
        if ver is not None:
            log.info(f'schema:version={ver}')
            return int(ver)

        mod_date = result['dateModified']
        if mod_date is not None:
            log.info(f'schema:dateModified={mod_date}, shifting back and getting sequence ID')
            mod_date -= dt.timedelta(minutes=60)
            return repserv.timestamp_to_sequence(mod_date)

        log.error('Neither schema:version nor schema:dateModified are set for <https://www.openstreetmap.org>')
        return None

    def run(self):
        repserv = ReplicationServer(self.options.osm_updater_url)
        last_time = datetime.utcnow()

        if self.options.seqid:
            seqid = self.options.seqid
        else:
            seqid = self.get_osm_schema_ver(repserv)
        if seqid is None:
            raise Exception('Unable to determine sequence ID')
        log.info(f'Initial sequence id: {seqid}')

        state = None
        last_seqid = seqid

        while True:
            # must not read data newer than the published sequence id
            # or we might end up reading partial data
            sleep = True
            if state is None:
                state = repserv.get_state_info()
                if state is not None and seqid + 2 < state.sequence:
                    log.info(f'Replication server has data up to #{state.sequence}')

            if state is not None and seqid <= state.sequence:
                try:
                    diffdata = repserv.get_diff_block(seqid)
                except:
                    diffdata = ''
                # We assume there are no empty diff files
                if len(diffdata) > 0:
                    log.debug("Downloaded change %d. (size=%d)" % (seqid, len(diffdata)))
                    if self.options.addWayLoc:
                        self.apply_buffer(diffdata, repserv.diff_type,
                                          locations=True, idx=self.get_index_string())
                    else:
                        self.apply_buffer(diffdata, repserv.diff_type)
                    self.flush(seqid)
                    seqid += 1
                    sleep = False

            seconds_since_last = (datetime.utcnow() - last_time).total_seconds()
            if seconds_since_last > 60:
                log.info(
                    f'Processed {seqid - last_seqid - 1}, ' +
                    f'todo {(state.sequence - seqid + 1 if state else "???")}; {self.format_stats()}')
                last_seqid = seqid - 1
                last_time = datetime.utcnow()

            if state is not None and seqid > state.sequence:
                state = None  # Refresh state

            if sleep:
                time.sleep(60)

class UpdateRelLoc(object):
    def __init__(self):
        self.log = logging.getLogger('osm2rdf')
        self.log.setLevel(logging.INFO)
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        self.log.addHandler(ch)

        # create the top-level parser
        parser = argparse.ArgumentParser(
            description='Updates centroids of OSM relations',
            usage='python3 %(prog)s [options]')
        parser.add_argument(
            '--host', action='store', dest='rdf_url',
            default='http://localhost:9999/bigdata/namespace/wdq/sparql',
            help='Host URL to upload data. Default: %(default)s')
        parser.add_argument(
            '-s', '--cache-strategy', action='store', dest='cacheType',
            choices=['sparse', 'dense'], default='dense',
            help='Which node strategy to use (default: %(default)s)')
        parser.add_argument('-c', '--nodes-file', action='store',
                            dest='cacheFile', default=None,
                            help='File to store node cache.')
        parser.add_argument('-n', '--dry-run', action='store_true',
                            dest='dry_run', default=False,
                            help='Do not modify RDF database.')

        opts = parser.parse_args()

        self.options = opts
        self.rdf_server = Sparql(opts.rdf_url, opts.dry_run)
        self.skipped = []

        if self.options.cacheFile:
            if self.options.cacheType == 'sparse':
                idx = 'sparse_file_array,' + self.options.cacheFile
            else:
                idx = 'dense_file_array,' + self.options.cacheFile
            self.nodeCache = osmium.index.create_map(idx)
        else:
            self.nodeCache = None

    def run(self):
        while True:
            self.run_once()
            time.sleep(600)  # every 10 minutes

    def run_once(self):
        query = '''# Get relations without osmm:loc
SELECT ?rel WHERE {
  ?rel osmm:type 'r' .
  FILTER NOT EXISTS { ?rel osmm:loc ?relLoc . }
}'''  # LIMIT 100000
        result = self.rdf_server.run('query', query)
        self.skipped = [
            'osmrel:' + i['rel']['value'][len('https://www.openstreetmap.org/relation/'):]
            for i in result
        ]

        while True:
            rel_ids = self.skipped
            self.skipped = []
            count = len(rel_ids)
            self.log.info(f'** Processing {count} relations')
            self.run_list(rel_ids)
            if len(self.skipped) >= count:
                self.log.info(f'** Unable to process {len(self.skipped)} relations, exiting')
                break
            else:
                self.log.info(f'** Processed {count - len(self.skipped)} out of {count} relations')
        self.log.info('done')

    def run_list(self, rel_ids):
        for chunk in chunks(rel_ids, 2000):
            self.fix_relations(chunk)

    def fix_relations(self, rel_ids):
        pairs = self.get_relation_members(rel_ids)
        insert_statements = []
        for group in self.group_by_values(pairs):
            insert_statements.append(self.process_single_rel(*group))

        if len(insert_statements) > 0:
            sparql = '\n'.join(osmutils.prefixes) + '\n\n'
            sparql += 'INSERT {\n'
            sparql += '\n'.join(insert_statements)
            sparql += '\n} WHERE {};'

            self.rdf_server.run('update', sparql)
            self.log.info(f'Updated {len(insert_statements)} relations')

    def get_relation_members(self, rel_ids):
        query = f'''# Get relation member's locations
SELECT ?rel ?member ?loc WHERE {{
  VALUES ?rel {{ {' '.join(rel_ids)} }}
  ?rel osmm:has ?member .
  OPTIONAL {{ ?member osmm:loc ?loc . }}
}}'''
        result = self.rdf_server.run('query', query)
        return [(
            'osmrel:' + i['rel']['value'][len('https://www.openstreetmap.org/relation/'):],
            i['member']['value'],
            i['loc']['value'] if 'loc' in i else ''
        ) for i in result]

    @staticmethod
    def process_single_rel(rel_id, member_points):
        points = MultiPoint([loads(p) for p in member_points])
        return rel_id + ' ' + osmutils.formatPoint('osmm:loc', points.centroid) + '.'

    def group_by_values(self, tupples):
        """Yield a tuple (rid, [list of ids])"""
        points = None
        last_id = None
        skip = False
        for rid, ref, value in sorted(tupples):
            if last_id != rid:
                if last_id is not None and not skip:
                    if not points:
                        self.skipped.append(last_id)
                    else:
                        yield (last_id, points)
                skip = False
                points = []
                last_id = rid
            if not skip:
                if value == '':
                    if ref.startswith('https://www.openstreetmap.org/node/'):
                        if self.nodeCache:
                            node_id = ref[len('https://www.openstreetmap.org/node/'):]
                            try:
                                point = self.nodeCache.get(int(node_id))
                                points.append(f'Point({point.lon} {point.lat})')
                            except osmium._osmium.NotFoundError:
                                pass
                    elif ref.startswith('https://www.openstreetmap.org/way/'):
                        pass  # not much we can do about missing way's location
                    elif ref.startswith('https://www.openstreetmap.org/relation/'):
                        skip = True
                        self.skipped.append(rid)
                    else:
                        raise ValueError('Unknown ref ' + ref)
                else:
                    points.append(value)

        if last_id is not None and not skip:
            if not points:
                self.skipped.append(last_id)
            else:
                yield (last_id, points)

def get_records(query):
    sparql = Sparql(ENDPOINT)
    raw_records = sparql.request(query)
    records = []
    for record in raw_records:
        records += [{
            "id": sparql.format_value(record, "record"),
            "file": sparql.format_value(record, "file"),
            "date": sparql.format_value(record, "date"),
            "transcription": sparql.format_value(record, "transcription"),
            "qualifier": sparql.format_value(record, "qualifier"),
            "user": sparql.format_value(record, "linkeduser"),
            "speaker": {
                "id": sparql.format_value(record, "speaker"),
                "name": sparql.format_value(record, "speakerLabel"),
                "gender": sparql.format_value(record, "gender"),
                "residence": sparql.format_value(record, "residence"),
            },
            "links": {
                "wikidata": sparql.format_value(record, "wikidataId"),
                "lexeme": sparql.format_value(record, "lexemeId"),
                "wikipedia": sparql.format_value(record, "wikipediaTitle"),
                "wiktionary": sparql.format_value(record, "wiktionaryEntry"),
            },
            "language": {
                "iso": sparql.format_value(record, "languageIso"),
                "qid": sparql.format_value(record, "languageQid"),
                "wm": sparql.format_value(record, "languageWMCode"),
            },
        }]
    return records

def home():
    logged_in = session.get('logged_in')
    if logged_in:
        info = {}
        diseaseInfo = ""
        diseaseName = ""
        diseaseTreatment = ""
        name = []
        symptoms = []
        treatment = []
        hasCauses = []
        causeOf = []
        image = ""
        info["symptoms"] = symptoms
        info["treatment"] = treatment
        info["hasCauses"] = hasCauses
        info["causeOf"] = causeOf
        info["info"] = diseaseInfo
        info["name"] = name
        info["image"] = image
        diseaseInfo = False

        userList = []
        userListDb = dbHandler.getUserList(session.get("userid"))
        for l in userListDb:
            el = dbHandler.getElementByID(l[1])
            userList.append(el[0][1])

        b = True
        i = 0
        mostVisited = []
        visited = dbHandler.getElements("disease")
        while b == True:
            if i < 5 and len(visited) > i:
                mostVisited.append(visited[i][1])
                i = i + 1
            else:
                b = False

        suggestions = {}
        s = Sparql()
        suggestions = s.getSuggestions(session.get("userid"))

        if request.method == "GET":
            diseaseName = request.args.get("diseaseName")
            diseaseSymptoms = request.args.get("symptoms")
            diseaseTreatment = request.args.get("treatment")
            diseaseCauses = request.args.get("causes")
            diseaseType = request.args.get("type")
            diseaseCountry = request.args.get("country")
            diseaseClimate = request.args.get("climate")
            diseaseFood = request.args.get("food")
            first = True

            if (type(diseaseName) == str and len(diseaseName) != 0):
                s = Sparql()
                info = s.search_name(diseaseName)
                dbHandler.addElement(diseaseName, "disease")
                session["diseaseName"] = diseaseName
                diseaseInfo = True
                dbHandler.addVisitedDisease(session.get("userid"), diseaseName, "disease")

            if (type(diseaseSymptoms) == str and len(diseaseSymptoms) != 0):
                s = Sparql()
                if first == True:
                    info = s.search_symptoms(diseaseSymptoms)
                    first = False
                else:
                    infoTemp = s.search_symptoms(diseaseSymptoms)
                    info["name"] = list(set(info["name"]).intersection(infoTemp["name"]))
                symptoms = diseaseSymptoms.split(", ")
                for s in symptoms:
                    dbHandler.addElement(s, "symptom")

            if (type(diseaseTreatment) == str and len(diseaseTreatment) != 0):
                s = Sparql()
                if first == True:
                    info = s.search_treatment(diseaseTreatment)
                    first = False
                else:
                    infoTemp = s.search_treatment(diseaseTreatment)
                    info["name"] = list(set(info["name"]).intersection(infoTemp["name"]))
                treatments = diseaseTreatment.split(", ")
                for t in treatments:
                    dbHandler.addElement(t, "treatment")

            if (type(diseaseCauses) == str and len(diseaseCauses) != 0):
                s = Sparql()
                if first == True:
                    info = s.search_bycauses(diseaseCauses)
                    first = False
                else:
                    infoTemp = s.search_bycauses(diseaseCauses)
                    info["name"] = list(set(info["name"]).intersection(infoTemp["name"]))
                causes = diseaseCauses.split(", ")
                for c in causes:
                    dbHandler.addElement(c, "causes")

            if (type(diseaseType) == str and len(diseaseType) != 0):
                s = Sparql()
                if first == True:
                    info = s.search_bytype(diseaseType)
                    first = False
                else:
                    infoTemp = s.search_bytype(diseaseType)
                    info["name"] = list(set(info["name"]).intersection(infoTemp["name"]))
                types = diseaseType.split(", ")
                for t in types:
                    dbHandler.addElement(t, "types")

            if (type(diseaseCountry) == str and len(diseaseCountry) != 0):
                s = Sparql()
                if first == True:
                    info["name"] = s.search_diseaseByCountry(diseaseCountry)
                    first = False
                else:
                    infoTemp = s.search_diseaseByCountry(diseaseCountry)
                    info["name"] = list(set(info["name"]).intersection(infoTemp))
                dbHandler.addElement(diseaseCountry, "country")

            if (type(diseaseClimate) == str and len(diseaseClimate) != 0):
                s = Sparql()
                if first == True:
                    info["name"] = s.search_diseaseByClimate(diseaseClimate)
                    first = False
                else:
                    infoTemp = s.search_diseaseByClimate(diseaseClimate)
                    info["name"] = list(set(info["name"]).intersection(infoTemp))
                dbHandler.addElement(diseaseClimate, "climate")

            if (type(diseaseFood) == str and len(diseaseFood) != 0):
                s = Sparql()
                if first == True:
                    info["name"] = s.search_diseaseByFood(diseaseFood)
                    first = False
                else:
                    infoTemp = s.search_diseaseByFood(diseaseFood)
                    info["name"] = list(set(info["name"]).intersection(infoTemp))
                dbHandler.addElement(diseaseFood, "food")

            return render_template("home.html",
                                   username=session.get("username"),
                                   disease=info["name"],
                                   symptoms=info["symptoms"],
                                   treatment=info["treatment"],
                                   hasCauses=info["hasCauses"],
                                   causeOf=info["causeOf"],
                                   info=info["info"],
                                   image=info["image"],
                                   userList=userList,
                                   visited=mostVisited,
                                   suggestions=suggestions,
                                   diseaseInfo=diseaseInfo)

        if request.method == "POST":
            dbHandler.addElementInList(session.get("userid"), session.get("diseaseName"), "disease")
            userList.append(session.get("diseaseName"))
            return render_template("home.html",
                                   userList=userList,
                                   visited=mostVisited,
                                   suggestions=suggestions)
    else:
        return redirect(url_for('index'))

async def sparql(ctx, *text):
    # ctx.send is a coroutine; it must be awaited or the message is never sent
    await ctx.send(
        _("Your results are as follows.\n"
          "```\n"
          "{result}"
          "```\n").format(result=Sparql(text[0], text[1:])))

class UpdatePageViewStats(object):
    def __init__(self):
        self.log = logging.getLogger('osm2rdf')
        self.log.setLevel(logging.INFO)
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        self.log.addHandler(ch)

        # create the top-level parser
        parser = argparse.ArgumentParser(
            description='Download and update stats',
            usage='python3 %(prog)s [options]')
        parser.add_argument(
            '--host', action='store', dest='rdf_url',
            default='http://localhost:9999/bigdata/namespace/wdq/sparql',
            help='Host URL to upload data. Default: %(default)s')
        parser.add_argument('-n', '--dry-run', action='store_true',
                            dest='dry_run', default=False,
                            help='Do not modify RDF database.')
        parser.add_argument('-b', '--go-backwards', action='store_true',
                            dest='go_backwards', default=False,
                            help='Go back up to (maxfiles) and exit')
        parser.add_argument(
            '-m', '--maxfiles', action='store', dest='max_files',
            default=1, type=int,
            help='Maximum number of pageview stat files to process at once')

        opts = parser.parse_args()

        self.options = opts
        self.rdf_server = Sparql(opts.rdf_url, opts.dry_run)
        self.pvstat = '<https://dumps.wikimedia.org/other/pageviews/>'
        self.stats_url = 'https://dumps.wikimedia.org/other/pageviews/{0:%Y}/{0:%Y-%m}/pageviews-{0:%Y%m%d-%H}0000.gz'
        # oldest file is https://dumps.wikimedia.org/other/pageviews/2015/2015-05/pageviews-20150501-010000.gz
        self.minimum_data_ts = datetime(2015, 5, 1, tzinfo=dt.timezone.utc)

    async def run(self):
        backwards = self.options.go_backwards
        while True:
            ver = query_status(self.rdf_server, self.pvstat)
            if ver is None:
                self.log.info(f'schema:dateModified is not set for {self.pvstat}')
                # Calculate last valid file. Assume current data will not be available for at least a few hours
                ver = datetime.utcnow() + dt.timedelta(minutes=50) - dt.timedelta(hours=5)
                ver = datetime(ver.year, ver.month, ver.day, ver.hour, tzinfo=dt.timezone.utc)

            self.log.info(f'Processing {"backwards" if backwards else "forward"} from {ver}')

            stats, timestamp = await self.process_files(ver, backwards)
            if timestamp is not None and len(stats) > 0:
                self.log.info(f'Updating {len(stats)} stats')
                self.save_stats(stats, timestamp)

            if backwards:
                # Do a single iteration only
                return

            self.log.info('Pausing...')
            time.sleep(1000)

    async def process_files(self, last_processed, backwards):
        stats = defaultdict(int)
        new_last = None
        conn = aiohttp.TCPConnector(limit=3)
        timeout = aiohttp.ClientTimeout(total=None, connect=None,
                                        sock_read=60, sock_connect=60)
        async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
            futures = []
            for date in self.iterate_hours(last_processed, self.options.max_files, backwards):
                futures.append(self.process_file(session, date, stats))
            if futures:
                done, _ = await asyncio.wait(futures)
                for fut in done:
                    date, ok = fut.result()
                    # always find the latest possible timestamp even if going backwards
                    if ok and (new_last is None or date > new_last):
                        new_last = date
        return stats, new_last

    def iterate_hours(self, last_processed, max_count, backwards=True):
        delta = dt.timedelta(hours=(-1 if backwards else 1))
        done = 0
        current = last_processed
        if not backwards:
            # Inclusive when going backwards, exclusive when going forward
            current += delta
        while current > self.minimum_data_ts if backwards else current < datetime.now(dt.timezone.utc):
            if done >= max_count:
                break
            yield current
            done += 1
            current += delta

    async def process_file(self, session, date, stats):
        url = self.stats_url.format(date)
        try:
            async with session.get(url) as response:
                start = datetime.utcnow()
                if response.status != 200:
                    self.log.warning(f'Url {url} returned {response.status}')
                    return date, False
                for line in gzip.decompress(await response.read()).splitlines():
                    try:
                        parts = line.decode('utf-8', 'strict').split(' ')
                        page_url = self.page_url(parts[0], parts[1])
                        if page_url:
                            stats[page_url] += int(parts[2])
                    except:
                        self.log.error(f'Error parsing {url} line "{line}"')
                self.log.info(
                    f'Finished processing {url} in {(datetime.utcnow() - start).total_seconds():.1f} seconds')
                return date, True
        except:
            self.log.warning(f'Failed to process {url}')
            return date, False

    def page_url(self, prefix, title):
        parts = prefix.split('.', 1)
        if len(parts) == 1:
            site = '.wikipedia.org/wiki/'
        # elif parts[1] == 'b':
        #     site = '.wikibooks.org/wiki/'
        # elif parts[1] == 'd':
        #     site = '.wiktionary.org/wiki/'
        # elif parts[1] == 'n':
        #     site = '.wikinews.org/wiki/'
        # elif parts[1] == 'q':
        #     site = '.wikiquote.org/wiki/'
        # elif parts[1] == 's':
        #     site = '.wikisource.org/wiki/'
        # elif parts[1] == 'v':
        #     site = '.wikiversity.org/wiki/'
        # elif parts[1] == 'voy':
        #     site = '.wikivoyage.org/wiki/'
        else:
            return None

        if not reWikiLanguage.match(parts[0]):
            if parts[0] != 'test2':  # This is the only number-containing prefix so far
                self.log.error(f'Skipping unexpected language prefix "{parts[0]}"')
            return None

        return make_wiki_url(parts[0], site, title)

    def save_stats(self, stats, timestamp):
        # From https://stackoverflow.com/questions/46030514/update-or-create-numeric-counters-in-sparql-upsert/46042692
        done = 0
        last_print = datetime.utcnow()
        for keys in chunks(stats.keys(), 1000):
            # (<...> 10) (<...> 15) ...
            values = ' '.join(['(' + k + ' ' + str(stats[k]) + ')' for k in keys])
            sparql = f'''
PREFIX pvstat: {self.pvstat}
DELETE {{ ?sitelink pvstat: ?outdated }}
INSERT {{ ?sitelink pvstat: ?updated }}
WHERE {{
  VALUES (?sitelink ?increment) {{ {values} }}
  OPTIONAL {{?sitelink pvstat: ?outdated}}
  BIND ((IF(BOUND(?outdated), ?outdated + ?increment, ?increment)) AS ?updated)
}}'''
            self.rdf_server.run('update', sparql)
            done += len(keys)
            if (datetime.utcnow() - last_print).total_seconds() > 60:
                self.log.info(f'Imported {done} pageview stats, pausing for a few seconds...')
                time.sleep(5000)
                last_print = datetime.utcnow()

        self.rdf_server.run('update', set_status_query(self.pvstat, timestamp))
        self.log.info(f'Finished importing {done} pageview stats')

class UpdateUsageStats(object):
    ids: Dict[str, str]

    def __init__(self):
        self.log = logging.getLogger('osm2rdf')
        self.log.setLevel(logging.INFO)
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        self.log.addHandler(ch)

        # create the top-level parser
        parser = argparse.ArgumentParser(
            description='Update key and tag usage stats',
            usage='python3 %(prog)s [options]')
        parser.add_argument(
            '--host', action='store', dest='rdf_url',
            default='http://localhost:9999/bigdata/namespace/wdq/sparql',
            help='Host URL to upload data. Default: %(default)s')
        parser.add_argument('-n', '--dry-run', action='store_true',
                            dest='dry_run', default=False,
                            help='Do not modify RDF database.')

        opts = parser.parse_args()

        self.options = opts
        self.rdf_server = Sparql(opts.rdf_url, 'query' if opts.dry_run else False)
        self.date_subject = '<https://taginfo.openstreetmap.org>'
        self.url_stats = 'https://taginfo.openstreetmap.org/api/4/key/stats'
        self.url_keys = ' https://taginfo.openstreetmap.org/api/4/keys/all'
        self.ids = {}

    def run(self):
        while True:
            self.run_once()
            time.sleep(1000)

    def run_once(self):
        ts_taginfo = self.get_current_ts()
        ts_db = query_status(self.rdf_server, self.date_subject) if not self.options.dry_run else None
        if ts_taginfo is not None and ts_taginfo == ts_db:
            self.log.info(f'Data is up to date {ts_taginfo}, sleeping...')
            return
        if ts_db is None:
            self.log.info(
                f'schema:dateModified is not set for {self.date_subject}, performing first import')
        else:
            self.log.info(f'Loading taginfo data, last updated {ts_db}')

        stats, ts = self.get_stats()
        if stats:
            self.log.info(f'Updating {len(stats)} stats')
            self.save_stats(stats, ts)
        self.log.info('Import is done, waiting for new data...')

    def get_stats(self):
        if self.options.dry_run:
            with open('/home/yurik/dev/sophox/all.keys.json', 'r') as f:
                data = json.load(f)
        else:
            data = requests.get(self.url_keys).json()
        ts = parse_utc(data['data_until'])
        stats = {}
        for row in data['data']:
            stats[row['key']] = tuple([row[k] for k in info_keys])
        return stats, ts

    def save_stats(self, stats, timestamp):
        # Resolve keys to IDs
        for keys in chunks([k for k in stats.keys() if k not in self.ids], 5000):
            sparql = f'''
SELECT ?key ?id WHERE {{
  VALUES ?key {{{' '.join([stringify(k) for k in keys])}}}
  ?id osmdt:P16 ?key.
}}'''
            res = self.rdf_server.run('query', sparql)
            # http://wiki.openstreetmap.org/entity/Q103
            self.ids.update({
                v['key']['value']:
                    v['id']['value'][len('http://wiki.openstreetmap.org/entity/'):]
                for v in res
            })
        self.log.info(f'Total resolved keys is {len(self.ids)}, updating...')

        # Delete all usage counters
        sparql = f'''
DELETE {{ ?s ?p ?o }}
WHERE {{
  VALUES ?p {{ {' '.join([f'osmm:{k}' for k in info_keys])} }}
  ?s ?p ?o .
}}'''
        self.rdf_server.run('update', sparql)
        self.log.info(f'Existing counts deleted, importing...')

        done = 0
        last_print = datetime.utcnow()
        for keys in chunks(stats.keys(), 5000):
            sparql = (
                'INSERT {\n' +
                '\n'.join([f'?id osmm:{k} ?{k}.' for k in info_keys]) +
                '\n} WHERE {\n' +
                f"VALUES (?id {' '.join([f'?{k}' for k in info_keys])}) {{\n" +
                '\n'.join([
                    f"(osmd:{self.ids[k]} {' '.join([str(stats[k][i]) for i in range(len(info_keys))])})"
                    for k in keys if k in self.ids
                ]) + '\n} }')
            self.rdf_server.run('update', sparql)
            done += len(keys)
            if (datetime.utcnow() - last_print).total_seconds() > 60:
                self.log.info(f'Imported {done} pageview stats, pausing for a few seconds...')
                time.sleep(60)
                last_print = datetime.utcnow()

        self.rdf_server.run('update', set_status_query(self.date_subject, timestamp))
        self.log.info(f'Finished importing {done} pageview stats')

    def get_current_ts(self):
        ts_str = requests.get(self.url_stats).json()['data_until']
        return parse_utc(ts_str)
