def executeQuery(self, query, queue=None, limit=-1, offset=0):
    # Default to None instead of Queue(): a mutable default argument is
    # evaluated once at definition time and would be shared across calls.
    if queue is None:
        queue = Queue()
    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    pipeline, db, col, coltotemplates, projvartocols = self.translate()
    if self.query.limit > 0:
        if self.query.offset > 0:
            pipeline.append({"$limit": int(self.query.limit) + int(self.query.offset)})
            pipeline.append({"$skip": int(self.query.offset)})
        else:
            pipeline.append({"$limit": int(self.query.limit)})

    # print("MongoDB query pipeline:")
    # pprint(pipeline)
    db, collection = self.mongo_client.get_db_coll_obj(db, col)
    result = collection.aggregate(pipeline, useCursor=True, batchSize=1000, allowDiskUse=True)
    for doc in result:
        for r in doc:
            if isinstance(doc[r], int):
                doc[r] = str(doc[r])
            # Instantiate the URI template of projected variables.
            if r in projvartocols and r in coltotemplates:
                doc[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}',
                                                   doc[r].replace(" ", '_'))
        queue.put(doc)
    queue.put("EOF")
    return result
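# A minimal consumer sketch (hypothetical, not part of the wrappers): every
# executeQuery above streams result rows into `queue` as dicts and terminates
# the stream with the 'EOF' sentinel, so a caller can drain it like this.
# `wrapper` stands for any of the wrapper objects defined in this module.
def consume_results(wrapper, sparql_query):
    from multiprocessing import Queue
    out = Queue()
    wrapper.executeQuery(sparql_query, out)
    rows = []
    while True:
        row = out.get()      # blocks until the wrapper produces a row
        if row == 'EOF':     # sentinel: the stream is finished
            break
        rows.append(row)
    return rows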
def __init__(self, sparql, mapping, datasource, rdfmts, star):
    self.mapping = mapping
    self.sparql = qp.parse(sparql)
    self.datasource = datasource
    self.rdfmts = rdfmts
    self.star = star
    self.prefixes = getPrefs(self.sparql.prefs)
def __init__(self, query, config, pushdownssqjoins=True):
    # Accept either a query string or an already parsed query object.
    if isinstance(query, str):
        self.query = queryParser.parse(query)
    else:
        self.query = query
    self.prefixes = utils.getPrefs(self.query.prefs)
    self.config = config
    self.relevant_mts = {}
    self.decomposition = {}
    self.pushdownssqjoins = pushdownssqjoins
def executeQuery(self, query, queue=None, limit=-1, offset=0):
    # Default to None instead of Queue() to avoid a shared mutable default.
    if queue is None:
        queue = Queue()
    self.query = qp.parse(query)
    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    sparql = self.query
    pipeline, db, col, coltotemplates, projvartocols = self.translate()
    if sparql.limit > 0:
        if sparql.offset > 0:
            pipeline.append({"$limit": int(sparql.limit) + int(sparql.offset)})
            pipeline.append({"$skip": int(sparql.offset)})
        else:
            pipeline.append({"$limit": int(sparql.limit)})

    # print(pipeline)
    db, collection = self.mongo_client.get_db_coll_obj(db, col)
    result = collection.aggregate(pipeline, useCursor=True, batchSize=1000, allowDiskUse=True)
    for doc in result:
        for r in doc:
            if isinstance(doc[r], int):
                doc[r] = str(doc[r])
            # Guard on r as well: indexing projvartocols[r] directly raises
            # a KeyError for columns that are not projected variables.
            if r in projvartocols and projvartocols[r] in coltotemplates:
                doc[r] = coltotemplates[projvartocols[r]].replace('{' + projvartocols[r] + '}',
                                                                  doc[r].replace(" ", '_'))
        queue.put(doc)
    queue.put("EOF")
    return result
def executeQuery(self, query, queue=None, limit=-1, offset=0):
    """
    Entry point for query execution on CSV files (served via MySQL).
    :param query: SPARQL query string
    :return: empty list on error; result rows are streamed through `queue`
    """
    if queue is None:
        queue = Queue()
    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    ds = self.star['datasource']
    sqlquery, coltotemplates, projvartocols, filenametablename, filenameiteratormap = self.translate()
    print(sqlquery)
    try:
        if self.username is None:
            self.mysql = connector.connect(user='******', host=self.url)
        else:
            self.mysql = connector.connect(user=self.username, password=self.password,
                                           host=self.host, port=self.port)
    except connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(err)
    except Exception as ex:
        print("Exception while connecting to MySQL", ex)

    try:
        cursor = self.mysql.cursor()
        db = ""
        for fn in filenameiteratormap:
            db = filenameiteratormap[fn]['iterator']
        cursor.execute("use " + db)
        cursor.execute(sqlquery)
        # Use the public cursor.description, not the private _description.
        header = [h[0] for h in cursor.description]
        for line in cursor:
            row = {}
            res = {}
            skip = False
            for i in range(len(line)):
                row[header[i]] = str(line[i])
            for r in row:
                if row[r] == 'null':
                    skip = True
                    break
                # Keys of the form 'var_col' carry the value for the
                # placeholder '{col}' in the URI template of variable 'var'.
                if '_' in r and r[:r.find("_")] in projvartocols:
                    s = r[:r.find("_")]
                    if s in res:
                        val = res[s]
                        res[s] = val.replace('{' + r[r.find("_") + 1:] + '}',
                                             row[r].replace(" ", '_'))
                    else:
                        res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                           row[r].replace(" ", '_'))
                elif r in projvartocols and r in coltotemplates:
                    res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}',
                                                       row[r].replace(" ", '_'))
                else:
                    res[r] = row[r]
            if not skip:
                queue.put(res)
    except Exception as e:
        print("Exception ", e)
    queue.put("EOF")
def executeQuery(self, query, queue=None, limit=-1, offset=0):
    """
    Entry point for query execution on CSV files via Apache Drill.
    :param query: SPARQL query string
    :return: empty list on error; result rows are streamed through `queue`
    """
    from time import time

    if queue is None:
        queue = Queue()
    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)]
    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters)
    if sqlquery is None or len(sqlquery) == 0:
        queue.put("EOF")
        return []

    try:
        start = time()
        try:
            self.drill = PyDrill(host=self.host, port=self.port)
        except Exception as ex:
            print("Exception while connecting to Drill", ex)
            queue.put("EOF")
            return
        if not self.drill.is_active():
            print('Exception: Please run Drill first')
            queue.put("EOF")
            return
        logger.info("Drill Initialization cost:" + str(time() - start))

        start = time()
        if isinstance(sqlquery, list):
            sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
            # With more than three members, push the UNION down as a single
            # SQL query; otherwise run each member in a separate process.
            if len(sqlquery) > 3:
                sqlquery = " UNION ".join(sqlquery)
        if isinstance(sqlquery, list):
            processqueues = []
            processes = []
            res_dict = []
            for sql in sqlquery:
                processquery = Queue()
                processqueues.append(processquery)
                p = Process(target=self.run_union,
                            args=(sql, queue, projvartocols, coltotemplates, limit,
                                  processquery, res_dict,))
                p.start()
                processes.append(p)
            # Poll the per-process control queues until every member is done.
            while len(processqueues) > 0:
                toremove = []
                try:
                    for q in processqueues:
                        if q.get(False) == 'EOF':
                            toremove.append(q)
                            for p in processes:
                                if p.is_alive():
                                    p.terminate()
                except Empty:
                    pass
                for q in toremove:
                    processqueues.remove(q)
            logger.info("Done running:")
            logger.info(" UNION ".join(sqlquery))
        else:
            card = 0
            # if limit == -1:
            limit = 1000
            if offset == -1:
                offset = 0
            logger.info(sqlquery)
            # Page through the result with LIMIT/OFFSET until exhausted.
            while True:
                query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset)
                cardinality = self.process_result(query_copy, queue, projvartocols, coltotemplates)
                card += cardinality
                if cardinality < limit:
                    break
                offset = offset + limit
        logger.info("Exec in Drill took:" + str(time() - start))
    except Exception as e:
        print("Exception ", e)
    queue.put("EOF")
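# A condensed sketch of the UNION fan-out used above: each UNION member runs
# in its own Process, pushes rows to the shared output queue, and reports
# completion on a private control queue. `run_member` is a hypothetical
# stand-in for the wrappers' run_union method.
from multiprocessing import Process, Queue
from queue import Empty

def run_member(sql, out, ctrl):
    # ... execute `sql` and out.put(row) for every result row ...
    ctrl.put('EOF')                       # signal that this member is done

def run_union_members(sqls, out):
    ctrls = []
    for sql in sqls:
        ctrl = Queue()
        Process(target=run_member, args=(sql, out, ctrl), daemon=True).start()
        ctrls.append(ctrl)
    while ctrls:                          # poll until every member reports EOF
        for ctrl in list(ctrls):
            try:
                if ctrl.get(False) == 'EOF':
                    ctrls.remove(ctrl)
            except Empty:
                pass
    out.put('EOF')                        # one EOF for the merged stream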
def executeQuery(self, query, queue, limit=-1, offset=0):
    """
    Entry point for query execution on CSV files stored in MySQL.
    :param query: SPARQL query string
    :return: empty list on error; result rows are streamed through `queue`
    """
    from time import time

    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    query_filters = [f for f in self.query.body.triples[0].triples if isinstance(f, Filter)]

    start = time()
    sqlquery, projvartocols, coltotemplates, filenametablename = self.translate(query_filters)
    if sqlquery is None or len(sqlquery) == 0:
        queue.put("EOF")
        return []

    try:
        if isinstance(sqlquery, list):
            sqlquery = [sql for sql in sqlquery if sql is not None and len(sql) > 0]
            # With more than three members, push the UNION down as a single
            # SQL query; otherwise run each member in a separate process.
            if len(sqlquery) > 3:
                sqlquery = " UNION ".join(sqlquery)
        if isinstance(sqlquery, list):
            try:
                processqueues = []
                processes = []
                res_dict = []
                for sql in sqlquery:
                    processquery = Queue()
                    processqueues.append(processquery)
                    p = Process(target=self.run_union,
                                args=(sql, filenametablename, queue, projvartocols,
                                      coltotemplates, limit, processquery, res_dict,))
                    p.daemon = True
                    p.start()
                    processes.append(p)
                # Poll the per-process control queues until every member is done.
                while len(processqueues) > 0:
                    toremove = []
                    try:
                        for q in processqueues:
                            v = q.get(False)
                            if v == 'EOF':
                                toremove.append(q)
                                for p in processes:
                                    if p.is_alive():
                                        p.terminate()
                    except Empty:
                        pass
                    except Exception as e:
                        print("Exception: ", e, len(processqueues))
                    for q in toremove:
                        processqueues.remove(q)
                logger.info("MySQL Done running UNION:")
                logger.info(" UNION ".join(sqlquery))
            # IOError must precede the generic handler to be reachable.
            except IOError:
                logger.error("IOError while running query")
            except Exception as e:
                logger.error(" UNION ".join(sqlquery))
                logger.error("Exception while running query" + str(e))
        else:
            try:
                if self.username is None:
                    self.mysql = connector.connect(user='******', host=self.url)
                else:
                    self.mysql = connector.connect(user=self.username, password=self.password,
                                                   host=self.host, port=self.port)
            except connector.Error as err:
                if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                    print("Something is wrong with your user name or password")
                    logger.error("Something is wrong with your user name or password")
                elif err.errno == errorcode.ER_BAD_DB_ERROR:
                    print("Database does not exist")
                    logger.error("Database does not exist")
                else:
                    print(err)
                    logger.error("Error:" + str(err))
                queue.put('EOF')
                return
            except Exception as ex:
                print("Exception while connecting to MySQL", ex)
                logger.error("Exception while connecting to MySQL" + str(ex))
                queue.put('EOF')
                return

            cursor = self.mysql.cursor()
            db = filenametablename
            cursor.execute("use " + db + ';')
            card = 0
            # if limit == -1:
            limit = 100
            if offset == -1:
                offset = 0
            logger.info(sqlquery)
            try:
                # Page through the result with LIMIT/OFFSET until exhausted.
                while True:
                    query_copy = sqlquery + " LIMIT " + str(limit) + " OFFSET " + str(offset)
                    cursor.execute(query_copy)
                    cardinality = self.process_result(cursor, queue, projvartocols, coltotemplates)
                    card += cardinality
                    if cardinality < limit:
                        break
                    offset = offset + limit
                logger.info("Running query: " + str(sqlquery) + " Non UNION DONE" + str(card))
            # IOError must precede the generic handler to be reachable.
            except IOError as ie:
                print("IO ERROR:", ie)
                logger.error("IOError while running query" + str(ie))
            except Exception as e:
                print("Exception: ", e)
                logger.error("Exception while running query" + str(e))
    except Exception as e:
        print("Exception ", e)
    logger.info("Running query: " + str(query) + " DONE")
    logger.info("MySQL finished after: " + str(time() - start))
    queue.put("EOF")
def executeQuery(self, query, queue=None, limit=-1, offset=0):
    # Default to None instead of Queue() to avoid a shared mutable default.
    if queue is None:
        queue = Queue()
    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    cypher, coltotemplates, projvartocols = self.translate()
    print("Cypher: ", cypher)
    print(coltotemplates, '||', projvartocols)
    if cypher is not None and len(cypher) > 5:
        session = self.neo_client.driver.session()
        results = session.run(cypher)
        for r in results:
            rr = dict(r)
            # Check whether collect() produced a list for any variable.
            lstps = [p for p in rr if isinstance(rr[p], list)]
            if len(lstps) > 0:
                # Unroll list-valued variables into one row per value,
                # taking the cross product over all list-valued variables.
                reslist = [rr.copy()]
                for p1 in lstps:
                    resulsts = []
                    for v in rr[p1]:
                        for res in reslist:
                            res[p1] = v
                            resulsts.append(res.copy())
                    reslist = resulsts
                for row in reslist:
                    res = {}
                    for r in row:
                        if '_' in r and r[:r.find("_")] in projvartocols:
                            s = r[:r.find("_")]
                            if s in res:
                                val = res[s]
                                res[s] = val.replace('{' + r[r.find("_") + 1:] + '}',
                                                     row[r].replace(" ", '_'))
                            else:
                                res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                                   row[r].replace(" ", '_'))
                        elif r in projvartocols and r in coltotemplates:
                            res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}',
                                                               row[r].replace(" ", '_'))
                        else:
                            res[r] = row[r]
                    queue.put(res)
            else:
                res = {}
                row = rr
                for r in row:
                    if '_' in r and r[:r.find("_")] in projvartocols:
                        s = r[:r.find("_")]
                        if s in res:
                            val = res[s]
                            res[s] = val.replace('{' + r[r.find("_") + 1:] + '}',
                                                 row[r].replace(" ", '_'))
                        else:
                            res[s] = coltotemplates[s].replace('{' + r[r.find("_") + 1:] + '}',
                                                               row[r].replace(" ", '_'))
                    elif r in projvartocols and r in coltotemplates:
                        res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}',
                                                           row[r].replace(" ", '_'))
                    else:
                        res[r] = row[r]
                queue.put(res)
    queue.put("EOF")
    return []
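# The template-instantiation loop duplicated across the wrappers above could
# be factored into one helper. A sketch under the same conventions: a key of
# the form 'var_col' carries the value for placeholder '{col}' in the URI
# template of variable 'var', and coltotemplates maps variables to templates.
def instantiate_templates(row, projvartocols, coltotemplates):
    res = {}
    for r in row:
        value = str(row[r]).replace(" ", '_')
        if '_' in r and r[:r.find('_')] in projvartocols:
            s = r[:r.find('_')]
            placeholder = '{' + r[r.find('_') + 1:] + '}'
            # Continue filling a partially instantiated template if present.
            template = res[s] if s in res else coltotemplates[s]
            res[s] = template.replace(placeholder, value)
        elif r in projvartocols and r in coltotemplates:
            res[r] = coltotemplates[r].replace('{' + projvartocols[r] + '}', value)
        else:
            res[r] = row[r]
    return res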
def executeQuery(self, query, queue=None, limit=-1, offset=0):
    """
    Entry point for query execution on CSV files via Spark.
    :param query: SPARQL query string
    :return: empty list on error; result rows are streamed through `queue`
    """
    from time import time

    if queue is None:
        queue = Queue()
    if len(self.mappings) == 0:
        print("Empty Mapping")
        queue.put('EOF')
        return []

    self.query = qp.parse(query)
    self.prefixes = getPrefs(self.query.prefs)
    if limit > -1 or offset > -1:
        self.query.limit = limit
        self.query.offset = offset

    sparql2sql = SPARQL2SQL(query, self.mappings, self.datasource, self.rdfmts, self.star)
    sqlquery, projvartocols, coltotemplates, filenametablename = sparql2sql.translate()

    if self.spark is None:
        params = self.params
        start = time()
        self.spark = SparkSession.builder.master('local[*]') \
            .appName("OntarioSparkWrapper" + str(self.datasource.url) + query)
        for p in params:
            self.spark = self.spark.config(p, params[p])
        self.spark = self.spark.getOrCreate()
        logger.info("SPARK Initialization cost:" + str(time() - start))

    start = time()
    # Register every source file as a temporary view under its table name.
    for filename, tablename in filenametablename.items():
        if self.datasource.dstype == DataSourceType.LOCAL_JSON or \
                self.datasource.dstype == DataSourceType.SPARK_JSON:
            df = self.spark.read.json(filename)
        elif self.datasource.dstype == DataSourceType.HADOOP_JSON:
            filename = "hdfs://node3.research.tib.eu:9000" + filename
            df = self.spark.read.json(filename)
        elif self.datasource.dstype == DataSourceType.HADOOP_TSV or \
                self.datasource.dstype == DataSourceType.HADOOP_CSV:
            filename = "hdfs://node3.research.tib.eu:9000" + filename
            df = self.spark.read.csv(filename, inferSchema=True,
                                     sep='\t' if self.datasource.dstype == DataSourceType.HADOOP_TSV else ',',
                                     header=True)
        else:
            df = self.spark.read.csv(filename, inferSchema=True,
                                     sep='\t' if self.datasource.dstype == DataSourceType.LOCAL_TSV
                                         or self.datasource.dstype == DataSourceType.SPARK_TSV else ',',
                                     header=True)
        df.createOrReplaceTempView(tablename)
    logger.info("time for reading file" + str(filenametablename) + str(time() - start))

    totalres = 0
    try:
        runstart = time()
        if isinstance(sqlquery, list):  # and len(sqlquery) > 3:
            sqlquery = " UNION ".join(sqlquery)
        # This branch is always False after the unconditional join above.
        if isinstance(sqlquery, list):
            logger.info(" UNION ".join(sqlquery))
            res_dict = []
            for sql in sqlquery:
                cardinality = self.process_result(sql, queue, projvartocols,
                                                  coltotemplates, res_dict)
                totalres += cardinality
        else:
            logger.info(sqlquery)
            cardinality = self.process_result(sqlquery, queue, projvartocols, coltotemplates)
            totalres += cardinality
        logger.info("Exec in SPARK took:" + str(time() - runstart))
    except Exception as e:
        print("Exception ", e)
    queue.put("EOF")
    self.spark.stop()
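# Why the signatures above use `queue=None` rather than `queue=Queue()`:
# Python evaluates default arguments once, at function definition time, so a
# Queue() default is silently shared by every call that omits the argument.
# A tiny self-contained demonstration:
from multiprocessing import Queue

def shared_default(item, q=Queue()):     # one Queue, created at def time
    q.put(item)
    return q

def fresh_default(item, q=None):         # a new Queue per call when omitted
    if q is None:
        q = Queue()
    q.put(item)
    return q

assert shared_default(1) is shared_default(2)   # same object both times
assert fresh_default(1) is not fresh_default(2)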