def load_queries(self):
    # NOTE(review): this block is Python 2 (`print` statements) unlike the
    # Python 3 classes elsewhere in this file -- it appears to come from a
    # separate project and must run under a py2 interpreter.
    """Build train/validation/test datasets from a DBpedia query log.

    Reads DBPEDIA_QUERY_LOG line by line, extracts each SPARQL SELECT query
    from the logged request URL, computes its feature vector, executes it
    against DBPEDIA_ENDPOINT, and records:
      * x_query.txt    -- the raw logged query URL
      * y_time.txt     -- wall-clock execution time in milliseconds
      * x_features.txt -- the feature vector as a CSV row
    The first 60% of successfully processed queries go to the training files,
    the next 20% to the *val* files, and the remainder (until TOTAL_QUERY) to
    the *test* files.
    """
    if not os.path.exists(DIRECTORY):
        os.makedirs(DIRECTORY)
    # Split sizes are counted in *successfully processed* queries, since
    # `count` is only incremented after a query is fully written out.
    data_split = int(TOTAL_QUERY*0.6)
    validation_split = int(TOTAL_QUERY*0.2)
    test_split = int(TOTAL_QUERY*0.2)  # informational only; the test set is "the rest"
    print "data_split", data_split
    print "validation_split", validation_split
    print "test_split", test_split
    f = open(DBPEDIA_QUERY_LOG,'rb')
    # Output handles start on the training files and are rotated to the
    # validation/test files when `count` crosses the split boundaries below.
    fq = open(DIRECTORY+"x_query.txt",'w')
    ft = open(DIRECTORY+"y_time.txt",'w')
    ff = open(DIRECTORY+"x_features.txt",'w')
    x_f_csv = csv.writer(ff)
    sparql = SPARQLWrapper(DBPEDIA_ENDPOINT)
    f_extractor = FeatureExtractor()
    sw1 = StopWatch()  # times each individual query
    sw2 = StopWatch()  # times the whole run, for progress reporting
    # NOTE(review): TOTAL_QUERY < 10 makes this 0 and the modulo below raises
    # ZeroDivisionError -- presumably TOTAL_QUERY is always large; confirm.
    print_log_split = int(TOTAL_QUERY/10)
    count =0
    for line in f:
        if count%print_log_split==0:
            print count," queries processed in ",sw2.elapsed_seconds()," seconds"
        if(count>=TOTAL_QUERY):
            break
        # Rotate output files at the train->validation boundary ...
        if count == data_split:
            fq.close()
            ft.close()
            ff.close()
            fq = open(DIRECTORY+"xval_query.txt",'w')
            ft = open(DIRECTORY+"yval_time.txt",'w')
            ff = open(DIRECTORY+"xval_features.txt",'w')
            x_f_csv = csv.writer(ff)
        # ... and at the validation->test boundary.
        elif count == (data_split+validation_split):
            fq.close()
            ft.close()
            ff.close()
            fq = open(DIRECTORY+"xtest_query.txt",'w')
            ft = open(DIRECTORY+"ytest_time.txt",'w')
            ff = open(DIRECTORY+"xtest_features.txt",'w')
            x_f_csv = csv.writer(ff)
        # Any failure (malformed log line, missing 'query' param, endpoint
        # error) skips the line without incrementing `count`.
        try:
            # Assumes an access-log-like format where field 7 is the quoted
            # request URL; [1:-1] strips the surrounding quotes -- TODO confirm
            # against the actual DBPEDIA_QUERY_LOG format.
            row = line.split()
            query_log = row[6][1:-1]
            par = urlparse.parse_qs(urlparse.urlparse(query_log).query)
            sparql_query = par['query'][0]
            # Only SELECT queries are of interest for timing/feature data.
            if sparql._parseQueryType(sparql_query) != SELECT:
                continue
            sparql_query = f_extractor.get_dbp_sparql(sparql_query)
            feature_vector = f_extractor.get_features(sparql_query)
            if feature_vector == None:
                print "feature vector not found"
                continue
            # Execute the query for real and time it.
            sparql.setQuery(sparql_query)
            sparql.setReturnFormat(JSON)
            sw1.reset()
            results = sparql.query().convert()
            elapsed = sw1.elapsed_milliseconds()
            result_rows = len(results["results"]["bindings"])
            fq.write(query_log+'\n')
            ft.write(str(elapsed)+'\n')
            x_f_csv.writerow(feature_vector)
            count += 1
        except Exception as inst:
            # Best-effort processing: log and move on to the next log line.
            print "Exception", inst
    f.close()
    fq.close()
    ft.close()
    ff.close()
    print count, "queries processed"
class SPARQLEndpoint:
    """Wrapper around a SPARQLWrapper-backed SPARQL endpoint.

    Configured from CLI args, it exposes helpers to run SELECT / CONSTRUCT /
    update statements against `{endpoint_base}/{database}/query` (and
    `/update`), with verbose request/response logging via `log_item`.
    The Stardog-specific `reasoner=true` parameter and error headers are
    handled where relevant.
    """

    # SPARQLWrapper instance shared by all execute_* methods.
    sparql_endpoint: SPARQLWrapper

    def __init__(self, args=None):
        """
        Create a SPARQL Endpoint (an instance of the class SPARQLWrapper)
        given the "standard" command line params that we're using for most
        command line utilities.

        :param args: the CLI args, see set_cli_params.
            NOTE(review): the default of None cannot actually work -- the
            attribute accesses below would raise AttributeError; callers
            evidently always pass args.
        """
        self.args = args
        self.verbose = args.verbose
        self.data_source_code = args.data_source_code
        self.endpoint_base = args.sparql_endpoint
        self.database = args.sparql_endpoint_database
        self.sparql_endpoint = SPARQLWrapper(
            endpoint=f"{self.endpoint_url()}/query",
            updateEndpoint=f"{self.endpoint_url()}/update",
            returnFormat=JSON,
        )
        self.sparql_endpoint.setCredentials(
            args.sparql_endpoint_userid, passwd=args.sparql_endpoint_passwd)
        # self.s3_endpoint.addDefaultGraph(graph_iri_for_dataset(self.data_source_code))
        # self.s3_endpoint.setUseKeepAlive()

    def endpoint_url(self) -> str:
        """Base URL of the database: {endpoint_base}/{database}."""
        return f"{self.endpoint_base}/{self.database}"

    def endpoint_url_for_queries(self) -> str:
        """Query URL of the database: {endpoint_base}/{database}/query."""
        return f"{self.endpoint_base}/{self.database}/query"

    def user_id(self):
        """User id as stored on the underlying SPARQLWrapper."""
        return self.sparql_endpoint.user

    def password(self):
        """Password as stored on the underlying SPARQLWrapper."""
        return self.sparql_endpoint.passwd

    def execute_sparql_select_query(self, sparql_statement, mime_type=MIME_CSV):
        """Execute a SELECT via GET (urlencoded) and return the QueryResult.

        :param sparql_statement: the SPARQL SELECT statement
        :param mime_type: Accept mime type, validated by check_sparql_mime_type
        :return: SPARQLWrapper QueryResult, or None on error (see _execute_query)
        """
        check_sparql_mime_type(mime_type)
        if self.verbose:
            log_item("Executing", sparql_statement)
            # noinspection PyProtectedMember
            log_item("Statement Type",
                     self.sparql_endpoint._parseQueryType(sparql_statement))
        self.sparql_endpoint.clearCustomHttpHeader("Accept")
        self.sparql_endpoint.setRequestMethod(URLENCODED)
        self.sparql_endpoint.setMethod(GET)
        self.sparql_endpoint.addCustomHttpHeader("Accept", mime_type)
        log_item("Query", sparql_statement.lstrip())
        self.sparql_endpoint.setQuery(sparql_statement.lstrip())
        # Log exactly what would go over the wire before executing.
        # noinspection PyProtectedMember
        request = self.sparql_endpoint._createRequest()
        for (header_name, header_value) in request.header_items():
            log_item(header_name, header_value)
        log_item("Is Update", self.sparql_endpoint.isSparqlUpdateRequest())
        log_item("Full URL", request.full_url)
        # with urllib.request.urlopen(request) as f:
        #     print(f.read().decode('utf-8'))
        return self._execute_query()

    def execute_csv_query(self, sparql_statement: str):
        """Convenience alias: run the statement expecting CSV (the default mime)."""
        return self.execute_sparql_query2(sparql_statement)

    def execute_sparql_query2(
            self, sparql_statement, graph_iri: str = None,
            mime: str = MIME_CSV) -> Optional[SPARQLResponse]:
        """Execute a query via requests.post, streaming the response.

        Uses "query via POST directly" per the SPARQL 1.1 Protocol:
        https://www.w3.org/TR/sparql11-protocol/#query-via-post-direct

        :param sparql_statement: the SPARQL statement (sent as the POST body)
        :param graph_iri: optional graph to restrict the query to
        :param mime: Accept mime type for the result
        :return: a SPARQLResponse on HTTP 200, otherwise None
        """
        if self.verbose:
            log_item("Executing", sparql_statement)
            # noinspection PyProtectedMember
            log_item("Statement Type",
                     self.sparql_endpoint._parseQueryType(sparql_statement))
        endpoint_url = self.endpoint_url_for_queries()
        log_item("SPARQL Endpoint", endpoint_url)
        log_item("Accept Mime Type", mime)
        params = {
            "timeout": 10000,  # ms
            "limit": 10000,
            "charset": "utf-8"
        }
        if graph_iri:
            params["graph"] = graph_iri
        params["reasoner"] = "true"
        log_item("Params", params)
        r = requests.post(
            endpoint_url,
            data=sparql_statement,
            auth=(self.user_id(), self.password()),
            params=params,
            headers={
                'Accept': mime,
                'Accept-Encoding': '*;q=0, identity;q=1',
                'Accept-Charset': '*;q=0, utf-8;q=1',
                # BUG FIX: was 'application/sparql_endpoint-query', an
                # accidental rename of the standard SPARQL 1.1 Protocol
                # media type required for query-via-POST-direct.
                'Content-type': 'application/sparql-query'
            },
            stream=True)
        if r.status_code == 200:
            return SPARQLResponse(self, r, mime=mime)
        log_item('HTTP Status', r.status_code)
        return None

    def execute_sparql_statement(self, sparql_statement):
        """Execute an (update-style) statement via POST, urlencoded.

        Expects a boolean-ish response ("Accept: text/boolean") and enables
        the Stardog reasoner.

        :return: SPARQLWrapper QueryResult, or None on error (see _execute_query)
        """
        if self.verbose:
            log_item("Executing", sparql_statement)
            # noinspection PyProtectedMember
            statement_type = self.sparql_endpoint._parseQueryType(sparql_statement)
            log_item("Statement Type", statement_type)
        self.sparql_endpoint.clearCustomHttpHeader("Accept")
        self.sparql_endpoint.setMethod(POST)
        self.sparql_endpoint.setRequestMethod(URLENCODED)
        self.sparql_endpoint.addCustomHttpHeader("Accept", "text/boolean")
        self.sparql_endpoint.addParameter("reasoner", "true")
        log_item("Query", sparql_statement)
        self.sparql_endpoint.setQuery(sparql_statement)
        # noinspection PyProtectedMember
        request = self.sparql_endpoint._createRequest()
        for (header_name, header_value) in request.header_items():
            log_item(header_name, header_value)
        log_item("Is Update", self.sparql_endpoint.isSparqlUpdateRequest())
        log_item("Full URL", request.full_url)
        # with urllib.request.urlopen(request) as f:
        #     print(f.read().decode('utf-8'))
        return self._execute_query()

    def execute_construct(self, sparql_construct_statement: str) -> Optional[Graph]:
        """Execute a CONSTRUCT via GET, expecting RDF/XML back.

        :return: QueryResult (convertible to a Graph), or None on error
        """
        self.sparql_endpoint.clearCustomHttpHeader("Accept")
        self.sparql_endpoint.setMethod(GET)
        # the call to convert() below depends on this being RDFXML
        self.sparql_endpoint.setReturnFormat(RDFXML)
        self.sparql_endpoint.setRequestMethod(URLENCODED)
        self.sparql_endpoint.addParameter("reasoner", "true")
        # # timeout higher than triple store time out
        # self.sparql_endpoint.setTimeout(10)
        # # millisecs. let triple store fail first so timeout earlier than HTTP
        # self.sparql_endpoint.addParameter("timeout", "2000")
        log_item("Query", sparql_construct_statement)
        self.sparql_endpoint.setQuery(sparql_construct_statement)
        # noinspection PyProtectedMember
        request = self.sparql_endpoint._createRequest()
        for (header_name, header_value) in request.header_items():
            log_item(header_name, header_value)
        return self._execute_query()

    def _execute_query(self):
        """Run the prepared query, logging response headers.

        :return: the QueryResult on HTTP 200/201, None on any error
            (HTTP errors, endpoint not found, bad SPARQL, auth failure,
            connection refused) -- each error is logged via error()/dump().
        """
        try:
            result = self.sparql_endpoint.query()
            log_item("Response Code", result.response.code)
            # NOTE(review): these two loops appear to log the same header set
            # twice (result.info() wraps result.response.info()); kept as-is.
            for (key, value) in result.response.info().items():
                log_item(key, value)
            for (key, value) in result.info().items():
                log_item(key, value)
            if result.response.code in (200, 201):
                return result
        except urllib.error.HTTPError as err:
            error("{} code={}".format(err, err.code))
        except urllib.error.URLError as err:
            error("{} reason={}".format(err, err.reason))
        except EndPointNotFound as err:
            error("{}".format(err))
            dump(err)
        except QueryBadFormed:
            error(
                f"Bad formed SPARQL statement: {self.sparql_endpoint.queryString}"
            )
        except Unauthorized:
            error("Unauthorized to access {}".format(
                self.sparql_endpoint.endpoint))
        except ConnectionRefusedError:
            error("Could not connect to {}".format(
                self.sparql_endpoint.endpoint))
        return None

    def handle_error(self, r: requests.Response) -> bool:
        """Log the response and return True iff it represents success (200/201).

        Vendor-specific error headers (Stardog, Ontotext) are checked first.
        """
        log_item("URL", r.url)
        for key, value in r.headers.items():
            log_item(key, value)
        if not self._handle_stardog_error(r):
            return False
        if not self._handle_ontotext_error(r):
            return False
        log_item("HTTP Status", r.status_code)
        if r.status_code == 200:
            return True
        if r.status_code == 201:
            return True
        return False

    def _handle_stardog_error(self, r: requests.Response) -> bool:
        """
        In case we detect the response header SD-Error-Code we know it's a
        Stardog server. Then handle the various errors Stardog can give.

        :return: False for a fatal, recognised Stardog error; True otherwise.
        """
        stardog_error = r.headers.get('SD-Error-Code')
        if not stardog_error:
            return True
        if stardog_error == "UnknownDatabase":
            log_error(f"The database {self.database} does not exist")
            return False
        warning(f"Encountered unknown Stardog error {stardog_error}")
        return True

    # noinspection PyMethodMayBeStatic
    def _handle_ontotext_error(self, r: requests.Response) -> bool:  # noqa
        """Placeholder: no Ontotext-specific error handling yet."""
        return True
class PySPARQLWrapper:
    """This is a wrapper class that allows to query a SPARQL endpoint and
    process the results as a Spark DataFrame or as a GraphFrame.

    :param spark: An existing spark session.
    :type spark: :class:`pyspark.sql.SparkSession`
    :param sparql_endpoint: The SPARQL endpoint to be queried.
    :type sparql_endpoint: string
    """

    def __init__(self, spark, sparql_endpoint):
        """ Constructor """
        self.__spark = spark
        self.__SPARQLWrapper = SPARQLWrapper(sparql_endpoint)

    def query(self, query):
        """Executes the query against the SPARQL endpoint and, depending on
        the query type, returns a
        :class:`PySPARQL.SelectResult.PySPARQLSelectResult` or a
        :class:`PySPARQL.SelectResult.PySPARQLConstructResult`.

        :param query: A string representing the SPARQL query to be executed
        :type query: string
        :raises Exception: when the query type is not supported
        :rtype: :class:`PySPARQL.SelectResult.PySPARQLSelectResult` or
            :class:`PySPARQL.ConstructResult.PySPARQLConstructResult`
        """
        # Dispatch on the parsed query type; only SELECT and CONSTRUCT
        # are supported.
        query_type = self.__SPARQLWrapper._parseQueryType(query)
        if query_type == SELECT:
            return self.select(query)
        elif query_type == CONSTRUCT:
            return self.construct(query)
        else:
            raise Exception("{} query type not supported!".format(query_type))

    def select(self, query):
        """Executes the `select` query against the SPARQL endpoint.

        :param query: A string representing the `select` SPARQL query to be
            executed
        :type query: string
        :rtype: :class:`PySPARQL.SelectResult.PySPARQLSelectResult`
        """
        self.__SPARQLWrapper.resetQuery()
        # SELECT results are fetched as CSV for DataFrame conversion.
        self.__SPARQLWrapper.setReturnFormat(CSV)
        self.__SPARQLWrapper.setQuery(query)
        sparql_result = self.__SPARQLWrapper.query().convert()
        return PySPARQLSelectResult(self.__spark, sparql_result)

    def construct(self, query):
        """Executes the `construct` query against the SPARQL endpoint.

        :param query: A string representing the `construct` SPARQL query to be
            executed
        :type query: string
        :rtype: :class:`PySPARQL.ConstructResult.PySPARQLConstructResult`
        """
        self.__SPARQLWrapper.resetQuery()
        # CONSTRUCT results are fetched as JSON-LD for graph conversion.
        self.__SPARQLWrapper.setReturnFormat(JSONLD)
        self.__SPARQLWrapper.setQuery(query)
        sparql_result = self.__SPARQLWrapper.query().convert()
        return PySPARQLConstructResult(self.__spark, sparql_result)