def load_queries(self):
        if not os.path.exists(DIRECTORY):
            os.makedirs(DIRECTORY)

        data_split = int(TOTAL_QUERY*0.6)
        validation_split = int(TOTAL_QUERY*0.2)
        test_split = int(TOTAL_QUERY*0.2)
        print "data_split", data_split
        print "validation_split", validation_split
        print "test_split", test_split

        f = open(DBPEDIA_QUERY_LOG,'rb')
        fq = open(DIRECTORY+"x_query.txt",'w')
        ft = open(DIRECTORY+"y_time.txt",'w')
        ff = open(DIRECTORY+"x_features.txt",'w')
        x_f_csv = csv.writer(ff)
        sparql = SPARQLWrapper(DBPEDIA_ENDPOINT)
        f_extractor = FeatureExtractor()
        
        
        
        sw1 = StopWatch()
        sw2 = StopWatch()
        print_log_split = int(TOTAL_QUERY/10)
        
      

        count =0 
        for line in f:
            if count%print_log_split==0:
                print count," queries processed in ",sw2.elapsed_seconds()," seconds"
        
            if(count>=TOTAL_QUERY):
                break

            if count == data_split:
                fq.close()
                ft.close()
                ff.close()
                fq = open(DIRECTORY+"xval_query.txt",'w')
                ft = open(DIRECTORY+"yval_time.txt",'w')
                ff = open(DIRECTORY+"xval_features.txt",'w')
                x_f_csv = csv.writer(ff)
            elif count == (data_split+validation_split):
                fq.close()
                ft.close()
                ff.close()
                fq = open(DIRECTORY+"xtest_query.txt",'w')
                ft = open(DIRECTORY+"ytest_time.txt",'w')
                ff = open(DIRECTORY+"xtest_features.txt",'w')
                x_f_csv = csv.writer(ff)


            try:
                row = line.split()
                query_log = row[6][1:-1]
                #print query_log
                par = urlparse.parse_qs(urlparse.urlparse(query_log).query)
                #util.url_decode(row[6])
                sparql_query = par['query'][0]

                if sparql._parseQueryType(sparql_query) != SELECT:
                    continue

                #print sparql_query
                #print row

                sparql_query = f_extractor.get_dbp_sparql(sparql_query)

                #print sparql_query
                


                feature_vector = f_extractor.get_features(sparql_query)

                if feature_vector == None:
                    print "feature vector not found"
                    continue



                sparql.setQuery(sparql_query)
                sparql.setReturnFormat(JSON)

                sw1.reset()
                results = sparql.query().convert()
                elapsed = sw1.elapsed_milliseconds()

                result_rows = len(results["results"]["bindings"])

                # if result_rows == 0:
                #     continue

                # print "QUERY =", sparql_query
                # print "feature vector:",feature_vector
                # print elapsed, "seconds"                
                # print results
                # print "rows", result_rows
                # print "-----------------------"

                fq.write(query_log+'\n')
                ft.write(str(elapsed)+'\n')
                x_f_csv.writerow(feature_vector)
                count += 1
            except Exception as inst:
                print "Exception", inst

            
        
        f.close()
        fq.close()
        ft.close()
        ff.close()
        print count, "queries processed"
# Example #2
class SPARQLEndpoint:
    """Convenience wrapper around a triple-store SPARQL endpoint.

    Combines a SPARQLWrapper instance (for GET/POST query execution) with
    direct HTTP access via ``requests``, configured from the "standard"
    command-line arguments used by the surrounding utilities.
    """

    sparql_endpoint: SPARQLWrapper

    def __init__(self, args=None):
        """
        Create a SPARQL Endpoint (an instance of the class SPARQLWrapper) given the "standard"
        command line params that we're using for most command line utilities

        :param args: the CLI args, see set_cli_params
        """
        # NOTE(review): the args=None default would crash on args.verbose
        # below; callers appear to always pass a parsed namespace — confirm
        # before relying on the default.
        self.args = args
        self.verbose = args.verbose
        self.data_source_code = args.data_source_code
        self.endpoint_base = args.sparql_endpoint
        self.database = args.sparql_endpoint_database
        # One wrapper serves both the /query and /update endpoints of the database.
        self.sparql_endpoint = SPARQLWrapper(
            endpoint=f"{self.endpoint_url()}/query",
            updateEndpoint=f"{self.endpoint_url()}/update",
            returnFormat=JSON,
        )
        self.sparql_endpoint.setCredentials(args.sparql_endpoint_userid,
                                            passwd=args.sparql_endpoint_passwd)
        # self.s3_endpoint.addDefaultGraph(graph_iri_for_dataset(self.data_source_code))
        # self.s3_endpoint.setUseKeepAlive()

    def endpoint_url(self) -> str:
        """Base URL of the database: ``<endpoint_base>/<database>``."""
        return f"{self.endpoint_base}/{self.database}"

    def endpoint_url_for_queries(self) -> str:
        """Query URL of the database: ``<endpoint_base>/<database>/query``."""
        return f"{self.endpoint_base}/{self.database}/query"

    def user_id(self):
        """User id configured on the underlying SPARQLWrapper."""
        return self.sparql_endpoint.user

    def password(self):
        """Password configured on the underlying SPARQLWrapper."""
        return self.sparql_endpoint.passwd

    def execute_sparql_select_query(self,
                                    sparql_statement,
                                    mime_type=MIME_CSV):
        """Execute a SELECT through the SPARQLWrapper using GET with
        URL-encoded parameters, requesting results in ``mime_type``.

        :param sparql_statement: the SPARQL SELECT text
        :param mime_type: result media type (validated first)
        :return: the SPARQLWrapper result, or None on failure (see _execute_query)
        """
        check_sparql_mime_type(mime_type)

        if self.verbose:
            log_item("Executing", sparql_statement)
        # noinspection PyProtectedMember
        log_item("Statement Type",
                 self.sparql_endpoint._parseQueryType(sparql_statement))
        # Replace any Accept header left over from a previous call.
        self.sparql_endpoint.clearCustomHttpHeader("Accept")
        self.sparql_endpoint.setRequestMethod(URLENCODED)
        self.sparql_endpoint.setMethod(GET)
        self.sparql_endpoint.addCustomHttpHeader("Accept", mime_type)
        log_item("Query", sparql_statement.lstrip())
        self.sparql_endpoint.setQuery(sparql_statement.lstrip())
        # Build the request only to log its headers/URL for diagnostics;
        # _execute_query() performs the actual call.
        # noinspection PyProtectedMember
        request = self.sparql_endpoint._createRequest()
        for (header_name, header_value) in request.header_items():
            log_item(header_name, header_value)
        log_item("Is Update", self.sparql_endpoint.isSparqlUpdateRequest())
        log_item("Full URL", request.full_url)
        return self._execute_query()

    def execute_csv_query(self, sparql_statement: str):
        """Shorthand for execute_sparql_query2 with the default CSV mime type."""
        return self.execute_sparql_query2(sparql_statement)

    def execute_sparql_query2(
            self,
            sparql_statement,
            graph_iri: Optional[str] = None,
            mime: str = MIME_CSV) -> Optional[SPARQLResponse]:
        """Execute a query by POSTing the raw statement directly with
        ``requests`` (bypassing SPARQLWrapper), streaming the response.

        :param sparql_statement: the SPARQL text, sent as the POST body
        :param graph_iri: optional named graph to restrict the query to
        :param mime: requested result media type
        :return: a SPARQLResponse on HTTP 200, else None
        """
        if self.verbose:
            log_item("Executing", sparql_statement)

        # noinspection PyProtectedMember
        log_item("Statement Type",
                 self.sparql_endpoint._parseQueryType(sparql_statement))

        endpoint_url = self.endpoint_url_for_queries()

        log_item("SPARQL Endpoint", endpoint_url)
        log_item("Accept Mime Type", mime)

        params = {
            "timeout": 10000,  # ms
            "limit": 10000,
            "charset": "utf-8"
        }
        if graph_iri:
            params["graph"] = graph_iri
        params["reasoner"] = "true"
        log_item("Params", params)
        #
        # Using this method: https://www.w3.org/TR/sparql11-protocol/#query-via-post-direct
        #
        r = requests.post(endpoint_url,
                          data=sparql_statement,
                          auth=(self.user_id(), self.password()),
                          params=params,
                          headers={
                              'Accept': mime,
                              'Accept-Encoding': '*;q=0, identity;q=1',
                              'Accept-Charset': '*;q=0, utf-8;q=1',
                              # BUG FIX: was 'application/sparql_endpoint-query'
                              # (a find/replace artifact). The "query via POST
                              # directly" protocol cited above requires the
                              # media type application/sparql-query.
                              'Content-type':
                              'application/sparql-query'
                          },
                          stream=True)
        if r.status_code == 200:
            return SPARQLResponse(self, r, mime=mime)
        log_item('HTTP Status', r.status_code)
        return None

    def execute_sparql_statement(self, sparql_statement):
        """Execute a (possibly updating) statement through the SPARQLWrapper
        using POST with URL-encoded parameters and reasoning enabled.

        :return: the SPARQLWrapper result, or None on failure
        """
        if self.verbose:
            log_item("Executing", sparql_statement)
        # noinspection PyProtectedMember
        statement_type = self.sparql_endpoint._parseQueryType(sparql_statement)
        log_item("Statement Type", statement_type)
        self.sparql_endpoint.clearCustomHttpHeader("Accept")
        self.sparql_endpoint.setMethod(POST)
        self.sparql_endpoint.setRequestMethod(URLENCODED)
        # NOTE(review): "text/boolean" suggests this targets ASK-style results
        # on a Stardog-like store — confirm against the server's content types.
        self.sparql_endpoint.addCustomHttpHeader("Accept", "text/boolean")
        self.sparql_endpoint.addParameter("reasoner", "true")
        log_item("Query", sparql_statement)
        self.sparql_endpoint.setQuery(sparql_statement)
        # Build the request only to log its headers/URL for diagnostics.
        request = self.sparql_endpoint._createRequest()
        for (header_name, header_value) in request.header_items():
            log_item(header_name, header_value)
        log_item("Is Update", self.sparql_endpoint.isSparqlUpdateRequest())
        log_item("Full URL", request.full_url)
        return self._execute_query()

    def execute_construct(self,
                          sparql_construct_statement: str) -> Optional[Graph]:

        """Execute a CONSTRUCT query via GET, returning RDF/XML so the
        result can be parsed into a Graph by the caller."""
        self.sparql_endpoint.clearCustomHttpHeader("Accept")
        self.sparql_endpoint.setMethod(GET)
        self.sparql_endpoint.setReturnFormat(
            RDFXML)  # the call to convert() below depends on this being RDFXML
        self.sparql_endpoint.setRequestMethod(URLENCODED)
        self.sparql_endpoint.addParameter("reasoner", "true")
        #
        # timeout higher than triple store time out
        # self.sparql_endpoint.setTimeout(10)
        #
        # millisecs. let triple store fail first so timeout earlier than HTTP
        # self.sparql_endpoint.addParameter("timeout", "2000")
        log_item("Query", sparql_construct_statement)
        self.sparql_endpoint.setQuery(sparql_construct_statement)
        # noinspection PyProtectedMember
        request = self.sparql_endpoint._createRequest()
        for (header_name, header_value) in request.header_items():
            log_item(header_name, header_value)
        return self._execute_query()

    def _execute_query(self):
        """Run the currently configured SPARQLWrapper query.

        Logs the response code and headers; returns the result on HTTP
        200/201, and None on any handled error (logged, not raised).
        """
        try:
            result = self.sparql_endpoint.query()
            response = result.response
            log_item("Response Code", result.response.code)
            for (key, value) in result.response.info().items():
                log_item(key, value)
            for (key, value) in result.info().items():
                log_item(key, value)
            if result.response.code in (200, 201):
                return result
        except urllib.error.HTTPError as err:
            error("{} code={}".format(err, err.code))
        except urllib.error.URLError as err:
            error("{} reason={}".format(err, err.reason))
        except EndPointNotFound as err:
            error("{}".format(err))
            dump(err)
        except QueryBadFormed:
            error(
                f"Bad formed SPARQL statement: {self.sparql_endpoint.queryString}"
            )
        except Unauthorized:
            error("Unauthorized to access {}".format(
                self.sparql_endpoint.endpoint))
        except ConnectionRefusedError:
            error("Could not connect to {}".format(
                self.sparql_endpoint.endpoint))
        return None

    def handle_error(self, r: requests.Response) -> bool:
        """Log a raw HTTP response and report whether it is OK.

        Checks vendor-specific error headers first, then the status code.
        :return: True for HTTP 200/201 with no vendor error, else False
        """
        log_item("URL", r.url)
        for key, value in r.headers.items():
            log_item(key, value)
        if not self._handle_stardog_error(r):
            return False
        if not self._handle_ontotext_error(r):
            return False
        log_item("HTTP Status", r.status_code)
        # Only 200 (OK) and 201 (Created) count as success.
        return r.status_code in (200, 201)

    def _handle_stardog_error(self, r: requests.Response) -> bool:
        """
        In case we detect the response header SD-Error-Code we know it's a Stardog server.
        Then handle the various errors Stardog can give.
        """
        stardog_error = r.headers.get('SD-Error-Code')
        if not stardog_error:
            return True
        if stardog_error == "UnknownDatabase":
            log_error(f"The database {self.database} does not exist")
            return False
        # Unknown Stardog error codes are logged but treated as non-fatal.
        warning(f"Encountered unknown Stardog error {stardog_error}")
        return True

    # noinspection PyMethodMayBeStatic
    def _handle_ontotext_error(self, r: requests.Response) -> bool:  # noqa
        """Placeholder for Ontotext-specific error detection; always OK."""
        return True
# Example #3
class PySPARQLWrapper:
    """Wrapper that queries a SPARQL endpoint and hands the results back
    as Spark DataFrames or GraphFrames.

    :param spark: An existing spark session.
    :type spark: :class:`pyspark.sql.SparkSession`
    :param sparql_endpoint: The SPARQL endpoint to be queried.
    :type sparql_endpoint: string
    """

    def __init__(self, spark, sparql_endpoint):
        """Keep the Spark session and build the underlying SPARQLWrapper."""
        self.__session = spark
        self.__wrapper = SPARQLWrapper(sparql_endpoint)

    def query(self, query):
        """Run *query* against the SPARQL endpoint, dispatching on its type.

        :param query: A string representing the SPARQL query to be executed
        :type query: string
        :raises Exception: when the query type is neither SELECT nor CONSTRUCT
        :rtype: :class:`PySPARQL.SelectResult.PySPARQLSelectResult` or
            :class:`PySPARQL.ConstructResult.PySPARQLConstructResult`
        """
        query_type = self.__wrapper._parseQueryType(query)

        # Dispatch table instead of an if/elif chain.
        handler = {SELECT: self.select, CONSTRUCT: self.construct}.get(query_type)
        if handler is None:
            raise Exception("{} query type not supported!".format(query_type))
        return handler(query)

    def select(self, query):
        """Run a `select` query and wrap the CSV result for Spark.

        :param query: A string representing the `select` SPARQL query
            to be executed
        :type query: string
        :rtype: :class:`PySPARQL.SelectResult.PySPARQLSelectResult`
        """
        wrapper = self.__wrapper
        wrapper.resetQuery()
        wrapper.setReturnFormat(CSV)
        wrapper.setQuery(query)
        raw = wrapper.query().convert()
        return PySPARQLSelectResult(self.__session, raw)

    def construct(self, query):
        """Run a `construct` query and wrap the JSON-LD result for Spark.

        :param query: A string representing the `construct` SPARQL query
            to be executed
        :type query: string
        :rtype: :class:`PySPARQL.ConstructResult.PySPARQLConstructResult`
        """
        wrapper = self.__wrapper
        wrapper.resetQuery()
        wrapper.setReturnFormat(JSONLD)
        wrapper.setQuery(query)
        raw = wrapper.query().convert()
        return PySPARQLConstructResult(self.__session, raw)