def __init__(self,
                 url=None,
                 local_directory=None,
                 container_dir=None,
                 **kwargs):
        """Initializes the Elasticsearch connection settings.

        args:
            url: external Elasticsearch url; falls back to
                    ``self.default_url``
            local_directory: local data path; falls back to
                    ``CFG.LOCAL_DATA_PATH``
            container_dir: the directory as seen by the container

        kwargs:
            local_url: url to use from the local machine
            active(True): whether this connection is active
            delay_check: if truthy, skip the immediate status check

        raises:
            AttributeError: if no url could be determined
        """
        self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH)
        self.ext_url = pick(url, self.default_url)
        self.local_url = pick(kwargs.get('local_url'), self.default_url)
        self.url = None
        self.active = kwargs.get('active', True)

        # fail fast: validate the url BEFORE it is used below; previously
        # this check ran only after check_status and super().__init__ had
        # already been handed a None url
        if self.ext_url is None:
            msg = ["A Elasticsearch url must be defined. Either pass 'url'",
                   "or initialize the 'RdfConfigManager'"]
            raise AttributeError(" ".join(msg))

        if not kwargs.get('delay_check'):
            # NOTE(review): bare attribute access — presumably a property
            # that pings the server and sets self.url; confirm, otherwise
            # this line has no effect
            self.check_status
        if self.url:
            kwargs['es_url'] = self.url
        else:
            kwargs['es_url'] = self.ext_url
        super(Elastic, self).__init__(**kwargs)
        self.container_dir = container_dir
    def __init__(self,
                 url=None,
                 namespace=None,
                 namespace_params=None,
                 local_directory=None,
                 container_dir=None,
                 graph=None,
                 **kwargs):

        self.local_directory = pick(local_directory, CFG.dirs.data)
        self.ext_url = pick(url, self.default_url)
        self.local_url = pick(kwargs.get('local_url'), self.default_url)
        self.log_level = log.level
        log.setLevel(kwargs.get("log_level", log.level))
        self.namespace = pick(namespace, self.default_ns)
        self.namespace_params = namespace_params
        self.container_dir = container_dir
        self.graph = pick(graph, self.default_graph)
        self.url = None
        self.active = kwargs.get('active', True)

        if self.ext_url is None:
            msg = [
                "A Blazegraph url must be defined. Either pass 'url'",
                "or initialize the 'RdfConfigManager'"
            ]
            raise AttributeError(" ".join(msg))
        if not kwargs.get('delay_check'):
            self.check_status
        self.__set_mgr__(**kwargs)
    def __init__(self,
                 url=None,
                 local_directory=None,
                 container_dir=None,
                 **kwargs):

        self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH)
        self.ext_url = pick(url, self.default_url)
        self.local_url = pick(kwargs.get('local_url'), self.default_url)
        self.url = None
        self.active = kwargs.get('active', True)

        if not kwargs.get('delay_check'):
            self.check_status
        if self.url:
            kwargs['es_url'] = self.url
        else:
            kwargs['es_url'] = self.ext_url
        super(Elastic, self).__init__(**kwargs)
        self.container_dir = container_dir

        if self.ext_url is None:
            msg = [
                "A Elasticsearch url must be defined. Either pass 'url'",
                "or initialize the 'RdfConfigManager'"
            ]
            raise AttributeError(" ".join(msg))
    def create_namespace(self, namespace=None, params=None):
        """ Creates a namespace in the triplestore

        args:
            namespace: the name of the namespace to create
            params: Dictionary of Blazegraph parameters. defaults are:

                    {'axioms': 'com.bigdata.rdf.axioms.NoAxioms',
                     'geoSpatial': False,
                     'isolatableIndices': False,
                     'justify': False,
                     'quads': True,
                     'rdr': False,
                     'textIndex': False,
                     'truthMaintenance': False}

        returns:
            str: the Blazegraph response text on success

        raises:
            ReferenceError: if no namespace could be determined
            RuntimeError: if Blazegraph does not answer with HTTP 201
        """
        namespace = pick(namespace, self.namespace)
        params = pick(params, self.namespace_params)
        if not namespace:
            raise ReferenceError("No 'namespace' specified")
        # baseline Blazegraph properties; caller-supplied params override
        _params = {
            'axioms': 'com.bigdata.rdf.axioms.NoAxioms',
            'geoSpatial': False,
            'isolatableIndices': False,
            'justify': False,
            'namespace': namespace,
            'quads': True,
            'rdr': False,
            'textIndex': False,
            'truthMaintenance': False
        }
        if params:
            _params.update(params)
        content_type = "text/plain"
        # step 1: ask Blazegraph to turn the params into a java-properties
        # document via the prepareProperties REST call
        url = self._make_url("prepareProperties").replace("/sparql", "")
        # map internal keys to Blazegraph property names; json.dumps
        # lower-cases booleans and the quote-strip leaves bare values
        params = ["%s=%s" % (map_val,
                             json.dumps(_params[map_key]).replace("\"", "")) \
                  for map_key, map_val in self.ns_property_map.items()]
        params = "\n".join(params)
        result = requests.post(url=url,
                               headers={"Content-Type": content_type},
                               data=params)
        data = result.text
        content_type = "application/xml"
        # step 2: post the prepared properties document to the base
        # namespace endpoint ("x" is a placeholder stripped right back out)
        url = self._make_url("x").replace("/x/sparql", "")
        result = requests.post(url=url,
                               headers={"Content-Type": content_type},
                               data=data)
        if result.status_code == 201:
            log.warning(result.text)
            return result.text
        else:
            raise RuntimeError(result.text)
    def reset_namespace(self, namespace=None, params=None):
        """Deletes the namespace and then builds it again from scratch.

        args:
            namespace(str): namespace to rebuild; defaults to
                    self.namespace
            params(dict): creation parameters; defaults to
                    self.namespace_params
        """
        namespace = pick(namespace, self.namespace)
        params = pick(params, self.namespace_params)
        log.warning(" Reseting namespace '%s' at host: %s", namespace,
                    self.url)
        try:
            self.delete_namespace(namespace)
        except KeyError:
            # the namespace did not exist — nothing to delete
            pass
        self.create_namespace(namespace, params)
    def load_data(self,
                  data,
                  datatype="ttl",
                  namespace=None,
                  graph=None,
                  is_file=False,
                  **kwargs):
        """ loads data via file stream from python to triplestore

        Args:
          data: The data or filepath to load
          datatype(['ttl', 'xml', 'rdf']): the type of data to load
          namespace: the namespace to use
          graph: the graph to load the data to.
          is_file(False): If true python will read the data argument as a
              filepath, determine the datatype from the file extension,
              read the file and send it to blazegraph as a datastream

        Kwargs:
          debug(bool): if True, set logging to DEBUG for this call

        Returns:
          the rdflib parse result

        Raises:
          NotImplementedError: if the datatype is not supported
        """
        if kwargs.get('debug'):
            log.setLevel(logging.DEBUG)
        datatype_map = {
            'ttl': 'turtle',
            'xml': 'xml',
            'rdf': 'xml',
            'nt': 'nt',
            'n3': 'n3',
            'nquads': 'nquads',
            'hturtle': 'hturtle'
        }
        if is_file:
            datatype = data.split(os.path.extsep)[-1]
            file_name = data
            log.debug('starting data load of %s', file_name)
            # use a context manager so the handle is closed promptly
            # instead of leaking until garbage collection
            with open(data, 'rb') as file_obj:
                data = file_obj.read()
        try:
            content_type = datatype_map[datatype]
        except KeyError:
            # interpolate datatype into the message; passing it as a second
            # argument left it out of the rendered error text
            raise NotImplementedError("'%s' is not an implemented data format"
                                      % datatype)
        conn = self.conn
        if namespace:
            conn = self.tstore.get_namespace(namespace)
        else:
            namespace = self.namespace
        graph = pick(graph, self.graph)
        start = datetime.datetime.now()
        try:
            result = conn.parse(data=data, publicID=graph, format=content_type)
        except Exception:
            # identify the offending file before re-raising
            if is_file:
                print("Datafile ", file_name)
            raise
        if is_file:
            log.info(" loaded %s into rdflib namespace '%s'", file_name,
                     namespace)
        else:
            log.info(" loaded data into rdflib namespace '%s' in time: %s",
                     namespace, (datetime.datetime.now() - start))
        # previously the result was only returned on the non-file path;
        # return it consistently for both
        return result
    def load_local_file(self, file_path, namespace=None, graph=None, **kwargs):
        """ Uploads data to the Blazegraph Triplestore that is stored in files
            in directory that is available locally to blazegraph

            args:
                file_path: full path to the file
                namespace: the Blazegraph namespace to load the data
                graph: uri of the graph to load the data. Default is None

            kwargs:
                container_dir: the directory as seen by blazegraph - defaults to
                        instance attribute if not passed

            returns:
                the requests response object

            raises:
                SyntaxError: if blazegraph answers with a >300 status code
        """
        time_start = datetime.datetime.now()
        url = self._make_url(namespace)
        params = {}
        if graph:
            params['context-uri'] = graph
        new_path = []
        container_dir = pick(kwargs.get('container_dir'), self.container_dir)
        if container_dir:
            # bug fix: use the picked value so that a container_dir passed
            # via kwargs is honored instead of always self.container_dir
            new_path.append(container_dir)
        new_path.append(file_path)
        params['uri'] = "file:///%s" % os.path.join(*new_path)
        log.debug(" loading %s into blazegraph", file_path)
        result = requests.post(url=url, params=params)
        if result.status_code > 300:
            raise SyntaxError(result.text)
        log.info("loaded '%s' in time: %s blazegraph response: %s", file_path,
                 datetime.datetime.now() - time_start,
                 self.format_response(result.text))
        return result
    def load_data(self,
                  data,
                  datatype="ttl",
                  namespace=None,
                  graph=None,
                  is_file=False,
                  **kwargs):
        """
        Loads data via file stream from python to triplestore

        Args:
        -----
          data: The data or filepath to load
          datatype(['ttl', 'xml', 'rdf']): the type of data to load
          namespace: the namespace to use
          graph: the graph to load the data to.
          is_file(False): If true python will read the data argument as a
              filepath, determine the datatype from the file extension,
              read the file and send it to blazegraph as a datastream

        Kwargs:
        -------
          log_level: logging level to use for this call

        Returns:
          the requests response object

        Raises:
          NotImplementedError: if the datatype is not supported
          SyntaxError: if blazegraph rejects the data
        """
        log.setLevel(kwargs.get("log_level", self.log_level))
        time_start = datetime.datetime.now()
        datatype_map = {
            'ttl': 'text/turtle',
            'xml': 'application/rdf+xml',
            'rdf': 'application/rdf+xml',
            'nt': 'text/plain'
        }
        if is_file:
            datatype = data.split(os.path.extsep)[-1]
            file_name = data
            log.debug('starting data load of %s', file_name)
            # use a context manager so the handle is closed promptly
            # instead of leaking until garbage collection
            with open(data, 'rb') as file_obj:
                data = file_obj.read()
        else:
            try:
                data = data.encode('utf-8')
            except AttributeError:
                # data already encoded
                pass
        try:
            content_type = datatype_map[datatype]
        except KeyError:
            # interpolate datatype into the message; passing it as a second
            # argument left it out of the rendered error text
            raise NotImplementedError("'%s' is not an implemented data format"
                                      % datatype)
        context_uri = pick(graph, self.graph)
        result = requests.post(url=self._make_url(namespace),
                               headers={"Content-Type": content_type},
                               params={"context-uri": context_uri},
                               data=data)
        if result.status_code == 200:
            if is_file:
                log.info(" loaded %s into blazegraph - %s", file_name,
                         self.format_response(result.text))
            else:
                log.info(" loaded data - %s",
                         self.format_response(result.text))
            log.setLevel(self.log_level)
            return result
        else:
            # restore the log level on the failure path too; previously an
            # error left the logger stuck at the per-call level
            log.setLevel(self.log_level)
            raise SyntaxError(result.text)
    def reset_namespace(self, namespace=None, params=None):
        """Deletes the namespace and then builds it again from scratch.

        args:
            namespace(str): namespace to rebuild; defaults to
                    self.namespace
            params(dict): creation parameters; defaults to
                    self.namespace_params
        """
        namespace = pick(namespace, self.namespace)
        params = pick(params, self.namespace_params)
        log.warning(" Reseting namespace '%s' at host: %s", namespace,
                    self.url)
        try:
            self.delete_namespace(namespace)
        except KeyError:
            # the namespace did not exist — nothing to delete
            pass
        self.create_namespace(namespace, params)
Example #10
0
    def reset_namespace(self, namespace=None, params=None):
        """Drops and recreates a namespace in the triplestore.

        args:
            namespace(str): namespace to rebuild; defaults to
                    self.namespace
            params(dict): creation parameters; defaults to
                    self.namespace_params
        """
        # per-call logger named after this method
        log = logging.getLogger(
            "%s.%s" % (self.log_name, inspect.stack()[0][3]))
        log.setLevel(self.log_level)
        namespace = pick(namespace, self.namespace)
        params = pick(params, self.namespace_params)
        log.warning(" Reseting namespace '%s' at host: %s", namespace,
                    self.url)
        try:
            self.delete_namespace(namespace)
        except RuntimeError:
            # deletion fails when the namespace is absent; safe to ignore
            pass
        self.create_namespace(namespace, params)
Example #11
0
    def __init__(self,
                 url=None,
                 namespace=None,
                 namespace_params=None,
                 local_directory=None,
                 container_dir=None,
                 graph=None,
                 **kwargs):
        """Initializes an rdflib-backed triplestore connection.

        args:
            url: unused; rdflib runs in-process with no endpoint
            namespace: namespace to connect to; falls back to
                    self.default_ns
            namespace_params: dict of namespace parameters
            local_directory: local data path
            container_dir: the directory as seen by the container
            graph: default graph uri; falls back to self.default_graph
        """
        self.active = kwargs.get('active', True)
        self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH, "")
        # the rdflib store is in-process, so there is no endpoint url
        self.url = "No Url for Rdflib tstore"
        self.namespace = pick(namespace, self.default_ns)
        self.namespace_params = namespace_params
        self.container_dir = container_dir
        self.graph = pick(graph, self.default_graph)
        # connect to the namespace, creating it on first use
        try:
            conn = self.tstore.get_namespace(self.namespace)
        except KeyError:
            self.tstore.create_namespace(self.namespace)
            conn = self.tstore.get_namespace(self.namespace)
        self.conn = conn
        self.__set_mgr__(**kwargs)
Example #12
0
    def __init__(self,
                 url=None,
                 namespace=None,
                 namespace_params=None,
                 local_directory=None,
                 container_dir=None,
                 graph=None,
                 **kwargs):
        """Initializes an rdflib-backed triplestore connection.

        args:
            url: unused; rdflib runs in-process with no endpoint
            namespace: namespace to connect to; falls back to
                    self.default_ns
            namespace_params: dict of namespace parameters
            local_directory: local data path
            container_dir: the directory as seen by the container
            graph: default graph uri; falls back to self.default_graph
        """
        self.active = kwargs.get('active', True)
        self.local_directory = pick(local_directory, CFG.LOCAL_DATA_PATH, "")
        # the rdflib store is in-process, so there is no endpoint url
        self.url = "No Url for Rdflib tstore"
        self.namespace = pick(namespace, self.default_ns)
        self.namespace_params = namespace_params
        self.container_dir = container_dir
        self.graph = pick(graph, self.default_graph)
        # connect to the namespace, creating it on first use
        try:
            conn = self.tstore.get_namespace(self.namespace)
        except KeyError:
            self.tstore.create_namespace(self.namespace)
            conn = self.tstore.get_namespace(self.namespace)
        self.conn = conn
        self.__set_mgr__(**kwargs)
Example #13
0
    def _make_url(self, namespace=None, url=None, **kwargs):
        """ Creates the REST Url based on the supplied namespace

        args:
            namespace: string of the namespace
            url: explicit base url overriding the instance url
        kwargs:
            check_status_call: True/False, whether the function is called from
                    check_status. Used to avoid recurrsion error
        """
        # probe the server if we have no confirmed url yet, unless this
        # call originated from check_status itself
        if not kwargs.get("check_status_call") and not self.url:
            self.check_status
        # precedence: explicit url argument, then self.url, then ext_url
        base = url or self.url
        if base is None:
            base = self.ext_url
        namespace = pick(namespace, self.namespace)
        if namespace:
            base = os.path.join(base.replace("sparql", ""), "namespace",
                                namespace, "sparql").replace("\\", "/")
        elif not base.endswith("sparql"):
            base = os.path.join(base, "sparql").replace("\\", "/")
        return base
Example #14
0
    def __predicate_object_map__(self, map_iri):
        """Iterates through rr:predicateObjectMaps for this TripleMap
        creating a SimpleNamespace for each triple map and assigning the
        constant, template, parentTripleMap, reference as properties.

        Args:

        -----
                map_iri:  rdflib.URIRef, TripleMap IRI

        Returns:

        --------
                list:  List of predicate_object Namespace objects
        """
        pred_obj_maps = []
        # walk every rr:predicateObjectMap bnode attached to this triple map
        for pred_obj_map_bnode in self.rml.objects(
                subject=map_iri,
                predicate=NS_MGR.rr.predicateObjectMap.rdflib):
            pred_obj_map = SimpleNamespace()
            pred_obj_map.predicate = self.rml.value(
                subject=pred_obj_map_bnode,
                predicate=NS_MGR.rr.predicate.rdflib)
            obj_map_bnode = self.rml.value(
                subject=pred_obj_map_bnode,
                predicate=NS_MGR.rr.objectMap.rdflib)
            # skip entries with no rr:objectMap; nothing to generate from
            if obj_map_bnode is None:
                continue
            # each of these may be None when the map doesn't use that form
            pred_obj_map.constant = self.rml.value(
                subject=obj_map_bnode, predicate=NS_MGR.rr.constant.rdflib)
            pred_obj_map.template = self.rml.value(
                subject=obj_map_bnode, predicate=NS_MGR.rr.template.rdflib)
            pred_obj_map.parentTriplesMap = self.rml.value(
                subject=obj_map_bnode,
                predicate=NS_MGR.rr.parentTriplesMap.rdflib)
            # track parent maps so dependencies can be resolved later
            if pred_obj_map.parentTriplesMap is not None:
                self.parents.add(str(pred_obj_map.parentTriplesMap))
            pred_obj_map.reference = self.rml.value(
                subject=obj_map_bnode, predicate=NS_MGR.rr.reference.rdflib)
            pred_obj_map.datatype = self.rml.value(
                subject=obj_map_bnode, predicate=NS_MGR.rr.datatype.rdflib)
            pred_obj_map.query = self.rml.value(
                subject=obj_map_bnode, predicate=NS_MGR.rml.query.rdflib)
            pred_obj_map.json_query = self.rml.value(
                subject=obj_map_bnode, predicate=NS_MGR.rml.reference.rdflib)
            # fall back to the logical source's json_key when the object
            # map does not declare its own rml:key
            json_key = None
            if hasattr(self.triple_maps[str(map_iri)].logicalSource,
                       'json_key'):
                json_key = self.triple_maps[str(
                    map_iri)].logicalSource.json_key
            pred_obj_map.json_key = pick(
                self.rml.value(subject=obj_map_bnode,
                               predicate=NS_MGR.rml.key.rdflib), json_key)
            # BIBCAT Extensions
            pred_obj_map.delimiters = []
            # any json_query anywhere in the map switches on json querying
            if pred_obj_map.json_query:
                self.use_json_qry = True
            for obj in self.rml.objects(subject=obj_map_bnode,
                                        predicate=NS_MGR.kds.delimiter.rdflib):
                pred_obj_map.delimiters.append(obj)
            pred_obj_maps.append(pred_obj_map)
        return pred_obj_maps
Example #15
0
    def query(self,
              sparql,
              mode="get",
              namespace=None,
              rtn_format="json",
              **kwargs):
        """
        Runs a sparql query and returns the results

        Args:
        -----
            sparql: the sparql query to run
            namespace: the namespace to run the sparql query against
            mode: ['get'(default), 'update'] the type of sparql query
            rtn_format: ['json'(default), 'xml'] format of query results

        Kwargs:
        -------
            debug(bool): If True sets logging level to debug
            log_level: explicit logging level for this call

        Returns:
            list/str: parsed result bindings, or raw text for other formats

        Raises:
            KeyError: if rtn_format is not an allowed format
            SyntaxError: if the endpoint rejects the query
        """
        namespace = pick(namespace, self.namespace)
        if kwargs.get("log_level"):
            log.setLevel(kwargs['log_level'])
        if kwargs.get("debug"):
            log.setLevel(logging.DEBUG)
        # NOTE(review): the check reads self.qry_formats but the error
        # message reads self.qry_results_formats — confirm both attributes
        # exist, otherwise an invalid rtn_format raises AttributeError
        if rtn_format not in self.qry_formats:
            raise KeyError("rtn_format was '%s'. Allowed values are %s" % \
                           (rtn_format, self.qry_results_formats))
        url = self._make_url(namespace)
        # prepend the standard prefixes unless the query declares its own
        if 'prefix' not in sparql.lower():
            sparql = "%s\n%s" % (NSM.prefix(), sparql)

        if mode == "get":

            data = {"query": sparql}  #, "format": rtn_format}
        elif mode == "update":
            data = {"update": sparql}
        else:
            raise NotImplementedError("'mode' != to ['get', 'update']")

        headers = {'Accept': self.qry_formats[rtn_format]}
        start = datetime.datetime.now()
        # fall back to the local url if the external endpoint is unreachable
        try:
            result = requests.post(url, data=data, headers=headers)
        except requests.exceptions.ConnectionError:
            result = requests.post(self._make_url(namespace, self.local_url),
                                   data=data,
                                   headers=headers)
        log.debug(
            format_multiline([
                "", "url='{url}'", """mode='{mode}', namespace='{namespace}',
                                    rtn_format='{rtn_format}'""",
                "**** SPAQRL QUERY ****", "", "{sparql}",
                "Query Time: {q_time}"
            ],
                             url=url,
                             mode=mode,
                             namespace=namespace,
                             rtn_format=rtn_format,
                             sparql=sparql,
                             q_time=(datetime.datetime.now() - start),
                             **kwargs))

        if result.status_code == 200:
            try:
                # extract bindings per the requested format
                if rtn_format == "json":
                    bindings = result.json().get('results',
                                                 {}).get('bindings', [])
                elif rtn_format == 'xml':
                    xml_doc = etree.XML(result.text)
                    bindings = xml_doc.findall("results/bindings")
                else:
                    bindings = result.text
                try:
                    log.debug("result count: %s", len(bindings))
                except TypeError:
                    # bindings may be a non-sized object; count is optional
                    pass
                return bindings
            except json.decoder.JSONDecodeError:
                # update queries return HTML, not JSON; strip the markup
                if mode == 'update':
                    return BeautifulSoup(result.text, 'lxml').get_text()
                return result.text
        else:
            raise SyntaxError("%s\n\n%s\n\n%s" %
                              (sparql, add_sparql_line_nums(sparql),
                               result.text[result.text.find("java."):]))
Example #16
0
    def load_data(self,
                  data,
                  datatype="ttl",
                  namespace=None,
                  graph=None,
                  is_file=False,
                  **kwargs):
        """ loads data via file stream from python to triplestore

        Args:
          data: The data or filepath to load
          datatype(['ttl', 'xml', 'rdf']): the type of data to load
          namespace: the namespace to use
          graph: the graph to load the data to.
          is_file(False): If true python will read the data argument as a
              filepath, determine the datatype from the file extension,
              read the file and send it to blazegraph as a datastream

        Kwargs:
          debug(bool): if True, set logging to DEBUG for this call

        Returns:
          the rdflib parse result

        Raises:
          NotImplementedError: if the datatype is not supported
        """
        if kwargs.get('debug'):
            log.setLevel(logging.DEBUG)
        datatype_map = {
            'ttl': 'turtle',
            'xml': 'xml',
            'rdf': 'xml',
            'nt': 'nt',
            'n3': 'n3',
            'nquads': 'nquads',
            'hturtle': 'hturtle'
        }
        if is_file:
            datatype = data.split(os.path.extsep)[-1]
            file_name = data
            log.debug('starting data load of %s', file_name)
            # use a context manager so the handle is closed promptly
            # instead of leaking until garbage collection
            with open(data, 'rb') as file_obj:
                data = file_obj.read()
        try:
            content_type = datatype_map[datatype]
        except KeyError:
            # interpolate datatype into the message; passing it as a second
            # argument left it out of the rendered error text
            raise NotImplementedError("'%s' is not an implemented data format"
                                      % datatype)
        conn = self.conn
        if namespace:
            conn = self.tstore.get_namespace(namespace)
        else:
            namespace = self.namespace
        graph = pick(graph, self.graph)
        start = datetime.datetime.now()
        try:
            result = conn.parse(data=data, publicID=graph, format=content_type)
        except Exception:
            # identify the offending file before re-raising
            if is_file:
                print("Datafile ", file_name)
            raise
        if is_file:
            log.info(" loaded %s into rdflib namespace '%s'", file_name,
                     namespace)
        else:
            log.info(" loaded data into rdflib namespace '%s' in time: %s",
                     namespace, (datetime.datetime.now() - start))
        # previously the result was only returned on the non-file path;
        # return it consistently for both
        return result