Example #1
0
	def __loadRDF(self, source, text, endpoint, rdf_format):
		"""
		Load RDF triples from `source` into self.rdfgraph and create the query helper.

		Args:
			source: the RDF payload itself (when `text` is true), the address of
				a SPARQL endpoint (when `endpoint` is true), or otherwise a
				location string or an open file object exposing `.name`.
			text: if true, `source` is parsed as inline RDF data; the format
				falls back to "turtle" when `rdf_format` is not given.
			endpoint: if true, self.rdfgraph is replaced by a ConjunctiveGraph
				backed by a SPARQLStore and nothing is parsed locally.
			rdf_format: explicit serialization name; when empty it is guessed
				with guess_fileformat() for location strings and files.

		Raises:
			TypeError: if `source` is neither a string nor a file object.
			Exception: loading errors are reported via printDebug and re-raised.
		"""

		# str on Python 3, str + unicode on Python 2
		string_types = (type(""), type(u""))

		# WORK OUT WHAT KIND OF SOURCE WE WERE GIVEN

		if text:
			self.IS_TEXT = True
			rdf_format = rdf_format or "turtle"

		elif endpoint:
			self.IS_ENDPOINT = True
			# replace graph with ConjunctiveGraph
			self.rdfgraph = rdflib.ConjunctiveGraph(store=SPARQLStore(source))
			self.graphuri = source	# default uri is www location

		elif isinstance(source, string_types):
			self.IS_URL = True
			if source.startswith("www."): #support for lazy people
				source = "http://%s" % str(source)
			self.graphuri = source	# default uri is www location
			rdf_format = rdf_format or guess_fileformat(source)

		elif hasattr(source, "read") and hasattr(source, "name"):
			# duck-typed file check: the `file` builtin only exists on Python 2
			self.IS_FILE = True
			self.graphuri = source.name # default uri is filename
			rdf_format = rdf_format or guess_fileformat(source.name)

		else:
			raise TypeError("You passed an unknown object. Only URIs and files are accepted.")

		# FINALLY, TRY LOADING
		# Dispatch on this call's arguments rather than on the IS_* flags: the
		# flags are never reset, so after a previous load of another kind they
		# would send this source down the wrong branch.

		try:
			if text:
				self.rdfgraph.parse(data=source, format=rdf_format)
				printDebug("----------\nLoaded %d triples from text" % len(self.rdfgraph))
			elif endpoint:
				printDebug("Accessing SPARQL Endpoint <%s>" % self.graphuri)
				printDebug("(note: support for sparql endpoints is still experimental)")
			else:
				self.rdfgraph.parse(source, format=rdf_format)
				printDebug("----------\nLoaded %d triples from <%s>" % (len(self.rdfgraph), self.graphuri))
			# set up the query helper too
			self.queryHelper = QueryHelper(self.rdfgraph)

		except Exception:
			printDebug("\nError Parsing Graph (assuming RDF serialization was *%s*)\n" % (rdf_format))
			raise
Example #2
0
class Graph(object):
	"""
	Object that scans an RDF graph for schema definitions (aka 'ontologies')
	
	In [1]: import ontospy2
	INFO:rdflib:RDFLib Version: 4.2.0

	In [2]: g = ontospy2.Graph("npgcore_latest.ttl")
	Loaded 3478 triples
	Ontologies found: 1
	
	"""

	def __init__(self, source, text=False, endpoint=False, rdf_format=None):
		"""
		Create an empty in-memory graph, load `source` into it, then scan it.

		All four arguments are forwarded unchanged to the RDF loader; once the
		triples are available the entity lists below are filled by _scan().
		"""
		super(Graph, self).__init__()

		# where the triples came from (at most one is switched on by the loader)
		self.IS_TEXT = False
		self.IS_URL = False
		self.IS_FILE = False
		self.IS_ENDPOINT = False

		# the underlying rdflib graph and its identity
		self.rdfgraph = rdflib.Graph()
		self.graphuri = None
		self.queryHelper = None # only available once a graph is loaded

		# entities extracted by the scan
		self.namespaces = []
		self.ontologies = []
		self.classes = []
		self.properties = []
		self.annotationProperties = []
		self.objectProperties = []
		self.datatypeProperties = []

		# entities without parents
		self.toplayer = []
		self.toplayerProperties = []

		# load first, then extract the entities
		self.__loadRDF(source, text, endpoint, rdf_format)
		self._scan()

	
	def __repr__(self):
		return "<OntoSPy Graph (%d triples)>" % (len(self.rdfgraph))
				


	
	def __loadRDF(self, source, text, endpoint, rdf_format):
		"""
		After a graph has been loaded successfully, set up all params
		"""
		
		# LOAD THE GRAPH
				
		if text:
			self.IS_TEXT = True
			rdf_format = rdf_format or "turtle"
		
		
		elif endpoint:
			self.IS_ENDPOINT = True
			# replace graph with ConjunctiveGraph
			self.rdfgraph = rdflib.ConjunctiveGraph(store=SPARQLStore(source))			
			self.graphuri = source	# default uri is www location

				
		else:

			if type(source) == type("string"):
				self.IS_URL = True				
				if source.startswith("www."): #support for lazy people
					source = "http://%s" % str(source)
				self.graphuri = source	# default uri is www location
				rdf_format = rdf_format or guess_fileformat(source)

			elif type(source) == file:
				self.IS_FILE = True				
				self.graphuri = source.name # default uri is filename
				rdf_format = rdf_format or guess_fileformat(source.name)
			
			else:
				raise Exception("You passed an unknown object. Only URIs and files are accepted.") 
			
		#FINALLY, TRY LOADING:		

		try:
			if self.IS_TEXT:			
				self.rdfgraph.parse(data=source, format=rdf_format)
				printDebug("----------\nLoaded %d triples from text" % len(self.rdfgraph))
			elif self.IS_ENDPOINT:
				printDebug("Accessing SPARQL Endpoint <%s>" % self.graphuri)
				printDebug("(note: support for sparql endpoints is still experimental)")
			else:
				self.rdfgraph.parse(source, format=rdf_format)
				printDebug("----------\nLoaded %d triples from <%s>" % (len(self.rdfgraph), self.graphuri))
			# set up the query helper too
			self.queryHelper = QueryHelper(self.rdfgraph)	
			
		
		except:
			printDebug("\nError Parsing Graph (assuming RDF serialization was *%s*)\n" % (rdf_format))	 
			raise




	def serialize(self, rdf_format="turtle"):
		""" Shortcut that outputs the graph """
		return self.rdfgraph.serialize(format=rdf_format)
			
	
	def sparql(self, stringa):
		""" wrapper around a sparql query """
		qres = self.rdfgraph.query(stringa)
		return list(qres)
			

	def __extractNamespaces(self):
		""" 
		Extract graph namespaces.
		Namespaces are given in this format:

			In [01]: for x in graph.namespaces():
					....:			print x
			('xml', rdflib.URIRef('http://www.w3.org/XML/1998/namespace'))
			('', rdflib.URIRef('http://cohereweb.net/ontology/cohere.owl#'))
			(u'owl', rdflib.URIRef('http://www.w3.org/2002/07/owl#'))
			('rdfs', rdflib.URIRef('http://www.w3.org/2000/01/rdf-schema#'))
			('rdf', rdflib.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#'))
			(u'xsd', rdflib.URIRef('http://www.w3.org/2001/XMLSchema#'))

		We assume that a base namespace is implied by an empty prefix		
		"""

		exit = []

		if self.IS_ENDPOINT==True:
			return False

		else:
			
			if self.graphuri not in [y for x,y in self.rdfgraph.namespaces()]:
				# if not base namespace is set, try to simulate one 
				self.rdfgraph.bind("_file_", rdflib.Namespace(self.graphuri))
	
			self.namespaces = sorted(self.rdfgraph.namespaces())
		


	
	# ------------	
	# === main method === #	 
	# ------------
	
	def _scan(self, source=None, text=False, endpoint=False, rdf_format=None):
		"""
		Extract namespaces, ontologies, classes and properties from the graph
		and compute the top layers, reporting counts through printDebug.

		If `source` is given, its triples are loaded first (the remaining
		arguments are forwarded to the loader as they are).
		"""

		if source: # add triples dynamically
			self.__loadRDF(source, text, endpoint, rdf_format)

		printDebug("started scanning...\n----------")

		# each step is run in order; afterwards one line is printed per
		# (message template, name of the attribute holding the entities)
		steps = (
			(self.__extractNamespaces, ()),
			(self.__extractOntologies, (
				("Ontologies found: %d", "ontologies"),
			)),
			(self.__extractClasses, (
				("Classes found...: %d", "classes"),
			)),
			(self.__extractProperties, (
				("Properties found: %d", "properties"),
				("Annotation......: %d", "annotationProperties"),
				("Datatype........: %d", "datatypeProperties"),
				("Object..........: %d", "objectProperties"),
			)),
			(self.__computeTopLayer, ()),
		)

		for run_step, reports in steps:
			run_step()
			for template, attribute in reports:
				printDebug(template % len(getattr(self, attribute)))

			
		




	
	
	def __extractOntologies(self, exclude_BNodes = False, return_string=False):
		"""
		returns Ontology class instances
		
        [ a owl:Ontology ;
            vann:preferredNamespacePrefix "bsym" ;
            vann:preferredNamespaceUri "http://bsym.bloomberg.com/sym/" ],
			
			
				
		"""
		out = []
	
		qres = self.queryHelper.getOntology()

		if qres:
			# NOTE: SPARQL returns a list of rdflib.query.ResultRow (~ tuples..)
			
			for candidate in qres:
				if isBlankNode(candidate[0]):
					if exclude_BNodes:
						continue
					else:
						checkDC_ID = [x for x in self.rdfgraph.objects(candidate[0], rdflib.namespace.DC.identifier)]
						if checkDC_ID:
							out += [Ontology(checkDC_ID[0])]
						else:
							vannprop = rdflib.URIRef("http://purl.org/vocab/vann/preferredNamespaceUri")
							vannpref = rdflib.URIRef("http://purl.org/vocab/vann/preferredNamespacePrefix")
							checkDC_ID = [x for x in self.rdfgraph.objects(candidate[0], vannprop)]
							if checkDC_ID:
								checkDC_prefix = [x for x in self.rdfgraph.objects(candidate[0], vannpref)]
								if checkDC_prefix:
									out += [Ontology(checkDC_ID[0], prefPrefix=checkDC_prefix[0])]
								else:
									out += [Ontology(checkDC_ID[0])]
						
				else:
					out += [Ontology(candidate[0])]
			
			
		else:
			pass
			# printDebug("No owl:Ontologies found")
			
		#finally		
		self.ontologies = out
		# add all annotations/triples
		for onto in self.ontologies:
			onto.triples = self.queryHelper.entityTriples(onto.uri)
		


	##################
	#  
	#  METHODS for MANIPULATING RDFS/OWL CLASSES 
	# 
	#  RDFS:class vs OWL:class cf. http://www.w3.org/TR/owl-ref/ section 3.1
	#
	##################


	def __extractClasses(self):
		""" 
		2015-06-04: removed sparql 1.1 queries
		2015-05-25: optimized via sparql queries in order to remove BNodes
		2015-05-09: new attempt 
		
		Note: queryHelper.getAllClasses() returns a list of tuples, 
		(class, classRDFtype) 
		so in some cases that's duplicates if a class is both RDFS.CLass and OWL.Class
		In this case we keep only OWL.Class as it is more informative.
		"""
		self.classes = [] # @todo: keep adding? 
		
		qres = self.queryHelper.getAllClasses()

		for candidate in qres:
			
			test_existing_cl = self.getClass(uri=candidate[0])
			if not test_existing_cl:
				# create it
				self.classes += [OntoClass(candidate[0], candidate[1], self.namespaces)]
			else:
				# update it
				if candidate[1] == rdflib.OWL.Class:
					# prefer OWL.Class over RDFS.Class
					test_existing_cl.rdftype = rdflib.OWL.Class 
					
				
		
		#add more data
		for aClass in self.classes:
			
			aClass.triples = self.queryHelper.entityTriples(aClass.uri)
			aClass._buildGraph() # force construction of mini graph
			
			aClass.queryHelper = self.queryHelper
			
			# attach to an ontology 
			for uri in aClass.getValuesForProperty(rdflib.RDFS.isDefinedBy):
				onto = self.getOntology(str(uri))
				if onto:
					onto.classes += [aClass]
					aClass.ontology = onto
					
			# add direct Supers				
			directSupers = self.queryHelper.getClassDirectSupers(aClass.uri)
			
			for x in directSupers:
				superclass = self.getClass(uri=x[0])
				if superclass: 
					aClass._parents.append(superclass)
					
					# add inverse relationships (= direct subs for superclass)
					if aClass not in superclass.children():
						 superclass._children.append(aClass)
			



	def __extractProperties(self):
		""" 
		2015-06-04: removed sparql 1.1 queries
		2015-06-03: analogous to get classes	
		
		# instantiate properties making sure duplicates are pruned
		# but the most specific rdftype is kept 
		# eg OWL:ObjectProperty over RDF:property
			
		"""
		self.properties = [] # @todo: keep adding? 
		self.annotationProperties = [] 
		self.objectProperties = []
		self.datatypeProperties = [] 
		
		qres = self.queryHelper.getAllProperties()
				
		for candidate in qres:

			test_existing_prop = self.getProperty(uri=candidate[0])
			if not test_existing_prop:
				# create it
				self.properties += [OntoProperty(candidate[0], candidate[1], self.namespaces)]
			else:
				# update it
				if candidate[1] and (test_existing_prop.rdftype == rdflib.RDF.Property):
					test_existing_prop.rdftype = inferMainPropertyType(candidate[1])


		#add more data
		for aProp in self.properties:
			
			if aProp.rdftype == rdflib.OWL.DatatypeProperty:
				self.datatypeProperties += [aProp]
			elif aProp.rdftype == rdflib.OWL.AnnotationProperty:
				self.annotationProperties += [aProp]
			elif aProp.rdftype == rdflib.OWL.ObjectProperty:
				self.objectProperties += [aProp]
			else:
				pass
			
			aProp.triples = self.queryHelper.entityTriples(aProp.uri)
			aProp._buildGraph() # force construction of mini graph

			# attach to an ontology [2015-06-15: no property type distinction yet]
			for uri in aProp.getValuesForProperty(rdflib.RDFS.isDefinedBy):
				onto = self.getOntology(str(uri))
				if onto:
					onto.properties += [aProp]
					aProp.ontology = onto
					
					
					
			self.__buildDomainRanges(aProp)
			
			# add direct Supers				
			directSupers = self.queryHelper.getPropDirectSupers(aProp.uri)
			
			for x in directSupers:
				superprop = self.getProperty(uri=x[0])
				if superprop: 
					aProp._parents.append(superprop)
				
					# add inverse relationships (= direct subs for superprop)
					if aProp not in superprop.children():
						 superprop._children.append(aProp)
		
					
					

	def getClass(self, id=None, uri=None, match=None):
		""" 
		get the saved-class with given ID or via other methods...
		
		Note: it tries to guess what is being passed..
	
		In [1]: g.getClass(uri='http://www.w3.org/2000/01/rdf-schema#Resource')
		Out[1]: <Class *http://www.w3.org/2000/01/rdf-schema#Resource*>
		
		In [2]: g.getClass(10)
		Out[2]: <Class *http://purl.org/ontology/bibo/AcademicArticle*> 

		In [3]: g.getClass(match="person")
		Out[3]: 
		[<Class *http://purl.org/ontology/bibo/PersonalCommunicationDocument*>,
		 <Class *http://purl.org/ontology/bibo/PersonalCommunication*>,
		 <Class *http://xmlns.com/foaf/0.1/Person*>]
		
		"""
		
		if not id and not uri and not match:
			return None
			
		if type(id) == type("string"):
			uri = id
			id = None
			if not uri.startswith("http://"):
				match = uri
				uri = None
		if match:
			if type(match) != type("string"):
				return []
			res = []
			for x in self.classes:
				if match.lower() in x.uri.lower():
					res += [x]
			return res
		else:
			for x in self.classes:
				if id and x.id == id:
					return x
				if uri and x.uri.lower() == uri.lower():
					return x
			return None


	def getProperty(self, id=None, uri=None, match=None):
		""" 
		get the saved-class with given ID or via other methods...
		
		Note: analogous to getClass method		
		"""
		
		if not id and not uri and not match:
			return None
			
		if type(id) == type("string"):
			uri = id
			id = None
			if not uri.startswith("http://"):
				match = uri
				uri = None
		if match:
			if type(match) != type("string"):
				return []
			res = []
			for x in self.properties:
				if match.lower() in x.uri.lower():
					res += [x]
			return res
		else:
			for x in self.properties:
				if id and x.id == id:
					return x
				if uri and x.uri.lower() == uri.lower():
					return x
			return None
			

	def getOntology(self, id=None, uri=None, match=None):
		""" 
		get the saved-ontology with given ID or via other methods...	
		"""
		
		if not id and not uri and not match:
			return None
			
		if type(id) == type("string"):
			uri = id
			id = None
			if not uri.startswith("http://"):
				match = uri
				uri = None
		if match:
			if type(match) != type("string"):
				return []
			res = []
			for x in self.ontologies:
				if match.lower() in x.uri.lower():
					res += [x]
			return res
		else:
			for x in self.ontologies:
				if id and x.id == id:
					return x
				if uri and x.uri.lower() == uri.lower():
					return x
			return None
			
								

	def __computeTopLayer(self):

		exit = []
		for c in self.classes:
			if not c.parents():
				exit += [c]
		self.toplayer = exit # sorted(exit, key=lambda x: x.id) # doesnt work

		# properties 
		exit = []
		for c in self.properties:
			if not c.parents():
				exit += [c]
		self.toplayerProperties = exit # sorted(exit, key=lambda x: x.id) # doesnt work
		

	def printClassTree(self, element = None, showids=True, labels=False):
		""" 
		Print nicely into stdout the class tree of an ontology 
		
		Note: indentation is made so that ids up to 3 digits fit in, plus a space.
		[123]1--
		[1]123--
		[12]12--
		"""
		
		if not element:	 # first time
			for x in self.toplayer:
				printGenericTree(x, 0, showids, labels)
		
		else:
			printGenericTree(element, 0, showids, labels)		


	def printPropertyTree(self, element = None, level=0, showids=True, labels=False):
		""" 
		Print nicely into stdout the property tree of an ontology 
		
		Note: indentation is made so that ids up to 3 digits fit in, plus a space.
		[123]1--
		[1]123--
		[12]12--
		"""
		
		if not element:	 # first time
			for x in self.toplayerProperties:
				printGenericTree(x, 0, level, showids)
		
		else:
			printGenericTree(element, 0, showids, labels)
			
			

	###########

	# METHODS for MANIPULATING RDFS/OWL PROPERTIES

	###########



	def __buildDomainRanges(self, aProp):
		"""
		Read the rdfs:domain and rdfs:range values from the property's own
		mini graph and link them to the Python objects.

		Blank nodes are ignored. A value matching a known class is added to
		aProp.domains / aProp.ranges as that class object, and the property is
		recorded on the class (domain_of / range_of); any other value is added
		as it is.
		"""
		domain_targets = aProp.rdfgraph.objects(None, rdflib.RDFS.domain)
		range_targets = aProp.rdfgraph.objects(None, rdflib.RDFS.range)

		for target in domain_targets:
			if isBlankNode(target):
				continue
			linked = self.getClass(uri=str(target))
			if not linked:
				# edge case: it's not an OntoClass instance?
				aProp.domains += [target]
				continue
			aProp.domains += [linked]
			linked.domain_of += [aProp]

		for target in range_targets:
			if isBlankNode(target):
				continue
			linked = self.getClass(uri=str(target))
			if not linked:
				aProp.ranges += [target]
				continue
			aProp.ranges += [linked]
			linked.range_of += [aProp]