Python Server.tagText Examples

Programming Language: Python
Namespace/Package Name: xmlrpclib
Class/Type: Server
Method/Function: tagText
Examples at hotexamples.com: 1
Python Server.tagText - 1 example found. This is the top-rated real-world Python example of xmlrpclib.Server.tagText extracted from open source projects. You can rate examples to help us improve their quality.
Frequently Used Methods
Show Hide
Server(30)
getPage(2)
storePage(2)
search(2)
get_workers(1)
get_queue(1)
getUserMessages(1)
hit(1)
is_alive(1)
put(1)
list_packages(1)
loadBot(1)
getTypeList(1)
manage_project_export_queue(1)
mul(1)
newfile(1)
odd(1)
parse(1)
ping(1)
random(1)
quit(1)
run_group(1)
userLogout(1)
userLogin(1)
tagText(1)
sync_run_group(1)
sync_run(1)
send_message(1)
sendUserMessage(1)
run_bzr_command(1)
getSpaces(1)
run(1)
rpc_test_service(1)
repeat(1)
removeSpace(1)
removePermissionFromSpace(1)
removePage(1)
recommend(1)
getTextNounStatements(1)
getSpace(1)
getSpacePermissionSets(1)
authService(1)
display(1)
delContact(1)
convertFile(1)
complete(1)
chop_in_half(1)
checkForUpdates(1)
bap_version(1)
Example #1
Show file
File: annotator.py Project: miquelmillan/auto-annotator
class Annotator:
    """Annotate the noun phrases of a web page with ontology classes.

    The pipeline run by startProcess is: read the ontology, tag the page,
    keep the relevant noun phrases, collect class candidates for each of
    them through text-pattern web queries, pick an ontology class for each
    phrase and finally wrap the phrases of the page in links.

    Each noun phrase is a dictionary.  The keys read or written here are
    'text', 'textTagged', 'results', 'matchings', 'matchingPercentage',
    'candidates' and 'is_a'.
    NOTE(review): the dictionaries are built by web.WebText, outside this
    class; 'candidates' is presumably a list and 'is_a' presumably starts
    as "" -- confirm against that module.
    """

    def __init__(self):
        """Store the ontology URL and create the tagging service proxy."""
        self.nounPhrases = []
        self.ontoURL = configuration.ONTOLOGY_SERVICE_URL + "/" + configuration.ONTOLOGY_NAME
        taggerURL = "http://" + configuration.TAG_SERVICE_URL + ":" + str(configuration.TAG_SERVICE_PORT)
        self.tagger = Server(taggerURL, verbose=0)

    def startProcess(self, url):
        """Run the whole annotation pipeline on the page at *url*.

        Returns whatever microFormat returns for the annotated page.
        """
        self.__log("Starting process")
        self.__log("Preparing the Tagger")
        self.__log("Reading ontology")

        ontologia = ontology.OntologyReader(configuration.ONTOLOGY_SERVICE_URL)
        ontologia.getOntologyClassesByURL(self.ontoURL)

        # read the web page
        webPage = web.WebText(url, self.tagger)
        self.__log("Analysing web contents")

        # The result is not used here, but the call is kept because the
        # original made it before asking for the noun phrases.
        # NOTE(review): presumably it prepares state inside webPage -- confirm.
        webPage.TaggedText()

        self.__log("Discarding non-relevant ones")

        NounPhrases = webPage.getTextNounPhrases()
        goodResults = []
        for phrase in NounPhrases:
            self.__log(phrase['text'] + " " + str(phrase['results']) + " " + str(phrase['matchingPercentage']))

            relevant = (
                phrase['results'] > configuration.NUMBER_OF_RESULTS_FOR_RELEVANCE
                and phrase['matchings'] > configuration.NUMBER_OF_MATCHINGS
                and phrase['matchingPercentage'] > configuration.MATCHING_PERCENTAGE
            )
            # keep only the first relevant phrase for any given text
            if relevant and not any(kept['text'] == phrase["text"] for kept in goodResults):
                goodResults.append(phrase)

        self.__log("Discarded: " + str(len(NounPhrases) - len(goodResults)) + " Noun-Phrases")
        self.__log("Extracting class candidates for the definitive " + str(len(goodResults)) + " Noun-Phrases")

        self.nounPhrases = goodResults
        # search for class candidates
        self.getClassCandidates()

        self.__log("Class candidates extracted")

        # relate the entities with the ontology
        self.annotate(self.ontoURL, ontologia)

        for phrase in self.getNounPhrases():
            self.__log(phrase['text'] + " is a " + str(phrase['is_a']))

        return self.microFormat(self.getNounPhrases(), webPage)

    def getNounPhrases(self):
        """Return the current list of noun-phrase dictionaries."""
        return self.nounPhrases

    def getClassCandidates(self):
        """Fill the 'candidates' of every stored noun phrase.

        Every pattern is applied in a fixed order, which determines the
        order of the resulting candidates.
        """
        patterns = (
            self.__patternIsA,
            self.__patternAndOther,
            self.__patternOrOther,
            self.__patternLikeOther,
            self.__patternSuchAs,
            self.__patternEspecially,
            self.__patternIncluding,
        )
        processed = []
        for phrase in self.nounPhrases:
            self.__log("Looking for " + phrase['text'] + " class candidates.")
            for pattern in patterns:
                phrase = pattern(phrase)
            processed.append(phrase)

        self.nounPhrases = processed

    def microFormat(self, NounPhrases, webPage):
        """Return the raw page content with the classified phrases linked.

        A phrase whose 'is_a' is the empty string is logged and skipped.
        """
        annotatedPage = webPage.getTextRawContent()

        for phrase in NounPhrases:
            rawWords = [webPage.getRawWord(word) for word in phrase['text'].split(" ")]
            rawText = "".join(rawWords).strip()

            if phrase['is_a'] == "":
                self.__log(phrase['text'] + " hasn't any class assigned, so it won't be annotated.")
                continue

            replacement = "<a href=\"" + phrase['is_a'] + "\" class=\"" + phrase['is_a'] + "\">" + rawText + "</a>"

            self.__log("<<" + rawText + ">> <<" + str(phrase['text']) + ">>")
            self.__log(replacement)

            # create a new annotation machine and annotate the text
            annotateMachine = annotation.Annotation(rawText, replacement)
            annotateMachine.feed(annotatedPage)
            # string with the web page
            annotatedPage = annotateMachine.output()

        return annotatedPage

    def annotate(self, url, onto):
        """Set 'is_a' on every stored phrase that matches the ontology.

        *url* is accepted for compatibility with existing callers but is
        not used.  *onto* must provide getOntologyNames().

        Candidates are first matched word by word against the ontology
        labels; when nothing matches, the WordNet similarity fallback is
        used.  Among the matches the one with the highest ratio
        hits(phrase AND label) / hits(label) is chosen.
        """
        ontoNames = onto.getOntologyNames()

        for phrase in self.nounPhrases:
            self.__log("Step 3: Ontology length: " + str(len(ontoNames)))
            self.__log("Step 3: candidates number: " + str(len(phrase['candidates'])))
            Similars = self.__searchInOntology(phrase['candidates'], ontoNames)

            if len(Similars) == 0:
                self.__log("Searching candidates for NE: " + phrase['text'])
                Similars = self.__getMostSimilars(phrase['candidates'], ontoNames)

            if len(Similars) == 0:
                self.__log("No candidates in ontology for " + phrase['text'])
                continue

            self.__log("Most appropiate candidates::")
            highestValue = 0.0
            finalOne = ["", -1.0, "", 0.0]

            for similar in Similars:
                ontologyValue = self.__ontologyLabel(similar[0])
                query1 = live.LiveQuery("\"" + phrase['text'] + "\"" + " AND " + "\"" + ontologyValue + "\"")
                query2 = live.LiveQuery("\"" + ontologyValue + "\"")

                # Each count is read once.
                # NOTE(review): assumes getNumberResults() returns the same
                # value on repeated calls -- confirm in live.LiveQuery.
                valueHits = float(query2.getNumberResults())
                if valueHits <= 0.0:
                    similar[3] = 0.0
                    continue

                jointHits = float(query1.getNumberResults())
                similar[3] = jointHits / valueHits

                self.__log(phrase['text'] + "-" + ontologyValue + ". Values: " + str(jointHits) + "-" + str(valueHits))
                self.__log("\t\t" + similar[0] + " " + str(similar[1]) + " " + similar[2] + " " + str(similar[3]))

                # take the one with highest value
                if similar[3] > highestValue:
                    highestValue = similar[3]
                    finalOne = similar

            self.__log("PMI-IR: Chosen one " + finalOne[0] + " with value " + str(finalOne[3]))
            phrase['is_a'] = finalOne[0]

    def __ontologyLabel(self, value):
        """Return the text between '[' and ']' in *value*, '_' turned into ' '."""
        return value.partition("[")[2].partition("]")[0].replace("_", " ")

    def __searchInOntology(self, candidates, ontologyNames):
        """Return [value, 1.0, candidate, 0.0] for each exact word match.

        The key noun of each candidate is compared with every word of every
        lower-cased ontology label.
        """
        similars = []

        for candidate in candidates:
            Key = self.__getKeyText(candidate)
            if not Key:
                # A candidate without a noun has no key; an empty key must
                # not match the empty pieces of a split label.
                continue
            for key, value in ontologyNames.iteritems():
                for word in self.__ontologyLabel(value).lower().split(" "):
                    if word == Key:
                        similars.append([value, 1.0, candidate, 0.0])

        return similars

    def __getMostSimilars(self, candidates, ontologyNames):
        """Return the ontology entries similar enough to some candidate.

        Each entry is [value, similarity, candidate, 0.0].  Only entries
        whose similarity is above configuration.SIMILARITY_THRESHOLD are
        kept, and only the first one for any ontology value.
        """
        similarities = []
        stemmer = nltk.WordnetStemmer()

        for candidate in candidates:
            self.__log("\t\tStep 3: searching the similarities for " + candidate)
            Key = self.__getKeyText(candidate)
            for key, value in ontologyNames.iteritems():
                # The label stays a string: splitting it into a list made
                # the lookup below fail on every entry.
                realValue = self.__ontologyLabel(value).lower().strip()

                try:
                    similarity = wordnet.N[stemmer.stem(Key.lower())][0].wup_similarity(wordnet.N[realValue][0])
                except Exception:
                    # best effort: a word missing from WordNet is "not similar"
                    similarity = -1.0
                similarities.append([value, similarity, candidate, 0.0])

        # keep the ones above the threshold and filter the repeated ones
        mostSimilars = []
        for similarity in similarities:
            if similarity[1] is None or not similarity[1] > configuration.SIMILARITY_THRESHOLD:
                continue
            if not any(kept[0] == similarity[0] for kept in mostSimilars):
                mostSimilars.append(similarity)

        return mostSimilars

    def __getKeyText(self, key):
        """Return the last noun of a 'word/TAG word/TAG ...' string.

        NN and NNP words are returned cleaned and lower-cased; NNS words
        are additionally stemmed.  "" is returned when there is no noun.
        """
        cleanText = ""
        stemmer = None

        for token in key.split():
            word, _, tag = token.partition("/")
            word = word.replace(",", "").replace(";", "").replace(":", "").replace(".", "").lower()

            if tag == "NN" or tag == "NNP":
                cleanText = word
            elif tag == "NNS":
                if stemmer is None:
                    # created only when a plural noun actually shows up
                    stemmer = nltk.WordnetStemmer()
                cleanText = stemmer.stem(word)
                if cleanText is None:
                    cleanText = word

        return cleanText

    def __addCandidate(self, phrase, candidate):
        """Append *candidate* to phrase['candidates'] unless it is redundant.

        It is redundant when it is already listed or when it contains the
        tagged text of the phrase itself.
        """
        if candidate in phrase['candidates']:
            return
        if phrase['textTagged'] not in candidate:
            phrase['candidates'].append(candidate)

    def __searchSnippets(self, queryText):
        """Run a live query and return the list of its result values."""
        results = live.LiveQuery(queryText).getResults()
        return [value for key, value in results.iteritems()]

    def __textVariants(self, text):
        """Return [text] plus its lower-cased form when that differs."""
        lowered = text.lower()
        if lowered == text:
            return [text]
        return [text, lowered]

    def __harvestFollowing(self, phrase, connector, snippets):
        """Add what follows '<phrase text><connector>' in each snippet."""
        for snippet in snippets:
            for text in self.__textVariants(phrase['text']):
                # when the pattern is absent the tail is "" and yields None
                tail = snippet.partition(text + connector)[2]
                candidate = self.__getFirstCandidate(tail)
                if candidate is not None:
                    self.__addCandidate(phrase, candidate)
        return phrase

    def __harvestPreceding(self, phrase, connector, snippets):
        """Add what precedes '<connector><phrase text>' in each snippet."""
        for snippet in snippets:
            for text in self.__textVariants(phrase['text']):
                head, found, _ = snippet.partition(connector + text)
                if not found:
                    # Pattern absent: 'head' would be the whole snippet and
                    # its last noun has nothing to do with the phrase.
                    continue
                candidate = self.__getLastCandidate(head)
                if candidate is not None:
                    self.__addCandidate(phrase, candidate)
        return phrase

    def __patternIsA(self, phrase):
        """Collect candidates from '<text> is a *' results."""
        snippets = self.__searchSnippets("\"" + phrase['text'] + " is a *\"")
        return self.__harvestFollowing(phrase, " is a ", snippets)

    def __patternAndOther(self, phrase):
        """Collect candidates from '<text> and other *' results."""
        snippets = self.__searchSnippets("\"" + phrase['text'] + " and other *\"")
        return self.__harvestFollowing(phrase, " and other ", snippets)

    def __patternOrOther(self, phrase):
        """Collect candidates from '<text> or other *' results."""
        snippets = self.__searchSnippets("\"" + phrase['text'] + " or other *\"")
        return self.__harvestFollowing(phrase, " or other ", snippets)

    def __patternLikeOther(self, phrase):
        """Collect candidates from '<text>, like other *' results."""
        snippets = self.__searchSnippets("\"" + phrase['text'] + ", like other *\"")
        return self.__harvestFollowing(phrase, ", like other ", snippets)

    def __patternSuchAs(self, phrase):
        """Collect candidates from '* such as <text>' results."""
        snippets = self.__searchSnippets("\"* such as " + phrase['text'] + "\"")
        return self.__harvestPreceding(phrase, "such as ", snippets)

    def __patternEspecially(self, phrase):
        """Collect candidates from '* especially <text>' results."""
        snippets = self.__searchSnippets("\"* especially " + phrase['text'] + "\"")
        return self.__harvestPreceding(phrase, "especially ", snippets)

    def __patternIncluding(self, phrase):
        """Collect candidates from '* including <text>' results."""
        snippets = self.__searchSnippets("\"* including " + phrase['text'] + "\"")
        return self.__harvestPreceding(phrase, "including ", snippets)

    def __getFirstCandidate(self, text):
        """Return the first noun statement of the first clause of *text*.

        The clause ends at the first of several punctuation marks.  None is
        returned when the clause is empty or the tagger finds nothing.
        """
        clause = text
        for separator in (".", ",", "?", "\n", ";", ":", "|", " -", "- "):
            clause = clause.partition(separator)[0]

        if len(clause) > 0:
            nounPhrases = self.tagger.getTextNounStatements(self.tagger.tagText(clause))
            if len(nounPhrases) > 0:
                return nounPhrases[0]

        return None

    def __getLastCandidate(self, text):
        """Return the last noun statement of *text*, or None."""
        if len(text) > 0:
            nounPhrases = self.tagger.getTextNounStatements(self.tagger.tagText(text))
            if len(nounPhrases) > 0:
                return nounPhrases[-1]

        return None

    def __log(self, chain):
        """Print *chain* prefixed with the current time."""
        # single parenthesised argument: same output on Python 2 and 3
        print(time.ctime() + ": " + chain)