class Annotator:
    """Annotate the noun phrases of a web page with classes of an ontology.

    The pipeline (see ``startProcess``) reads the ontology, extracts and
    filters the noun phrases of the page, collects class candidates for each
    phrase through textual search patterns ("X is a *", "* such as X", ...),
    picks the best matching ontology class and finally wraps the phrase in
    the page with an ``<a>`` element pointing at that class.

    Every noun phrase is a dictionary with the keys used throughout this
    class: ``'textTagged'``, ``'text'``, ``'matchings'``,
    ``'matchingPercentage'``, ``'results'``, ``'candidates'`` (list) and
    ``'is_a'`` (ontology class, ``""`` while unassigned).
    """

    def __init__(self):
        """Build the ontology URL and the tagger ``Server`` from configuration."""
        self.nounPhrases = []
        self.ontoURL = configuration.ONTOLOGY_SERVICE_URL + "/" + configuration.ONTOLOGY_NAME
        self.tagger = Server(
            "http://" + configuration.TAG_SERVICE_URL + ":" + str(configuration.TAG_SERVICE_PORT),
            verbose=0)

    def startProcess(self, url):
        """Run the whole annotation pipeline on the page at ``url``.

        Returns the page content with the annotated noun phrases, as produced
        by ``microFormat``.
        """
        self.__log("Starting process")
        self.__log("Preparing the Tagger")
        self.__log("Reading ontology")
        ontologia = ontology.OntologyReader(configuration.ONTOLOGY_SERVICE_URL)
        ontologia.getOntologyClassesByURL(self.ontoURL)

        # Read the web page.
        webPage = web.WebText(url, self.tagger)
        self.__log("Analysing web contents")
        # The return value was never used; the call itself is kept because it
        # may be what triggers the tagging of the page (TODO confirm).
        webPage.TaggedText()

        # Detect the possible entities in the text and keep the relevant ones.
        self.__log("Discarding non-relevant ones")
        NounPhrases = webPage.getTextNounPhrases()
        goodResults = []
        seenTexts = set()
        for phrase in NounPhrases:
            self.__log(phrase['text'] + " " + str(phrase['results']) + " " + str(phrase['matchingPercentage']))
            relevant = (
                phrase['results'] > configuration.NUMBER_OF_RESULTS_FOR_RELEVANCE
                and phrase['matchings'] > configuration.NUMBER_OF_MATCHINGS
                and phrase['matchingPercentage'] > configuration.MATCHING_PERCENTAGE)
            # Keep only the first occurrence of each phrase text.
            if relevant and phrase['text'] not in seenTexts:
                seenTexts.add(phrase['text'])
                goodResults.append(phrase)

        self.__log("Discarded: " + str(len(NounPhrases) - len(goodResults)) + " Noun-Phrases")
        self.__log("Extracting class candidates for the definitive " + str(len(goodResults)) + " Noun-Phrases")
        self.nounPhrases = goodResults

        # Search for class candidates.
        self.getClassCandidates()
        self.__log("Class candidates extracted")

        # Relate the entities with the ontology.
        self.annotate(self.ontoURL, ontologia)

        for phrase in self.getNounPhrases():
            self.__log(phrase['text'] + " is a " + str(phrase['is_a']))

        return self.microFormat(self.getNounPhrases(), webPage)

    def getNounPhrases(self):
        """Return the current list of noun-phrase dictionaries."""
        return self.nounPhrases

    def getClassCandidates(self):
        """Fill ``'candidates'`` of every stored noun phrase using all patterns."""
        patterns = (
            self.__patternIsA,
            self.__patternAndOther,
            self.__patternOrOther,
            self.__patternLikeOther,
            self.__patternSuchAs,
            self.__patternEspecially,
            self.__patternIncluding,
        )
        enriched = []
        for phrase in self.nounPhrases:
            self.__log("Looking for " + phrase['text'] + " class candidates.")
            for pattern in patterns:
                phrase = pattern(phrase)
            enriched.append(phrase)
        self.nounPhrases = enriched

    def microFormat(self, NounPhrases, webPage):
        """Return the raw page content with every classified phrase wrapped in a link.

        Phrases whose ``'is_a'`` is empty are left untouched (and logged).
        """
        annotatedPage = webPage.getTextRawContent()
        for phrase in NounPhrases:
            rawText = "".join(webPage.getRawWord(word) for word in phrase['text'].split(" ")).strip()
            if phrase['is_a'] != "":
                # NOTE(review): 'is_a' and rawText are inserted into the markup
                # unescaped; confirm both can never contain quotes or tags.
                replacement = "<a href=\"" + phrase['is_a'] + "\" class=\"" + phrase['is_a'] + "\">" + rawText + "</a>"
                self.__log("<<" + rawText + ">> <<" + str(phrase['text']) + ">>")
                self.__log(replacement)
                # Create a new annotation machine and annotate the text.
                annotateMachine = annotation.Annotation(rawText, replacement)
                annotateMachine.feed(annotatedPage)
                annotatedPage = annotateMachine.output()
            else:
                self.__log(phrase['text'] + " hasn't any class assigned, so it won't be annotated.")
        return annotatedPage

    def annotate(self, url, onto):
        """Assign an ontology class (``'is_a'``) to every stored noun phrase.

        For each phrase the candidates are first searched literally in the
        ontology names; if nothing matches, the most similar names according
        to WordNet are used instead. Among the resulting names, the one with
        the highest co-occurrence ratio (hits of phrase AND class divided by
        hits of the class alone) is chosen. ``url`` is accepted for interface
        compatibility but not used.
        """
        ontoNames = onto.getOntologyNames()
        for phrase in self.nounPhrases:
            # 1.- Search the candidates directly in the ontology.
            self.__log("Step 3: Ontology length: " + str(len(ontoNames)))
            self.__log("Step 3: candidates number: " + str(len(phrase['candidates'])))
            Similars = self.__searchInOntology(phrase['candidates'], ontoNames)
            if not Similars:
                # 2.- Nothing found literally: fall back to WordNet similarity.
                self.__log("Searching candidates for NE: " + phrase['text'])
                Similars = self.__getMostSimilars(phrase['candidates'], ontoNames)

            if not Similars:
                self.__log("No candidates in ontology for " + phrase['text'])
                continue

            # 3.- Rank the surviving ontology names with PMI-IR.
            self.__log("Most appropiate candidates::")
            highestValue = 0.0
            finalOne = ["", -1.0, "", 0.0]
            for similar in Similars:
                ontologyValue = self.__ontologyLabel(similar[0])
                query1 = live.LiveQuery("\"" + phrase['text'] + "\"" + " AND " + "\"" + ontologyValue + "\"")
                query2 = live.LiveQuery("\"" + ontologyValue + "\"")
                # Read each count once instead of re-asking the query object.
                jointHits = float(query1.getNumberResults())
                classHits = float(query2.getNumberResults())
                if classHits <= 0.0:
                    similar[3] = 0.0
                else:
                    similar[3] = jointHits / classHits
                self.__log(phrase['text'] + "-" + ontologyValue + ". Values: " + str(jointHits) + "-" + str(classHits))
                self.__log(" " + similar[0] + " " + str(similar[1]) + " " + similar[2] + " " + str(similar[3]))
                # Keep the one with the (strictly) highest value.
                if similar[3] > highestValue:
                    highestValue = similar[3]
                    finalOne = similar
            self.__log("PMI-IR: Chosen one " + finalOne[0] + " with value " + str(finalOne[3]))
            phrase['is_a'] = finalOne[0]

    def __ontologyLabel(self, value):
        """Return the text between ``[`` and ``]`` in ``value`` with ``_`` turned into spaces."""
        return value.partition("[")[2].partition("]")[0].replace("_", " ")

    def __searchInOntology(self, candidates, ontologyNames):
        """Return ``[name, 1.0, candidate, 0.0]`` for each ontology-label word equal to a candidate's key noun."""
        similars = []
        for candidate in candidates:
            Key = self.__getKeyText(candidate)
            for value in ontologyNames.values():
                for word in self.__ontologyLabel(value).lower().split(" "):
                    if word == Key:
                        similars.append([value, 1.0, candidate, 0.0])
        return similars

    def __getMostSimilars(self, candidates, ontologyNames):
        """Return the ontology names whose WordNet similarity to a candidate exceeds the threshold.

        Each entry is ``[name, similarity, candidate, 0.0]``; only the first
        entry per ontology name is kept. Pairs that cannot be compared get a
        similarity of ``-1.0``.
        """
        similarities = []
        stemmer = nltk.WordnetStemmer()
        for candidate in candidates:
            self.__log(" Step 3: searching the similarities for " + candidate)
            Key = self.__getKeyText(candidate)
            for value in ontologyNames.values():
                # The label is looked up as one string. The previous code
                # split it into a list and then called .strip() on the list,
                # which always raised and forced every similarity to -1.0.
                label = self.__ontologyLabel(value).lower().strip()
                try:
                    similarity = wordnet.N[stemmer.stem(Key.lower())][0].wup_similarity(wordnet.N[label][0])
                except Exception:
                    # Word unknown to WordNet (or not stemmable): not comparable.
                    similarity = -1.0
                similarities.append([value, similarity, candidate, 0.0])

        # Keep the entries above the threshold, one per ontology name.
        mostSimilars = []
        seenNames = set()
        for similarity in similarities:
            if similarity[1] > configuration.SIMILARITY_THRESHOLD and similarity[0] not in seenNames:
                seenNames.add(similarity[0])
                mostSimilars.append(similarity)
        return mostSimilars

    def __getKeyText(self, key):
        """Return the last noun of the tagged text ``key`` (``word/TAG`` tokens), lowercased.

        ``NN``/``NNP`` words are returned as they are; ``NNS`` words are
        stemmed, falling back to the word itself when the stemmer returns
        ``None``. Returns ``""`` when no noun is present.
        """
        cleanText = ""
        stemmer = None
        for token in key.split():
            word, _, tag = token.partition("/")
            for mark in (",", ";", ":", "."):
                word = word.replace(mark, "")
            word = word.lower()
            if tag == "NN" or tag == "NNP":
                cleanText = word
            elif tag == "NNS":
                if stemmer is None:
                    # Only build the stemmer when a plural actually needs it.
                    stemmer = nltk.WordnetStemmer()
                cleanText = stemmer.stem(word)
                if cleanText is None:
                    cleanText = word
        return cleanText

    def __addCandidate(self, phrase, candidate):
        """Append ``candidate`` to ``phrase['candidates']`` unless it is a duplicate or contains the phrase itself."""
        if candidate in phrase['candidates']:
            return
        if phrase['textTagged'] not in candidate:
            phrase['candidates'].append(candidate)

    def __caseVariants(self, text):
        """Return ``text`` followed by its lowercase form when that differs."""
        lowered = text.lower()
        if lowered == text:
            return [text]
        return [text, lowered]

    def __collectFollowing(self, phrase, results, connector):
        """Add as candidate the first noun statement found after ``<phrase text><connector>`` in each result."""
        variants = self.__caseVariants(phrase['text'])
        for snippet in results.values():
            for variant in variants:
                _, matched, after = snippet.partition(variant + connector)
                if not matched:
                    continue
                candidate = self.__getFirstCandidate(after)
                if candidate is not None:
                    self.__addCandidate(phrase, candidate)
        return phrase

    def __collectPreceding(self, phrase, results, marker):
        """Add as candidate the last noun statement found before ``<marker><phrase text>`` in each result.

        Results that do not contain the pattern are skipped; previously the
        guard compared a string with a tuple, was always true, and made such
        results contribute the last noun statement of the whole snippet.
        """
        variants = self.__caseVariants(phrase['text'])
        for snippet in results.values():
            for variant in variants:
                before, matched, _ = snippet.partition(marker + variant)
                if not matched:
                    continue
                candidate = self.__getLastCandidate(before)
                if candidate is not None:
                    self.__addCandidate(phrase, candidate)
        return phrase

    def __patternIsA(self, phrase):
        """Collect candidates from the pattern ``<phrase> is a *``."""
        query = live.LiveQuery("\"" + phrase['text'] + " is a *\"")
        return self.__collectFollowing(phrase, query.getResults(), " is a ")

    def __patternAndOther(self, phrase):
        """Collect candidates from the pattern ``<phrase> and other *``."""
        query = live.LiveQuery("\"" + phrase['text'] + " and other *\"")
        return self.__collectFollowing(phrase, query.getResults(), " and other ")

    def __patternOrOther(self, phrase):
        """Collect candidates from the pattern ``<phrase> or other *``."""
        query = live.LiveQuery("\"" + phrase['text'] + " or other *\"")
        return self.__collectFollowing(phrase, query.getResults(), " or other ")

    def __patternLikeOther(self, phrase):
        """Collect candidates from the pattern ``<phrase>, like other *``."""
        query = live.LiveQuery("\"" + phrase['text'] + ", like other *\"")
        return self.__collectFollowing(phrase, query.getResults(), ", like other ")

    def __patternSuchAs(self, phrase):
        """Collect candidates from the pattern ``* such as <phrase>``."""
        query = live.LiveQuery("\"* such as " + phrase['text'] + "\"")
        return self.__collectPreceding(phrase, query.getResults(), "such as ")

    def __patternEspecially(self, phrase):
        """Collect candidates from the pattern ``* especially <phrase>``."""
        query = live.LiveQuery("\"* especially " + phrase['text'] + "\"")
        return self.__collectPreceding(phrase, query.getResults(), "especially ")

    def __patternIncluding(self, phrase):
        """Collect candidates from the pattern ``* including <phrase>``."""
        query = live.LiveQuery("\"* including " + phrase['text'] + "\"")
        return self.__collectPreceding(phrase, query.getResults(), "including ")

    def __getFirstCandidate(self, text):
        """Return the first noun statement of the leading clause of ``text``, or ``None``.

        The clause ends at the first of ``. , ? newline ; : |`` or a dash
        adjacent to a space.
        """
        clause = text
        for delimiter in (".", ",", "?", "\n", ";", ":", "|", " -", "- "):
            clause = clause.partition(delimiter)[0]
        if not clause:
            return None
        nounPhrases = self.tagger.getTextNounStatements(self.tagger.tagText(clause))
        if len(nounPhrases) > 0:
            return nounPhrases[0]
        return None

    def __getLastCandidate(self, text):
        """Return the last noun statement of ``text``, or ``None`` when there is none."""
        if len(text) > 0:
            nounPhrases = self.tagger.getTextNounStatements(self.tagger.tagText(text))
            if len(nounPhrases) > 0:
                return nounPhrases[-1]
        return None

    def __log(self, chain):
        """Print ``chain`` prefixed with the current time."""
        # Parenthesised single argument: same output under Python 2's print
        # statement and Python 3's print function.
        print(time.ctime() + ": " + chain)