def recoursiveFind(self, sentenceDoc, subject, verb, root):
    """Emit a triple for each prepositional phrase found under ``root``.

    ``findVerbModifier(sentenceDoc)`` is consulted first. When it returns a
    truthy modifier token, that token replaces ``root`` as the starting point
    and supplies the predicate of every triple; otherwise each preposition
    is used as its own predicate. The object of a triple is the ``pobj``
    child of the preposition.

    Each triple is printed and appended to ``self.file`` as
    ``subject; predicate; object`` followed by a newline.

    Args:
        sentenceDoc: Parsed sentence, forwarded to ``findVerbModifier``.
        subject: Object whose ``uri`` is written as the triple's subject.
        verb: Not used by this method; kept so existing callers still work.
        root: Token whose ``children`` are scanned for ``prep`` dependents.
    """
    adv = findVerbModifier(sentenceDoc)
    use_modifier = bool(adv)
    if use_modifier:
        root = adv

    for child in root.children:
        if child.dep_ != "prep":
            continue
        for proj in child.children:
            if proj.dep_ != "pobj":
                continue

            obj = self.nounArray.findWord(proj.orth_)
            if obj is None:
                # Unknown noun: build a Word for it instead of failing on
                # ``None.uri`` below.
                obj = datastructure.Word(proj.orth_)
                obj.addType(proj.pos_)
                obj.addUri(wordUri.findUri(obj))

            predicate_token = adv if use_modifier else child
            predicate = datastructure.Word(predicate_token.orth_)
            predicate.addType(predicate_token.pos_)
            predicate.addUri(wordUri.findUri(predicate))

            print(subject.uri, "- " + predicate.uri + " -", obj.uri)
            self.file.write(subject.uri + "; " + predicate.uri + "; " + obj.uri + "\n")
            # NOTE(review): the source indentation was lost; this assumes the
            # break ends the scan of this preposition after its first pobj.
            break
def findThird(self, sentenceDoc, subject, verb, children, flag):
    """Emit triples for the objects reachable from ``children``.

    For every ``appos`` or ``pobj`` child a triple
    ``(subject.uri, verb.uri, object.uri)`` is printed and passed to
    ``self.writeOtter``. The object comes from ``self.nounArray.findWord``;
    when that returns ``None`` a new ``Word`` is built from the token.

    For every ``prep`` or ``acomp`` child the method recurses into that
    child's own children with ``flag=True``. When ``flag`` is false the
    child itself becomes the predicate of the recursive call; when ``flag``
    is true the current ``verb`` is carried down unchanged.

    The predicate chosen for one child is local to that child, so later
    siblings still use the ``verb`` that was passed in.

    Args:
        sentenceDoc: Parsed sentence; only forwarded to recursive calls.
        subject: Object whose ``uri`` is the triple's subject.
        verb: Object whose ``uri`` is the triple's predicate.
        children: Iterable of tokens to inspect.
        flag: True once a predicate has already been chosen higher up.
    """
    for child in children:
        if child.dep_ in ("appos", "pobj"):
            obj = self.nounArray.findWord(child.orth_)
            if obj is None:
                obj = datastructure.Word(child.orth_)
                obj.addType(child.pos_)
                obj.addUri(wordUri.findUri(obj))
            print(subject.uri, "- " + verb.uri + " -", obj.uri)
            self.writeOtter(subject.uri, verb.uri, obj.uri)

        if child.dep_ in ("prep", "acomp"):
            predicate = verb
            if not flag:
                predicate = datastructure.Word(child.orth_)
                predicate.addType(child.pos_)
                predicate.addUri(wordUri.findUri(predicate))
            # NOTE(review): the source indentation was lost; this assumes the
            # recursion runs for every prep/acomp child, not only when
            # ``flag`` is false.
            self.findThird(sentenceDoc, subject, predicate, list(child.children), True)
def findThird(self, sentenceDoc, subject, verb, children):
    """Emit triples for ``attr`` children, or fall back to the sentence root.

    For every child whose dependency is ``attr`` a triple
    ``subject; verb; object`` is printed and appended to ``self.file``, and
    ``self.recoursiveFind`` is then called on that child. The object comes
    from ``self.nounArray.findWord``; when that returns ``None`` a new
    ``Word`` is built from the token.

    When no ``attr`` child exists, ``self.recoursiveFind`` is called once on
    the first token of ``sentenceDoc`` whose dependency is ``ROOT``. If the
    sentence has no such token the method returns without doing anything.

    Args:
        sentenceDoc: Iterable of tokens for the sentence.
        subject: Object whose ``uri`` is the triple's subject.
        verb: Object whose ``uri`` is the triple's predicate.
        children: Iterable of tokens to inspect for ``attr`` dependents.
    """
    found = False
    for child in children:
        if child.dep_ != "attr":
            continue

        obj = self.nounArray.findWord(child.orth_)
        if obj is None:
            obj = datastructure.Word(child.orth_)
            obj.addType(child.pos_)
            # NOTE(review): findUri receives the lemma string here rather
            # than the Word object -- confirm that this is intended.
            obj.addUri(wordUri.findUri(child.lemma_))

        print(subject.uri, "- " + verb.uri + " -", obj.uri)
        self.file.write(subject.uri + "; " + verb.uri + "; " + obj.uri + "\n")
        # NOTE(review): the source indentation was lost; this assumes the
        # follow-up search runs for every attr child, found or newly built.
        self.recoursiveFind(sentenceDoc, subject, verb, child)
        found = True

    if found:
        return

    root_token = next((tok for tok in sentenceDoc if tok.dep_ == "ROOT"), None)
    if root_token is None:
        # No ROOT token: nothing to search from.
        return
    self.recoursiveFind(sentenceDoc, subject, verb, root_token)
def findNouns(self):
    """Build the noun collection for ``self.doc``.

    Nouns are gathered in three passes:

    1. Named entities from ``self.doc.ents``, skipping the purely
       numeric/temporal labels listed in ``skipped_labels``. The entity's
       type is the ``pos_`` of the first token in the document whose text
       equals one of the entity's words; a URI is attached only when such a
       token exists.
    2. Noun chunks from ``self.doc.noun_chunks``, categorised ``"UNKNOWN"``
       with type ``"NOUN"``, added when not already present.
    3. Single tokens tagged ``NOUN`` or ``PROPN``, categorised
       ``"UNKNOWN"``, added when not already present.

    The collection is then passed through ``purifyNounArray`` and
    ``removeArticles`` before being returned.

    Returns:
        The ``WordArray`` produced by ``removeArticles``.
    """
    skipped_labels = {
        'DATE', 'TIME', 'PERCENT', 'CARDINAL', 'MONEY', 'QUANTITY', 'ORDINAL',
    }
    nounArray = WordArray()

    for ent in self.doc.ents:
        if ent.label_ in skipped_labels:
            continue
        noun = datastructure.Noun(ent.orth_)
        noun.addCategory(ent.label_)
        entity_words = ent.orth_.split()
        # Stop at the first matching token instead of walking the whole
        # document with a "done" flag.
        match = next((tok for tok in self.doc if tok.orth_ in entity_words), None)
        if match is not None:
            noun.addType(match.pos_)
            noun.addUri(wordUri.findUri(noun))
        nounArray.addWord(noun)

    for chunk in self.doc.noun_chunks:
        noun = datastructure.Noun(chunk.orth_)
        noun.addCategory("UNKNOWN")
        noun.addType("NOUN")
        if noun not in nounArray:
            noun.addUri(wordUri.findUri(noun))
            nounArray.addWord(noun)
            print(noun)

    for token in self.doc:
        if token.pos_ not in ('NOUN', "PROPN"):
            continue
        noun = datastructure.Noun(token.orth_)
        noun.addCategory("UNKNOWN")
        noun.addType(token.pos_)
        # The URI is attached before the membership test, as in the original;
        # the comparison used by ``in`` is not visible here.
        noun.addUri(wordUri.findUri(noun))
        if noun not in nounArray:
            nounArray.addWord(noun)

    nounArray = purifyNounArray(nounArray)
    nounArray = removeArticles(nounArray, self.doc)
    return nounArray
def findVerbs(self):
    """Collect one ``Verb`` per distinct verb form found in ``self.doc``.

    Tokens tagged ``VERB`` are taken in document order; a surface form that
    has already been seen is skipped. Each new verb gets its lemma as root,
    its part-of-speech tag as type, and a URI from ``wordUri.findUri``.

    Returns:
        A new ``WordArray`` holding the collected verbs.
    """
    collected = WordArray()
    seen_forms = set()
    for token in self.doc:
        if token.pos_ != 'VERB':
            continue
        surface = token.orth_
        if surface in seen_forms:
            continue
        seen_forms.add(surface)

        entry = datastructure.Verb(surface)
        entry.addRoot(token.lemma_)
        entry.addType(token.pos_)
        entry.addUri(wordUri.findUri(entry))
        collected.addWord(entry)
    return collected
def removeArticles(nounArray, doc):
    """Strip determiner, adjective and numeric modifiers from multi-word nouns.

    Only entries whose text has more than one word and whose ``type`` is not
    ``"PROPN"`` are considered. For each such entry, every token in ``doc``
    whose dependency is ``det``, ``amod`` or ``nummod`` and whose text equals
    one of the entry's words is removed from the entry (all occurrences);
    the remaining words are re-joined with single spaces and the entry's URI
    is refreshed through ``wordUri.findUri``.

    A removal that would leave the entry with no words at all is skipped, so
    an entry is never reduced to an empty string.

    Args:
        nounArray: Iterable of noun entries exposing ``word``, ``type`` and
            ``addUri``. Entries are modified in place.
        doc: Iterable of tokens exposing ``orth_`` and ``dep_``.

    Returns:
        The same ``nounArray`` object that was passed in.
    """
    removable_deps = ("det", "amod", "nummod")
    for noun in nounArray:
        if len(noun.word.split()) <= 1 or noun.type == "PROPN":
            continue
        for token in doc:
            if token.dep_ not in removable_deps:
                continue
            # Re-split on every pass: earlier removals may have changed it.
            parts = noun.word.split()
            if token.orth_ not in parts:
                continue
            remaining = [part for part in parts if part != token.orth_]
            if not remaining:
                # Matching is by surface text only; never erase the noun.
                continue
            noun.word = " ".join(remaining)
            noun.addUri(wordUri.findUri(noun))
    return nounArray