def Where_Which_module(sent, sent_features):
    """Generate "Where"/"Which <place>" questions for a sentence.

    Parses *sent*, asks ``where_which_inFirstPP`` to detect a leading
    prepositional phrase with a location NER, and either builds a single
    question from that PP or falls back to a full parse traversal.

    Args:
        sent: The source sentence (plain text).
        sent_features: Feature dict for the sentence (currently unused here;
            kept for interface compatibility with the other *_module callers).

    Returns:
        A list of generated question strings (possibly empty).
    """
    question = []
    simple_ques = []
    sNLP = StanfordNLP()
    parse = sNLP.parse(sent)
    # simple_ques is filled in-place with tuples:
    # (starts_with_pp, simplified_sentence, ner_label_set, pp_text)
    where_which_inFirstPP(sent, parse, simple_ques)
    if len(simple_ques) > 0:
        # Renamed from `bool`/`type` in the original: never shadow builtins.
        for starts_with_pp, thisSent, nerSet, thisPP in simple_ques:
            dep_tree = sNLP.dependency_parse(thisSent)
            dep_tree = dep_tree.__next__()
            dep_tree_list = list(dep_tree.triples())
            if starts_with_pp:
                # The preposition itself (first token of the PP), e.g. "in".
                case = thisPP.split(" ")[0]
                # Pick the most specific place word the NER labels support.
                if "COUNTRY" in nerSet:
                    place_type = "country"
                elif "LOCATION" in nerSet:
                    place_type = "location"
                elif "CITY" in nerSet:
                    place_type = "city"
                else:
                    place_type = "place"
                # First PP-led candidate wins: return immediately,
                # matching the original short-circuit behavior.
                return [construct_where_which(thisSent, dep_tree_list,
                                              case, place_type)]
            else:
                # Accumulates questions into `question` in-place.
                where_which_parseTraversal(thisSent, dep_tree_list,
                                           sNLP.ner(thisSent), question)
    return question
def categorizeQs(sents, sent_to_Q_dict):
    """Route one sentence through every question-generation module.

    Appends ``(sentence, question)`` pairs to the appropriate bucket of
    *sent_to_Q_dict* ("Binary", "Why", "What_Who", "Where_Which", "When")
    based on keyword triggers and NER labels found in the sentence.

    Args:
        sents: The source sentence (plain text).
        sent_to_Q_dict: Dict of lists, mutated in place; must already contain
            the five bucket keys used below.
    """
    sent_features = {}
    sNLP = StanfordNLP()
    normal_ners = sNLP.ner(sents)
    normal_ner_set = {t[1] for t in normal_ners}
    # NOTE(review): `w in sents` is a substring test, so e.g. "do" also
    # matches inside "dominant". Preserved from the original; tokenizing
    # would tighten it. `any()` replaces the original `max([1 if ...])`,
    # which additionally raised ValueError on an empty keyword list.
    if any(w in sents for w in aux_words):
        thisQues = Binary_QG.bin_question(sents)
        for p_b in thisQues:
            if p_b is not None:
                sent_to_Q_dict["Binary"].append((sents, p_b))
    if any(w in sents for w in why_keys):
        thisQues = Why_QG.why_q(sents)
        if thisQues is not None:
            sent_to_Q_dict["Why"].append((sents, thisQues))
    # What/Who questions are attempted unconditionally.
    thisQues = What_Who_QG.What_Who_module(sents)
    for p_ in thisQues:
        if p_ is not None:
            sent_to_Q_dict["What_Who"].append((sents, p_))
    # Location-style NER labels trigger Where/Which generation.
    if normal_ner_set & {'LOCATION', 'COUNTRY', 'CITY'}:
        thisQ = Where_Which_QG.Where_Which_module(sents, sent_features)
        for p in thisQ:
            if p is not None:
                sent_to_Q_dict["Where_Which"].append((sents, p))
    # Temporal NER labels trigger When generation (single question or None).
    if normal_ner_set & {'DATE', 'TIME'}:
        thisQ = When_QG.When_module(sents, sent_features)
        if thisQ is not None:
            sent_to_Q_dict["When"].append((sents, thisQ))
class QA():
    """Question-answering engine.

    Classifies a question by its WH-phrase type, generates candidate
    answer fragments by repeatedly dropping matching constituents from
    candidate-sentence parse trees, and scores candidates with fuzzy
    string similarity against the question.

    NOTE(review): this block was recovered from whitespace-flattened
    source; the re-indentation of a few statements (recursion placement in
    decideType/findFragment, the trailing loop in edit_distance) is
    inferred from context — confirm against the original file.
    """

    def __init__(self):
        self.sNLP = StanfordNLP()
        self.dropType = {}   # WH type -> constituent labels to drop/collect
        self.typeNer = {}    # question word -> acceptable NER labels
        self.typePro = {}    # question word -> typical prepositions
        self.initQstType()
        self.candidateAnswer = []    # answer fragments (token lists)
        self.candidateSentence = []  # sentences the fragments came from
        self.qgPipeline = QGPipeline()
        self.threshold = 90  # fuzzy-match score needed to accept an answer

    def initQstType(self):
        """Populate the static lookup tables used for question typing."""
        self.typeSet = ['WHADJP', 'WHADVP', 'WHPP', 'WHAVP', 'WHNP']
        self.dropType['WHADJP'] = ['NP', 'CD']
        self.dropType['WHAVP'] = ['PP', 'SBAR']
        self.dropType['WHADVP'] = ['PP', 'SBAR']
        self.dropType['WHPP'] = ['PP']
        self.dropType['WHNP'] = ['NP']
        self.dropType['UK'] = ['NP', 'NN']  # 'UK' = unknown type fallback
        self.auxWord = ['did', 'do', 'does', 'is', 'are', 'were', 'was']
        self.typePro['where'] = ['in', 'at', 'on', 'behind', 'next']
        self.typeNer['when'] = ['DATE']
        self.typeNer['where'] = [
            'CITY', 'STATE_OR_PROVINCE', 'ORGANIZATION', 'LOCATION', 'COUNTRY'
        ]

    def decideType(self, myParent):
        """Recursively find the question's WH phrase in a parse tree.

        Side effects: sets ``self.thisType`` to the WH label, removes the
        WH node from the tree, sets ``self.qstFlag``, and records the
        remaining question text (minus the final token) in ``self.qstSim``.
        """
        if self.qstFlag:
            return
        for node in myParent:
            #node.pretty_print()
            if self.qstFlag:
                return
            if isinstance(node, str):
                continue  # leaf token, not a subtree
            if node.label() in self.typeSet:
                self.thisType = node.label()
                # NOTE(review): mutates the tree while iterating over it;
                # the immediate qstFlag short-circuit above masks this.
                myParent.remove(node)
                self.qstFlag = True
            self.decideType(node)
            if node.label() == 'ROOT':
                # Question text with WH phrase removed and the trailing
                # token (the '?') dropped.
                self.qstSim = node.leaves()
                self.qstSim = ' '.join(self.qstSim[:-1])

    def parseDep(self, x):
        """Return the lemmatized, lowercased (governor, dependent) words
        of a dependency triple."""
        a = x[0][0].lower()
        a = WordNetLemmatizer().lemmatize(a)
        b = x[2][0].lower()
        b = WordNetLemmatizer().lemmatize(b)
        return (a, b)

    def bin_answer(self, question, sent):
        """Answer a yes/no question against one sentence.

        Returns:
            ('Yes'|'No', score): score is 100 on a subject match,
            otherwise the fuzzy partial-ratio of question vs. sentence.
        """
        #print(question, sent)
        qstTree = self.sNLP.dependency_parse(question)
        qstTree = qstTree.__next__()
        qstTree = list(qstTree.triples())
        sentTree = self.sNLP.dependency_parse(sent)
        sentTree = sentTree.__next__()
        sentTree = list(sentTree.triples())
        #print(qstTree, sentTree)
        qstSub = []   # subject pairs found in the question
        sentSub = []  # subject pairs found in the sentence
        flag = False  # a sentence subject matches a question subject
        neg = False   # the question contains a negation relation
        for x in qstTree:
            # print(x)
            if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
                qstSub.append(self.parseDep(x))
            if x[1] == 'neg':
                neg = True
        for x in sentTree:
            if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
                sentSub.append(self.parseDep(x))
                if self.parseDep(x) in qstSub:
                    flag = True
        #print(qstSub)
        #print(sentSub)
        if flag:
            # Shared subject: negated question flips the answer.
            if neg:
                return ('No', 100)
            else:
                return ('Yes', 100)
        bin_tags = set(
            ["did", 'do', 'does', 'are', 'is', 'have', 'was', 'were', 'has'])
        question = question.lower()
        sent = sent.lower()
        q_tokens = word_tokenize(question)
        s_tokens = word_tokenize(sent)
        negations = set(['not', 'never', "aren't"])
        ans = ''
        # case 1: negations
        # NOTE(review): `ans` computed here is unconditionally overwritten
        # by case 2 below, so this loop is effectively dead code.
        for neg in negations:
            if (neg in q_tokens) and (neg not in s_tokens):
                if ans == "No":
                    ans = "Yes"
                else:
                    ans = "No"
            if (neg in q_tokens) and (neg in s_tokens):
                if ans == "Yes":
                    ans = "No"
                else:
                    ans = "Yes"
        # case 2: similarity
        sim = fuzz.partial_ratio(question, sent)
        if sim > 90:
            ans = "Yes"
        else:
            ans = "No"
        return (ans, sim)

    def qstType(self, qst):
        """Classify the question *qst*; resets per-question state and
        delegates to decideType. Type defaults to 'UK' (unknown)."""
        self.thisType = 'UK'
        self.qstFlag = False
        self.qstSim = None
        tree = self.sNLP.parser_sents([
            qst,
        ])
        for i in tree:
            self.decideType(i)

    def fitness(self, txt, qst):
        """Return True if sentence *txt* can answer question *qst* with a
        fuzzy score above ``self.threshold``.

        Same candidate-generation machinery as ``answer`` but for a single
        sentence and returning a boolean instead of printing.
        """
        self.qstType(qst)
        if self.thisType == 'UK':
            # Unknown WH type: treat as a binary question.
            _, sim = self.bin_answer(qst, txt)
            return sim > self.threshold
        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []
        extendList = []
        # Expand the sentence into simplified variants (conjunction splits
        # etc.) so constituent dropping has more material to work with.
        for thisSent in [txt]:
            extendList.append(thisSent)
            thisParseTree = self.qgPipeline.getParseTree(thisSent)
            no_conj_list = self.qgPipeline.splitConj(thisParseTree)
            simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)
            for i in simpl_sents:
                extendList.append(i)
        # pdb.set_trace()
        for txt in extendList:
            # print(txt)
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                # Repeatedly drop the (dropTotal+1)-th matching constituent
                # from a fresh copy until no new fragment is found.
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    nowTree = copy.deepcopy(i)
                    self.dropTime = 0
                    # NOTE(review): dropFragment returns None; the
                    # assignment to nowTree is vestigial.
                    nowTree = self.dropFragment(nowTree, qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1
        # Score each (answer fragment, host sentence) pair against the
        # WH-stripped question text; ties broken by answer length per type.
        best_dis = 0
        best_ans = '_'
        best_candi = None
        best_sen = None
        for i in range(len(self.candidateSentence)):
            nowSentence = ' '.join(self.candidateSentence[i])
            score = fuzz.partial_ratio(self.qstSim, nowSentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            # print(this_ans, best_ans, score, best_dis)
            if self.qstSim == None:
                continue
            if this_ans == None:
                continue
            if (score >= best_dis):
                # On ties, WHADVP/WHPP prefer shorter answers,
                # WHNP prefers longer ones.
                if score == best_dis and len(this_ans) >= len(
                        best_ans) and self.thisType in ['WHADVP', 'WHPP']:
                    continue
                if score == best_dis and len(this_ans) <= len(
                        best_ans) and self.thisType in ['WHNP']:
                    continue
                best_dis = score
                best_sen = nowSentence
                best_ans = this_ans
        return self.threshold < best_dis

    def dropFragment(self, myParent, qstType):
        """Remove the (dropTotal+1)-th constituent whose label matches
        the question type, recording the removed fragment in
        ``candidateAnswer`` and the resulting sentence in
        ``candidateSentence``. Mutates the tree in place; returns None.
        """
        flag = 0
        for node in myParent:
            if isinstance(node, str):
                continue
            if self.dropTime > self.dropTotal:
                return  # already dropped this pass's target
            if node.label() in self.dropType[qstType]:
                self.dropTime += 1
                if self.dropTime > self.dropTotal:
                    myParent.remove(node)
                    self.candidateAnswer.append(node.leaves())
                    self.findFlag = 1
                    return
            self.dropFragment(node, qstType)
            if node.label() == 'ROOT' and self.findFlag:
                # print(node.leaves())
                # Sentence text with the fragment removed.
                self.candidateSentence.append(node.leaves())

    def findFragment(self, myParent, qstType):
        """Collect every constituent matching the question type as
        ``(leaves, label)`` into ``candidateAnswer`` (no tree mutation)."""
        for node in myParent:
            if isinstance(node, str):
                continue
            # node.pretty_print()
            if node.label() in self.dropType[qstType]:
                self.candidateAnswer.append((node.leaves(), node.label()))
            self.findFragment(node, qstType)

    def answerSpecial(self, txtList, tokens, qstType):
        """Answer by NER-filtering all matching fragments from *txtList*.

        Prints the first accepted fragment.
        NOTE(review): `pos_tag[1][1]` inspects only the second token's NER
        label, and the final print raises IndexError when nothing matched
        — confirm whether callers rely on that.
        """
        # print(tokens[0])
        self.candidateAnswer = []
        self.finalAnswer = []
        self.candidateSentence = []
        for txt in txtList:
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                self.findFragment(i, qstType)
        for i in self.candidateAnswer:
            sentence = ' '.join(i[0])
            pos_tag = self.sNLP.ner(sentence)
            print(pos_tag)
            if pos_tag[1][1] in self.typeNer[qstType]:
                # print(pos_tag)
                self.finalAnswer.append(sentence)
        print(self.finalAnswer[0])

    def preProcessText(self, text):
        """Strip parenthesized spans and collapse runs of spaces."""
        data = re.sub("\(.*\)", "", text)
        data = re.sub(' +', ' ', data).strip()
        return data

    def answer(self, txtList, qst):
        """Print the best answer to *qst* found among sentences *txtList*.

        Binary ('UK'-typed) questions print Yes/No; WH questions print the
        best-scoring dropped fragment, with NER-based penalties when the
        fragment's entity type doesn't match who/when/where.
        """
        self.head = word_tokenize(qst)[0].lower()
        self.qstType(qst)
        if self.thisType == 'UK':
            # Binary question: pick the sentence with the highest
            # bin_answer similarity and print its Yes/No verdict.
            best_score = 0
            best_ans = 'Yes'
            best_sent = '_'
            for txt in txtList:
                ans, sim = self.bin_answer(qst, txt)
                if sim > best_score:
                    best_ans = ans
                    best_score = sim
                    best_sent = txt
            #print('=======')
            #print(best_sent)
            #print(qst)
            print(best_ans + '.')
            #print(best_score)
            #print('=======')
            return
        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []
        extendList = []
        # Clean and length-filter sentences, then add simplified variants.
        for thisSent in txtList:
            thisSent = self.preProcessText(thisSent)
            if (len(word_tokenize(thisSent)) < 4
                    or len(word_tokenize(thisSent)) > 25):
                continue  # too short/long to be a useful answer source
            extendList.append(thisSent)
            thisParseTree = self.qgPipeline.getParseTree(thisSent)
            no_conj_list = self.qgPipeline.splitConj(thisParseTree)
            simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)
            for i in simpl_sents:
                extendList.append(i)
        # pdb.set_trace()
        for txt in extendList:
            # print(txt)
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                # Same iterative constituent-dropping loop as in fitness().
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    nowTree = copy.deepcopy(i)
                    self.dropTime = 0
                    nowTree = self.dropFragment(nowTree, qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1
        best_dis = 0
        best_candi = None
        best_sen = None
        best_ans = '_'
        for i in range(len(self.candidateSentence)):
            nowSentence = ' '.join(self.candidateSentence[i])
            # print(nowSentence)
            # print(self.qstSim)
            score = fuzz.partial_ratio(self.qstSim, nowSentence)
            # print(score)
            # print('----------')
            this_ans = ' '.join(self.candidateAnswer[i])
            # print(this_ans, best_ans, score, best_dis)
            if self.qstSim == None:
                continue
            if this_ans == None:
                continue
            if (score >= best_dis):
                if score == best_dis and len(this_ans) >= len(
                        best_ans) and self.thisType in ['WHADVP', 'WHPP']:
                    continue
                if score == best_dis and len(this_ans) <= len(
                        best_ans) and self.thisType in ['WHNP']:
                    continue
                # Penalize (or skip) answers whose NER labels don't fit
                # the question word; a 10-point margin keeps clearly
                # better-scoring candidates alive.
                if self.head == 'who':
                    ners = getExhaustiveNERs(this_ans)
                    #print(this_ans, ners[0])
                    if 'PERSON' not in ners[0] and 'ORGANIZATION' not in ners[
                            0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                if self.head == 'when':
                    ners = getExhaustiveNERs(this_ans)
                    if 'DATE' not in ners[0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                if self.head == 'where':
                    ners = getExhaustiveNERs(this_ans)
                    if 'LOCATION' not in ners[0] and 'CITY' not in ners[
                            0] and 'ORGANIZATION' not in ners[
                                0] and 'STATE_OR_PROVINCE' not in ners[
                                    0] and 'COUNTRY' not in ners[0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                best_dis = score
                best_sen = nowSentence
                best_ans = this_ans
        #print('++++++++++++++++++')
        #print(qst)
        #print(best_dis)
        #print(best_sen)
        if best_ans == '_':
            print('I cannot answer that question: ' + qst)
        else:
            print(best_ans.capitalize() + '.')
        #print('++++++++++++++++++')

    def edit_distance(self, s1, s2):
        """Case-insensitive Levenshtein distance between sequences
        *s1* and *s2* (standard two-row dynamic programming)."""
        if len(s1) < len(s2):
            return self.edit_distance(s2, s1)
        # len(s1) >= len(s2)
        if len(s2) == 0:
            return len(s1)
        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            c1 = c1.lower()
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                c2 = c2.lower()
                insertions = previous_row[
                    j + 1] + 1  # j+1 instead of j since previous_row and current_row are one character longer
                deletions = current_row[j] + 1  # than s2
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]
        # NOTE(review): everything below is unreachable (after the return)
        # and references undefined names (tokens, best_dis, best_ans) —
        # looks like a leftover from an older version; candidate for removal.
        for i in range(len(self.candidateSentence)):
            nowSentence = self.candidateSentence[i]
            score = self.edit_distance(nowSentence, tokens)
            best_candi = ' '.join(nowSentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            if (score < best_dis
                    or (score == best_dis and len(this_ans) < len(best_ans))):
                best_dis = score
                best_ans = this_ans
        return best_dis
class QuestionGeneration:
    """Rule-based question generation from a sentence's dependency parse.

    NOTE(review): recovered from whitespace-flattened source; the
    re-indentation of a few statements is inferred from context — confirm
    against the original file.
    """

    def __init__(self):
        self.sNLP = StanfordNLP()
        # Copular verbs that can be fronted to form a yes/no question.
        self.beVerbs = {"am", "is", "are", "was", "were"}
        # self.aux_verbs = {'is', 'were', 'can', 'could', }

    def auxilaryWord(self, sub, POS_tag):
        """Pick the do-support auxiliary for subject *sub*.

        Returns 'do'/'does' for the pronouns handled below, otherwise
        None (implicit). POS_tag is currently unused.
        """
        # TODO lowercase
        # TODO will may...
        # TODO plural...
        # Jerry and I
        if sub.lower() in ('i', 'they', 'you'):
            return 'do'
        if sub.lower() in ('he', 'she'):
            return 'does'

    def beWork(self, sentence):
        """Front the first be-verb of *sentence* to form a yes/no question.

        Mutates *sentence* in place (pop + insert), so it is expected to be
        a token list; returns the mutated sequence, or None when no be-verb
        is found before the last position.
        """
        # pos = nltk.pos_tag(sentence)
        j = None
        for i in range(len(sentence) - 1):
            if sentence[i] in self.beVerbs:
                j = i
                break
        if j is not None:
            # Move the be-verb to the front: "X is Y ." -> "is X Y ."
            temp = sentence[j]
            sentence.pop(j)
            sentence.insert(0, temp)
            #print(sentence)
            return sentence
        return

    # def getNounandVerbOfSentence(self, sentence):

    def QG(self, text):
        """Generate questions from *text* using its dependency triples.

        Walks every dependency triple and emits at most one question per
        triple: Who/Which-X for subjects (by NER label), "How many" for
        numeric modifiers, "Whose" for possessives, and Where/When for
        case-marked CITY/DATE phrases.

        Returns:
            List of question strings (a fronted yes/no question first,
            when one could be built).
        """
        dep_parse_Tree = self.sNLP.dependency_parse(text)
        dep_parse_Tree = dep_parse_Tree.__next__()
        Ques_list = []
        # Yes or No question
        be_question = self.beWork(text)
        if be_question is not None:
            be_question += '?'
            Ques_list.append(be_question)
        # WHO question for Subject
        # create NER tags
        # NOTE(review): dict() collapses duplicate tokens to their last
        # NER label — acceptable here, but worth knowing.
        ner_tags = dict(self.sNLP.ner(text))
        pos_tag = self.sNLP.pos(text)
        #print(ner_tags)
        # get triples list of the dependency tree
        triples_list = list(dep_parse_Tree.triples())
        #print(triples_list)
        ##### LOOP THRU DEPENDENCY TREE AND CREATE QUESTIONS
        auxWord = 'xxx'  # placeholder until a subject fixes the auxiliary
        for this in triples_list:
            # print(this)
            # '?' doubles as the "no question built" sentinel for this triple.
            temp_text = '?'
            # for the subject question
            if this[1] in ['nsubj', 'csubj', 'nsubjpass']:
                subject = None
                sub_pos = None
                # in order of preference
                if this[2][1] in ['NNP', 'NNPS', 'PRP']:
                    subject = this[2][0]
                    sub_pos = this[2][1]
                elif this[0][1] in ['NNP', 'NNPS']:
                    subject = this[0][0]
                    sub_pos = this[0][1]
                elif this[2][1] in ['NN', 'NNS']:
                    subject = this[2][0]
                    sub_pos = this[2][1]
                #print("sub", subject)
                if subject is not None:
                    # need to add sub_pos
                    auxWord = self.auxilaryWord(subject, sub_pos)
                    if ner_tags[subject] in ['PERSON', 'TITLE', 'MISC'
                                             ]:  # check if its a PERSON NER
                        temp_text = self.contructQ(triples_list, subject,
                                                   text, None)
                        temp_text = temp_text.replace(subject, "Who").replace(
                            " .", "?")  # create question
                        # some string manipulation to get the ?
                        if "?" not in temp_text:
                            temp_text = temp_text + "?"
                        # print(text.replace(subject, "Who").replace(" .", "?"))
                    if ner_tags[
                            subject] == 'ORGANIZATION':  # if the subject is ORG
                        temp_text = text.replace(subject,
                                                 "Which organization").replace(
                                                     " .", "?")
                    if ner_tags[subject] == 'CITY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which city").replace(
                                                     " .", "?")
                    if ner_tags[
                            subject] == 'COUNTRY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which country").replace(
                                                     " .", "?")
                    if this[2][1] in ['PRP']:  # if the subject is preposition
                        temp_text = text.replace(subject, "Who").replace(
                            " .", "?")
                    if ner_tags[subject] in [
                            'O', 'LOCATION'
                    ] and temp_text == '?':  # if the subject is Other
                        temp_text = self.contructQ(triples_list, subject,
                                                   text, None)
                        if sub_pos == 'PRP' and subject.lower() in [
                                'they', 'he', 'she'
                        ]:
                            temp_text = temp_text.replace(subject,
                                                          "Who").replace(
                                                              " .", "?")
                        else:
                            temp_text = temp_text.replace(subject,
                                                          "What").replace(
                                                              " .", "?")
            # for number, How many questions
            elif this[1] in ['nummod']:
                numPhrase = this[2][0] + ' ' + this[0][0]
                targetWord = this[2][0]
                # NOTE(review): `in ('NUMBERS')` is a *substring* test on
                # the string 'NUMBERS' (missing comma for a tuple) —
                # e.g. the label 'NUMBER' also matches. Confirm intent.
                if ner_tags[targetWord] in ('NUMBERS'):
                    temp_text = text.replace(numPhrase, "").replace(" .", "?")
                    temp_text = "How many " + this[0][0] + " " + (
                        auxWord if auxWord is not None else
                        "") + " " + temp_text
            # for possessive questions
            elif this[1] in ['nmod:poss']:
                if this[2][1] in ['NNP']:
                    # if this[2][0][-1] == 's':
                    #     poss_word = this[2][0]
                    # else:
                    poss_word = this[2][0]  #+ " 's"
                    temp_text = self.contructQ(triples_list, this[2][0], text,
                                               None)
                    temp_text = temp_text.replace(poss_word, "Whose").replace(
                        " .", "?").replace("'s", "").replace(" '", "")
                    if not temp_text.startswith("Whose"):
                        temp_text = temp_text.replace("Whose",
                                                      "whose").replace(
                                                          " '", "")
            # for prop questions
            # NOTE(review): `in ('case')` is also a substring test — the
            # relation 'case' matches, but so would 'as', 'ca', etc.
            elif this[1] in ('case'):
                subject = this[0][0]
                propPhrase = this[2][0] + ' ' + this[0][0]
                # print(propPhrase)
                if ner_tags[subject] in ['CITY']:
                    # where
                    temp_text = text.replace(propPhrase, "").replace(
                        " .", "?")  # create question
                    temp_text = "Where " + (auxWord if auxWord is not None
                                            else "") + " " + temp_text
                    # some string manipulation to get the ?
                if ner_tags[subject] in ['DATE']:
                    # when
                    temp_text = text.replace(propPhrase, "").replace(" .", "?")
                    # print(auxWord, temp_text)
                    temp_text = "When " + (auxWord if auxWord is not None
                                           else "") + " " + temp_text
            elif this[1] in ('iobj', 'dobj'):
                # code to be written for questions on direct and indirect Objects
                pass
            #### endif
            if "?" not in temp_text:
                temp_text = temp_text + "?"
            if temp_text != '?':
                # print(temp_text)
                Ques_list.append(temp_text)
        return (Ques_list)

    #### in case of the subject has modifiers or the Subject is a part of a long NP remove all the related modifiers of the subject with the help of dependency tree
    #### same to be replicated for Object as well
    def contructQ(self, list_triples, subject, text, object):
        """Strip subject-related modifier words from *text*.

        Slices *text* to start at the subject, then transitively removes
        every word the dependency tree attaches to the subject (except
        determiners and other subjects), so the WH word can replace a
        clean subject. *object* is currently unused.
        """
        if subject is not None:
            text = text[text.find(
                subject
            ):]  ## removing unnecessary determinants (a, the, An) by slicing off until the subject word
        # print(text)
        dict_of_words_removed = {
        }  # subject related word removal to construct a question
        for thisTriple in list_triples:  ## loop thru dependency tree
            if thisTriple[0][0] == subject or thisTriple[0][
                    0] in dict_of_words_removed:
                if thisTriple[1] not in ['nsubj', 'csubj']:
                    if (thisTriple[2][0]).lower() not in [
                            'the', 'a', 'an'
                    ]:  # skipping determinants as they can be present in other places of the sentence as well
                        text = re.sub(
                            ' +', ' ', text.replace(thisTriple[2][0], '')).strip(
                            )  # removing subject related words
                        dict_of_words_removed[thisTriple[2][
                            0]] = 0  # adding the removed word so that other words that are connected to this can also be removed
        return (text)
def getNerSet(phrase):
    """Return the set of NER labels Stanford CoreNLP assigns to *phrase*.

    Each NER result is a (token, label) pair; only the labels are kept.
    """
    tagger = StanfordNLP()
    labels = set()
    for _token, label in tagger.ner(phrase):
        labels.add(label)
    return labels