def __init__(self, dataFile):
    self.sNLP = StanfordNLP()
    self.punc = {'.', '?', '!', '\n'}
    # dataFile is the raw text itself (an earlier version read it from a path)
    self.textData = self.preProcessText(dataFile)
    self.sentence_list = []
    self.tokenizePara(self.textData)
def __init__(self):
    self.sNLP = StanfordNLP()
    self.dropType = {}
    self.typeNer = {}
    self.typePro = {}
    self.initQstType()
    self.candidateAnswer = []
    self.candidateSentence = []
    self.qgPipeline = QGPipeline()
    self.threshold = 90
def When_module(sent, sent_features):
    question = []
    structures = []
    sNLP = StanfordNLP()
    parse = sNLP.parse(sent)
    when_parseTraversal(sent, parse, question, structures)
    if len(structures) > 0:
        # keep the candidate time phrase with the smallest recorded value
        prev_min = float('Inf')
        whenPhrase = ""
        for t in structures:
            if t[1] < prev_min:
                whenPhrase = t[0]
                prev_min = t[1]
        # drop the time phrase from the sentence and build the question
        thisQ = sent.replace(whenPhrase, "")
        dep_tree = next(sNLP.dependency_parse(thisQ))
        dep_tree_list = list(dep_tree.triples())
        return construct_when(thisQ, dep_tree_list)
    for q in question:
        # fall back to the first candidate gathered during traversal
        dep_tree = next(sNLP.dependency_parse(q))
        dep_tree_list = list(dep_tree.triples())
        return construct_when(q, dep_tree_list)
    return None
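# Usage sketch for When_module (hedged): assumes a Stanford CoreNLP server is
# reachable through the StanfordNLP wrapper; the sentence is illustrative only.
#
#     q = When_module("The treaty was signed on 28 June 1919.", {})
#     # q should be a single "When ...?" string built from the sentence with
#     # its time phrase removed, or None if no time phrase was found.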
def Where_Which_module(sent, sent_features):
    question = []
    simple_ques = []
    sNLP = StanfordNLP()
    parse = sNLP.parse(sent)
    where_which_inFirstPP(sent, parse, simple_ques)
    for inFirstPP, thisSent, nerSet, thisPP in simple_ques:
        dep_tree = next(sNLP.dependency_parse(thisSent))
        dep_tree_list = list(dep_tree.triples())
        if inFirstPP:
            # the sentence opens with a locative PP: build one question from it
            case = thisPP.split(" ")[0]
            if "COUNTRY" in nerSet:
                locType = "country"
            elif "LOCATION" in nerSet:
                locType = "location"
            elif "CITY" in nerSet:
                locType = "city"
            else:
                locType = "place"
            return [construct_where_which(thisSent, dep_tree_list, case, locType)]
        else:
            where_which_parseTraversal(thisSent, dep_tree_list,
                                       sNLP.ner(thisSent), question)
    return question
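# Usage sketch for Where_Which_module (hedged, illustrative sentence):
#
#     qs = Where_Which_module("The summit was held in Geneva.", {})
#     # qs should be a list of question strings; the NER tag on the location
#     # (CITY/COUNTRY/LOCATION) chooses the noun used in a "Which ...?" form.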
def categorizeQs(sents, sent_to_Q_dict):
    sent_features = {}
    sNLP = StanfordNLP()
    normal_ners = sNLP.ner(sents)
    normal_ner_set = {t[1] for t in normal_ners}

    # binary (yes/no) questions: only attempted if an auxiliary verb appears
    if any(w in sents for w in aux_words):
        thisQues = Binary_QG.bin_question(sents)
        for p_b in thisQues:
            if p_b is not None:
                sent_to_Q_dict["Binary"].append((sents, p_b))

    # "why" questions need a causal keyword from why_keys
    if any(w in sents for w in why_keys):
        thisQues = Why_QG.why_q(sents)
        if thisQues is not None:
            sent_to_Q_dict["Why"].append((sents, thisQues))

    # "what"/"who" questions are attempted for every sentence
    thisQues = What_Who_QG.What_Who_module(sents)
    for p_ in thisQues:
        if p_ is not None:
            sent_to_Q_dict["What_Who"].append((sents, p_))

    # location-type NER tags gate the "where"/"which" generator
    if {'LOCATION', 'COUNTRY', 'CITY'} & normal_ner_set:
        thisQ = Where_Which_QG.Where_Which_module(sents, sent_features)
        for p in thisQ:
            if p is not None:
                sent_to_Q_dict["Where_Which"].append((sents, p))

    # date/time NER tags gate the "when" generator
    if {'DATE', 'TIME'} & normal_ner_set:
        thisQ = When_QG.When_module(sents, sent_features)
        if thisQ is not None:
            sent_to_Q_dict["When"].append((sents, thisQ))
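# Usage sketch for categorizeQs (hedged): the caller supplies a dict with one
# empty list per category; the sentence is illustrative only.
#
#     sent_to_Q = {k: [] for k in
#                  ["Binary", "Why", "What_Who", "Where_Which", "When"]}
#     categorizeQs("Paris was liberated in 1944.", sent_to_Q)
#     # the CITY tag routes the sentence to Where_Which, the DATE tag to When,
#     # and the auxiliary "was" also triggers the Binary generator.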
def why_q(sents):
    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    sents = What_Who_QG.remove_modifiers(parse)
    tokenized_sentences = [word_tokenize(sents)]
    question = ""
    for sent in tokenized_sentences:
        pos_tags = nltk.pos_tag(sent)
        # lowercase the first token unless it is a proper noun
        if pos_tags[0][1] not in ('NNP', 'NNPS'):
            pos_tags[0] = (pos_tags[0][0].lower(), pos_tags[0][1])
        q_list = copy.deepcopy(pos_tags)
        for i in range(len(pos_tags)):
            if pos_tags[i][1] == 'VBD':
                # past tense: lemmatize the main verb and front "Why did"
                q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBD')
                q_list.insert(0, ('Why did', 0))
                break
            elif pos_tags[i][1] == 'VBZ':
                if pos_tags[i][0] in aux_words:
                    # auxiliary: move it to the front, after "Why"
                    q_list.insert(0, q_list.pop(i))
                    q_list.insert(0, ("Why", 0))
                else:
                    q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), "VBZ")
                    if q_list[i][0] == "do":
                        q_list.pop(i)
                    q_list.insert(0, ("Why does", 0))
                break
            elif pos_tags[i][1] == 'VBP':
                q_list.insert(0, q_list.pop(i))
                q_list.insert(0, ("Why", 0))
                break
        # recapitalize the new first token
        replace_string = q_list[0][0][:1].upper() + q_list[0][0][1:]
        q_list[0] = (replace_string, 0)
        question = ' '.join([i[0] for i in q_list])
        # cut everything from the causal keyword onward, then close with "?"
        ind = -1
        for k in why_keys:
            if question.find(k) != -1:
                ind = question.find(k)
                break
        if ind != -1:
            question = question[:ind - 1]
        question = question + "?"
    return question if question != "" else None
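# Usage sketch for why_q (hedged, illustrative sentence):
#
#     q = why_q("The team celebrated because they won the game.")
#     # q should read roughly "Why did the team celebrate?": the verb is
#     # lemmatized, "Why did" is fronted, and the clause from the causal
#     # keyword onward is cut before the "?" is appended.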
def __init__(self):
    self.sNLP = StanfordNLP()
    self.sent_simpl = Simplification()
    self.QG = QuestionGeneration()
def __init__(self):
    self.sNLP = StanfordNLP()
def __init__(self):
    self.sNLP = StanfordNLP()
    self.beVerbs = {"am", "is", "are", "was", "were"}
def getNerSet(phrase):
    sNLP = StanfordNLP()
    return {t[1] for t in sNLP.ner(phrase)}
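# Usage sketch for getNerSet (hedged): collapses CoreNLP's token-level tags
# into a set, e.g. getNerSet("in Boston") might return {"O", "CITY"}.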
def bin_question(sents):
    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    sents = What_Who_QG.remove_modifiers(parse)
    tokenized_sentences = [word_tokenize(sents)]
    aux_words = {'are', 'was', 'were', 'is', 'have', 'has'}
    question_set = []
    for sent in tokenized_sentences:
        pos_tags = nltk.pos_tag(sent)
        # lowercase the first token unless it is a proper noun
        if pos_tags[0][1] not in ('NNP', 'NNPS'):
            pos_tags[0] = (pos_tags[0][0].lower(), pos_tags[0][1])
        q_list = copy.deepcopy(pos_tags)
        for i in range(len(pos_tags)):
            if pos_tags[i][0] in aux_words:
                # auxiliary verb: front it unchanged
                q_list.insert(0, q_list.pop(i))
                break
            elif pos_tags[i][1] == 'VBD':
                q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBD')
                q_list.insert(0, ('Did', 0))
                break
            elif pos_tags[i][1] == 'VBZ':
                q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBZ')
                q_list.insert(0, ('Does', 0))
                break
            elif pos_tags[i][1] == 'VBP':
                q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBP')
                q_list.insert(0, ('Do', 0))
                break
        if q_list[0][0].lower() in [
                'are', 'was', 'were', 'is', 'have', 'has', 'did', 'do', 'does'
        ]:
            # capitalize the fronted word, strip the trailing " .", append "?"
            replace_string = q_list[0][0][:1].upper() + q_list[0][0][1:]
            q_list[0] = (replace_string, 0)
            question = ' '.join([i[0] for i in q_list])
            question = question[:-2] + "?"
            question_set.append(question)
    return question_set
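# Usage sketch for bin_question (hedged, illustrative sentence):
#
#     qs = bin_question("The committee approved the plan.")
#     # expect roughly ["Did the committee approve the plan?"]; if a be/have
#     # auxiliary is present, it is fronted itself instead of Do/Does/Did.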
sNLP = StanfordNLP()


def getDecapitalized(sentence):
    # Lowercase the sentence-initial token unless NER marks it as a named
    # entity (person, place, organization, ...).
    tokens = sNLP.word_tokenize(sentence)
    first = tokens[0]
    thisNER = sNLP.ner(sentence)
    if thisNER[0][1] not in [
            'PERSON', 'LOCATION', 'ORGANIZATION', 'CITY',
            'NATIONALITY', 'COUNTRY', 'TIME'
    ]:
        first = first.lower()
    return first + " " + " ".join(tokens[1:])
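# Usage sketch for getDecapitalized (hedged):
#
#     getDecapitalized("Scientists discovered the wreck.")  # -> "scientists ..."
#     getDecapitalized("Einstein published the paper.")     # first token kept;
#                                                           # NER tags it PERSON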