def computeResult(self):
    """Evaluate every configured evaluation question against the FAQ set and
    record, per question, the 1-based rank of the correct answer within the
    top-10 results (0 when the correct answer is not in the top 10).

    Side effects: clears/refills TOPTEN_ANS per question, prints a report
    line per question, and updates self.rdict.
    """
    for qns in faq_config.getEvaluationQns():
        TOPTEN_ANS.clear()
        user_qa = [base_objects.QAPair(qns.question, "")]
        if self.aType == 1:
            # Bag-of-words algorithm.
            # NOTE(review): here BOWAlgorithm receives the QAPair list,
            # while run_userq passes the raw question string — confirm
            # which argument shape BOWAlgorithm actually expects.
            bow_features = nltk_objects.NLTKFeatureExtraction(user_qa)
            scores = BOWAlgorithm(user_qa, bow_features, self.fext)._compute()
        else:
            # NLP-pipeline algorithm.
            nlp_features = [b.TextFeatureExtraction(qns.question, qns)]
            eval_state = model.State(nlp_features, self.fext,
                                     model.final_weights, None)
            scores = eval_state.get_final_scores(model.final_weights)[0]
        self.get_topNResults(scores, 10)
        # Rank of the correct answer within the top ten, -1 when absent.
        try:
            hit_index = TOPTEN_ANS.index(qns.answer)
        except ValueError:
            hit_index = -1
        print("Question is: ", qns.question)
        print("Correct answer at index: ", hit_index)
        print("--------------------------------------------")
        self.rdict.update({qns.question: hit_index + 1})
def main():
    """Entry point for the FAQ engine.

    Optionally trains the NLP model (and reports per-question training
    scores), runs MRR evaluation for both the BOW and NLP algorithms, then
    drops into an interactive loop answering user questions until the user
    types 'quit'.
    """
    print("****** Hummingbird FAQ engine powered by NLTK *********")
    faqs = faq_config.getFAQs()

    ''' TRAINING Code '''
    if do_training:
        state = model.train_model(faqs)
        model.final_weights = state.weights
        if report_training:
            all_scores = state.get_scores(state.weights)
            for ix, q_score_set in enumerate(all_scores):
                # Sort (score, question-number) pairs best-score-first.
                dict_scores = sorted(
                    [(ascore, qnum) for qnum, ascore in q_score_set.items()],
                    reverse=True)
                print(state.best_choices[ix])
                for pair in dict_scores:
                    print("%2d: %f" % (pair[1], pair[0]))
                print()

    if do_main:
        faq_bow_feat = nltk_objects.NLTKFeatureExtraction(faqs)
        faq_nlp_feat = model.get_faq_features(faqs)
        run_mrr(faq_bow_feat, CONFIG_ALGO_BOW)
        space_out()
        run_mrr(faq_nlp_feat, CONFIG_ALGO_NLP)
        print(
            "You can enter question multiple times. Enter quit or Ctrl+c to quit"
        )
        while True:  # idiomatic spelling of the original `while 1`
            space_out()
            user_q = input("Enter your question or 'quit' to Exit : ")
            # input() never returns None, so the old `== None` test was dead;
            # `not user_q` still rejects the empty string.
            if not user_q:
                # BUGFIX: the original had `exit(1)` on the line after this
                # raise — unreachable dead code, removed.
                raise ValueError("Invalid question given. Exiting")
            elif user_q == "quit":
                print("Thank you for trying out our FAQ Engine..Exiting")
                exit(1)
            user_qa = [base_objects.QAPair(user_q, "")]
            space_out()
            run_userq(user_qa, faq_bow_feat, CONFIG_ALGO_BOW)
            space_out()
            run_userq(user_qa, faq_nlp_feat, CONFIG_ALGO_NLP)
def run_userq(user_qa, faq_feat, algoType):
    """Score one user question against the FAQ features with the selected
    algorithm (BOW or NLP pipeline) and print the ranked results.
    """
    #FIXME: the question is wrapped in a list because nltk_objects operates
    # on lists. Alt approach: only call __tokenize(), after moving the
    # stopwords to a class variable.
    user_q = user_qa[0].question
    if algoType == CONFIG_ALGO_BOW:
        # Bag-of-words path.
        bow_features = nltk_objects.NLTKFeatureExtraction(user_qa)
        resultDict = BOWAlgorithm(user_q, bow_features, faq_feat)._compute()
    else:
        # NLP-pipeline path.
        ''' Testing code '''
        nlp_features = [b.TextFeatureExtraction(user_q, user_qa)]
        scorer = model.State(nlp_features, faq_feat, model.final_weights, None)
        resultDict = scorer.get_final_scores(model.final_weights)[0]
    print_results(user_q, resultDict, algoType)
#TODO: not sure if i need to remove stopwords before lemmatizing (ok, tokenizing does that)
# but i'm not sure if tokenizing should do that........
#TODO: some words (like In) may need to be lowercased
#TODO: maybe we should leave stopwords. like "to" should be there for verbs i feel...
#TODO: words like "United States" are being tagged with synsets separately
#TODO: need to add in parts of speech. look at question 50. "build" should not be a noun

sub_folder = 'data/synsets'
faqs = faq_config.getFAQs()
feature_extractor = no.NLTKFeatureExtraction(faqs)


def save_synsets(lemmas, filename):
    """Write one 'lemma synset-name' line to *filename* for each lemma whose
    synset can be resolved via lesk; lemmas with no synset are skipped.
    No trailing newline is written after the last entry.
    """
    entries = []
    for lemma in lemmas:
        lemma_synset = lesk.get_lemma_synset(lemma, lemmas)
        if lemma_synset is not None:
            entries.append("%s %s" % (lemma, lemma_synset.name()))
    with open(filename, "w+") as outfile:
        outfile.write('\n'.join(entries))
import nltk_objects import faq_config faqs = faq_config.getFAQs() feature_extractor = nltk_objects.NLTKFeatureExtraction(faqs) for qatoken in feature_extractor.tokens: print(qatoken) for qatoken in feature_extractor.sentence_tokens: print(qatoken) for qabow in feature_extractor.bow: print(qabow) for qalemma in feature_extractor.lemmas: print(qalemma) for qastem in feature_extractor.stems: print(qastem) for postag in feature_extractor.pos_tags: print(postag) for graphs in feature_extractor.dependency_graphs: print(graphs) for syns in feature_extractor.synsets: print(syns) ''' Test cases: