def split_trainingsdata_into_sentences():
    """Build sentence-level training data for answer-sentence selection.

    For every (question, sentence, answer) combination in
    ``trainings_data.json``, writes one feature line to
    ``trainings_data_sentence`` in the format::

        <question> <sentence> <keyword count> <similarity> ::: <relevant>

    where ``relevant`` is True when the answer string occurs in the sentence.
    """
    # NOTE(review): nesting reconstructed from a flattened source — the
    # per-answer feature computation sits inside the answer loop because it
    # references `answer`; confirm against the original file.
    with open('trainings_data.json') as json_file:
        data = json.load(json_file)

    # `with` guarantees the output file is flushed and closed even on error
    # (original opened both files without ever closing them).
    with open('trainings_data_sentence', 'w') as f:
        for entry in data['data']:
            # naive sentence split — assumes '.' only terminates sentences
            sentences = entry['text'].split('.')
            for qas in entry['qas']:
                question = qas['question']
                qp_result = process_question(question, NLPToolkit())
                keywords = qp_result.question_model.get_keywords()
                for sentence in sentences:
                    relevant = False
                    for answer in qas['answers']:
                        if answer in sentence:
                            relevant = True
                        count_keywords = get_number_of_keywords(sentence, keywords)
                        similarity = get_similiarity(question, answer)
                        f.write(question + " " + sentence + " "
                                + str(count_keywords) + " " + str(similarity)
                                + " ::: " + str(relevant) + '\n')
def test_get_context_can_return_multiple_sentences(self):
    """The context around a best span may cover more than one sentence."""
    expected = (
        "This contrasts with expendable launch systems, "
        "where each launch vehicle is launched once and then discarded. "
        "No completely reusable orbital launch system has ever been created."
    )
    predictor = AnswerPredictor(NLPToolkit())
    actual = predictor._get_context_of_best_span(passage, 259, 270)
    self.assertEqual(actual, expected)
def preprocessing_pipeline(docs: Documents, qp_result: QPResult, nlp_toolkit: NLPToolkit):
    """Preprocess retrieved documents for answer extraction.

    Pipeline: split every document into sentences, keep only sentences
    that pass the answer-type passage filter, and return one joined
    string per document.
    """
    # All documents are kept for now; ranking is evaluated later via the
    # probability distribution available through softmax.
    processed = []
    for document in docs.docs:
        kept = []
        for candidate in nlp_toolkit.text_to_sentences(document.text):
            if filter_passages(candidate, qp_result.answer_type, nlp_toolkit):
                kept.append(candidate)
        processed.append(' '.join(kept))
    return processed
def process_question(question: str, nlp_toolkit: NLPToolkit) -> QPResult:
    """Classify the question's expected answer type and extract keywords.

    Loads the stored classifier for this question, predicts the
    answer-type label, and pulls headwords to serve as keywords.

    Returns a QPResult wrapping a QuestionModel (keywords + raw question)
    and the predicted AnswerType.
    """
    Logger.info('started')
    start = datetime.now()

    # question processing
    clf_name = get_clf_name(question)
    clf = get_clf_from_disk(clf_name)
    label = get_predicted_label(question, clf)
    keywords = nlp_toolkit.get_headwords(question)

    end = datetime.now()
    diff = end - start
    Logger.info('AnswerType: ' + str(AnswerType[label]))
    # Zero-pad microseconds before truncating to hundredths; the original
    # `str(diff.microseconds)[0:2]` mis-reported e.g. 5000 us (0.005 s)
    # as ".50".
    hundredths = str(diff.microseconds).zfill(6)[0:2]
    Logger.info('finished (' + str(diff.seconds) + '.' + hundredths + ' s)')
    Logger.small_seperator()
    return QPResult(QuestionModel(keywords, question), AnswerType[label])
import unittest

from utils.nlptoolkit import NLPToolkit

# Shared toolkit instance so (potentially expensive) model setup runs once.
nlp_toolkit = NLPToolkit()


class TestNLPToolkit(unittest.TestCase):
    """Smoke tests for the NLP toolkit wrapper."""

    def test_get_headwords(self):
        """get_headwords should run on a plain question string."""
        headwords = nlp_toolkit.get_headwords('What is question answering')
        print(headwords)


if __name__ == '__main__':
    unittest.main()
def text_contains_any_answer(text, answers):
    # Return True if any gold answer's 'text' occurs verbatim in `text`.
    for a in answers:
        if a['text'] in text:
            return True
    return False


# --- analysis script (chunk: `data`, `Logger`, `Document`, `Documents`,
# `NLPToolkit` are defined elsewhere in this file) ---
Logger.config('info')
Logger.info("Start analysis")
question_counter = 0
correct_answers_counter = 0
nlptoolkit = NLPToolkit()
# Iterate the SQuAD-style structure: dataset -> paragraphs -> qa sets.
for dataset in data['data']:
    title = dataset['title']
    Logger.info('Dataset: ' + title)
    # only the first 5 paragraphs per dataset are evaluated
    for paragraph in dataset['paragraphs'][:5]:
        context = paragraph['context']
        for question_answer_set in paragraph['qas']:
            question_counter += 1
            question = question_answer_set['question']
            correct_answers = question_answer_set['answers']
            Logger.info(question)
            doc = Document(title, context)
            docs = Documents()
            # NOTE(review): chunk appears truncated here — the loop body
            # presumably continues past this excerpt.
def test_returns_the_sentence_containing_the_answer(self):
    """The predicted context must be the sentence holding the answer."""
    predictor = AnswerPredictor(NLPToolkit())
    prediction = predictor.predict(passage, question)
    self.assertEqual(prediction['context'], correct_context)
def test_returns_the_expected_answer(self):
    """predict() must return the known correct answer for the fixture."""
    predictor = AnswerPredictor(NLPToolkit())
    prediction = predictor.predict(passage, question)
    self.assertEqual(prediction['answer'], correct_answer)
sys.path.append(DIR + '/../../src')

data = json.load(open('dev-v1.1.json'))


def text_contains_any_answer(text, answers):
    # Return True if any gold answer's 'text' occurs verbatim in `text`.
    for a in answers:
        if a['text'] in text:
            return True
    return False


Logger.config('error')
Logger.error("Start analysis")
nlp = NLPToolkit()
question_counter = 0
correct_answers_counter = 0
# NOTE(review): presumably counts answers by rank position 0-4 — confirm
# against the code that updates it (not visible in this excerpt).
correct_answer_dict = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
false_answer = 0
for dataset in data['data']:
    title = dataset['title']
    title_ext = title.replace('_', ' ')
    Logger.error('Dataset: ' + title)
    # only the first 5 paragraphs per dataset are evaluated
    for paragraph in dataset['paragraphs'][:5]:
        for question_answer_set in paragraph['qas']:
            Logger.error(
                '####################################################')
            # NOTE(review): chunk truncated — loop body continues past
            # this excerpt.
def test_returns_main_article(self):
    """Searching a person's name should rank their main article first."""
    query = "Arnold Schwarzenegger"
    model = QuestionModel(query.split(), query)
    docs = document_retrieval.receive_docs(model, NLPToolkit())
    self.assertEqual(docs.get_doc_with_highest_rank().title,
                     'Arnold Schwarzenegger')
# WordNet synset identifiers for the fine-grained question classes
# (one synset per class label used by the answer-type classifier).
FINE_CLASSES_SYNSETS = [
    'abbreviation.n.01', 'formula.n.01', 'animal.n.01', 'body.n.01',
    'color.n.01', 'creative.a.01', 'currency.n.01', 'event.n.01',
    'food.n.01', 'musical_instrument.n.01', 'speech.n.02', 'letter.n.02',
    'plant.n.02', 'merchandise.n.01', 'religion.n.01', 'sport.n.01',
    'substance.n.01', 'symbol.n.01', 'technique.n.01', 'term.n.01',
    'vehicle.n.01', 'word.n.01', 'definition.n.01', 'description.n.01',
    'manner.n.01', 'reason.n.02', 'group.n.01', 'person.n.01',
    'city.n.01', 'state.n.04', 'mountain.n.01', 'code.v.02',
    'count.n.01', 'date.n.01', 'distance.n.01', 'money.n.01',
    'rate.v.01', 'period.n.05', 'percentage.n.01', 'speed.n.01',
    'temperature.n.01', 'size.n.01', 'weight.n.01', 'disease.n.01',
    'entity.n.01', 'title.n.06', 'description.n.02', 'location.n.01',
    'state.n.01', 'numeral.n.01'
]
nlp = NLPToolkit()


def get_features(questions):
    # Enrich each question with extra tokens derived from its wh-word.
    # NOTE(review): definition appears truncated in this excerpt — later
    # wh-word branches and the return are not visible here.
    feature_enriched_questions = []
    for question in questions:
        doc = get_doc(question)
        wh_word = str(get_wh_word(doc))
        enriched_question = question
        if wh_word == "how":
            pass
        elif wh_word == "who":
            # append the head noun so "who" questions carry the subject
            enriched_question = enriched_question + " " + str(
                get_head_word_noun_phrase(doc))
        elif wh_word == "why":
            pass
# NOTE(review): orphaned tail of a function whose `def` lies before this
# excerpt — indentation of these two returns is a guess.
            return True
    return False


def is_correct_article(title, correct_title) -> bool:
    # Compare retrieved title with the gold title, normalising the gold
    # title's underscores to spaces first.
    # NOTE(review): could be simplified to
    # `return title == correct_title.replace("_", " ")`.
    if title == correct_title.replace("_", " "):
        return True
    else:
        return False


# --- document_retrieval analysis script (chunk: `data` and `Logger` are
# defined elsewhere in this file) ---
Logger.config('error')
Logger.info("Start document_retrieval analysis")
nlpToolkit = NLPToolkit()
question_counter = 0
correct_article_counter = 0
correct_firstArticle_counter = 0
correct_secondArticle_counter = 0
correct_thirdArticle_counter = 0
correct_answers_counter = 0
for dataset in data['data']:
    title = dataset['title']
    Logger.info('Dataset: ' + title)
    # only the first 5 paragraphs per dataset are evaluated
    for paragraph in dataset['paragraphs'][:5]:
        for question_answer_set in paragraph['qas']:
            question_counter += 1
            question = question_answer_set['question']
            # NOTE(review): chunk truncated — loop body continues past
            # this excerpt.