def get_key_words(text):
    """Extract key words/phrases from *text*.

    Combines three sources into one de-duplicated collection:
    1. individual words with stop words removed,
    2. multi-word noun (NN) chunks, joined into phrase strings,
    3. multi-word noun-phrase (NP) chunks, joined into phrase strings.

    :param text: input string to analyse
    :return: list of unique key words and phrases (order unspecified,
        since it is derived from a set)
    """
    analyser = TextAnalyser()
    key_words = set(analyser.get_words_without_stopwords(text))
    # Only keep chunks of more than one token; single tokens are already
    # covered by the stop-word-filtered word set above.
    nn_phrases = [' '.join(p) for p in analyser.get_nn_chunks(text) if len(p) > 1]
    key_words = key_words.union(nn_phrases)
    np_phrases = [' '.join(p) for p in analyser.get_np_chunks(text) if len(p) > 1]
    key_words = key_words.union(np_phrases)
    return list(key_words)
class SimilarityPipeline:
    """Answers test questions by nearest-neighbour question matching.

    For each test question, find the most similar training question, look up
    that training question's correct answer, and pick the test answer choice
    most similar to it.
    """

    def __init__(self, train_data, test_data, analyser=None):
        """
        :param train_data: object exposing ``x`` (questions/choices DataFrame)
            and ``y`` (answers DataFrame)
        :param test_data: object exposing ``x`` and optionally ``y``
        :param analyser: sentence-similarity helper; defaults to TextAnalyser()
        """
        self._train_data = train_data
        self._test_data = test_data
        self._analyser = analyser
        # One row per test question, filled in by _calc().
        self.predictions = pd.DataFrame("-", index=self._test_data.x.index.values,
                                        columns=['answer', "description", "train_question",
                                                 "train_id", "train_answer"])
        if analyser is None:
            self._analyser = TextAnalyser()

    def run_pipeline(self):
        """Compute a prediction for every test question."""
        self._calc()

    def print_summary(self):
        pass

    def score(self):
        pass

    def write_to_disk(self, directory):
        """Write predictions, a submission file and the joined test data to CSVs."""
        self.predictions.to_csv(os.path.join(directory, "predictions.csv"))
        self.predictions["answer"].to_csv(os.path.join(directory, "submission_predictions.csv"),
                                          header=["correctAnswer"])
        test_data = self._test_data.x.join(self._test_data.y) if self._test_data.y is not None else self._test_data.x
        test_data.join(self.predictions, rsuffix="pred.").to_csv(
            os.path.join(directory, "test_data_with_predictions.csv"))

    def _calc(self):
        """Find, for each test question, the single most similar training question."""
        question_index = self._train_data.x.columns.get_loc("question")
        # BUG FIX: this index is applied to *test* rows below, but was
        # previously looked up in the train frame's columns.
        test_q_index = self._test_data.x.columns.get_loc("question")
        for test_row in self._test_data.x.itertuples():
            # itertuples() puts the frame index at position 0, hence the +1 offsets.
            test_sentence = test_row[test_q_index + 1]
            train_tuples = self._train_data.x.itertuples()
            top_similar_train_row = self._analyser.get_top_n_similar_sentences(
                test_sentence, train_tuples, 1, similarity_threshold=0,
                sentence_extractor=lambda x: x[question_index + 1])
            if len(top_similar_train_row) > 0:
                self._calculate_correct_answer(test_row, top_similar_train_row[0])

    def _calculate_correct_answer(self, test_row, top_similar_train_row):
        """Pick the test choice (A-D) most similar to the matched train answer."""
        correct_answer_column_name = self._train_data.y.loc[top_similar_train_row[0], "answer"]
        correct_answer_column_index = self._train_data.x.columns.get_loc(correct_answer_column_name) + 1
        # test_row[2:6] assumes the tuple layout (index, question, A, B, C, D)
        # — TODO confirm against the data loader.
        top_matching_answer = self._analyser.get_top_n_similar_sentences(
            top_similar_train_row[correct_answer_column_index], test_row[2:6], 1, 0)[0]
        self.predictions.loc[test_row[0]].description = top_matching_answer
        self.predictions.loc[test_row[0]].answer = ["A", "B", "C", "D"][test_row[2:6].index(top_matching_answer)]
        self.predictions.loc[test_row[0]].train_question = top_similar_train_row[
            self._train_data.x.columns.get_loc("question") + 1]
        self.predictions.loc[test_row[0]].train_id = top_similar_train_row[0]
        self.predictions.loc[test_row[0]].train_answer = top_similar_train_row[correct_answer_column_index]
def __init__(self, train_data, test_data, analyser=None):
    """Keep references to the train/test sets and create an empty predictions frame.

    :param train_data: training set (exposes ``x`` and ``y``)
    :param test_data: test set (exposes ``x``, optionally ``y``)
    :param analyser: similarity helper; a default TextAnalyser is built when omitted
    """
    self._train_data = train_data
    self._test_data = test_data
    # Fall back to the standard analyser when the caller supplied none.
    self._analyser = TextAnalyser() if analyser is None else analyser
    prediction_columns = ['answer', "description", "train_question", "train_id", "train_answer"]
    # One placeholder row per test question.
    self.predictions = pd.DataFrame("-",
                                    index=self._test_data.x.index.values,
                                    columns=prediction_columns)
def __init__(self, data, text_analyser=None, logger=None):
    """
    :param data: object exposing ``x`` (questions/choices DataFrame) and
        ``y`` (answers DataFrame, or None when unlabelled)
    :param text_analyser: tokeniser helper; defaults to TextAnalyser()
    :param logger: optional logger; defaults to a module-level logger
    """
    self._data = data
    self.logger = logger or logging.getLogger(__name__)
    # One row per question. NOTE(review): "D_key" casing differs from
    # "A_Key"/"B_Key"/"C_Key"; kept as-is since consumers may rely on the
    # exact column name.
    self.predictions = pd.DataFrame("-", index=self._data.x.index.values,
                                    columns=['answer', "question_key", "A_Key", "B_Key", "C_Key",
                                             "D_key", "score", "A_score", "B_score", "C_score",
                                             "D_score", "q_word_count"])
    # BUG FIX: a caller-supplied text_analyser was previously discarded
    # (only the None branch assigned self._analyser, leaving it unset
    # whenever an analyser was actually passed in).
    if text_analyser is None:
        self._analyser = TextAnalyser()
    else:
        self._analyser = text_analyser
class IntegrationTestAristoAnalyser(unittest.TestCase):
    """Integration smoke tests for TextAnalyser.

    Each test only checks that the call completes without raising; the
    content of the results is not asserted.
    """

    def setUp(self):
        # System under test plus a fixed sample sentence used by every test.
        self.sut = TextAnalyser()
        self.data = "When athletes begin to exercise, their heart rates and respiration rates increase. "

    def test_should_tokenise_words(self):
        # Tokenisation should not raise on a plain English sentence.
        self.sut.get_words(self.data)

    def test_should_get_named_entities(self):
        self.sut.aristo_get_named_entities(self.data)

    def test_should_write_most_common_words_to_file(self):
        # NOTE(review): writes a TSV three levels above the test directory —
        # side-effectful and path-dependent; confirm this is intentional.
        self.sut.aristo_write_most_common_words_to_file(self.data, 10 , "../../../temp.tsv")

    def test_should_write_most_common_nouns_to_file(self):
        self.sut.aristo_write_most_common_nouns_to_file(self.data, 10 , "../../../test_nouns.tsv")
class SolrWikipediaPipeline:
    """ Default scoring pipeline.

    Answers each multiple-choice question by querying a Solr index of
    Wikipedia: the question keywords select the top matching pages, each
    answer choice is then searched within just those pages, and the choice
    with the highest average relevance score wins.
    """

    def __init__(self, data, text_analyser=None, logger=None):
        """
        :param data: object exposing ``x`` (questions/choices DataFrame) and
            ``y`` (answers DataFrame, or None when unlabelled)
        :param text_analyser: tokeniser helper; defaults to TextAnalyser()
        :param logger: optional logger; defaults to a module-level logger
        """
        self._data = data
        self.logger = logger or logging.getLogger(__name__)
        # One row per question, filled in by _calc(). NOTE(review): "D_key"
        # casing is inconsistent with "A_Key" etc.; kept for compatibility.
        self.predictions = pd.DataFrame("-", index=self._data.x.index.values,
                                        columns=['answer', "question_key", "A_Key", "B_Key", "C_Key",
                                                 "D_key", "score", "A_score", "B_score", "C_score",
                                                 "D_score", "q_word_count"])
        # BUG FIX: a caller-supplied text_analyser was previously discarded,
        # leaving self._analyser unset whenever one was passed in.
        if text_analyser is None:
            self._analyser = TextAnalyser()
        else:
            self._analyser = text_analyser

    def run_pipeline(self):
        """Score every question and populate self.predictions."""
        self._calc()

    def print_summary(self):
        pass

    def score(self):
        """Return the fraction of correctly answered questions.

        :return: accuracy in [0, 1], or -1 when no gold answers are available
        """
        if self._data.y is None:
            return -1
        df = self._data.x.join(self._data.y)
        df = df.join(self.predictions, rsuffix="pred")
        correct_answers = df[df.answer == df.answerpred]
        score = len(correct_answers.index) / len(df.index)
        self.logger.info("Score : " + str(score))
        return score

    def write_to_disk(self, directory):
        """Write predictions, a submission file and the joined data to CSVs in *directory*."""
        self.predictions.to_csv(os.path.join(directory, "predictions.csv"))
        self.predictions["answer"].to_csv(os.path.join(directory, "submission_predictions.csv"),
                                          header=["correctAnswer"])
        test_data = self._data.x.join(self._data.y) if self._data.y is not None else self._data.x
        test_data.join(self.predictions, rsuffix="pred").to_csv(
            os.path.join(directory, "data_with_predictions.csv"))

    def _calc(self):
        """Score each answer choice per question; keep the best-scoring one."""
        self.logger.info("running _calc")
        q_index = self._data.x.columns.get_loc("question")
        id_index = 0  # itertuples() puts the frame index at position 0
        for row in self._data.x.itertuples():
            question = row[q_index + 1]
            max_score = -1
            correct_answer = "-"
            self.logger.info("running question id {}".format(row[0]))
            for choice in ["A", "B", "C", "D"]:
                choice_index = self._data.x.columns.get_loc(choice)
                choice_text = row[choice_index + 1]
                score = self._get_score_answer_search_within_top_question_search_pages(question, choice_text)
                if score > max_score:
                    max_score = score
                    correct_answer = choice
            self.predictions.loc[row[id_index]].answer = correct_answer
            self.predictions.loc[row[id_index]].score = max_score
            self.predictions.loc[row[id_index]].q_word_count = \
                len(self._analyser.get_words_without_stopwords(question))

    def _get_score_answer_search_within_top_question_search_pages(self, question, answer_choice, url=None):
        """ Calculates the score of the answer as follows
        1. Obtain key words from the question & the answer by removing stop words
        2. Obtain top 3 pages matching the search results for the questions
        3. Search solr for the answer key words, but restrict the search to the top 3 pages from the question search
        4. Return the average score for the answer search
        :param url: solr url
        :param question: The question string
        :param answer_choice: The answer string
        :return: the score of the answer
        """
        self.logger.info("running _get_score_answer_search_within_top_question_search_pages")
        self.logger.info("------------")
        # Get keywords from question and answer, dropping generic exam words.
        exclude_words = self._get_science_stop_words()
        q_keywords = [word for word in self._analyser.get_words_without_stopwords(question)
                      if word.lower() not in exclude_words]
        q_keywords = self._remove_duplicates_preserve_order(q_keywords)
        # Answer keywords must not repeat question keywords.
        a_keywords = [word for word in self._analyser.get_words_without_stopwords(answer_choice)
                      if word.lower() not in exclude_words and word.lower() not in q_keywords]
        q_query = ' '.join(q_keywords)
        a_query = ' '.join(a_keywords)
        if url is None:
            url = 'http://localhost:8983/solr/wikipedia/select?fl=*%2Cscore&wt=json'
        self.logger.info("question: " + question)
        self.logger.info(answer_choice)
        # For very short questions, boost the question keywords and append the
        # answer keywords. NOTE(review): search_query is built and logged but
        # the request below still sends q_query — looks unintended; behaviour
        # kept as-is pending confirmation.
        is_short_q = (len(q_keywords) < 3)
        search_query = ' '.join(["{}^1000".format(qw) for qw in q_keywords]) + " " + a_query if is_short_q else q_query
        self.logger.info("Search query {}".format(search_query))
        # Submit the question keywords to solr to obtain the top pages.
        rsp = self._submit_search_request_by_query(q_query, url, limit=5)
        top_page_ids = "(" + ' OR '.join([d['id'] for d in rsp['response']['docs']]) + ")"
        top_page_titles = "(" + ' \n\t '.join([d['title'] for d in rsp['response']['docs']]) + ")"
        self.logger.info(top_page_ids)
        self.logger.info("Top page titles \n\t {}".format(top_page_titles))
        # Restrict the answer search to the pages found for the question.
        fq = "id:" + top_page_ids
        self.logger.info("Url used to search answer {}".format(url))
        rsp = self._submit_search_request_by_query(a_query, url, limit=3, fq=fq)
        # Return the average score of the solr results for the answer.
        matching_docs = rsp['response']['docs']
        score = sum([d['score'] for d in matching_docs]) / len(matching_docs) if (len(matching_docs) > 0) else 0
        self.logger.info(score)
        return score

    def _extract_snippets_from_solr_json_response(self, rsp):
        """Collect the highlight snippets for every doc in a Solr response."""
        snippets = []
        for d in rsp['response']['docs']:
            snippets = snippets + rsp['highlighting'][d['id']]["text"]
        return snippets

    def _submit_search_request(self, keywords, url, limit=3):
        """Join *keywords* (with ":" escaped) into one query and submit it."""
        keywords = ' '.join([word.replace(":", " ") for word in keywords])
        return self._submit_search_request_by_query(keywords, url, limit)

    def _submit_search_request_by_query(self, query, url, limit=3, fq=""):
        """POST a JSON search request to Solr and return the parsed response.

        :param query: query string
        :param url: Solr select endpoint
        :param limit: maximum number of documents to return
        :param fq: optional filter query restricting the search
        :raises RuntimeError: on any non-200 HTTP response
        """
        data = {'limit': limit, 'query': query, 'filter': fq}
        headers = {'Content-Type': 'application/json'}
        self.logger.info("Query {}, limit {}, fq {}... truncated".format(query, limit, str(fq)[:50]))
        self.logger.debug(data)
        r = requests.post(url, simplejson.dumps(data), headers=headers)
        if r.status_code != 200:
            raise RuntimeError(r.text)
        rsp = simplejson.loads(r.text)
        return rsp

    def _get_science_stop_words(self):
        """Domain-specific stop words to drop from exam-question queries."""
        # BUG FIX: a missing comma between "much" and "most" previously fused
        # them (implicit concatenation) into "muchmost", so neither word was
        # actually filtered out.
        return ["following", "follow", "using", "true", "explanation", "explain", "explained", "explains",
                "example", "past", "would", "examples", "way", "called", "describes", "describe", "used",
                "use", "must", "several", "many", "likely", "includes", "include", "much", "most", "also",
                "shows", "show", "best", "illustrate", "illustrated", "illustrates", "statement",
                "statements", "tell", "us", "done", "certain", "call", "good", "another", "other",
                "correct", "correctly", "suggests", "suggest", "suggestion", "greatest", "great",
                "believed", "consider", "considered"]

    def _remove_duplicates_preserve_order(self, seq):
        """Return *seq* without duplicates, keeping first occurrences in order."""
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]
def setUp(self):
    # Create the system under test and a fixed sample sentence fixture
    # shared by the tests in this case.
    self.sut = TextAnalyser()
    self.data = "When athletes begin to exercise, their heart rates and respiration rates increase. "