Example #1
# TextAnalyser is assumed to be provided by the surrounding project
def get_key_words(text):
    """Extract key words: single non-stopword tokens plus multi-word noun (NN) and noun-phrase (NP) chunks."""
    analyser = TextAnalyser()
    key_words = set(analyser.get_words_without_stopwords(text))
    # Treat multi-word chunks as single key phrases
    nn_phrases = [' '.join(p) for p in analyser.get_nn_chunks(text) if len(p) > 1]
    key_words = key_words.union(nn_phrases)
    np_phrases = [' '.join(p) for p in analyser.get_np_chunks(text) if len(p) > 1]
    key_words = key_words.union(np_phrases)
    return list(key_words)
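A minimal usage sketch, assuming TextAnalyser (with get_words_without_stopwords, get_nn_chunks and get_np_chunks) is importable from the surrounding project; the sample sentence is the one used in the integration test in Example #3:

text = "When athletes begin to exercise, their heart rates and respiration rates increase."
print(get_key_words(text))
# e.g. ['athletes', 'exercise', 'heart rates', 'respiration rates', ...]
# order is unspecified because the words pass through a set before being returned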
Example #2
import os

import pandas as pd

# TextAnalyser is assumed to be provided by the surrounding project


class SimilarityPipeline:
    def __init__(self, train_data, test_data, analyser=None):
        self._train_data = train_data
        self._test_data = test_data
        # Fall back to a default TextAnalyser when none is supplied
        self._analyser = analyser if analyser is not None else TextAnalyser()
        self.predictions = pd.DataFrame("-", index=self._test_data.x.index.values,
                                        columns=["answer", "description", "train_question",
                                                 "train_id", "train_answer"])

    def run_pipeline(self):
        self._calc()

    def print_summary(self):
        pass

    def score(self):
        pass


    def write_to_disk(self, directory):
        self.predictions.to_csv(os.path.join(directory, "predictions.csv"))
        self.predictions["answer"].to_csv(os.path.join(directory, "submission_predictions.csv"),
                                          header=["correctAnswer"])
        test_data = self._test_data.x.join(self._test_data.y) if self._test_data.y is not None else self._test_data.x
        test_data.join(self.predictions, rsuffix="pred").to_csv(os.path.join(directory, "test_data_with_predictions.csv"))

    def _calc(self):
        question_index = self._train_data.x.columns.get_loc("question")
        # Bug fix: the test question index must come from the test frame, not the train frame
        test_q_index = self._test_data.x.columns.get_loc("question")

        for test_row in self._test_data.x.itertuples():
            test_sentence = test_row[test_q_index + 1]
            train_tuples = self._train_data.x.itertuples()
            top_similar_train_row = \
                self._analyser.get_top_n_similar_sentences(test_sentence, train_tuples, 1,
                                                           similarity_threshold=0,
                                                           sentence_extractor=lambda x: x[question_index + 1])
            if len(top_similar_train_row) > 0:
                self._calculate_correct_answer(test_row, top_similar_train_row[0])


    def _calculate_correct_answer(self, test_row, top_similar_train_row):
        correct_answer_column_name = self._train_data.y.loc[top_similar_train_row[0], "answer"]
        correct_answer_column_index = self._train_data.x.columns.get_loc(correct_answer_column_name) + 1
        # test_row[2:6] holds the four answer-choice texts (columns A-D)
        top_matching_answer = \
            self._analyser.get_top_n_similar_sentences(top_similar_train_row[correct_answer_column_index],
                                                       test_row[2:6], 1, 0)[0]
        # Use .loc[row, column] rather than chained attribute assignment,
        # which can silently write to a copy
        row_id = test_row[0]
        self.predictions.loc[row_id, "description"] = top_matching_answer
        self.predictions.loc[row_id, "answer"] = ["A", "B", "C", "D"][test_row[2:6].index(top_matching_answer)]
        self.predictions.loc[row_id, "train_question"] = \
            top_similar_train_row[self._train_data.x.columns.get_loc("question") + 1]
        self.predictions.loc[row_id, "train_id"] = top_similar_train_row[0]
        self.predictions.loc[row_id, "train_answer"] = top_similar_train_row[correct_answer_column_index]
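A hedged usage sketch: train_data and test_data are assumed to be project objects exposing .x (question and answer-choice columns) and .y (gold answers) DataFrames, inferred from how the class uses them:

# Hypothetical wiring
pipeline = SimilarityPipeline(train_data, test_data)
pipeline.run_pipeline()           # pairs each test question with its most similar train question
pipeline.write_to_disk("output")  # predictions.csv, submission_predictions.csv, test_data_with_predictions.csv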
Example #3
import unittest

# TextAnalyser is provided by the surrounding project


class IntegrationTestAristoAnalyser(unittest.TestCase):
    def setUp(self):
        self.sut = TextAnalyser()
        self.data = "When athletes begin to exercise, their heart rates and respiration rates increase. "

    def test_should_tokenise_words(self):
        self.sut.get_words(self.data)

    def test_should_get_named_entities(self):
        self.sut.aristo_get_named_entities(self.data)

    def test_should_write_most_common_words_to_file(self):
        self.sut.aristo_write_most_common_words_to_file(self.data, 10, "../../../temp.tsv")

    def test_should_write_most_common_nouns_to_file(self):
        self.sut.aristo_write_most_common_nouns_to_file(self.data, 10, "../../../test_nouns.tsv")
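To run these integration checks directly (a standard unittest entry point; the module layout is an assumption):

if __name__ == "__main__":
    unittest.main()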
import logging
import os

import pandas as pd
import requests
import simplejson


class SolrWikipediaPipeline:
    """
    Default scoring pipeline
    """

    def __init__(self, data, text_analyser=None, logger=None):
        self._data = data
        self.logger = logger or logging.getLogger(__name__)

        self.predictions = pd.DataFrame("-", index=self._data.x.index.values,
                                        columns=["answer", "question_key", "A_Key", "B_Key", "C_Key", "D_Key",
                                                 "score", "A_score", "B_score", "C_score", "D_score",
                                                 "q_word_count"])
        # Bug fix: the original only assigned self._analyser when text_analyser was None,
        # leaving it unset (AttributeError later) when an analyser was actually passed in
        self._analyser = text_analyser if text_analyser is not None else TextAnalyser()

    def run_pipeline(self):
        self._calc()

    def print_summary(self):
        pass

    def score(self):
        if self._data.y is None:
            return -1
        df = self._data.x.join(self._data.y)
        df = df.join(self.predictions, rsuffix="pred")

        # Accuracy: fraction of rows where the predicted answer matches the gold answer
        correct_answers = df[df.answer == df.answerpred]
        score = len(correct_answers.index) / len(df.index)
        self.logger.info("Score: %s", score)
        return score

    def write_to_disk(self, directory):
        self.predictions.to_csv(os.path.join(directory, "predictions.csv"))
        self.predictions["answer"].to_csv(os.path.join(directory, "submission_predictions.csv"),
                                          header=["correctAnswer"])
        test_data = self._data.x.join(self._data.y) if self._data.y is not None else self._data.x
        test_data.join(self.predictions, rsuffix="pred").to_csv(os.path.join(directory, "data_with_predictions.csv"))

    def _calc(self):
        self.logger.info("running _calc")
        q_index = self._data.x.columns.get_loc("question")
        for row in self._data.x.itertuples():
            row_id = row[0]  # itertuples yields the index as the first element
            question = row[q_index + 1]
            max_score = -1
            correct_answer = "-"
            self.logger.info("running question id {}".format(row_id))
            # Score each answer choice against Solr and keep the best one
            for choice in ["A", "B", "C", "D"]:
                choice_index = self._data.x.columns.get_loc(choice)
                choice_text = row[choice_index + 1]
                score = self._get_score_answer_search_within_top_question_search_pages(question, choice_text)
                if score > max_score:
                    max_score = score
                    correct_answer = choice
            # Use .loc[row, column] rather than chained attribute assignment,
            # which can silently write to a copy
            self.predictions.loc[row_id, "answer"] = correct_answer
            self.predictions.loc[row_id, "score"] = max_score
            self.predictions.loc[row_id, "q_word_count"] = \
                len(self._analyser.get_words_without_stopwords(question))

    def _get_score_answer_search_within_top_question_search_pages(self, question, answer_choice, url=None):
        """
        Calculates the score of the answer as follows:
            1. Obtain key words from the question and the answer by removing stop words
            2. Obtain the top pages matching a Solr search for the question key words
            3. Search Solr for the answer key words, restricting the search to those top question pages
            4. Return the average Solr score of the answer search

        :param url: Solr URL (defaults to a local wikipedia core)
        :param question: the question string
        :param answer_choice: the answer string
        :return: the score of the answer
        """
        # Get keywords from the question and the answer
        self.logger.info("running _get_score_answer_search_within_top_question_search_pages")
        self.logger.info("------------")

        exclude_words = self._get_science_stop_words()

        q_keywords = [word for word in self._analyser.get_words_without_stopwords(question)
                      if word.lower() not in exclude_words]
        q_keywords = self._remove_duplicates_preserve_order(q_keywords)
        a_keywords = [word for word in self._analyser.get_words_without_stopwords(answer_choice)
                      if word.lower() not in exclude_words and word.lower() not in q_keywords]
        q_query = ' '.join(q_keywords)
        a_query = ' '.join(a_keywords)

        # Submit the question keywords to Solr to obtain the top matching documents
        if url is None:
            url = 'http://localhost:8983/solr/wikipedia/select?fl=*%2Cscore&wt=json'
        self.logger.info("question: " + question)
        self.logger.info(answer_choice)
        is_short_q = len(q_keywords) < 3
        if is_short_q:
            # Boost each question term heavily and append the answer terms
            search_query = ' '.join("{}^1000".format(qw) for qw in q_keywords) + " " + a_query
        else:
            search_query = q_query
        self.logger.info("Search query {}".format(search_query))
        # Bug fix: the original built search_query but then queried with q_query,
        # so the short-question boosting never took effect
        rsp = self._submit_search_request_by_query(search_query, url, limit=5)

        top_page_ids = "(" + ' OR '.join([d['id'] for d in rsp['response']['docs']]) + ")"
        top_page_titles = "(" + ' \n\t '.join([d['title'] for d in rsp['response']['docs']]) + ")"
        self.logger.info(top_page_ids)
        self.logger.info("Top page titles \n\t {}".format(top_page_titles))

        fq = "id:" + top_page_ids
        self.logger.info("Url used to search answer {}".format(url))
        rsp = self._submit_search_request_by_query(a_query, url, limit=3, fq=fq)

        # Return the average score of the Solr results for the answer
        matching_docs = rsp['response']['docs']
        score = sum(d['score'] for d in matching_docs) / len(matching_docs) if len(matching_docs) > 0 else 0
        self.logger.info(score)
        return score
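    # Worked illustration of the steps above (a hand-derived sketch, not actual Solr
    # output; the answer choice and page ids are hypothetical):
    #   question      = "When athletes begin to exercise, their heart rates and respiration rates increase."
    #   answer_choice = "oxygen"
    #   1. q_keywords ~ ['athletes', 'begin', 'exercise', 'heart', 'rates', 'respiration', 'increase']
    #      a_keywords ~ ['oxygen']
    #   2. the question search returns the top pages, e.g. ids (123 OR 456)
    #   3. the answer search runs with query 'oxygen' and fq 'id:(123 OR 456)'
    #   4. the 'score' fields of the matching docs are averaged and returned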

    def _extract_snippets_from_solr_json_response(self, rsp):
        # Collects highlight fragments; requires highlighting to be enabled on the Solr request
        snippets = []
        for d in rsp['response']['docs']:
            snippets.extend(rsp['highlighting'][d['id']]["text"])

        return snippets

    def _submit_search_request(self, keywords, url, limit=3):
        keywords = ' '.join([word.replace(":", " ") for word in keywords])

        return self._submit_search_request_by_query(keywords, url, limit)

    def _submit_search_request_by_query(self, query, url, limit=3, fq=""):
        data = {'limit': limit, 'query': query, 'filter': fq}
        headers = {'Content-Type': 'application/json'}
        self.logger.info("Query {}, limit {}, fq {}... truncated".format(query, limit, str(fq)[:50]))
        self.logger.debug(data)
        r = requests.post(url, simplejson.dumps(data), headers=headers)
        if r.status_code != 200:
            raise RuntimeError(r.text)

        return simplejson.loads(r.text)
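    # For reference, the body posted above follows Solr's JSON Request API, e.g.:
    #   {'limit': 3, 'query': 'heart rates respiration', 'filter': 'id:(123 OR 456)'}
    # Each returned doc carries a 'score' field because the URL requests fl=*,score.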

    def _get_science_stop_words(self):
        # Domain words that add noise to science-exam queries
        return ["following", "follow", "using", "true", "explanation", "explain", "explained", "explains", "example",
                "past", "would", "examples", "way", "called", "describes", "describe", "used", "use", "must",
                "several", "many", "likely", "includes", "include", "much",
                # Bug fix: a missing comma after "much" caused implicit string concatenation,
                # turning "much" and "most" into the single token "muchmost"
                "most", "also", "shows", "show", "best", "illustrate", "illustrated", "illustrates",
                "statement", "statements", "tell", "us", "done", "certain", "call", "good", "another",
                "other", "correct", "correctly", "suggests", "suggest", "suggestion", "greatest", "great",
                "believed", "consider", "considered"]

    def _remove_duplicates_preserve_order(self, seq):
        # Order-preserving dedup; seen_add is bound once to avoid repeated attribute lookup
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]
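A hedged end-to-end sketch: `data` is assumed to expose .x (question and A-D choice columns) and .y (gold answers) DataFrames, with a local Solr core named wikipedia running at the default URL above:

# Hypothetical wiring; `data` stands in for the project's data holder object
import logging
logging.basicConfig(level=logging.INFO)

pipeline = SolrWikipediaPipeline(data)
pipeline.run_pipeline()           # fills pipeline.predictions one question at a time
print(pipeline.score())           # accuracy against data.y, or -1 when unlabelled
pipeline.write_to_disk("output")  # predictions.csv, submission_predictions.csv, data_with_predictions.csv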