Example #1
File: answer.py Project: nrvnujd/qa
    def _stanford_ner(self, text, searched_entity, question):
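        # Tag `text` with a remote Stanford NER server and collect the
        # entities whose tag matches `searched_entity`; falls back to
        # the NLTK recognizer if the server is unavailable.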
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
        tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]

        # Entity Classification
        try:
            host = MyConfig.get("answer_extraction", "stanford_host")
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            host = "localhost"

        try:
            port = int(MyConfig.get("answer_extraction", "stanford_port"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            port = 1234

        try:
            recognizer = StanfordNER.get_instance(host, port)
            text = recognizer.process(text)
        except StanfordNERError:
            logger = logging.getLogger("qa_logger")
            logger.warning("Stanford NER not available, using NLTK NER")
            return self._nltk_ner(text, searched_entity)

        # XML Parsing
        text = "<xml>" + text.replace("&", "") + "</xml>"
        try:
            tree = fromstring(text)
        except:
            return []

        # Entity Extraction
        entities = []
        all_entities = []
        for element in tree.iterchildren():
            if element is None:
                continue
            word = "" if element.text is None else element.text
            if element.tag == searched_entity:
                entities.append(word)
            all_entities.append(word)

        if 'OTHER' == searched_entity:
            entities += self._other_recognition(tagged_sentences, all_entities,
                                                question)

        if 'NUMBER' == searched_entity:
            entities += self._number_recognition(text, tagged_sentences,
                                                 all_entities)

        return entities
Example #2
File: answer.py Project: nrvnujd/qa
    def _other_recognition(self, tagged_sentences, all_entities, question):
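        # Candidate answers for OTHER questions: nouns of the passage
        # that are not already tagged entities, filtered by WordNet
        # similarity to the head word of the question.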
        # Nouns retrieval
        nouns = []
        for sentence in tagged_sentences:
            nouns += filter(lambda x: x[1] == "NN", sentence)
        nouns = [noun for (noun, tag) in nouns]

        # Nouns filtering
        # Remove all entities that are nouns
        all_entities = set(itertools.chain(*map(str.split, all_entities)))
        nouns = [noun for noun in nouns if noun not in all_entities]

        features = QuestionClassifier.get_features(question.text, "hn")
        head = features["head"]
        if head == "":
            return nouns

        # Filter nouns with WordNet synsets
        try:
            threshold = float(
                MyConfig.get("answer_extraction", "other_threshold"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            threshold = 0.6

        try:
            ic = wordnet_ic.ic(MyConfig.get("answer_extraction", "ic"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            ic = wordnet_ic.ic("ic-bnc.dat")

        result = []

        head_synsets = wn.synsets(head, pos=wn.NOUN)
        if len(head_synsets) == 0:
            noun_synsets = wn.synsets(features["noun"], pos=wn.NOUN)
            if len(noun_synsets) == 0:
                return nouns
            else:
                head_synset = noun_synsets[0]
        else:
            head_synset = head_synsets[0]

        # Keep nouns whose Lin similarity to the head synset lies
        # between the configured threshold and 0.9 (the cap discards
        # nouns nearly identical to the head itself)
        for noun in nouns:
            try:
                noun_synset = wn.synsets(noun, pos=wn.NOUN)[0]
                if threshold < noun_synset.lin_similarity(head_synset,
                                                          ic) < 0.9:
                    result.append(noun)
            except IndexError:
                continue

        return result
Example #3
    def search(self):
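        # Query every configured search engine and wrap each unique
        # result URL into a ranked Document.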
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tDocument Retrieval", self.id_q)

        search_engines = self._get_search_engines()

        try:
            num = int(MyConfig.get("document_retrieval", "n_results"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            num = 10

        results = []
        for engine in search_engines:
            try:
                results += engine.search(self.query, count=num)
            except Exception as e:
                logger = logging.getLogger("qa_logger")
                logger.error("Problem with search engine.")
                logger.debug(e)
                sys.exit(1)

        doc_list = []
        # rank loops over [0..num-1]
        rank = 0
        # ignore repeated urls
        unique_urls = set()
        for resource in results:
            if resource.url in unique_urls:
                continue
            unique_urls.add(resource.url)

            # rank+1 loops over [1..num]
            # rank+1 is the relative position of the results
            doc_list.append(Document(resource, rank + 1))
            rank = (rank + 1) % num

        try:
            if MyConfig.get("persistence", "document") == "True":
                with open("documentos.pkl", "wb") as output:
                    pickle.dump(doc_list, output, 0)
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))

        return doc_list
Example #4
File: answer.py Project: nrvnujd/qa
    def process_answer(self, passage, question):
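        # Classify the question, extract candidate entities from the
        # passage with the configured NER algorithm, and rank them
        # into an Answer.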
        q = question.text
        p = passage.text

        searched_entity = self._question_classification(q)

        try:
            ner_algorithm = MyConfig.get("answer_extraction", "ner")
        except MyConfigException as e:
            ner_algorithm = "stanford"
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))

        if ner_algorithm == "nltk":
            entities = self._nltk_ner(p, searched_entity, question)
        else:
            entities = self._stanford_ner(p, searched_entity, question)

        entities = self._filter_entities(entities, q)

        exact, window, score = self._entity_ranking(entities)

        answer = Answer(passage, question, window, exact, score)

        return answer
Example #5
    def __init__(self, result, rank):
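        # Wrap a search engine result: download its content and split
        # it into passages with the configured segmentation algorithm.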
        self.title = result.title
        self.url = utils.from_unicode_to_ascii(result.url)
        self.rank = rank
        self.description = utils.from_unicode_to_ascii(result.description)

        self.content = utils.from_unicode_to_ascii(self._get_content(result))

        # Split document into passages
        try:
            algorithm = MyConfig.get("document_segmentation", "algorithm")
            if algorithm == "lines":
                self.passages = SplitIntoLinesAlgorithm.split_into_passages(self)
            elif algorithm == "paragraphs":
                self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
            elif algorithm == "sentences":
                self.passages = SplitIntoSentencesAlgorithm.split_into_passages(self)
            else:
                self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
Example #6
    def get_run_tag(self):
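        # Run tag is "plnaex031ms" when exact answers are enabled,
        # "plnast031ms" otherwise.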
        try:
            exact = MyConfig.get("show_answer", "exact") == "True"
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            exact = False

        return ("plna" + ("ex" if exact else "st") + "031ms", exact)
Example #7
    def _get_search_engines(self):
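        # Build the search engine instances listed in the "engines"
        # config option.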
        try:
            lang = MyConfig.get("document_retrieval", "lang")
            engines = safe_eval(MyConfig.get("document_retrieval", "engines"))
            throttle = MyConfig.get("document_retrieval", "throttle")

            engine_list = []
            for (engine, license) in engines:
                # Evals to something like this:
                # Google("google_license", throttle, lang)
                # (throttle and lang are interpolated as-is, so the
                # config must supply them as Python literals)
                engine_list.append(eval(engine + "(\"" + license + "\", " + throttle + ", " + lang + ")"))
            return engine_list

        except MyConfigException:
            sys.exit("_get_search_engines: config error")
        except Exception:
            logger = logging.getLogger("qa_logger")
            logger.exception("_get_search_engines: fatal error")
            sys.exit(2)
Example #8
File: answer.py Project: nrvnujd/qa
    def _question_classification(self, question):
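        # Map the question to the entity type it asks for, using the
        # classifier and feature set named in the configuration.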
        # Choose the specified classifier
        try:
            features = MyConfig.get("answer_extraction", "question_features")
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            features = "fnh"

        try:
            classifier_file = MyConfig.get("answer_extraction", "question_classifier")
            classifier_path = os.path.join("qc", features, classifier_file)
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            classifier_path = os.path.join("qc", "fhn", "qc_bayes.pkl")

        # Question classification
        return QuestionClassifier.classify(classifier_path, question, features)
Example #9
    def calculate_score(self, question, passage):
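        # Score a passage by how close together the question terms
        # appear in it, weighted by the rank of its document.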
        rank = passage.document.rank
        q = question.text
        text = passage.text

        # Remove stopwords from question and passage
        # and split them into words
        q = StopwordsAlgorithm.formulate_query(q).split()
        text = StopwordsAlgorithm.formulate_query(text).split()

        # Apply stemming to q and text (as lists, so they can be
        # indexed and measured below)
        porter = PorterStemmer()
        q = list(map(porter.stem, q))
        text = list(map(porter.stem, text))

        score = 0
        searched_term = 0
        last_match = 0
        first_match = True

        if len(q) < 1:
            return 0

        # Walk the passage matching the question terms in order: the
        # first term of each pass scores 1 and each later term scores
        # 1 / (its distance from the previous match)
        for i, word in enumerate(text):
            if searched_term >= len(q):
                searched_term = 0
                first_match = True

            if word == q[searched_term]:
                if first_match:
                    score += 1
                    first_match = False
                else:
                    score += 1.0 / (i - last_match)

                last_match = i
                searched_term += 1

        try:
            num = int(MyConfig.get("document_retrieval", "n_results"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            return score

        # Reverse rank order from 1..n to n..1
        rank = num - rank + 1.0

        # Normalize rank from n..1 to 1..0.5
        rank = (rank - 2 + num) / (2 * num - 2)

        # Weight score by rank
        score = score * rank

        return score
Example #10
    def find_answer(self, question):
        try:
            algorithm = MyConfig.get("answer_extraction", "algorithm")
            if (algorithm == "entity"):
                self.answer = EntityRecognitionAlgorithm.process_answer(self, question)
            else:
                self.answer = EntityRecognitionAlgorithm.process_answer(self, question)
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            self.answer = EntityRecognitionAlgorithm.process_answer(self, question)

        return self.answer
Example #11
File: QA.py Project: nrvnujd/qa
    def get_best_answers(self, passage_list, q):
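        # Extract an answer from every passage, boost scores by answer
        # frequency, and return at most the three best answers above
        # the quality threshold.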
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tAnswer Processing", q.id_q)

        empty = passage_list == []

        logger.info("%s:\t\tAnswer Extraction", q.id_q)

        answer_list = []
        for passage in passage_list:
            a = passage.find_answer(q)
            if a.is_successful():
                answer_list.append(a)

        if not answer_list:
            return ([], empty)

        logger.info("%s:\t\tAnswer Filtering", q.id_q)

        # Obtain answer frequency
        fd = FreqDist(answer_list)

        # Normalize frequencies
        normalize = fd.freq(fd.max())

        # Modify scores by frequency
        for answer in answer_list:
            answer.score = int(answer.score * (fd.freq(answer) / normalize))

        # Sort answers by score
        answer_list.sort(key=lambda x: x.score, reverse=True)

        # Filter bad answers
        try:
            threshold = int(MyConfig.get("answer_filtering", "threshold"))
        except Exception:
            logger = logging.getLogger("qa_logger")
            logger.error("answer quality threshold not found")
            threshold = 50

        answer_list = filter(lambda x: x.score > threshold, answer_list)

        final_answers = []
        for a in answer_list:
            if a not in final_answers:
                final_answers.append(a)
            if len(final_answers) == 3:
                break

        return (final_answers, empty)
Example #12
    def _formulate_query(self):
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tQuery Formulation", self.id_q)

        try:
            algorithm = MyConfig.get("query_formulation", "algorithm")
            if algorithm == "stopwords":
                return StopwordsAlgorithm.formulate_query(self.text)
            else:
                return StopwordsAlgorithm.formulate_query(self.text)
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            return StopwordsAlgorithm.formulate_query(self.text)
Example #13
    def calculate_score(self, question):
        try:
            algorithm = MyConfig.get("passage_filtering", "algorithm")
            if algorithm == "similarity":
                self.score = SimilarityAlgorithm.calculate_score(question, self)
            elif algorithm == "proximity":
                self.score = ProximityAlgorithm.calculate_score(question, self)
            elif algorithm == "mixed":
                self.score = MixedAlgorithm.calculate_score(question, self)
            else:
                self.score = MixedAlgorithm.calculate_score(question, self)
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            self.score = MixedAlgorithm.calculate_score(question, self)
Example #14
File: QA.py Project: nrvnujd/qa
    def get_relevant_passages(self, doc_list, question):
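        # Score every passage of every document and keep the n best.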
        logger = logging.getLogger("qa_logger")
        logger.info("%s:\tPassage Retrieval", question.id_q)
        logger.info("%s:\t\tDocument Segmentation", question.id_q)

        passage_list = self.score_passages(doc_list, question)
        passage_list.sort(key=lambda x: x.score, reverse=True)

        # Select n best passages
        try:
            n = int(MyConfig.get("document_segmentation", "n_relevants"))
        except Exception:
            n = 100
            logger = logging.getLogger("qa_logger")
            logger.warning("n_relevants not found")

        return passage_list[:n]
Example #15
    def _get_content(self, result):
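        # Download the result URL and extract its text according to
        # its MIME type; an unreachable document yields "".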
        url = URL(result.url)

        try:
            timeout = int(MyConfig.get("document_retrieval", "timeout"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            timeout = 15

        try:
            mimetype = url.mimetype
            content = utils.from_unicode_to_ascii(url.download(timeout=timeout, unicode=True))
        except Exception as e:
            # If we cannot retrieve the document, we skip it
            logger = logging.getLogger("qa_logger")
            logger.warning("%s couldn't be retrieved", result.url)
            logger.warning(str(e))
            return ""

        return self._extract_text(content, mimetype)
Example #16
    def calculate_score(self, question, passage):
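        # Score a passage by the number of stemmed, stopword-free
        # words it shares with the question, weighted by the rank of
        # its document.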
        rank = passage.document.rank
        q = question.text
        text = passage.text

        # Remove stopwords from question and passage
        # and split it into words
        q = StopwordsAlgorithm.formulate_query(q).split()
        text = StopwordsAlgorithm.formulate_query(text).split()

        # Apply stemming to q and text (as lists, so they can be
        # searched and counted below)
        porter = PorterStemmer()
        q = list(map(porter.stem, q))
        text = list(map(porter.stem, text))

        # Keep only the passage words that also appear
        # in the question
        words = [word for word in text if word in q]

        # Our initial score is the number of matches
        score = len(words)

        try:
            num = int(MyConfig.get("document_retrieval", "n_results"))
        except MyConfigException as e:
            logger = logging.getLogger("qa_logger")
            logger.warning(str(e))
            return score

        # Reverse rank order from 1..n to n..1
        rank = num - rank + 1.0

        # Normalize rank from n..1 to 1..0.5
        rank = (rank - 2 + num) / (2 * num - 2)

        # Weight score by rank
        score = score * rank

        return score