Esempio n. 1
0
File: answer.py Progetto: nrvnujd/qa
    def _stanford_ner(self, text, searched_entity, question):
        """Extract entities tagged ``searched_entity`` from ``text``.

        The text is sent to a Stanford NER service whose host and port are
        read from the ``answer_extraction`` config section (defaults:
        ``"localhost"`` and ``1234``). The tagged output is wrapped in an
        ``<xml>`` root, parsed, and the text of every child element whose
        tag equals ``searched_entity`` is collected.

        Args:
            text: Raw text to analyse.
            searched_entity: Tag name to collect. The values ``'OTHER'`` and
                ``'NUMBER'`` additionally trigger ``_other_recognition`` /
                ``_number_recognition``.
            question: Passed through to ``_other_recognition`` only.

        Returns:
            A list of matching element texts (``""`` for elements without
            text), extended with the helper results for ``'OTHER'`` /
            ``'NUMBER'``. Returns ``[]`` if the tagger output cannot be
            parsed, and the result of ``_nltk_ner`` if the Stanford service
            raises ``StanfordNERError``.
        """
        logger = logging.getLogger("qa_logger")

        # POS-tag the raw text up front: ``text`` is rebound to the tagger
        # output further down, and the OTHER/NUMBER helpers need these tags.
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
        tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]

        # Entity Classification
        try:
            host = MyConfig.get("answer_extraction", "stanford_host")
        except MyConfigException as e:
            logger.warning(str(e))
            host = "localhost"

        try:
            port = int(MyConfig.get("answer_extraction", "stanford_port"))
        except (MyConfigException, ValueError) as e:
            # A non-numeric port is treated like a missing one instead of
            # letting ValueError escape to the caller.
            logger.warning(str(e))
            port = 1234

        try:
            recognizer = StanfordNER.get_instance(host, port)
            text = recognizer.process(text)
        except StanfordNERError:
            logger.warning("Stanford NER not available, using NLTK NER")
            # ``text`` is still the raw input here: the assignment above
            # never completed.
            return self._nltk_ner(text, searched_entity)

        # XML Parsing
        # Ampersands are dropped so that bare "&" in the tagger output does
        # not break the parser.
        text = "<xml>" + text.replace("&", "") + "</xml>"
        try:
            tree = fromstring(text)
        except Exception as e:
            # Best effort: unparsable tagger output yields no entities.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.warning("Could not parse Stanford NER output: %s", e)
            return []

        # Entity Extraction
        entities = []
        all_entities = []
        for element in tree.iterchildren():
            word = "" if element.text is None else element.text
            if element.tag == searched_entity:
                entities.append(word)
            all_entities.append(word)

        if 'OTHER' == searched_entity:
            entities += self._other_recognition(tagged_sentences, all_entities,
                                                question)

        if 'NUMBER' == searched_entity:
            # NOTE(review): ``text`` is the <xml>-wrapped tagger output at
            # this point, not the raw input -- confirm this is what
            # ``_number_recognition`` expects.
            entities += self._number_recognition(text, tagged_sentences,
                                                 all_entities)

        return entities
Esempio n. 2
0
    def _stanford_ner(self, text, searched_entity, question):
        """Return the entities of type ``searched_entity`` found in ``text``.

        Host and port of the Stanford NER service come from the
        ``answer_extraction`` config section, defaulting to ``"localhost"``
        and ``1234``. The service output is wrapped in an ``<xml>`` root and
        parsed; each child element whose tag equals ``searched_entity``
        contributes its text to the result.

        Args:
            text: Raw text to analyse.
            searched_entity: Tag name to collect; ``'OTHER'`` and
                ``'NUMBER'`` also invoke ``_other_recognition`` and
                ``_number_recognition`` respectively.
            question: Forwarded to ``_other_recognition`` only.

        Returns:
            List of matching element texts (``""`` where an element has no
            text) plus any helper results. ``[]`` when the service output
            cannot be parsed; the result of ``_nltk_ner`` when the service
            raises ``StanfordNERError``.
        """
        logger = logging.getLogger("qa_logger")

        # Tag the raw text before ``text`` is rebound to the service output;
        # the OTHER/NUMBER helpers consume these POS tags.
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
        tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]

        # Entity Classification
        try:
            host = MyConfig.get("answer_extraction", "stanford_host")
        except MyConfigException as e:
            logger.warning(str(e))
            host = "localhost"

        try:
            port = int(MyConfig.get("answer_extraction", "stanford_port"))
        except (MyConfigException, ValueError) as e:
            # Fall back to the default for a malformed port as well as a
            # missing one, rather than propagating ValueError.
            logger.warning(str(e))
            port = 1234

        try:
            recognizer = StanfordNER.get_instance(host, port)
            text = recognizer.process(text)
        except StanfordNERError:
            logger.warning("Stanford NER not available, using NLTK NER")
            # The failed call left ``text`` untouched, so the fallback sees
            # the raw input.
            return self._nltk_ner(text, searched_entity)

        # XML Parsing
        # Strip "&" so unescaped ampersands cannot invalidate the document.
        text = "<xml>" + text.replace("&", "") + "</xml>"
        try:
            tree = fromstring(text)
        except Exception as e:
            # Unparsable output means "no entities"; catching Exception
            # instead of everything keeps Ctrl-C and SystemExit working.
            logger.warning("Could not parse Stanford NER output: %s", e)
            return []

        # Entity Extraction
        entities = []
        all_entities = []
        for element in tree.iterchildren():
            word = "" if element.text is None else element.text
            if element.tag == searched_entity:
                entities.append(word)
            all_entities.append(word)

        if 'OTHER' == searched_entity:
            entities += self._other_recognition(tagged_sentences, all_entities, question)

        if 'NUMBER' == searched_entity:
            # NOTE(review): ``text`` now holds the <xml>-wrapped service
            # output rather than the raw input -- verify against
            # ``_number_recognition``.
            entities += self._number_recognition(text, tagged_sentences, all_entities)

        return entities
Esempio n. 3
0
def clean():
    """Call ``disconnect_all()`` on StanfordNER, then on StanfordParser."""
    for service in (StanfordNER, StanfordParser):
        service.disconnect_all()