def _stanford_ner(self, text, searched_entity, question):
    """Extract entities of type ``searched_entity`` from ``text`` using the
    Stanford NER service, falling back to NLTK NER when the service is down.

    :param text: raw passage to tag.
    :param searched_entity: entity tag to collect (e.g. ``PERSON``,
        ``OTHER``, ``NUMBER``).
    :param question: the original question, forwarded to the OTHER-entity
        recognizer.
    :return: list of entity strings (possibly empty).
    """
    # POS-tagged sentences are needed later by the OTHER/NUMBER recognizers.
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
    tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]

    # Entity Classification: read the service endpoint from configuration,
    # falling back to defaults when a setting is missing or malformed.
    try:
        host = MyConfig.get("answer_extraction", "stanford_host")
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        host = "localhost"

    try:
        port = int(MyConfig.get("answer_extraction", "stanford_port"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        port = 1234

    try:
        recognizer = StanfordNER.get_instance(host, port)
        text = recognizer.process(text)
    except StanfordNERError:
        # Service unreachable: degrade gracefully to the NLTK recognizer.
        logger = logging.getLogger("qa_logger")
        logger.warning("Stanford NER not available, using NLTK NER")
        return self._nltk_ner(text, searched_entity)

    # XML Parsing: strip ampersands, which would break the ad-hoc wrapper.
    text = "<xml>" + text.replace("&", "") + "</xml>"
    try:
        tree = fromstring(text)
    except Exception:
        # Was a bare `except:`; narrowed to Exception so KeyboardInterrupt/
        # SystemExit are no longer swallowed. Malformed XML yields no entities.
        return []

    # Entity Extraction: collect every tagged word, and separately the ones
    # whose tag matches the entity type we are looking for.
    entities = []
    all_entities = []
    for element in tree.iterchildren():
        # Guard must run BEFORE dereferencing element.text (original checked
        # for None only after accessing the attribute, making it dead code).
        if element is None:
            continue
        word = "" if element.text is None else element.text
        if element.tag == searched_entity:
            entities.append(word)
        all_entities.append(word)

    if 'OTHER' == searched_entity:
        entities += self._other_recognition(tagged_sentences, all_entities, question)

    if 'NUMBER' == searched_entity:
        # NOTE(review): receives the XML-wrapped text, matching the original
        # behavior — confirm the recognizer expects this form.
        entities += self._number_recognition(text, tagged_sentences, all_entities)

    return entities
def clean():
    """Release all pooled connections to the external Stanford services.

    Wrapped in try/finally so the parser connections are still closed even
    if disconnecting the NER instances raises (the original would leak the
    parser connections in that case).
    """
    try:
        StanfordNER.disconnect_all()
    finally:
        StanfordParser.disconnect_all()