def _stanford_ner(self, text, searched_entity, question):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
    tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]

    # Entity Classification
    try:
        host = MyConfig.get("answer_extraction", "stanford_host")
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        host = "localhost"

    try:
        port = int(MyConfig.get("answer_extraction", "stanford_port"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        port = 1234

    try:
        recognizer = StanfordNER.get_instance(host, port)
        text = recognizer.process(text)
    except StanfordNERError:
        logger = logging.getLogger("qa_logger")
        logger.warning("Stanford NER not available, using NLTK NER")
        return self._nltk_ner(text, searched_entity, question)

    # XML Parsing
    text = "<xml>" + text.replace("&", "") + "</xml>"
    try:
        tree = fromstring(text)
    except:
        return []

    # Entity Extraction
    entities = []
    all_entities = []
    for element in tree.iterchildren():
        if element is None:
            continue
        word = "" if element.text is None else element.text
        if element.tag == searched_entity:
            entities.append(word)
        all_entities.append(word)

    if 'OTHER' == searched_entity:
        entities += self._other_recognition(tagged_sentences, all_entities, question)

    if 'NUMBER' == searched_entity:
        entities += self._number_recognition(text, tagged_sentences, all_entities)

    return entities
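# Illustrative sketch (assumptions flagged): the XML-parsing step above expects the
# recognizer to return inline-XML-style markup, roughly like the hypothetical output
# below, so that element tags can be compared against searched_entity. Assuming the
# same lxml-style fromstring used above:
#
#   tagged = '<PERSON>Ada Lovelace</PERSON> was born in <LOCATION>London</LOCATION>'
#   tree = fromstring("<xml>" + tagged.replace("&", "") + "</xml>")
#   [(e.tag, e.text) for e in tree.iterchildren()]
#   # -> [('PERSON', 'Ada Lovelace'), ('LOCATION', 'London')]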
def _other_recognition(self, tagged_sentences, all_entities, question):
    # Nouns retrieval
    nouns = []
    for sentence in tagged_sentences:
        nouns += filter(lambda x: x[1] == "NN", sentence)
    nouns = [noun for (noun, tag) in nouns]

    # Nouns filtering
    # Remove all entities that are nouns
    all_entities = set(itertools.chain(*map(str.split, all_entities)))
    nouns = [noun for noun in nouns if noun not in all_entities]

    features = QuestionClassifier.get_features(question.text, "hn")
    head = features["head"]
    if head == "":
        return nouns

    # Filter nouns with WordNet synsets
    try:
        threshold = float(MyConfig.get("answer_extraction", "other_threshold"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        threshold = 0.6

    try:
        ic = wordnet_ic.ic(MyConfig.get("answer_extraction", "ic"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        ic = wordnet_ic.ic("ic-bnc.dat")

    result = []
    head_synsets = wn.synsets(head, pos=wn.NOUN)
    if len(head_synsets) == 0:
        noun_synsets = wn.synsets(features["noun"], pos=wn.NOUN)
        if len(noun_synsets) == 0:
            return nouns
        else:
            head_synset = noun_synsets[0]
    else:
        head_synset = head_synsets[0]

    for noun in nouns:
        try:
            noun_synset = wn.synsets(noun, pos=wn.NOUN)[0]
            if threshold < noun_synset.lin_similarity(head_synset, ic) < 0.9:
                result.append(noun)
        except IndexError:
            continue

    return result
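# Illustrative sketch (assumed question): for a question whose head noun is "city",
# the filter above keeps a candidate noun such as "town" only when
#   wn.synsets("town", pos=wn.NOUN)[0].lin_similarity(head_synset, ic)
# falls strictly inside the (threshold, 0.9) band; near-duplicates of the head
# (similarity >= 0.9) are rejected along with unrelated nouns (similarity <= threshold).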
def search(self):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tDocument Retrieval", self.id_q)

    search_engines = self._get_search_engines()

    try:
        num = int(MyConfig.get("document_retrieval", "n_results"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        num = 10

    results = []
    for engine in search_engines:
        try:
            results += engine.search(self.query, count=num)
        except Exception as e:
            logger = logging.getLogger("qa_logger")
            logger.error("Problem with search engine.")
            logger.debug(e)
            sys.exit(1)

    doc_list = []
    # rank loops over [0..num-1]
    rank = 0
    # ignore repeated urls
    unique_urls = set()
    for resource in results:
        if resource.url in unique_urls:
            continue
        unique_urls.add(resource.url)
        # rank+1 loops over [1..num]
        # rank+1 is the relative position of the result
        doc_list.append(Document(resource, rank + 1))
        rank = (rank + 1) % num

    try:
        if MyConfig.get("persistence", "document") == "True":
            output = open("documentos.pkl", "wb")
            pickle.dump(doc_list, output, 0)
            output.close()
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))

    return doc_list
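# Worked example of the rank bookkeeping above (assumed: two engines, n_results = 10,
# no URL repeated across engines): the first engine's results get ranks 1..10, rank
# then wraps back to 0 via (rank + 1) % num, so the second engine's results are again
# ranked 1..10 relative to their own engine.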
def process_answer(self, passage, question):
    q = question.text
    p = passage.text

    searched_entity = self._question_classification(q)

    try:
        ner_algorithm = MyConfig.get("answer_extraction", "ner")
    except MyConfigException as e:
        ner_algorithm = "stanford"
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))

    if ner_algorithm == "nltk":
        entities = self._nltk_ner(p, searched_entity, question)
    else:
        entities = self._stanford_ner(p, searched_entity, question)

    entities = self._filter_entities(entities, q)
    exact, window, score = self._entity_ranking(entities)

    answer = Answer(passage, question, window, exact, score)

    return answer
def __init__(self, result, rank):
    self.title = result.title
    self.url = utils.from_unicode_to_ascii(result.url)
    self.rank = rank
    self.description = utils.from_unicode_to_ascii(result.description)
    self.content = utils.from_unicode_to_ascii(self._get_content(result))

    # Split document into passages
    try:
        algorithm = MyConfig.get("document_segmentation", "algorithm")
        if algorithm == "lines":
            self.passages = SplitIntoLinesAlgorithm.split_into_passages(self)
        elif algorithm == "paragraphs":
            self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
        elif algorithm == "sentences":
            self.passages = SplitIntoSentencesAlgorithm.split_into_passages(self)
        else:
            self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
def get_run_tag(self):
    try:
        exact = MyConfig.get("show_answer", "exact") == "True"
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        exact = False

    return ("plna" + ("ex" if exact else "st") + "031ms", exact)
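# For reference: with exact = True this returns ("plnaex031ms", True),
# otherwise ("plnast031ms", False).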
def _get_search_engines(self):
    try:
        lang = MyConfig.get("document_retrieval", "lang")
        engines = safe_eval(MyConfig.get("document_retrieval", "engines"))
        throttle = MyConfig.get("document_retrieval", "throttle")
        l = []
        for (engine, license) in engines:
            # Eval to something like this:
            # Google(google_license, throttle, lang)
            l.append(eval(engine + "(\"" + license + "\", " + throttle + ", " + lang + ")"))
        return l
    except MyConfigException:
        sys.exit("_get_search_engines: config error")
    except:
        logger = logging.getLogger("qa_logger")
        logger.exception("_get_search_engines: fatal error")
        sys.exit(2)
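# Illustrative sketch (assumed config values): with
#   engines  = [("Google", "abc123")]   # hypothetical engine name and license key
#   throttle = "0.5"
#   lang     = "\"es\""                 # note: interpolated without added quotes below
# the constructed call string is
#   Google("abc123", 0.5, "es")
# which eval() then instantiates as a search-engine object.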
def _question_classification(self, question):
    # Choose the specified classifier
    try:
        features = MyConfig.get("answer_extraction", "question_features")
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        features = "fnh"

    try:
        classifier_file = MyConfig.get("answer_extraction", "question_classifier")
        classifier_path = os.path.join("qc", features, classifier_file)
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        classifier_path = os.path.join("qc", "fhn", "qc_bayes.pkl")

    # Question classification
    return QuestionClassifier.classify(classifier_path, question, features)
def calculate_score(self, question, passage):
    rank = passage.document.rank
    q = question.text
    text = passage.text

    # Remove stopwords from question and passage
    # and split them into words
    q = StopwordsAlgorithm.formulate_query(q).split()
    text = StopwordsAlgorithm.formulate_query(text).split()

    # Apply stemming to q and text
    porter = PorterStemmer()
    q = map(porter.stem, q)
    text = map(porter.stem, text)

    score = 0
    searched_term = 0
    last_match = 0
    first_match = True

    if len(q) < 1:
        return 0

    for i, word in enumerate(text):
        if searched_term >= len(q):
            searched_term = 0
            first_match = True
        if word == q[searched_term]:
            if first_match:
                score += 1
                first_match = False
            else:
                score += 1.0 / (i - last_match)
            last_match = i
            searched_term += 1

    try:
        num = int(MyConfig.get("document_retrieval", "n_results"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        return score

    # Reverse rank order from 1..n to n..1
    rank = num - rank + 1.0
    # Normalize rank from n..1 to 1..0.5
    rank = (rank - 2 + num) / (2 * num - 2)

    # Weight score by rank
    score = score * rank

    return score
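# Illustrative trace (assumed, already-stemmed inputs) of the proximity scoring above:
#   q    = ["capit", "franc"]
#   text = ["pari", "is", "capit", "of", "franc"]
# "capit" matches q[0] as a first match          -> score += 1        (score 1.0)
# "franc" matches q[1] two positions further on  -> score += 1.0 / 2  (score 1.5)
# The raw score is then weighted by the normalized document rank
# (1.0 for the top-ranked document, down to 0.5 for the last one).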
def find_answer(self, question):
    try:
        algorithm = MyConfig.get("answer_extraction", "algorithm")
        if algorithm == "entity":
            self.answer = EntityRecognitionAlgorithm.process_answer(self, question)
        else:
            self.answer = EntityRecognitionAlgorithm.process_answer(self, question)
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        self.answer = EntityRecognitionAlgorithm.process_answer(self, question)

    return self.answer
def get_best_answers(self, passage_list, q):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tAnswer Processing", q.id_q)

    empty = passage_list == []

    logger.info("%s:\t\tAnswer Extraction", q.id_q)

    answer_list = []
    for passage in passage_list:
        a = passage.find_answer(q)
        if a.is_successful():
            answer_list.append(a)

    if not answer_list:
        return ([], empty)

    logger.info("%s:\t\tAnswer Filtering", q.id_q)

    # Obtain answer frequency
    fd = FreqDist(answer_list)

    # Normalize frequencies
    normalize = fd.freq(fd.max())

    # Modify scores by frequency
    for answer in answer_list:
        answer.score = int(answer.score * (fd.freq(answer) / normalize))

    # Sort answers by score
    answer_list.sort(key=lambda x: x.score, reverse=True)

    # Filter bad answers
    try:
        threshold = int(MyConfig.get("answer_filtering", "threshold"))
    except:
        logger = logging.getLogger("qa_logger")
        logger.error("answer quality threshold not found")
        threshold = 50

    answer_list = filter(lambda x: x.score > threshold, answer_list)

    final_answers = []
    for a in answer_list:
        if a not in final_answers:
            final_answers.append(a)
        if len(final_answers) == 3:
            break

    return (final_answers, empty)
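# Worked example of the frequency weighting above (assumed counts): if the most
# frequent answer occurs 4 times, an answer that also occurs 4 times has
# fd.freq(a) / normalize = 1.0 and keeps its score, while an answer occurring
# twice has ratio 0.5 and its score is halved (then truncated by int()).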
def _formulate_query(self):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tQuery Formulation", self.id_q)

    try:
        algorithm = MyConfig.get("query_formulation", "algorithm")
        if algorithm == "stopwords":
            return StopwordsAlgorithm.formulate_query(self.text)
        else:
            return StopwordsAlgorithm.formulate_query(self.text)
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        return StopwordsAlgorithm.formulate_query(self.text)
def calculate_score(self, question):
    try:
        algorithm = MyConfig.get("passage_filtering", "algorithm")
        if algorithm == "similarity":
            self.score = SimilarityAlgorithm.calculate_score(question, self)
        elif algorithm == "proximity":
            self.score = ProximityAlgorithm.calculate_score(question, self)
        elif algorithm == "mixed":
            self.score = MixedAlgorithm.calculate_score(question, self)
        else:
            self.score = MixedAlgorithm.calculate_score(question, self)
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        self.score = MixedAlgorithm.calculate_score(question, self)
def get_relevant_passages(self, doc_list, question):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tPassage Retrieval", question.id_q)

    logger.info("%s:\t\tDocument Segmentation", question.id_q)

    passage_list = self.score_passages(doc_list, question)
    passage_list.sort(key=lambda x: x.score, reverse=True)

    # Select n best passages
    try:
        n = int(MyConfig.get("document_segmentation", "n_relevants"))
    except:
        n = 100
        logger = logging.getLogger("qa_logger")
        logger.warning("n_relevants not found")

    return passage_list[:n]
def _get_content(self, result):
    url = URL(result.url)

    try:
        timeout = int(MyConfig.get("document_retrieval", "timeout"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        timeout = 15

    try:
        mimetype = url.mimetype
        content = utils.from_unicode_to_ascii(url.download(timeout=timeout, unicode=True))
    except Exception as e:
        # If we cannot retrieve the document, we skip it
        logger = logging.getLogger("qa_logger")
        logger.warning("%s couldn't be retrieved", result.url)
        logger.warning(str(e))
        return ""

    return self._extract_text(content, mimetype)
def calculate_score(self, question, passage):
    rank = passage.document.rank
    q = question.text
    text = passage.text

    # Remove stopwords from question and passage
    # and split them into words
    q = StopwordsAlgorithm.formulate_query(q).split()
    text = StopwordsAlgorithm.formulate_query(text).split()

    # Apply stemming to q and text
    porter = PorterStemmer()
    q = map(porter.stem, q)
    text = map(porter.stem, text)

    # Keep only the passage words that are
    # also present in the question
    words = filter(lambda x: x in q, text)

    # Our initial score is the number of coincidences
    score = len(words)

    try:
        num = int(MyConfig.get("document_retrieval", "n_results"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        return score

    # Reverse rank order from 1..n to n..1
    rank = num - rank + 1.0
    # Normalize rank from n..1 to 1..0.5
    rank = (rank - 2 + num) / (2 * num - 2)

    # Weight score by rank
    score = score * rank

    return score
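# Worked example of the rank weighting above (assumed n_results = 10):
#   rank 1  -> reversed to 10 -> (10 - 2 + 10) / (2*10 - 2) = 18/18 = 1.0
#   rank 10 -> reversed to  1 -> ( 1 - 2 + 10) / (2*10 - 2) =  9/18 = 0.5
# so a passage from the top-ranked document keeps its full overlap score,
# while one from the lowest-ranked document keeps half of it.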