Beispiel #1
0
class Information_Retrieval:
    """Find the sentences of an article that best match a question.

    The article file is read once and handed to ``Parse``; ``ranked_list``
    then scores the article's sentences against the words of a question.
    """

    def __init__(self, article, saveTags=False):
        """Load *article* and optionally precompute default entities.

        Args:
            article: Path of the article file to read.
            saveTags: When true, NER-tag the whole text via ``Sentence`` and
                store the most frequent PERSON and LOCATION words in
                ``default_person`` / ``default_location``.
        """
        self.article_name = article
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(article, 'r') as handle:
            contents = handle.read()
        # "\xc2\xa0" is the UTF-8 byte pair of a non-breaking space.
        # NOTE(review): this matches when the text is held as undecoded
        # bytes (Python 2 str) -- confirm the behavior wanted on Python 3.
        self.raw_text = contents.replace("\xc2\xa0", " ")
        self.article = Parse(self.raw_text)
        self.default_person = None
        self.default_location = None
        self.tags = None
        if saveTags:
            self.tags = Sentence(self.raw_text, 0).get_ner_tags()
            self.default_person = self.get_default_pronoun(
                self.tags, ['PERSON'])
            self.default_location = self.get_default_pronoun(
                self.tags, ['LOCATION'])

    def get_default_pronoun(self, tags, answer_tag):
        """Return the most frequent word whose tag is in *answer_tag*.

        Args:
            tags: Iterable of ``(word, tag)`` pairs.
            answer_tag: Container of accepted tags, e.g. ``['PERSON']``.

        Returns:
            The word with the highest count among the accepted tags, or
            ``''`` when no pair matches. Ties go to the word that comes first
            in the dict's iteration order.
        """
        counts = {}
        for word, tag in tags:
            if tag in answer_tag:
                counts[word] = counts.get(word, 0) + 1
        if not counts:
            return ''
        # max() returns the first maximal key it meets, which is the same
        # tie-break as a strict ``>`` comparison loop over the dict.
        return max(counts, key=counts.get)

    def _score_sentences(self, pos_tags, lookup):
        """Accumulate per-sentence scores for the words in *pos_tags*.

        Each word adds ``1.0 / len(hits)`` to every sentence key returned by
        ``lookup(word)``, so a word found in fewer sentences weighs more.
        """
        scores = {}
        for word, tag in pos_tags:
            # Skip words whose POS tag contains 'DT' or '.'.
            if 'DT' in tag or '.' in tag:
                continue
            hits = lookup(word)
            weight = len(hits)
            if not weight:
                continue
            share = 1.0 / weight
            for sent in hits:
                scores[sent] = scores.get(sent, 0.0) + share
        return scores

    def ranked_list(self, question):
        """Return the best-matching sentences for *question*.

        Sentences are scored twice: once with ``self.article.findword`` and
        once with ``self.article.findword_sym`` (presumably a synonym-aware
        lookup -- confirm against ``Parse``). The candidate list whose top
        score is strictly higher is returned; on a tie the ``findword_sym``
        list is used.

        Args:
            question: Object exposing ``get_pos_tags()`` that yields
                ``(word, tag)`` pairs.

        Returns:
            List of sentence texts as produced by ``find_best_sentences``.
        """
        # Tag the question once and reuse the result for both lookups.
        pos_tags = list(question.get_pos_tags())

        exact_scores = self._score_sentences(pos_tags, self.article.findword)
        syn_scores = self._score_sentences(
            pos_tags, self.article.findword_sym)

        (best_sen_reg, top_reg) = self.find_best_sentences(exact_scores)
        (best_sen_syn, top_syn) = self.find_best_sentences(syn_scores)

        if top_reg > top_syn:
            return best_sen_reg
        return best_sen_syn

    def find_best_sentences(self, relevant_sentences):
        """Pick the highest-scoring sentences from a score mapping.

        Args:
            relevant_sentences: Dict mapping a key usable as an index into
                ``self.article.sentences`` to a numeric score.

        Returns:
            Tuple ``(sentences, best_score)``. ``sentences`` holds the
            ``raw_text`` of every sentence tied for the best score, ordered
            shortest first; ``best_score`` is that score and never drops
            below 0. An empty mapping gives ``([], 0)``.
        """
        best_score = 0
        if relevant_sentences:
            # Clamp at 0: negative scores are never selected.
            best_score = max(0, max(relevant_sentences.values()))

        # One linear pass keeps tied keys in the mapping's own order, which
        # is the order a stable descending sort would also give them.
        top = [key for key in relevant_sentences
               if relevant_sentences[key] == best_score]

        best_sentences = [
            Sentence(self.article.sentences[key], 0).raw_text for key in top
        ]
        return (sorted(best_sentences, key=len), best_score)