예제 #1
0
    def getRankedList(self, question):
        """Greedily pick up to ``self.numSelectedSentences`` sentences with MMR.

        Every round scores each remaining candidate as
        ``(1 - self.alpha) * sim(candidate, question.body)
        - self.alpha * max(sim(candidate, picked))`` over the sentences
        picked so far, where ``sim`` is
        ``SimilarityJaccard.calculateSimilarity``, and keeps the top scorer.
        Selection stops as soon as no candidate is left, so the result never
        contains the same sentence twice because the pool ran dry.

        Side effect: sets ``self.beta`` to 0.

        :param question: passed to ``self.getSentences``; its ``body`` is the
            text every candidate is compared with.
        :return: list of the selected sentences, in selection order.
        """
        self.logger.info("CoreMMR: ranking question")
        self.logger.debug('Question type is a ' + str(type(question)))
        self.logger.debug(question)
        # beta == 0 makes the redundancy term below plain sentence similarity.
        self.beta = 0
        similarity = SimilarityJaccard()
        selectedSentences = []
        sentences = self.getSentences(question)
        for _ in range(self.numSelectedSentences):
            # Reset every round; otherwise an exhausted candidate pool would
            # leave the previous winner in place and it would be re-selected.
            current_best = None
            best_sim = -99999999
            for sentence in sentences:
                ques_sim = similarity.calculateSimilarity(
                    sentence, question.body)
                # Until something is picked this stays at the sentinel, which
                # offsets every candidate's score by the same amount.
                max_sent_sim = -99999999
                for other in selectedSentences:
                    sim = similarity.calculateSimilarity(sentence, other)
                    if self.beta != 0:
                        try:
                            pos_score = self.pos_dict[sentence]
                        except KeyError:
                            # Retry with surrounding whitespace removed.
                            stripped = sentence.lstrip().rstrip()
                            self.logger.info('Looking for Sentence: ' +
                                             str(stripped) +
                                             'in positional dictionary')
                            pos_score = self.pos_dict[stripped]
                        current_sent_sim = (self.beta * sim) + (
                            (1 - self.beta) * pos_score)
                    else:
                        current_sent_sim = sim
                    if current_sent_sim > max_sent_sim:
                        max_sent_sim = current_sent_sim
                # MMR: reward similarity to the question, penalise similarity
                # to sentences that were already selected.
                final_sim = (
                    (1 - self.alpha) * ques_sim) - (self.alpha * max_sent_sim)
                if final_sim > best_sim:
                    best_sim = final_sim
                    current_best = sentence
            if current_best is None:
                break
            selectedSentences.append(current_best)
            # Drop what was picked while keeping the remaining candidates in
            # their original order, so ties are broken deterministically.
            sentences = [s for s in sentences if s not in selectedSentences]

        self.logger.info('Performed Core MMR')
        return selectedSentences
예제 #2
0
    def getRankedList(self, question):
        """Greedily pick up to ``self.numSelectedSentences`` sentences with MMR.

        Every round scores each remaining candidate as
        ``(1 - self.alpha) * sim(candidate.tokens, question.tokens)
        - self.alpha * max(redundancy(candidate, picked))`` over the sentences
        picked so far. ``sim`` is ``SimilarityJaccard.calculateSimilarity``
        and ``redundancy`` blends that similarity with the candidate's entry
        in the dictionary returned by ``self.computePositions``:
        ``self.beta * sim + (1 - self.beta) * position``.
        Selection stops as soon as no candidate is left, so the result never
        contains the same sentence twice because the pool ran dry.

        Side effect: sets ``self.beta`` to 0.5.

        :param question: passed to ``self.getSentences``; its ``snippets`` are
            handed to ``self.computePositions`` and its ``tokens`` are
            compared with every candidate's ``tokens``.
        :return: list of the selected sentence objects, in selection order.
        """
        self.logger.info('SoftMMR ranking question')
        # Presumably positional scores keyed by sentence -- confirm against
        # computePositions.
        pos_dict = self.computePositions(question.snippets)
        self.beta = 0.5
        similarity = SimilarityJaccard()
        selectedSentences = []
        sentences = self.getSentences(question)
        for _ in range(self.numSelectedSentences):
            # Reset every round; otherwise an exhausted candidate pool would
            # leave the previous winner in place and it would be re-selected.
            current_best = None
            best_sim = -99999999
            for sentence in sentences:
                ques_sim = similarity.calculateSimilarity(
                    sentence.tokens, question.tokens)
                # Until something is picked this stays at the sentinel, which
                # offsets every candidate's score by the same amount.
                max_sent_sim = -99999999
                for other in selectedSentences:
                    sim = similarity.calculateSimilarity(
                        sentence.tokens, other.tokens)
                    if self.beta != 0:
                        try:
                            pos_score = pos_dict[sentence]
                        except KeyError:
                            # NOTE(review): this fallback only works when the
                            # sentence is a string; candidates here expose
                            # ``.tokens``, so confirm it is still needed.
                            pos_score = pos_dict[sentence.lstrip().rstrip()]
                        current_sent_sim = (self.beta * sim) + (
                            (1 - self.beta) * pos_score)
                    else:
                        current_sent_sim = sim
                    if current_sent_sim > max_sent_sim:
                        max_sent_sim = current_sent_sim
                # MMR: reward similarity to the question, penalise similarity
                # to sentences that were already selected.
                final_sim = (
                    (1 - self.alpha) * ques_sim) - (self.alpha * max_sent_sim)
                if final_sim > best_sim:
                    best_sim = final_sim
                    current_best = sentence
            if current_best is None:
                break
            selectedSentences.append(current_best)
            # Drop what was picked while keeping the remaining candidates in
            # their original order, so ties are broken deterministically.
            sentences = [s for s in sentences if s not in selectedSentences]
        self.logger.info('Computed Soft Constrained MMR')
        return selectedSentences
예제 #3
0
    def getRankedList(self, question):
        """Greedily pick up to ``self.numSelectedSentences`` sentences with MMR.

        Every round scores each remaining candidate as
        ``(1 - self.alpha) * sim(candidate.tokens, question.tokens)
        - self.alpha * max(sim(candidate.tokens, picked.tokens))`` over the
        sentences picked so far, where ``sim`` is
        ``SimilarityJaccard.calculateSimilarity_token_input``, and keeps the
        top scorer. Selection stops as soon as no candidate is left, so the
        result never contains the same sentence twice because the pool ran
        dry.

        Side effects: sets ``self.beta`` to 0 and prints the candidate
        sentences and their count to stdout.

        :param question: passed to ``self.getSentences``; its ``tokens`` are
            compared with every candidate's ``tokens``.
        :return: list with the ``text`` of each selected sentence, in
            selection order.
        """
        self.logger.info("CoreMMR: ranking question")
        self.logger.debug('Question type is a ' + str(type(question)))
        self.logger.debug(question)
        # beta == 0 makes the redundancy term below plain sentence similarity.
        self.beta = 0
        similarity = SimilarityJaccard()
        selectedSentences = []
        best = []  # selected sentence objects; selectedSentences holds text

        sentences = self.getSentences(question)
        # Single-argument parenthesised form prints the same on Python 2 and 3.
        print(sentences)
        print(len(sentences))

        for _ in range(self.numSelectedSentences):
            # Reset every round; otherwise an exhausted candidate pool would
            # leave the previous winner in place and it would be re-selected.
            current_best = None
            best_sim = -99999999
            for sentence in sentences:
                ques_sim = similarity.calculateSimilarity_token_input(
                    sentence.tokens, question.tokens)
                # Until something is picked this stays at the sentinel, which
                # offsets every candidate's score by the same amount.
                max_sent_sim = -99999999
                for other in best:
                    sim = similarity.calculateSimilarity_token_input(
                        sentence.tokens, other.tokens)
                    if self.beta != 0:
                        try:
                            pos_score = self.pos_dict[sentence]
                        except KeyError:
                            # NOTE(review): this fallback only works when the
                            # sentence is a string; candidates here expose
                            # ``.tokens``, so confirm it is still needed.
                            stripped = sentence.lstrip().rstrip()
                            self.logger.info('Looking for Sentence: ' +
                                             str(stripped) +
                                             'in positional dictionary')
                            pos_score = self.pos_dict[stripped]
                        current_sent_sim = (self.beta * sim) + (
                            (1 - self.beta) * pos_score)
                    else:
                        current_sent_sim = sim
                    if current_sent_sim > max_sent_sim:
                        max_sent_sim = current_sent_sim
                # MMR: reward similarity to the question, penalise similarity
                # to sentences that were already selected.
                final_sim = (
                    (1 - self.alpha) * ques_sim) - (self.alpha * max_sent_sim)
                if final_sim > best_sim:
                    best_sim = final_sim
                    current_best = sentence
            if current_best is None:
                break
            best.append(current_best)
            selectedSentences.append(current_best.text)
            # Drop what was picked while keeping the remaining candidates in
            # their original order; unlike a set difference this needs no
            # hashable sentences and breaks ties deterministically.
            sentences = [s for s in sentences if s not in best]

        self.logger.info('Performed Core MMR')
        return selectedSentences
예제 #4
0
    def getRankedList(self, question):
        """Greedily pick up to ``self.numSelectedSentences`` sentences with MMR.

        The free name ``debug`` (not defined in this method; presumably a
        module-level flag) selects the mode:

        * falsy: candidates come from ``self.getSentences`` and are compared
          with ``question.body`` through
          ``SimilarityJaccard.calculateSimilarity``; the selected candidates
          themselves are returned.
        * truthy: candidates come from ``self.getTokenizedSentences`` and
          their ``tokens`` are compared with ``question.tokens`` through
          ``SimilarityJaccard.calculateTokenSimilarity``; the ``text`` of
          each selected candidate is returned.

        Every round scores each remaining candidate as
        ``(1 - self.alpha) * sim(candidate, question)
        - self.alpha * max(sim(candidate, picked))`` over the sentences
        picked so far and keeps the top scorer. Selection stops as soon as no
        candidate is left, so the result never contains the same sentence
        twice because the pool ran dry.

        Side effects: sets ``self.beta`` to 0 and prints every
        question-similarity value to stdout.

        :param question: source of the candidates and of the ``body`` /
            ``tokens`` each candidate is compared with.
        :return: list of the selected sentences (or their ``text``), in
            selection order.
        """
        self.logger.info("CoreMMR: ranking question")
        self.logger.debug('Question type is a ' + str(type(question)))
        self.logger.debug(question)
        # beta == 0 makes the redundancy term below plain sentence similarity.
        self.beta = 0
        similarity = SimilarityJaccard()
        selectedSentences = []
        best = []  # selected candidates; selectedSentences may hold their text
        use_tokens = debug
        if use_tokens:
            sentences = self.getTokenizedSentences(question)
        else:
            sentences = self.getSentences(question)

        for _ in range(self.numSelectedSentences):
            # Reset every round; otherwise an exhausted candidate pool would
            # leave the previous winner in place and it would be re-selected.
            current_best = None
            best_sim = -99999999
            for sentence in sentences:
                # Single-argument parenthesised print behaves the same on
                # Python 2 and 3.
                if use_tokens:
                    ques_sim = similarity.calculateTokenSimilarity(
                        sentence.tokens, question.tokens)
                    print("Debug on: {}".format(ques_sim))
                else:
                    ques_sim = similarity.calculateSimilarity(
                        sentence, question.body)
                    print("Origin on: {}".format(ques_sim))

                # Until something is picked this stays at the sentinel, which
                # offsets every candidate's score by the same amount.
                max_sent_sim = -99999999
                for other in best:
                    if use_tokens:
                        sim = similarity.calculateTokenSimilarity(
                            sentence.tokens, other.tokens)
                    else:
                        sim = similarity.calculateSimilarity(sentence, other)

                    if self.beta != 0:
                        try:
                            pos_score = self.pos_dict[sentence]
                        except KeyError:
                            # Retry with surrounding whitespace removed; this
                            # only works for string candidates.
                            stripped = sentence.lstrip().rstrip()
                            self.logger.info('Looking for Sentence: ' +
                                             str(stripped) +
                                             'in positional dictionary')
                            pos_score = self.pos_dict[stripped]
                        current_sent_sim = (self.beta * sim) + (
                            (1 - self.beta) * pos_score)
                    else:
                        current_sent_sim = sim
                    if current_sent_sim > max_sent_sim:
                        max_sent_sim = current_sent_sim
                # MMR: reward similarity to the question, penalise similarity
                # to sentences that were already selected.
                final_sim = (
                    (1 - self.alpha) * ques_sim) - (self.alpha * max_sent_sim)
                if final_sim > best_sim:
                    best_sim = final_sim
                    current_best = sentence
            if current_best is None:
                break
            best.append(current_best)
            if use_tokens:
                selectedSentences.append(current_best.text)
            else:
                selectedSentences.append(current_best)
            # Drop what was picked while keeping the remaining candidates in
            # their original order, so ties are broken deterministically.
            sentences = [s for s in sentences if s not in best]

        self.logger.info('Performed Core MMR')
        return selectedSentences