Ejemplo n.º 1
0
    def _initDoc(self):
        """Parse self.text with SpaCy and build the per-sentence objects.

        Fills in self.spacyDoc, self.tokens, self.topSentencesText and
        self.sentences (one Sentence object per SpaCy sentence).
        """
        self.spacyDoc = nlp(self.text)
        self.tokens = [token.text for token in self.spacyDoc]
        # textrank picks the most salient sentences of the document:
        topSents = self.spacyDoc._.textrank.summary(
            limit_phrases=20, limit_sentences=NUMBER_OF_TOP_SENTENCES_KEPT)
        self.topSentencesText = [s.text for s in topSents]

        # sentence tokenization done with SpaCy - for consistency within all variants

        if self.representationStyle != REPRESENTATION_STYLE_SPACY:
            # the normal path: each Sentence computes its own representation
            # inside its constructor, as correct code should:
            self.sentences = [
                Sentence(self.id, idx, spacySent.text,
                         self.representationStyle)
                for idx, spacySent in enumerate(self.spacyDoc.sents)
            ]
        else:
            # computing a SpaCy object per sentence is expensive, so reuse the
            # per-sentence vector from the already-parsed document instead:
            self.sentences = []
            for idx, spacySent in enumerate(self.spacyDoc.sents):
                sentObj = Sentence(self.id,
                                   idx,
                                   spacySent.text,
                                   self.representationStyle,
                                   doNotInitRepresentation=True)
                sentObj.setRepresentation(spacySent.vector)
                self.sentences.append(sentObj)
Ejemplo n.º 2
0
    def _getQuerySummaryText(self, query, numSentencesNeeded, sentences):
        """Build a query-focused summary of up to numSentencesNeeded sentences.

        Algorithm:
          - vectorize the query as a Sentence object
          - rank every candidate sentence by similarity to the query
          - greedily take the most similar sentences, skipping any that were
            already used or are redundant with those chosen so far (and
            sentences appearing in previous summaries)

        Returns (listOfSentenceTexts, listOfSentenceIds, numWordsInSummary).
        """
        if self._noMoreSentences():
            return ["NO MORE INFORMATION."], [], 0

        # an empty query falls back to a generic (non-query) summary:
        if query == '':
            return self._getNextGeneralSentences(numSentencesNeeded * 20)

        # wrap the query string in a Sentence object so it can be compared:
        queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries), query,
                                   self.corpus.representationStyle)

        # rank candidates from most to least similar to the query:
        rankedCandidates = sorted(
            ((candidate, queryAsSentence.similarity(candidate))
             for candidate in self.allSentencesForPotentialSummaries),
            key=operator.itemgetter(1),
            reverse=True)

        # greedily take non-used, non-redundant sentences until we have enough:
        sentencesUsing = []
        for candidate, _score in rankedCandidates:
            alreadyUsed = (candidate.sentId in self.usedSentences
                           or candidate.textCompressed
                           in self.usedSentencesText)
            if alreadyUsed or self._isRedundant(candidate, sentencesUsing):
                continue
            sentencesUsing.append(candidate)
            self.usedSentences[candidate.sentId] = candidate
            self.usedSentencesText[candidate.textCompressed] = candidate.sentId
            if len(sentencesUsing) == numSentencesNeeded:
                break

        # also report the length in words of the returned summary:
        summaryLength = sum(len(sent) for sent in sentencesUsing)
        summaryTexts = [sent.text for sent in sentencesUsing]
        summaryIds = [sent.sentId for sent in sentencesUsing]
        return summaryTexts, summaryIds, summaryLength
Ejemplo n.º 3
0
    def _getQuerySummaryText(self, query, numSentencesNeeded, sentences):
        """Build a query-focused summary using MMR (Maximal Marginal Relevance).

        Algorithm:
          - vectorize the query as a Sentence object
          - score every candidate sentence with MMRScore against the query and
            the already-used sentences
          - repeatedly take the top-scoring unused sentence, updating the
            redundancy term of the remaining candidates against the sentence
            chosen last

        Returns (listOfSentenceTexts, listOfSentenceIds, numWordsInSummary).
        """
        if self._noMoreSentences():
            return ["NO MORE INFORMATION."], [], 0

        # an empty query falls back to a generic (non-query) summary:
        if query == '':
            finalSummaryTxtList, finalSummaryIds, numWordsInSummary = self._getNextGeneralSentences(
                numSentencesNeeded * 20)
            return finalSummaryTxtList, finalSummaryIds, numWordsInSummary

        # make a sentence object for the query:
        queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries), query,
                                   self.corpus.representationStyle)

        # get an ordered list of sentences based on its MMR score:
        lambta = 0.5  # MMR trade-off between query relevance and novelty
        usedSentencesList = list(self.usedSentences.values())
        sentenceMMRScores = [
            (sentence, ) +
            MMRScore(sentence, queryAsSentence, usedSentencesList,
                     lambta)  # [(sent, mmrscore, sim1, sim2)]
            for sentence in self.allSentencesForPotentialSummaries
        ]
        sentencesUsing = []
        # BUGFIX: the loop condition was `<=`, which collected one sentence
        # more than requested (the sibling variant of this method stops at
        # exactly numSentencesNeeded).
        while len(sentencesUsing) < numSentencesNeeded:
            if len(sentencesUsing) > 0:
                # update the redundancy term (sim2) of each remaining
                # candidate's MMR score against the last added sentence:
                for index, sentMMR in enumerate(sentenceMMRScores):
                    newSim2 = sentMMR[0].similarity(sentencesUsing[-1])
                    if newSim2 > sentMMR[3]:
                        mmrScore = lambta * sentMMR[2] - (1 - lambta) * newSim2
                        sentenceMMRScores[index] = (sentMMR[0], mmrScore,
                                                    sentMMR[2], newSim2)

            sentenceMMRScores.sort(key=operator.itemgetter(1), reverse=True)
            # take the best-scoring sentence not yet used:
            addedSentence = False
            for index, (sentence, _, _, _) in enumerate(sentenceMMRScores):
                if sentence.sentId not in self.usedSentences and sentence.textCompressed not in self.usedSentencesText:
                    sentencesUsing.append(sentence)
                    self.usedSentences[sentence.sentId] = sentence
                    self.usedSentencesText[
                        sentence.textCompressed] = sentence.sentId
                    sentenceMMRScores.pop(index)
                    addedSentence = True
                    break
            # BUGFIX: without this guard the loop spins forever once every
            # remaining candidate has already been used (or none are left):
            if not addedSentence:
                break

        # return also the length in words of the returned summary:
        summaryLength = sum(len(sent) for sent in sentencesUsing)

        return [sent.text for sent in sentencesUsing
                ], [sent.sentId for sent in sentencesUsing], summaryLength
Ejemplo n.º 4
0
    def _getSemanticSimilarityScores(self, query, sentencesToCompareTo):
        """Score the semantic similarity of each given sentence to the query.

        Args:
            query: the query string given by the user.
            sentencesToCompareTo: list of sentence objects to score.

        Returns:
            A dictionary {metric: {sentenceId: score}} keyed by the single
            representation style defined by the corpus
            (self.corpus.representationStyle); sentence IDs are those of
            sentencesToCompareTo.
        """
        # wrap the query string in a Sentence object so it can be compared:
        queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries), query,
                                   self.corpus.representationStyle)
        scoresById = {}
        for sentence in sentencesToCompareTo:
            scoresById[sentence.sentId] = sentence.similarity(queryAsSentence)
        return {self.corpus.representationStyle: scoresById}
Ejemplo n.º 5
0
    def _getNextGeneralSentences(self, desiredWordCount):
        """Build a generic (non-query) summary of roughly desiredWordCount words.

        Two strategies, selected by self.isGenericClustering:
          - clustering: cycle round-robin over the ordered sentence clusters,
            taking the best unused sentence of each cluster in turn;
          - MMR: treat the corpus top words as a pseudo-query, score every
            candidate sentence with MMRScore, and greedily pick the highest
            scoring unused one, re-scoring the rest after each pick.

        Returns (finalSummaryTxtList, finalSummaryIds, numWordsInSummary).
        """
        # concatenate sentences until the word limit is up:
        numWordsInSummary = 0
        finalSummaryTxtList = []
        finalSummaryIds = []
        if self.isGenericClustering:
            while numWordsInSummary < desiredWordCount and not self._noMoreSentences(
            ):
                # get the next index to use in the sentenceClusterLabelsOrdered list (loop back to the beginning):
                self.sentenceClusterIndexLast = (
                    self.sentenceClusterIndexLast + 1) % len(
                        self.sentenceClusterLabelsOrdered)
                # get the index of the cluster to use now:
                curClusterLabel = self.sentenceClusterLabelsOrdered[
                    self.sentenceClusterIndexLast]
                # get the best sentence in that cluster:
                bestSentenceInCluster = self._getBestSentence(
                    self.allSentencesForPotentialSummaries,
                    self.sentenceClusters[curClusterLabel], self.corpus)
                # append the chosen sentence to the summary:
                # (None presumably means the cluster has no usable sentence
                # left — verify against _getBestSentence)
                if bestSentenceInCluster != None:
                    finalSummaryTxtList.append(bestSentenceInCluster.text)
                    finalSummaryIds.append(bestSentenceInCluster.sentId)
                    numWordsInSummary += len(bestSentenceInCluster)
                    # mark the sentence as used (by id and by compressed text)
                    # so later summaries do not repeat it:
                    self.usedSentences[
                        bestSentenceInCluster.sentId] = bestSentenceInCluster
                    self.usedSentencesText[
                        bestSentenceInCluster.
                        textCompressed] = bestSentenceInCluster.sentId

        else:
            # now create MMR-based generic summary
            topWords = self._findTopWords()
            # the top corpus words act as the "query" for MMR scoring:
            queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries),
                                       " ".join(topWords),
                                       self.corpus.representationStyle)

            # get an ordered list of sentences based on its MMR score:
            lambta = 0.5  # MMR trade-off between relevance and novelty
            usedSentencesList = []
            sentenceMMRScores = [
                (sentence, ) +
                MMRScore(sentence, queryAsSentence, usedSentencesList,
                         lambta)  # [(sent, mmrscore, sim1, sim2)]
                for sentence in self.allSentencesForPotentialSummaries
            ]
            sentencesUsing = []
            # NOTE(review): if a pass through the for-loop below adds nothing,
            # this while relies on _noMoreSentences() to terminate — confirm
            # it returns True once all candidates are used.
            while numWordsInSummary < desiredWordCount and not self._noMoreSentences(
            ):
                if len(sentencesUsing) > 0:
                    ## take the last added sentence and update the mmr score for the rest of the sentences
                    for index, sentMMR in enumerate(sentenceMMRScores):
                        newSim2 = sentMMR[0].similarity(sentencesUsing[-1])
                        if newSim2 > sentMMR[3]:
                            mmrScore = lambta * sentMMR[2] - (1 -
                                                              lambta) * newSim2
                            sentenceMMRScores[index] = (sentMMR[0], mmrScore,
                                                        sentMMR[2], newSim2)

                sentenceMMRScores.sort(key=operator.itemgetter(1),
                                       reverse=True)
                # keep taking most query-similar, non-redundant sentences until we have enough:
                for index, (sentence, _, _, _) in enumerate(sentenceMMRScores):
                    if sentence.sentId not in self.usedSentences and sentence.textCompressed not in self.usedSentencesText:
                        sentencesUsing.append(sentence)
                        finalSummaryTxtList.append(sentence.text)
                        finalSummaryIds.append(sentence.sentId)
                        numWordsInSummary += len(sentence)
                        self.usedSentences[sentence.sentId] = sentence
                        self.usedSentencesText[
                            sentence.textCompressed] = sentence.sentId
                        sentenceMMRScores.pop(index)
                        break

        return finalSummaryTxtList, finalSummaryIds, numWordsInSummary