def _initDoc(self):
    """Parse the document text with SpaCy and build the per-sentence Sentence objects."""
    self.spacyDoc = nlp(self.text)
    self.tokens = [tok.text for tok in self.spacyDoc]
    # top sentences picked by the TextRank pipeline component:
    self.topSentencesText = [
        s.text for s in self.spacyDoc._.textrank.summary(
            limit_phrases=20, limit_sentences=NUMBER_OF_TOP_SENTENCES_KEPT)
    ]
    # sentence tokenization done with SpaCy - for consistency within all variants
    if self.representationStyle == REPRESENTATION_STYLE_SPACY:
        # computing a SpaCy object per sentence is time consuming, so reuse
        # the vector already available on each sentence span of the doc:
        self.sentences = []
        for idx, span in enumerate(self.spacyDoc.sents):
            sentObj = Sentence(self.id, idx, span.text,
                               self.representationStyle,
                               doNotInitRepresentation=True)
            sentObj.setRepresentation(span.vector)
            self.sentences.append(sentObj)
    # in all other cases, and as it should be for correct code, the
    # representations are computed within the Sentence object:
    else:
        self.sentences = [
            Sentence(self.id, idx, span.text, self.representationStyle)
            for idx, span in enumerate(self.spacyDoc.sents)
        ]
def _getQuerySummaryText(self, query, numSentencesNeeded, sentences):
    """
    Return a query-focused summary as (texts, sentence ids, word count).

    Algorithm: vectorize the query as a Sentence, rank all candidate
    sentences by similarity to it, then greedily take the most similar
    sentences that are neither already used in previous summaries nor
    redundant with the ones picked so far.
    """
    if self._noMoreSentences():
        return ["NO MORE INFORMATION."], [], 0
    # an empty query falls back to a generic (non-query) summary,
    # with a word budget heuristically derived from the sentence count:
    if query == '':
        return self._getNextGeneralSentences(numSentencesNeeded * 20)
    # make a sentence object for the query:
    queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries), query,
                               self.corpus.representationStyle)
    # rank candidates by similarity to the query, most similar first:
    ranked = sorted(
        ((cand, queryAsSentence.similarity(cand))
         for cand in self.allSentencesForPotentialSummaries),
        key=operator.itemgetter(1), reverse=True)
    # greedily take query-similar, unused, non-redundant sentences:
    chosen = []
    for cand, _ in ranked:
        isUnused = (cand.sentId not in self.usedSentences
                    and cand.textCompressed not in self.usedSentencesText)
        if isUnused and not self._isRedundant(cand, chosen):
            chosen.append(cand)
            self.usedSentences[cand.sentId] = cand
            self.usedSentencesText[cand.textCompressed] = cand.sentId
            if len(chosen) == numSentencesNeeded:
                break
    # report also the length in words of the returned summary:
    summaryLength = sum(len(s) for s in chosen)
    return ([s.text for s in chosen],
            [s.sentId for s in chosen],
            summaryLength)
def _getQuerySummaryText(self, query, numSentencesNeeded, sentences):
    """
    Return a query-focused summary chosen by MMR as (texts, ids, word count).

    Algorithm: vectorize the query as a Sentence, score every candidate
    sentence with MMR (relevance to the query minus redundancy against the
    already-selected set), and greedily take the top-scoring unused
    sentences, rescoring the redundancy term after each selection.
    """
    if self._noMoreSentences():
        return ["NO MORE INFORMATION."], [], 0
    if query == '':
        # no query given: fall back to a generic summary
        # (word budget heuristically derived from the sentence count):
        finalSummaryTxtList, finalSummaryIds, numWordsInSummary = \
            self._getNextGeneralSentences(numSentencesNeeded * 20)
        return finalSummaryTxtList, finalSummaryIds, numWordsInSummary
    # make a sentence object for the query:
    queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries), query,
                               self.corpus.representationStyle)
    lambta = 0.5  # MMR trade-off between query relevance and redundancy
    usedSentencesList = list(self.usedSentences.values())
    # each entry is (sentence, mmrScore, simToQuery, maxSimToSelected):
    sentenceMMRScores = [
        (sentence, ) + MMRScore(sentence, queryAsSentence,
                                usedSentencesList, lambta)
        for sentence in self.allSentencesForPotentialSummaries
    ]
    sentencesUsing = []
    # BUGFIX: the original condition was `<=`, which returned one sentence
    # MORE than requested (off-by-one vs. the non-MMR variant).
    while len(sentencesUsing) < numSentencesNeeded:
        if sentencesUsing:
            # refresh the redundancy term against the most recently added
            # sentence; sim2 tracks the MAX similarity to the selected set:
            for index, sentMMR in enumerate(sentenceMMRScores):
                newSim2 = sentMMR[0].similarity(sentencesUsing[-1])
                if newSim2 > sentMMR[3]:
                    mmrScore = lambta * sentMMR[2] - (1 - lambta) * newSim2
                    sentenceMMRScores[index] = (sentMMR[0], mmrScore,
                                                sentMMR[2], newSim2)
        sentenceMMRScores.sort(key=operator.itemgetter(1), reverse=True)
        # take the best-scoring sentence that was not yet used anywhere:
        addedThisPass = False
        for index, (sentence, _, _, _) in enumerate(sentenceMMRScores):
            if (sentence.sentId not in self.usedSentences
                    and sentence.textCompressed not in self.usedSentencesText):
                sentencesUsing.append(sentence)
                self.usedSentences[sentence.sentId] = sentence
                self.usedSentencesText[sentence.textCompressed] = sentence.sentId
                sentenceMMRScores.pop(index)
                addedThisPass = True
                break
        if not addedThisPass:
            # BUGFIX: no eligible candidate remains; without this guard the
            # loop state could never change again and it spun forever.
            break
    # report also the length in words of the returned summary:
    summaryLength = sum(len(sent) for sent in sentencesUsing)
    return ([sent.text for sent in sentencesUsing],
            [sent.sentId for sent in sentencesUsing],
            summaryLength)
def _getSemanticSimilarityScores(self, query, sentencesToCompareTo):
    """
    Compute semantic similarity between the query and the given sentences.

    :param query: the query string given by the user
    :param sentencesToCompareTo: list of sentence objects
    :return: {metric: {sentenceId: score}} for the representation style
        defined by the corpus (self.corpus.representationStyle); the
        sentence IDs are those of sentencesToCompareTo.
    """
    queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries), query,
                               self.corpus.representationStyle)
    scoresById = {}
    for sent in sentencesToCompareTo:
        scoresById[sent.sentId] = sent.similarity(queryAsSentence)
    return {self.corpus.representationStyle: scoresById}
def _getNextGeneralSentences(self, desiredWordCount):
    """
    Build a generic (non-query) summary by concatenating sentences until
    the word budget is exhausted.

    When self.isGenericClustering is set, sentences are taken round-robin
    from the sentence clusters (best unused sentence per cluster); otherwise
    an MMR-based summary is built against the document's top words used as
    a pseudo-query.

    :param desiredWordCount: target summary length in words
    :return: (list of sentence texts, list of sentence ids, word count)
    """
    numWordsInSummary = 0
    finalSummaryTxtList = []
    finalSummaryIds = []
    if self.isGenericClustering:
        # BUGFIX guard: count clusters tried in a row without a pick, so we
        # cannot spin forever when every cluster is exhausted but
        # _noMoreSentences() still reports False.
        consecutiveMisses = 0
        while (numWordsInSummary < desiredWordCount
               and not self._noMoreSentences()):
            # advance to the next cluster index (wrap to the beginning):
            self.sentenceClusterIndexLast = (
                (self.sentenceClusterIndexLast + 1)
                % len(self.sentenceClusterLabelsOrdered))
            curClusterLabel = self.sentenceClusterLabelsOrdered[
                self.sentenceClusterIndexLast]
            # get the best sentence in that cluster:
            bestSentenceInCluster = self._getBestSentence(
                self.allSentencesForPotentialSummaries,
                self.sentenceClusters[curClusterLabel], self.corpus)
            # append the chosen sentence to the summary:
            if bestSentenceInCluster is not None:
                consecutiveMisses = 0
                finalSummaryTxtList.append(bestSentenceInCluster.text)
                finalSummaryIds.append(bestSentenceInCluster.sentId)
                numWordsInSummary += len(bestSentenceInCluster)
                self.usedSentences[
                    bestSentenceInCluster.sentId] = bestSentenceInCluster
                self.usedSentencesText[
                    bestSentenceInCluster.textCompressed] = \
                    bestSentenceInCluster.sentId
            else:
                consecutiveMisses += 1
                if consecutiveMisses >= len(self.sentenceClusterLabelsOrdered):
                    # every cluster failed to yield a sentence this round;
                    # nothing can change anymore, so stop.
                    break
    else:
        # MMR-based generic summary: use the document's top words as a
        # pseudo-query to score sentence relevance.
        topWords = self._findTopWords()
        queryAsSentence = Sentence(QUERY_DOC_ALIAS, len(self.queries),
                                   " ".join(topWords),
                                   self.corpus.representationStyle)
        lambta = 0.5  # MMR trade-off between relevance and redundancy
        usedSentencesList = []
        # each entry is (sentence, mmrScore, simToQuery, maxSimToSelected):
        sentenceMMRScores = [
            (sentence, ) + MMRScore(sentence, queryAsSentence,
                                    usedSentencesList, lambta)
            for sentence in self.allSentencesForPotentialSummaries
        ]
        sentencesUsing = []
        while (numWordsInSummary < desiredWordCount
               and not self._noMoreSentences()):
            if len(sentencesUsing) > 0:
                # refresh the redundancy term against the last added
                # sentence; sim2 tracks MAX similarity to the selected set:
                for index, sentMMR in enumerate(sentenceMMRScores):
                    newSim2 = sentMMR[0].similarity(sentencesUsing[-1])
                    if newSim2 > sentMMR[3]:
                        mmrScore = lambta * sentMMR[2] - (1 - lambta) * newSim2
                        sentenceMMRScores[index] = (sentMMR[0], mmrScore,
                                                    sentMMR[2], newSim2)
            sentenceMMRScores.sort(key=operator.itemgetter(1), reverse=True)
            # take the best-scoring sentence that was not yet used anywhere:
            addedThisPass = False
            for index, (sentence, _, _, _) in enumerate(sentenceMMRScores):
                if (sentence.sentId not in self.usedSentences
                        and sentence.textCompressed not in self.usedSentencesText):
                    sentencesUsing.append(sentence)
                    finalSummaryTxtList.append(sentence.text)
                    finalSummaryIds.append(sentence.sentId)
                    numWordsInSummary += len(sentence)
                    self.usedSentences[sentence.sentId] = sentence
                    self.usedSentencesText[
                        sentence.textCompressed] = sentence.sentId
                    sentenceMMRScores.pop(index)
                    addedThisPass = True
                    break
            if not addedThisPass:
                # BUGFIX: no eligible candidate left; without this guard the
                # loop state could never change again and it spun forever.
                break
    return finalSummaryTxtList, finalSummaryIds, numWordsInSummary