Beispiel #1
0
    def get_features_by_coverage(self, k=None, **kwargs):
        """
        Return the top features, ordered by coverage.

        A cluster is first built (with post-processing disabled) on the
        ``int(k * stretch)`` best ranked features. That cluster is then handed
        to ``covering_order`` and the resulting selection is truncated to ``k``
        entries. Unless the ``post`` runtime parameter is falsy, every selected
        index goes through the ``post_features_item`` method.

        Parameters
        ----------
        k: int, optional
            Number of features to output. If not set, k is automatically
            computed using the max_k and target_k runtime parameters.
        kwargs: dict, optional
            Custom runtime parameters.

        Returns
        -------
        list
            Post-processed features, or the raw selection when ``post`` is
            disabled.
        """
        params = self.parameters(**kwargs)
        # Remember the caller's choice before overriding it for the inner call.
        apply_post = params['post']
        if k is None:
            k = auto_k(data=self.diteration.y_relevance,
                       order=self.diteration.y_order,
                       max_k=params['max_k'],
                       target=params['target_k'])
        # The intermediate cluster must stay raw so covering_order can use it.
        params['post'] = False
        stretched_k = int(k * params['stretch'])
        cluster = self.get_features_by_cluster(k=stretched_k, **params)
        selected = covering_order(cluster, wide=params['wide'])[:k]
        if not apply_post:
            return selected
        return [self.post_features_item(self, idx) for idx in selected]
Beispiel #2
0
    def get_features_by_rank(self, k=None, **kwargs):
        """
        Return the top features according to the current ranking.

        The ``k`` first entries of ``self.diteration.y_order`` are selected.
        Unless the ``post`` runtime parameter is falsy, each of them goes
        through the ``post_features_item`` method.

        Parameters
        ----------
        k: int, optional
            Number of features to output. If not set, k is automatically
            computed using the max_k and target_k runtime parameters.
        kwargs: dict, optional
            Custom runtime parameters.

        Returns
        -------
        list
            Post-processed features, or the raw slice of the ranking order
            when ``post`` is disabled.
        """
        params = self.parameters(**kwargs)
        if k is None:
            k = auto_k(data=self.diteration.y_relevance,
                       order=self.diteration.y_order,
                       max_k=params['max_k'],
                       target=params['target_k'])
        best = self.diteration.y_order[:k]
        if not params['post']:
            return best
        return [self.post_features_item(self, idx) for idx in best]
Beispiel #3
0
    def get_features_by_cluster(self, k=None, **kwargs):
        """
        Return a cluster of the best ranked features.

        The ``k`` first entries of ``self.diteration.y_order`` are delegated to
        ``get_features_by_cluster_from_indices``, which builds the result.
        According to the original documentation, the cluster is by default
        post-processed through the post_features_cluster method; that step is
        not performed here but left to the delegate.

        Parameters
        ----------
        k: int, optional
            Number of features to cluster. If not set, k is automatically
            computed using the max_k and target_k runtime parameters.
        kwargs: dict, optional
            Custom runtime parameters.

        Returns
        -------
        object
            Whatever ``get_features_by_cluster_from_indices`` returns.
        """
        params = self.parameters(**kwargs)
        if k is None:
            k = auto_k(data=self.diteration.y_relevance,
                       order=self.diteration.y_order,
                       max_k=params['max_k'],
                       target=params['target_k'])
        best = self.diteration.y_order[:k]
        # The caller's kwargs (not the resolved parameters) are forwarded as is.
        return self.get_features_by_cluster_from_indices(best, **kwargs)
Beispiel #4
0
 def get_landmarks_by_rank(self, reference, k=None, base=None, **kwargs):
     """
     Return the landmarks that are the most similar to a reference.

     The reference is converted into a direction with ``get_direction``.
     Similarities are the dot products between ``base`` and that direction,
     and landmarks are returned by decreasing similarity.

     Parameters
     ----------
     reference: object
         Passed to ``get_direction`` together with the ``balance`` runtime
         parameter. The resulting direction must be a numpy.ndarray or a
         scipy.sparse.csr_matrix.
     k: int, optional
         Number of landmarks to output. If not set, k is automatically
         computed using the max_k and target_k runtime parameters.
     base: object, optional
         Object whose ``dot`` method yields the similarities. If not set,
         ``self.get_base(balance)`` is used.
     kwargs: dict, optional
         Custom runtime parameters.

     Returns
     -------
     list or numpy.ndarray
         Landmarks processed by ``post_item`` if the ``post`` runtime
         parameter is truthy, the ``k`` best indices otherwise.

     Raises
     ------
     TypeError
         If the direction type is not supported.
     """
     p = self.parameters(**kwargs)
     if base is None:
         base = self.get_base(p['balance'])
     direction = get_direction(reference, p['balance'])
     if isinstance(direction, np.ndarray):
         similarities = base.dot(direction)
     elif isinstance(direction, csr_matrix):
         # Sparse product yields a 2D result; flatten it to a 1D array.
         similarities = np.squeeze(base.dot(direction.T).toarray())
     else:
         message = (
             "Direction type not supported. Direction must be gismo.gismo.Gismo, gismo.clustering.Cluster, "
             "gismo.landmarks.Landmarks, numpy.ndarray or scipy.sparse.csr_matrix."
         )
         log.error(message)
         # Fail explicitly instead of crashing later on ``-None``; the
         # exception type matches the one that was raised implicitly before.
         raise TypeError(message)
     # Negate so that argsort yields decreasing similarity.
     order = np.argsort(-similarities)
     if k is None:
         k = auto_k(data=similarities,
                    order=order,
                    max_k=p['max_k'],
                    target=p['target_k'])
     if p['post']:
         return [self.post_item(self, i) for i in order[:k]]
     else:
         return order[:k]
Beispiel #5
0
    def summarize(self, query="", **kwargs):
        """
        Run the whole summarization pipeline for a query:

        - Rank the query at document level (the fallback to a generic query in
          case of failure is handled by ``rank_documents``);
        - Extract sentences from the top documents;
        - Order sentences with one of the three methods proposed, *rank*,
          *coverage*, and *cosine*;
        - Apply post-processing and return the list of selected sentences.

        Note that calling a :class:`~sisu.summarizer.Summarizer` will call its
        :meth:`~sisu.summarizer.Summarizer.summarize` method.

        Parameters
        ----------
        query: :class:`str`
            Query to run.
        kwargs: :class:`dict`
            Runtime specific parameters
            (see :obj:`~sisu.summarizer.default_summarizer_parameters` for possible arguments).

        Returns
        -------
        :class:`list` of :class:`str`
            Summary.
        """
        # Resolve the runtime parameters of this call
        params = self.parameters(**kwargs)
        # Document-level ranking
        self.rank_documents(query=query, num_query=params['num_query'])
        # Sentence extraction and preprocessing
        self.build_sentence_source(num_documents=params['num_documents'],
                                   getter=params['text_getter'],
                                   tester=params['sentence_tester'])
        # Sentence ordering
        ordering = params['order']
        if ordering == 'cosine':
            self.order_ = cosine_order(self.gismo.embedding.query_projection,
                                       self.sentences_, self.query_)
        elif ordering in {'rank', 'coverage'}:
            self.build_sentence_gismo(itf=params['itf'],
                                      s_g_p=params['sentence_gismo_parameters'])
            sentence_gismo = self.sentence_gismo_
            sentence_gismo.rank(query)
            if params['num_sentences'] is None:
                # No explicit size: let auto_k pick one from the sentence ranking
                params['num_sentences'] = auto_k(
                    data=sentence_gismo.diteration.x_relevance,
                    order=sentence_gismo.diteration.x_order,
                    max_k=sentence_gismo.parameters.max_k,
                    target=sentence_gismo.parameters.target_k)
            if ordering == 'rank':
                self.order_ = sentence_gismo.diteration.x_order
            else:
                self.order_ = self.build_coverage_order(params['num_sentences'])

        max_chars = params['max_chars']
        if max_chars is None:
            # Size driven by the number of sentences; empty outputs are dropped
            candidates = [
                params['post_processing'](self, idx)
                for idx in self.order_[:params['num_sentences']]
            ]
            return [text for text in candidates if len(text) > 0]

        # Size driven by a character budget
        summary = []
        used_chars = 0
        # At most one candidate sentence is examined per 50 characters of budget
        candidate_limit = int(max_chars / 50)
        for idx in self.order_[:candidate_limit]:
            text = params['post_processing'](self, idx)
            size = len(text)
            # Keep the sentence only if it is non-empty and fits in the budget
            if size > 0 and used_chars + size < max_chars:
                summary.append(text)
                used_chars += size
            # Stop as soon as the budget is almost (98%) consumed
            if used_chars > .98 * max_chars:
                break
        return summary