def get_features_by_coverage(self, k=None, **kwargs):
    """
    Returns a list of top covering features.

    By default, the features are post-processed through the
    post_features_item method.

    Parameters
    ----------
    k: int, optional
        Number of documents to output. If not set, k is automatically
        computed using the max_k and target_k runtime parameters.
    kwargs: dict, optional
        Custom runtime parameters.

    Returns
    -------
    list
    """
    p = self.parameters(**kwargs)
    # Remember whether post-processing was requested, then disable it for
    # the intermediate cluster computation below.
    apply_post = p['post']
    if k is None:
        k = auto_k(data=self.diteration.y_relevance,
                   order=self.diteration.y_order,
                   max_k=p['max_k'],
                   target=p['target_k'])
    p['post'] = False
    # Build a cluster over a stretched candidate pool, then keep the k
    # best entries in covering order.
    cluster = self.get_features_by_cluster(k=int(k * p['stretch']), **p)
    indices = covering_order(cluster, wide=p['wide'])[:k]
    if not apply_post:
        return indices
    return [self.post_features_item(self, i) for i in indices]
def get_features_by_rank(self, k=None, **kwargs):
    """
    Returns a list of top features according to the current ranking.

    By default, the features are post-processed through the
    post_features_item method.

    Parameters
    ----------
    k: int, optional
        Number of documents to output. If not set, k is automatically
        computed using the max_k and target_k runtime parameters.
    kwargs: dict, optional
        Custom runtime parameters.

    Returns
    -------
    list
    """
    p = self.parameters(**kwargs)
    if k is None:
        k = auto_k(data=self.diteration.y_relevance,
                   order=self.diteration.y_order,
                   max_k=p['max_k'],
                   target=p['target_k'])
    top = self.diteration.y_order[:k]
    if not p['post']:
        return top
    return [self.post_features_item(self, i) for i in top]
def get_features_by_cluster(self, k=None, **kwargs):
    """
    Returns a cluster of the best ranked features.

    The cluster is by default post-processed through the
    post_features_cluster method.

    Parameters
    ----------
    k: int, optional
        Number of documents to output. If not set, k is automatically
        computed using the max_k and target_k runtime parameters.
    kwargs: dict, optional
        Custom runtime parameters.

    Returns
    -------
    object
    """
    p = self.parameters(**kwargs)
    if k is None:
        k = auto_k(data=self.diteration.y_relevance,
                   order=self.diteration.y_order,
                   max_k=p['max_k'],
                   target=p['target_k'])
    head = self.diteration.y_order[:k]
    # Delegate cluster construction (and post-processing) to the
    # indices-based variant.
    return self.get_features_by_cluster_from_indices(head, **kwargs)
def get_landmarks_by_rank(self, reference, k=None, base=None, **kwargs):
    """
    Returns the top landmarks, ordered by decreasing similarity to a
    reference direction.

    By default, the landmarks are post-processed through the post_item
    method.

    Parameters
    ----------
    reference: object
        Source of the ranking direction (e.g. a Gismo, Cluster,
        Landmarks, numpy.ndarray or scipy.sparse.csr_matrix —
        see get_direction).
    k: int, optional
        Number of landmarks to output. If not set, k is automatically
        computed using the max_k and target_k runtime parameters.
    base: optional
        Embedding base to compare against. If not set, it is derived
        from the 'balance' runtime parameter.
    kwargs: dict, optional
        Custom runtime parameters.

    Returns
    -------
    list

    Raises
    ------
    TypeError
        If the direction derived from `reference` is neither a
        numpy.ndarray nor a scipy.sparse.csr_matrix.
    """
    p = self.parameters(**kwargs)
    if base is None:
        base = self.get_base(p['balance'])
    direction = get_direction(reference, p['balance'])
    if isinstance(direction, np.ndarray):
        similarities = base.dot(direction)
    elif isinstance(direction, csr_matrix):
        similarities = np.squeeze(base.dot(direction.T).toarray())
    else:
        log.error(
            "Direction type not supported. Direction must be gismo.gismo.Gismo, gismo.clustering.Cluster, "
            "gismo.landmarks.Landmarks, numpy.ndarray or scipy.sparse.csr_matrix."
        )
        # Fix: the original set `similarities = None` and fell through to
        # np.argsort(-similarities), crashing with a cryptic
        # "bad operand type for unary -: 'NoneType'". Fail fast with an
        # explicit error instead (same exception type callers would have
        # seen, but with a meaningful message).
        raise TypeError(
            f"Unsupported direction type: {type(direction).__name__}"
        )
    order = np.argsort(-similarities)
    if k is None:
        k = auto_k(data=similarities,
                   order=order,
                   max_k=p['max_k'],
                   target=p['target_k'])
    if p['post']:
        return [self.post_item(self, i) for i in order[:k]]
    else:
        return order[:k]
def summarize(self, query="", **kwargs):
    """
    Performs a full run of all summary-related operations:

    - Rank a query at document level, fallback to a generic query if the query fails;
    - Extract sentences from the top documents
    - Order sentences by one of the three methods proposed, *rank*, *coverage*, and *cosine*
    - Apply post-processing and return list of selected sentences.

    Note that calling a :class:`~sisu.summarizer.Summarizer` will call its
    :meth:`~sisu.summarizer.Summarizer.summarize` method.

    Parameters
    ----------
    query: :class:`str`
        Query to run.
    kwargs: :class:`dict`
        Runtime specific parameters
        (see :obj:`~sisu.summarizer.default_summarizer_parameters` for possible arguments).

    Returns
    -------
    :class:`list` of :class:`str`
        Summary.
    """
    # Runtime parameters for this call.
    p = self.parameters(**kwargs)
    # Rank the query at document level (falls back internally on failure).
    self.rank_documents(query=query, num_query=p['num_query'])
    # Extract and preprocess candidate sentences from the top documents.
    self.build_sentence_source(num_documents=p['num_documents'],
                               getter=p['text_getter'],
                               tester=p['sentence_tester'])
    # Compute the sentence ordering.
    if p['order'] == 'cosine':
        self.order_ = cosine_order(self.gismo.embedding.query_projection,
                                   self.sentences_,
                                   self.query_)
    elif p['order'] in {'rank', 'coverage'}:
        self.build_sentence_gismo(itf=p['itf'],
                                  s_g_p=p['sentence_gismo_parameters'])
        self.sentence_gismo_.rank(query)
        if p['num_sentences'] is None:
            p['num_sentences'] = auto_k(
                data=self.sentence_gismo_.diteration.x_relevance,
                order=self.sentence_gismo_.diteration.x_order,
                max_k=self.sentence_gismo_.parameters.max_k,
                target=self.sentence_gismo_.parameters.target_k)
        if p['order'] == 'rank':
            self.order_ = self.sentence_gismo_.diteration.x_order
        else:
            self.order_ = self.build_coverage_order(p['num_sentences'])
    if p['max_chars'] is None:
        # No character budget: keep the top sentences, dropping empty ones.
        candidates = [p['post_processing'](self, i)
                      for i in self.order_[:p['num_sentences']]]
        return [sentence for sentence in candidates if sentence]
    # Character-budgeted mode: greedily accept sentences that fit.
    budget = p['max_chars']
    summary = []
    used = 0
    # Maximal number of sentences that will be processed
    # (crude estimate assuming ~50 characters per sentence).
    cap = int(budget / 50)
    for i in self.order_[:cap]:
        sentence = p['post_processing'](self, i)
        size = len(sentence)
        if size > 0 and used + size < budget:
            summary.append(sentence)
            used += size
            # Stop once the budget is essentially exhausted.
            if used > .98 * budget:
                break
    return summary