Ejemplo n.º 1
0
    def cluster_runner(
            self,
            content: List[str],
            ratio: float = 0.2,
            algorithm: str = 'kmeans',
            use_first: bool = True,
            num_sentences: int = None) -> Tuple[List[str], np.ndarray]:
        """
        Encodes the sentences, clusters the encodings and picks the summary sentences.

        The sentences are encoded with the ``distilroberta-base-paraphrase-v1``
        SentenceTransformer model. The model is built on the first call only,
        kept on the instance, and assigned to ``self.model`` on every call.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Whether to force the first sentence to the front of the
            selection (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization; handed
            to the clustering step unchanged.
        :return: A tuple of the selected sentences and the encodings of *every*
            sentence in ``content`` (not only the selected ones).
        """
        # Building a SentenceTransformer is expensive, so do it once per instance
        # and reuse it on later calls instead of reloading it every time.
        encoder = getattr(self, '_paraphrase_encoder', None)
        if encoder is None:
            encoder = SentenceTransformer("distilroberta-base-paraphrase-v1")
            self._paraphrase_encoder = encoder
        self.model = encoder

        # NOTE(review): with convert_to_tensor=True this is presumably a torch
        # tensor rather than the np.ndarray named in the annotation - confirm.
        hidden = encoder.encode(content, convert_to_tensor=True)

        hidden_args = ClusterFeatures(
            hidden, algorithm,
            random_state=self.random_state).cluster(ratio, num_sentences)

        # Make sure index 0 leads the selection, without duplicating it when the
        # clustering step already put it first.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]

        # The full, unfiltered encodings are returned on purpose.
        return sentences, hidden
    def cluster_runner(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> Tuple[List[str], np.ndarray]:
        """
        Encodes the sentences, clusters the encodings and picks the summary sentences.

        The sentences are encoded with the ``distilbert-base-nli-stsb-mean-tokens``
        SentenceTransformer model. The model is built on the first call only,
        kept on the instance, and assigned to ``self.model`` on every call.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Whether to force the first sentence to the front of the
            selection (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization; handed
            to the clustering step unchanged.
        :return: A tuple of the selected sentences and the encodings of *every*
            sentence in ``content`` (not only the selected ones).
        """
        # Building a SentenceTransformer is expensive, so do it once per instance
        # and reuse it on later calls instead of reloading it every time.
        encoder = getattr(self, '_nli_stsb_encoder', None)
        if encoder is None:
            encoder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
            self._nli_stsb_encoder = encoder
        self.model = encoder

        # NOTE(review): with convert_to_tensor=True this is presumably a torch
        # tensor rather than the np.ndarray named in the annotation - confirm.
        hidden = encoder.encode(content, convert_to_tensor=True)

        hidden_args = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state
        ).cluster(ratio, num_sentences)

        # Make sure index 0 leads the selection, without duplicating it when the
        # clustering step already put it first.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]

        # The full, unfiltered encodings are returned on purpose.
        return sentences, hidden
Ejemplo n.º 3
0
    def cluster_runner(
            self,
            content: List[str],
            ratio: float = 0.2,
            algorithm: str = 'kmeans',
            use_first: bool = True,
            num_sentences: int = None) -> Tuple[List[str], np.ndarray]:
        """
        Embeds the sentences with ``self.model``, clusters the embeddings and
        returns the selected sentences together with their embeddings.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Force the first sentence to the front of the output (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization; handed to the clustering step unchanged.
        :return: A tuple of the selected sentences and an array of their embeddings, in the same order.
        """
        features = self.model(
            content,
            self.hidden,
            self.reduce_option,
            hidden_concat=self.hidden_concat,
        )

        clusterer = ClusterFeatures(
            features, algorithm, random_state=self.random_state)
        chosen = clusterer.cluster(ratio, num_sentences)

        # Index 0 must lead the selection; skip the insert when it already does.
        if use_first and (not chosen or chosen[0] != 0):
            chosen.insert(0, 0)

        picked_sentences = [content[idx] for idx in chosen]
        picked_embeddings = np.asarray([features[idx] for idx in chosen])
        return picked_sentences, picked_embeddings
Ejemplo n.º 4
0
    def cluster_runner(
            self,
            content: List[str],
            ratio: float = 0.2,
            algorithm: str = 'kmeans',
            use_first: bool = True,
            num_sentences: int = None) -> Tuple[List[str], np.ndarray]:
        """
        Embeds the sentences with ``self.model``, clusters the embeddings and
        returns the selected sentences together with their embeddings.

        Besides the return value, the call stores on the instance:
        ``features`` (all embeddings), ``centroids`` (second item of the
        clustering result), ``hidden_args`` (the selected indices, including a
        leading 0 added by ``use_first``) and ``df``, a DataFrame with one row
        per input sentence and the columns ``Id``, ``Weight``, ``Centroid`` and
        ``Sent``.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Force the first sentence to the front of the output (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization; handed to the clustering step unchanged.
        :return: A tuple of the selected sentences and an array of their embeddings, in the same order.
        """
        features = self.model(
            content,
            self.hidden,
            self.reduce_option,
            hidden_concat=self.hidden_concat,
        )
        self.features = features

        clustered = ClusterFeatures(
            features, algorithm,
            random_state=self.random_state).cluster(ratio, num_sentences)
        self.centroids = clustered[1]
        chosen = clustered[0]
        # Stored before the in-place insert below, so the attribute sees it too.
        self.hidden_args = chosen

        # Index 0 must lead the selection; skip the insert when it already does.
        if use_first and (not chosen or chosen[0] != 0):
            chosen.insert(0, 0)

        picked_sentences = [content[idx] for idx in chosen]
        picked_embeddings = np.asarray([features[idx] for idx in chosen])

        # One row per sentence: its index, the Euclidean distance to its nearest
        # centroid, and that centroid's index (first one wins on ties).
        rows = []
        for sent_id, feature in enumerate(self.features):
            distances = [
                np.linalg.norm(feature - centroid)
                for centroid in self.centroids
            ]
            nearest = min(range(len(distances)), key=distances.__getitem__)
            rows.append([sent_id, distances[nearest], nearest])

        frame = pd.DataFrame(rows)
        frame.columns = ['Id', 'Weight', 'Centroid']

        # Rescale the distances with MinMaxScaler, then flip them via |w - 1| so
        # the smallest distance ends up with the largest weight.
        raw_weights = frame['Weight'].to_numpy().reshape(-1, 1)
        frame['Weight'] = MinMaxScaler().fit_transform(raw_weights)
        frame['Weight'] = (frame['Weight'] - 1).abs().round(4)

        frame['Sent'] = frame['Id'].apply(lambda sent_id: content[sent_id])
        self.df = frame
        return picked_sentences, picked_embeddings