def run_clusters(self, content: List[str], ratio=0.2, algorithm='kmeans', use_first: bool = True) -> List[str]:
        """
        Picks summary sentences by clustering the model's hidden states.

        :param content: Content list of sentences.
        :param ratio: The ratio handed to the clustering step.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Whether the first sentence is always placed at the front of the output.
        :return: The sentences of content at the selected indices.
        """
        hidden = self.model(content, self.hidden, self.reduce_option)
        hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio)

        # An empty selection must be handled before peeking at hidden_args[0],
        # otherwise the index lookup raises IndexError.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        return [content[j] for j in hidden_args]
Esempio n. 2
0
    def calculate_optimal_k(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ):
        """
        Determines the optimal elbow K for the given body.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length handed to the sentence handler.
        :param max_length: The max length handed to the sentence handler.
        :param k_max: The maximum number of clusters to search. When None,
            one less than the number of extracted sentences is used.
        :return: The result of ClusterFeatures.calculate_optimal_cluster.
        """
        split_body = self.sentence_handler(body, min_length, max_length)

        # Fall back to "number of sentences minus one" when no cap is supplied.
        search_limit = len(split_body) - 1 if k_max is None else k_max

        features = self.model(
            split_body,
            self.hidden,
            self.reduce_option,
            hidden_concat=self.hidden_concat,
        )
        clusterer = ClusterFeatures(
            features, algorithm, random_state=self.random_state)

        return clusterer.calculate_optimal_cluster(search_limit)
Esempio n. 3
0
    def calculate_elbow(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ) -> List[float]:
        """
        Computes the elbow values across cluster counts for the given body.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length handed to the sentence handler.
        :param max_length: The max length handed to the sentence handler.
        :param k_max: The maximum number of clusters to search. When None,
            one less than the number of extracted sentences is used.
        :return: The result of ClusterFeatures.calculate_elbow.
        """
        split_body = self.sentence_handler(body, min_length, max_length)

        # Fall back to "number of sentences minus one" when no cap is supplied.
        search_limit = len(split_body) - 1 if k_max is None else k_max

        features = self.model(
            split_body,
            self.hidden,
            self.reduce_option,
            hidden_concat=self.hidden_concat,
        )
        clusterer = ClusterFeatures(
            features, algorithm, random_state=self.random_state)

        return clusterer.calculate_elbow(search_limit)
Esempio n. 4
0
    def cluster_runner(
            self,
            content: List[str],
            ratio: float = 0.2,
            algorithm: str = 'kmeans',
            use_first: bool = True,
            num_sentences: int = None) -> Tuple[List[str], np.ndarray]:
        """
        Encodes the sentences with a SentenceTransformer model, clusters the
        encodings and returns the selected sentences plus the encodings.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Whether the first sentence is always placed at the front of the output.
        :param num_sentences: Number of sentences to use for summarization; forwarded to the clustering step.
        :return: A tuple of (selected sentences, embeddings). The embeddings are
            the encoder output for every sentence in content, not only for the
            selected ones.

        NOTE(review): when use_first prepends index 0, the result holds one more
        sentence than the clustering step selected - confirm whether
        num_sentences should account for that.
        """
        # Loading the encoder is expensive, so it is created once and kept on a
        # private attribute. self.model is still pointed at it on every call,
        # because the rest of this method (and any later reader of self.model)
        # expects the sentence encoder there.
        encoder = getattr(self, "_sentence_transformer", None)
        if encoder is None:
            encoder = SentenceTransformer("distilroberta-base-paraphrase-v1")
            self._sentence_transformer = encoder
        self.model = encoder

        # convert_to_tensor=True is requested from the encoder; presumably the
        # result is a tensor rather than the annotated np.ndarray - TODO confirm.
        hidden = self.model.encode(content, convert_to_tensor=True)

        hidden_args = ClusterFeatures(
            hidden, algorithm,
            random_state=self.random_state).cluster(ratio, num_sentences)

        # Make sure index 0 leads the selection; this also covers an empty
        # selection, where the first sentence becomes the only entry.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]

        # The full embedding set is returned on purpose, not just the rows
        # that belong to the selected sentences.
        return sentences, hidden
    def cluster_runner(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> Tuple[List[str], np.ndarray]:
        """
        Encodes the sentences with a SentenceTransformer model, clusters the
        encodings and returns the selected sentences plus the encodings.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Whether the first sentence is always placed at the front of the output.
        :param num_sentences: Number of sentences to use for summarization; forwarded to the clustering step.
        :return: A tuple of (selected sentences, embeddings). The embeddings are
            the encoder output for every sentence in content, not only for the
            selected ones.

        NOTE(review): when use_first prepends index 0, the result holds one more
        sentence than the clustering step selected - confirm whether
        num_sentences should account for that.
        """
        # Loading the encoder is expensive, so it is created once and kept on a
        # private attribute. self.model is still pointed at it on every call,
        # because the rest of this method (and any later reader of self.model)
        # expects the sentence encoder there.
        encoder = getattr(self, "_sentence_transformer", None)
        if encoder is None:
            encoder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
            self._sentence_transformer = encoder
        self.model = encoder

        # convert_to_tensor=True is requested from the encoder; presumably the
        # result is a tensor rather than the annotated np.ndarray - TODO confirm.
        hidden = self.model.encode(content, convert_to_tensor=True)

        hidden_args = ClusterFeatures(
            hidden, algorithm,
            random_state=self.random_state).cluster(ratio, num_sentences)

        # Make sure index 0 leads the selection; this also covers an empty
        # selection, where the first sentence becomes the only entry.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]

        # The full embedding set is returned on purpose, not just the rows
        # that belong to the selected sentences.
        return sentences, hidden
Esempio n. 6
0
    def cluster_runner(
            self,
            content: List[str],
            ratio: float = 0.2,
            algorithm: str = 'kmeans',
            use_first: bool = True,
            num_sentences: int = None) -> Tuple[List[str], np.ndarray]:
        """
        Runs the cluster algorithm on the model's hidden states and returns the
        selected sentences together with their embeddings.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Return the first sentence in the output (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization; forwarded to the clustering step.
        :return: A tuple of (selected sentences, array of the hidden states at
            the selected indices).

        NOTE(review): when use_first prepends index 0, the result holds one more
        sentence than the clustering step selected - confirm whether
        num_sentences should account for that.
        """
        hidden = self.model(content,
                            self.hidden,
                            self.reduce_option,
                            hidden_concat=self.hidden_concat)
        hidden_args = ClusterFeatures(
            hidden, algorithm,
            random_state=self.random_state).cluster(ratio, num_sentences)

        # Make sure index 0 leads the selection; this also covers an empty
        # selection, where the first sentence becomes the only entry.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]
        embeddings = np.asarray([hidden[j] for j in hidden_args])

        return sentences, embeddings
Esempio n. 7
0
    def cluster_runner(
            self,
            content: List[str],
            ratio: float = 0.2,
            algorithm: str = 'kmeans',
            use_first: bool = True,
            num_sentences: int = None) -> Tuple[List[str], np.ndarray]:
        """
        Runs the cluster algorithm on the model's hidden states and returns the
        selected sentences together with their embeddings.

        Besides the return value, the call stores on the instance:
        ``features`` (all hidden states), ``centroids`` and ``hidden_args``
        (taken from the clustering result), and ``df``, a DataFrame with one row
        per input sentence and the columns ``Id``, ``Weight``, ``Centroid`` and
        ``Sent``.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Whether the first sentence is always placed at the front of the output.
        :param num_sentences: Number of sentences to use for summarization; forwarded to the clustering step.
        :return: A tuple of (selected sentences, array of the hidden states at
            the selected indices).

        NOTE(review): when use_first prepends index 0, the result holds one more
        sentence than the clustering step selected - confirm whether
        num_sentences should account for that.
        """
        def find_closest_args(centroids, features):
            # Maps every feature index to [distance, centroid index] of the
            # centroid nearest to that feature (first one wins on ties).
            closest = {}
            for i, feature in enumerate(features):
                distances = [[np.linalg.norm(feature - centroid), j]
                             for j, centroid in enumerate(centroids)]
                closest[i] = min(distances, key=lambda pair: pair[0])
            return closest

        hidden = self.model(content,
                            self.hidden,
                            self.reduce_option,
                            hidden_concat=self.hidden_concat)
        self.features = hidden

        # In this variant the clustering step yields a pair: the selected
        # indices first, the centroids second.
        cluster_result = ClusterFeatures(
            hidden, algorithm,
            random_state=self.random_state).cluster(ratio, num_sentences)
        self.centroids = cluster_result[1]
        hidden_args = cluster_result[0]
        # Same list object as the local one, so the insert below is visible
        # through self.hidden_args as well.
        self.hidden_args = hidden_args

        # Make sure index 0 leads the selection; this also covers an empty
        # selection, where the first sentence becomes the only entry.
        if use_first and (not hidden_args or hidden_args[0] != 0):
            hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]
        embeddings = np.asarray([hidden[j] for j in hidden_args])

        closest = find_closest_args(self.centroids, self.features)
        df = pd.DataFrame(
            [[idx, pair[0], pair[1]] for idx, pair in closest.items()],
            columns=['Id', 'Weight', 'Centroid'])

        # Scale the distances, then flip them via |w - 1| so that a smaller
        # distance to the nearest centroid gives a larger weight.
        scaler = MinMaxScaler()
        df['Weight'] = scaler.fit_transform(
            df['Weight'].to_numpy().reshape(-1, 1))
        df['Weight'] = (df['Weight'] - 1).abs().round(4)
        df['Sent'] = df['Id'].apply(lambda idx: content[idx])
        self.df = df

        return sentences, embeddings