import os
from typing import List, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler

# ClusterFeatures ships with bert-extractive-summarizer; adjust the import path to your version.
from summarizer.cluster_features import ClusterFeatures


def run_clusters(self, content: List[str], ratio=0.2, algorithm='kmeans', use_first: bool = True) -> List[str]:
    # Embed the sentences with the underlying model, cluster them, and keep one
    # representative sentence per cluster.
    hidden = self.model(content, self.hidden, self.reduce_option)
    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio)

    # Optionally force the first sentence into the summary (useful for news leads).
    if use_first:
        if hidden_args[0] != 0:
            hidden_args.insert(0, 0)

    return [content[j] for j in hidden_args]
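
# Usage sketch for run_clusters (illustrative, not part of the original code). It assumes
# these methods are bound to the Summarizer class from bert-extractive-summarizer and that
# its sentence_handler splits a body of text into sentences; `example_run_clusters` and
# `body` are hypothetical names.
def example_run_clusters(body: str) -> str:
    from summarizer import Summarizer  # assumed enclosing class for the methods above

    summarizer = Summarizer()
    sentences = summarizer.sentence_handler(body, 40, 600)  # min/max sentence length
    summary = summarizer.run_clusters(sentences, ratio=0.2, algorithm='kmeans', use_first=True)
    return ' '.join(summary)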

def calculate_optimal_k(self, body: str, algorithm: str = 'kmeans', min_length: int = 40,
                        max_length: int = 600, k_max: int = None):
    """
    Calculates the optimal Elbow K.

    :param body: The input body to summarize.
    :param algorithm: The algorithm to use for clustering.
    :param min_length: The min length to use.
    :param max_length: The max length to use.
    :param k_max: The maximum number of clusters to search.
    :return: The optimal number of clusters (k).
    """
    sentences = self.sentence_handler(body, min_length, max_length)

    if k_max is None:
        k_max = len(sentences) - 1

    hidden = self.model(sentences, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
    optimal_k = ClusterFeatures(hidden, algorithm, random_state=self.random_state).calculate_optimal_cluster(k_max)

    return optimal_k
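
# Hedged usage sketch for calculate_optimal_k: `Summarizer` is assumed to be the enclosing
# class and `example_optimal_k` is a hypothetical helper. With k_max left as None, the
# search runs up to len(sentences) - 1, as in the method above.
def example_optimal_k(body: str) -> int:
    from summarizer import Summarizer  # assumed enclosing class

    summarizer = Summarizer()
    return summarizer.calculate_optimal_k(body, algorithm='kmeans', min_length=40, max_length=600)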

def calculate_elbow(
    self,
    body: str,
    algorithm: str = 'kmeans',
    min_length: int = 40,
    max_length: int = 600,
    k_max: int = None,
) -> List[float]:
    """
    Calculates elbow across the clusters.

    :param body: The input body to summarize.
    :param algorithm: The algorithm to use for clustering.
    :param min_length: The min length to use.
    :param max_length: The max length to use.
    :param k_max: The maximum number of clusters to search.
    :return: List of elbow inertia values.
    """
    sentences = self.sentence_handler(body, min_length, max_length)

    if k_max is None:
        k_max = len(sentences) - 1

    hidden = self.model(sentences, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
    elbow = ClusterFeatures(hidden, algorithm, random_state=self.random_state).calculate_elbow(k_max)

    return elbow
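
# Hedged sketch: plot the inertia values returned by calculate_elbow to eyeball the knee.
# Assumes matplotlib is installed and `Summarizer` is the enclosing class; the x-axis
# simply enumerates the returned values, since the starting k depends on the
# ClusterFeatures implementation.
def plot_elbow(body: str, k_max: int = 10) -> None:
    import matplotlib.pyplot as plt
    from summarizer import Summarizer  # assumed enclosing class

    summarizer = Summarizer()
    inertias = summarizer.calculate_elbow(body, algorithm='kmeans', k_max=k_max)
    plt.plot(range(len(inertias)), inertias, marker='o')
    plt.xlabel('cluster count index')
    plt.ylabel('inertia')
    plt.title('Elbow curve')
    plt.show()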

def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to use first sentence (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of summarized sentences and embeddings
    """
    if num_sentences is not None:
        num_sentences = num_sentences if use_first else num_sentences

    # Override the default hidden-state extraction with sentence-transformers embeddings.
    # hidden = self.model(content, self.hidden, self.reduce_option)
    my_path = os.path.abspath(os.path.dirname(__file__))
    model_path = os.path.join(my_path, "distilroberta-base-paraphrase-v1")
    print("path====", model_path)

    # self.model = SentenceTransformer(model_name_or_path=model_path)
    self.model = SentenceTransformer("distilroberta-base-paraphrase-v1")
    hidden = self.model.encode(content, convert_to_tensor=True)

    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)

    if use_first:
        if not hidden_args:
            hidden_args.append(0)
        elif hidden_args[0] != 0:
            hidden_args.insert(0, 0)

    sentences = [content[j] for j in hidden_args]
    # embeddings = np.asarray([hidden[j] for j in hidden_args])
    embeddings = hidden

    return sentences, embeddings
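
# Minimal, standalone sketch of the sentence-transformers override used in the variant above.
# The model name mirrors the snippet; depending on your sentence-transformers version you may
# need the hub-prefixed name "sentence-transformers/paraphrase-distilroberta-base-v1" instead.
def encode_sentences(sentences: List[str]):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("distilroberta-base-paraphrase-v1")
    # One embedding per sentence; convert_to_tensor=True returns a torch tensor
    # rather than a numpy array.
    return model.encode(sentences, convert_to_tensor=True)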

def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to use first sentence (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of summarized sentences and embeddings
    """
    if num_sentences is not None:
        num_sentences = num_sentences if use_first else num_sentences

    # NASH
    # hidden = self.model(content, self.hidden, self.reduce_option)
    self.model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    hidden = self.model.encode(content, convert_to_tensor=True)

    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)

    if use_first:
        if not hidden_args:
            hidden_args.append(0)
        elif hidden_args[0] != 0:
            hidden_args.insert(0, 0)

    sentences = [content[j] for j in hidden_args]
    # NASH
    # embeddings = np.asarray([hidden[j] for j in hidden_args])
    embeddings = hidden

    return sentences, embeddings

def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Return the first sentence in the output (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of summarized sentences and embeddings
    """
    if num_sentences is not None:
        num_sentences = num_sentences if use_first else num_sentences

    hidden = self.model(content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)

    if use_first:
        if not hidden_args:
            hidden_args.append(0)
        elif hidden_args[0] != 0:
            hidden_args.insert(0, 0)

    sentences = [content[j] for j in hidden_args]
    embeddings = np.asarray([hidden[j] for j in hidden_args])

    return sentences, embeddings
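
# Hedged sketch of calling cluster_runner directly on an already-constructed instance
# whose model, hidden, reduce_option, hidden_concat, and random_state attributes are set
# as in the class these methods belong to; `summarizer` and `sentences` are assumed inputs.
def example_cluster_runner(summarizer, sentences: List[str]):
    picked, embeddings = summarizer.cluster_runner(
        sentences, ratio=0.2, algorithm='kmeans', use_first=True, num_sentences=None)
    # `picked` is the ordered subset of the input sentences chosen by the clustering;
    # `embeddings` holds the corresponding sentence vectors.
    return picked, embeddings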

def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to use first sentence (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of summarized sentences and embeddings
    """

    def find_closest_args(centroids, features):
        # For each sentence embedding, record the distance to its closest centroid
        # and that centroid's index.
        args = {}

        for i, feature in enumerate(features):
            dist_list = []

            for j, centroid in enumerate(centroids):
                value = np.linalg.norm(feature - centroid)
                dist_list.append([value, j])

            args[i] = [min(dist_list, key=lambda x: x[0])]

        return args

    if num_sentences is not None:
        num_sentences = num_sentences if use_first else num_sentences

    hidden = self.model(content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
    self.features = hidden

    # cluster() here returns (selected sentence indices, cluster centroids).
    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)
    self.centroids = hidden_args[1]
    hidden_args = hidden_args[0]
    self.hidden_args = hidden_args

    if use_first:
        if not hidden_args:
            hidden_args.append(0)
        elif hidden_args[0] != 0:
            hidden_args.insert(0, 0)

    sentences = [content[j] for j in hidden_args]
    embeddings = np.asarray([hidden[j] for j in hidden_args])

    # Build a per-sentence weight table: distance to the closest centroid, min-max scaled
    # and inverted so that closer (more central) sentences get higher weights.
    args = find_closest_args(self.centroids, self.features)
    df = pd.DataFrame([[a, b[0][0], b[0][1]] for a, b in args.items()],
                      columns=['Id', 'Weight', 'Centroid'])

    scaler = MinMaxScaler()
    df['Weight'] = scaler.fit_transform(df['Weight'].to_numpy().reshape(-1, 1))
    df['Weight'] = (df['Weight'] - 1).abs().round(4)
    df['Sent'] = df['Id'].apply(lambda x: content[x])
    self.df = df

    return sentences, embeddings
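
# Hedged follow-up for the variant above: after cluster_runner runs, self.df holds one row
# per input sentence with a min-max scaled, inverted distance to its closest centroid, so
# higher weights mean more central sentences. `top_weighted_sentences` is a hypothetical helper.
def top_weighted_sentences(summarizer, n: int = 5) -> pd.DataFrame:
    df = summarizer.df  # populated by the cluster_runner variant above
    return df.sort_values('Weight', ascending=False).head(n)[['Id', 'Weight', 'Centroid', 'Sent']]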