def run_clusters(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True
) -> List[str]:
    """
    Picks representative sentences by clustering their hidden-state embeddings.

    :param content: Content list of sentences.
    :param ratio: The ratio passed to ``ClusterFeatures.cluster``.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to always include the first sentence in the result.
    :return: The sentences of ``content`` at the indices chosen by the clusterer.
    """
    hidden = self.model(content, self.hidden, self.reduce_option)
    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(ratio)

    # Check for an empty selection before looking at index 0; otherwise an
    # empty cluster result would raise IndexError instead of falling back to
    # the first sentence.
    if use_first and (not hidden_args or hidden_args[0] != 0):
        hidden_args.insert(0, 0)

    return [content[j] for j in hidden_args]
def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Runs the cluster algorithm on sentence-transformer embeddings of the content.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to use first sentence (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of the summarized sentences and the embeddings of ALL
        sentences in ``content`` (not only the selected ones), exactly as
        returned by ``self.model.encode(content, convert_to_tensor=True)``.
    """
    # Building a SentenceTransformer is expensive, so only do it when
    # self.model is not one already; later calls reuse the loaded model.
    if not isinstance(getattr(self, 'model', None), SentenceTransformer):
        self.model = SentenceTransformer("distilroberta-base-paraphrase-v1")

    hidden = self.model.encode(content, convert_to_tensor=True)
    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(
        ratio, num_sentences)

    # Make sure index 0 leads the selection; inserting into an empty list
    # covers the case where the clusterer returned nothing.
    if use_first and (not hidden_args or hidden_args[0] != 0):
        hidden_args.insert(0, 0)

    sentences = [content[j] for j in hidden_args]
    return sentences, hidden
def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Runs the cluster algorithm on sentence-transformer embeddings of the content.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to use first sentence (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of the summarized sentences and the embeddings of ALL
        sentences in ``content`` (not only the selected ones), exactly as
        returned by ``self.model.encode(content, convert_to_tensor=True)``.
    """
    # Building a SentenceTransformer is expensive, so only do it when
    # self.model is not one already; later calls reuse the loaded model.
    if not isinstance(getattr(self, 'model', None), SentenceTransformer):
        self.model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    hidden = self.model.encode(content, convert_to_tensor=True)
    hidden_args = ClusterFeatures(hidden, algorithm, random_state=self.random_state).cluster(
        ratio, num_sentences)

    # Make sure index 0 leads the selection; inserting into an empty list
    # covers the case where the clusterer returned nothing.
    if use_first and (not hidden_args or hidden_args[0] != 0):
        hidden_args.insert(0, 0)

    sentences = [content[j] for j in hidden_args]
    return sentences, hidden
def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Clusters the hidden-state embeddings of the content and returns the
    selected sentences together with their embeddings.

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Return the first sentence in the output (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of summarized sentences and embeddings
    """
    hidden = self.model(
        content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)

    clusterer = ClusterFeatures(hidden, algorithm, random_state=self.random_state)
    chosen = clusterer.cluster(ratio, num_sentences)

    # Index 0 must lead the selection when use_first is set; inserting at the
    # front of an empty list is the same as appending to it.
    if use_first and (not chosen or chosen[0] != 0):
        chosen.insert(0, 0)

    sentences = [content[idx] for idx in chosen]
    embeddings = np.asarray([hidden[idx] for idx in chosen])
    return sentences, embeddings
def cluster_runner(
    self,
    content: List[str],
    ratio: float = 0.2,
    algorithm: str = 'kmeans',
    use_first: bool = True,
    num_sentences: int = None
) -> Tuple[List[str], np.ndarray]:
    """
    Clusters the hidden-state embeddings of the content, returns the selected
    sentences with their embeddings, and records a per-sentence score table.

    Besides the return value, this stores on the instance:
    ``features`` (all embeddings), ``centroids`` (second element of the
    cluster result), ``hidden_args`` (the selected indices, including the
    forced index 0 when ``use_first`` applies) and ``df`` (one row per
    sentence with columns ``Id``, ``Weight``, ``Centroid``, ``Sent``).

    :param content: Content list of sentences.
    :param ratio: The ratio to use for clustering.
    :param algorithm: Type of algorithm to use for clustering.
    :param use_first: Whether to use first sentence (helpful for news stories, etc).
    :param num_sentences: Number of sentences to use for summarization.
    :return: A tuple of summarized sentences and embeddings
    """

    def find_closest_args(centroids, features):
        # For every feature index, keep a one-element list holding the
        # [distance, centroid index] pair of its nearest centroid.
        nearest = {}
        for feat_idx, feature in enumerate(features):
            candidates = [
                [np.linalg.norm(feature - centroid), cen_idx]
                for cen_idx, centroid in enumerate(centroids)
            ]
            nearest[feat_idx] = [min(candidates, key=lambda pair: pair[0])]
        return nearest

    hidden = self.model(
        content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
    self.features = hidden

    # The clusterer result is indexed as (selected indices, centroids).
    cluster_result = ClusterFeatures(
        hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)
    self.centroids = cluster_result[1]
    hidden_args = cluster_result[0]
    # Same list object as the one adjusted below, so the attribute also sees
    # the forced first index.
    self.hidden_args = hidden_args

    # Inserting at the front of an empty list is the same as appending to it.
    if use_first and (not hidden_args or hidden_args[0] != 0):
        hidden_args.insert(0, 0)

    sentences = [content[idx] for idx in hidden_args]
    embeddings = np.asarray([hidden[idx] for idx in hidden_args])

    closest = find_closest_args(self.centroids, self.features)
    rows = [[sent_id, hit[0][0], hit[0][1]] for sent_id, hit in closest.items()]
    df = pd.DataFrame(rows)
    df.columns = ['Id', 'Weight', 'Centroid']

    # Min-max scale the nearest-centroid distances, then flip them with
    # |x - 1| so smaller distances get larger weights.
    df['Weight'] = MinMaxScaler().fit_transform(
        df['Weight'].to_numpy().reshape(-1, 1))
    df['Weight'] = (df['Weight'] - 1).abs().round(4)
    df['Sent'] = df['Id'].apply(lambda sent_id: content[sent_id])
    self.df = df

    return sentences, embeddings