def test_hypergeom_p_values(self):
    """Check hypergeom_p_values on raw counts and on binarized (0/1) data."""
    expected = [0.16666666666666669, 0.49999999999999989, 1.0,
                0.49999999999999989, 1.0]

    # p-values computed straight from the count matrix
    np.testing.assert_almost_equal(
        hypergeom_p_values(self.x, self.x[-2:, :]), expected)

    # clipping counts to {0, 1} must yield the very same p-values
    binary = self.x.clip(min=0, max=1)
    np.testing.assert_almost_equal(
        hypergeom_p_values(binary, binary[-2:, :]), expected)
def test_hypergeom_p_values(self):
    """hypergeom_p_values: dense counts, sparse counts, 0/1 data, shape check."""
    expected = [0.16666666666666669, 0.49999999999999989, 1.0,
                0.49999999999999989, 1.0]

    # dense count matrix
    np.testing.assert_almost_equal(
        hypergeom_p_values(self.x, self.x[-2:, :]), expected)

    # a sparse representation of the same counts must agree
    np.testing.assert_almost_equal(
        hypergeom_p_values(sp.csr_matrix(self.x), self.x[-2:, :]), expected)

    # clipping counts to {0, 1} must yield the very same p-values
    binary = self.x.clip(min=0, max=1)
    np.testing.assert_almost_equal(
        hypergeom_p_values(binary, binary[-2:, :]), expected)

    # a column-count mismatch between the two matrices must be rejected
    with self.assertRaises(ValueError):
        hypergeom_p_values(self.x, self.x[-2:, :-1])
def apply(self):
    """Recompute word enrichment for the current selection and refresh the view."""
    self.clear()
    self.progressBarInit()
    self.filter_enabled(False)
    # enriched-word candidates are the attributes of the transformed selection
    domain = self.selected_data_transformed.domain
    self.words = [attr.name for attr in domain.attributes]
    self.p_values = hypergeom_p_values(
        self.data.X, self.selected_data_transformed.X, callback=self.progress)
    self.fdr_values = false_discovery_rate(self.p_values)
    self.filter_and_display()
    self.filter_enabled(True)
    self.progressBarFinished()
def test_hypergeom_p_values(self):
    """hypergeom_p_values agrees across dense/sparse/binary inputs and validates shapes."""
    expected = [
        0.16666666666666669,
        0.49999999999999989,
        1.0,
        0.49999999999999989,
        1.0,
    ]

    # raw counts, dense
    dense_pvals = hypergeom_p_values(self.x, self.x[-2:, :])
    np.testing.assert_almost_equal(dense_pvals, expected)

    # raw counts, sparse — must match the dense result
    sparse_pvals = hypergeom_p_values(sp.csr_matrix(self.x), self.x[-2:, :])
    np.testing.assert_almost_equal(sparse_pvals, expected)

    # binarized counts give identical p-values
    binary = self.x.clip(min=0, max=1)
    binary_pvals = hypergeom_p_values(binary, binary[-2:, :])
    np.testing.assert_almost_equal(binary_pvals, expected)

    # differing column counts are an error
    with self.assertRaises(ValueError):
        hypergeom_p_values(self.x, self.x[-2:, :-1])
def run(selected_data_transformed: Table, data: Table, result: Result,
        state: TaskState) -> None:
    """Populate *result* with enriched words, their p-values and FDR values.

    Executed as a background task; *state* receives status messages and
    progress updates along the way.
    """
    state.set_status("Listing words")
    domain = selected_data_transformed.domain
    result.words = [attr.name for attr in domain.attributes]

    state.set_status("Computing p-values")
    result.p_values = hypergeom_p_values(
        data.X, selected_data_transformed.X,
        callback=state.set_progress_value)

    state.set_status("Computing FDR values")
    result.fdr_values = FDR(result.p_values)
def _hypergeom_clusters(
        cluster_labels: np.ndarray,
        keywords: List[List[str]],
        fdr_threshold: float,
        n_words: int
) -> Tuple[Dict[int, List[str]], np.ndarray, np.ndarray, np.ndarray]:
    """Select, per cluster, keywords enriched by the hypergeometric test.

    Returns a dict mapping cluster label to its top (word, frequency) pairs,
    the full vocabulary array, and per-cluster score / p-value matrices.
    """
    # drop the scores: each document's keywords become a plain word list
    # (each element of *keywords* is iterated as (word, score) pairs)
    word_lists = [[word for word, _ in doc] for doc in keywords]

    # group the per-document word lists by cluster label; -1 (noise) excluded
    grouped = {}
    for label in sorted(set(cluster_labels) - {-1}):
        members = set(np.flatnonzero(cluster_labels == label))
        grouped[label] = [words for i, words in enumerate(word_lists)
                          if i in members]

    # one bag-of-words matrix over all documents, laid out cluster by cluster
    vectorizer = CountVectorizer(tokenizer=lambda w: w, preprocessor=lambda w: w)
    counts = vectorizer.fit_transform(list(chain.from_iterable(grouped.values())))
    vocabulary = np.array(vectorizer.get_feature_names_out())

    selected = {}
    score_rows, p_value_rows = [], []
    offset = 0  # row offset of the current cluster's documents in *counts*
    for label, docs in grouped.items():
        n_docs = len(docs)
        block = counts[offset:offset + n_docs]
        # words that stand out for this cluster per the hypergeometric test
        p_values = hypergeom_p_values(counts, block)
        # NOTE(review): raw p-values are compared against fdr_threshold here —
        # no multiple-testing (FDR) correction is applied; confirm intended
        enriched = set(vocabulary[np.array(p_values) < fdr_threshold])
        flat = [w for w in chain.from_iterable(docs) if w in enriched]
        selected[label] = [(w, c / n_docs)
                           for w, c in Counter(flat).most_common(n_words)]
        score_rows.append(block.sum(axis=0) / n_docs)
        p_value_rows.append(p_values)
        offset += n_docs

    return selected, vocabulary, np.vstack(score_rows), np.vstack(p_value_rows)