def test_tdf_search_corpus(self):
    """Search the corpus for "movie" and count rows scoring above 0.2.

    Exactly five rows are expected to have a ``search_cosine_similarity``
    greater than 0.2.
    """
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text")
    hits = text_frame.search_corpus("movie")
    above_threshold = hits["search_cosine_similarity"] > 0.2
    self.assertEqual(len(hits[above_threshold]), 5)
def test_tdf_extract_corpus_fragments(self):
    """Extract fragments from the first 100 rows; expect the single fragment "s ."."""
    from pewanalytics.text import TextDataFrame

    first_hundred = self.df[:100]
    found = TextDataFrame(first_hundred, "text").extract_corpus_fragments(
        scan_top_n_matches_per_doc=1,
        min_fragment_length=3,
    )
    self.assertEqual(len(found), 1)
    self.assertEqual(found[0], "s .")
def test_tdf_kmeans_clusters(self):
    """Fit two k-means clusters and check the leading top term of each one."""
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    text_frame.kmeans_clusters(k=2)
    top_terms = text_frame.top_cluster_terms("kmeans")

    self.assertEqual(len(top_terms.keys()), 2)
    # The leading term of each cluster is accepted if it is either candidate.
    for cluster_id in (1, 0):
        self.assertIn(top_terms[cluster_id][0], ["alien", "husband"])
def test_tdf_hdbscan_clusters(self):
    """Run HDBSCAN with ``min_cluster_size=10`` and check the leading term per label.

    Three labels are expected in the top-term mapping (-1, 18 and 11), each
    with a specific first term.
    """
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    text_frame.hdbscan_clusters(min_cluster_size=10)
    top_terms = text_frame.top_cluster_terms("hdbscan")

    self.assertEqual(len(top_terms.keys()), 3)
    expected_leading_terms = ((-1, "mike"), (18, "disney"), (11, "jackie"))
    for label, leading_term in expected_leading_terms:
        self.assertEqual(top_terms[label][0], leading_term)
def test_tdf_pca_components(self):
    """Compute five PCA components and check the top document of each one.

    Only the first ten characters of each component's first top document are
    compared.
    """
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    text_frame.pca_components(k=5)
    top_docs = text_frame.get_top_documents(component_prefix="pca", top_n=2)

    def leading_chars(component):
        # First ten characters of the first top document for this component.
        return top_docs[component][0][:10]

    self.assertEqual(leading_chars("pca_0"), "there must")
    self.assertEqual(leading_chars("pca_1"), "plot : a d")
    self.assertEqual(leading_chars("pca_2"), "with the s")
    # Two different documents are accepted for the fourth component.
    self.assertIn(leading_chars("pca_3"), ["every once", " * * * * *"])
    self.assertEqual(leading_chars("pca_4"), "when i fir")
def test_make_document_cooccurrence_matrix(self):
    """Check the document co-occurrence matrix with and without normalization.

    In both modes the matrix must have one row per document in ``self.df``.
    Without normalization the largest cell must exceed 1.0; with
    normalization the largest cell must equal 1.0.
    """
    from pewanalytics.text import TextDataFrame

    tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

    # assertEqual/assertGreater report both operands on failure, unlike
    # assertTrue(a == b), which only reports "False is not true".
    mat = tdf.make_document_cooccurrence_matrix(normalize=False)
    self.assertEqual(len(mat), len(self.df))
    self.assertGreater(mat.max().max(), 1.0)

    mat = tdf.make_document_cooccurrence_matrix(normalize=True)
    self.assertEqual(len(mat), len(self.df))
    self.assertEqual(mat.max().max(), 1.0)
def test_tdf_lsa_components(self):
    """Compute five LSA components and check the top documents of each one.

    Only the first ten characters of each component's first top document are
    compared; the third component is expected to have no top documents.
    """
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    text_frame.lsa_components(k=5)
    top_docs = text_frame.get_top_documents(component_prefix="lsa", top_n=2)

    def leading_chars(component):
        # First ten characters of the first top document for this component.
        return top_docs[component][0][:10]

    self.assertEqual(leading_chars("lsa_0"), " * * * the")
    self.assertEqual(leading_chars("lsa_1"), "susan gran")
    self.assertEqual(len(top_docs["lsa_2"]), 0)
    # Two different documents are accepted for the fourth component.
    self.assertIn(leading_chars("lsa_3"), ["as a devou", "every once"])
    self.assertEqual(leading_chars("lsa_4"), "when i fir")
def test_compute_hdbscan_clusters(self):
    """Cluster the TF-IDF matrix with HDBSCAN and check the label counts.

    Expects 2000 labels in total and 23 distinct label values.
    """
    from pewanalytics.stats.clustering import compute_hdbscan_clusters
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    labels = compute_hdbscan_clusters(text_frame.tfidf, min_cluster_size=10)

    self.assertEqual(len(labels), 2000)
    distinct_labels = set(labels)
    self.assertEqual(len(distinct_labels), 23)
def test_mutual_info_bar_plot(self):
    """Smoke-test ``mutual_info_bar_plot`` on mutual information for sentiment.

    Adds a binary ``outcome`` column to ``self.df`` (1 where ``sentiment`` is
    "pos"), computes mutual information against it, and plots the top 20 terms
    by ``pct_term_pos_neg_ratio``.
    """
    from pewanalytics.text import TextDataFrame
    from pewanalytics.stats.mutual_info import mutual_info_bar_plot
    import matplotlib.pyplot as plt

    self.df["outcome"] = (self.df["sentiment"] == "pos").astype(int)
    tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    mutual_info = tdf.mutual_info("outcome")
    try:
        plot = mutual_info_bar_plot(
            mutual_info,
            filter_col="pct_term_pos_neg_ratio",
            top_n=20,
            x_col="pct_term_pos_neg_ratio",
        )
        # TODO: figure out how to get a unique representation of the plot.
        # NOTE(review): assumes mutual_info_bar_plot returns the plot object
        # rather than None - confirm against its implementation.
        self.assertIsNotNone(plot)
    finally:
        # Release any figures created above so they do not pile up across tests.
        plt.close("all")
def test_tdf_find_related_keywords(self):
    """Look up the 25 keywords related to "disney" and check expected members."""
    from pewanalytics.text import TextDataFrame

    vectorizer_options = {
        "min_df": 10,
        "max_df": 0.95,
        "use_idf": False,
        "binary": True,
        "sublinear_tf": False,
        "smooth_idf": False,
        "norm": None,
    }
    text_frame = TextDataFrame(self.df, "text", **vectorizer_options)
    related = text_frame.find_related_keywords("disney", n=25)

    expected_terms = ("animation", "mulan", "mermaid", "hercules", "tarzan", "pixar")
    for expected in expected_terms:
        self.assertIn(expected, related)
def test_mutual_info(self):
    """Call ``compute_mutual_info`` on sparse and dense input with several options.

    Each of four option sets (no weights, unit weights, ``normalize=False``,
    ``l=1``) is run first against the sparse TF-IDF matrix and then against
    its dense form; every call must return something other than ``None``.
    """
    from pewanalytics.text import TextDataFrame
    from pewanalytics.stats.mutual_info import compute_mutual_info

    self.df["outcome"] = (self.df["sentiment"] == "pos").astype(int)
    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    text_frame.corpus["weight"] = 1.0

    def option_sets():
        # Rebuilt per pass so the weight column is looked up again each time.
        return [
            {"weights": None},
            {"weights": text_frame.corpus["weight"]},
            {"normalize": False},
            {"l": 1},
        ]

    for use_dense in (False, True):
        for options in option_sets():
            # A fresh dense copy is made for every dense call.
            if use_dense:
                features = text_frame.tfidf.todense()
            else:
                features = text_frame.tfidf
            scores = compute_mutual_info(
                text_frame.corpus["outcome"], features, **options
            )
            self.assertIsNotNone(scores)
def test_correspondence_analysis(self):
    """Run correspondence analysis on the dense TF-IDF matrix and spot-check it.

    The first and last rows of the result are checked for their ``node`` name
    and their ``mca_1`` value (to four decimal places).
    """
    from pewanalytics.stats.dimensionality_reduction import correspondence_analysis
    from pewanalytics.text import TextDataFrame

    tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

    # scikit-learn deprecated get_feature_names() in 1.0 and removed it in
    # 1.2; prefer get_feature_names_out() and fall back for older releases.
    # NOTE(review): assumes tdf.vectorizer is a scikit-learn vectorizer.
    vectorizer = tdf.vectorizer
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    matrix = pd.DataFrame(tdf.tfidf.todense(), columns=feature_names)
    mca = correspondence_analysis(matrix)

    self.assertAlmostEqual(mca["mca_1"].values[0], 0.59554, 4)
    self.assertEqual(mca["node"].values[0], "over")
    self.assertAlmostEqual(mca["mca_1"].values[-1], -0.4274, 4)
    self.assertEqual(mca["node"].values[-1], "red")
def test_compute_kmeans_clusters(self):
    """Cluster the TF-IDF matrix into two k-means groups, with and without a score.

    Both calls must yield 2000 labels with two distinct values; when a score
    is requested it must be greater than zero.
    """
    from pewanalytics.stats.clustering import compute_kmeans_clusters
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

    labels = compute_kmeans_clusters(text_frame.tfidf, k=2, return_score=False)
    self.assertEqual(len(labels), 2000)
    self.assertEqual(len(set(labels)), 2)

    scored = compute_kmeans_clusters(text_frame.tfidf, k=2, return_score=True)
    labels, score = scored
    self.assertEqual(len(labels), 2000)
    self.assertEqual(len(set(labels)), 2)
    self.assertGreater(score, 0)
def test_make_word_cooccurrence_matrix(self):
    """Check the word co-occurrence matrix with and without normalization.

    The expected vocabulary size comes from a ``CountVectorizer`` fitted on
    the same text with unigram, English stop-word, ``min_df=10`` and
    ``max_df=0.5`` settings. In both modes the matrix must have one row per
    vocabulary term. Without normalization the largest cell must exceed 1.0;
    with normalization the largest cell must equal 1.0.
    """
    from pewanalytics.text import TextDataFrame

    tdf = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)

    from sklearn.feature_extraction.text import CountVectorizer

    cv = CountVectorizer(
        ngram_range=(1, 1), stop_words="english", min_df=10, max_df=0.5
    )
    cv.fit_transform(self.df["text"])
    # scikit-learn deprecated get_feature_names() in 1.0 and removed it in
    # 1.2; prefer get_feature_names_out() and fall back for older releases.
    if hasattr(cv, "get_feature_names_out"):
        vocab = cv.get_feature_names_out()
    else:
        vocab = cv.get_feature_names()

    # assertEqual/assertGreater report both operands on failure, unlike
    # assertTrue(a == b), which only reports "False is not true".
    mat = tdf.make_word_cooccurrence_matrix(
        normalize=False, min_frequency=10, max_frequency=0.5
    )
    self.assertEqual(len(mat), len(vocab))
    self.assertGreater(mat.max().max(), 1.0)

    mat = tdf.make_word_cooccurrence_matrix(
        normalize=True, min_frequency=10, max_frequency=0.5
    )
    self.assertEqual(len(mat), len(vocab))
    self.assertEqual(mat.max().max(), 1.0)
def test_tdf_match_text_to_corpus(self):
    """Match "books" and "reading" against a six-row corpus under several settings.

    Each scenario fixes ``min_similarity`` and ``allow_multiple`` and lists the
    expected ``match_text`` value for every row, in row order.
    """
    from pewanalytics.text import TextDataFrame

    documents = [
        "I read books",
        "I like reading",
        "I read books",
        "reading is nice",
        "reading",
        "books",
    ]
    rows = [{"text": document} for document in documents]
    text_frame = TextDataFrame(pd.DataFrame(rows), "text")

    # (min_similarity, allow_multiple, expected match_text per row)
    scenarios = [
        (0.1, True, ["books", "reading", "books", "reading", "reading", "books"]),
        (0.5, True, ["books", "reading", "books", None, "reading", "books"]),
        (0.6, True, ["books", None, "books", None, "reading", "books"]),
        (0.5, False, [None, None, None, None, "reading", "books"]),
    ]
    for min_similarity, allow_multiple, expected in scenarios:
        matched = text_frame.match_text_to_corpus(
            ["books", "reading"],
            min_similarity=min_similarity,
            allow_multiple=allow_multiple,
        )
        self.assertEqual(list(matched["match_text"].values), expected)
def test_tdf_find_duplicates(self):
    """Find duplicates on truncated text, then again after random trimming.

    First pass: every text is cut to its first 1000 characters and six
    duplicates are expected with partial matching disabled. Second pass: each
    text randomly loses its last 400 characters and seven duplicates are
    expected with partial matching enabled and a lower TF-IDF threshold.
    """
    from pewanalytics.text import TextDataFrame

    self.df["text"] = self.df["text"].map(lambda text: text[:1000])
    strict_matches = TextDataFrame(self.df, "text").find_duplicates(
        tfidf_threshold=0.8,
        fuzzy_ratio_threshold=80,
        allow_partial=False,
    )
    self.assertEqual(len(strict_matches), 6)

    def maybe_trim(text):
        # Drop the final 400 characters when the random draw exceeds 0.5.
        if random.random() > 0.5:
            return text[:-400]
        return text

    # NOTE(review): the expected count below depends on the state of the
    # global random generator; presumably it is seeded elsewhere - confirm.
    self.df["text"] = self.df["text"].map(maybe_trim)
    partial_matches = TextDataFrame(self.df, "text").find_duplicates(
        tfidf_threshold=0.6,
        fuzzy_ratio_threshold=80,
        allow_partial=True,
    )
    self.assertEqual(len(partial_matches), 7)
def test_get_lsa(self):
    """Run ``get_lsa`` with ``k=5`` and check output shapes and column means.

    The component table must be 2075 x 5 and the result table 2000 x 6. The
    mean of each ``lsa_*`` column in both tables is compared to a reference
    value at two decimal places.
    """
    from pewanalytics.stats.dimensionality_reduction import get_lsa
    from pewanalytics.text import TextDataFrame

    text_frame = TextDataFrame(self.df, "text", min_df=50, max_df=0.5)
    components, results = get_lsa(text_frame.tfidf, k=5)
    component_means = components.mean().to_dict()
    result_means = results.mean().to_dict()

    expected_dimensions = [
        (components.shape[0], 2075),
        (components.shape[1], 5),
        (results.shape[0], 2000),
        (results.shape[1], 6),
    ]
    for actual, expected in expected_dimensions:
        self.assertEqual(actual, expected)

    expected_component_means = [
        ("lsa_0", 0.0174),
        ("lsa_1", -0.0002),
        ("lsa_2", -0.0030),
        ("lsa_3", -0.0011),
        ("lsa_4", -0.0002),
    ]
    for column, expected in expected_component_means:
        self.assertAlmostEqual(component_means[column], expected, 2)

    expected_result_means = [
        ("lsa_0", 0.3025),
        ("lsa_1", 0.001),
        ("lsa_2", -0.0034),
        ("lsa_3", -0.0022),
        ("lsa_4", -0.0),
    ]
    for column, expected in expected_result_means:
        self.assertAlmostEqual(result_means[column], expected, 2)
def test_tdf_mutual_info(self):
    """Check ``TextDataFrame.mutual_info`` against hand-computed values.

    A binary ``outcome`` column is derived from ``sentiment`` and the token
    "always_pos" is appended to every positive document. The test then
    verifies the raw and normalized MI1 value for the term "games" and the
    full statistics row for the second-ranked positive term and the
    top-ranked negative term.
    """
    from pewanalytics.text import TextDataFrame

    self.df["outcome"] = (self.df["sentiment"] == "pos").astype(int)

    def tag_positive(row):
        # Positive documents get a marker token that appears nowhere else.
        if row["outcome"]:
            return "{} always_pos".format(row["text"])
        return row["text"]

    self.df["text"] = self.df.apply(tag_positive, axis=1)

    vectorizer_options = {
        "min_df": 50,
        "max_df": 0.5,
        "use_idf": False,
        "binary": True,
        "sublinear_tf": False,
        "smooth_idf": False,
        "norm": None,
    }
    text_frame = TextDataFrame(self.df, "text", **vectorizer_options)

    # games occurs 24 times in the pos class, 26 times in the neg class; total is 50
    # overall document total is 2000 (1000 pos)
    px1y1 = 24.0 / 2000.0
    px1y0 = 26.0 / 2000.0  # not used by any assertion below
    px1 = 50.0 / 2000.0
    px0 = (2000.0 - 50.0) / 2000.0  # not used by any assertion below
    py1 = 1000.0 / 2000.0

    # Raw (unnormalized) MI1 for "games", derived two equivalent ways.
    raw_scores = text_frame.mutual_info("outcome", normalize=False)
    expected_raw = math.log(px1y1 / (px1 * py1), 2)
    expected_raw_alt = math.log(px1y1, 2) - math.log(px1, 2) - math.log(py1, 2)
    self.assertAlmostEqual(raw_scores.loc["games"]["MI1"], expected_raw, 4)
    self.assertAlmostEqual(raw_scores.loc["games"]["MI1"], expected_raw_alt, 4)

    # Normalized MI1 for "games", again derived two equivalent ways.
    norm_scores = text_frame.mutual_info("outcome", normalize=True)
    expected_norm = expected_raw / (-1 * math.log(px1y1, 2))
    expected_norm_alt = (math.log(px1 * py1, 2) / math.log(px1y1, 2)) - 1.0
    self.assertAlmostEqual(norm_scores.loc["games"]["MI1"], expected_norm, 4)
    self.assertAlmostEqual(norm_scores.loc["games"]["MI1"], expected_norm_alt, 4)

    top_positive = norm_scores.sort_values("MI1", ascending=False)[:10]
    top_negative = norm_scores.sort_values("MI0", ascending=False)[:10]

    self.assertEqual(top_positive.index[0], "always_pos")
    self.assertEqual(top_positive.iloc[0]["MI1"], 1.0)
    self.assertEqual(top_positive.index[1], "outstanding")

    expected_outstanding = [
        ("MI1", 0.178374),
        ("MI0", -0.319942),
        ("total", 68.0),
        ("total_pos_with_term", 63.0),
        ("total_neg_with_term", 5.0),
        ("total_pos_neg_with_term_diff", 58.0),
        ("pct_pos_with_term", 0.063),
        ("pct_neg_with_term", 0.005),
        ("pct_pos_neg_with_term_diff", 0.058),
        ("pct_pos_neg_with_term_ratio", 12.6),
        ("pct_term_pos", 0.926471),
        ("pct_term_neg", 0.073529),
        ("pct_term_pos_neg_diff", 0.852941),
        ("pct_term_pos_neg_ratio", 12.6),
    ]
    outstanding_row = top_positive.iloc[1]
    for column, expected in expected_outstanding:
        self.assertAlmostEqual(outstanding_row[column], expected, 4)

    self.assertEqual(top_negative.index[0], "bad")

    expected_bad = [
        ("MI1", -0.195836),
        ("MI0", 0.209830),
        ("total", 773.0),
        ("total_pos_with_term", 259.0),
        ("total_neg_with_term", 514.0),
        ("total_pos_neg_with_term_diff", -255.0),
        ("pct_pos_with_term", 0.259),
        ("pct_neg_with_term", 0.514),
        ("pct_pos_neg_with_term_diff", -0.255),
        ("pct_pos_neg_with_term_ratio", 0.503891),
        ("pct_term_pos", 0.335058),
        ("pct_term_neg", 0.664942),
        ("pct_term_pos_neg_diff", -0.329884),
        ("pct_term_pos_neg_ratio", 0.503891),
    ]
    bad_row = top_negative.iloc[0]
    for column, expected in expected_bad:
        self.assertAlmostEqual(bad_row[column], expected, 4)