def test_wos():
    """Smoke-test fetching the WOS corpus under the throwaway test project."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    # No assertions: the test passes if the download/build does not raise.
    get_wos("wos")
def test_HDPVB():
    """Smoke-test HDP (variational Bayes) on a two-category 20newsgroups slice."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("HDPVB_20news", categories=["sci.space", "rec.autos"])
    model = HDPVB()
    # Only check that fitting completes; the inferred topic count is discarded.
    _fitted, (_n,) = model.fit_transform(corpus, "test_HDPVB")
def test_LDAVB():
    """Fit a 2-topic LDA (variational Bayes) model on 20newsgroups.

    Returns the (dataset, topic_model) pair for interactive reuse.
    """
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("LDAVB_20news", categories=["sci.space", "rec.autos"])
    model = LDAVB(2)
    fitted = model.fit_transform(corpus, "test_LDAVB")
    # NOTE(review): returning a value from a pytest test triggers
    # PytestReturnNotNoneWarning; kept as-is in case callers use the result.
    return corpus, fitted
def test_hSBM():
    """Fit an hSBM topic model and pin the expected NMI against the labels."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("hSBM_20news", categories=["sci.space", "rec.autos"])
    model = hSBM()
    fitted, (_n_layer,) = model.fit_transform(corpus, "test_hSBM")
    # Regression pin: value observed with seed 8686.
    assert round(NMI(corpus, fitted), 5) == 0.1668
def test_simple_HDPVB():
    """Smoke-test HDP-VB on a tiny hand-written labelled corpus."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    documents = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]
    corpus = data.LabeledDataset.from_iterator(
        "HDPVB_simple_data", zip(labels, documents)
    )
    # Whitespace tokenizer: the toy documents are already space-separated tokens.
    corpus.set_vectorizer_params(tokenizer=(lambda s: s.split(" ")),)
    model = HDPVB()
    _fitted, (_n,) = model.fit_transform(corpus, "test_simple_HDPVB")
def test_simple_LDAVB():
    """Fit 2-topic LDA-VB on a tiny hand-written corpus; return the result."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    documents = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]
    corpus = data.LabeledDataset.from_iterator(
        "LDAVB_simple_data", zip(labels, documents)
    )
    # Whitespace tokenizer: the toy documents are already space-separated tokens.
    corpus.set_vectorizer_params(tokenizer=(lambda s: s.split(" ")),)
    model = LDAVB(2)
    # NOTE(review): returning from a pytest test triggers
    # PytestReturnNotNoneWarning; kept as-is for interactive reuse.
    return model.fit_transform(corpus, "test_simple_LDAVB")
def test_NMF():
    """Fit a 2-topic NMF model and pin NMI plus the full confusion matrix."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("NMF_20news", categories=["sci.space", "rec.autos"])
    model = NMF(2)
    fitted, (_n_iter, _error) = model.fit_transform(corpus, "test_NMF")
    # Regression pins: values observed with seed 8686.
    assert round(NMI(corpus, fitted), 5) == 0.00119
    expected = np.array([[617, 373], [653, 334]])
    assert (confusion_matrix(corpus, fitted).todense() == expected).all()
def test_TM():
    """Fit a TopicMapping model and pin NMI plus the full confusion matrix."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("TM_20news", categories=["sci.space", "rec.autos"])
    model = TopicMapping()
    fitted, (_n,) = model.fit_transform(corpus, "test_TM")
    # Regression pins: values observed with seed 8686.
    assert round(NMI(corpus, fitted), 5) == 0.16266
    expected = np.array(
        [
            [18, 12, 45, 7, 294, 21, 71, 26, 6, 28, 92, 115, 86, 81, 4, 84],
            [162, 105, 58, 73, 2, 39, 79, 52, 44, 81, 36, 53, 54, 6, 130, 13],
        ]
    )
    assert (confusion_matrix(corpus, fitted).todense() == expected).all()
def test_LDAGS():
    """Smoke-test Gibbs-sampling LDA (MALLET) with a custom optimize-interval."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups(
        "LDAGS_20news",
        categories=["sci.space", "rec.autos"],
    )
    model = LDAGS(2, mallet_args={"optimize-interval": 20})
    # No assertions: the test passes if fitting does not raise.
    model.fit_transform(corpus, "test_LDAGS")
def test_PLSA():
    """Fit a 2-topic PLSA model (5 iterations) and pin NMI plus the confusion matrix.

    Returns the (dataset, topic_model) pair for interactive reuse.
    """
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("PLSA_20news", categories=["sci.space", "rec.autos"])
    model = PLSA(2, max_iter=5)
    fitted = model.fit_transform(corpus, "test_PLSA")
    # Regression pins: values observed with seed 8686.
    assert round(NMI(corpus, fitted), 5) == 0.00032
    expected = np.array([[521, 469], [540, 447]])
    assert (confusion_matrix(corpus, fitted).todense() == expected).all()
    # NOTE(review): returning from a pytest test triggers
    # PytestReturnNotNoneWarning; kept as-is for interactive reuse.
    return corpus, fitted
def test_UMAP_HDBSCAN():
    """Cluster 20newsgroups via UMAP projection + HDBSCAN and pin the NMI.

    Returns the (dataset, topic_model) pair for interactive reuse.
    """
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    corpus = get_20newsgroups("UH_20news", categories=["sci.space", "rec.autos"])
    umap_params = {
        "n_components": 2,
        "n_neighbors": 15,
        "min_dist": 0.1,
        "metric": "hellinger",
    }
    model = UMAP_HDBSCAN(u_args=umap_params)
    # NOTE(review): the run name says "simple" but this is the full-corpus
    # test — presumably a copy-paste leftover; confirm before renaming.
    fitted = model.fit_transform(corpus, "test_simple_UMAP_HDBSCAN")
    # Regression pin: value observed with seed 8686.
    assert round(NMI(corpus, fitted), 5) == 0.24392
    return corpus, fitted
def test_simple_TM():
    """Fit TopicMapping on a tiny corpus; expect a perfect (NMI=1.0) split."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    documents = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]
    corpus = data.LabeledDataset.from_iterator(
        "TM_simple_data", zip(labels, documents)
    )
    # Whitespace tokenizer: the toy documents are already space-separated tokens.
    corpus.set_vectorizer_params(tokenizer=(lambda s: s.split(" ")),)
    model = TopicMapping(p=1)
    fitted, (_n,) = model.fit_transform(corpus, "test_simple_TM")
    assert round(NMI(corpus, fitted), 5) == 1.0
    expected = np.array([[2, 0], [0, 1]])
    assert (confusion_matrix(corpus, fitted).todense() == expected).all()
def test_simple_LDAGS():
    """Smoke-test Gibbs-sampling LDA (MALLET) on a tiny corpus with custom args."""
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)
    documents = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]
    corpus = data.LabeledDataset.from_iterator(
        "LDAGS_simple_data", zip(labels, documents)
    )
    # Whitespace tokenizer: the toy documents are already space-separated tokens.
    corpus.set_vectorizer_params(tokenizer=(lambda s: s.split(" ")),)
    model = LDAGS(
        2,
        memory="512M",
        mallet_args={
            "num-iterations": 1000,
            "optimize-interval": 20,
        },
    )
    # No assertions: the test passes if fitting does not raise.
    model.fit_transform(corpus, "test_simple_LDAGS")
Analyze the 20newsgroups dataset with LDA ========================================= """ # %% # import import random import adso import matplotlib.pyplot as plt import nltk import numpy as np # %% # set seed adso.set_seed(1234) # %% # Download the dataset and select 1000 random elements data = adso.data.load_20newsgroups(split="test") new_data = [] for i in random.sample(range(len(data)), k=1000): new_data.append(data[i]) data = adso.data.LabelledDataset(new_data) print("Number of documents: ", len(data)) # %% # Tokenize the dataset using a stemmer and a stopwords list, removing punctation
import gc import dask from dask.distributed import Client import adso from adso.corpora import get_20newsgroups if __name__ == "__main__": adso.set_adso_dir(".test") adso.set_project_name("test") adso.set_seed(8686) dask.config.set({"temporary_directory": str(adso.common.ADSODIR / "dask")}) client = Client() gc.set_threshold(50, 10, 10) adso.data.common.nltk_download("punkt") def my_tokenizer(doc): return list( filter( lambda s: s.isalpha() and len(s) >= 3, adso.data.common.tokenize_and_stem(doc), ) ) try: dataset = adso.data.LabeledDataset.load(".test/test/20news")