Example #1
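# Smoke test: point adso at a scratch directory, name the project, fix the
# RNG seed, then fetch the "wos" corpus (presumably Web of Science).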
def test_wos():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    get_wos("wos")
Example #2
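# Fit a Hierarchical Dirichlet Process topic model (variational Bayes) on two
# 20newsgroups categories; fit_transform also returns a 1-tuple, presumably
# holding the inferred number of topics.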
def test_HDPVB():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("HDPVB_20news",
                               categories=["sci.space", "rec.autos"])

    hdp = HDPVB()

    topic_model, (n, ) = hdp.fit_transform(dataset, "test_HDPVB")
Example #3
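# Fit a 2-topic LDA model via variational Bayes on the same two categories
# and hand back the dataset/model pair for downstream checks.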
def test_LDAVB():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("LDAVB_20news",
                               categories=["sci.space", "rec.autos"])

    lda = LDAVB(2)

    topic_model = lda.fit_transform(dataset, "test_LDAVB")

    return dataset, topic_model
Example #4
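# Fit a hierarchical stochastic block model; the second return value holds
# the number of hierarchy layers, and the assertion pins the expected NMI
# against the true labels.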
def test_hSBM():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("hSBM_20news",
                               categories=["sci.space", "rec.autos"])

    hsbm = hSBM()

    topic_model, (n_layer, ) = hsbm.fit_transform(dataset, "test_hSBM")

    assert round(NMI(dataset, topic_model), 5) == 0.1668
Example #5
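# Minimal end-to-end HDPVB run on a three-document toy corpus, using a plain
# whitespace tokenizer instead of the default vectorizer settings.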
def test_simple_HDPVB():
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    docs = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]

    dataset = data.LabeledDataset.from_iterator("HDPVB_simple_data",
                                                zip(labels, docs))

    dataset.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    hdp = HDPVB()

    topic_model, (n, ) = hdp.fit_transform(dataset, "test_simple_HDPVB")
Example #6
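# The same toy corpus as above, fitted with a 2-topic variational-Bayes LDA.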
def test_simple_LDAVB():
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    docs = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]

    dataset = data.LabeledDataset.from_iterator("LDAVB_simple_data",
                                                zip(labels, docs))

    dataset.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    lda = LDAVB(2)

    return lda.fit_transform(dataset, "test_simple_LDAVB")
Example #7
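# Fit a 2-topic non-negative matrix factorization; fit_transform also reports
# the iteration count and final reconstruction error, and the assertions pin
# both NMI and the confusion matrix.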
def test_NMF():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("NMF_20news",
                               categories=["sci.space", "rec.autos"])

    nmf = NMF(2)

    topic_model, (n_iter, error) = nmf.fit_transform(dataset, "test_NMF")

    assert round(NMI(dataset, topic_model), 5) == 0.00119
    assert (confusion_matrix(dataset,
                             topic_model).todense() == np.array([[617, 373],
                                                                 [653, 334]
                                                                 ])).all()
Example #8
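# Fit a TopicMapping model (presumably the community-detection-based
# algorithm of the same name); the assertions pin NMI and the resulting
# 2x16 confusion matrix.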
def test_TM():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("TM_20news",
                               categories=["sci.space", "rec.autos"])

    tm = TopicMapping()

    topic_model, (n, ) = tm.fit_transform(dataset, "test_TM")

    assert round(NMI(dataset, topic_model), 5) == 0.16266
    assert (confusion_matrix(dataset, topic_model).todense() == np.array([
        [18, 12, 45, 7, 294, 21, 71, 26, 6, 28, 92, 115, 86, 81, 4, 84],
        [162, 105, 58, 73, 2, 39, 79, 52, 44, 81, 36, 53, 54, 6, 130, 13],
    ])).all()
Example #9
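# Fit LDA with Gibbs sampling through the MALLET backend; extra MALLET
# command-line options are forwarded via mallet_args.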
def test_LDAGS():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups(
        "LDAGS_20news",
        categories=[
            "sci.space",
            "rec.autos",
        ],
    )

    lda = LDAGS(
        2,
        mallet_args={"optimize-interval": 20},
    )

    lda.fit_transform(dataset, "test_LDAGS")
Example #10
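# Fit a 2-topic PLSA model capped at 5 iterations; the near-zero NMI pinned
# below shows that such a short run barely separates the two categories.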
def test_PLSA():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("PLSA_20news",
                               categories=["sci.space", "rec.autos"])

    plsa = PLSA(2, max_iter=5)

    topic_model = plsa.fit_transform(dataset, "test_PLSA")

    assert round(NMI(dataset, topic_model), 5) == 0.00032
    assert (confusion_matrix(dataset,
                             topic_model).todense() == np.array([[521, 469],
                                                                 [540, 447]
                                                                 ])).all()

    return dataset, topic_model
Example #11
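# Embed the documents in 2 dimensions with UMAP (Hellinger metric), then
# cluster the embedding with HDBSCAN.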
def test_UMAP_HDBSCAN():

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dataset = get_20newsgroups("UH_20news",
                               categories=["sci.space", "rec.autos"])

    u_args = {
        "n_components": 2,
        "n_neighbors": 15,
        "min_dist": 0.1,
        "metric": "hellinger",
    }
    model = UMAP_HDBSCAN(u_args=u_args)

    topic_model = model.fit_transform(dataset, "test_simple_UMAP_HDBSCAN")

    assert round(NMI(dataset, topic_model), 5) == 0.24392

    return dataset, topic_model
Example #12
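# TopicMapping on the toy corpus (with p=1); on this tiny input it recovers
# the labels exactly, hence the NMI of 1.0 asserted below.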
def test_simple_TM():
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    docs = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]

    dataset = data.LabeledDataset.from_iterator("TM_simple_data",
                                                zip(labels, docs))

    dataset.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    tm = TopicMapping(p=1)

    topic_model, (n, ) = tm.fit_transform(dataset, "test_simple_TM")

    assert round(NMI(dataset, topic_model), 5) == 1.0
    assert (confusion_matrix(dataset,
                             topic_model).todense() == np.array([[2, 0],
                                                                 [0,
                                                                  1]])).all()
Example #13
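# MALLET-backed LDA on the toy corpus, with an explicit memory limit
# (presumably the JVM heap for MALLET) and extra MALLET options.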
def test_simple_LDAGS():
    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    docs = ["A A B C D", "B B B A C", "E F E"]
    labels = ["1", "1", "2"]

    dataset = data.LabeledDataset.from_iterator("LDAGS_simple_data",
                                                zip(labels, docs))

    dataset.set_vectorizer_params(tokenizer=lambda s: s.split(" "))

    lda = LDAGS(
        2,
        memory="512M",
        mallet_args={
            "num-iterations": 1000,
            "optimize-interval": 20,
        },
    )

    lda.fit_transform(dataset, "test_simple_LDAGS")
Example #14
"""
Analyze the 20newsgroups dataset with LDA
=========================================
"""

# %%
# import
import random

import adso
import matplotlib.pyplot as plt
import nltk
import numpy as np

# %%
# set seed
adso.set_seed(1234)

# %%
# Download the dataset and select 1000 random elements
data = adso.data.load_20newsgroups(split="test")

new_data = []
for i in random.sample(range(len(data)), k=1000):
    new_data.append(data[i])
data = adso.data.LabelledDataset(new_data)

print("Number of documents: ", len(data))

# %%
# Tokenize the dataset using a stemmer and a stopwords list, removing punctuation
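# %%
# The example is truncated at this point; the sketch below is one plausible
# implementation of the step described above (an assumption, not from the
# original), combining NLTK's Snowball stemmer with its English stopword list.
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download("punkt")
nltk.download("stopwords")

stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))

def tokenize(doc):
    # keep alphabetic, non-stopword tokens and reduce them to their stems
    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(doc.lower())
        if token.isalpha() and token not in stop_words
    ]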
Example #15
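# End-to-end script: run adso on top of a Dask distributed cluster, keep
# Dask's scratch space under ADSODIR, and reload a cached dataset when one
# is available.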
import gc

import dask
from dask.distributed import Client

import adso
from adso.corpora import get_20newsgroups

if __name__ == "__main__":

    adso.set_adso_dir(".test")
    adso.set_project_name("test")
    adso.set_seed(8686)

    dask.config.set({"temporary_directory": str(adso.common.ADSODIR / "dask")})
    client = Client()

    gc.set_threshold(50, 10, 10)

    adso.data.common.nltk_download("punkt")

    def my_tokenizer(doc):
        return list(
            filter(
                lambda s: s.isalpha() and len(s) >= 3,
                adso.data.common.tokenize_and_stem(doc),
            )
        )

    try:
        dataset = adso.data.LabeledDataset.load(".test/test/20news")
    # The original snippet is truncated here; the fallback below is an
    # assumption, rebuilding the dataset when no cached copy is found.
    except FileNotFoundError:
        dataset = get_20newsgroups("20news", categories=["sci.space", "rec.autos"])