Exemple #1
0
def test_load_20ng():
    data_home = get_data_home(data_home=None)
    cache_path = _pkl_filepath(data_home, "20NewsGroup" + ".pkz")
    if os.path.exists(cache_path):
        os.remove(cache_path)

    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    assert len(dataset.get_corpus()) == 16309
    assert len(dataset.get_labels()) == 16309
    assert os.path.exists(cache_path)

    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    assert len(dataset.get_corpus()) == 16309
Exemple #2
0
def _load_default_texts():
    """
    Loads default general texts

    Returns
    -------
    result : default 20newsgroup texts
    """
    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    return dataset.get_corpus()
Exemple #3
0
def test_model_output_etm_not_partitioned(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = ETM(num_topics=num_topics, num_epochs=5, use_partitions=False)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' not in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (num_topics, len(dataset.get_corpus()))