def test_load_20ng():
    """Fetch 20NewsGroup twice: first from the network after clearing any
    cached copy, then from the freshly written cache, checking the expected
    corpus/label counts both times."""
    home = get_data_home(data_home=None)
    cached = _pkl_filepath(home, "20NewsGroup" + ".pkz")

    # Drop any stale cache so the first fetch really downloads the data.
    if os.path.exists(cached):
        os.remove(cached)

    fresh = Dataset()
    fresh.fetch_dataset("20NewsGroup")
    assert len(fresh.get_corpus()) == 16309
    assert len(fresh.get_labels()) == 16309
    # The fetch is expected to have written the cache file.
    assert os.path.exists(cached)

    # A second fetch should now be served from the cache and yield the
    # same corpus size.
    from_cache = Dataset()
    from_cache.fetch_dataset("20NewsGroup")
    assert len(from_cache.get_corpus()) == 16309
def _load_default_texts():
    """Load the default general texts.

    Returns
    -------
    result : default 20newsgroup texts
    """
    default_dataset = Dataset()
    default_dataset.fetch_dataset("20NewsGroup")
    return default_dataset.get_corpus()
def test_model_output_etm_not_partitioned(data_dir):
    """Train ETM with ``use_partitions=False`` and validate the output contract.

    Checks that the returned dict exposes 'topics' and the two matrices with
    the expected types and shapes, and that no test-set document matrix is
    produced when the dataset is not partitioned.

    Parameters
    ----------
    data_dir : str
        Root folder containing the custom 'M10' dataset (pytest fixture).
    """
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = ETM(num_topics=num_topics, num_epochs=5, use_partitions=False)
    output = model.train_model(dataset)

    # 'key in dict' is the idiomatic membership test (no .keys() needed).
    assert 'topics' in output
    assert 'topic-word-matrix' in output
    # No partitions -> no held-out test matrix should be emitted.
    assert 'test-topic-document-matrix' not in output

    # check topics format: a list with one entry per topic.
    # isinstance() instead of `type(x) == T` (handles subclasses, idiomatic).
    assert isinstance(output['topics'], list)
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format: (num_topics, vocab_size) ndarray
    assert isinstance(output['topic-word-matrix'], np.ndarray)
    assert output['topic-word-matrix'].shape == (
        num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format: (num_topics, num_docs) ndarray
    assert isinstance(output['topic-document-matrix'], np.ndarray)
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_corpus()))