def test_five_components(): """ Assert that components computed are identical to the original version for n dimensions. """ n_components = 5 lda = LDA(n_components=n_components, number_of_documents=60, maximum_size_vocabulary=100, alpha_beta=100, alpha_theta=0.5, seed=42) components_list = [] for document in DOC_SET: tokens = {token: 1 for token in document.split(' ')} components_list.append(lda.fit_transform_one(tokens)) for index, component in enumerate(components_list): assert np.array_equal( a1=list(component.values()), a2=REFERENCE_FIVE_COMPONENTS[index], )
def test_extraction_words_ids(): """ Assert that inputs words are splitted. Assert that indexes are updated and extractable. """ lda = LDA(2, number_of_documents=5, seed=42) word_indexes_list = [] for doc in DOC_SET: words = doc.split(' ') lda._update_indexes(word_list=words) word_indexes_list.append([lda.word_to_index[word] for word in words]) assert word_indexes_list == [ [1, 2], [1, 3, 4], [1, 2, 5], [1, 3], [1, 2, 6], ]
def test_statistics_two_components(): """ Assert that online lda extracts waited statistics on current document. """ n_components = 2 lda = LDA(n_components, number_of_documents=60, seed=42) statistics_list = [] for doc in DOC_SET: word_list = doc.split(' ') lda._update_indexes(word_list=word_list) word_indexes = [lda.word_to_index[word] for word in word_list] statistics, _ = lda._compute_statistics_components( words_indexes_list=word_indexes, ) statistics_list.append(statistics) lda._update_weights(statistics=statistics) for index, statistics in enumerate(statistics_list): for component in range(n_components): assert np.array_equal( a1=statistics[component], a2=REFERENCE_STATISTICS_TWO_COMPONENTS[index][component], )
def test_fit_transform(): """ Assert that fit_one and transform_one methods returns waited ouput. """ lda = LDA(n_components=2, number_of_documents=60, vocab_prune_interval=2, maximum_size_vocabulary=3, seed=42) components_list = [] for document in DOC_SET: tokens = {token: 1 for token in document.split(' ')} lda = lda.fit_one(x=tokens) components_list.append(lda.transform_one(x=tokens)) for index, component in enumerate(components_list): assert np.array_equal(a1=list(component.values()), a2=REFERENCE_FIT_ONE_PREDICT_ONE[index])
def test_prunning_vocabulary(): """ Vocabulary prunning is available to improve accuracy and limit memory usage. You can perform vocabulary prunning with parameters vocab_prune_interval (int) and maximum_size_vocabulary (int). """ lda = LDA(n_components=2, number_of_documents=60, vocab_prune_interval=2, maximum_size_vocabulary=3, seed=42) components_list = [] for document in DOC_SET: tokens = {token: 1 for token in document.split(' ')} components_list.append(lda.fit_transform_one(tokens)) for index, component in enumerate(components_list): assert np.array_equal(a1=list(component.values()), a2=REFERENCE_COMPONENTS_WITH_PRUNNING[index])
def test_statistics_five_components():
    """Assert that online LDA extracts the expected statistics from the
    current document."""
    n_components = 5

    lda = LDA(
        n_components=n_components,
        number_of_documents=60,
        maximum_size_vocabulary=100,
        alpha_beta=100,
        alpha_theta=0.5,
        seed=42,
    )

    statistics_list = []

    for doc in DOC_SET:
        word_list = doc.split(' ')
        lda._update_indexes(word_list=word_list)
        word_indexes = [lda.word_to_index[word] for word in word_list]

        statistics, _ = lda._compute_statistics_components(
            words_indexes_list=word_indexes,
        )

        statistics_list.append(statistics)
        lda._update_weights(statistics=statistics)

    for index, statistics in enumerate(statistics_list):
        for component in range(n_components):
            assert np.array_equal(
                a1=statistics[component],
                a2=REFERENCE_STATISTICS_FIVE_COMPONENTS[index][component],
            )