def test_statistics_two_components():
    '''
    Check that online LDA with two components produces the expected
    statistics for every document.

    Each document of DOC_SET is turned into a word list with
    ``lda.tokenizer(lda.preprocess(lda._get_text(doc)))``, the indexes are
    updated, the statistics are computed from the word indexes, and the
    weights are updated with them. Once all documents have been processed,
    the collected statistics are compared component by component with
    REFERENCE_STATISTICS_TWO_COMPONENTS.
    '''
    n_components = 2
    # Seed NumPy's global RNG before building the model
    # (presumably so that the model's random state is reproducible).
    np.random.seed(42)
    lda = LDA(n_components, number_of_documents=60)

    collected = []
    for document in DOC_SET:
        tokens = lda.tokenizer(lda.preprocess(lda._get_text(document)))
        lda._update_indexes(word_list=tokens)
        token_ids = [lda.word_to_index[token] for token in tokens]

        # Only the first returned value is checked by this test.
        doc_statistics, _ = lda._compute_statistics_components(
            words_indexes_list=token_ids,
        )
        collected.append(doc_statistics)
        lda._update_weights(statistics=doc_statistics)

    # Comparison happens only after every document has been processed.
    for position, doc_statistics in enumerate(collected):
        expected = REFERENCE_STATISTICS_TWO_COMPONENTS[position]
        for component in range(n_components):
            assert np.array_equal(
                a1=doc_statistics[component],
                a2=expected[component],
            )
def test_extraction_words_ids():
    '''
    Check that input documents are split into words and that the word
    indexes are updated and can be looked up.

    For each document of DOC_SET, the word list is built, the indexes are
    updated with it, and every word is mapped through ``lda.word_to_index``.
    The resulting lists of indexes must match the expected ones exactly.
    '''
    # Seed NumPy's global RNG before building the model.
    np.random.seed(42)
    lda = LDA(2, number_of_documents=5)

    expected_indexes = [
        [1, 2],
        [1, 3, 4],
        [1, 2, 5],
        [1, 3],
        [1, 2, 6],
    ]

    extracted_indexes = []
    for document in DOC_SET:
        tokens = lda.tokenizer(lda.preprocess(lda._get_text(document)))
        lda._update_indexes(word_list=tokens)
        token_ids = [lda.word_to_index[token] for token in tokens]
        extracted_indexes.append(token_ids)

    assert extracted_indexes == expected_indexes