def test_func():
    """Export an initialized Phi matrix to disk and re-import it.

    Verifies that the token list and the number of topics survive the
    export/import round-trip into a fresh master component.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'
    num_topics = 10
    # NOTE: removed unused constants num_document_passes / num_outer_iterations —
    # this test never processes batches, it only exports and imports the model.

    batches_folder = tempfile.mkdtemp()
    model_filename = os.path.join(batches_folder, str(uuid.uuid1()))
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
                                 'docword_file_path': os.path.join(data_path, docword),
                                 'vocab_file_path': os.path.join(data_path, vocab),
                                 'target_folder': batches_folder})

        # Create master component
        master = mc.MasterComponent(lib)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Initialize model
        master.initialize_model(model_name=pwt,
                                topic_names=['topic_{}'.format(i) for i in range(num_topics)],
                                dictionary_name=dictionary_name)
        phi_matrix_info = master.get_phi_info(model=pwt)

        # Export initialized model
        master.export_model(pwt, model_filename)

        # Create new master component
        master_new = mc.MasterComponent(lib)

        # Import model into new master component
        master_new.import_model(pwt, model_filename)
        phi_matrix_info_new = master_new.get_phi_info(model=pwt)

        # The imported model must match the exported one exactly
        assert phi_matrix_info.token == phi_matrix_info_new.token
        assert phi_matrix_info_new.num_topics == num_topics

        print_string = 'Number of topic in new model is'
        print_string += ' {0} and number of tokens is {1}'.format(phi_matrix_info_new.num_topics,
                                                                  len(phi_matrix_info.token))
        print(print_string)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Gather a dictionary, filter it by document frequency and fit Phi on it."""
    # Collection constants and the expected size of the filtered vocabulary
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'
    num_topics = 10
    num_tokens = 3501

    batches_folder = tempfile.mkdtemp()
    try:
        # Low-level library handle
        lib = artm.wrapper.LibArtm()

        # Convert the UCI bag-of-words files into batches on disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Master component on top of the library handle
        master = mc.MasterComponent(lib)

        # Gather the collection dictionary from the batches
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Drop too-rare / too-frequent tokens into a derived dictionary
        filtered_dictionary_name = dictionary_name + '__'
        master.filter_dictionary(dictionary_name=dictionary_name,
                                 dictionary_target_name=filtered_dictionary_name,
                                 max_df=500,
                                 min_df=20)

        # Seed the topic model from the filtered dictionary
        topic_names = ['topic_{}'.format(topic_id) for topic_id in range(num_topics)]
        master.initialize_model(model_name=pwt,
                                topic_names=topic_names,
                                dictionary_name=filtered_dictionary_name)

        # Pull the Phi matrix back out: token count must match the filtered
        # vocabulary and every entry must be non-zero after initialization
        info = master.get_phi_info(model=pwt)
        _, matrix = master.get_phi_matrix(model=pwt)
        assert len(info.token) == num_tokens
        assert numpy.count_nonzero(matrix) == matrix.size

        print('Number of tokens in Phi matrix = {0}'.format(len(info.token)))
        print(matrix)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Offline fit with smooth/sparse and decorrelator regularizers.

    Checks perplexity and Phi/Theta sparsity against reference values on
    every collection pass, then prints the top tokens of each topic.
    """
    # Set some constants
    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    rwt = 'rwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    smsp_phi_tau = -0.2
    smsp_theta_tau = -0.1
    decor_phi_tau = 1000000

    num_topics = 10
    num_inner_iterations = 10
    num_outer_iterations = 8

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6703.161, 1: 2426.277, 2: 2276.476, 3: 1814.072,
        4: 1742.911, 5: 1637.142, 6: 1612.946, 7: 1581.725
    }
    sparsity_tol = 0.001
    expected_phi_sparsity_value_on_iteration = {
        0: 0.059, 1: 0.120, 2: 0.212, 3: 0.306,
        4: 0.380, 5: 0.438, 6: 0.483, 7: 0.516
    }
    expected_theta_sparsity_value_on_iteration = {
        0: 0.009, 1: 0.036, 2: 0.146, 3: 0.239,
        4: 0.278, 5: 0.301, 6: 0.315, 7: 0.319
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_Format_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'SparsityPhi': messages.SparsityPhiScoreConfig()
        }
        master = mc.MasterComponent(lib, scores=scores)
        master.create_score('SparsityTheta', messages.SparsityThetaScoreConfig())
        master.create_score('TopTokens', messages.TopTokensScoreConfig())

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Configure basic regularizers (actual taus are supplied per pass below)
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)
        master.create_regularizer(name='DecorrelatorPhi',
                                  config=messages.DecorrelatorPhiConfig(),
                                  tau=0.0)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt=pwt,
                                   nwt=nwt,
                                   num_inner_iterations=num_inner_iterations,
                                   batches_folder=batches_folder,
                                   regularizer_name=['SmoothSparseTheta'],
                                   regularizer_tau=[smsp_theta_tau])
        master.regularize_model(pwt, nwt, rwt,
                                    ['SmoothSparsePhi', 'DecorrelatorPhi'],
                                    [smsp_phi_tau, decor_phi_tau]) if False else None
            # (see note) -- regularize/normalize belong to each pass:
        for iteration in range(0):
            pass
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Compare document-model and collection-model (unigram) perplexity.

    Fits a regularized model for several passes and checks both perplexity
    variants plus the zero-word count against per-iteration reference values.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    rwt = 'rwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8
    smsp_phi_tau = -20.0
    smsp_theta_tau = -3.0

    perplexity_tol = 1.0
    expected_perp_col_value_on_iteration = {
        0: 6650.1, 1: 2300.2, 2: 1996.8, 3: 1786.1,
        4: 1692.7, 5: 1644.4, 6: 1612.3, 7: 1589.5
    }
    expected_perp_doc_value_on_iteration = {
        0: 6614.6, 1: 2295.0, 2: 1996.4, 3: 1786.1,
        4: 1692.7, 5: 1644.2, 6: 1611.7, 7: 1588.6
    }
    expected_perp_zero_words_on_iteration = {
        0: 494, 1: 210, 2: 24, 3: 0, 4: 2, 5: 10, 6: 28, 7: 47
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores: one score per perplexity flavour
        perplexity_config = messages.PerplexityScoreConfig()
        perplexity_config.model_type = constants.PerplexityScoreConfig_Type_UnigramCollectionModel
        perplexity_config.dictionary_name = dictionary_name
        scores = {
            'PerplexityDoc': messages.PerplexityScoreConfig(),
            'PerplexityCol': perplexity_config
        }
        master = mc.MasterComponent(lib, scores=scores)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Configure basic regularizers
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(
                                      dictionary_name=dictionary_name),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt=pwt,
                                   nwt=nwt,
                                   num_document_passes=num_document_passes,
                                   batches_folder=batches_folder,
                                   regularizer_name=['SmoothSparseTheta'],
                                   regularizer_tau=[smsp_theta_tau])
            master.regularize_model(pwt, nwt, rwt, ['SmoothSparsePhi'],
                                    [smsp_phi_tau])
            master.normalize_model(pwt, nwt, rwt)

            # Retrieve perplexity score
            perplexity_doc_score = master.get_score('PerplexityDoc')
            perplexity_col_score = master.get_score('PerplexityCol')

            # Assert and print scores
            string = 'Iter#{0}'.format(iteration)
            string += ': Collection perp. = {0:.1f}'.format(perplexity_col_score.value)
            string += ', Document perp. = {0:.1f}'.format(perplexity_doc_score.value)
            string += ', Zero words = {0}'.format(perplexity_doc_score.zero_words)
            print(string)
            print(perplexity_col_score.value,
                  expected_perp_col_value_on_iteration[iteration])

            assert abs(perplexity_col_score.value -
                       expected_perp_col_value_on_iteration[iteration]) < perplexity_tol
            assert abs(perplexity_doc_score.value -
                       expected_perp_doc_value_on_iteration[iteration]) < perplexity_tol
            assert perplexity_doc_score.zero_words - \
                expected_perp_zero_words_on_iteration[iteration] == 0
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Attach to the Pwt matrix, zero one topic column and verify Theta.

    After zeroing column `index_to_zero` of the attached Pwt matrix, every
    Theta snippet value for that topic must stay (numerically) zero.
    """
    # Set some constants
    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'
    num_topics = 10
    num_inner_iterations = 1
    num_outer_iterations = 5
    index_to_zero = 4
    zero_tol = 1e-37

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_Format_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores
        scores = {'ThetaSnippet': messages.ThetaSnippetScoreConfig()}
        master = mc.MasterComponent(lib, scores=scores)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        # Attach Pwt matrix and zero out one topic in-place
        topic_model, numpy_matrix = master.attach_model(pwt)
        numpy_matrix[:, index_to_zero] = 0

        # Perform iterations
        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            master.clear_score_cache()
            master.process_batches(pwt, nwt, num_inner_iterations, batches_folder)
            master.normalize_model(pwt, nwt)

        theta_snippet_score = master.get_score('ThetaSnippet')

        print('ThetaSnippetScore.')
        # Note that 5th topic is fully zero; this is because we performed "numpy_matrix[:, 4] = 0".
        snippet_tuples = zip(theta_snippet_score.values, theta_snippet_score.item_id)
        print_string = ''
        for values, item_id in snippet_tuples:
            print_string += 'Item# {0}:\t'.format(item_id)
            for index, value in enumerate(values.value):
                if index == index_to_zero:
                    assert value < zero_tol
                print_string += '{0:.3f}\t'.format(value)
            print(print_string)
            print_string = ''
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Exercise the different ways of retrieving the Theta matrix.

    Option 1: ThetaSnippet score; Option 2: full cached Theta; Option 3:
    Theta retrieved online per batch group; Option 4: Theta for an
    explicitly loaded held-out batch.
    """
    # Set some constants
    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'
    num_topics = 8
    num_outer_iterations = 2
    num_inner_iterations = 1
    theta_value = 0.1
    theta_tol = 0.1
    num_items = [1000, 430]
    pair_num_items = [1430, 2000]
    total_num_items = 3430

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_Format_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores; cache_theta enables Options 2/3
        scores = {'ThetaSnippetScore': messages.ThetaSnippetScoreConfig()}
        master = mc.MasterComponent(lib, scores=scores, cache_theta=True)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            # Invoke one scan of the collection and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt, nwt, num_inner_iterations, batches_folder)
            master.normalize_model(pwt, nwt)

        # Option 1.
        # Getting a small snippet of ThetaMatrix for last processed documents (just to get an impression how it looks)
        # This may be useful if you are debugging some weird behavior, playing with regularizer weights, etc.
        # This does not require 'master.config().cache_theta = True'
        theta_snippet_score = master.get_score('ThetaSnippetScore')

        print('Option 1. ThetaSnippetScore.')
        snippet_tuples = zip(theta_snippet_score.values, theta_snippet_score.item_id)
        print_string = ''
        for values, item_id in snippet_tuples:
            print_string += 'Item# {0}:\t'.format(item_id)
            for value in values.value:
                print_string += '{0:.3f}\t'.format(value)
                assert (abs(value - theta_value) < theta_tol)
            print(print_string)
            print_string = ''

        # Option 2.
        # Getting a full theta matrix cached during last iteration
        # This does requires "master_component.cache_theta = True" and stores the entire Theta matrix in memory.
        theta_matrix_info = master.get_theta_info()
        _, theta_numpy_matrix = master.get_theta_matrix()
        master.clear_theta_cache()
        print_string = 'Option 2. Full ThetaMatrix cached during last iteration,'
        print_string += '#items = {0}'.format(len(theta_matrix_info.item_id))
        print(print_string)
        print(theta_numpy_matrix)
        assert numpy.count_nonzero(theta_numpy_matrix) == theta_numpy_matrix.size
        assert len(theta_matrix_info.item_id) == total_num_items

        # Option 3.
        # Getting theta matrix online during iteration.
        # This does requires "master_component.cache_theta = True", but never caches the entire Theta
        # because we clean it.
        # This is the best alternative to Option 2 if you can not afford caching entire ThetaMatrix in memory.
        batches = []
        for name in os.listdir(batches_folder):
            _, extension = os.path.splitext(name)
            if extension == '.batch':
                batches.append(os.path.join(batches_folder, name))

        for batch_index, batch_filename in enumerate(batches):
            master.clear_score_cache()
            master.process_batches(pwt, nwt, num_inner_iterations,
                                   batches=[batch_filename])
            master.normalize_model(pwt, nwt)
            # The following rule defines when to retrieve Theta matrix. You decide :)
            if ((batch_index + 1) % 2 == 0) or ((batch_index + 1) == len(batches)):
                theta_matrix_info = master.get_theta_info()
                _, theta_numpy_matrix = master.get_theta_matrix()
                master.clear_theta_cache()
                print('Option 3. ThetaMatrix from cache, online, #items = {0}'.format(
                    len(theta_matrix_info.item_id)))
                print(theta_numpy_matrix)
                assert numpy.count_nonzero(theta_numpy_matrix) == theta_numpy_matrix.size
                assert len(theta_matrix_info.item_id) in pair_num_items

        # Option 4.
        # Testing batches by explicitly loading them from disk. This is the right way of testing held-out batches.
        master.clear_score_cache()
        info, matrix = master.process_batches(pwt=pwt,
                                              nwt=nwt,
                                              num_inner_iterations=1,
                                              batches=[batches[0]],
                                              find_theta=True)
        print('Option 4. ThetaMatrix for test batch, #item {0}'.format(
            len(info.item_id)))
        assert numpy.count_nonzero(matrix) == matrix.size
        assert len(info.item_id) in num_items
        print(matrix)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Online fitting via merge_model with decay/apply weights.

    Processes batch groups, merges n_wt counters with decay/apply weights,
    and checks the first perplexity value and per-update batch counts.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    nwt_hat = 'nwt_hat'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8
    num_processors = 2
    decay_weight = 0.7
    apply_weight = 0.3
    num_batches = 2
    top_tokens_value = 0.5
    top_tokens_tol = 0.5
    # Any of these values is acceptable for the first perplexity measurement
    # (it depends on non-deterministic processing order).
    perplexity_first_value = {6714.673, 6710.324, 6706.906, 6710.120,
                              6710.327, 6717.755, 6717.757, 6698.847,
                              6714.667, 6698.852, 6706.903}

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
                                 'docword_file_path': os.path.join(data_path, docword),
                                 'vocab_file_path': os.path.join(data_path, vocab),
                                 'target_folder': batches_folder})

        # Create master component and scores
        scores = {'Perplexity': messages.PerplexityScoreConfig(),
                  'TopTokens': messages.TopTokensScoreConfig()}
        master = mc.MasterComponent(lib, num_processors=num_processors, scores=scores)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Initialize model
        master.initialize_model(model_name=pwt,
                                topic_names=['topic_{}'.format(i) for i in range(num_topics)],
                                dictionary_name=dictionary_name)

        # Get file names of batches to process
        batches = []
        for name in os.listdir(batches_folder):
            _, extension = os.path.splitext(name)
            if extension == '.batch':
                batches.append(os.path.join(batches_folder, name))

        # Perform iterations: accumulate nwt_hat over `update_every` batches,
        # then fold it into nwt with the decay/apply weights.
        update_every = num_processors
        batches_to_process = []
        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            for batch_index, batch_filename in enumerate(batches):
                batches_to_process.append(batch_filename)
                if ((batch_index + 1) % update_every == 0) or ((batch_index + 1) == len(batches)):
                    master.clear_score_cache()
                    master.process_batches(pwt, nwt_hat, num_document_passes,
                                           batches=batches_to_process)
                    master.merge_model({nwt: decay_weight, nwt_hat: apply_weight}, nwt=nwt)
                    master.normalize_model(pwt, nwt)

                    # Retrieve and print perplexity score
                    perplexity_score = master.get_score('Perplexity')
                    if iteration == 0 and batch_index == 0:
                        assert perplexity_score.value in perplexity_first_value
                    assert len(batches_to_process) == num_batches

                    print_string = 'Iteration = {0},'.format(iteration)
                    print_string += 'Perplexity = {0:.3f}'.format(perplexity_score.value)
                    print_string += ', num batches = {0}'.format(len(batches_to_process))
                    print(print_string)
                    batches_to_process = []

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')
        print('Top tokens per topic:')
        top_tokens_triplets = zip(top_tokens_score.topic_index,
                                  zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(top_tokens_triplets,
                                                    key=lambda triplet: triplet[0]):
            print_string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                print_string += ' {0}({1:.3f})'.format(token, weight)
                assert abs(weight - top_tokens_value) < top_tokens_tol
            print(print_string)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Time identical fits under different processor counts.

    The perplexity trajectory must be identical for every processor count;
    only the wall-clock time per iteration is expected to differ.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'
    num_processors_list = [4, 2, 1]
    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 5

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6710.208, 1: 2434.135, 2: 2202.418, 3: 1936.493, 4: 1774.600
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        for num_processors in num_processors_list:
            # Create master component and scores.
            # FIX: the original never forwarded num_processors, so every pass
            # of this loop ran with the default processor count (cf. the
            # merge_model test, which passes num_processors=...).
            scores = {'PerplexityScore': messages.PerplexityScoreConfig()}
            master = mc.MasterComponent(lib, scores=scores,
                                        num_processors=num_processors)

            # Create collection dictionary and import it
            master.gather_dictionary(dictionary_target_name=dictionary_name,
                                     data_path=batches_folder,
                                     vocab_file_path=os.path.join(data_path, vocab))

            # Initialize model
            master.initialize_model(
                model_name=pwt,
                topic_names=['topic_{}'.format(i) for i in range(num_topics)],
                dictionary_name=dictionary_name)

            times = []
            for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
                start = time.time()

                # Invoke one scan of the collection and normalize Phi
                master.clear_score_cache()
                master.process_batches(pwt, nwt, num_document_passes, batches_folder)
                master.normalize_model(pwt, nwt)

                # Retrieve and print perplexity score
                perplexity_score = master.get_score('PerplexityScore')
                end = time.time()

                assert abs(expected_perplexity_value_on_iteration[iteration] -
                           perplexity_score.value) < perplexity_tol

                times.append(end - start)
                string = 'Iter#{0}'.format(iteration)
                string += ': Perplexity = {0:.3f}, Time = {1:.3f}'.format(
                    perplexity_score.value, end - start)
                print(string)
            print('Average time per iteration = {0:.3f}'.format(
                float(sum(times)) / len(times)))
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Fit a regularized model offline, then continue online.

    Checks perplexity and Phi/Theta sparsity after each offline pass and
    after the online passes, and verifies get_score_array lengths.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    smsp_phi_tau = -0.2
    smsp_theta_tau = -0.1
    decor_phi_tau = 1000000

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6703.161, 1: 2426.277, 2: 2276.476, 3: 1814.072,
        4: 1742.911, 5: 1637.142, 6: 1612.946, 7: 1581.725
    }
    sparsity_tol = 0.001
    expected_phi_sparsity_value_on_iteration = {
        0: 0.059, 1: 0.120, 2: 0.212, 3: 0.306,
        4: 0.380, 5: 0.438, 6: 0.483, 7: 0.516
    }
    expected_theta_sparsity_value_on_iteration = {
        0: 0.009, 1: 0.036, 2: 0.146, 3: 0.239,
        4: 0.278, 5: 0.301, 6: 0.315, 7: 0.319
    }
    expected_perplexity_value_online = 1572.268
    expected_phi_sparsity_value_online = 0.528
    expected_theta_sparsity_value_online = 0.320

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'SparsityPhi': messages.SparsityPhiScoreConfig()
        }
        master = mc.MasterComponent(lib, scores=scores,
                                    num_document_passes=num_document_passes)
        master.create_score('SparsityTheta', messages.SparsityThetaScoreConfig())
        master.create_score('TopTokens', messages.TopTokensScoreConfig())

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Configure basic regularizers; smooth/sparse taus are set afterwards
        # via reconfigure_regularizer to exercise that code path.
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)
        master.create_regularizer(name='DecorrelatorPhi',
                                  config=messages.DecorrelatorPhiConfig(),
                                  tau=decor_phi_tau)
        master.reconfigure_regularizer(name='SmoothSparsePhi', tau=smsp_phi_tau)
        master.reconfigure_regularizer(name='SmoothSparseTheta', tau=smsp_theta_tau)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            master.fit_offline(batches_folder=batches_folder, num_collection_passes=1)

            # Retrieve scores
            perplexity_score = master.get_score('Perplexity')
            sparsity_phi_score = master.get_score('SparsityPhi')
            sparsity_theta_score = master.get_score('SparsityTheta')

            # Assert and print scores
            print_string = 'Iter#{0}'.format(iteration)
            print_string += ': Perplexity = {0:.3f}'.format(perplexity_score.value)
            print_string += ', Phi sparsity = {0:.3f}'.format(sparsity_phi_score.value)
            print_string += ', Theta sparsity = {0:.3f}'.format(sparsity_theta_score.value)
            print(print_string)

            assert abs(perplexity_score.value -
                       expected_perplexity_value_on_iteration[iteration]) < perplexity_tol
            assert abs(sparsity_phi_score.value -
                       expected_phi_sparsity_value_on_iteration[iteration]) < sparsity_tol
            assert abs(sparsity_theta_score.value -
                       expected_theta_sparsity_value_on_iteration[iteration]) < sparsity_tol

            perplexity_scores = master.get_score_array('Perplexity')
            assert len(perplexity_scores) == (iteration + 1)

        # proceed one online iteration
        batch_filenames = glob.glob(os.path.join(batches_folder, '*.batch'))
        master.fit_online(batch_filenames=batch_filenames,
                          update_after=[4],
                          apply_weight=[0.5],
                          decay_weight=[0.5])

        # Retrieve scores
        perplexity_score = master.get_score('Perplexity')
        sparsity_phi_score = master.get_score('SparsityPhi')
        sparsity_theta_score = master.get_score('SparsityTheta')

        # Assert and print scores
        print_string = 'Iter Online'
        print_string += ': Perplexity = {0:.3f}'.format(perplexity_score.value)
        print_string += ', Phi sparsity = {0:.3f}'.format(sparsity_phi_score.value)
        print_string += ', Theta sparsity = {0:.3f}'.format(sparsity_theta_score.value)
        print(print_string)

        assert abs(perplexity_score.value -
                   expected_perplexity_value_online) < perplexity_tol
        assert abs(sparsity_phi_score.value -
                   expected_phi_sparsity_value_online) < sparsity_tol
        assert abs(sparsity_theta_score.value -
                   expected_theta_sparsity_value_online) < sparsity_tol

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')
        print('Top tokens per topic:')
        top_tokens_triplets = zip(top_tokens_score.topic_index,
                                  zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(top_tokens_triplets,
                                                    key=lambda triplet: triplet[0]):
            print_string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                print_string += ' {0}({1:.3f})'.format(token, weight)
            print(print_string)

        master.clear_score_array_cache()
        master.fit_online(batch_filenames=batch_filenames,
                          update_after=[1, 2, 3, 4],
                          apply_weight=[0.5, 0.5, 0.5, 0.5],
                          decay_weight=[0.5, 0.5, 0.5, 0.5])

        perplexity_scores = master.get_score_array('Perplexity')
        assert len(perplexity_scores) == 4
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Fit a model on a small synthetic batch and verify scores.

    Generates a 60-token / 100-item collection with 40 topic-bearing tokens
    and 20 background tokens, then checks perplexity per iteration and the
    weight of every top token.
    """
    # Set some constants
    num_tokens = 60
    num_items = 100
    pwt = 'pwt'
    nwt = 'nwt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 10
    num_top_tokens = 4

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 54.616, 1: 38.472, 2: 28.655, 3: 24.362, 4: 22.355,
        5: 21.137, 6: 20.808, 7: 20.791, 8: 20.746, 9: 20.581
    }
    top_tokens_tol = 0.05
    expected_top_tokens_weight = 0.1
    dictionary_name = 'dictionary'

    batches_folder = tempfile.mkdtemp()
    try:
        # Generate small collection
        batch = messages.Batch()
        batch.id = str(uuid.uuid4())
        for token_id in range(num_tokens):
            batch.token.append('token_{0}'.format(token_id))

        for item_id in range(num_items):
            item = batch.item.add()
            item.id = item_id
            for token_id in range(num_tokens):
                item.token_id.append(token_id)
                # tokens >= 40 are "background" with small varying counts;
                # tokens < 40 carry the topic signal when their id matches
                # the item's topic (mod 10)
                background_count = ((item_id + token_id) % 5 + 1) if (token_id >= 40) else 0
                target_topics = num_topics if (token_id < 40) and (
                    (token_id % 10) == (item_id % 10)) else 0
                item.token_weight.append(background_count + target_topics)

        # Create the instance of low-level API
        lib = artm.wrapper.LibArtm()

        # Save batch on the disk
        lib.ArtmSaveBatch(batches_folder, batch)

        # Create master component and scores
        scores = {
            'PerplexityScore': messages.PerplexityScoreConfig(),
            'TopTokensScore': messages.TopTokensScoreConfig(num_tokens=num_top_tokens)
        }
        master = mc.MasterComponent(lib, scores=scores)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            # Invoke one scan of the collection and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt, nwt, num_document_passes, batches_folder)
            master.normalize_model(pwt, nwt)

            # Retrieve and print perplexity score
            perplexity_score = master.get_score('PerplexityScore')
            assert abs(perplexity_score.value -
                       expected_perplexity_value_on_iteration[iteration]) < perplexity_tol
            print('Iteration#{0} : Perplexity = {1:.3f}'.format(
                iteration, perplexity_score.value))

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokensScore')
        print('Top tokens per topic:')
        top_tokens_triplets = zip(top_tokens_score.topic_index,
                                  zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(top_tokens_triplets,
                                                    key=lambda triplet: triplet[0]):
            string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                assert abs(weight - expected_top_tokens_weight) < top_tokens_tol
                string += ' {0}({1:.3f})'.format(token, weight)
            print(string)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Fit a two-modality (@russian/@english) model on an in-memory batch.

    Builds the batch and the BigARTM dictionary by hand from parallel
    russian/english phrases, fits a two-topic model and checks the top
    tokens and Phi sparsity per modality.
    """
    # Set some constants
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    num_topics = 2
    num_inner_iterations = 10
    num_outer_iterations = 10

    russian_class_weight = 1.0
    english_class_weight = 1.0
    russian_class = '@russian'
    english_class = '@english'

    tolerance = 0.001
    expected_values_rus_topic = {
        0: {
            u'документ': 0.125, u'текст': 0.125, u'анализ': 0.125,
            u'статистический': 0.125, u'модель': 0.125, u'коллекция': 0.083,
            u'тематическая': 0.083, 'model': 0.042, 'topic': 0.042,
            'artm': 0.042
        },
        1: {
            u'ногие': 0.115, u'отряд': 0.115, u'млекопитающие': 0.115,
            u'семейство': 0.115, u'хищный': 0.077, u'ласто': 0.077,
            u'моржовых': 0.077, u'тюлень': 0.077, u'ушастый': 0.077,
            u'коротко': 0.038
        }
    }
    expected_values_eng_topic = {
        0: {
            'model': 0.167, 'text': 0.125, 'analysis': 0.125,
            'statistical': 0.125, 'topic': 0.125, 'artm': 0.083,
            'plsa': 0.083, 'lda': 0.083, 'collection': 0.083, 'not': 0.000
        },
        1: {
            'mammal': 0.188, 'predatory': 0.125, 'eared': 0.125,
            'marine': 0.125, 'seal': 0.125, 'not': 0.062, 'reptile': 0.062,
            'crocodilia': 0.062, 'order': 0.062, 'pinnipeds': 0.062
        }
    }
    expected_sparsity_values = {'russian': 0.5, 'english': 0.5}

    # Prepare multimodal data
    ens = []
    rus = []
    ens.append(u'Topic model statistical analysis text collection LDA PLSA ARTM')
    rus.append(u'Тематическая модель статистический анализ текст коллекция')
    ens.append(u'LDA statistical topic model text collection')
    rus.append(u'LDA статистический тематическая модель текст документ коллекция')
    ens.append(u'PLSA statistical analysis text model')
    rus.append(u'PLSA статистический анализ документ текст модель')
    ens.append(u'ARTM analysis topic model')
    rus.append(u'ARTM анализ документ topic model')
    ens.append(u'Pinnipeds seal marine mammal order')
    rus.append(u'Тюлень семейство млекопитающие моржовых отряд ласто ногие')
    ens.append(u'Eared seal marine predatory mammal')
    rus.append(u'Ушастый тюлень семейство млекопитающие отряд хищный семейство моржовых ласто ногие')
    ens.append(u'Eared Crocodilia predatory reptile not mammal')
    rus.append(u'Ушастый крокодил гена отряд хищный не млекопитающие коротко ногие')

    ru_dic = {}  # mapping from russian token to its index in batch.token list
    en_dic = {}  # mapping from english token to its index in batch.token list
    batch = messages.Batch()  # batch representing the entire collection
    batch.id = str(uuid.uuid1())
    dict_data = messages.DictionaryData()  # BigARTM dictionary to initialize model
    dict_data.name = dictionary_name

    def append(tokens, dic, item, class_id):
        # Register unseen tokens in batch/dictionary and add each token to the item.
        for token in tokens:
            if token not in dic:  # New token discovered:
                dic[token] = len(batch.token)  # 1. update ru_dic or en_dic
                batch.token.append(token)  # 2. update batch.token and batch.class_id
                batch.class_id.append(class_id)
                dict_data.token.append(token)
                dict_data.class_id.append(class_id)

            # Add token to the item.
            item.token_id.append(dic[token])
            # replace '1' with the actual number of token occupancies in the item
            item.token_weight.append(1)

    # Iterate through all items and populate the batch
    for (en, ru) in zip(ens, rus):
        next_item = batch.item.add()
        next_item.id = len(batch.item) - 1
        append(ru.lower().split(), ru_dic, next_item, russian_class)
        append(en.lower().split(), en_dic, next_item, english_class)

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Save batch and dictionary on the disk
        lib.ArtmSaveBatch(batches_folder, batch)

        # Create master component and scores (one pair per modality)
        scores = {
            'SparsityPhiRus': messages.SparsityPhiScoreConfig(class_id=russian_class),
            'SparsityPhiEng': messages.SparsityPhiScoreConfig(class_id=english_class),
            'TopTokensRus': messages.TopTokensScoreConfig(class_id=russian_class),
            'TopTokensEng': messages.TopTokensScoreConfig(class_id=english_class)
        }
        master = mc.MasterComponent(lib, scores=scores)

        # Create the collection dictionary
        lib.ArtmCreateDictionary(master.master_id, dict_data)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):  # renamed from 'iter' (shadowed builtin)
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(
                pwt, nwt, num_inner_iterations, batches_folder,
                class_ids=[russian_class, english_class],
                class_weights=[russian_class_weight, english_class_weight])
            master.normalize_model(pwt, nwt)

        # Retrieve and print scores
        top_tokens_rus = master.get_score('TopTokensRus')
        top_tokens_eng = master.get_score('TopTokensEng')
        sp_phi_rus = master.get_score('SparsityPhiRus')
        sp_phi_eng = master.get_score('SparsityPhiEng')

        print('Top tokens per russian topic:')
        _print_top_tokens(top_tokens_rus, expected_values_rus_topic, tolerance)
        print('Top tokens per english topic:')
        _print_top_tokens(top_tokens_eng, expected_values_eng_topic, tolerance)

        print('\nSparsity Phi: russian {0:.3f}, english {1:.3f}'.format(
            sp_phi_rus.value, sp_phi_eng.value))
        assert abs(expected_sparsity_values['russian'] - sp_phi_rus.value) < tolerance
        assert abs(expected_sparsity_values['english'] - sp_phi_eng.value) < tolerance
    finally:
        shutil.rmtree(batches_folder)