def test_func():
    """Offline fitting of a regularized topic model on the UCI 'kos' collection.

    Runs several outer iterations of process/regularize/normalize via the
    low-level wrapper API, checks perplexity and Phi/Theta sparsity against
    reference values on every iteration, then prints top tokens per topic.

    Fix: this test still used Python-2-only syntax (``xrange``, ``print``
    statements, tuple-parameter lambda) although the sibling tests in this
    file are Python 3; ported to Python 3 with behavior unchanged.
    """
    # Set some constants
    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    rwt = 'rwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    smsp_phi_tau = -0.2
    smsp_theta_tau = -0.1
    decor_phi_tau = 1000000

    num_topics = 10
    num_inner_iterations = 10
    num_outer_iterations = 8

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6703.161, 1: 2426.277, 2: 2276.476, 3: 1814.072,
        4: 1742.911, 5: 1637.142, 6: 1612.946, 7: 1581.725
    }
    sparsity_tol = 0.001
    expected_phi_sparsity_value_on_iteration = {
        0: 0.059, 1: 0.120, 2: 0.212, 3: 0.306,
        4: 0.380, 5: 0.438, 6: 0.483, 7: 0.516
    }
    expected_theta_sparsity_value_on_iteration = {
        0: 0.009, 1: 0.036, 2: 0.146, 3: 0.239,
        4: 0.278, 5: 0.301, 6: 0.315, 7: 0.319
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_Format_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'SparsityPhi': messages.SparsityPhiScoreConfig()
        }
        master = mc.MasterComponent(lib, scores=scores)
        master.create_score('SparsityTheta',
                            messages.SparsityThetaScoreConfig())
        master.create_score('TopTokens', messages.TopTokensScoreConfig())

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(
                                     data_path, vocab))

        # Configure basic regularizers (taus are supplied at process time
        # or via regularize_model below)
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)
        master.create_regularizer(name='DecorrelatorPhi',
                                  config=messages.DecorrelatorPhiConfig(),
                                  tau=0.0)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt=pwt,
                                   nwt=nwt,
                                   num_inner_iterations=num_inner_iterations,
                                   batches_folder=batches_folder,
                                   regularizer_name=['SmoothSparseTheta'],
                                   regularizer_tau=[smsp_theta_tau])
            master.regularize_model(pwt, nwt, rwt,
                                    ['SmoothSparsePhi', 'DecorrelatorPhi'],
                                    [smsp_phi_tau, decor_phi_tau])
            master.normalize_model(pwt, nwt, rwt)

            # Retrieve scores
            perplexity_score = master.get_score('Perplexity')
            sparsity_phi_score = master.get_score('SparsityPhi')
            sparsity_theta_score = master.get_score('SparsityTheta')

            # Assert and print scores
            print_string = 'Iter#{0}'.format(iteration)
            print_string += ': Perplexity = {0:.3f}'.format(
                perplexity_score.value)
            print_string += ', Phi sparsity = {0:.3f}'.format(
                sparsity_phi_score.value)
            print_string += ', Theta sparsity = {0:.3f}'.format(
                sparsity_theta_score.value)
            print(print_string)

            assert abs(perplexity_score.value -
                       expected_perplexity_value_on_iteration[iteration]
                       ) < perplexity_tol
            assert abs(sparsity_phi_score.value -
                       expected_phi_sparsity_value_on_iteration[iteration]
                       ) < sparsity_tol
            assert abs(sparsity_theta_score.value -
                       expected_theta_sparsity_value_on_iteration[iteration]
                       ) < sparsity_tol

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')
        print('Top tokens per topic:')
        top_tokens_triplets = zip(
            top_tokens_score.topic_index,
            zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(
                top_tokens_triplets, key=lambda triplet: triplet[0]):
            print_string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                print_string += ' {0}({1:.3f})'.format(token, weight)
            print(print_string)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Check collection- and document-level perplexity on the UCI 'kos' collection.

    Fits a smoothed/sparsed topic model with the low-level wrapper API and,
    on every outer iteration, compares the two perplexity variants (unigram
    collection model vs. default unigram document model) and the zero-words
    count against reference values.

    Fix: removed a leftover debug ``print`` that duplicated the asserted
    collection-perplexity values; renamed the loop variable so it no longer
    shadows the ``iter`` builtin; compare zero-words with ``==`` instead of
    ``a - b == 0``.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    rwt = 'rwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8

    smsp_phi_tau = -20.0
    smsp_theta_tau = -3.0

    perplexity_tol = 1.0
    expected_perp_col_value_on_iteration = {
        0: 6650.1, 1: 2300.2, 2: 1996.8, 3: 1786.1,
        4: 1692.7, 5: 1644.4, 6: 1612.3, 7: 1589.5
    }
    expected_perp_doc_value_on_iteration = {
        0: 6614.6, 1: 2295.0, 2: 1996.4, 3: 1786.1,
        4: 1692.7, 5: 1644.2, 6: 1611.7, 7: 1588.6
    }
    expected_perp_zero_words_on_iteration = {
        0: 494, 1: 210, 2: 24, 3: 0, 4: 2, 5: 10, 6: 28, 7: 47
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores; the collection-model perplexity
        # variant needs the dictionary name to obtain token frequencies.
        perplexity_config = messages.PerplexityScoreConfig()
        perplexity_config.model_type = constants.PerplexityScoreConfig_Type_UnigramCollectionModel
        perplexity_config.dictionary_name = dictionary_name
        scores = {
            'PerplexityDoc': messages.PerplexityScoreConfig(),
            'PerplexityCol': perplexity_config
        }
        master = mc.MasterComponent(lib, scores=scores)

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(
                                     data_path, vocab))

        # Configure basic regularizers
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(
                                      dictionary_name=dictionary_name),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt=pwt,
                                   nwt=nwt,
                                   num_document_passes=num_document_passes,
                                   batches_folder=batches_folder,
                                   regularizer_name=['SmoothSparseTheta'],
                                   regularizer_tau=[smsp_theta_tau])
            master.regularize_model(pwt, nwt, rwt,
                                    ['SmoothSparsePhi'], [smsp_phi_tau])
            master.normalize_model(pwt, nwt, rwt)

            # Retrieve perplexity scores
            perplexity_doc_score = master.get_score('PerplexityDoc')
            perplexity_col_score = master.get_score('PerplexityCol')

            # Assert and print scores
            string = 'Iter#{0}'.format(iteration)
            string += ': Collection perp. = {0:.1f}'.format(
                perplexity_col_score.value)
            string += ', Document perp. = {0:.1f}'.format(
                perplexity_doc_score.value)
            string += ', Zero words = {0}'.format(
                perplexity_doc_score.zero_words)
            print(string)

            assert abs(perplexity_col_score.value -
                       expected_perp_col_value_on_iteration[iteration]
                       ) < perplexity_tol
            assert abs(perplexity_doc_score.value -
                       expected_perp_doc_value_on_iteration[iteration]
                       ) < perplexity_tol
            assert perplexity_doc_score.zero_words == \
                expected_perp_zero_words_on_iteration[iteration]
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Online topic-model fitting on 'kos' using merge_model mini-batch updates.

    Batches are accumulated into groups of ``update_every``; each group is
    processed into an increment matrix (nwt_hat) which is blended into the
    running n_wt with decay/apply weights before re-normalizing Phi.  The
    very first perplexity value is checked against a set of acceptable
    outcomes, and top-token weights are checked at the end.
    """
    # Constants of the experiment
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    nwt_hat = 'nwt_hat'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8
    num_processors = 2

    decay_weight = 0.7
    apply_weight = 0.3
    num_batches = 2

    top_tokens_value = 0.5
    top_tokens_tol = 0.5
    # Any member of this set is an acceptable first-update perplexity.
    perplexity_first_value = set([6714.673, 6710.324, 6706.906, 6710.120,
                                  6710.327, 6717.755, 6717.757, 6698.847,
                                  6710.120, 6714.667, 6698.852, 6706.903])

    batches_folder = tempfile.mkdtemp()
    try:
        # Low-level API handle
        lib = artm.wrapper.LibArtm()

        # Turn the UCI bag-of-words files into binary batches on disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Master component with perplexity and top-tokens scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'TopTokens': messages.TopTokensScoreConfig()
        }
        master = mc.MasterComponent(lib,
                                    num_processors=num_processors,
                                    scores=scores)

        # Build and import the collection dictionary
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path,
                                                              vocab))

        # Random initialization of the Phi matrix
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        # Collect the file names of the batches to process
        batches = [os.path.join(batches_folder, name)
                   for name in os.listdir(batches_folder)
                   if os.path.splitext(name)[1] == '.batch']

        # Perform the online iterations
        update_every = num_processors
        pending_batches = []
        for epoch in range(num_outer_iterations):
            for position, batch_filename in enumerate(batches, start=1):
                pending_batches.append(batch_filename)
                # Flush a group at every update point (or at the tail)
                if position % update_every != 0 and position != len(batches):
                    continue

                # Process the accumulated group into nwt_hat, merge it into
                # nwt with decay/apply weights and re-normalize Phi
                master.clear_score_cache()
                master.process_batches(pwt, nwt_hat, num_document_passes,
                                       batches=pending_batches)
                master.merge_model({nwt: decay_weight, nwt_hat: apply_weight},
                                   nwt=nwt)
                master.normalize_model(pwt, nwt)

                # Retrieve and print perplexity score
                perplexity_score = master.get_score('Perplexity')
                if epoch == 0 and position == 1:
                    assert perplexity_score.value in perplexity_first_value
                assert len(pending_batches) == num_batches

                print('Iteration = {0},Perplexity = {1:.3f}, num batches = {2}'.format(
                    epoch, perplexity_score.value, len(pending_batches)))
                pending_batches = []

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')
        print('Top tokens per topic:')
        triplets = zip(top_tokens_score.topic_index,
                       zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(triplets,
                                                    key=lambda t: t[0]):
            line = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                line += ' {0}({1:.3f})'.format(token, weight)
                assert abs(weight - top_tokens_value) < top_tokens_tol
            print(line)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Repeat the same offline fit for several processor counts and time it.

    For every entry of ``num_processors_list`` a fresh master component is
    created and trained from scratch; each iteration's perplexity must match
    the (deterministic) reference value and the wall time is reported.
    """
    # Experiment constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    num_processors_list = [4, 2, 1]
    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 5

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6710.208, 1: 2434.135, 2: 2202.418, 3: 1936.493, 4: 1774.600
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Low-level API handle
        lib = artm.wrapper.LibArtm()

        # Convert the UCI files into binary batches once, reused by all runs
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        for num_processors in num_processors_list:
            # Fresh master component and perplexity score for this run.
            # NOTE(review): num_processors is never passed to MasterComponent
            # here (unlike the online test above), so every run uses the same
            # default configuration — confirm whether this is intentional.
            scores = {'PerplexityScore': messages.PerplexityScoreConfig()}
            master = mc.MasterComponent(lib, scores=scores)

            # Build and import the collection dictionary
            master.gather_dictionary(dictionary_target_name=dictionary_name,
                                     data_path=batches_folder,
                                     vocab_file_path=os.path.join(data_path,
                                                                  vocab))

            # Random initialization of Phi
            master.initialize_model(
                model_name=pwt,
                topic_names=['topic_{}'.format(i) for i in range(num_topics)],
                dictionary_name=dictionary_name)

            times = []
            for iteration in range(num_outer_iterations):
                start = time.time()

                # One full pass over the collection, then normalize Phi
                master.clear_score_cache()
                master.process_batches(pwt, nwt, num_document_passes,
                                       batches_folder)
                master.normalize_model(pwt, nwt)

                # Retrieve and check the perplexity score
                perplexity_score = master.get_score('PerplexityScore')
                end = time.time()

                assert abs(expected_perplexity_value_on_iteration[iteration] -
                           perplexity_score.value) < perplexity_tol

                times.append(end - start)
                print('Iter#{0}: Perplexity = {1:.3f}, Time = {2:.3f}'.format(
                    iteration, perplexity_score.value, end - start))

            print('Average time per iteration = {0:.3f}'.format(
                float(sum(times)) / len(times)))
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Offline + online fitting via the high-level fit_offline/fit_online API.

    First performs several fit_offline passes, checking perplexity and
    Phi/Theta sparsity per iteration, then one fit_online pass with a single
    update point and one with four update points, checking the cached score
    arrays along the way.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    # Regularizer coefficients (negative taus sparse Phi/Theta)
    smsp_phi_tau = -0.2
    smsp_theta_tau = -0.1
    decor_phi_tau = 1000000

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8

    # Reference values for the offline passes
    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6703.161,
        1: 2426.277,
        2: 2276.476,
        3: 1814.072,
        4: 1742.911,
        5: 1637.142,
        6: 1612.946,
        7: 1581.725
    }
    sparsity_tol = 0.001
    expected_phi_sparsity_value_on_iteration = {
        0: 0.059,
        1: 0.120,
        2: 0.212,
        3: 0.306,
        4: 0.380,
        5: 0.438,
        6: 0.483,
        7: 0.516
    }
    expected_theta_sparsity_value_on_iteration = {
        0: 0.009,
        1: 0.036,
        2: 0.146,
        3: 0.239,
        4: 0.278,
        5: 0.301,
        6: 0.315,
        7: 0.319
    }
    # Reference values after the single online pass
    expected_perplexity_value_online = 1572.268
    expected_phi_sparsity_value_online = 0.528
    expected_theta_sparsity_value_online = 0.320

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder
        })

        # Create master component and scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'SparsityPhi': messages.SparsityPhiScoreConfig()
        }
        master = mc.MasterComponent(lib,
                                    scores=scores,
                                    num_document_passes=num_document_passes)
        master.create_score('SparsityTheta',
                            messages.SparsityThetaScoreConfig())
        master.create_score('TopTokens', messages.TopTokensScoreConfig())

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(
                                     data_path, vocab))

        # Configure basic regularizers (created with tau=0.0, then the taus
        # are set through reconfigure_regularizer below)
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)
        master.create_regularizer(name='DecorrelatorPhi',
                                  config=messages.DecorrelatorPhiConfig(),
                                  tau=decor_phi_tau)
        master.reconfigure_regularizer(name='SmoothSparsePhi',
                                       tau=smsp_phi_tau)
        master.reconfigure_regularizer(name='SmoothSparseTheta',
                                       tau=smsp_theta_tau)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iter in range(num_outer_iterations):
            # One full offline pass over the collection
            master.fit_offline(batches_folder=batches_folder,
                               num_collection_passes=1)

            # Retrieve scores
            perplexity_score = master.get_score('Perplexity')
            sparsity_phi_score = master.get_score('SparsityPhi')
            sparsity_theta_score = master.get_score('SparsityTheta')

            # Assert and print scores
            print_string = 'Iter#{0}'.format(iter)
            print_string += ': Perplexity = {0:.3f}'.format(
                perplexity_score.value)
            print_string += ', Phi sparsity = {0:.3f}'.format(
                sparsity_phi_score.value)
            print_string += ', Theta sparsity = {0:.3f}'.format(
                sparsity_theta_score.value)
            print(print_string)

            assert abs(
                perplexity_score.value -
                expected_perplexity_value_on_iteration[iter]) < perplexity_tol
            assert abs(
                sparsity_phi_score.value -
                expected_phi_sparsity_value_on_iteration[iter]) < sparsity_tol
            assert abs(sparsity_theta_score.value -
                       expected_theta_sparsity_value_on_iteration[iter]
                       ) < sparsity_tol

            # The score array accumulates one entry per offline pass
            perplexity_scores = master.get_score_array('Perplexity')
            assert len(perplexity_scores) == (iter + 1)

        # proceed one online iteration
        batch_filenames = glob.glob(os.path.join(batches_folder, '*.batch'))
        master.fit_online(batch_filenames=batch_filenames,
                          update_after=[4],
                          apply_weight=[0.5],
                          decay_weight=[0.5])

        # Retrieve scores
        perplexity_score = master.get_score('Perplexity')
        sparsity_phi_score = master.get_score('SparsityPhi')
        sparsity_theta_score = master.get_score('SparsityTheta')

        # Assert and print scores
        print_string = 'Iter Online'
        print_string += ': Perplexity = {0:.3f}'.format(
            perplexity_score.value)
        print_string += ', Phi sparsity = {0:.3f}'.format(
            sparsity_phi_score.value)
        print_string += ', Theta sparsity = {0:.3f}'.format(
            sparsity_theta_score.value)
        print(print_string)

        assert abs(perplexity_score.value -
                   expected_perplexity_value_online) < perplexity_tol
        assert abs(sparsity_phi_score.value -
                   expected_phi_sparsity_value_online) < sparsity_tol
        assert abs(sparsity_theta_score.value -
                   expected_theta_sparsity_value_online) < sparsity_tol

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')
        print('Top tokens per topic:')
        top_tokens_triplets = zip(
            top_tokens_score.topic_index,
            zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(
                top_tokens_triplets, key=lambda triplet: triplet[0]):
            print_string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                print_string += ' {0}({1:.3f})'.format(token, weight)
            print(print_string)

        # An online pass with four update points must cache four score
        # entries (one per update)
        master.clear_score_array_cache()
        master.fit_online(batch_filenames=batch_filenames,
                          update_after=[1, 2, 3, 4],
                          apply_weight=[0.5, 0.5, 0.5, 0.5],
                          decay_weight=[0.5, 0.5, 0.5, 0.5])
        perplexity_scores = master.get_score_array('Perplexity')
        assert len(perplexity_scores) == 4
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Fit a topic model on a synthetic collection with a planted structure.

    One batch of 100 items over 60 tokens is generated: tokens 0-39 receive
    a large weight only in items whose id matches the token modulo 10, while
    tokens 40-59 carry small varying background counts.  The fitted model's
    perplexity and top-token weights are checked against reference values.
    """
    # Experiment constants
    num_tokens = 60
    num_items = 100
    pwt = 'pwt'
    nwt = 'nwt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 10
    num_top_tokens = 4

    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 54.616, 1: 38.472, 2: 28.655, 3: 24.362, 4: 22.355,
        5: 21.137, 6: 20.808, 7: 20.791, 8: 20.746, 9: 20.581
    }
    top_tokens_tol = 0.05
    expected_top_tokens_weight = 0.1

    dictionary_name = 'dictionary'
    batches_folder = tempfile.mkdtemp()
    try:
        # Generate the small synthetic collection as a single batch
        batch = messages.Batch()
        batch.id = str(uuid.uuid4())
        for token_id in range(num_tokens):
            batch.token.append('token_{0}'.format(token_id))

        for item_id in range(num_items):
            item = batch.item.add()
            item.id = item_id
            for token_id in range(num_tokens):
                item.token_id.append(token_id)
                # Tokens 40+ get small varying background counts; tokens
                # below 40 get a large weight only when the token and item
                # ids agree modulo 10 (one planted topic per residue class).
                if token_id >= 40:
                    weight = (item_id + token_id) % 5 + 1
                elif token_id % 10 == item_id % 10:
                    weight = num_topics
                else:
                    weight = 0
                item.token_weight.append(weight)

        # Low-level API handle; persist the batch to disk
        lib = artm.wrapper.LibArtm()
        lib.ArtmSaveBatch(batches_folder, batch)

        # Master component with perplexity and top-tokens scores
        scores = {
            'PerplexityScore': messages.PerplexityScoreConfig(),
            'TopTokensScore':
                messages.TopTokensScoreConfig(num_tokens=num_top_tokens)
        }
        master = mc.MasterComponent(lib, scores=scores)

        # Build and import the collection dictionary
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder)

        # Random initialization of Phi
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):
            # One pass over the collection, then normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt, nwt, num_document_passes,
                                   batches_folder)
            master.normalize_model(pwt, nwt)

            # Retrieve and check the perplexity score
            perplexity_score = master.get_score('PerplexityScore')
            assert abs(perplexity_score.value -
                       expected_perplexity_value_on_iteration[iteration]
                       ) < perplexity_tol
            print('Iteration#{0} : Perplexity = {1:.3f}'.format(
                iteration, perplexity_score.value))

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokensScore')
        print('Top tokens per topic:')
        triplets = zip(top_tokens_score.topic_index,
                       zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(triplets,
                                                    key=lambda t: t[0]):
            line = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                assert abs(weight - expected_top_tokens_weight) < top_tokens_tol
                line += ' {0}({1:.3f})'.format(token, weight)
            print(line)
    finally:
        shutil.rmtree(batches_folder)