Example #1
0
def test_func():
    """Fit a 10-topic model on the UCI 'kos' collection with the low-level
    process/regularize/normalize loop, asserting perplexity and Phi/Theta
    sparsity per outer iteration against reference values, then print the
    top tokens per topic.

    NOTE(review): modernized from Python 2 (`xrange`, print statements,
    tuple-unpacking lambda) to Python 3 for consistency with the other
    examples in this file; runtime behavior is unchanged.
    """
    # Set some constants
    data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    dictionary_name = 'dictionary'
    pwt = 'pwt'  # normalized Phi matrix name
    nwt = 'nwt'  # n_wt counters matrix name
    rwt = 'rwt'  # regularization increments matrix name
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    # Regularizer coefficients: negative tau sparsifies Phi/Theta; the large
    # positive tau decorrelates topics in Phi.
    smsp_phi_tau = -0.2
    smsp_theta_tau = -0.1
    decor_phi_tau = 1000000

    num_topics = 10
    num_inner_iterations = 10
    num_outer_iterations = 8

    # Reference score values per outer iteration plus comparison tolerances.
    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6703.161,
        1: 2426.277,
        2: 2276.476,
        3: 1814.072,
        4: 1742.911,
        5: 1637.142,
        6: 1612.946,
        7: 1581.725
    }
    sparsity_tol = 0.001
    expected_phi_sparsity_value_on_iteration = {
        0: 0.059,
        1: 0.120,
        2: 0.212,
        3: 0.306,
        4: 0.380,
        5: 0.438,
        6: 0.483,
        7: 0.516
    }
    expected_theta_sparsity_value_on_iteration = {
        0: 0.009,
        1: 0.036,
        2: 0.146,
        3: 0.239,
        4: 0.278,
        5: 0.301,
        6: 0.315,
        7: 0.319
    }

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk.
        # NOTE(review): this example uses the constant name
        # CollectionParserConfig_Format_BagOfWordsUci while other examples in
        # this file use ..._CollectionFormat_...; left as-is — confirm against
        # the installed BigARTM version.
        lib.ArtmParseCollection({
            'format':
            constants.CollectionParserConfig_Format_BagOfWordsUci,
            'docword_file_path':
            os.path.join(data_path, docword),
            'vocab_file_path':
            os.path.join(data_path, vocab),
            'target_folder':
            batches_folder
        })

        # Create master component and scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'SparsityPhi': messages.SparsityPhiScoreConfig()
        }
        master = mc.MasterComponent(lib, scores=scores)

        master.create_score('SparsityTheta',
                            messages.SparsityThetaScoreConfig())
        master.create_score('TopTokens', messages.TopTokensScoreConfig())

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(
                                     data_path, vocab))

        # Configure basic regularizers; the actual tau values are passed
        # explicitly to process_batches/regularize_model below.
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)
        master.create_regularizer(name='DecorrelatorPhi',
                                  config=messages.DecorrelatorPhiConfig(),
                                  tau=0.0)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(pwt=pwt,
                                   nwt=nwt,
                                   num_inner_iterations=num_inner_iterations,
                                   batches_folder=batches_folder,
                                   regularizer_name=['SmoothSparseTheta'],
                                   regularizer_tau=[smsp_theta_tau])
            master.regularize_model(pwt, nwt, rwt,
                                    ['SmoothSparsePhi', 'DecorrelatorPhi'],
                                    [smsp_phi_tau, decor_phi_tau])
            master.normalize_model(pwt, nwt, rwt)

            # Retrieve scores
            perplexity_score = master.get_score('Perplexity')
            sparsity_phi_score = master.get_score('SparsityPhi')
            sparsity_theta_score = master.get_score('SparsityTheta')

            # Assert and print scores
            print_string = 'Iter#{0}'.format(iteration)
            print_string += ': Perplexity = {0:.3f}'.format(
                perplexity_score.value)
            print_string += ', Phi sparsity = {0:.3f}'.format(
                sparsity_phi_score.value)
            print_string += ', Theta sparsity = {0:.3f}'.format(
                sparsity_theta_score.value)
            print(print_string)

            assert abs(
                perplexity_score.value -
                expected_perplexity_value_on_iteration[iteration]
            ) < perplexity_tol
            assert abs(
                sparsity_phi_score.value -
                expected_phi_sparsity_value_on_iteration[iteration]
            ) < sparsity_tol
            assert abs(
                sparsity_theta_score.value -
                expected_theta_sparsity_value_on_iteration[iteration]
            ) < sparsity_tol

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')

        print('Top tokens per topic:')
        top_tokens_triplets = zip(
            top_tokens_score.topic_index,
            zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(
                top_tokens_triplets, key=lambda triplet: triplet[0]):
            print_string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                print_string += ' {0}({1:.3f})'.format(token, weight)
            print(print_string)
    finally:
        # Always clean up the temporary batches folder.
        shutil.rmtree(batches_folder)
Example #2
0
def test_func():
    """Online (merge-based) fitting of a 10-topic model on the 'kos'
    collection using two processors; checks the first perplexity value
    against a set of acceptable results and verifies top-token weights.
    """
    # Constants describing the collection and the learning setup.
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'
    nwt_hat = 'nwt_hat'
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8
    num_processors = 2

    decay_weight = 0.7
    apply_weight = 0.3

    num_batches = 2
    top_tokens_value = 0.5
    top_tokens_tol = 0.5
    # Any of these values is an acceptable first-update perplexity
    # (the exact value varies between runs).
    perplexity_first_value = {6714.673, 6710.324, 6706.906, 6710.120,
                              6710.327, 6717.755, 6717.757, 6698.847,
                              6710.120, 6714.667, 6698.852, 6706.903}

    batches_folder = tempfile.mkdtemp()
    try:
        # Low-level library handle.
        lib = artm.wrapper.LibArtm()

        # Convert the UCI bag-of-words files into BigARTM batches on disk.
        lib.ArtmParseCollection({
            'format': constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path': os.path.join(data_path, docword),
            'vocab_file_path': os.path.join(data_path, vocab),
            'target_folder': batches_folder})

        # Master component with perplexity and top-tokens scores attached.
        score_configs = {'Perplexity': messages.PerplexityScoreConfig(),
                         'TopTokens': messages.TopTokensScoreConfig()}
        master = mc.MasterComponent(lib, num_processors=num_processors,
                                    scores=score_configs)

        # Gather the collection dictionary from the generated batches.
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(data_path, vocab))

        # Random initialization of the Phi matrix.
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        # Collect the *.batch files produced by the parser.
        batches = [os.path.join(batches_folder, name)
                   for name in os.listdir(batches_folder)
                   if os.path.splitext(name)[1] == '.batch']

        # Online EM: accumulate nwt_hat and merge it into nwt after every
        # `update_every` batches (or at the end of the batch list).
        update_every = num_processors
        pending = []
        for iteration in range(num_outer_iterations):
            for index, filename in enumerate(batches):
                pending.append(filename)
                processed = index + 1
                if processed % update_every != 0 and processed != len(batches):
                    continue

                master.clear_score_cache()
                master.process_batches(pwt, nwt_hat, num_document_passes,
                                       batches=pending)
                master.merge_model({nwt: decay_weight, nwt_hat: apply_weight},
                                   nwt=nwt)
                master.normalize_model(pwt, nwt)

                # Check and report perplexity for this update.
                perplexity_score = master.get_score('Perplexity')
                if iteration == 0 and index == 0:
                    assert perplexity_score.value in perplexity_first_value
                assert len(pending) == num_batches
                report = 'Iteration = {0},'.format(iteration)
                report += 'Perplexity = {0:.3f}'.format(perplexity_score.value)
                report += ', num batches = {0}'.format(len(pending))
                print(report)
                pending = []

        # Verify and print the top tokens per topic.
        top_tokens_score = master.get_score('TopTokens')

        print('Top tokens per topic:')
        triplets = zip(top_tokens_score.topic_index,
                       zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(triplets,
                                                    key=lambda t: t[0]):
            line = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                line += ' {0}({1:.3f})'.format(token, weight)
                assert abs(weight - top_tokens_value) < top_tokens_tol
            print(line)
    finally:
        shutil.rmtree(batches_folder)
def test_func():
    """Fit a 10-topic model on a small synthetic in-memory collection and
    check per-iteration perplexity plus the weights of the top tokens."""
    # Collection / model dimensions.
    num_tokens = 60
    num_items = 100
    pwt = 'pwt'
    nwt = 'nwt'

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 10
    num_top_tokens = 4

    # Reference perplexity per outer iteration.
    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 54.616, 1: 38.472, 2: 28.655, 3: 24.362, 4: 22.355,
        5: 21.137, 6: 20.808, 7: 20.791, 8: 20.746, 9: 20.581,
    }

    top_tokens_tol = 0.05
    expected_top_tokens_weight = 0.1

    dictionary_name = 'dictionary'
    batches_folder = tempfile.mkdtemp()
    try:
        # Build one synthetic batch: tokens 0..39 act as topic-specific
        # (assigned when token_id % 10 == item_id % 10) and tokens 40..59
        # act as background with weight (item_id + token_id) % 5 + 1.
        batch = messages.Batch()
        batch.id = str(uuid.uuid4())
        for token_id in range(num_tokens):
            batch.token.append('token_{0}'.format(token_id))

        for item_id in range(num_items):
            item = batch.item.add()
            item.id = item_id
            for token_id in range(num_tokens):
                item.token_id.append(token_id)
                if token_id >= 40:
                    weight = (item_id + token_id) % 5 + 1
                elif token_id % 10 == item_id % 10:
                    weight = num_topics
                else:
                    weight = 0
                item.token_weight.append(weight)

        # Low-level library handle.
        lib = artm.wrapper.LibArtm()

        # Persist the batch so it can be processed from disk.
        lib.ArtmSaveBatch(batches_folder, batch)

        # Master component with perplexity and top-tokens scores.
        score_configs = {
            'PerplexityScore':
            messages.PerplexityScoreConfig(),
            'TopTokensScore':
            messages.TopTokensScoreConfig(num_tokens=num_top_tokens),
        }
        master = mc.MasterComponent(lib, scores=score_configs)

        # Gather the collection dictionary from the saved batch.
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder)

        # Random initialization of the Phi matrix.
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):
            # One full pass over the collection, then normalize Phi.
            master.clear_score_cache()
            master.process_batches(pwt, nwt, num_document_passes,
                                   batches_folder)
            master.normalize_model(pwt, nwt)

            # Verify and report perplexity for this iteration.
            perplexity_score = master.get_score('PerplexityScore')
            expected = expected_perplexity_value_on_iteration[iteration]
            assert abs(perplexity_score.value - expected) < perplexity_tol
            print('Iteration#{0} : Perplexity = {1:.3f}'.format(
                iteration, perplexity_score.value))

        # Verify and print the top tokens per topic.
        top_tokens_score = master.get_score('TopTokensScore')

        print('Top tokens per topic:')
        triplets = zip(top_tokens_score.topic_index,
                       zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(triplets,
                                                    key=lambda t: t[0]):
            line = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                assert abs(weight - expected_top_tokens_weight) < top_tokens_tol
                line += ' {0}({1:.3f})'.format(token, weight)
            print(line)
    finally:
        shutil.rmtree(batches_folder)
Example #4
0
def test_func():
    """Fit a 10-topic model on the 'kos' collection via the high-level
    fit_offline/fit_online master API with smooth/sparse and decorrelator
    regularizers.

    Asserts perplexity and Phi/Theta sparsity per offline iteration and
    after the online pass, and checks that the score-array cache grows by
    one entry per fit_offline call and holds one entry per online update.
    """
    # Set some constants
    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    dictionary_name = 'dictionary'
    pwt = 'pwt'  # normalized Phi matrix name
    nwt = 'nwt'  # n_wt counters matrix name
    docword = 'docword.kos.txt'
    vocab = 'vocab.kos.txt'

    # Regularizer coefficients: negative tau sparsifies Phi/Theta; the large
    # positive tau decorrelates topics in Phi.
    smsp_phi_tau = -0.2
    smsp_theta_tau = -0.1
    decor_phi_tau = 1000000

    num_topics = 10
    num_document_passes = 10
    num_outer_iterations = 8

    # Reference score values per offline iteration plus tolerances.
    perplexity_tol = 0.001
    expected_perplexity_value_on_iteration = {
        0: 6703.161,
        1: 2426.277,
        2: 2276.476,
        3: 1814.072,
        4: 1742.911,
        5: 1637.142,
        6: 1612.946,
        7: 1581.725
    }
    sparsity_tol = 0.001
    expected_phi_sparsity_value_on_iteration = {
        0: 0.059,
        1: 0.120,
        2: 0.212,
        3: 0.306,
        4: 0.380,
        5: 0.438,
        6: 0.483,
        7: 0.516
    }
    expected_theta_sparsity_value_on_iteration = {
        0: 0.009,
        1: 0.036,
        2: 0.146,
        3: 0.239,
        4: 0.278,
        5: 0.301,
        6: 0.315,
        7: 0.319
    }

    # Reference scores after the single online pass below.
    expected_perplexity_value_online = 1572.268
    expected_phi_sparsity_value_online = 0.528
    expected_theta_sparsity_value_online = 0.320

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Parse collection from disk
        lib.ArtmParseCollection({
            'format':
            constants.CollectionParserConfig_CollectionFormat_BagOfWordsUci,
            'docword_file_path':
            os.path.join(data_path, docword),
            'vocab_file_path':
            os.path.join(data_path, vocab),
            'target_folder':
            batches_folder
        })

        # Create master component and scores
        scores = {
            'Perplexity': messages.PerplexityScoreConfig(),
            'SparsityPhi': messages.SparsityPhiScoreConfig()
        }
        master = mc.MasterComponent(lib,
                                    scores=scores,
                                    num_document_passes=num_document_passes)

        master.create_score('SparsityTheta',
                            messages.SparsityThetaScoreConfig())
        master.create_score('TopTokens', messages.TopTokensScoreConfig())

        # Create collection dictionary and import it
        master.gather_dictionary(dictionary_target_name=dictionary_name,
                                 data_path=batches_folder,
                                 vocab_file_path=os.path.join(
                                     data_path, vocab))

        # Configure basic regularizers.  The smooth/sparse regularizers are
        # created with tau=0.0 and then reconfigured below — presumably to
        # exercise the reconfigure_regularizer API; the decorrelator gets its
        # final tau immediately.
        master.create_regularizer(name='SmoothSparsePhi',
                                  config=messages.SmoothSparsePhiConfig(),
                                  tau=0.0)
        master.create_regularizer(name='SmoothSparseTheta',
                                  config=messages.SmoothSparseThetaConfig(),
                                  tau=0.0)
        master.create_regularizer(name='DecorrelatorPhi',
                                  config=messages.DecorrelatorPhiConfig(),
                                  tau=decor_phi_tau)

        master.reconfigure_regularizer(name='SmoothSparsePhi',
                                       tau=smsp_phi_tau)
        master.reconfigure_regularizer(name='SmoothSparseTheta',
                                       tau=smsp_theta_tau)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        # NOTE(review): local `iter` shadows the builtin; harmless here since
        # the builtin is not used in this function.
        for iter in range(num_outer_iterations):
            # One full offline pass over the collection per outer iteration.
            master.fit_offline(batches_folder=batches_folder,
                               num_collection_passes=1)

            # Retrieve scores
            perplexity_score = master.get_score('Perplexity')
            sparsity_phi_score = master.get_score('SparsityPhi')
            sparsity_theta_score = master.get_score('SparsityTheta')

            # Assert and print scores
            print_string = 'Iter#{0}'.format(iter)
            print_string += ': Perplexity = {0:.3f}'.format(
                perplexity_score.value)
            print_string += ', Phi sparsity = {0:.3f}'.format(
                sparsity_phi_score.value)
            print_string += ', Theta sparsity = {0:.3f}'.format(
                sparsity_theta_score.value)
            print(print_string)

            assert abs(
                perplexity_score.value -
                expected_perplexity_value_on_iteration[iter]) < perplexity_tol
            assert abs(
                sparsity_phi_score.value -
                expected_phi_sparsity_value_on_iteration[iter]) < sparsity_tol
            assert abs(sparsity_theta_score.value -
                       expected_theta_sparsity_value_on_iteration[iter]
                       ) < sparsity_tol

            # The score array accumulates one entry per fit_offline call.
            perplexity_scores = master.get_score_array('Perplexity')
            assert len(perplexity_scores) == (iter + 1)

        # proceed one online iteration: a single model update after the
        # first 4 batches, with equal decay/apply weights.
        batch_filenames = glob.glob(os.path.join(batches_folder, '*.batch'))
        master.fit_online(batch_filenames=batch_filenames,
                          update_after=[4],
                          apply_weight=[0.5],
                          decay_weight=[0.5])

        # Retrieve scores
        perplexity_score = master.get_score('Perplexity')
        sparsity_phi_score = master.get_score('SparsityPhi')
        sparsity_theta_score = master.get_score('SparsityTheta')

        # Assert and print scores
        print_string = 'Iter Online'
        print_string += ': Perplexity = {0:.3f}'.format(perplexity_score.value)
        print_string += ', Phi sparsity = {0:.3f}'.format(
            sparsity_phi_score.value)
        print_string += ', Theta sparsity = {0:.3f}'.format(
            sparsity_theta_score.value)
        print(print_string)

        assert abs(perplexity_score.value -
                   expected_perplexity_value_online) < perplexity_tol
        assert abs(sparsity_phi_score.value -
                   expected_phi_sparsity_value_online) < sparsity_tol
        assert abs(sparsity_theta_score.value -
                   expected_theta_sparsity_value_online) < sparsity_tol

        # Retrieve and print top tokens score
        top_tokens_score = master.get_score('TopTokens')

        print('Top tokens per topic:')
        top_tokens_triplets = zip(
            top_tokens_score.topic_index,
            zip(top_tokens_score.token, top_tokens_score.weight))
        for topic_index, group in itertools.groupby(
                top_tokens_triplets, key=lambda triplet: triplet[0]):
            print_string = 'Topic#{0} : '.format(topic_index)
            for _, (token, weight) in group:
                print_string += ' {0}({1:.3f})'.format(token, weight)
            print(print_string)

        # After clearing the array cache, an online pass with 4 updates
        # yields exactly 4 cached perplexity entries.
        master.clear_score_array_cache()
        master.fit_online(batch_filenames=batch_filenames,
                          update_after=[1, 2, 3, 4],
                          apply_weight=[0.5, 0.5, 0.5, 0.5],
                          decay_weight=[0.5, 0.5, 0.5, 0.5])
        perplexity_scores = master.get_score_array('Perplexity')
        assert len(perplexity_scores) == 4

    finally:
        # Always clean up the temporary batches folder.
        shutil.rmtree(batches_folder)
Example #5
0
def test_func():
    """Build a small bilingual (Russian/English) collection in memory, fit a
    two-topic multimodal model, and check top-token distributions and Phi
    sparsity per modality against expected values.

    NOTE(review): modernized from Python 2 (`xrange`, `dict.has_key`,
    `string.split`, print statements) to Python 3 for consistency with the
    other examples in this file; runtime behavior is unchanged.
    """
    # Set some constants
    dictionary_name = 'dictionary'
    pwt = 'pwt'
    nwt = 'nwt'

    num_topics = 2
    num_inner_iterations = 10
    num_outer_iterations = 10

    # Modality (class) names and their weights in the model.
    russian_class_weight = 1.0
    english_class_weight = 1.0
    russian_class = '@russian'
    english_class = '@english'

    # Expected top-token weights per topic for each modality.
    tolerance = 0.001
    expected_values_rus_topic = {
        0: {
            u'документ': 0.125,
            u'текст': 0.125,
            u'анализ': 0.125,
            u'статистический': 0.125,
            u'модель': 0.125,
            u'коллекция': 0.083,
            u'тематическая': 0.083,
            'model': 0.042,
            'topic': 0.042,
            'artm': 0.042
        },
        1: {
            u'ногие': 0.115,
            u'отряд': 0.115,
            u'млекопитающие': 0.115,
            u'семейство': 0.115,
            u'хищный': 0.077,
            u'ласто': 0.077,
            u'моржовых': 0.077,
            u'тюлень': 0.077,
            u'ушастый': 0.077,
            u'коротко': 0.038
        }
    }
    expected_values_eng_topic = {
        0: {
            'model': 0.167,
            'text': 0.125,
            'analysis': 0.125,
            'statistical': 0.125,
            'topic': 0.125,
            'artm': 0.083,
            'plsa': 0.083,
            'lda': 0.083,
            'collection': 0.083,
            'not': 0.000
        },
        1: {
            'mammal': 0.188,
            'predatory': 0.125,
            'eared': 0.125,
            'marine': 0.125,
            'seal': 0.125,
            'not': 0.062,
            'reptile': 0.062,
            'crocodilia': 0.062,
            'order': 0.062,
            'pinnipeds': 0.062
        }
    }
    expected_sparsity_values = {'russian': 0.5, 'english': 0.5}

    # Prepare multimodal data: parallel English/Russian documents.
    ens = []
    rus = []

    ens.append(
        u'Topic model statistical analysis text collection LDA PLSA ARTM')
    rus.append(u'Тематическая модель статистический анализ текст коллекция')

    ens.append(u'LDA statistical topic model text collection')
    rus.append(
        u'LDA статистический тематическая модель текст документ коллекция')

    ens.append(u'PLSA statistical analysis text model')
    rus.append(u'PLSA статистический анализ документ текст модель')

    ens.append(u'ARTM analysis topic model')
    rus.append(u'ARTM анализ документ topic model')

    ens.append(u'Pinnipeds seal marine mammal order')
    rus.append(u'Тюлень семейство млекопитающие моржовых отряд ласто ногие')

    ens.append(u'Eared seal marine predatory mammal')
    rus.append(
        u'Ушастый тюлень семейство млекопитающие отряд хищный семейство моржовых ласто ногие'
    )

    ens.append(u'Eared Crocodilia predatory reptile not mammal')
    rus.append(
        u'Ушастый крокодил гена отряд хищный не млекопитающие коротко ногие')

    ru_dic = {}  # mapping from russian token to its index in batch.token list
    en_dic = {}  # mapping from english token to its index in batch.token list
    batch = messages.Batch()  # batch representing the entire collection
    batch.id = str(uuid.uuid1())
    # BigARTM dictionary used to initialize the model
    dict_data = messages.DictionaryData()
    dict_data.name = dictionary_name

    def append(tokens, dic, item, class_id):
        """Add `tokens` of modality `class_id` to `item`, registering any
        new token in `dic`, the batch vocabulary and the dictionary."""
        for token in tokens:
            if token not in dic:  # New token discovered:
                dic[token] = len(batch.token)  # 1. update ru_dic or en_dic
                # 2. update batch.token and batch.class_id
                batch.token.append(token)
                batch.class_id.append(class_id)
                dict_data.token.append(token)
                dict_data.class_id.append(class_id)

            # Add token to the item.
            item.token_id.append(dic[token])
            # replace '1' with the actual number of token occupancies
            # in the item
            item.token_weight.append(1)

    # Iterate through all items and populate the batch
    for (en, ru) in zip(ens, rus):
        next_item = batch.item.add()
        next_item.id = len(batch.item) - 1
        append(ru.lower().split(), ru_dic, next_item, russian_class)
        append(en.lower().split(), en_dic, next_item, english_class)

    batches_folder = tempfile.mkdtemp()
    try:
        # Create the instance of low-level API and master object
        lib = artm.wrapper.LibArtm()

        # Save batch and dictionary on the disk
        lib.ArtmSaveBatch(batches_folder, batch)

        # Create master component with per-modality sparsity and
        # top-tokens scores.
        scores = {
            'SparsityPhiRus':
            messages.SparsityPhiScoreConfig(class_id=russian_class),
            'SparsityPhiEng':
            messages.SparsityPhiScoreConfig(class_id=english_class),
            'TopTokensRus':
            messages.TopTokensScoreConfig(class_id=russian_class),
            'TopTokensEng':
            messages.TopTokensScoreConfig(class_id=english_class)
        }
        master = mc.MasterComponent(lib, scores=scores)

        # Create the collection dictionary
        lib.ArtmCreateDictionary(master.master_id, dict_data)

        # Initialize model
        master.initialize_model(
            model_name=pwt,
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary_name=dictionary_name)

        for iteration in range(num_outer_iterations):
            # Invoke one scan of the collection, regularize and normalize Phi
            master.clear_score_cache()
            master.process_batches(
                pwt,
                nwt,
                num_inner_iterations,
                batches_folder,
                class_ids=[russian_class, english_class],
                class_weights=[russian_class_weight, english_class_weight])
            master.normalize_model(pwt, nwt)

        # Retrieve and print scores
        top_tokens_rus = master.get_score('TopTokensRus')
        top_tokens_eng = master.get_score('TopTokensEng')
        sp_phi_rus = master.get_score('SparsityPhiRus')
        sp_phi_eng = master.get_score('SparsityPhiEng')

        print('Top tokens per russian topic:')
        _print_top_tokens(top_tokens_rus, expected_values_rus_topic, tolerance)
        print('Top tokens per english topic:')
        _print_top_tokens(top_tokens_eng, expected_values_eng_topic, tolerance)

        print('\nSparsity Phi: russian {0:.3f}, english {1:.3f}'.format(
            sp_phi_rus.value, sp_phi_eng.value))
        assert abs(expected_sparsity_values['russian'] -
                   sp_phi_rus.value) < tolerance
        assert abs(expected_sparsity_values['english'] -
                   sp_phi_eng.value) < tolerance
    finally:
        # Always clean up the temporary batches folder.
        shutil.rmtree(batches_folder)