Example #1
import os
import shutil
import tempfile

import artm


def test_func():
    # constants
    num_tokens = 15
    parent_level_weight = 1
    num_collection_passes = 15
    num_document_passes = 10
    num_topics_level0 = 15
    num_topics_level1 = 50
    regularizer_tau = 10 ** 5
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hier = artm.hARTM(dictionary=dictionary, cache_theta=True, num_document_passes=num_document_passes)
        
        level0 = hier.add_level(num_topics=num_topics_level0)

        level0.initialize(dictionary=dictionary)
        
        level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)
        
        hier.tmp_files_path = parent_batch_folder
        level1 = hier.add_level(num_topics=num_topics_level1, parent_level_weight=parent_level_weight)
        
        level1.initialize(dictionary=dictionary)
        
        level1.regularizers.add(artm.HierarchySparsingThetaRegularizer(name="HierSp", tau=regularizer_tau))
        
        level1.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)

        phi = hier.get_level(1).get_phi()
        assert phi.shape == (vocab_size, num_topics_level1)
        # theta = hier.get_level(1).get_theta()
        # assert theta.shape == (num_topics_level1, num_docs)
        psi = hier.get_level(1).get_psi()
        support = psi.values.max(axis=1).min()

        # This test gives different results on Python 2.7 and Python 3.5; the authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35
        
        assert level1.clone() is not None
        assert hier.clone() is not None
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
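Note on the `support` check above: `get_psi()` returns a child-topic by parent-topic matrix (see the shape assertion in Example #7), so `psi.values.max(axis=1).min()` takes each child topic's strongest parent link and then the weakest of those. A standalone sketch of that reduction with toy numbers (not from the test):

import numpy as np

psi = np.array([[0.8, 0.2],
                [0.4, 0.6]])
support = psi.max(axis=1).min()  # strongest parent per child, then the weakest child
assert abs(support - 0.6) < 1e-9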
Example #2
    def init_hierarchical_model(class_ids):
        score = [artm.PerplexityScore(name='perplexity_words', class_ids=['body']),
                 artm.PerplexityScore(name='perplexity_bigrams', class_ids=['bigrams'])]

        top_tokens = [artm.TopTokensScore(name='top_words', num_tokens=15, class_id='body'),
                      artm.TopTokensScore(name='top_bigrams', num_tokens=10, class_id='bigrams')]

        sparsity = [artm.SparsityThetaScore(name='sparsity_theta', eps=1e-6),
                    artm.SparsityPhiScore(name='sparsity_phi_words', class_id='body', eps=1e-6),
                    artm.SparsityPhiScore(name='sparsity_phi_bigrams', class_id='bigrams', eps=1e-6)]

        regularizers = [artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['body'], name='decorr_words'),
                        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['bigrams'], name='decorr_bigrams'),
                        artm.DecorrelatorPhiRegularizer(tau=0, class_ids=['categories'], name='decorr_categories'),
                        artm.SmoothSparseThetaRegularizer(tau=0, name='sparsity_theta'),
                        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['body'], name='sparsity_words'),
                        artm.SmoothSparsePhiRegularizer(tau=0, class_ids=['bigrams'], name='sparsity_bigrams')]

        hmodel = artm.hARTM(class_ids=class_ids,
                            cache_theta=True,
                            reuse_theta=True,
                            scores=score + top_tokens + sparsity,
                            regularizers=regularizers,
                            theta_columns_naming='title')
        return hmodel
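A minimal usage sketch for this factory; the `class_ids` weights and the `dictionary` and `batch_vectorizer` objects below are assumptions, prepared elsewhere:

# dictionary and batch_vectorizer are assumed to be built beforehand
class_ids = {'body': 1.0, 'bigrams': 0.5}
hmodel = init_hierarchical_model(class_ids)

level0 = hmodel.add_level(num_topics=20)
level0.initialize(dictionary=dictionary)
level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)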
Example #3
import os
import shutil
import tempfile

import artm


def test_func():
    # constants
    num_tokens = 15
    parent_level_weight = 1
    num_collection_passes = 15
    num_document_passes = 10
    num_topics_level0 = 15
    num_topics_level1 = 50
    regularizer_tau = 10 ** 5
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hier = artm.hARTM(dictionary=dictionary, cache_theta=True, num_document_passes=num_document_passes)
        
        level0 = hier.add_level(num_topics=num_topics_level0)

        level0.initialize(dictionary=dictionary)
        
        level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)
        
        hier.tmp_files_path = parent_batch_folder
        level1 = hier.add_level(num_topics=num_topics_level1, parent_level_weight=parent_level_weight)
        
        level1.initialize(dictionary=dictionary)
        
        level1.regularizers.add(artm.HierarchySparsingThetaRegularizer(name="HierSp", tau=regularizer_tau))
        
        level1.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)

        phi = hier.get_level(1).get_phi()
        assert phi.shape == (vocab_size, num_topics_level1)
        # theta = hier.get_level(1).get_theta()
        # assert theta.shape == (num_topics_level1, num_docs)
        psi = hier.get_level(1).get_psi()
        support = psi.values.max(axis=1).min()

        # This test gives different results on Python 2.7 and Python 3.5; the authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35
        
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
Example #4
    def create_simple(self, iter_count, regularizers=None):
        self.log("Creating simple model...")
        regularizers = regularizers or {}
        layers_count = self.layers_count
        num_topics = [int(x) for x in self.topics_count.split()]

        batch_vectorizer, dictionary = self.dataset.get_batches()

        model = artm.hARTM(num_document_passes=iter_count,
                           theta_columns_naming="id")
        model.cache_theta = True
        layers = [None] * layers_count

        layers[0] = model.add_level(
            num_topics=num_topics[1],
            topic_names=["topic" + str(t) for t in range(num_topics[1])])
        layers[0].initialize(dictionary=dictionary)
        self.log("Layer 0 initialized.")

        if regularizers:
            reg_code = ""
            for name, params in regularizers.items():
                params_init = []
                for pname, value in params.items():
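                    # values are spliced into the generated code below, so cap their length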
                    if len(value) > 10:
                        raise RuntimeError(
                            "Too long value for parameter %s.%s" %
                            (name, pname))
                    params_init.append(pname + "=" + value)
                reg_code += "layers[0].regularizers.add(artm.%s(%s))\n" % (
                    name, ", ".join(params_init))
            self.log("Regularizers to be applied:<br>" +
                     reg_code.replace("\n", "<br>"))
            exec(reg_code)

        layers[0].fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=iter_count)
        self.log("Layer 0 fitted.")

        for layer_id in range(1, layers_count):
            layers[layer_id] = model.add_level(
                parent_level_weight=0.1,
                num_topics=num_topics[layer_id + 1],
                topic_names=[
                    "topic" + str(t) for t in range(num_topics[layer_id + 1])
                ])
            layers[layer_id].initialize(dictionary=dictionary)
            self.log("Layer " + str(layer_id) + " initialized.")
            layers[layer_id].fit_offline(batch_vectorizer=batch_vectorizer,
                                         num_collection_passes=iter_count)
            self.log("Layer " + str(layer_id) + " fitted.")

        self.log("Model built.")
        return model
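For reference, `regularizers` maps artm regularizer class names to parameter dicts whose values are short code strings (at most 10 characters) spliced into `reg_code`. A hypothetical call, assuming `builder` is an instance of the enclosing class:

regs = {'SmoothSparsePhiRegularizer': {'tau': '-0.5'},
        'DecorrelatorPhiRegularizer': {'tau': '1e+4'}}
model = builder.create_simple(iter_count=10, regularizers=regs)  # builder is hypothetical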
Example #5
    def __init__(self,
                 dictionary,
                 class_ids,
                 tmp_files_path='',
                 theta_columns_naming='title',
                 cache_theta=True,
                 num_levels=None,
                 level_names=None,
                 num_topics=None,
                 topic_names=None,
                 num_backgrounds=None,
                 background_names=None,
                 smooth_background_tau=None,
                 decorrelate_phi_tau=None,
                 parent_topics_proportion=None,
                 spars_psi_tau=None,
                 smooth_theta_fit=1.0,
                 num_collection_passes=1,
                 num_tokens=10):

        self.model = artm.hARTM(dictionary=dictionary,
                                class_ids=class_ids,
                                theta_columns_naming=theta_columns_naming,
                                tmp_files_path=tmp_files_path,
                                cache_theta=cache_theta)

        self.level_names = _generate_names(num_levels, level_names, 'level')

        topic_names = _generate_names_levels(len(self.level_names), num_topics, topic_names, 'topic')
        background_names = _generate_names_levels(len(self.level_names), num_backgrounds, background_names, 'background')

        for topic_names_level, background_names_level in zip(topic_names, background_names):
            topic_names_level = topic_names_level + background_names_level
            self.model.add_level(num_topics=len(topic_names_level), topic_names=topic_names_level)

        if smooth_background_tau is not None:
            for level, background_names_level in zip(self.model, background_names):
                level.regularizers.add(artm.SmoothSparsePhiRegularizer('SPhi_back',
                                                                       tau=smooth_background_tau,
                                                                       gamma=0,
                                                                       topic_names=background_names_level))

        if decorrelate_phi_tau is not None:
            for level in self.model:
                level.regularizers.add(artm.DecorrelatorPhiRegularizer('DPhi', tau=decorrelate_phi_tau, gamma=0))

        if (parent_topics_proportion is not None) and (spars_psi_tau is not None):
            for level, parent_topics_proportion_level in zip(self.model[1:], parent_topics_proportion):
                for topic_name, parent_topic_proportion in parent_topics_proportion_level.items():
                    level.regularizers.add(artm.HierarchySparsingThetaRegularizer(name=f'HSTheta_{topic_name}',
                                                                                  topic_names=topic_name,
                                                                                  tau=spars_psi_tau,
                                                                                  parent_topic_proportion=parent_topic_proportion))

        self.smooth_theta_fit = smooth_theta_fit
        self.num_collection_passes = num_collection_passes

        for level in self.model:
            for class_id, weight in class_ids.items():
                if weight > 0:
                    level.scores.add(artm.TopTokensScore(name=f'TT_{class_id}', class_id=class_id, num_tokens=num_tokens))
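A hypothetical instantiation of this wrapper; the class name `HierarchicalModel` and all argument values below are assumptions, not from the source:

# 'HierarchicalModel' is a hypothetical name for the enclosing class
hmodel = HierarchicalModel(dictionary=dictionary,
                           class_ids={'@default_class': 1.0},
                           num_levels=2,
                           num_topics=[10, 40],
                           num_backgrounds=[1, 1],
                           smooth_background_tau=0.1,
                           decorrelate_phi_tau=100.0)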
Example #6
    def _extract_hierarchical_relationship(
            self,
            bank_phi: pd.DataFrame,
            new_model_phi: pd.DataFrame,
            psi_threshold: Optional[float] = None
    ) -> Tuple[List[int], Dict[int, List[int]]]:

        if bank_phi.shape[1] == 0:
            return list(range(new_model_phi.shape[1])), dict()

        assert bank_phi.shape[0] == new_model_phi.shape[0]

        # TODO: think about bank_phi.shape[1] == 1: alright to proceed?

        _logger.debug('Creating hARTM')

        hierarchy = artm.hARTM(num_processors=1)

        _logger.debug(f'Creating first level with {bank_phi.shape[1]} topics')

        level0 = hierarchy.add_level(num_topics=bank_phi.shape[1])
        level0.initialize(dictionary=self._dictionary)

        _logger.debug(f'Copying phi for the first level.'
                      f' Phi shape: {bank_phi.shape}.'
                      f' First words: {bank_phi.index[:10]}')

        phi_ref0 = _safe_copy_phi(level0,
                                  bank_phi,
                                  self._dataset,
                                  small_num_fit_iterations=1)

        _logger.debug(
            f'Creating second level with {new_model_phi.shape[1]} topics')

        level1 = hierarchy.add_level(num_topics=new_model_phi.shape[1],
                                     parent_level_weight=1)
        level1.initialize(dictionary=self._dictionary)

        # A regularizer may help to refine the new topics a bit
        # while searching for parent-child relationships.
        # However, it won't affect the topics themselves,
        # only the ARTM hierarchy defined here.

        _logger.debug(
            'Adding HierarchySparsingThetaRegularizer to second level')

        # TODO: or smaller tau? or without regularizer at all? or change the real topics?
        level1.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name='sparse_hierarchy',
                                                   tau=1.0))

        _logger.debug(f'Copying phi for the second level.'
                      f' Phi shape: {new_model_phi.shape}.'
                      f' First words: {new_model_phi.index[:10]}')

        phi_ref1 = _safe_copy_phi(level1,
                                  new_model_phi,
                                  self._dataset,
                                  small_num_fit_iterations=3)

        psi = level1.get_psi()

        assert psi.shape[0] == new_model_phi.shape[1]
        assert psi.shape[1] == bank_phi.shape[1]

        if psi_threshold is None:
            psi_threshold = 1.0 / psi.shape[0]

        topics_for_append: List[int] = list()
        topics_for_update: Dict[int, List[int]] = defaultdict(list)

        _logger.debug('Analyzing Psi for parent-child relationship')

        for new_topic in range(level1.get_phi().shape[1]):
            psi_row = psi.iloc[new_topic, :]
            parents = np.where(psi_row > psi_threshold)[0]

            if len(parents) > 1:
                pass  # linearly dependent -> skip
            elif len(parents) == 0:
                topics_for_append.append(new_topic)
            elif len(parents) == 1:
                topics_for_update[parents[0]].append(new_topic)
            else:
                assert False

        _logger.debug('Deleting hARTM')

        hierarchy.del_level(1)
        hierarchy.del_level(0)

        del phi_ref1
        del phi_ref0
        del hierarchy

        gc.collect()

        return topics_for_append, topics_for_update
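A standalone toy illustration of the Psi analysis above, with a made-up 3-child by 2-parent Psi matrix and threshold:

import numpy as np
import pandas as pd

psi = pd.DataFrame([[0.9, 0.1],   # one parent above threshold -> update that parent
                    [0.5, 0.5],   # several parents -> treated as linearly dependent, skipped
                    [0.2, 0.3]])  # no parent above threshold -> appended as a new topic
threshold = 0.4
for child in range(psi.shape[0]):
    parents = np.where(psi.iloc[child, :] > threshold)[0]
    print(child, parents.tolist())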
Example #7
import os
import shutil
import tempfile

import artm


def test_func():

    # constants
    num_documents = 3430
    vocabulary_size = 6906
    num_document_passes = 10
    num_collection_passes = 15
    num_topics_level_0 = 15
    num_topics_level_1 = 50
    parent_level_weight = 1
    regularizer_tau = 10**5
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')

    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()
    hierarchy_model_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hierarchy = artm.hARTM(dictionary=dictionary,
                               cache_theta=True,
                               num_document_passes=num_document_passes,
                               tmp_files_path=parent_batch_folder,
                               theta_columns_naming="title")

        level_0 = hierarchy.add_level(num_topics=num_topics_level_0)
        level_0.initialize(dictionary=dictionary)
        level_0.fit_offline(batch_vectorizer=batch_vectorizer,
                            num_collection_passes=num_collection_passes)

        phi_0 = hierarchy.get_level(0).get_phi()
        assert phi_0.shape == (vocabulary_size, num_topics_level_0)

        theta_0 = hierarchy.get_level(0).get_theta()
        assert theta_0.shape == (num_topics_level_0, num_documents)

        level_1 = hierarchy.add_level(num_topics=num_topics_level_1,
                                      parent_level_weight=parent_level_weight)
        level_1.initialize(dictionary=dictionary)
        level_1.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSparsTheta",
                                                   tau=regularizer_tau))
        level_1.fit_offline(batch_vectorizer=batch_vectorizer,
                            num_collection_passes=num_collection_passes)

        phi_1 = hierarchy.get_level(1).get_phi()
        assert phi_1.shape == (vocabulary_size, num_topics_level_1)

        theta_1 = hierarchy.get_level(1).get_theta()
        assert theta_1.shape == (num_topics_level_1, num_documents)

        psi = hierarchy.get_level(1).get_psi()
        assert psi.shape == (num_topics_level_1, num_topics_level_0)

        support = psi.values.max(axis=1).min()

        # This test gives different results on Python 2.7 and Python 3.5; the authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35

        assert level_0.clone() is not None
        assert level_1.clone() is not None
        assert hierarchy.clone() is not None

        # Test save and load methods
        hierarchy.save(hierarchy_model_folder)

        hierarchy_load = artm.hARTM()
        hierarchy_load.load(hierarchy_model_folder)

        assert level_0.num_topics == hierarchy_load.get_level(0).num_topics
        assert (phi_0 -
                hierarchy_load.get_level(0).get_phi()).abs().max().max() < 1e-3

        assert level_1.num_topics == hierarchy_load.get_level(1).num_topics
        assert (phi_1 -
                hierarchy_load.get_level(1).get_phi()).abs().max().max() < 1e-3

        # Test add_level method when topic_names is specified but num_topics is not

        hierarchy_new = artm.hARTM(dictionary=dictionary,
                                   cache_theta=True,
                                   num_document_passes=num_document_passes,
                                   tmp_files_path=parent_batch_folder,
                                   theta_columns_naming="title")

        level_0_new = hierarchy_new.add_level(topic_names=level_0.topic_names)
        level_0_new.initialize(dictionary=dictionary)
        level_0_new.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=num_collection_passes)

        phi_0_new = hierarchy_new.get_level(0).get_phi()
        assert (phi_0 - phi_0_new).abs().max().max() < 1e-3

        theta_0_new = hierarchy_new.get_level(0).get_theta()
        assert (theta_0 - theta_0_new).abs().max().max() < 1e-3

        level_1_new = hierarchy_new.add_level(topic_names=level_1.topic_names)
        level_1_new.initialize(dictionary=dictionary)
        level_1_new.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSparsTheta",
                                                   tau=regularizer_tau))
        level_1_new.fit_offline(batch_vectorizer=batch_vectorizer,
                                num_collection_passes=num_collection_passes)

        phi_1_new = hierarchy_new.get_level(1).get_phi()
        assert (phi_1 - phi_1_new).abs().max().max() < 1e-3

        theta_1_new = hierarchy_new.get_level(1).get_theta()
        assert (theta_1 - theta_1_new).abs().max().max() < 1e-3

        psi_new = hierarchy_new.get_level(1).get_psi()
        assert (psi - psi_new).abs().max().max() < 1e-3

        # Replicate the hierarchy with plain ARTM models and validate that the resulting psi matrix matches
        level_0_plain = artm.ARTM(num_topics=num_topics_level_0,
                                  num_document_passes=num_document_passes,
                                  cache_theta=True,
                                  seed=level_0.seed,
                                  theta_columns_naming="title")
        level_0_plain.initialize(dictionary=dictionary)
        level_0_plain.fit_offline(num_collection_passes=num_collection_passes,
                                  batch_vectorizer=batch_vectorizer)

        phi_0_plain = level_0_plain.get_phi()
        assert (phi_0 - phi_0_plain).abs().max().max() < 1e-3

        theta_0_plain = level_0_plain.get_theta()
        assert (theta_0 - theta_0_plain).abs().max().max() < 1e-3

        level_1_plain = artm.ARTM(num_topics=num_topics_level_1,
                                  num_document_passes=num_document_passes,
                                  parent_model=level_0_plain,
                                  parent_model_weight=parent_level_weight,
                                  cache_theta=True,
                                  seed=level_1.seed,
                                  theta_columns_naming="title")
        level_1_plain.initialize(dictionary=dictionary)
        level_1_plain.regularizers.add(
            artm.HierarchySparsingThetaRegularizer(name="HierSparsTheta",
                                                   tau=regularizer_tau))
        level_1_plain.fit_offline(num_collection_passes=num_collection_passes,
                                  batch_vectorizer=batch_vectorizer)

        phi_1_plain = level_1_plain.get_phi()
        assert (phi_1 - phi_1_plain).abs().max().max() < 1e-3

        theta_1_plain = level_1_plain.get_theta()
        assert (theta_1 - theta_1_plain).abs().max().max() < 1e-3

        psi_plain = level_1_plain.get_parent_psi()
        assert (psi - psi_plain).abs().max().max() < 1e-3

    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
        shutil.rmtree(hierarchy_model_folder)