Example No. 1
def fit():
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    term_to_id = {}
    all_terms = []

    batch = artm.messages.Batch()
    batch.id = batch_id

    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())

    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in range(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")

    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("mode was fitted")

    model_artm.save(os.path.join(batch_id, "model"))

    return jsonify({"id": batch_id})
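
A hedged follow-up sketch, not part of the original handler: a later request could reload the Phi matrix that fit() saved and infer topic distributions for the stored batch. The variable names mirror the handler above; everything else is an assumption.

model = artm.ARTM(num_topics=topics_cnt)
model.load(os.path.join(batch_id, "model"))    # restores the saved p(w|t) matrix
bv = artm.BatchVectorizer(data_path=batch_id, data_format="batches")
theta = model.transform(batch_vectorizer=bv)   # topic-by-document DataFrame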
Example No. 2
def cluster_artm(text):
    batch_vectorizer = artm.BatchVectorizer(data_path=text,
                                            data_format='vowpal_wabbit',
                                            target_folder='batch_small',
                                            batch_size=20)
    T = 10  # number of topics
    topic_names = ["sbj" + str(i) for i in range(T - 1)] + ["bcg"]

    model_artm = artm.ARTM(num_topics=T, topic_names=topic_names, reuse_theta=True,
                           num_document_passes=1)

    np.random.seed(1)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer.data_path)
    model_artm.initialize(dictionary)

    model_artm.scores.add(artm.TopTokensScore(name='metric1', num_tokens=15))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='smoothing', dictionary=dictionary,
                                                                topic_names='bcg', tau=1e5))

    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=6)
    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='stimulates',
                                                                dictionary=dictionary,
                                                                topic_names=["sbj" + str(i) for i in range(0, 29)],
                                                                tau=-1e5))

    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=6)

    with open('cluster_log_artm.txt', 'a') as f_out:
        for topic_name in model_artm.topic_names:
            f_out.write(topic_name + ':')
            for word in model_artm.score_tracker["metric1"].last_tokens[topic_name]:
                f_out.write(word + ' ')
            f_out.write('\n')
Example No. 3
    def init_model(self, dictionary_path=None):
        """dictionary_path: optional, used with pretrained model"""
        self.dictionary = artm.Dictionary()
        if dictionary_path is None:
            self.dictionary.gather(data_path=self.batches_path)
            self.dictionary.filter(min_tf=10, max_df_rate=0.1)
            self.dictionary.save_text(
                f"{self.dir_path}/dicts/dict_{self.name_dataset}.txt")
        else:
            self.dictionary.load_text(dictionary_path)

        self.model = artm.ARTM(
            num_topics=self.n_topics,
            dictionary=self.dictionary,
            show_progress_bars=True,
        )

        # scores
        self.model.scores.add(
            artm.PerplexityScore(name="PerplexityScore",
                                 dictionary=self.dictionary))
        self.model.scores.add(
            artm.SparsityThetaScore(name="SparsityThetaScore"))
        self.model.scores.add(artm.SparsityPhiScore(name="SparsityPhiScore"))

        # regularizers
        self.model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name="SparsePhi", tau=-0.1))
        self.model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name="SparseTheta", tau=-0.5))
        self.model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name="DecorrelatorPhi", tau=1.5e5))
Example No. 4
    def get_dictionary(self) -> artm.Dictionary:
        """
        Gets dataset's dictionary.

        Returns
        -------
        artm.Dictionary

        """
        if self._cached_dict is not None:
            return self._cached_dict

        dictionary = artm.Dictionary()

        same_collection, path_to_collection = self._check_collection()

        if same_collection:
            if not os.path.isfile(self._dictionary_file_path):
                dictionary.gather(data_path=self._batches_folder_path)
                dictionary.save(dictionary_path=self._dictionary_file_path)

            dictionary.load(dictionary_path=self._dictionary_file_path)
            self._cached_dict = dictionary
        else:
            _ = self.get_batch_vectorizer()
            dictionary.gather(data_path=self._batches_folder_path)

            if os.path.isfile(self._dictionary_file_path):
                os.remove(self._dictionary_file_path)

            dictionary.save(dictionary_path=self._dictionary_file_path)
            dictionary.load(dictionary_path=self._dictionary_file_path)
            self._cached_dict = dictionary

        return self._cached_dict
Example No. 5
def dictionary_initialization(model_artm, batches_dir, min_df, max_tf):
    my_dictionary = artm.Dictionary()
    my_dictionary.gather(data_path=batches_dir)
    my_dictionary.filter(min_df=min_df, max_tf=max_tf)
    model_artm.initialize(my_dictionary)

    return model_artm, my_dictionary
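
A hedged usage sketch for the helper above; the batch folder and the filter thresholds are placeholders rather than values from the source.

model = artm.ARTM(num_topics=20, num_document_passes=1)
model, my_dictionary = dictionary_initialization(
    model, 'my_batches', min_df=5, max_tf=2000)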
Example No. 6
 def select_from_corpus(self, list_of_files: List[str],
                        preprocessor: BaseTextPreprocessor,
                        spacy_nlp: Language) -> List[str]:
     topic_model_name = os.path.normpath(self.topic_model_name.strip())
     if len(topic_model_name) == 0:
         raise ValueError('A topic model name is empty!')
     dir_name = os.path.dirname(topic_model_name)
     base_name = os.path.basename(topic_model_name)
     if len(dir_name) == 0:
         dir_name = os.path.curdir
     if len(base_name) == 0:
         raise ValueError(
             '`{0}` is incorrect name for a topic model! Base name of file is empty!'
             .format(self.topic_model_name))
     if not os.path.isdir(dir_name):
         raise ValueError(
             '`{0}` is incorrect name for a topic model! Directory `{1}` does not exist!'
             .format(self.topic_model_name, dir_name))
     collection_name = os.path.normpath(
         os.path.join(dir_name, base_name + '.collection'))
     collection_docword_name = os.path.normpath(
         os.path.join(dir_name, 'docword.' + base_name + '.collection'))
     collection_vocab_name = os.path.normpath(
         os.path.join(dir_name, 'vocab.' + base_name + '.collection'))
     if (not os.path.isfile(collection_docword_name)) or (
             not os.path.isfile(collection_vocab_name)):
         self.create_collection_as_bow_uci(list_of_files, preprocessor,
                                           spacy_nlp,
                                           collection_docword_name,
                                           collection_vocab_name)
     batches_path = os.path.normpath(
         os.path.join(dir_name, base_name + '.data_batches'))
     if os.path.isdir(batches_path):
         batch_vectorizer = artm.BatchVectorizer(data_path=batches_path,
                                                 data_format='batches')
     else:
         batch_vectorizer = artm.BatchVectorizer(
             data_path=dir_name,
             data_format='bow_uci',
             collection_name=collection_name,
             target_folder=batches_path)
     dictionary = artm.Dictionary()
     dictionary_name = os.path.normpath(topic_model_name + '.dictionary')
     if os.path.isfile(dictionary_name):
         dictionary.load(dictionary_name)
     else:
         dictionary.gather(data_path=batches_path)
         dictionary.save(dictionary_name)
     topic_model = self.load_topic_model(
         artm.ARTM(num_topics=self.number_of_topics,
                   dictionary=dictionary,
                   cache_theta=False), topic_model_name)
     if topic_model is None:
         topic_model = self.create_topic_model(topic_model_name,
                                               batch_vectorizer, dictionary)
         if topic_model is None:
             raise ValueError(
                 'The trained topic model cannot be loaded from the file `{0}`!'
                 .format(topic_model_name))
     return self.select_keywords_from_topic_model(topic_model)
Example No. 7
def test_func():
    # constants
    num_tokens = 15
    parent_level_weight = 1
    num_collection_passes = 15
    num_document_passes = 10
    num_topics_level0 = 15
    num_topics_level1 = 50
    regularizer_tau = 10 ** 5
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hier = artm.hARTM(dictionary=dictionary, cache_theta=True, num_document_passes=num_document_passes)
        
        level0 = hier.add_level(num_topics=num_topics_level0)

        level0.initialize(dictionary=dictionary)
        
        level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)
        
        hier.tmp_files_path = parent_batch_folder
        level1 = hier.add_level(num_topics=num_topics_level1, parent_level_weight=parent_level_weight)
        
        level1.initialize(dictionary=dictionary)
        
        level1.regularizers.add(artm.HierarchySparsingThetaRegularizer(name="HierSp", tau=regularizer_tau))
        
        level1.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)

        phi = hier.get_level(1).get_phi()
        assert phi.shape == (vocab_size, num_topics_level1)
        # theta = hier.get_level(1).get_theta()
        # assert theta.shape == (num_topics_level1, num_docs)
        psi = hier.get_level(1).get_psi()
        support = psi.values.max(axis=1).min()

        # This test gives different results on python27 and python35. Authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert on_python_27 or on_python_35
        
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
Example No. 8
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weights=[1., 1., 1., 1.],
                                         decorTau=1.0):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' +
                                                  name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weights[0],
                          '@first': weights[1],
                          '@second': weights[2],
                          '@third': weights[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1

    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
Example No. 9
    def get_batches(self):
        dataset_path = os.path.join(settings.DATA_DIR, "datasets",
                                    self.text_id)
        batches_folder = os.path.join(dataset_path, "batches")
        dictionary_file_name = os.path.join(batches_folder, "dictionary.txt")

        batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                                data_format="batches")
        dictionary = artm.Dictionary(name="dictionary")
        dictionary.load_text(dictionary_file_name)
        return batch_vectorizer, dictionary
Example No. 10
    def __init__(self, source_file, batches_folder='batches', batch_size=100):
        self.source_file = source_file
        self.batches_folder = batches_folder
        self.batch_vectorizer = artm.BatchVectorizer(data_path=self.source_file, data_format="vowpal_wabbit",
                                                     target_folder=self.batches_folder, batch_size=batch_size)

        dict_name = os.path.join(self.batches_folder, "dictionary.dict")
        self.dictionary = artm.Dictionary()
        if not os.path.exists(dict_name):
            self.dictionary.gather(batches_folder)
            self.dictionary.save(dict_name)
        else:
            self.dictionary.load(dict_name)
Example No. 11
    def _compute_lift(
            self,
            phi: pd.DataFrame,
            chosen_words_array: List[pd.Index] = None,
            ):
        # inspired by gist.github.com/jrnold/daa039f02486009a24cf3e83403dabf0
        artm_dict = artm.Dictionary(dictionary_path=self._dict_path)
        dict_df = artm_dict2df(artm_dict).query("class_id in @self.modalities")

        # TODO: this is possible to do using aggregate / join and stuff
        for m in self.modalities:
            subdf = dict_df.query("class_id == @m")
            idx = subdf.index
            # theoretically, token_freq is an unnecessary duplicate of token_value;
            # in practice, we have float32 errors and also the user could run
            # dictionary filtering without setting recalculate_value=True
            dict_df.loc[idx, 'token_freq'] = dict_df.loc[idx, 'token_tf'] / subdf.token_tf.sum()

        dict_df.set_index(["class_id", "token"], inplace=True)
        dict_df.index.names = ['modality', 'token']
        dict_df.sort_index(inplace=True)
        phi.sort_index(inplace=True)

        known_chosen_words_array = [
            words.intersection(dict_df.index)
            for words in chosen_words_array
        ]

        if known_chosen_words_array:
            merged_index = reduce(
                lambda idx1, idx2: idx1.union(idx2),
                known_chosen_words_array
            )
            chosen_words = merged_index.drop_duplicates()
            dict_df = dict_df.loc[chosen_words]
            phi = phi.loc[chosen_words]

        data = np.log(phi.values) - np.log(dict_df[['token_freq']].values)
        log_lift = pd.DataFrame(data=data, index=phi.index, columns=phi.columns)

        if not known_chosen_words_array:
            return log_lift

        result = []

        for t, words in zip(phi.columns, known_chosen_words_array):
            result.append(log_lift.loc[words, t].sum())

        log_lift_total = pd.Series(data=result, index=phi.columns)

        return log_lift_total
Example No. 12
def run():
    print 'BigARTM version ', artm.version(), '\n\n\n'
    preprocessing_for_artm(True)
    topics = 10
    batch_vectorizer = artm.BatchVectorizer(
        data_path="/home/goncharoff/PythonLab/labs/labs/lab5/result/result.txt",
        data_format="vowpal_wabbit",
        target_folder="batch_vectorizer_target_folder",
        batch_size=10)
    topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"]
    dictionary = artm.Dictionary("dictionary")
    dictionary.gather(batch_vectorizer.data_path)
    artm_plsa(batch_vectorizer, topics, topic_names, dictionary)
    artm_lda(batch_vectorizer, topics, dictionary)
Example No. 13
def run():
    print 'BigARTM version ', artm.version(), '\n\n\n'
    preprocessing_for_artm(True)
    topics = 10
    batch_vectorizer = artm.BatchVectorizer(
        data_path="../data/lenta.txt",
        data_format="vowpal_wabbit",
        target_folder="batch_vectorizer_target_folder",
        batch_size=10)
    topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"]
    dictionary = artm.Dictionary("dictionary")
    dictionary.gather(batch_vectorizer.data_path)
    artm_plsa(batch_vectorizer, topics, topic_names, dictionary)
    artm_lda(batch_vectorizer, topics, dictionary)
    subprocess.call(['./clear.sh'])
Example No. 14
def test_func():
    topic_selection_tau = 0.5
    num_collection_passes = 3
    num_document_passes = 10
    num_topics = 15

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    perplexity_eps = 0.1
    perplexity_value = [
        6676.941798754971, 2534.963709464024, 2463.1544861984794
    ]

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary(data_path=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary,
                          num_document_passes=num_document_passes)

        model.regularizers.add(
            artm.TopicSelectionThetaRegularizer(name='TopicSelection',
                                                tau=topic_selection_tau))
        model.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model.scores.add(
            artm.TopicMassPhiScore(name='TopicMass',
                                   model_name=model.model_nwt))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Verify that the TopicSelection regularizer zeroed out 8 of the topics
        zeroed_topics = sum(x == 0
                            for x in model.get_score('TopicMass').topic_mass)
        assert 8 == zeroed_topics

        # the following assertion fails on travis-ci builds, but passes locally
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perplexity_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)
    finally:
        shutil.rmtree(batches_folder)
Example No. 15
def create_thematic_model(checked_list, num_topics, num_tokens, phi_tau,
                          theta_tau, decorr_tau):
    """ Create a thematic model """
    gluing_bag_of_words(checked_list)

    batch_vectorizer = artm.BatchVectorizer(data_path=COLLECTION_PATH,
                                            data_format='vowpal_wabbit',
                                            target_folder=TARGET_FOLDER,
                                            batch_size=len(checked_list))
    dictionary = artm.Dictionary(data_path=TARGET_FOLDER)
    model = artm.ARTM(
        num_topics=num_topics,
        num_document_passes=len(checked_list),
        dictionary=dictionary,
        regularizers=[
            artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',
                                            tau=phi_tau),
            artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer',
                                              tau=theta_tau),
            artm.DecorrelatorPhiRegularizer(
                name='decorrelator_phi_regularizer', tau=decorr_tau),
        ],
        scores=[
            artm.PerplexityScore(name='perplexity_score',
                                 dictionary=dictionary),
            artm.SparsityPhiScore(name='sparsity_phi_score'),
            artm.SparsityThetaScore(name='sparsity_theta_score'),
            artm.TopTokensScore(name='top_tokens_score', num_tokens=num_tokens)
        ])

    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=len(checked_list))

    top_tokens = model.score_tracker['top_tokens_score']

    topic_dictionary = OrderedDict()

    for topic_name in model.topic_names:
        list_name = []
        for (token, weight) in zip(top_tokens.last_tokens[topic_name],
                                   top_tokens.last_weights[topic_name]):
            list_name.append(token + '-' + str(round(weight, 3)))
        topic_dictionary[str(topic_name)] = list_name

    return (model.score_tracker['perplexity_score'].last_value,
            model.score_tracker['sparsity_phi_score'].last_value,
            model.score_tracker['sparsity_theta_score'].last_value,
            topic_dictionary)
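
One way the four returned values might be consumed; the file names and regularizer weights below are placeholders, not values from the source.

perplexity, sparsity_phi, sparsity_theta, topics = create_thematic_model(
    checked_list=['doc1.txt', 'doc2.txt'], num_topics=10, num_tokens=10,
    phi_tau=-0.1, theta_tau=-0.5, decorr_tau=1e4)
for topic_name, tokens in topics.items():
    print(topic_name, ' '.join(tokens))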
Example No. 16
    def init_model(self, params_string, dict_path):

        self.set_params(params_string)
        self.back = ['back{}'.format(i) for i in range(self.B)]
        self.dictionary = artm.Dictionary()
        self.dictionary.load_text(dictionary_path=dict_path)

        self.model = artm.ARTM(
            num_topics=self.S + self.B,
            class_ids=['@default_class'],
            dictionary=self.dictionary,
            show_progress_bars=False,
            #                   cache_theta=True,
            topic_names=self.specific + self.back,
            num_processors=32)

        self.set_scores()
Example No. 17
def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' +
                                                  name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model_plsa = artm.ARTM(topic_names=topic_names,
                           class_ids={
                               '@text': 1.0,
                               '@first': 1.0,
                               '@second': 1.0,
                               '@third': 1.0
                           },
                           cache_theta=True,
                           theta_columns_naming='title',
                           scores=[
                               artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)
                           ])

    model_plsa.initialize(dictionary=dictionary)

    model_plsa.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model_plsa.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model_plsa.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))

    model_plsa.num_document_passes = 1

    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)

    theta_train = model_plsa.transform(batch_vectorizer=batch_vectorizer_train)

    return model_plsa, theta_train
Example No. 18
def main():
    print artm.version()
    config = ConfigPaths('config.cfg')
    plot_maker = PlotMaker()
    printer = PrintHelper()
    print config.models_file_name

    batch_vectorizer = artm.BatchVectorizer(
        data_path=config.output_batches_path, data_format='batches')
    dictionary = artm.Dictionary()
    dictionary.load(dictionary_path=config.dictionary_path + '.dict')

    models_file = open(config.models_file_name, 'a')
    # model = process_one_model(config, batch_vectorizer, models_file, printer, plot_maker,
    #                           dictionary, _n_topics=50, _n_doc_passes=5, _seed_value=100, _n_top_tokens=10, _p_mass_threshold=0.25,
    #               _n_iterations=20, _model_name='model1')

    exp = Experiment(
        Pool(topics_filter=OptimizationTopicsFilter(eps=10**(-2.5),
                                                    verbose=False),
             save_topics=True))
    for i in xrange(3):
        model_artm = process_one_model(config,
                                       batch_vectorizer,
                                       models_file,
                                       printer,
                                       plot_maker,
                                       dictionary,
                                       _n_topics=50,
                                       _n_doc_passes=5,
                                       _seed_value=100,
                                       _n_top_tokens=10,
                                       _p_mass_threshold=0.25,
                                       _n_iterations=20,
                                       _model_name='model_{}'.format(i))
        #display_points(model_artm.get_phi())
        exp.collect_topics(model_artm.get_phi(), model_artm.get_theta())
        vals, bins = exp.topics_pool.topics_filter.plot_hist()
        save_hist(vals, bins, "data_iter_{}.csv".format(i))
        print exp.topics_pool.get_basic_topics_count()
    #
    models_file.close()
Example No. 19
    def build_model(self, d_dir, n_document_passes=1):
        batch_vectorizer_train = artm.BatchVectorizer(data_path=os.path.join(
            d_dir, 'data_batches_train'),
                                                      data_format="batches")

        batch_vectorizer_test = artm.BatchVectorizer(data_path=os.path.join(
            d_dir, 'data_batches_test'),
                                                     data_format="batches")

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=os.path.join(d_dir, 'for_dict'))

        model = artm.ARTM(num_topics=self.n_topics,
                          dictionary=dictionary,
                          cache_theta=True,
                          reuse_theta=True)

        # Sparsity p(c|t)
        model.scores.add(
            artm.SparsityPhiScore(eps=EPS,
                                  name='SparsityPhiScoreC',
                                  class_id=self.c))

        # Sparsity p(w|t)
        model.scores.add(
            artm.SparsityPhiScore(eps=EPS,
                                  name='SparsityPhiScoreGram3',
                                  class_id=self.gram3))

        #Regularization of sparsity p(gram3|t)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhiGram3Regularizer',
                                            class_ids=[self.gram3]))

        #Regularization of decorr p(gram3|t)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='DecorrelatorPhiGram3Regularizer',
                class_ids=[self.gram3]))

        model.num_document_passes = n_document_passes
        return (model, batch_vectorizer_train, batch_vectorizer_test)
Example No. 20
def read_collection(target_folder, vw_name):
    if len(glob.glob(os.path.join(target_folder, '*.batch'))) < 1:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=vw_name,
            data_format='vowpal_wabbit',
            target_folder=target_folder)
    else:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=target_folder,
            data_format='batches')

    dictionary = artm.Dictionary()
    dict_path = os.path.join(target_folder, 'dict.dict')

    if not os.path.isfile(dict_path):
        dictionary.gather(data_path=batch_vectorizer.data_path)
        dictionary.save(dictionary_path=dict_path)

    dictionary.load(dictionary_path=dict_path)
    return batch_vectorizer, dictionary
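
A hedged usage sketch for read_collection(): the first call converts the Vowpal Wabbit file into batches and a dictionary, later calls reuse the cached copies. The paths and the topic count are illustrative.

bv, dictionary = read_collection(target_folder='kos_batches', vw_name='kos.vw')
model = artm.ARTM(num_topics=20, dictionary=dictionary)
model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)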
Example No. 21
    def get_dictionary(self, batch_vectorizer_path=None):
        """
        Get dictionary.

        Parameters
        ----------
        batch_vectorizer_path : str
             (Default value = None)

        Returns
        -------
        dictionary : artm.Dictionary

        """
        if self._cached_dict is not None:
            return self._cached_dict

        if batch_vectorizer_path is None:
            batch_vectorizer_path = self._batch_vectorizer_path

        dictionary = artm.Dictionary()
        dict_path = os.path.join(batch_vectorizer_path, 'dict.dict')

        same_collection, path_to_collection = self._check_collection(
            batch_vectorizer_path
        )

        if same_collection:
            if not os.path.isfile(dict_path):
                dictionary.gather(data_path=batch_vectorizer_path)
                dictionary.save(dictionary_path=dict_path)
            dictionary.load(dictionary_path=dict_path)
            self._cached_dict = dictionary
            return dictionary
        else:
            _ = self.get_batch_vectorizer(batch_vectorizer_path)
            dictionary.gather(data_path=batch_vectorizer_path)
            dictionary.save(dictionary_path=dict_path)
            dictionary.load(dictionary_path=dict_path)
            self._cached_dict = dictionary
            return dictionary
Example No. 22
@attr.s  # assumed: the attrs class decorator that the attr.ib fields below require
class DatasetCollection(object):
    dir_path = attr.ib(init=True, converter=str, validator=_id_dir, repr=True)

    allowed_modality_names = attr.ib(
        init=True, default=['@labels_class', '@ideology_class'])
    name = attr.ib(init=False,
                   default=attr.Factory(
                       lambda self: path.basename(self.dir_path),
                       takes_self=True))
    vocab_file = attr.ib(init=False,
                         default=attr.Factory(lambda self: path.join(
                             self.dir_path, 'vocab.{}.txt'.format(self.name)),
                                              takes_self=True))
    lexicon = attr.ib(init=False,
                      default=attr.Factory(
                          lambda self: artm.Dictionary(name=self.name),
                          takes_self=True))
    doc_labeling_modality_name = attr.ib(init=False, default='')
    class_names = attr.ib(init=False, default=[], validator=_class_names)
    # nb_docs = attr.ib(init=False, default=attr.Factory(lambda self: _file_len(path.join(self.dir_path, 'vowpal.{}.txt'.format(self.name))), takes_self=True))
    ppmi_file = attr.ib(init=False,
                        default=attr.Factory(lambda self: self._cooc_tf(),
                                             takes_self=True))

    def __attrs_post_init__(self):
        self.lexicon.gather(data_path=self.dir_path,
                            cooc_file_path=self.ppmi_file,
                            vocab_file_path=self.vocab_file,
                            symmetric_cooc_values=True)

    def _cooc_tf(self):
        c = glob('{}/ppmi_*tf.txt'.format(self.dir_path))
        if not c:
            raise RuntimeError(
                "Did not find any 'ppmi' (computed with simple 'tf' scheme) files in dataset directory '{}'"
                .format(self.dir_path))
        return c[0]
Example No. 23
def test_func():
    biterms_tau = 0.0
    num_collection_passes = 1
    num_document_passes = 1
    num_topics = 3
    phi_first_elem = 0.2109  # check that initialization had not changed
    phi_eps = 0.0001

    batches_folder = tempfile.mkdtemp()
    vocab_file_name = os.path.join(batches_folder, 'vocab.txt')
    cooc_file_name = os.path.join(batches_folder, 'cooc_data.txt')

    phi_values = [[0.380308, 0.659777,
                   0.429884], [0.330372, 0.012429, 0.081726],
                  [0.277840, 0.020186, 0.334808],
                  [0.011480, 0.307608, 0.153582]]

    try:
        batch = artm.messages.Batch()
        batch.token.append('A')
        batch.token.append('B')
        batch.token.append('C')
        batch.token.append('D')

        item = batch.item.add()
        item.token_id.append(0)
        item.token_id.append(2)
        item.token_id.append(3)
        item.token_id.append(0)

        item.token_weight.append(2)
        item.token_weight.append(4)
        item.token_weight.append(1)
        item.token_weight.append(1)

        item = batch.item.add()
        item.token_id.append(1)
        item.token_id.append(2)
        item.token_id.append(0)
        item.token_id.append(3)

        item.token_weight.append(3)
        item.token_weight.append(2)
        item.token_weight.append(4)
        item.token_weight.append(1)

        with open(
                os.path.join(batches_folder, '{}.batch'.format(uuid.uuid4())),
                'wb') as fout:
            fout.write(batch.SerializeToString())

        batch = artm.messages.Batch()
        batch.token.append('A')
        batch.token.append('B')
        batch.token.append('D')

        item = batch.item.add()
        item.token_id.append(0)
        item.token_id.append(1)
        item.token_id.append(2)

        item.token_weight.append(2)
        item.token_weight.append(1)
        item.token_weight.append(1)

        item = batch.item.add()
        item.token_id.append(0)
        item.token_id.append(2)

        item.token_weight.append(6)
        item.token_weight.append(2)

        with open(
                os.path.join(batches_folder, '{}.batch'.format(uuid.uuid4())),
                'wb') as fout:
            fout.write(batch.SerializeToString())

        with open(vocab_file_name, 'w') as fout:
            for e in ['A', 'B', 'C', 'D']:
                fout.write('{0}\n'.format(e))

        with open(cooc_file_name, 'w') as fout:
            fout.write('0 3 5.0\n')
            fout.write('0 1 4.0\n')
            fout.write('0 2 5.0\n')
            fout.write('1 3 2.0\n')
            fout.write('1 2 2.0\n')
            fout.write('2 3 2.0\n')

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batches_folder,
                          vocab_file_path=vocab_file_name,
                          cooc_file_path=cooc_file_name)
        batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                                data_format='batches')

        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary,
                          num_document_passes=num_document_passes)
        model.regularizers.add(
            artm.BitermsPhiRegularizer(name='Biterms',
                                       tau=biterms_tau,
                                       dictionary=dictionary))

        assert abs(model.phi_.values[0][0] - phi_first_elem) < phi_eps

        model.fit_offline(batch_vectorizer=batch_vectorizer)
        for i in range(len(phi_values)):
            for j in range(len(phi_values[0])):
                assert abs(model.phi_.values[i][j] -
                           phi_values[i][j]) < phi_eps
    finally:
        shutil.rmtree(batches_folder)
Example No. 24
def test_func():
    # constants
    num_tokens = 11
    probability_mass_threshold = 0.9
    sp_reg_tau = -0.1
    decor_tau = 1.5e+5
    decor_rel_tau = 0.3
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    sp_zero_eps = 0.001
    sparsity_phi_value = [
        0.034, 0.064, 0.093, 0.120, 0.145, 0.170, 0.194, 0.220, 0.246, 0.277,
        0.312, 0.351, 0.390, 0.428, 0.464
    ]

    sparsity_phi_rel_value = [
        0.442, 0.444, 0.444, 0.446, 0.448, 0.449, 0.458, 0.468, 0.476, 0.488,
        0.501, 0.522, 0.574, 0.609, 0.670
    ]

    sparsity_theta_value = [0.0] * num_collection_passes

    perp_zero_eps = 2.0
    perplexity_value = [
        6873, 2590, 2685, 2578, 2603, 2552, 2536, 2481, 2419, 2331, 2235, 2140,
        2065, 2009, 1964
    ]

    perplexity_rel_value = [
        6873, 2667, 2458, 2323, 2150, 2265, 2015, 1967, 1807, 1747, 1713, 1607,
        1632, 1542, 1469
    ]

    top_zero_eps = 0.0001
    top_tokens_num_tokens = [num_tokens * num_topics] * num_collection_passes
    top_tokens_topic_0_tokens = [
        u'party', u'state', u'campaign', u'tax', u'political', u'republican',
        u'senate', u'candidate', u'democratic', u'court', u'president'
    ]
    top_tokens_topic_0_weights = [
        0.0209, 0.0104, 0.0094, 0.0084, 0.0068, 0.0067, 0.0065, 0.0058, 0.0053,
        0.0053, 0.0051
    ]

    ker_zero_eps = 0.02
    topic_kernel_topic_0_contrast = 0.96
    topic_kernel_topic_0_purity = 0.014
    topic_kernel_topic_0_size = 18.0
    topic_kernel_average_size = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13, 0.6, 1.6, 3.53, 7.15, 12.6,
        20.4, 29.06
    ]
    topic_kernel_average_contrast = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12, 0.31, 0.7, 0.96, 0.96, 0.96,
        0.96, 0.97
    ]
    topic_kernel_average_purity = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.015, 0.017, 0.02,
        0.03, 0.04, 0.05
    ]

    len_last_document_ids = 10

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model = artm.ARTM(
            topic_names=['topic_{}'.format(i) for i in range(num_topics)],
            dictionary=dictionary.name,
            cache_theta=True)

        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=sp_reg_tau))
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_tau))

        model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))
        model.scores.add(
            artm.TopicKernelScore(
                name='TopicKernelScore',
                probability_mass_threshold=probability_mass_threshold))
        model.scores.add(artm.ThetaSnippetScore(name='ThetaSnippetScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityThetaScore'].value[i] -
                       sparsity_theta_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perp_zero_eps

        for i in range(num_collection_passes):
            assert model.score_tracker['TopTokensScore'].num_tokens[
                i] == top_tokens_num_tokens[i]

        for i in range(num_tokens):
            assert model.score_tracker['TopTokensScore'].last_tokens[
                model.topic_names[0]][i] == top_tokens_topic_0_tokens[i]
            assert abs(model.score_tracker['TopTokensScore'].last_weights[
                model.topic_names[0]][i] -
                       top_tokens_topic_0_weights[i]) < top_zero_eps

        assert len(model.score_tracker['TopicKernelScore'].last_tokens[
            model.topic_names[0]]) > 0

        assert abs(topic_kernel_topic_0_contrast -
                   model.score_tracker['TopicKernelScore'].last_contrast[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_purity -
                   model.score_tracker['TopicKernelScore'].last_purity[
                       model.topic_names[0]]) < ker_zero_eps
        assert abs(topic_kernel_topic_0_size -
                   model.score_tracker['TopicKernelScore'].last_size[
                       model.topic_names[0]]) < ker_zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model.score_tracker['TopicKernelScore'].average_size[i] -
                topic_kernel_average_size[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_contrast[i] -
                topic_kernel_average_contrast[i]) < ker_zero_eps
            assert abs(
                model.score_tracker['TopicKernelScore'].average_purity[i] -
                topic_kernel_average_purity[i]) < ker_zero_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)

        info = model.info
        assert info is not None
        assert len(info.config.topic_name) == num_topics
        assert len(info.score) >= len(model.score_tracker)
        assert len(info.regularizer) == len(model.regularizers.data)
        assert len(info.cache_entry) > 0

        temp = model.score_tracker['ThetaSnippetScore'].last_document_ids
        assert len_last_document_ids == len(temp)
        assert len(model.score_tracker['ThetaSnippetScore'].last_snippet[
            temp[0]]) == num_topics

        phi = model.get_phi()
        assert phi.shape == (vocab_size, num_topics)
        theta = model.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model.library_version.count('.') == 2  # major.minor.patch

        # test relative coefficients for Phi matrix regularizers
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary.name,
                          cache_theta=False)

        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi',
                                            tau=decor_rel_tau))
        model.regularizers['DecorrelatorPhi'].gamma = 0.0

        model.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 use_unigram_document_model=False,
                                 dictionary=dictionary))
        model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))

        model.num_document_passes = num_document_passes
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['SparsityPhiScore'].value[i] -
                       sparsity_phi_rel_value[i]) < sp_zero_eps

        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_rel_value[i]) < perp_zero_eps
    finally:
        shutil.rmtree(batches_folder)
Example No. 25
num_working_files = 20

# Vowpal Wabbit
batch_vectorizer = None

if len(glob.glob(os.path.join(pn_folder, vw_folder, '*.batch'))) < 1:
    batch_vectorizer = artm.BatchVectorizer(
        data_path=os.path.join(pn_folder, vw_folder, 'vw.txt'),
        data_format='vowpal_wabbit',
        target_folder=os.path.join(pn_folder, vw_folder))
else:
    batch_vectorizer = artm.BatchVectorizer(data_path=os.path.join(
        pn_folder, vw_folder),
                                            data_format='batches')

dictionary = artm.Dictionary()

dict_path = os.path.join(pn_folder, vw_folder, 'dict.dict')

if not os.path.isfile(dict_path):
    dictionary.gather(data_path=batch_vectorizer.data_path)
    dictionary.save(dictionary_path=dict_path)

dictionary.load(dictionary_path=dict_path)

dictionary.filter(min_df=2, max_df_rate=0.4)

N = 1
# model
model = create_model(dictionary=dictionary,
                     num_tokens=num_top_tokens,
Example No. 26
N_DOCUMENTS = 1000
N_TOPICS = 15
N_PASSES = 15

DATA_DIR = "../Files/DataPreprocessing/{}_documents".format(N_DOCUMENTS)
BASE_DIR = "../Files/TopicModeling/{}_documents".format(N_DOCUMENTS)
SAVE_DIR = os.path.join(BASE_DIR, "models/lda")

BATCHES_DIR = os.path.join(BASE_DIR, "batches")
DICTIONARY_FILE = os.path.join(BASE_DIR, "dictionary.dict")
COOC_FILE = os.path.join(BASE_DIR, "cooc_tf")
VOCAB_FILE = os.path.join(DATA_DIR, "vocab")

start = time.time()
bv = artm.BatchVectorizer(data_path=BATCHES_DIR, data_format="batches")
dictionary = artm.Dictionary()
dictionary.load(DICTIONARY_FILE)

cooc_dict = artm.Dictionary()
cooc_dict.gather(data_path=BATCHES_DIR,
                 cooc_file_path=COOC_FILE,
                 vocab_file_path=VOCAB_FILE,
                 symmetric_cooc_values=True)

coherence_score = artm.TopTokensScore(name='TopTokensCoherenceScore',
                                      dictionary=cooc_dict,
                                      num_tokens=15)

model_artm = artm.LDA(num_topics=N_TOPICS)

model_artm._internal_model.scores.add(
Example No. 27
            result += sys.getsizeof(v)

    return result

if __name__ == "__main__":
    global_time_start = time.time()
    batches_folder, window_size = __read_params()
    batches_list = glob.glob(os.path.join(batches_folder, '*.batch'))
    dictionaries_list = [name for name in glob.glob(os.path.join(batches_folder, '*.dict'))]

    if len(batches_list) < 1 or len(dictionaries_list) < 1:
        raise RuntimeError('No batches or dictionaries were found in given folder')
    else:
        print('{} batches were found, start processing'.format(len(batches_list)))

    temp_dict = artm.Dictionary()
    temp_dict.load(dictionaries_list[0])
    file_name = '../cooc_info/{}_temp_dict.txt'.format(time.time())
    temp_dict.save_text(file_name)

    dictionary = {}
    with codecs.open(file_name, 'r', 'utf-8') as fin:
        next(fin)
        next(fin)
        for index, line in enumerate(fin):
            dictionary[line.split(' ')[0][0: -1]] = index
    os.remove(file_name)

    global_cooc_dictionary = {}
    for index, filename in enumerate(batches_list):
        local_time_start = time.time()
Example No. 28
    def train(self):
        vocabulary_file = (self._prepare_texts_full() if self.analyze_full_doc
                           else self._prepare_texts_from_summary())
        target_folder = self._get_bigARTM_dir()

        batch_vectorizer = artm.BatchVectorizer(data_path=vocabulary_file,
                                                data_format='vowpal_wabbit',
                                                target_folder=target_folder,
                                                batch_size=100)

        dict_path = self._get_dictionary_path()
        dict_file = '{}.dict'.format(dict_path)

        if os.path.isfile(dict_file):
            os.remove(dict_file)

        my_dictionary = artm.Dictionary()
        my_dictionary.gather(data_path=target_folder,
                             vocab_file_path=vocabulary_file)
        my_dictionary.save(dictionary_path=dict_path)
        my_dictionary.load(dictionary_path=dict_file)

        T = self.num_of_topics
        topic_names = ["sbj" + str(i) for i in range(T - 1)] + ["bcg"]

        self.model_artm = artm.ARTM(num_topics=T,
                                    topic_names=topic_names,
                                    class_ids={
                                        "text": 1,
                                        "doc_guid": 1
                                    },
                                    dictionary=my_dictionary,
                                    cache_theta=True)

        self.model_artm.initialize(dictionary=my_dictionary)
        self.model_artm.scores.add(
            artm.TopTokensScore(name="text_words",
                                num_tokens=15,
                                class_id="text"))
        self.model_artm.scores.add(
            artm.TopTokensScore(name="doc_guid_words",
                                num_tokens=15,
                                class_id="doc_guid"))

        self.model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi',
                                            tau=1e5,
                                            dictionary=my_dictionary,
                                            class_ids="text",
                                            topic_names="bcg"))

        self.model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                    num_collection_passes=30)

        self.model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(
                name='SparsePhi-1e5',
                tau=-1e5,
                dictionary=my_dictionary,
                class_ids="text",
                topic_names=["sbj" + str(i) for i in range(T - 1)]))

        self.model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                                    num_collection_passes=15)

        self.training_done = True
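
A short assumed follow-up for reading the trained topics; it relies only on the "text_words" score registered in train() above and is not part of the original class.

    def top_tokens(self):
        # hedged sketch: per-topic top tokens collected by the "text_words" score
        tracker = self.model_artm.score_tracker["text_words"]
        return {topic: tracker.last_tokens[topic]
                for topic in self.model_artm.topic_names}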
Example No. 29
    def gather_dictionary(self, custom_vocab=False):
        self.log("Creating ARTM dictionary...")
        dictionary = artm.Dictionary(name="dictionary")
        batches_folder = os.path.join(self.get_folder(), "batches")
        vocab_file_path = os.path.join(self.get_folder(), "vocab.txt")
        if custom_vocab:
            dictionary.gather(batches_folder, vocab_file_path=vocab_file_path)
        else:
            dictionary.gather(batches_folder)
            vocab_file = open(vocab_file_path, "w", encoding="utf-8")
        dictionary_file_name = os.path.join(self.get_folder(), "batches",
                                            "dictionary.txt")
        dictionary.save_text(dictionary_file_name)

        self.log("Saving terms to database...")
        term_index_id = -3
        self.modalities_count = 0
        self.terms_index = dict()
        modalities_index = dict()
        with open(dictionary_file_name, "r", encoding='utf-8') as f:
            for line in f:
                term_index_id += 1
                if term_index_id < 0:
                    continue
                parsed = line.replace(',', ' ').split()
                term = Term()
                term.dataset = self
                term.text = parsed[0]
                term.index_id = term_index_id
                term.token_value = float(parsed[2])
                term.token_tf = int(parsed[3].split('.')[0])
                term.token_df = int(parsed[4].split('.')[0])
                modality_name = parsed[1]
                if modality_name not in modalities_index:
                    modality = Modality()
                    modality.index_id = self.modalities_count
                    self.modalities_count += 1
                    modality.name = modality_name
                    modality.dataset = self
                    modality.save()
                    modalities_index[modality_name] = modality
                modality = modalities_index[modality_name]
                term.modality = modality
                modality.terms_count += 1

                term.save()

                if not custom_vocab:
                    vocab_file.write("%s %s\n" % (parsed[0], parsed[1]))

                self.terms_index[term.text] = term
                self.terms_index[term.text + "$#" + term.modality.name] = term
                self.terms_index[term.index_id] = term

                if term_index_id % 10000 == 0:
                    self.log(str(term_index_id))
                    # print(term_index_id)

        if not custom_vocab:
            vocab_file.close()

        self.terms_count = term_index_id + 1

        self.log("Saving modalities...")
        max_modality_size = 0
        word_modality_id = -1
        for key, modality in modalities_index.items():
            if modality.terms_count > max_modality_size:
                word_modality_id = modality.id
                max_modality_size = modality.terms_count

        for key, modality in modalities_index.items():
            if modality.id == word_modality_id:
                modality.weight_spectrum = 1
                modality.weight_naming = 1
            if 'tag' in modality.name:
                modality.is_tag = True
            modality.save()

        self.normalize_modalities_weights()
Example No. 30
def test_func():
    # constants
    num_tokens = 15
    alpha = 0.01
    beta = 0.02
    num_collection_passes = 15
    num_document_passes = 1
    num_topics = 15
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        model_artm = artm.ARTM(num_topics=num_topics,
                               dictionary=dictionary,
                               cache_theta=True,
                               reuse_theta=True)

        model_artm.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=beta))
        model_artm.regularizers.add(
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=alpha))

        model_artm.scores.add(
            artm.SparsityThetaScore(name='SparsityThetaScore'))
        model_artm.scores.add(
            artm.PerplexityScore(name='PerplexityScore',
                                 dictionary=dictionary))
        model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
        model_artm.scores.add(
            artm.TopTokensScore(name='TopTokensScore', num_tokens=num_tokens))

        model_lda = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=beta,
                             dictionary=dictionary,
                             cache_theta=True)
        model_lda.initialize(dictionary=dictionary)

        model_artm.num_document_passes = num_document_passes
        model_lda.num_document_passes = num_document_passes

        model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                               num_collection_passes=num_collection_passes)
        model_lda.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['SparsityPhiScore'].value[i] -
                       model_lda.sparsity_phi_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(
                model_artm.score_tracker['SparsityThetaScore'].value[i] -
                model_lda.sparsity_theta_value[i]) < zero_eps

        for i in range(num_collection_passes):
            assert abs(model_artm.score_tracker['PerplexityScore'].value[i] -
                       model_lda.perplexity_value[i]) < zero_eps

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens)
        assert len(lda_tt) == num_topics

        for i in range(num_topics):
            for j in range(num_tokens):
                assert model_artm.score_tracker['TopTokensScore'].last_tokens[
                    model_artm.topic_names[i]][j] == lda_tt[i][j]

        lda_tt = model_lda.get_top_tokens(num_tokens=num_tokens,
                                          with_weights=True)
        for i in range(num_tokens):
            assert abs(model_artm.score_tracker['TopTokensScore'].last_weights[
                model_artm.topic_names[0]][i] - lda_tt[0][i][1]) < zero_eps

        model_lda.fit_online(batch_vectorizer=batch_vectorizer)

        phi = model_lda.phi_
        assert phi.shape == (vocab_size, num_topics)
        theta = model_lda.get_theta()
        assert theta.shape == (num_topics, num_docs)

        assert model_lda.library_version.count('.') == 2  # major.minor.patch

        model_lda = artm.LDA(num_topics=num_topics,
                             alpha=alpha,
                             beta=([0.1] * num_topics),
                             dictionary=dictionary,
                             cache_theta=True)
        assert model_lda._internal_model.regularizers.size() == num_topics + 1
    finally:
        shutil.rmtree(batches_folder)