Example 1
    def create_batches(self, d_dir):
        new_folder = os.path.join(d_dir, 'data_batches')
        target_folder_train = "%s%s" % (new_folder, '_train')
        target_folder_test = "%s%s" % (new_folder, '_test')

        # Each BatchVectorizer below is constructed only for its side effect of
        # writing batch files into the target folder; the objects themselves
        # are not reused afterwards.
        artm.BatchVectorizer(
            batch_size=100000,
            data_path=os.path.join(d_dir, 'train.txt'),
            data_format='vowpal_wabbit',
            target_folder=target_folder_train)

        artm.BatchVectorizer(
            batch_size=100000,
            data_path=os.path.join(d_dir, 'test.txt'),
            data_format='vowpal_wabbit',
            target_folder=target_folder_test)

        os.rename(os.path.join(target_folder_train, 'aaaaaa.batch'),
                  os.path.join(target_folder_train, 'train.batch'))

        os.rename(os.path.join(target_folder_test, 'aaaaaa.batch'),
                  os.path.join(target_folder_test, 'test.batch'))

        folder_for_dict = os.path.join(d_dir, 'for_dict')
        os.mkdir(folder_for_dict)

        shutil.copy(os.path.join(target_folder_train, 'train.batch'),
                    folder_for_dict)

        shutil.copy(os.path.join(target_folder_test, 'test.batch'),
                    folder_for_dict)
Example 2
 def select_from_corpus(self, list_of_files: List[str],
                        preprocessor: BaseTextPreprocessor,
                        spacy_nlp: Language) -> List[str]:
     topic_model_name = os.path.normpath(self.topic_model_name.strip())
     if len(topic_model_name) == 0:
         raise ValueError('A topic model name is empty!')
     dir_name = os.path.dirname(topic_model_name)
     base_name = os.path.basename(topic_model_name)
     if len(dir_name) == 0:
         dir_name = os.path.curdir
     if len(base_name) == 0:
         raise ValueError(
             '`{0}` is incorrect name for a topic model! Base name of file is empty!'
             .format(self.topic_model_name))
     if not os.path.isdir(dir_name):
         raise ValueError(
             '`{0}` is incorrect name for a topic model! Directory `{1}` does not exist!'
             .format(self.topic_model_name, dir_name))
     collection_name = os.path.normpath(
         os.path.join(dir_name, base_name + '.collection'))
     collection_docword_name = os.path.normpath(
         os.path.join(dir_name, 'docword.' + base_name + '.collection'))
     collection_vocab_name = os.path.normpath(
         os.path.join(dir_name, 'vocab.' + base_name + '.collection'))
     if (not os.path.isfile(collection_docword_name)) or (
             not os.path.isfile(collection_vocab_name)):
         self.create_collection_as_bow_uci(list_of_files, preprocessor,
                                           spacy_nlp,
                                           collection_docword_name,
                                           collection_vocab_name)
     batches_path = os.path.normpath(
         os.path.join(dir_name, base_name + '.data_batches'))
     if os.path.isdir(batches_path):
         batch_vectorizer = artm.BatchVectorizer(data_path=batches_path,
                                                 data_format='batches')
     else:
         batch_vectorizer = artm.BatchVectorizer(
             data_path=dir_name,
             data_format='bow_uci',
             collection_name=collection_name,
             target_folder=batches_path)
     dictionary = artm.Dictionary()
     dictionary_name = os.path.normpath(topic_model_name + '.dictionary')
     if os.path.isfile(dictionary_name):
         dictionary.load(dictionary_name)
     else:
         dictionary.gather(data_path=batches_path)
         dictionary.save(dictionary_name)
     topic_model = self.load_topic_model(
         artm.ARTM(num_topics=self.number_of_topics,
                   dictionary=dictionary,
                   cache_theta=False), topic_model_name)
     if topic_model is None:
         topic_model = self.create_topic_model(topic_model_name,
                                               batch_vectorizer, dictionary)
         if topic_model is None:
             raise ValueError(
                 'The trained topic model cannot be loaded from the file `{0}`!'
                 .format(topic_model_name))
     return self.select_keywords_from_topic_model(topic_model)
Example 3
    def get_batch_vectorizer(self, batch_vectorizer_path=None):
        """
        Get batch vectorizer.

        Parameters
        ----------
        batch_vectorizer_path : str
             (Default value = None)

        Returns
        -------
        batch_vectorizer :

        """
        if batch_vectorizer_path is None:
            batch_vectorizer_path = self._batch_vectorizer_path

        same_collection, path_to_collection = self._check_collection(
            batch_vectorizer_path
        )

        if same_collection:
            batches_exist = len(glob(os.path.join(batch_vectorizer_path, '*.batch'))) > 0
            if not batches_exist:
                self.write_vw(path_to_collection)
                batch_vectorizer = artm.BatchVectorizer(
                    data_path=path_to_collection,
                    data_format='vowpal_wabbit',
                    target_folder=batch_vectorizer_path,
                    batch_size=self.batch_size
                )
            else:
                batch_vectorizer = artm.BatchVectorizer(
                    data_path=batch_vectorizer_path,
                    data_format='batches'
                )
        else:
            warnings.warn(W_DIFF_BATCHES_1 + W_DIFF_BATCHES_2.format(batch_vectorizer_path))
            try:
                shutil.rmtree(batch_vectorizer_path)
            except FileNotFoundError:
                pass
            os.mkdir(batch_vectorizer_path)
            self.write_vw(path_to_collection)

            batch_vectorizer = artm.BatchVectorizer(
                data_path=path_to_collection,
                data_format='vowpal_wabbit',
                target_folder=batch_vectorizer_path,
                batch_size=self.batch_size
            )

        return batch_vectorizer
Example 4
 def _init_batch_vectorizer(self, vwpath='data/lenta.vw',
                            batches_path='data/batches',
                            create_batch_files=True):
     self.logger.info("Init BatchVectorizer for %s ...", batches_path)
     if create_batch_files:
         batch_vect = artm.BatchVectorizer(data_path=vwpath,
                                           data_format='vowpal_wabbit',
                                           target_folder=batches_path)
     else:
         self.logger.info("Create from existing batches")
         batch_vect = artm.BatchVectorizer(data_path=batches_path,
                                           data_format='batches')
     self.batch_vectorizer = batch_vect
Example 5
def prepare_batch_vectorizer(batches_dir, vw_path, data_path, column_name):
    if not glob.glob(os.path.join(batches_dir, "*")):
        prepare_batches(batches_dir,
                        vw_path,
                        data_path,
                        column_name=column_name)
        batch_vectorizer = artm.BatchVectorizer(data_path=vw_path,
                                                data_format="vowpal_wabbit",
                                                target_folder=batches_dir,
                                                batch_size=100)
    else:
        batch_vectorizer = artm.BatchVectorizer(data_path=batches_dir,
                                                data_format='batches')

    return batch_vectorizer
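
A minimal usage sketch for prepare_batch_vectorizer above; the file names, the column name and the topic count are assumptions, not part of the original example:

# Hypothetical inputs: "corpus.csv" holds the raw documents, "corpus.vw" is the
# Vowpal Wabbit file written by prepare_batches, and "batches" stores the batches.
bv = prepare_batch_vectorizer(batches_dir="batches",
                              vw_path="corpus.vw",
                              data_path="corpus.csv",
                              column_name="text")
model = artm.ARTM(num_topics=20, dictionary=bv.dictionary)
model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)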
Example 6
def pipeline_lda_bigartm(lines,
                         n_clusters,
                         ngram_range,
                         topnwords,
                         LOGS_DATA_PATH="plsa.txt",
                         TARGET_FOLDER="plsa"):

    make_file(lines, ngram_range, LOGS_DATA_PATH)

    bv = artm.BatchVectorizer(data_path=LOGS_DATA_PATH,
                              data_format='vowpal_wabbit',
                              target_folder=TARGET_FOLDER)

    lda = artm.LDA(num_topics=n_clusters,
                   alpha=0.01,
                   beta=0.001,
                   cache_theta=True,
                   dictionary=bv.dictionary)
    lda.fit_offline(batch_vectorizer=bv)

    top_tokens = lda.get_top_tokens(num_tokens=topnwords)
    topic_names = {}
    for i, token_list in enumerate(top_tokens):
        topic_names[i] = token_list

    return label_after_bigarm(lda), topic_names
Example 7
    def __init__(self, uci_dir, dictionary, n_topics):
        bv = artm.BatchVectorizer(data_format='bow_uci', data_path=uci_dir, collection_name='corpus',
                                  target_folder=uci_dir + '/artm_batches')
        bv_dict = bv.dictionary

        logging.info("Fitting the ARTM model")
        model = artm.ARTM(dictionary=bv_dict, num_topics=n_topics)

        model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)

        logging.info("Processing word-topic matrices")

        # Create a new word-topic matrix according to dictionary indices
        self.phi = np.zeros(model.phi_.shape, dtype=np.float64)
        for word, vec in model.phi_.iterrows():
            idx = dictionary.token2id[word[1]]
            self.phi[idx, :] = vec

        logging.info("Building the index for ARTM")
        corpus = model.transform(bv).T.sort_index()
        corpus = [matutils.full2sparse(row) for index, row in corpus.iterrows()]
        self.index = similarities.MatrixSimilarity(corpus, num_features=n_topics, num_best=self.N_BEST)

        self.model = model
        self.dictionary = dictionary
Example 8
def fit():
    batch_id = str(uuid.uuid4())
    app.logger.info("batch %s", batch_id)

    rjson = request.json
    terms = rjson['terms']
    topics_cnt = rjson['topics']

    batch = artm.messages.Batch()
    batch.id = batch_id
    term_to_id = {}
    all_terms = []

    for i, doc in enumerate(terms):
        item = batch.item.add()
        item.id = i
        field = item.field.add()
        for term in doc:
            if term not in term_to_id:
                term_to_id[term] = len(all_terms)
                all_terms.append(term)
            field.token_id.append(term_to_id[term])
            field.token_count.append(1)

    for t in all_terms:
        batch.token.append(t)

    os.mkdir(batch_id)
    with open(os.path.join(batch_id, "batch.batch"), 'wb') as fout:
        fout.write(batch.SerializeToString())

    app.logger.info("batch %s is created", batch_id)

    dictionary = artm.Dictionary()
    dictionary.gather(batch_id)

    model_artm = artm.ARTM(
        topic_names=['topic_{}'.format(i) for i in xrange(topics_cnt)],
        scores=[
            artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)
        ],
        regularizers=[
            artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15)
        ],
        show_progress_bars=False)

    batch_vectorizer = artm.BatchVectorizer(data_path=batch_id,
                                            data_format="batches")

    model_artm.initialize(dictionary=dictionary)
    app.logger.info("model is starting to fit")
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer,
                           num_collection_passes=1)
    app.logger.info("mode was fitted")

    model_artm.save(os.path.join(batch_id, "model"))

    return jsonify({"id": batch_id})
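
A hypothetical client call for the endpoint above, assuming the function is exposed as a /fit route on a local Flask server (the route, host and port are assumptions):

import requests

# Two toy documents, each given as a list of terms, clustered into 2 topics.
resp = requests.post("http://localhost:5000/fit",
                     json={"terms": [["apple", "banana"], ["banana", "cherry"]],
                           "topics": 2})
print(resp.json()["id"])  # id of the folder holding the batch and the saved model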
Example 9
def bigartm_predict(mess_bigartm, top1_cat):
    theta_test = None
    try:
        mess_bigartm = ' |text ' + mess_bigartm

        # load the model
        T = 10
        model_artm = artm.ARTM(num_topics=T,
                               topic_names=['sbj' + str(i) for i in range(T)],
                               class_ids={'text': 1})
        model_artm.load(os.path.join(artm_dir, top1_cat + ".dump"))

        # save the text to a file
        with open(os.path.join(currdir, 'flask/test_artm.txt'), 'w') as f:
            f.write(mess_bigartm + '\n')

        batch_vectorizer_test = artm.BatchVectorizer(
            data_path=os.path.join(currdir, 'flask/test_artm.txt'),
            data_format='vowpal_wabbit',
            target_folder=os.path.join(currdir, 'flask/test'),
            batch_size=100)
        theta_test = model_artm.transform(
            batch_vectorizer=batch_vectorizer_test)

    except Exception:
        print('Error loading bigartm')
    return theta_test
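
A short follow-up sketch: the theta matrix returned above is a pandas DataFrame with topics as rows and the single document as a column, so the dominant topic can be read off like this (the arguments are placeholders):

theta_test = bigartm_predict("example message text", "some_category")
if theta_test is not None:
    best_topic = theta_test.iloc[:, 0].idxmax()  # e.g. 'sbj3'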
Example 10
def transform_one(model, vw_path, batch_path):
    transform_batch = artm.BatchVectorizer(data_format="vowpal_wabbit",
                                           data_path=vw_path,
                                           batch_size=1,
                                           target_folder=batch_path)
    transform_theta = model.transform(transform_batch)
    return transform_theta[:-1]  # the last topic is background
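
A minimal usage sketch for transform_one; the model file and document paths are hypothetical:

model = artm.ARTM(num_topics=20)
model.load("model.dump")  # restore a previously trained phi matrix
theta = transform_one(model, vw_path="one_doc.vw", batch_path="one_doc_batches")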
Example 11
    def prepare_data(self, data):
        """
        data -- array of tokenized text
        """
        self.vwpath_dir = f"{self.dir_path}/vwpath/"
        if not os.path.exists(self.vwpath_dir):
            print("creating vw path...\n")
            os.makedirs(self.vwpath_dir)

        with open(self.vwpath, "w") as fp:
            for i, text in enumerate(data):
                fp.write("{} |default {}\n".format(i, text))

        self.batches_path = f"{self.dir_path}/batches/{self.name_dataset}"

        if not os.path.exists(self.batches_path):
            print("creating batches path...\n")
            os.makedirs(self.batches_path)

        self.batch_vectorizer = artm.BatchVectorizer(
            data_path=self.vwpath,
            data_format="vowpal_wabbit",
            target_folder=self.batches_path,
            gather_dictionary=False,
        )

        if not os.path.exists(f"{self.dir_path}/dicts/"):
            print("creating dicts path...\n")
            print(f"{self.dir_path}/dicts/")
            os.makedirs(f"{self.dir_path}/dicts/")
Example 12
def cluster_artm(text):
    batch_vectorizer = artm.BatchVectorizer(data_path=text,
                                            data_format='vowpal_wabbit',
                                            target_folder='batch_small',
                                            batch_size=20)
    T = 10  # number of topics
    topic_names = ["sbj" + str(i) for i in range(T - 1)] + ["bcg"]

    model_artm = artm.ARTM(num_topics=T, topic_names=topic_names, reuse_theta=True,
                           num_document_passes=1)

    np.random.seed(1)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer.data_path)
    model_artm.initialize(dictionary)

    model_artm.scores.add(artm.TopTokensScore(name='metric1', num_tokens=15))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='smoothing', dictionary=dictionary,
                                                                topic_names='bcg', tau=1e5))

    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=6)
    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='stimulates',
                                                                dictionary=dictionary,
                                                                topic_names=["sbj" + str(i) for i in range(T - 1)],
                                                                tau=-1e5))

    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=6)

    for topic_name in model_artm.topic_names:
        with open('cluster_log_artm.txt', 'a') as f_in:
            f_in.write(topic_name + ':')
            for word in model_artm.score_tracker["metric1"].last_tokens[topic_name]:
                f_in.write(word + ' ')
            f_in.write('\n')
Example 13
    def fit_offline(self,
                    batch_vectorizer,
                    num_collection_passes=1,
                    *args,
                    **kwargs):
        modified_batch_vectorizer = artm.BatchVectorizer(
            batches=[''],
            data_path=batch_vectorizer.data_path,
            batch_size=batch_vectorizer.batch_size,
            gather_dictionary=False)

        del modified_batch_vectorizer.batches_list[0]
        del modified_batch_vectorizer.weights[0]
        for batch, weight in zip(batch_vectorizer.batches_list,
                                 batch_vectorizer.weights):
            modified_batch_vectorizer.batches_list.append(batch)
            modified_batch_vectorizer.weights.append(weight)
        modified_batch_vectorizer.batches_list.append(
            artm.batches_utils.Batch(self._phi_batch_path))
        modified_batch_vectorizer.weights.append(self._phi_batch_weight)
        # import_batches_args = artm.wrapper.messages_pb2.ImportBatchesArgs(
        #                           batch=[self.parent_batch])
        # self._lib.ArtmImportBatches(self.master.master_id, import_batches_args)

        super(ARTM_Level,
              self).fit_offline(modified_batch_vectorizer,
                                num_collection_passes=num_collection_passes,
                                *args,
                                **kwargs)
Example 14
 def load_batches(self, batch_files_list):
     self._mod_tr.batch_vectorizer = artm.BatchVectorizer(
         collection_name=self._col,
         data_path=self._batches_target_dir,
         data_format='batches')
     print("Vectorizer initialized from [{}] 'batch' files found in '{}'".
           format(', '.join(batch_files_list), self._batches_target_dir))
Example 15
def get_topic_weights(data_folder, tm_index):
    import artm
    import os

    from dags.bigartm.services.bigartm_utils import load_monkey_patch
    from util.constants import BASE_DAG_DIR

    print("!!!", "Get topic weights")
    batches_folder = os.path.join(data_folder, "batches")
    batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                            data_format='batches')
    model_folder = os.path.join(BASE_DAG_DIR, "bigartm_models")
    model_artm = artm.ARTM(num_topics=tm_index.number_of_topics,
                           class_ids={"text": 1},
                           theta_columns_naming="title",
                           reuse_theta=True,
                           cache_theta=True,
                           num_processors=4)
    model_artm.load = load_monkey_patch
    model_artm.load(model_artm,
                    os.path.join(model_folder, f"model_{tm_index.name}.model"))

    theta = model_artm.transform(batch_vectorizer)

    theta_values = theta.values.transpose().astype(float)
    theta_topics = theta.index.array.to_numpy().astype(str)
    theta_documents = theta.columns.array.to_numpy().astype(str)

    return theta_values, theta_topics, theta_documents
Example 16
def test_func():
    # constants
    num_tokens = 15
    parent_level_weight = 1
    num_collection_passes = 15
    num_document_passes = 10
    num_topics_level0 = 15
    num_topics_level1 = 50
    regularizer_tau = 10 ** 5
    vocab_size = 6906
    num_docs = 3430
    zero_eps = 0.001

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()
    parent_batch_folder = tempfile.mkdtemp()

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=batch_vectorizer.data_path)

        hier = artm.hARTM(dictionary=dictionary, cache_theta=True, num_document_passes=num_document_passes)
        
        level0 = hier.add_level(num_topics=num_topics_level0)

        level0.initialize(dictionary=dictionary)
        
        level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)
        
        hier.tmp_files_path = parent_batch_folder
        level1 = hier.add_level(num_topics=num_topics_level1, parent_level_weight=parent_level_weight)
        
        level1.initialize(dictionary=dictionary)
        
        level1.regularizers.add(artm.HierarchySparsingThetaRegularizer(name="HierSp", tau=regularizer_tau))
        
        level1.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=num_collection_passes)

        phi = hier.get_level(1).get_phi()
        assert phi.shape == (vocab_size, num_topics_level1)
        # theta = hier.get_level(1).get_theta()
        # assert theta.shape == (num_topics_level1, num_docs)
        psi = hier.get_level(1).get_psi()
        support = psi.values.max(axis=1).min()

        # This test gives different results on python27 and python35. Authors need to investigate.
        on_python_27 = abs(support - 0.0978) < zero_eps
        on_python_35 = abs(support - 0.1522) < zero_eps
        assert(on_python_27 or on_python_35)
        
    finally:
        shutil.rmtree(batches_folder)
        shutil.rmtree(parent_batch_folder)
Example 17
def create_and_learn_ARTM_decorPhi_modal(name="",
                                         topic_number=750,
                                         num_collection_passes=1,
                                         weigths=[1., 1., 1., 1.],
                                         decorTau=1.0):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model = artm.ARTM(topic_names=topic_names,
                      class_ids={
                          '@text': weigths[0],
                          '@first': weigths[1],
                          '@second': weigths[2],
                          '@third': weigths[3]
                      },
                      cache_theta=True,
                      theta_columns_naming='title',
                      scores=[
                          artm.PerplexityScore(name='PerplexityScore',
                                               dictionary=dictionary)
                      ])
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            name='DecorrelatorPhi_modals',
            tau=decorTau,
            class_ids=['@first', '@second', '@third']))

    model.initialize(dictionary=dictionary)

    model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
    model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
    model.scores.add(
        artm.TopicKernelScore(name='TopicKernelScore',
                              class_id='@text',
                              probability_mass_threshold=0.3))
    model.scores.add(
        artm.TopTokensScore(name='TopTokensScore',
                            num_tokens=6,
                            class_id='@text'))
    model.scores.add(
        artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@third'))

    model.num_document_passes = 1

    model.fit_offline(batch_vectorizer=batch_vectorizer_train,
                      num_collection_passes=num_collection_passes)

    theta_train = model.transform(batch_vectorizer=batch_vectorizer_train)

    return model, theta_train
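
A hypothetical call of the function above; 'corpus_vw' stands for a Vowpal Wabbit file in the current directory whose lines carry the @text, @first, @second and @third modalities the model expects:

model, theta_train = create_and_learn_ARTM_decorPhi_modal(name='corpus_vw',
                                                          topic_number=100,
                                                          num_collection_passes=5)
print(model.score_tracker['PerplexityScore'].last_value)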
Example 18
 def create_batches(self, use_ideology_information=False):
     self._mod_tr.batch_vectorizer = artm.BatchVectorizer(
         collection_name=self._col,
         data_path=self._data_path_hash[use_ideology_information],
         data_format=self.ideology_flag2data_format[use_ideology_information],
         target_folder=self._batches_target_dir)
     print("Vectorizer initialized from '{}' file".format(
         self.ideology_flag2data_format[use_ideology_information]))
Example 19
    def build_model(self, d_dir, n_document_passes=1):
        batch_vectorizer_train = artm.BatchVectorizer(
            data_path=os.path.join(d_dir, 'data_batches_train'),
            data_format="batches")

        batch_vectorizer_test = artm.BatchVectorizer(
            data_path=os.path.join(d_dir, 'data_batches_test'),
            data_format="batches")

        dictionary = artm.Dictionary()
        dictionary.gather(data_path=os.path.join(d_dir, 'for_dict'))

        model = artm.ARTM(num_topics=self.n_topics,
                          dictionary=dictionary,
                          cache_theta=True,
                          reuse_theta=True)

        # Sparsity p(c|t)
        model.scores.add(
            artm.SparsityPhiScore(eps=EPS,
                                  name='SparsityPhiScoreC',
                                  class_id=self.c))

        # Sparsity p(w|t)
        model.scores.add(
            artm.SparsityPhiScore(eps=EPS,
                                  name='SparsityPhiScoreGram3',
                                  class_id=self.gram3))

        # Regularization of sparsity p(gram3|t)
        model.regularizers.add(
            artm.SmoothSparsePhiRegularizer(name='SparsePhiGram3Regularizer',
                                            class_ids=[self.gram3]))

        # Decorrelation regularization for p(gram3|t)
        model.regularizers.add(
            artm.DecorrelatorPhiRegularizer(
                name='DecorrelatorPhiGram3Regularizer',
                class_ids=[self.gram3]))

        model.num_document_passes = n_document_passes
        return (model, batch_vectorizer_train, batch_vectorizer_test)
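
A sketch of how the returned triple might be consumed, assuming tm is an instance of the surrounding class and that create_batches() from Example 1 has already populated d_dir:

model, bv_train, bv_test = tm.build_model(d_dir='data', n_document_passes=1)
model.fit_offline(batch_vectorizer=bv_train, num_collection_passes=10)
print(model.score_tracker['SparsityPhiScoreGram3'].last_value)
theta_test = model.transform(batch_vectorizer=bv_test)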
Example 20
def read_collection(target_folder, vw_name):
    if len(glob.glob(os.path.join(target_folder, '*.batch'))) < 1:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=vw_name,
            data_format='vowpal_wabbit',
            target_folder=target_folder)
    else:
        batch_vectorizer = artm.BatchVectorizer(
            data_path=target_folder,
            data_format='batches')

    dictionary = artm.Dictionary()
    dict_path = os.path.join(target_folder, 'dict.dict')

    if not os.path.isfile(dict_path):
        dictionary.gather(data_path=batch_vectorizer.data_path)
        dictionary.save(dictionary_path=dict_path)

    dictionary.load(dictionary_path=dict_path)
    return batch_vectorizer, dictionary
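
A minimal usage sketch; the batch folder and Vowpal Wabbit file names are assumptions:

bv, dictionary = read_collection(target_folder='my_batches', vw_name='corpus.vw')
model = artm.ARTM(num_topics=20, dictionary=dictionary)
model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)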
Example 21
    def get_batches(self):
        dataset_path = os.path.join(settings.DATA_DIR, "datasets",
                                    self.text_id)
        batches_folder = os.path.join(dataset_path, "batches")
        dictionary_file_name = os.path.join(batches_folder, "dictionary.txt")

        batch_vectorizer = artm.BatchVectorizer(data_path=batches_folder,
                                                data_format="batches")
        dictionary = artm.Dictionary(name="dictionary")
        dictionary.load_text(dictionary_file_name)
        return batch_vectorizer, dictionary
Example 22
 def transform_one(self, vw_path, batch_path):
     transform_batch = artm.BatchVectorizer(data_format="vowpal_wabbit",
                                            data_path=vw_path,
                                            batch_size=1,
                                            target_folder=batch_path)
     transform_theta = self._model.transform(transform_batch)
     response = {}
     for artm_tid, pdt in transform_theta["upload"].items():
         if artm_tid in self._from_artm_tid_map:
             topic_id = self._from_artm_tid_map[artm_tid]
             response[topic_id] = float(pdt)
     return response
Example 23
    def __init__(self, source_file, batches_folder='batches', batch_size=100):
        self.source_file = source_file
        self.batches_folder = batches_folder
        self.batch_vectorizer = artm.BatchVectorizer(data_path=self.source_file, data_format="vowpal_wabbit",
                                                     target_folder=self.batches_folder, batch_size=batch_size)

        dict_name = os.path.join(self.batches_folder, "dictionary.dict")
        self.dictionary = artm.Dictionary()
        if not os.path.exists(dict_name):
            self.dictionary.gather(batches_folder)
            self.dictionary.save(dict_name)
        else:
            self.dictionary.load(dict_name)
Example 24
def experiment(filename, tau_phi, tau_theta):
    batch_vectorizer = artm.BatchVectorizer(data_path=filename, data_format='vowpal_wabbit',
                                            target_folder='batches')

    dictionary = batch_vectorizer.dictionary

    topic_num = 30
    tokens_num = 100
    print("ARTM training")
    topic_names = ['topic_{}'.format(i) for i in range(topic_num)]
    model_artm = artm.ARTM(topic_names=topic_names, dictionary=dictionary, cache_theta=True)
    model_plsa = artm.ARTM(topic_names=topic_names, cache_theta=True,
                           scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)])
    model_lda = artm.LDA(num_topics=topic_num)

    model_artm.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_artm.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_artm.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_artm.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=tokens_num))
    model_artm.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_artm.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_artm.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_artm.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_plsa.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    model_plsa.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
    model_plsa.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    model_plsa.scores.add(artm.TopTokensScore(name='top_tokens_score'))
    model_plsa.scores.add(artm.TopicKernelScore(name='topic_kernel_score', probability_mass_threshold=0.3))
    model_plsa.scores.add(artm.BackgroundTokensRatioScore(name='background_tokens_ratio_score'))
    model_plsa.scores.add(artm.ClassPrecisionScore(name='class_precision_score'))
    model_plsa.scores.add(artm.TopicMassPhiScore(name='topic_mass_phi_score'))

    model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
    model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
    model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

    model_artm.regularizers['sparse_phi_regularizer'].tau = tau_phi
    model_artm.regularizers['sparse_theta_regularizer'].tau = tau_theta
    model_artm.regularizers['decorrelator_phi_regularizer'].tau = 1e+3

    model_plsa.initialize(dictionary=dictionary)
    model_artm.initialize(dictionary=dictionary)
    model_lda.initialize(dictionary=dictionary)

    passes = 100
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)
    model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=passes)

    print_measures(model_plsa, model_artm, model_lda)
Example 25
    def get_batch_vectorizer(self) -> artm.BatchVectorizer:
        """
        Gets batch vectorizer.

        Returns
        -------
        artm.BatchVectorizer

        """
        same_collection, path_to_collection = self._check_collection()

        if same_collection:
            batches_exist = len(
                glob(os.path.join(self._batches_folder_path, '*.batch'))) > 0

            if not batches_exist:
                self.write_vw(path_to_collection)

                return artm.BatchVectorizer(
                    data_path=path_to_collection,
                    data_format='vowpal_wabbit',
                    target_folder=self._batches_folder_path,
                    batch_size=self.batch_size)
            else:
                return artm.BatchVectorizer(
                    data_path=self._batches_folder_path, data_format='batches')

        if os.path.isdir(self._batches_folder_path):
            warnings.warn(W_DIFF_BATCHES_1 +
                          W_DIFF_BATCHES_2.format(self._batches_folder_path))
            self.clear_batches_folder()

        self.write_vw(path_to_collection)

        return artm.BatchVectorizer(data_path=path_to_collection,
                                    data_format='vowpal_wabbit',
                                    target_folder=self._batches_folder_path,
                                    batch_size=self.batch_size)
Example 26
def run():
    print 'BigARTM version ', artm.version(), '\n\n\n'
    preprocessing_for_artm(True)
    topics = 10
    batch_vectorizer = artm.BatchVectorizer(
        data_path="/home/goncharoff/PythonLab/labs/labs/lab5/result/result.txt",
        data_format="vowpal_wabbit",
        target_folder="batch_vectorizer_target_folder",
        batch_size=10)
    topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"]
    dictionary = artm.Dictionary("dictionary")
    dictionary.gather(batch_vectorizer.data_path)
    artm_plsa(batch_vectorizer, topics, topic_names, dictionary)
    artm_lda(batch_vectorizer, topics, dictionary)
Example 27
    def prepare_batch_files(name, batch_size, batches_path, id2word=None):
        if not os.path.exists(batches_path):
            vw_path = os.path.join(DATA_PATH, '{}.vw'.format(name))
            if not os.path.exists(vw_path):
                print 'Converting {}.mm -> {}.vw'.format(name, name)
                convert_mm_to_vw(os.path.join(DATA_PATH, '{}.mm'.format(name)),
                                 vw_path)

            bv = artm.BatchVectorizer(data_path=vw_path,
                                      data_format='vowpal_wabbit',
                                      target_folder=batches_path,
                                      batch_size=batch_size,
                                      gather_dictionary=False)
            if id2word is not None:
                with open(os.path.join(batches_path, 'vocab.txt'),
                          'w') as fout:
                    for i in xrange(len(id2word.keys())):
                        fout.write('{}\n'.format(i + 1))
            return bv
        else:
            return artm.BatchVectorizer(data_path=batches_path,
                                        data_format='batches',
                                        batch_size=batch_size)
Example 28
def run():
    print 'BigARTM version ', artm.version(), '\n\n\n'
    preprocessing_for_artm(True)
    topics = 10
    batch_vectorizer = artm.BatchVectorizer(
        data_path="../data/lenta.txt",
        data_format="vowpal_wabbit",
        target_folder="batch_vectorizer_target_folder",
        batch_size=10)
    topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"]
    dictionary = artm.Dictionary("dictionary")
    dictionary.gather(batch_vectorizer.data_path)
    artm_plsa(batch_vectorizer, topics, topic_names, dictionary)
    artm_lda(batch_vectorizer, topics, dictionary)
    subprocess.call(['./clear.sh'])
Example 29
def processPath(path, subd):
    batch_vectorizer = artm.BatchVectorizer(data_path=path + "\\" + subd +
                                            "\\" + "batches",
                                            data_format='batches')
    # if not os.path.isfile('kos/dictionary.dict'):
    #     dictionary.gather(data_path=batch_vectorizer.data_path)
    #     dictionary.save(dictionary_path='kos/dictionary.dict')
    # dictionary = artm.Dictionary()
    # dictionary.gather(data_path='my_collection_batches')
    if not os.path.exists(path + "\\" + subd + "\\" + "dictionary"):
        os.makedirs(path + "\\" + subd + "\\" + "dictionary", mode=0o777)
    batch_vectorizer.dictionary.save(dictionary_path=path + "\\" + subd +
                                     "\\" + "dictionary\\dictionary.dict")
    batch_vectorizer.dictionary.save_text(dictionary_path=path + "\\" + subd +
                                          "\\" + "dictionary\\dictionary.txt")
def test_func():
    topic_selection_tau = 0.5
    num_collection_passes = 3
    num_document_passes = 10
    num_topics = 15

    data_path = os.environ.get('BIGARTM_UNITTEST_DATA')
    batches_folder = tempfile.mkdtemp()

    perplexity_eps = 0.1
    perplexity_value = [
        6676.941798754971, 2534.963709464024, 2463.1544861984794
    ]

    try:
        batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                                data_format='bow_uci',
                                                collection_name='kos',
                                                target_folder=batches_folder)

        dictionary = artm.Dictionary(data_path=batches_folder)
        model = artm.ARTM(num_topics=num_topics,
                          dictionary=dictionary,
                          num_document_passes=num_document_passes)

        model.regularizers.add(
            artm.TopicSelectionThetaRegularizer(name='TopicSelection',
                                                tau=topic_selection_tau))
        model.scores.add(artm.PerplexityScore(name='PerplexityScore'))
        model.scores.add(
            artm.TopicMassPhiScore(name='TopicMass',
                                   model_name=model.model_nwt))
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # Verify that the TopicSelection regularizer zeroed out 8 of the topics
        topics_left = sum(x == 0
                          for x in model.get_score('TopicMass').topic_mass)
        assert 8 == topics_left

        # the following assertion fails on travis-ci builds, but passes locally
        for i in range(num_collection_passes):
            assert abs(model.score_tracker['PerplexityScore'].value[i] -
                       perplexity_value[i]) < perplexity_eps

        model.fit_online(batch_vectorizer=batch_vectorizer)
    finally:
        shutil.rmtree(batches_folder)