Example no. 1
    def get_jsonable_from_parameters(self):
        """
        Gets artm model params.

        Returns
        -------
        dict
            artm model parameters

        """
        parameters = transform_complex_entity_to_dict(self._model)

        regularizers = {}
        for name, regularizer in iteritems(self._model._regularizers.data):
            tau = None
            gamma = None
            try:
                tau = regularizer.tau
                gamma = regularizer.gamma
            except KeyError:
                # some regularizer configs expose no tau/gamma
                pass
            regularizers[name] = [str(regularizer.config), tau, gamma]
        for name, regularizer in iteritems(self.custom_regularizers):
            tau = getattr(regularizer, 'tau', None)
            gamma = getattr(regularizer, 'gamma', None)
            config = str(getattr(regularizer, 'config', ''))
            regularizers[name] = [config, tau, gamma]

        parameters['regularizers'] = regularizers
        parameters['version'] = artm.version()

        return parameters
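A minimal usage sketch; `tm` is a hypothetical instance of the class that defines this method, and `default=str` is a guard for any value `json` cannot serialize directly:

import json

# Hypothetical usage: `tm` is an instance exposing get_jsonable_from_parameters().
params = tm.get_jsonable_from_parameters()
print(json.dumps(params, indent=2, default=str))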
Example no. 2
def main():
    print(artm.version())
    config = ConfigPaths('config.cfg')
    plot_maker = PlotMaker()
    printer = PrintHelper()
    print(config.models_file_name)

    batch_vectorizer = artm.BatchVectorizer(
        data_path=config.output_batches_path, data_format='batches')
    dictionary = artm.Dictionary()
    dictionary.load(dictionary_path=config.dictionary_path + '.dict')

    models_file = open(config.models_file_name, 'a')
    # model = process_one_model(config, batch_vectorizer, models_file, printer, plot_maker,
    #                           dictionary, _n_topics=50, _n_doc_passes=5, _seed_value=100, _n_top_tokens=10, _p_mass_threshold=0.25,
    #               _n_iterations=20, _model_name='model1')

    exp = Experiment(
        Pool(topics_filter=OptimizationTopicsFilter(eps=10**(-2.5),
                                                    verbose=False),
             save_topics=True))
    for i in range(3):
        model_artm = process_one_model(config,
                                       batch_vectorizer,
                                       models_file,
                                       printer,
                                       plot_maker,
                                       dictionary,
                                       _n_topics=50,
                                       _n_doc_passes=5,
                                       _seed_value=100,
                                       _n_top_tokens=10,
                                       _p_mass_threshold=0.25,
                                       _n_iterations=20,
                                       _model_name='model_{}'.format(i))
        #display_points(model_artm.get_phi())
        exp.collect_topics(model_artm.get_phi(), model_artm.get_theta())
        vals, bins = exp.topics_pool.topics_filter.plot_hist()
        save_hist(vals, bins, "data_iter_{}.csv".format(i))
        print(exp.topics_pool.get_basic_topics_count())
    #
    models_file.close()
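The `save_hist` helper is not shown in the excerpt; a plausible sketch matching the call above (hypothetical, the real project may differ):

import csv

# Hypothetical helper matching save_hist(vals, bins, path) above:
# write one (bin, value) row per histogram bar.
def save_hist(vals, bins, path):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['bin', 'value'])
        for b, v in zip(bins, vals):
            writer.writerow([b, v])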
Example no. 3
def run():
    print('BigARTM version', artm.version(), '\n\n\n')
    preprocessing_for_artm(True)
    topics = 10
    batch_vectorizer = artm.BatchVectorizer(
        data_path="/home/goncharoff/PythonLab/labs/labs/lab5/result/result.txt",
        data_format="vowpal_wabbit",
        target_folder="batch_vectorizer_target_folder",
        batch_size=10)
    topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"]
    dictionary = artm.Dictionary("dictionary")
    dictionary.gather(batch_vectorizer.data_path)
    artm_plsa(batch_vectorizer, topics, topic_names, dictionary)
    artm_lda(batch_vectorizer, topics, dictionary)
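`artm_plsa` and `artm_lda` are project helpers not shown here; a minimal sketch of what `artm_plsa` might do, assuming it fits a plain, unregularized ARTM model (the topic count is implied by `len(topic_names)`):

def artm_plsa(batch_vectorizer, n_topics, topic_names, dictionary):
    # Hypothetical helper: build and fit a PLSA-like model without regularizers.
    model = artm.ARTM(topic_names=topic_names,
                      dictionary=dictionary,
                      scores=[artm.PerplexityScore(name='perplexity',
                                                   dictionary=dictionary)])
    model.fit_offline(batch_vectorizer=batch_vectorizer,
                      num_collection_passes=10)
    return model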
Example no. 4
def run():
    print('BigARTM version', artm.version(), '\n\n\n')
    preprocessing_for_artm(True)
    topics = 10
    batch_vectorizer = artm.BatchVectorizer(
        data_path="../data/lenta.txt",
        data_format="vowpal_wabbit",
        target_folder="batch_vectorizer_target_folder",
        batch_size=10)
    topic_names = ["topic#1" + str(i) for i in range(topics - 1)] + ["bcg"]
    dictionary = artm.Dictionary("dictionary")
    dictionary.gather(batch_vectorizer.data_path)
    artm_plsa(batch_vectorizer, topics, topic_names, dictionary)
    artm_lda(batch_vectorizer, topics, dictionary)
    subprocess.call(['./clear.sh'])
Example no. 5
import artm
print(artm.version())
print(artm.ARTM(num_topics=10).info)
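String equality or substring tests on `artm.version()` are brittle; a sketch of a numeric guard, assuming the usual "major.minor.patch" format (e.g. "0.9.0"):

import artm

# Parse "major.minor.patch" and compare numerically rather than by string.
major, minor = (int(x) for x in artm.version().split(".")[:2])
if (major, minor) >= (0, 10):
    print("running the 0.10+ line")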
Example no. 6
import pytest
import warnings
import shutil
import artm

from ..cooking_machine.models.dummy_topic_model import DummyTopicModel
from ..cooking_machine.models.topic_model import TopicModel
from ..cooking_machine.experiment import Experiment
from ..cooking_machine.dataset import Dataset, W_DIFF_BATCHES_1
from ..cooking_machine.models.example_score import ScoreExample
from ..cooking_machine.models.blei_lafferty_score import BleiLaffertyScore

ARTM_NINE = artm.version().split(".")[1] == "9"  # True on the 0.9.x line
MAIN_MODALITY = "@text"
NGRAM_MODALITY = "@ngramms"
EXTRA_MODALITY = "@str"

# to run all tests
@pytest.fixture(scope="function")
def experiment_enviroment(request):
    """ """
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", message=W_DIFF_BATCHES_1)
        dataset = Dataset('tests/test_data/test_dataset.csv')
        dictionary = dataset.get_dictionary()

    model_artm = artm.ARTM(
        num_topics=5,
        class_ids={MAIN_MODALITY: 1.0, NGRAM_MODALITY: 1.0, EXTRA_MODALITY: 1.0},
        num_document_passes=1, dictionary=dictionary,
        scores=[artm.PerplexityScore(name='PerplexityScore')],
    )
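    # Hypothetical follow-up: wrap the artm model with the TopicModel class
    # imported above, assuming the TopicModel(artm_model=..., model_id=...)
    # signature visible in Example no. 9.
    topic_model = TopicModel(artm_model=model_artm, model_id='test_model')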
Example no. 7
def __init__(self, model):
    self.model = model
    self.phi = model.get_phi()
    # `pd` is pandas (import pandas as pd at module level)
    if '10' in artm.version():
        self.phi = self.phi.set_index(
            pd.MultiIndex.from_tuples(self.phi.index))
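The `set_index` call above turns a tuple-keyed index into a proper `MultiIndex`; a toy sketch (hypothetical modality and tokens) of what that enables:

import pandas as pd

# Toy phi matrix with a (modality, token) MultiIndex, mimicking the shape
# the wrapper builds; one modality's rows select by the first index level.
phi = pd.DataFrame(
    {'topic_0': [0.6, 0.4]},
    index=pd.MultiIndex.from_tuples([('@text', 'cat'), ('@text', 'dog')]))
print(phi.loc['@text'])  # rows of the '@text' modality only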
Example no. 8
import artm

print('artm.version()', artm.version())


def create_and_learn_PLSA(name="", topic_number=750, num_collection_passes=1):

    batch_vectorizer_train = artm.BatchVectorizer(data_path='./' + name,
                                                  data_format='vowpal_wabbit',
                                                  target_folder='folder' + name)
    dictionary = artm.Dictionary()
    dictionary.gather(data_path=batch_vectorizer_train.data_path)
    topic_names = ['topic_{}'.format(i) for i in range(topic_number)]

    model_plsa = artm.ARTM(topic_names=topic_names,
                           class_ids={
                               '@text': 1.0,
                               '@first': 1.0,
                               '@second': 1.0,
                               '@third': 1.0
                           },
                           cache_theta=True,
                           theta_columns_naming='title',
                           scores=[
                               artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)
                           ])

    model_plsa.initialize(dictionary=dictionary)
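    # Hypothetical continuation (not shown in the excerpt): fit the
    # initialized model on the gathered batches and return it.
    model_plsa.fit_offline(batch_vectorizer=batch_vectorizer_train,
                           num_collection_passes=num_collection_passes)
    return model_plsa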
Example no. 9
from numbers import Number

import artm
from artm.wrapper.exceptions import ArtmException

from six import iteritems
from copy import deepcopy

from inspect import signature

# reduce log verbosity: with glog-style levels (0=INFO .. 3=FATAL),
# minloglevel = 3 keeps only the most severe messages
lc = artm.messages.ConfigureLoggingArgs()
lc.minloglevel = 3
lib = artm.wrapper.LibArtm(logging_config=lc)

LIBRARY_VERSION = artm.version()
ARTM_NINE = LIBRARY_VERSION.split(".")[1] == "9"  # True on the 0.9.x line

SUPPORTED_SCORES_WITHOUT_VALUE_PROPERTY = (
    artm.score_tracker.TopTokensScoreTracker,
    artm.score_tracker.ThetaSnippetScoreTracker,
    artm.score_tracker.TopicKernelScoreTracker,
)
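
# A hypothetical helper illustrating how the tuple above is meant to be used:
# these trackers expose per-topic collections instead of a single .value.
def _tracker_has_value_property(score_tracker):
    return not isinstance(score_tracker, SUPPORTED_SCORES_WITHOUT_VALUE_PROPERTY)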


class TopicModel(BaseModel):
    """
    Topic Model contains an artm model and all necessary information: scores, training pipeline, etc.

    """
    def __init__(self, artm_model=None, model_id=None,