예제 #1
0
# Model dimensions and doc-topic hyperparameter for this toy dataset.
K = 10  # number of topics
V = 900  # vocabulary size
gamma = 0.25  # hyperparameter over doc-topic distribution

# Global probability distribution over topics: uniform, normalized to sum to 1.
trueBeta = np.ones(K) / K

# Default generation settings, collected in one place.
# Key order matches the order in which the settings were originally assigned.
Defaults = {
    'nDocTotal': 1000,
    'nWordsPerDoc': 500,
    # Doc-topic prior: the global topic probabilities scaled by gamma.
    'topic_prior': gamma * trueBeta,
    # Topic-by-word distribution built by the Bars2D helper.
    'topics': Bars2D.Create2DBarsTopicWordParams(
        V, K, PRNG=PRNG, fracMassOnTopic=0.999),
}


def get_data_info():
    ''' Return a one-line, human-readable summary of this toy dataset.

    Returns
    -------
    str
        Description that reports the module-level topic count K.
    '''
    return 'Clean Bars Data with %d true topics. Each doc uses 1-3 bars.' % K


def get_data(seed=SEED, **kwargs):
    ''' Create toy dataset using bars topics.

    Keyword Args
    ------------
    seed : int
        Determines pseudo-random generator used to make the toy data.
gamma = 0.85  # hyperparameter over doc-topic distribution

# Global probability distribution over topics.
# Weights alternate between B and 1 across the ten entries, then are
# normalized so that they sum to 1.
B = 3.0
trueBeta = np.asarray([B, 1.0] * 5)
trueBeta = trueBeta / trueBeta.sum()

# Default generation settings, collected in one place.
# Key order matches the order in which the settings were originally assigned.
Defaults = {
    'nDocTotal': 1000,
    'nWordsPerDoc': 300,
    # Doc-topic prior: the global topic probabilities scaled by gamma.
    'topic_prior': gamma * trueBeta,
    # Topic-by-word distribution built by the Bars2D helper.
    'topics': Bars2D.Create2DBarsTopicWordParams2(V, K, r=0.1, PRNG=PRNG),
}


def get_data_info(**kwargs):
    ''' Return a one-line, human-readable summary of the Toy Bars2 dataset.

    Keyword Args
    ------------
    nDocTotal : number, optional
        Document count to report. When omitted, Defaults['nDocTotal'] is used.

    Returns
    -------
    str
        Description that reports the module-level topic count K and the
        selected document count.
    '''
    # Look up the fallback only when the caller did not supply a value.
    nDocTotal = (kwargs['nDocTotal'] if 'nDocTotal' in kwargs
                 else Defaults['nDocTotal'])
    return 'Toy Bars2 Data. Ktrue=%d. nDocTotal=%d.' % (K, nDocTotal)


def get_data(**kwargs):
    '''
        Args