Esempio n. 1
0
def get_lda_model(X, n_topics):
    """
    Fit an LDA topic model on a doc2term matrix.

    X is first passed through ``convert_to_gensim_corpus`` and the result
    is handed to ``LdaMulticore`` together with the requested topic count.

    Parameters:
        X: the doc2term matrix to model (whatever layout
           ``convert_to_gensim_corpus`` accepts).
        n_topics: number of topics, forwarded as ``num_topics``.

    Returns:
        A ``(model, corpus)`` tuple: the trained LDA model and the
        converted corpus it was trained on.
    """
    gensim_corpus = convert_to_gensim_corpus(X)
    # Progress message; training can take a while on large corpora.
    print("Performing LDA...")
    lda = LdaMulticore(gensim_corpus, num_topics=n_topics)
    return lda, gensim_corpus
Esempio n. 2
0
    # One seed drives the row subsampling and the train/test split, and is
    # also embedded in the output file names.
    RANDOM_STATE = 0

    from scipy.io import loadmat
    import cPickle as pkl

    X_path = '/cs/puls/Experiments/hxiao-test/feature-data.mat'
    Y_path = '/cs/puls/Experiments/hxiao-test/label-data.mat'

    # Pull the feature and label matrices out of the .mat files by
    # variable name.
    X = loadmat(X_path)['featureData']
    Y = loadmat(Y_path)['labelData']

    # Draw at most SAMPLE_N row indices from a seeded permutation.
    # NOTE(review): SAMPLE_N and N_TOPICS are not defined in this fragment;
    # they are expected to come from the enclosing scope -- confirm.
    rng = np.random.RandomState(RANDOM_STATE)
    rows = rng.permutation(X.shape[0])[:SAMPLE_N]

    # Index both matrices with the same rows so features and labels stay
    # aligned.
    X = X[rows, :]
    Y = Y[rows, :]

    # Hold out 10% of the sampled rows as the test split.
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, Y, test_size=0.1, random_state=RANDOM_STATE)

    # The model is fitted on the training split only; the test split is
    # merely converted and passed through the already-trained model.
    model, train_corpus = get_lda_model(train_X, N_TOPICS)

    train_lda_repr = infer_lda_topics(model, train_corpus)
    test_lda_repr = infer_lda_topics(model, convert_to_gensim_corpus(test_X))

    print("train_lda_repr.shape: {}".format(train_lda_repr.shape))
    print("test_lda_repr.shape: {}".format(test_lda_repr.shape))

    # Persist both representations. The files are opened in binary mode
    # (the appropriate mode for pickle data) and closed by the ``with``
    # blocks, so the data is flushed to disk even if a later step fails.
    train_out_path = 'data/train_X_lda_ntopic_{}_rng_{}.pkl'.format(
        N_TOPICS, RANDOM_STATE)
    with open(train_out_path, 'wb') as train_out:
        pkl.dump(train_lda_repr, train_out)

    test_out_path = 'data/test_X_lda_ntopic_{}_rng_{}.pkl'.format(
        N_TOPICS, RANDOM_STATE)
    with open(test_out_path, 'wb') as test_out:
        pkl.dump(test_lda_repr, test_out)