def get_lda_model(X, n_topics):
    """
    Fit an LDA topic model on a doc2term matrix.

    X is first converted to a Gensim corpus via `convert_to_gensim_corpus`,
    then an `LdaMulticore` model with `n_topics` topics is trained on it.

    Returns a `(model, corpus)` tuple: the trained model and the
    Gensim-format corpus it was trained on.
    """
    gensim_corpus = convert_to_gensim_corpus(X)

    print("Performing LDA...")
    lda = LdaMulticore(gensim_corpus, num_topics=n_topics)

    return lda, gensim_corpus
from scipy.io import loadmat
import cPickle as pkl

# Single seed shared by the row sampling and the train/test split so the
# experiment is repeatable; it is also embedded in the output file names.
RANDOM_STATE = 0

X_path = '/cs/puls/Experiments/hxiao-test/feature-data.mat'
Y_path = '/cs/puls/Experiments/hxiao-test/label-data.mat'

# Load the feature and label matrices stored in the MATLAB files.
X = loadmat(X_path)['featureData']
Y = loadmat(Y_path)['labelData']

# Draw a seeded random subset of SAMPLE_N rows; the same row indices are
# applied to X and Y so features and labels stay aligned.
rng = np.random.RandomState(RANDOM_STATE)
rows = rng.permutation(X.shape[0])[:SAMPLE_N]

X = X[rows, :]
Y = Y[rows, :]

# Hold out 10% of the sampled rows as the test split.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.1, random_state=RANDOM_STATE)

# Fit LDA on the training split only, then map both splits through the
# fitted model.
model, train_corpus = get_lda_model(train_X, N_TOPICS)

train_lda_repr = infer_lda_topics(model, train_corpus)
test_lda_repr = infer_lda_topics(model, convert_to_gensim_corpus(test_X))

print("train_lda_repr.shape: {}".format(train_lda_repr.shape))
print("test_lda_repr.shape: {}".format(test_lda_repr.shape))

# Persist the LDA representations.  The files are opened in binary mode
# (recommended for pickles) inside `with` blocks so each handle is flushed
# and closed deterministically instead of being left to garbage collection.
train_out_path = 'data/train_X_lda_ntopic_{}_rng_{}.pkl'.format(
    N_TOPICS, RANDOM_STATE)
with open(train_out_path, 'wb') as train_out:
    pkl.dump(train_lda_repr, train_out)

test_out_path = 'data/test_X_lda_ntopic_{}_rng_{}.pkl'.format(
    N_TOPICS, RANDOM_STATE)
with open(test_out_path, 'wb') as test_out:
    pkl.dump(test_lda_repr, test_out)