import cPickle
import datetime
import json

from numpy import double


class Topicmodel():
    '''
    Wrapper class for different topic models
    '''
    def __init__(self, folder='model', modeltype='kmeans', topics=100, topwords=10):
        # load the pickled BoW transformer (count vectorizer + tf-idf transformer)
        self.bow = cPickle.load(open(folder + '/BoW_transformer.pickle', 'rb'))
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        self.topwords = topwords
        if self.modeltype == 'kmeans':
            from sklearn.cluster import KMeans
            self.model = KMeans(n_clusters=topics, n_init=50)
        elif self.modeltype == 'kpcakmeans':
            from sklearn.cluster import KMeans
            from sklearn.decomposition import KernelPCA
            self.model = {'kpca': KernelPCA(kernel='rbf', gamma=.1),
                          'kmeans': KMeans(n_clusters=topics, n_init=50)}
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self,X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''

        # transform the list of strings into a sparse tf-idf weighted BoW matrix
        X = self.bow['tfidf_transformer'].fit_transform(
            self.bow['count_vectorizer'].fit_transform(X))

        # build a reverse lookup table mapping BoW index -> word
        vocab = self.bow['count_vectorizer'].vocabulary_
        self.idx2word = dict((idx, word) for word, idx in vocab.items())

        # train the chosen model and obtain one cluster/topic label per document
        if self.modeltype == 'kmeans':
            Xc = self.model.fit_predict(X)
        elif self.modeltype == 'kpcakmeans':
            Xc = self.model['kpca'].fit_transform(X)
            Xc = self.model['kmeans'].fit_predict(Xc)
        elif self.modeltype == 'nmf':
            # assign each document to its strongest NMF component
            Xc = self.model.fit_transform(X).argmax(axis=1)
        # for each cluster/topic compute the covariance of each word with the
        # cluster label; this measure indicates how important a word is for the topic
        self.topicstats = []
        for cluster in range(self.topics): 
            # this is a binary vector, true if a data point was in this cluster
            y = double(Xc==cluster)
            # this is the covariance of the data with the cluster label
            Xcov = X.T.dot(y)
            # find the most strongly covarying (with the cluster label) words
            wordidx = reversed(Xcov.argsort()[-self.topwords:])
            # cast to plain Python types so the stats can be JSON-serialized
            topicwords = dict((self.idx2word[idx], float(Xcov[idx]))
                              for idx in wordidx)
            self.topicstats.append({'assignments': int(y.sum()),
                                    'clusterid': cluster,
                                    'words': topicwords})

            print 'Topic %d: %3d Assignments ' % (cluster, y.sum()) \
                + 'Topwords: ' + ' '.join(topicwords.keys()[:self.topwords])

        datestr = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        fn = self.folder + '/topicmodel-%s-' % self.modeltype + datestr + '.json'
        print "Saving model stats to " + fn
        with open(fn, 'w') as fh:
            fh.write(json.dumps(self.topicstats))

    def predict(self,X):
        '''
        predicts cluster assignments for a list of strings

        INPUT
        X   list of strings
        '''
        # accept a single string as well as a list of strings
        if not isinstance(X, list):
            X = [X]
        X = self.bow['tfidf_transformer'].transform(\
            self.bow['count_vectorizer'].transform(X))
        if self.modeltype == 'kmeans':
            return self.model.predict(X)
        elif self.modeltype == 'kpcakmeans':
            return self.model['kmeans'].predict(self.model['kpca'].transform(X))
        elif self.modeltype == 'nmf':
            return self.model.transform(X).argmax(axis=1)
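
A minimal usage sketch follows, assuming the pickled BoW transformer is a dict holding a CountVectorizer under 'count_vectorizer' and a TfidfTransformer under 'tfidf_transformer' (both are re-fit inside fit()); the folder name and sample documents are illustrative only.

# --- usage sketch (assumptions noted above) ---
import os
import cPickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# make sure the model folder exists
if not os.path.isdir('model'):
    os.makedirs('model')

# write a BoW transformer pickle in the layout the class expects
bow = {'count_vectorizer': CountVectorizer(),
       'tfidf_transformer': TfidfTransformer()}
with open('model/BoW_transformer.pickle', 'wb') as fh:
    cPickle.dump(bow, fh)

# toy corpus, purely illustrative
docs = ['the cat sat on the mat',
        'dogs and cats are pets',
        'stock markets fell sharply today',
        'investors sold shares amid market turmoil']

# fit a small k-means topic model and predict the topic of a new document
tm = Topicmodel(folder='model', modeltype='kmeans', topics=2, topwords=5)
tm.fit(docs)
print tm.predict('the dog chased the cat')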