Example #1
def nmf_new(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # Numerical solver: 'pg' is a Projected Gradient solver (deprecated);
    # 'cd' is a Coordinate Descent solver (recommended).
    model = NMF(n_components=n_components, init=init,
                random_state=random_state)
    # TODO: refactor the four blocks below into a loop.
    # fit_transform is more efficient than calling fit followed by transform,
    # so each matrix is factorized once; W gives the patient stratification
    # and components_ (H) gives the gene components.
    W = model.fit_transform(mut_final)
    gene_comp = model.components_.copy()
    patient_strat = np.argmax(W, axis=1).copy()

    W = model.fit_transform(mut_diff)
    gene_comp_diff = model.components_.copy()
    patient_strat_diff = np.argmax(W, axis=1).copy()

    W = model.fit_transform(mut_mean_qn)
    gene_comp_mean_qn = model.components_.copy()
    patient_strat_mean_qn = np.argmax(W, axis=1).copy()

    W = model.fit_transform(mut_median_qn)
    gene_comp_median_qn = model.components_.copy()
    patient_strat_median_qn = np.argmax(W, axis=1).copy()

    return (gene_comp, patient_strat,
            gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
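The TODO above asks for the four repeated blocks to be folded into a loop; a minimal sketch of that refactor (nmf_new_looped is a hypothetical name, same defaults as nmf_new, results returned in input order) could look like this:

import numpy as np
from sklearn.decomposition import NMF

def nmf_new_looped(matrices, n_components, init='nndsvdar', random_state=0):
    # matrices: iterable of non-negative arrays, e.g.
    # (mut_final, mut_diff, mut_mean_qn, mut_median_qn)
    results = []
    for mat in matrices:
        model = NMF(n_components=n_components, init=init,
                    random_state=random_state)
        W = model.fit_transform(mat)                  # samples x components
        results.append(model.components_.copy())      # gene components (H)
        results.append(np.argmax(W, axis=1).copy())   # sample stratification
    return tuple(results)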
def extractTemplate(y, w=d_w, h=d_h, n_components=nc):
    # Note: max_iter and beta come from module-level globals; `beta` was a
    # parameter of older scikit-learn NMF releases and no longer exists in
    # current versions.
    model = NMF(n_components=n_components, max_iter=max_iter, beta=beta)
    S = librosa.core.stft(y, n_fft=w, hop_length=h)
    # Only the learned dictionary (components_) is kept; the activations
    # returned by fit_transform are discarded.
    model.fit_transform(np.abs(S).T)
    components = model.components_.T
    #components, activation = librosa.decompose.decompose(np.abs(S), n_components=3)
    return components
Example #3
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(n_components=4, init="random", random_state=0)
    m.fit_transform(A)
    t = m.transform(A)
    A_new = m.inverse_transform(t)
    assert_array_almost_equal(A, A_new, decimal=2)
class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models
    
    '''
    def __init__(self,folder='model',modeltype='kpca',topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder,steps=['hashing','tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf',gamma=1.,n_components=topics)
        if self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self,X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''

        # transform list of strings into sparse BoW matrix
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].fit_transform(\
        #    self.bow['count_vectorizer'].fit_transform(X))

        # depending on the model, train
        if self.modeltype == 'kpca':
            Xc = self.model.fit_transform(X)
        if self.modeltype == 'nmf':
            Xc = self.model.fit_transform(X)


    def predict(self,X):
        '''
        predicts cluster assignment from list of strings
        
        INPUT
        X   list of strings
        '''
        if not isinstance(X, list): X = [X]
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].transform(\
        #    self.bow['count_vectorizer'].transform(X))
        
        if self.modeltype == 'kpca':
            return self.model.transform(X)
        if self.modeltype == 'nmf':
            return self.model.transform(X)
Example #5
def test_nmf_transform_custom_init():
    # Smoke test that checks if NMF.transform works with custom initialization
    A = np.abs(random_state.randn(6, 5))
    n_components = 4
    avg = np.sqrt(A.mean() / n_components)
    H_init = np.abs(avg * random_state.randn(n_components, 5))
    W_init = np.abs(avg * random_state.randn(6, n_components))

    m = NMF(solver="cd", n_components=n_components, init="custom", random_state=0)
    m.fit_transform(A, W=W_init, H=H_init)
    m.transform(A)
    def get_features(head_and_body):
        filename = "NMF_topics" + str(n_topics) + "topics"

        if include_holdout == True:
            filename += "_holdout"

        if include_unlbled_test == True:
            filename += "unlbled_test"

        if not (os.path.exists(features_dir + "/" + filename + ".pkl")):
            X_all, vocab = get_all_data(head_and_body, filename)

            # Calculate the n most important topics of the bodies. Each topic contains all words, ordered by
            # importance; the more of a topic's important words a body contains, the higher its value for that topic.
            nfm = NMF(n_components=n_topics, random_state=1, alpha=.1)

            print("NMF_topics: fit and transform body")
            t0 = time()
            nfm.fit_transform(X_all)
            print("done in %0.3fs." % (time() - t0))

            with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
                joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            vocab = get_vocab(head_and_body, filename)
            with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
                nfm = joblib.load(handle)

        vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        print("NMF_topics: transform head and body")
        # apply the NMF model trained on body topics to the headlines => if the headlines and bodies share topics,
        # their vectors should be similar
        nfm_head_matrix = nfm.transform(X_train_head)
        nfm_body_matrix = nfm.transform(X_train_body)

        if cosinus_dist == False:
            return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
        else:
            # calculate cosine distance between the body and head
            X = []
            for i in range(len(nfm_head_matrix)):
                X_head_vector = np.array(nfm_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
                X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
                cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
                X.append(cos_dist.tolist())
            return X
Example #7
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    m = NMF(n_components=4, init="nndsvd", random_state=0)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
Example #8
    def nmf(self, **kwargs):
        """Perform dimensionality reduction using NMF."""
        nmf = NMF(**kwargs)

        reduced_matrix = nmf.fit_transform(self.matrix)
        # TODO: it is incorrect to pass self.column_labels! There are no column labels for the reduced matrix.
        return Space(reduced_matrix, self.row_labels, self.column_labels)
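As the TODO notes, the reduced matrix has latent NMF dimensions rather than the original columns; one possible fix is sketched below (it assumes Space accepts any sequence of column labels, which is an assumption about this codebase):

    def nmf(self, **kwargs):
        """Perform dimensionality reduction using NMF."""
        nmf = NMF(**kwargs)
        reduced_matrix = nmf.fit_transform(self.matrix)
        # Label the latent dimensions explicitly instead of reusing the
        # original column labels, which no longer apply after reduction.
        component_labels = ['nmf_component_{}'.format(i)
                            for i in range(reduced_matrix.shape[1])]
        return Space(reduced_matrix, self.row_labels, component_labels)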
Example #9
def test_nmf_fit_nn_output():
    # Test that the decomposition does not contain negative values
    A = np.c_[5 * np.ones(5) - np.arange(1, 6), 5 * np.ones(5) + np.arange(1, 6)]
    for init in (None, "nndsvd", "nndsvda", "nndsvdar"):
        model = NMF(n_components=2, init=init, random_state=0)
        transf = model.fit_transform(A)
        assert_false((model.components_ < 0).any() or (transf < 0).any())
Example #10
def find_template(music_stft, sr, min_t, n_components, start, end):
    """
    from Prem
    :param music_stft:
    :param sr:
    :param min_t:
    :param n_components:
    :param start:
    :param end:
    :return:
    """
    template_stft = music_stft[:, start:end]
    layer = librosa.istft(template_stft)
    layer_rms = np.sqrt(np.mean(layer * layer))

    comps = []
    acts = []
    errors = []

    for T in range(min_t, n_components):
        transformer = NMF(n_components=T)
        comps.append(transformer.fit_transform(np.abs(template_stft)))
        acts.append(transformer.components_)
        errors.append(transformer.reconstruction_err_)

    # knee = np.diff(errors, 2)
    # knee = knee.argmax() + 2
    knee = 0

    # print 'Using %d components' % (knee + min_t)
    return comps[knee], acts[knee]
Example #11
def hog2hognmf(hog_feature):
    """Transform HOG feature into HOG-NMF feature.

    Parameters
    ----------
    hog_feature: np.ndarray
      HOG feature.
    """
    mat = np.zeros((500, 8), dtype=np.float32)
    NMFmodel = NMF(n_components=2, init="random", random_state=0)
    # Transform 3780 into 500 * 8
    for i in range(7):
        mat[:, i] = hog_feature[i * 500 : (i + 1) * 500]
    mat[:280, 7] = hog_feature[3500:]
    W = NMFmodel.fit_transform(mat)
    H = NMFmodel.components_
    hognmf_feature = np.array([], dtype=np.float32)
    for i in range(8):
        _sum = np.sum(H[:, i])
        if _sum == 0:
            H[:, i] *= 0.0
        else:
            H[:, i] /= _sum
        hognmf_feature = np.append(hognmf_feature, H[:, i])
    for i in range(500):
        _sum = np.sum(W[i, :])
        if _sum == 0:
            W[i, :] *= 0.0
        else:
            W[i, :] /= _sum
        hognmf_feature = np.append(hognmf_feature, W[i, :])
    return hognmf_feature
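A minimal smoke-test sketch for hog2hognmf, using a random non-negative vector of the expected 3780 dimensions in place of a real HOG descriptor:

import numpy as np

hog_feature = np.random.rand(3780).astype(np.float32)  # stand-in HOG descriptor
feat = hog2hognmf(hog_feature)
# 8 columns of H (length 2 each) plus 500 rows of W (length 2 each) -> 1016 values
print(feat.shape)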
def get_LDA(X, num_components=10, show_topics=True):
	''' Latent Dirichlet Allocation by NMF.
	21 Nov 2015, Keunwoo Choi

	LDA for a song-tag matrix. The motivation is same as get_LSI. 
	With NMF, it is easier to explain what each topic represent - by inspecting 'H' matrix,
	where X ~= X' = W*H as a result of NMF. 
	It is also good to have non-negative elements, straight-forward for both W and H.

	'''

	from sklearn.decomposition import NMF
	
	nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt.
	W = nmf.fit_transform(X)
	H = nmf.components_
	print '='*60
	print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1]))

	term_rankings = []
	moodnames = cP.load(open(PATH_DATA + FILE_DICT['sorted_tags'], 'r')) #list, 100
	for topic_index in range( H.shape[0] ):
		top_indices = np.argsort( H[topic_index,:] )[::-1][0:10]
		term_ranking = [moodnames[i] for i in top_indices]
		term_rankings.append(term_ranking)
		if show_topics:	
			print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) )
	print '='*60
	cP.dump(nmf, open(PATH_DATA + 'NMF_object.cP', 'w'))
	cP.dump(term_rankings, open(PATH_DATA + ('topics_strings_%d_components.cP' % num_components), 'w'))
	for row_idx, row in enumerate(W):
		if np.max(row) != 0:
			W[row_idx] = row / np.max(row)
	return W / np.max(W) # return normalised matrix, [0, 1]
 def infer_topics(self, num_topics=10):
     self.nb_topics = num_topics
     nmf = NMF(n_components=num_topics)
     topic_document = nmf.fit_transform(self.corpus.sklearn_vector_space)
     self.topic_word_matrix = []
     self.document_topic_matrix = []
     vocabulary_size = len(self.corpus.vocabulary)
     row = []
     col = []
     data = []
     for (topic_idx, topic) in enumerate(nmf.components_):
         for i in range(vocabulary_size):
             row.append(topic_idx)
             col.append(i)
             data.append(topic[i])
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     row = []
     col = []
     data = []
     doc_count = 0
     for doc in topic_document:
         topic_count = 0
         for topic_weight in doc:
             row.append(doc_count)
             col.append(topic_count)
             data.append(topic_weight)
             topic_count += 1
         doc_count += 1
     self.document_topic_matrix = coo_matrix((data, (row, col)),
                                             shape=(self.corpus.size, self.nb_topics)).tocsr()
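Since nmf.components_ and topic_document are already dense arrays, the element-wise loops above can be replaced by constructing the sparse matrices directly; a minimal equivalent sketch (explicit zeros are simply not stored, which is usually acceptable):

import numpy as np
from scipy.sparse import csr_matrix

def to_sparse_topic_matrices(components, topic_document):
    # components: (n_topics, vocabulary_size) array from nmf.components_
    # topic_document: (n_documents, n_topics) array from nmf.fit_transform(...)
    topic_word_matrix = csr_matrix(np.asarray(components))
    document_topic_matrix = csr_matrix(np.asarray(topic_document))
    return topic_word_matrix, document_topic_matrix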
def reduceDimensionality(n_components=100):
	# import the csv into a pandas df
	df = pd.read_csv('data/gameData.csv')

	# Scale the numeric columns: min-max to [0,1], then halved to [0,0.5]
	numericColumns = ['maxPlayers','maxPlaytime','minAge','minPlayers','minPlaytime','playtime']
	colsToNormalize = []
	for col in numericColumns:
		if col in df.columns:
			colsToNormalize.append(col)

	df[colsToNormalize] = df[colsToNormalize].apply(lambda x: (x - x.min())/(x.max() - x.min())/2)

	# Drop string columns
	colsToDrop = ['artists','categories','designers','families','publishers','mechanics','boardGameId','yearPublished']

	# Convert df to an array for NMF and store the board game id column to attach later
	boardGameIds = df['boardGameId']
	arr = df[[col for col in df.columns if col not in colsToDrop]].to_numpy()
	arr = np.nan_to_num(arr)

	# Perform NMF with n_dimensions
	model = NMF(n_components=n_components)
	W = model.fit_transform(arr)
	W = np.insert(W, 0, boardGameIds, axis=1)

	np.savetxt("data/reducedGameFeatures.csv", W, delimiter=",")
Example #15
 def extract_tfidf_nmf_feats(self, df_data, n_components):
     """
     Extract tfidf features using nmf.     
     """        
     df_feat = pd.DataFrame(index=range(df_data.shape[0]))
     tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
     tsvd = TruncatedSVD(n_components=n_components, random_state = 2016)
     nmf = NMF(solver='cd', n_components=n_components, init='nndsvda',
                 random_state=0, tol=1e-3)
     df_data['q'].to_csv('q', index=False)
     df_data['t'].to_csv('t', index=False)
     df_data['d'].to_csv('d', index=False)
     print('fitting in tfidf')
     tfidf.set_params(input='filename')        
     tfidf.fit(['q','t','d'])
     tfidf.set_params(input='content')  
     for col in ['d', 't', 'q', 'b']:
         print('process column', col)
         txt = df_data[col]
         tfidf_mat = tfidf.transform(txt)
         nd_feat = nmf.fit_transform(tfidf_mat)
         tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i) \
                                     for i in range(n_components)])
         df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True)
     saveit(df_feat, 'df_tfidf_nmf_feats')
def nnMatrixFactorisation(data, labels, new_dimension):
    print("non negative matrix factorisation...")
    start = time.time()
    mf = NMF(n_components=new_dimension)
    reduced = mf.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
Example #17
def nmf_model2(n_topics,document_term_mat):
    # print("\n\n---------\n decomposition")
    nmf = NMF(n_components=n_topics, l1_ratio=0.0)
    W_sklearn = nmf.fit_transform(document_term_mat)
    H_sklearn = nmf.components_
    # describe_nmf_results(document_term_mat, W_sklearn, H_sklearn)
    return W_sklearn, H_sklearn
	def __Factorize_NMF(self,K):
		model = NMF(n_components=K,max_iter=self._iteration)
		# fit_transform both fits the model and returns the user factors,
		# so a separate fit() call is not needed.
		user_fmat = model.fit_transform(self._mat)
		item_fmat = model.components_.T

		return user_fmat,item_fmat
def do_NMF(sparse_matrix):
  t0 = time.time()
  print("* Performing NMF on sparse matrix ... ")
  nmf = NMF(n_components=3)
  coordinates = nmf.fit_transform(sparse_matrix)
  print("done in %0.3fs." % (time.time() - t0))
  return(coordinates)
Example #20
def nmf_df(sym, k, coll):
    data = [ item for item in coll.find({'text': { '$in' :[re.compile(sym)] }}) ]
    sents = [ sentence['text'] for sentence in data ]
    dates = [ str(text['created_at']) for text in data ]
    d = np.array(dates).T
    d = d.reshape(len(dates), 1)

    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(sents)
    #features = vectorizer.get_feature_names()

    model = NMF(n_components=k, init='random', random_state=0)
    latent_features = model.fit_transform(X)

    # lat0 = list(latent_features[:,0])
    # lat1 = list(latent_features[:,1])
    # lat2 = list(latent_features[:,2])
    # lat3 = list(latent_features[:,3])

    df = pd.DataFrame(latent_features)   #np.concatenate((d, latent_features), axis=1)
    df.columns = [ 'lat'+ str(n) for n in range(len(df.columns)) ]
    df['time_stamp'] = d
    #print df.head()

    df['date'] = pd.to_datetime(df['time_stamp']).dt.normalize()
    df.pop('time_stamp')
    #print df.head()
    grouped_data = df.groupby(['date']).mean()
    grouped_data['sym'] = sym

    return grouped_data
Example #21
def find_aspects(sentences, city, n_top_words=15):
    '''
    INPUT sentences, city(str, lower case)
    OUTPUT aspects dictionary
    '''
    vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english')
    document_term_mat = vectorizer.fit_transform(sentences)
    feature_words = vectorizer.get_feature_names()

    nmf = NMF(n_components=n_topics)
    W_sklearn = nmf.fit_transform(document_term_mat)
    H_sklearn = nmf.components_
    important_words = []

    for topic in H_sklearn:
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            important_words.append(feature_words[i])
    important_words = set(important_words)
    important_words = list(important_words)

    nouns = []
    for i in sentences: nouns.extend(list(TextBlob(i).noun_phrases))
    noun_list = list(set(filter(lambda x: (len(x.split(' '))>1)&('...' not in x.split(' ')), nouns)))
    aspects_dict = defaultdict(list)

    for i in important_words:
        if i not in [city, city.lower(),'okay','ok','thing','things','time','times','greasy','awful'] and TextBlob(i).tags[0][1] in ['NN', 'NNS']:
            for j in noun_list:
                if i in j.split(' '):
                    aspects_dict[i].append(j)
    for i in aspects_dict: aspects_dict[i] = list(set(aspects_dict[i]))

    return aspects_dict
def extract_reconstruction_error_beats(comps, music_stft, beats):
	K = comps.shape[1]
	#initialize transformer (non-negative matrix factorization) with K components
	transformer = NMF(n_components = K, init = 'custom')
	#W and H are random at first
	W = np.random.rand(comps.shape[0], K)
	start = 0
	errors = []
	lookback = 0
	weight = np.array([1 for i in range(2, music_stft.shape[0] + 2)])
	weight = weight/np.max(weight)
	for i in range(lookback+1, len(beats)):
		block = music_stft[:, beats[i-(lookback+1)]:beats[i]]
		
		H = np.random.rand(K, block.shape[1])
		W[:, 0:K] = comps
		
		params = {'W': W, 'H': H, 'update_W': False}
		# note: update_W is not a keyword of stock scikit-learn NMF; this call
		# relies on a modified NMF implementation that can keep W fixed.
		comps_block = transformer.fit_transform(np.abs(block), **params)
		acts_block = transformer.components_

		#reconstruct the signal
		block_reconstruction = comps_block.dot(acts_block)
		
		block_reconstruction = block_reconstruction.T*weight
		block = block.T*weight
		distance = norm(block_reconstruction - np.abs(block))
		#errors.append(transformer.reconstruction_err_)
		errors.append(distance)
	return errors
def extract_template(comps, music_stft):
	K = comps.shape[1]
	
	#initialize transformer (non-negative matrix factorization) with K components
	transformer = NMF(n_components = K, init = 'custom')
	
	#W and H are random at first
	W = np.random.rand(comps.shape[0], K)
	H = np.random.rand(K, music_stft.shape[1])
	
	#set W to be the template components you want to extract
	W[:, 0:K] = comps

	#don't let W get updated in the non-negative matrix factorization
	params = {'W': W, 'H': H, 'update_W': False}
	comps_music = transformer.fit_transform(np.abs(music_stft), **params)
	acts_music = transformer.components_
	
	#reconstruct the signal
	music_reconstruction = comps_music.dot(acts_music)

	#mask the input signal
	music_stft_max = np.maximum(music_reconstruction, np.abs(music_stft))
	mask = np.divide(music_reconstruction, music_stft_max)
	mask = np.nan_to_num(mask)
	
	#binary mask
	mask = np.round(mask)

	#template - extracted template, residual - everything that's leftover.
	template = np.multiply(music_stft, mask)
	residual = np.multiply(music_stft, 1 - mask)

	return template, residual
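A hypothetical end-to-end sketch of how find_template and extract_template might be driven (the file name and frame indices are placeholders; note that extract_template's update_W keyword relies on a modified NMF, not the stock scikit-learn class):

import librosa

y, sr = librosa.load('mix.wav', sr=None)          # placeholder audio file
music_stft = librosa.stft(y)

# Learn a small template dictionary from a hand-picked region of the mixture,
# then pull that template out of the full signal.
comps, acts = find_template(music_stft, sr, 2, 6, start=100, end=200)
template, residual = extract_template(comps, music_stft)

template_audio = librosa.istft(template)          # the extracted layer
residual_audio = librosa.istft(residual)          # everything left over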
def extract_reconstruction_errors(comps, music_stft, window_length, hop):
	K = comps.shape[1]
	#initialize transformer (non-negative matrix factorization) with K components
	transformer = NMF(n_components = K, init = 'custom')
	#W and H are random at first
	W = np.random.rand(comps.shape[0], K)
	start = 0
	errors = []

	while (start + window_length < music_stft.shape[1]):
		block = music_stft[:, start:start+window_length]
		
		H = np.random.rand(K, block.shape[1])
		W[:, 0:K] = comps
		
		params = {'W': W, 'H': H, 'update_W': False}
		comps_block = transformer.fit_transform(np.abs(block), **params)
		acts_block = transformer.components_
	
		#reconstruct the signal
		block_reconstruction = comps_block.dot(acts_block)
		errors.append(transformer.reconstruction_err_)

		start = start + hop
	return errors
def doNMF(datan,n_components=4):
    # from Mitsu
    #alternatively PCA ... might be faster
    nmf=NMF(n_components=n_components,init='nndsvd')
    data_decomp_all=nmf.fit_transform(datan)
    data_components_all=nmf.components_
    return data_decomp_all,data_components_all
def get_LDA(X, num_components=10, show_topics=True):
	""" Latent Dirichlet Allocation by NMF.
	21 Nov 2015, Keunwoo Choi

	LDA for a song-tag matrix. The motivation is same as get_LSI. 
	With NMF, it is easier to explain what each topic represent - by inspecting 'H' matrix,
	where X ~= X' = W*H as a result of NMF. 
	It is also good to have non-negative elements, straight-forward for both W and H.

	"""

	from sklearn.decomposition import NMF
	if X is None:
		print 'X is omitted, so just assume it is the mood tag mtx w audio.'
		X = np.load(PATH_DATA + FILE_DICT["mood_tags_matrix"]) #np matrix, 9320-by-100

	nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt.
	W = nmf.fit_transform(X)
	H = nmf.components_
	print '='*60
	print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1]))

	term_rankings = []
	moodnames = cP.load(open(PATH_DATA + FILE_DICT["moodnames"], 'r')) #list, 100
	for topic_index in range( H.shape[0] ):
		top_indices = np.argsort( H[topic_index,:] )[::-1][0:10]
		term_ranking = [moodnames[i] for i in top_indices]
		term_rankings.append(term_ranking)
		if show_topics:	
			print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) )
	print '='*60
	cP.dump(term_rankings, open(PATH_DATA + (FILE_DICT["mood_topics_strings"] % num_components), 'w'))
	return W / np.max(W) # return normalised matrix, [0, 1]
Example #27
    def _make_test_matrix(self, matrix, test_decomp='svd'):
        '''
        Input: a matrix
        Output: a recomposed estimated ratings matrix

        Decomposes input matrix according to decomposition type
        and then makes an estimated ratings matrix
        '''
        if test_decomp == 'svd':
            _, s1, V = svd(matrix)
            how = self.s_option
            how = self.test_how
            #print "s1", s1
            #print "how", how
            s = self._get_s(s1, how)
            #print s
            #print V
            #print self.matrix_1.U
            return np.dot(self.matrix_1.U, np.dot(s, V))
        elif test_decomp == 'nmf':
            model = NMF()
            # note: fit_transform actually returns the W factor and
            # components_ is H; the variable names here are swapped
            # relative to the usual X ~ W*H convention.
            H = model.fit_transform(matrix)
            print(H)
            W = model.components_
            return np.dot(self.matrix_1.H, W)
        else:
            pass

    def _fit_local(self, data):

        from sklearn.decomposition import NMF

        nmf = NMF(n_components=self.k, tol=self.tol, max_iter=self.max_iter, random_state=self.seed)
        w = nmf.fit_transform(data)

        return w, nmf.components_,
def nmf(matriztfxidf):
	
	nmf = NMF(n_components = 50, init='random', random_state=0)
	matrizReduzida = nmf.fit_transform(matriztfxidf)			# w
	#h = nmf.components_										# h
	#resultado = np.dot(matrizReduzida, h)						# w.h -> approximately reconstructs the original matrix

	return matrizReduzida
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='nndsvd', random_state=0)
        ft = m.fit_transform(A)
        t = m.transform(A)
        assert_array_almost_equal(ft, t, decimal=2)
Example #31
class Archetypes:
    '''
    Archetypes: Performs NMF of order n on X and stores the result as attributes. 
    Archetypes are normalized: cosine similarity a(i) @ a(i) = 1. 
    Attributes:
        my_archetypes.n         - order / number of archetypes
        my_archetypes.X         - input matrix
        
        my_archetypes.model     - NMF model 
        my_archetypes.w         - NMF w-matrix 
        my_archetypes.h         - NMF h-matrix
        
        my_archetypes.o         - occupations x archetypes matrix (from w-matrix)
        my_archetypes.on        - occupations x normalized archetypes matrix (from w-matrix) - SOCP number as index. 
        my_archetypes.occ       - occupations x normalized archetypes matrix - Occupation names as index
        
        my_archetypes.f         - features x archetypes matrix (from h-matrix)
        my_archetypes.fn        - features x normalized archetypes matrix
        
    '''
    def __init__(self,X,n,norm = norm_dot):
        self.n = n
        self.X = X
        self.model = NMF(n_components=n, init='random', random_state=0, max_iter = 1000, tol = 0.0000001)
        self.w = self.model.fit_transform(self.X)
        self.o = pd.DataFrame(self.w,index=self.X.index)
        self.on = self.o.T.apply(norm).T
        self.occ = self.on.copy()
        self.occ['Occupations'] = self.occ.index
#        self.occ['Occupations'] = self.occ['Occupations'].apply(onet_socp_name)
        self.occ = self.occ.set_index('Occupations')
        self.h = self.model.components_
        self.f = pd.DataFrame(self.h,columns=X.columns)
        self.fn =self.f.T.apply(norm).T
        self.plot_occupations_dic ={}
        self.plot_features_dic ={}

        
    def plot_features(self,fig_scale = (1,3.5),metric='cosine', method = 'single',vertical = False): 
        '''
        Plot Archetypes as x and features as y. 
        Utilizes Seaborn Clustermap, with hierarchical clustering along both axes. 
        This clusters features and archetypes in a way that visualizes similarities and differences
        between the archetypes. 
        
        Archetypes are normalized (cosine-similarity): dot product archetype[i] @ archetype[i] = 1.
        The plot shows intensities (= squared feature coefficients) so that the sum of intensities = 1.  

        fig_scale: default values (x/1, y/3.5) scales the axes so that all feature labels are included in the plot.
        
        For other hyperparameters, see seaborn.clustermap
     
        '''
        param = (fig_scale,metric,method,vertical)
        if param in self.plot_features_dic.keys():
            fig = self.plot_features_dic[param]
            return fig.fig

        df = np.square(self.fn)

        if vertical:
            fig = sns.clustermap(df.T,robust = True, z_score=1,figsize=(
                self.n/fig_scale[0],self.X.shape[1]/fig_scale[1]),method = method,metric = metric)        
        else: # horizontal
            fig = sns.clustermap(df,robust = True, z_score=0,figsize=(
                self.X.shape[1]/fig_scale[1],self.n/fig_scale[0]),method = method,metric = metric)        
        self.plot_features_dic[param] = fig   # cache so repeated calls reuse the figure
        return fig.fig


    def plot_occupations(self,fig_scale = (1,3.5),metric='cosine', method = 'single',vertical = False):
        '''
        Plot Archetypes as x and occupations as y. 
        Utilizes Seaborn Clustermap, with hierarchical clustering along both axes. 
        This clusters occupations and archetypes in a way that visualizes similarities and differences
        between the archetypes. 
        
        Occupations are normalized (cosine-similarity): dot product occupation[i] @ occupation[i] = 1.
        The plot shows intensities (= squared feature coefficients) so that the sum of intensities = 1.  

        fig_scale: default values (x/1, y/3.5) scales the axes so that all feature labels are included in the plot.
        
        For other hyperparameters, see seaborn.clustermap
     
        '''
        param = (fig_scale,metric,method,vertical)
        if param in self.plot_occupations_dic.keys():
            fig = self.plot_occupations_dic[param]
            #return
            return fig.fig

        df = np.square(self.occ)
        if vertical:
            fig = sns.clustermap(df, figsize=(
                self.n/fig_scale[0],self.X.shape[0]/fig_scale[1]),method = method,metric = metric)
        else: # horizontal
            fig = sns.clustermap(df.T, figsize=(
                self.X.shape[0]/fig_scale[1],self.n/fig_scale[0]),method = method,metric = metric)
        self.plot_occupations_dic[param] = fig
        #return
        return fig.fig
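A hypothetical usage sketch for the Archetypes class (X is assumed to be a non-negative pandas DataFrame of occupations by features, and norm_dot the module's normalization helper used as the default):

import numpy as np
import pandas as pd

# Toy non-negative occupations x features frame; names are placeholders.
X = pd.DataFrame(np.random.rand(20, 8),
                 index=['occ_%d' % i for i in range(20)],
                 columns=['feat_%d' % j for j in range(8)])

arch = Archetypes(X, n=3)        # uses the default norm=norm_dot
print(arch.occ.head())           # occupations x normalized archetypes
print(arch.fn)                   # features x normalized archetypes
arch.plot_features()             # seaborn clustermap of squared loadings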
for comp in W_zero:
    comp[pitch-pitch_min_number] = 1.0
    p = pitch + 12
    while p < W_zero.shape[1] - 2:
        for epsilon in range(-2, 2):
            comp[p - pitch_min + epsilon] = 1.0
        p += 12
        
H_zero = np.random.rand(V.shape[0], pitch_max - pitch_min_number)

print(V.shape)

from sklearn.decomposition import NMF

model = NMF(init='custom', n_components=pitch_max-pitch_min_number)
comps = model.fit_transform(V, W=H_zero, H=W_zero)
acts = model.components_

#from librosa.decompose import decompose

#comps, acts = decompose(V, n_components=n_components, sort=True)

# visualisation matters
import matplotlib.pyplot as plt
from librosa.display import specshow
import matplotlib.gridspec as gridspec

plt.close('all')

plt.subplot2grid((4, 2), (0,0), colspan=2)
specshow(midi_mat, sr=sr, x_axis='time', y_axis='cqt_note')
Example #33
def extract_components(mov_tot,
                       n_components: int = 6,
                       normalize_std: bool = True,
                       max_iter_DL=-30,
                       method_factorization: str = 'nmf',
                       **kwargs) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    From optical flow images can extract spatial and temporal components

    Args:
        mov_tot: ndarray (can be 3 or 4D)
            contains the optical flow values, either in cartesian or polar, either one (3D) or both (4D coordinates)
            the input is generated by the compute_optical_flow function

        n_components: int
            number of components to look for

        normalize_std: bool
            whether to normalize each of the optical flow components

        normalize_output_traces: boolean
            whether to normalize the behavioral traces so that they match the units in the movie

    Returns:
        spatial_filter: ndarray
            set of spatial inferred filters

        time_trace: ndarray
            set of time components

        norm_fact: ndarray
            used normalization factors
    """

    if mov_tot.ndim == 4:
        if normalize_std:
            norm_fact = np.nanstd(mov_tot, axis=(1, 2, 3))
            mov_tot = old_div(mov_tot, norm_fact[:, np.newaxis, np.newaxis,
                                                 np.newaxis])
        else:
            norm_fact = np.array([1., 1.])
        c, T, d1, d2 = np.shape(mov_tot)

    else:
        norm_fact = 1
        T, d1, d2 = np.shape(mov_tot)
        c = 1

    tt = time.time()
    newm = np.reshape(mov_tot, (c * T, d1 * d2))

    if method_factorization == 'nmf':
        nmf = NMF(n_components=n_components, **kwargs)

        time_trace = nmf.fit_transform(newm)
        spatial_filter = nmf.components_
        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter
        ],
                                        axis=0)

    elif method_factorization == 'dict_learn':
        import spams
        newm = np.asfortranarray(newm, dtype=np.float32)
        time_trace = spams.trainDL(newm,
                                   K=n_components,
                                   mode=0,
                                   lambda1=1,
                                   posAlpha=True,
                                   iter=max_iter_DL)

        spatial_filter = spams.lasso(newm,
                                     D=time_trace,
                                     return_reg_path=False,
                                     lambda1=0.01,
                                     mode=spams.spams_wrap.PENALTY,
                                     pos=True)

        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :]
            for sp in spatial_filter.toarray()
        ],
                                        axis=0)

    time_trace = [np.reshape(ttr, (c, T)).T for ttr in time_trace.T]

    el_t = time.time() - tt
    print(el_t)
    return spatial_filter, time_trace, norm_fact
Example #34
def find_model(dataset,
               train_size,
               problem,
               label="",
               datatype="numerical",
               dim_reduction=False,
               components="auto",
               contains_negative=True,
               ensembling=True,
               priority="accuracy"):
    if datatype != "nominal":
        # Label encode data to ensure everything is numeric
        print("Label encoding. . .")
        dataset = dataset.apply(LabelEncoder().fit_transform)

    # Split the dataset into features X and target y (assumes `label` names the
    # target column of a pandas DataFrame; X and y are used below but were not
    # defined in the original snippet).
    y = dataset[label]
    X = dataset.drop(columns=[label])

    # Reduce dimensionality of dataset
    if dim_reduction:
        print("Performing dimensionality reduction. . .")
        print("Features' shape before reduction is", X.shape)
        if contains_negative:  # If dataset contains negative values, use principal component analysis
            if components == "auto":
                print(
                    "Using default number of components for principal component analysis. . ."
                )
                pca = PCA(n_components=2)
            else:
                print("Using", components,
                      "components for principal component analysis. . .")
                pca = PCA(n_components=components)
            X = pca.fit_transform(X)
        else:  # Otherwise, use non-negative matrix factorization
            if components == "auto":
                print(
                    "Using default number of components for non-negative matrix factorization. . ."
                )
                nmf = NMF(n_components=2)
            else:
                print("Using", components,
                      "components for non-negative matrix factorization. . .")
                nmf = NMF(n_components=components)
            X = nmf.fit_transform(X)
        print("Features' shape after reduction is", X.shape)

    if problem != "clustering":
        # Split X and y into training and testing datasets
        print("Splitting datasets for training and testing. . .")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size)
        # Scale variables to standardize values
        print("Standardizing values. . .")
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
    else:
        # Scale variables to standardize values
        print("Standardizing values. . .")
        sc = StandardScaler()
        X = sc.fit_transform(X)

    if priority == "accuracy":
        if problem == "classification":
            find_classification_model(X_train,
                                      X_test,
                                      y_train,
                                      y_test,
                                      priority="accuracy",
                                      ensembling=ensembling,
                                      datatype=datatype)
        elif problem == "regression":
            find_regression_model(X_train,
                                  X_test,
                                  y_train,
                                  y_test,
                                  ensembling=ensembling)
        elif problem == "clustering":
            find_clustering_model(X)

    if priority == "time":
        if problem == "classification":
            find_classification_model(X_train,
                                      X_test,
                                      y_train,
                                      y_test,
                                      priority="time",
                                      ensembling=ensembling,
                                      datatype=datatype)
        elif problem == "regression":
            find_regression_model(X_train,
                                  X_test,
                                  y_train,
                                  y_test,
                                  ensembling=ensembling)
        elif problem == "clustering":
            find_clustering_model(X)
Example #35
class MFRecommender(BaseRecommender):
    """Matrix factorization recommender

    Uses Matrix Factorization to determine which pipeline to recommend.

    Args:
        n_components (int): Corresponds to the number of features to keep in matrix decomposition.
            Must be greater than the number of rows in matrix.
        r_minimum (int): The minimum number of past results this recommender needs in order to use
            Matrix Factorization for prediction. If not enough results are present during a
            ``predict``, a uniform recommender is used.
    """

    def __init__(self, dpp_matrix, n_components=100, r_minimum=5):
        super(MFRecommender, self).__init__(dpp_matrix)

        self.n_components = n_components
        self.r_minimum = r_minimum

        # Matrix Factorization model that reduces dimensionality from num pipelines space to
        # n_components space.
        self.mf_model = NMF(n_components=n_components, init='nndsvd')

        dpp_decomposed = self.mf_model.fit_transform(dpp_matrix)

        # Matrix of rankings for each row of dpp_matrix after matrix factorization has been applied.
        self.dpp_ranked = np.empty(dpp_decomposed.shape)
        for i in range(dpp_decomposed.shape[0]):
            rankings = stats.rankdata(
                dpp_decomposed[i, :],
                method='dense'
            )
            self.dpp_ranked[i, :] = rankings

        random_matching_index = np.random.randint(self.dpp_matrix.shape[0])

        # Row from dpp_matrix representing pipeline performances for the dataset that most closely
        # matches the new dataset D. Identified in fit.
        self.matching_dataset = self.dpp_matrix[random_matching_index, :]

    def fit(self, dpp_vector):
        """
        Finds the row of self.dpp_matrix that most closely corresponds to
        dpp_vector by means of Kendall tau distance.
        https://en.wikipedia.org/wiki/Kendall_tau_distance

        Args:
            dpp_vector (np.array): Array with shape (n_components, )
        """

        # decompose X and generate the rankings of the elements in the
        # decomposed matrix
        dpp_vector_decomposed = self.mf_model.transform(dpp_vector)
        dpp_vector_ranked = stats.rankdata(
            dpp_vector_decomposed,
            method='dense',
        )

        max_agreement_index = None
        max_agreement = -1  # min value of Kendall Tau agreement
        for i in range(self.dpp_ranked.shape[0]):
            # calculate agreement between current row and X
            agreement, _ = stats.kendalltau(
                dpp_vector_ranked,
                self.dpp_ranked[i, :],
            )
            if agreement > max_agreement:
                max_agreement_index = i
                max_agreement = agreement

        if max_agreement_index is None:
            max_agreement_index = np.random.randint(self.dpp_matrix.shape[0])
        # store the row with the highest agreement for prediction
        self.matching_dataset = self.dpp_matrix[max_agreement_index, :]

    def predict(self, indices):
        num_tried_candidates = len(np.where(self.dpp_vector != 0)[0])
        if num_tried_candidates < self.r_minimum:
            return UniformRecommender(self.dpp_matrix).predict(indices)
        matching_scores = np.array(
            [self.matching_dataset[each] for each in indices]
        )
        return stats.rankdata(matching_scores, method='dense')
test_encoded = encoder.predict(test_signal)
train_encoded = encoder.predict(aggregate_signal)


################
# build nmf model
################

alpha = 0.012
model = NMF(n_components = encoding_dim, init = 'random', max_iter=500, solver='cd')

#################
# train nmf model
#################
print('*'*5,'evaluate NMF','*'*5)
W = model.fit_transform(train_encoded)
train_error = disag_error(train_encoded,W,model.components_)
print('train error: ', train_error)

####################
# evaluate nmf model
####################
W_test = model.transform(test_encoded)
test_error = disag_error(test_encoded,W_test,model.components_)
print('Test error: ', test_error)

#################
# decoder outputs
#################

decoded_signal = decoder.predict(W.dot(model.components_))
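disag_error is not defined in the snippet above; a plausible stand-in consistent with how it is called (data, W, H) is the plain Frobenius reconstruction error:

import numpy as np

def disag_error(X, W, H):
    # Hypothetical stand-in for the project's error metric: Frobenius norm
    # of the NMF reconstruction residual X - W @ H.
    return np.linalg.norm(X - W.dot(H), 'fro')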
Example #37
def NMF_TFIDF():

	english_stemmer = Stemmer.Stemmer('en')
	class StemmedTfidfVectorizer(TfidfVectorizer):

		def build_analyzer(self):
			analyzer = super(TfidfVectorizer, self).build_analyzer()
			return lambda doc: english_stemmer.stemWords(analyzer(doc))

	cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

	print("Loading 20 newsgroups dataset for categories:")
	pprint(list(cats))

	newsgroups = fetch_20newsgroups(subset='all', categories = cats)
	
	print("%d documents" % len(newsgroups.data))
	print("%d categories" % len(newsgroups.target_names))

	print("Creating stemmed TFxIDF representation...")
	t0 = time()

	vect = StemmedTfidfVectorizer(stop_words='english')
	vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation

	print("Done in %fs" % (time() - t0))
	print("n_samples: %d, n_features: %d" % vectors.shape)

	workbook = xlsxwriter.Workbook('part3_NMF.xlsx')

	purityMetricsNames = ['Homogeneity', 'Completeness', 'V-measure', 'Adjusted Rand-Index', 'Adjusted Mutual Information Score']

	metric_list = {}

	for i in range(1,21):

		print("Implementing NMF on data...")
		nmf_ = NMF(n_components=i) # 
		nmf_data = nmf_.fit_transform(vectors)
		print("Done.")

		labels = newsgroups.target
		labels_2 = []

		# Changing the labels from 0-7 to 0-1 
		for mark in labels:
			if mark <= 3:
				labels_2.append(0)
			else:
				labels_2.append(1)

		k = 2

		km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)

		print("Clustering sparse data with %s" % km)
		t0 = time()
		km.fit(nmf_data)
		print("done in %0.3fs" % (time() - t0))

		print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_2, km.labels_))
		print("Completeness: %0.3f" % metrics.completeness_score(labels_2, km.labels_))
		print("V-measure: %0.3f" % metrics.v_measure_score(labels_2, km.labels_))
		print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels_2, km.labels_))
		print("Adjusted Mutual Information Score: %.3f" % metrics.adjusted_mutual_info_score(labels_2, km.labels_))
		print(metrics.confusion_matrix(labels_2, km.labels_))

		purityMetrics = [metrics.homogeneity_score(labels_2, km.labels_), metrics.completeness_score(labels_2, km.labels_),metrics.v_measure_score(labels_2, km.labels_),metrics.adjusted_rand_score(labels_2, km.labels_),metrics.adjusted_mutual_info_score(labels_2, km.labels_)]

		# Writing to .xlsx file (For Confusion Matrix)
		worksheet = workbook.add_worksheet()
		obs = zip(km.labels_,labels_2)

		row = 0
		col = 0

		worksheet.write(row,col,'Predictions')
		worksheet.write(row,col+1,'Actuals')
		worksheet.write(row,col+6,'Dimension')
		worksheet.write(row+1,col+6,i)

		metric_list = dict(zip(purityMetricsNames,purityMetrics))
		pprint(dict(metric_list))

		for key in metric_list.keys():
			row += 1
			worksheet.write(row,col+11,key)
			worksheet.write(row,col+12,metric_list[key])

		row = 0
		col = 0

		for pred, actual in (obs):
			row += 1
			worksheet.write(row,col, pred)
			worksheet.write(row,col+1,actual)

		row = 1

		for things in labels:
			worksheet.write(row,col+2,things)
			row += 1

	workbook.close()
Example #38
def NMF_2():

	english_stemmer = Stemmer.Stemmer('en')
	class StemmedTfidfVectorizer(TfidfVectorizer):

		def build_analyzer(self):
			analyzer = super(TfidfVectorizer, self).build_analyzer()
			return lambda doc: english_stemmer.stemWords(analyzer(doc))

	cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

	print("Loading 20 newsgroups dataset for categories:")
	pprint(list(cats))

	newsgroups = fetch_20newsgroups(subset='all', categories = cats)
	
	print("%d documents" % len(newsgroups.data))
	print("%d categories" % len(newsgroups.target_names))

	print("Creating stemmed TFxIDF representation...")
	t0 = time()

	vect = StemmedTfidfVectorizer(stop_words='english')
	vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation

	print("Done in %fs" % (time() - t0))
	print("n_samples: %d, n_features: %d" % vectors.shape)

	workbook = xlsxwriter.Workbook('partC_NMF.xlsx')

	print("Implementing NMF of dimension 2 on data...")

	nmf_ = NMF(n_components=2) # alpha value? l1 value?
	nmf_data = nmf_.fit_transform(vectors)

	print("Done.")

	print("Implementing non-linear transform on data...")

	offset = 0.001
	nmf_data_off=np.add(nmf_data,offset)
	log_nmfdata=np.log(nmf_data_off)

	print("Done.")

	labels = newsgroups.target
	labels_2 = []

	# Changing the labels from 0-7 to 0-1 
	for mark in labels:
		if mark <= 3:
			labels_2.append(0)
		else:
			labels_2.append(1)

	k = 2

	km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)

	print("Clustering sparse data with %s" % km)
	t0 = time()
	km.fit(nmf_data)
	km.fit(log_nmfdata)
	print("done in %0.3fs" % (time() - t0))

	# Transforming data back
	data2D = km.transform(nmf_data)
	data2D_logarithm =  km.transform(log_nmfdata)

	plt.figure(1)

	plt.subplot(221)
	print("Plotting labels of Kmeans algorithm using NMF")
	plt.title('NMF Dim 2 Kmeans Algorithm with NMF')
	plt.scatter(nmf_data[:,0], nmf_data[:,1], c=km.labels_)
	
	plt.subplot(222)
	print("Plotting ground truth")
	plt.title('True labels of data')
	plt.scatter(nmf_data[:,0], nmf_data[:,1], c=labels_2)

	plt.subplot(223)
	print("Plotting labels of Kmeans algorithm with nonlinear transform NMF")
	plt.title('NMF Dim 2 Kmeans Algorithm Nonlinear transform')
	plt.scatter(log_nmfdata[:,0], log_nmfdata[:,1], c=km.labels_)
	
	plt.subplot(224)
	print("Plotting ground truth with nonlinear transform")
	plt.title('Ground truth, nonlinear transform')
	plt.scatter(log_nmfdata[:,0], log_nmfdata[:,1], c=labels_2)


	plt.show()

	print ("Done.")
Example #39
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
         [5,3,0,0]
        ]

    R = np.array(R)

    N = len(R)
    M = len(R[0])
    K = 2

    P = np.random.rand(N,K)
    Q = np.random.rand(M,K)

    print('Simple matrix factorization')
    P, Q = matrix_factorization(R, P, Q, K)
    print(P)
    print(Q)
    print(P.dot(Q.T))

    print('Non-negative matrix factorization')
    model = NMF(n_components=2, init='random', random_state=0)
    W = model.fit_transform(R)
    H = model.components_
    print(W)
    print(H.T)
    print(W.dot(H))
Example #40
def _greedyROI(scan, num_components=200, neuron_size=(11, 11),
               num_background_components=1):
    """ Initialize components by searching for gaussian shaped, highly active squares,
    one by one, by moving a gaussian window over every pixel and
    taking the highest activation as the center of the next neuron.

    :param np.array scan: 3-dimensional scan (image_height, image_width, num_frames).
    :param int num_components: The desired number of components.
    :param (float, float) neuron_size: Expected size of the somas in pixels (y, x).
    :param int num_background_components: Number of components that model the background.
    """
    from scipy import ndimage

    # Get some params
    image_height, image_width, num_frames = scan.shape

    # Get the gaussian kernel
    gaussian_stddev = np.array(neuron_size) / 4 # entire neuron in four standard deviations
    gaussian_kernel = _gaussian2d(gaussian_stddev)

    # Create residual scan (scan minus background)
    residual_scan = scan - np.mean(scan, axis=(0, 1)) # image-wise brightness
    background = ndimage.gaussian_filter(np.mean(residual_scan, axis=-1), neuron_size)
    residual_scan -= np.expand_dims(background, -1)

    # Create components
    masks = np.zeros([image_height, image_width, num_components], dtype=np.float32)
    traces = np.zeros([num_components, num_frames], dtype=np.float32)
    mean_frame = np.mean(residual_scan, axis=-1)
    for i in range(num_components):

        # Get center of next component
        neuron_locations = ndimage.gaussian_filter(mean_frame, gaussian_stddev)
        y, x = np.unravel_index(np.argmax(neuron_locations), [image_height, image_width])

        # Compute initial trace (bit messy because of edges)
        half_kernel = np.fix(np.array(gaussian_kernel.shape) / 2).astype(np.int32)
        big_yslice = slice(max(y - half_kernel[0], 0), y + half_kernel[0] + 1)
        big_xslice = slice(max(x - half_kernel[1], 0), x + half_kernel[1] + 1)
        kernel_yslice = slice(max(0, half_kernel[0] - y),
                              None if image_height > y + half_kernel[0] else image_height - y - half_kernel[0] - 1)
        kernel_xslice = slice(max(0, half_kernel[1] - x),
                              None if image_width > x + half_kernel[1] else image_width - x - half_kernel[1] - 1)
        cropped_kernel = gaussian_kernel[kernel_yslice, kernel_xslice]
        trace = np.average(residual_scan[big_yslice, big_xslice].reshape(-1, num_frames),
                           weights=cropped_kernel.ravel(), axis=0)

        # Get mask and trace using 1-rank NMF
        half_neuron = np.fix(np.array(neuron_size) / 2).astype(np.int32)
        yslice = slice(max(y - half_neuron[0], 0), y + half_neuron[0] + 1)
        xslice = slice(max(x - half_neuron[1], 0), x + half_neuron[1] + 1)
        mask, trace = _rank1_NMF(residual_scan[yslice, xslice], trace)

        # Update residual scan
        neuron_activity = np.expand_dims(mask, -1) * trace
        residual_scan[yslice, xslice] -= neuron_activity
        mean_frame[yslice, xslice] = np.mean(residual_scan[yslice, xslice], axis=-1)

        # Store results
        masks[yslice, xslice, i] = mask
        traces[i] = trace

    # Create background components
    residual_scan += np.mean(scan, axis=(0, 1)) # add back overall brightness
    residual_scan += np.expand_dims(background, -1) # and background
    if num_background_components == 1:
        background_masks = np.expand_dims(np.mean(residual_scan, axis=-1), axis=-1)
        background_traces = np.expand_dims(np.mean(residual_scan, axis=(0, 1)), axis=0)
    else:
        from sklearn.decomposition import NMF
        print("Warning: Fitting more than one background component uses scikit-learn's "
              "NMF and may take some time.")
        model = NMF(num_background_components, random_state=123, verbose=True)

        flat_masks = model.fit_transform(residual_scan.reshape(-1, num_frames))
        background_masks = flat_masks.reshape([image_height, image_width, -1])
        background_traces = model.components_

    return masks, traces, background_masks, background_traces
Example #41
# IPython log file

from __future__ import division
import numpy as np
from sklearn.decomposition import NMF
nmf = NMF()
from sklearn.datasets import load_iris
iris = load_iris()
NMF(iris.data)
nmf.fit_transform(iris.data)
nmf = NMF(n_components=2)
fits = nmf.fit_transform(iris.data)
fits
len(fits)
fits[:5]
nmf.reconstruction_err_
exit()
def nmf_on_data(vectorizer, uk_transcipts, topics, num_top_words):
    trans_vectorized = vectorizer.fit_transform(uk_transcipts)
    nmf_model = NMF(topics)
    doc_topic = nmf_model.fit_transform(trans_vectorized)
    print('explained variance: ', get_score(nmf_model, trans_vectorized.toarray()))
    return doc_topic, display_topics(nmf_model, vectorizer.get_feature_names(), num_top_words)
Example #43
def calculate_topics(features, n_topics):
    # http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf.html
    nmf = NMF(n_components=n_topics)
    return nmf, nmf.fit_transform(features)
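The scikit-learn example linked in the comment pairs the factorization with a display of the top words per topic; a minimal sketch of that step (assuming feature_names comes from the vectorizer that produced features):

def print_top_words(nmf, feature_names, n_top_words=10):
    # Show the highest-weighted words of each NMF component (topic).
    for topic_idx, topic in enumerate(nmf.components_):
        top = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic %d: %s" % (topic_idx, ", ".join(top)))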
Example #44
# LSA
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#example-text-document-clustering-py
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
svd = TruncatedSVD(dim)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
sklearn_tfidf_train_svd = lsa.fit_transform(sklearn_tfidf_train)
sklearn_tfidf_test_svd = lsa.transform(sklearn_tfidf_test)   # transform only: reuse the projection fitted on train
sklearn_tf_train_svd = lsa.fit_transform(sklearn_tf_train)
sklearn_tf_test_svd = lsa.transform(sklearn_tf_test)

from sklearn.decomposition import NMF
nmfModel = NMF(n_components=dim, init='random', random_state=0)
sklearn_tfidf_train_nmf = nmfModel.fit_transform(sklearn_tfidf_train)
sklearn_tfidf_test_nmf = nmfModel.transform(sklearn_tfidf_test)   # transform only: reuse factors fitted on train
sklearn_tf_train_nmf = nmfModel.fit_transform(sklearn_tf_train)
sklearn_tf_test_nmf = nmfModel.transform(sklearn_tf_test)

#sklearn_tf_train_nmf = nmfModel.fit_transform(sklearn_tfidf_train)

#%% 4. Traing LDA Model and Vectoring
# Train model
#from gensim.models.ldamodel import LdaModel
#lda = LdaModel(corpus=train_corpus_tfidf, id2word=dictionary, num_topics=dim)#, alpha=alpha)
#
## Vectorizing
#trainTopicDistArr = lda.inference(train_corpus_tfidf)[0]
#testTopicDistArr = lda.inference(test_corpus_tfidf)[0]
def nmf(X, K):
    nmf = NMF(n_components=K)
    X_red = nmf.fit_transform(X)
    X_red = normalizer.fit_transform(X_red)
    return X_red
Example #46
def generate_clustering(loom,
                        layername,
                        clustering_depth=3,
                        starting_clustering_depth=0,
                        max_clusters='sqrt_rule',
                        mode='pca',
                        silhouette_threshold=0.1,
                        clusteringcachedir='clusteringcachedir/'):
    """

    Parameters
    ----------
    loom :
        
    clustering_depth :
        (Default value = 3)
    starting_clustering_depth :
        (Default value = 0)
    max_clusters :
        (Default value = 'sqrt_rule')
    layername :
        
    mode :
        (Default value = 'pca')
    silhouette_threshold :
        (Default value = 0.1)
    clusteringcachedir :
         (Default value = 'clusteringcachedir/')

    Returns
    -------

    
    """
    if type(clustering_depth) != int or clustering_depth < 1 or type(
            starting_clustering_depth) != int:
        raise Exception(
            "clustering_depth and starting_clustering_depth must be natural numbers."
        )
    if (starting_clustering_depth > 0) and (
            'ClusteringIteration{}'.format(starting_clustering_depth - 1)
            not in loom.ca.keys()):
        raise Exception(
            "starting_clustering_depth not yet computed; please run with a lower starting_clustering_depth, or 0"
        )
    if mode not in ['pca', 'nmf']:
        raise Exception("Currently only implemented for modes:  pca and nmf")

    from time import time
    from sklearn.decomposition import IncrementalPCA
    from tqdm import tqdm
    from panopticon.analysis import get_subclustering
    if mode == 'pca':
        from sklearn.decomposition import PCA
    elif mode == 'nmf':
        from sklearn.decomposition import NMF

    if starting_clustering_depth == 0:
        if mode == 'nmf':
            n_nmf_cols = loom.attrs['NumberNMFComponents']
            nmf_loadings = []
            for col in [
                    '{} NMF Loading Component {}'.format(layername, x)
                    for x in range(1, n_nmf_cols + 1)
            ]:
                nmf_loadings.append(loom.ca[col])
            X = np.vstack(nmf_loadings).T
        elif mode == 'pca':
            n_pca_cols = loom.attrs['NumberPrincipalComponents_{}'.format(
                layername)]
            pca_loadings = []
            for col in [
                    '{} PC {} Loading'.format(layername, x)
                    for x in range(1, n_pca_cols + 1)
            ]:
                pca_loadings.append(loom.ca[col])
            X = np.vstack(pca_loadings).T
        if max_clusters == 'sqrt_rule':
            clustering = get_subclustering(
                X,
                silhouette_threshold,
                max_clusters=int(np.floor(np.sqrt(X.shape[0]))),
                clusteringcachedir=clusteringcachedir
            )  # This shouldn't be hard-coded S Markson 9 June 2020
        else:
            clustering = get_subclustering(
                X,
                silhouette_threshold,
                max_clusters=max_clusters,
                clusteringcachedir=clusteringcachedir
            )  # This shouldn't be hard-coded S Markson 9 June 2020

        loom.ca['ClusteringIteration0'] = clustering
        starting_clustering_depth = 1

    for subi in range(starting_clustering_depth, clustering_depth):

        loom.ca['ClusteringIteration{}'.format(subi)] = ['U'] * len(
            loom.ca['ClusteringIteration{}'.format(subi - 1)])

        for cluster in set([
                x for x in loom.ca['ClusteringIteration{}'.format(subi - 1)]
                if x != 'U'
        ]):  #will need to fix
            mask = loom.ca['ClusteringIteration{}'.format(
                subi -
                1)] == cluster  #first mask, check for top level clustering
            #break
            start = time()
            data_c = loom[layername][:, mask]
            print("processing cluster", cluster, "; time to load: ",
                  time() - start, ", mask size: ", np.sum(mask))
            if mode == 'nmf':
                model = NMF(n_components=np.min([50, data_c.shape[1]]),
                            init='random',
                            random_state=0)
                X = model.fit_transform(data_c.T)
            elif mode == 'pca':

                data_c = data_c.T
                if data_c.shape[0] > 5000:
                    model = IncrementalPCA(n_components=10)
                    for chunk in tqdm(
                            np.array_split(data_c,
                                           data_c.shape[0] // 512,
                                           axis=0),
                            desc='partial fitting over chunks of masked data'):
                        model.partial_fit(chunk)
                    X = model.transform(data_c)
                    print("EV", model.explained_variance_)
                    print("EVR", model.explained_variance_ratio_)
                else:
                    model = PCA(n_components=np.min([10, data_c.shape[0]]),
                                random_state=0)

                    X = model.fit_transform(data_c)
                    print("EV", model.explained_variance_)
                    print("EVR", model.explained_variance_ratio_)

            if max_clusters == 'sqrt_rule':
                print("xshape", X.shape)
                nopath_clustering = get_subclustering(
                    X,
                    silhouette_threshold,
                    max_clusters=int(np.floor(np.sqrt(X.shape[0]))),
                    clusteringcachedir=clusteringcachedir
                )  # This shouldn't be hard-coded S Markson 9 June 2020
            else:
                nopath_clustering = get_subclustering(
                    X,
                    silhouette_threshold,
                    max_clusters=max_clusters,
                    clusteringcachedir=clusteringcachedir
                )  # This shouldn't be hard-coded S Markson 9 June 2020


#            nopath_clustering = get_subclustering(X, score_threshold=silhouette_threshold)  #Really shouldn't be hard-coded S Markson 9 June 2020
            fullpath_clustering = [
                '{}-{}'.format(cluster, x) for x in nopath_clustering
            ]
            loom.ca['ClusteringIteration{}'.format(
                subi)][mask] = fullpath_clustering
        loom.ca['ClusteringIteration{}'.format(subi)] = loom.ca[
            'ClusteringIteration{}'.format(
                subi)]  #This is to force the changes to save to disk
Exemple #47
0
# get sparse representation
X_sparse = sparse.csr_matrix((y_train, (X_train[:, 0], X_train[:, 1])),
                             shape=(len(portfolio_list), len(investment_list)))
model = NMF(n_components=3,
            init='random',
            solver='cd',
            beta_loss='frobenius',
            max_iter=200,
            tol=0.0001,
            alpha=0,
            l1_ratio=0,
            random_state=0,
            verbose=0,
            shuffle=False)
W = model.fit_transform(X_sparse)
H = model.components_

# # Test model

import random

len(X_test)

# Since our test set also contains only positive examples, we want to add some zero-rating entries: we repeatedly draw a random (user, item) pair and, if that pair is not already present, add it with a rating of zero

# EXTRA TODO: this function is super slow! find a faster way so the following line can be uncommented
# l = int(len(X_test) * 1.5)
l = len(X_test) + 500
t1 = time.time()
while len(X_test) < l:
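The loop body is cut off in this excerpt. A minimal sketch of the idea, and of the faster set-based lookup the TODO above asks for, assuming X_test and y_test are plain Python lists of (user, item) index pairs and their ratings (the set-based approach and the y_test name are assumptions, not the original code):

# Sketch only: keep every known (user, item) pair in a set so the membership
# test is O(1) instead of a scan over the whole test set.
observed = set(map(tuple, X_train[:, :2])) | set(map(tuple, X_test))
while len(X_test) < l:
    user = random.randrange(len(portfolio_list))    # random user index
    item = random.randrange(len(investment_list))   # random item index
    if (user, item) not in observed:
        observed.add((user, item))
        X_test.append((user, item))                 # new negative example...
        y_test.append(0)                            # ...with rating zero
print('Added zero-rating pairs in', time.time() - t1, 'seconds')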
Exemple #48
0
def generate_nmf_and_loadings(loom,
                              layername,
                              nvargenes=2000,
                              n_components=100,
                              verbose=False):
    """

    Parameters
    ----------
    loom :
        
    layername :
        
    nvargenes :
        (Default value = 2000)
    n_components :
        (Default value = 100)
    verbose :
        (Default value = False)

    Returns
    -------

    
    """
    from sklearn.decomposition import NMF

    if 'GeneVar' not in loom.ra.keys():
        raise Exception(
            "Necessary to have already generated gene expression variances")
    vargenemask = loom.ra['GeneVar'] > np.sort(
        loom.ra['GeneVar'])[::-1][nvargenes]
    X = loom[layername][vargenemask, :]
    model = NMF(n_components=n_components,
                init='random',
                random_state=0,
                verbose=verbose)
    W = model.fit_transform(X)
    H = model.components_

    # record NMF basis
    nmf_factors = []
    counter = 0
    for isvargene in vargenemask:
        if isvargene:
            nmf_factors.append(W[counter, :])
            counter += 1
        else:
            nmf_factors.append(np.zeros(W.shape[1]))
    nmf_factors = np.vstack(nmf_factors)
    factor_sums = []
    for i in range(nmf_factors.shape[1]):
        loom.ra['{} NMF Component {}'.format(
            layername, i + 1)] = nmf_factors[:, i] / np.sum(nmf_factors[:, i])
        factor_sums.append(np.sum(nmf_factors[:, i]))
    factor_sums = np.array(factor_sums)
    # record NMF loadings
    for i in range(H.shape[0]):
        loom.ca['{} NMF Loading Component {}'.format(
            layername, i + 1)] = H[i, :] * factor_sums[i]

    loom.attrs['NumberNMFComponents'] = n_components
Exemple #49
0
def display_topics(model, feature_names, no_top_words, topic_names=None):
    topic_list = []
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            topics = "Topic: " + str(ix)
        else:
            topics = "Topic: '" + str(topic_names[ix]) + "'"
        terms = ", ".join(
            [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        topic_list.append((topics, terms))
    return topic_list


nmf_model = NMF(n_components=n_components, random_state=42)

start = time.time()
doc_topic = nmf_model.fit_transform(tlj_tfidf)
end = time.time()

nmf_topics = display_topics(nmf_model, tfidf.get_feature_names(), 20)
pprint(nmf_topics)
print("Model Execution Time:", end - start)

pickle.dump(
    nmf_topics,
    open('../../data/pickles/nmf/nmf_topics_' + str(n_components) + '.pkl',
         'wb'))
pickle.dump(
    nmf_model,
    open('../../data/pickles/nmf/nmf_model_' + str(n_components) + '.pkl',
         'wb'))
class MyIndividual(_Individual):
    element_class = _Chromosome

    def get_neighbour(self):
        # select a neighbour randomly
        cpy = self.clone(fitness=None)
        cpy.chromosomes = [
            chromosome.random_neighbour() for chromosome in self.chromosomes
        ]
        return cpy


from sklearn.decomposition import NMF
nmf = NMF(n_components=c)
W = nmf.fit_transform(evaluate.M)
H = nmf.components_
err = -evaluate(W, H.T)

i = MyIndividual.random(sizes=(c, ) * N + (p, ) * c)
j = i.clone()
data = i.get_history(stat={'Error': lambda i: -i.fitness}, n_iter=200)
yourdata = j.get_history(stat={'Error': lambda i: -i.fitness}, n_iter=200)

import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(200), yourdata['Error'], 'bo', np.arange(200), data['Error'],
        'r+', [0, 200], [err, err], 'k--')
ax.legend(('My Error', 'Your Error', 'EM Error'))
plt.show()
S = mglearn.datasets.make_signals()
plt.figure(figsize=(11, 2))
plt.plot(S, '-')
plt.xlabel("Time")
plt.ylabel("Signal")
plt.tight_layout()
plt.show()

A = np.random.RandomState(0).uniform(size=(100, 3))
X = np.dot(S, A.T)
print("Shape of measurements: {}".format(X.shape))


nmf = NMF(n_components=3, random_state=42)
S_ = nmf.fit_transform(X)
print("Recovered signal shape: {}".format(S_.shape))

pca = PCA(n_components=3)
H = pca.fit_transform(X)

models = [X, S, S_, H]
names = ['Observations (first three measurements)',
         'True sources',
         'NMF recovered signals',
         'PCA recovered signals']

fig, axes = plt.subplots(4, figsize=(12, 6), gridspec_kw={'hspace': .5},
                         subplot_kw={'xticks': (), 'yticks': ()})

for model, name, ax in zip(models, names, axes):
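    # Completion sketch (the original loop body is truncated in this excerpt):
    # give each panel its title and plot the first three columns of each matrix.
    ax.set_title(name)
    ax.plot(model[:, :3], '-')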
Exemple #52
0
        print()


def get_top_words_by_loadings(H, features):
    return features[np.argsort(np.sum(H, axis=0))[::-1][:30]]


if __name__ == "__main__":
    # Load data
    df_ml = pd.read_csv(ML_ONLY_FILEPATH, encoding='utf-8')

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_ml = tfidf_vectorizer.fit_transform(df_ml['description'])
    features = np.array(tfidf_vectorizer.get_feature_names())
    nmf_model = NMF(n_components=10, random_state=42)
    W = nmf_model.fit_transform(tfidf_ml)
    H = nmf_model.components_

    # For this data and random_state, the model returns roughly these topics:
    hand_labeled_features = [
        'machine learning / time series',
        'optimization',
        'neural networks / deep learning',
        'reinforcement learning',
        'bayesian',
        'graphs / graph ML',
        'Generative adversarial networks',
        'image classification',
        'clustering',
        'optimal solutions'
    ]
Exemple #53
0
                                               int(len(R) / 5)].fillna(0)
R[3 * int(len(R) / 5):4 * int(len(R) / 5)] = R[3 * int(len(R) / 5):4 *
                                               int(len(R) / 5)].fillna(0)
R[4 * int(len(R) / 5):] = R[4 * int(len(R) / 5):].fillna(0)

# Delete dataframes to free memory
del ratings, movies, tags, titles, flipped, duplicates, ratings_final

# Progress indicator
print('Calculating collaborative filter model. Time passed in seconds: ',
      time.perf_counter() - start_time)

# NMF for collaborative filtering
# train model
collab_model = NMF(n_components=60, init="nndsvd")
transformed_R = collab_model.fit_transform(R)
collab_matrix = pd.DataFrame(transformed_R, index=R.index)

# Export collab_matrix
pickle.dump(collab_matrix, open('binaries/collab_matrix', 'wb'))
del R, collab_model, transformed_R, collab_matrix

# Progress indicator
print('Calculating content-based filter model. Time passed in seconds: ',
      time.perf_counter() - start_time)

# Tfidf vectorization of the tag strings for content-based filtering
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(tags_final['tags'])

# Reduce dimensionality with SVD
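The SVD step itself is cut off in this excerpt; a minimal sketch of how it could look with sklearn's TruncatedSVD (the component count and the content_matrix name are assumptions, not from the original code):

from sklearn.decomposition import TruncatedSVD

# Sketch only: reduce the tf-idf matrix to a dense low-rank representation
svd = TruncatedSVD(n_components=200, random_state=0)  # n_components is an arbitrary choice
content_matrix = pd.DataFrame(svd.fit_transform(tfidf_matrix),
                              index=tags_final.index)  # hypothetical variable name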
'''

path = "data/markets_new.csv"
weights_paths = ["results_1/all_weights_5.npy"]


df = pd.read_csv(path)

# Drop rows with at least one missing value
df_cut_nan = df.dropna()
df_cut_nan = df_cut_nan.drop("Date", axis=1)
scaler = MinMaxScaler()
df_cut_nan_min_max = scaler.fit_transform(df_cut_nan)

nmf = NMF(n_components=3, l1_ratio=1)
transformed = nmf.fit_transform(df_cut_nan_min_max)
components = nmf.components_
assign_trend = np.argmax(components, axis=0)

for weights_path in weights_paths:
    all_weights = np.load(weights_path)
    all_trends = []
    for weights in all_weights:
        trends = [0, 0, 0]
        for i, weight in enumerate(weights):
            trend = assign_trend[i]
            trends[trend] += weight
        all_trends.append(trends)

    df = pd.DataFrame(all_trends)
    df.columns = ["Trend 1", "Trend 2", "Trend 3"]
Exemple #55
0
def main(test):
    if test == 'True':
        nrows = test_rows
        sys.stdout.flush()
    else:
        nrows = None

    file_name = PROJECT_DIR + '/data/processed/train.csv'
    extract_q1 = ColumnExtractor(file_name, Q_WORD_TOKENIZED[0], nrows=nrows)
    extract_q2 = ColumnExtractor(file_name, Q_WORD_TOKENIZED[1], nrows=nrows)
    q_stacker = ColumnStacker()
    pipeline = FeatureUnion([('extract_q1', extract_q1),
                             ('extract_q2', extract_q2)],
                            n_jobs=2)

    pipeline = Pipeline([('question_extractor', pipeline),
                         ('q_stacker', q_stacker)])

    D_q1 = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                       index_col='id',
                       usecols=['id', Q_WORD_TOKENIZED[0]],
                       nrows=nrows)
    nrows = len(D_q1)
    q1 = D_q1.loc[:, Q_WORD_TOKENIZED[0]].apply(
        lambda l: ' '.join(literal_eval(l)))
    del D_q1

    D_q2 = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                       index_col='id',
                       usecols=['id', Q_WORD_TOKENIZED[1]],
                       nrows=nrows)
    q2 = D_q2.loc[:, Q_WORD_TOKENIZED[1]].apply(
        lambda l: ' '.join(literal_eval(l)))
    del D_q2

    all_questions = pd.concat([q1, q2])  # Series.append was removed in pandas 2.0
    all_questions.index = range(len(all_questions))

    t = TfidfVectorizer(max_df=.95,
                        min_df=2,
                        stop_words='english',
                        max_features=max_tfidf_features,
                        ngram_range=(1, 2))

    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
    print('VECTORIZING...')
    tfidf = t.fit_transform(all_questions.values)
    print('Time: ', time.perf_counter() - t0)
    joblib.dump(t, PROJECT_DIR + '/models/' + 'tfidf.pkl')

    nmf_tfidf = NMF(n_components=n_components, init='nndsvda')
    print('NMF tfidf...')
    t0 = time.perf_counter()
    W = nmf_tfidf.fit_transform(tfidf)
    print('Time: ', time.perf_counter() - t0)
    joblib.dump(nmf_tfidf, PROJECT_DIR + '/models/' + 'nmf_tfidf.pkl')

    W = np.abs(W[:nrows, :] - W[nrows:, :])

    D = pd.read_csv(
        PROJECT_DIR + '/data/processed/train.csv',
        index_col='id',
        usecols=['id', MASI_DISTANCE, JACCARD_DISTANCE, EDIT_DISTANCE],
        dtype=np.float64,
        nrows=nrows)
    Dist = D.values
    del D

    D = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                    index_col='id',
                    usecols=['id'] + Q_TYPE1,
                    dtype='object',
                    nrows=nrows)
    D = D.loc[:, Q_TYPE1].applymap(literal_eval)
    T = np.hstack(
        (np.vstack(D.loc[:, Q_TYPE1[0]]), np.vstack(D.loc[:, Q_TYPE1[1]])))
    #    nmf_T = NMF(n_components = 25, init = 'nndsvda')
    #    print('NMF T...')
    #    t0 = time.clock()
    #    T = nmf_T.fit_transform(T)
    #    print('Time: ', time.clock() - t0)
    #    joblib.dump(nmf_T, PROJECT_DIR + '/models/' + 'nmf_T.pkl')
    del D

    X = np.hstack((W, Dist, T))

    D = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                    index_col='id',
                    usecols=['id', 'is_duplicate'],
                    dtype=np.float64,
                    nrows=nrows)
    y = D.values.ravel()

    param_dist = {
        'max_depth': range(15),
        'learning_rate': np.logspace(-3, 1, 50),
        'subsample': [0.3, 0.5, 0.7, 1.0],
        'n_estimators': [250, 400, 700, 1000, 1500, 2000],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': np.logspace(-2, 2, 50),
    }

    cv = RandomizedSearchCV(XGBClassifier(nthread=2),
                            param_dist,
                            scoring='neg_log_loss',
                            n_jobs=4)

    print('Fitting xgb')
    cv.fit(X, y)
    clf = cv.best_estimator_
    joblib.dump(clf, PROJECT_DIR + '/models/' + 'cv_xgb.pkl')
    return
    Yres = y_train - corrected_zero[0]

qx_init = np.random.normal(0, 1, size=[N, Q])

if PCA_INIT:

    ols = LinearRegression(fit_intercept=False)
    ols.fit(z_init, y_train)
    Yres = y_train - ols.predict(z_init)

    if SPARSE:
        pca = NMF(n_components=Q)
        #pca = PCA(n_components = Q)
    else:
        pca = PCA(n_components=Q)
    qx_init = pca.fit_transform(Yres)  #.tocsr())
    qx_init = 15. * qx_init / np.max(qx_init)
    #print(np.max(qx_init))
if args.gpy_init:
    m2 = GPy.models.GPLVM(Yres, Q)
    m2.optimize()
    qx_init = m2['latent_mean'][:]

# Initialize lengths with max distance across dimensions
len_var_init = np.abs(np.max(qx_init, axis=0) - np.mean(qx_init, axis=0))
len_con_init = np.ones(z_init.shape[1])
sig_init = 0.5

if args.gpy_init:
    len_var_init = m2['sum.rbf.lengthscale'] * np.ones(Q)
Exemple #57
0
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=5000,
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(joke_df.tokenized_joke)

nmf = NMF(n_components=100, random_state=1, alpha=.1, l1_ratio=.5)

nmf_matrix = nmf.fit_transform(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 20)

topic_descriptions = [[
    tfidf_feature_names[i] for i in topic.argsort()[:-25 - 1:-1]
] for topic in nmf.components_]

feature_nn = NearestNeighbors(n_neighbors=5).fit(nmf_matrix)

with open('fit_models/nmf.p', 'wb') as fp:
    pickle.dump((nmf_matrix, topic_descriptions, feature_nn), fp)
Exemple #58
0
class TopicModel:
    def __init__(self, topicCollection, string):
        if string.lower() == "nmf":
            self.model = "NMF"
            print("Topic Extraction Model: sklearn.NMF")
        else:
            self.model = "LDA"
            print("Topic Extraction Model: gensim.LDAModel")
        self.stemmer = PorterStemmer()

    #Train the topic model (NMF or LDA) on the current discussion
    def train(self, sentences):
        if self.model == "NMF":
            self.sentenceData = []
            for sentence in sentences:
                self.sentenceData.append(preprocess(sentence, self.stemmer))
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1500,
                ngram_range=(1, 2),
                preprocessor=' '.join,
                stop_words='english'
            )
            tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData)
            self.nmf = NMF(n_components=2, solver="mu")
            self.W = self.nmf.fit_transform(tfidf)
            self.H = self.nmf.components_
        else:
            sentenceData = []
            for sentence in sentences:
                sentenceData.append(preprocess(sentence, self.stemmer))
            self.dictionary = Dictionary(sentenceData)
            bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData]
            self.lda_model = LdaModel(bow_corpus, num_topics=2, id2word=self.dictionary, passes=10)

    #Classify a given sentence to one of the topics found in training
    def classify(self, sentence):
        if self.model == "NMF":
            index = self.sentenceData.index(preprocess(sentence, self.stemmer))
            topic = self.W.argmax(axis=1)[index]
            return "Topic " + str(topic)
        else:
            bow_vector = self.dictionary.doc2bow(preprocess(sentence, self.stemmer))
            return "Topic " + str(sorted(self.lda_model[bow_vector], key=lambda tup: -1*tup[1])[0][0])

    #Shows the terms of a given topic
    def showTerms(self, topic):
        if self.model == "NMF":
            terms = ""
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for term in top_features:
                terms += term + ", "
            print(topic.split(' ')[-1] + " " + terms)
            return terms
        else:
            terms = ""
            topic = int(topic.split(" ")[-1])
            for term in self.lda_model.show_topic(topic):
                terms += term[0] + ", "
            print(str(topic) + " " + terms)
            return terms

    #Gets the probability or the coefficient of the given term in the topic
    def getCoeff(self, topic, term):
        if self.model == "NMF":
            weights = []
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for coeff, terms in zip(weights, top_features):
                if terms == term:
                    return coeff
        else:
            topic = int(topic.split(" ")[-1])
            for terms in self.lda_model.show_topic(topic):
                if terms[0] == term:
                    return terms[1]

    #Shows all the topics found in training
    def showTopics(self):
        if self.model == "NMF":
            ret = []
            for topic_idx, topicID in enumerate(self.H):
                ret.append("Topic " + str(topic_idx))
            return ret
        else:
            topics = self.lda_model.print_topics()
            ret = []
            for topic in topics:
                ret.append("Topic " + str(topic[0]))
            return ret

    #Returns a flag to check what model is deployed at the moment
    def getModel(self):
        return self.model
    def test_custom_nmf(self):

        mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                        [0, 0, 1, 0]],
                       dtype=np.float64)
        mat[:mat.shape[1], :] += np.identity(mat.shape[1])

        mod = NMF(n_components=2, max_iter=2)
        W = mod.fit_transform(mat)
        H = mod.components_

        def predict(W, H, row_index, col_index):
            return np.dot(W[row_index, :], H[:, col_index])

        pred = mod.inverse_transform(W)

        exp = []
        got = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                exp.append((i, j, pred[i, j]))
                got.append((i, j, predict(W, H, i, j)))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
        assert max_diff <= 1e-5

        def nmf_to_onnx(W, H):
            """
            The function converts an NMF described by matrices
            *W*, *H* (*WH* approximates the training data *M*)
            into a function which takes two indices *(i, j)*
            and returns the prediction for them. It assumes
            these indices apply to the training data.
            """
            col = OnnxArrayFeatureExtractor(H, 'col')
            row = OnnxArrayFeatureExtractor(W.T, 'row')
            dot = OnnxMul(col, row, op_version=TARGET_OPSET)
            res = OnnxReduceSum(dot,
                                output_names="rec",
                                op_version=TARGET_OPSET)
            indices_type = np.array([0], dtype=np.int64)
            onx = res.to_onnx(inputs={
                'col': indices_type,
                'row': indices_type
            },
                              outputs=[('rec', FloatTensorType((None, 1)))])
            return onx

        model_onnx = nmf_to_onnx(W.astype(np.float32), H.astype(np.float32))
        sess = InferenceSession(model_onnx.SerializeToString())

        def predict_onnx(sess, row_indices, col_indices):
            res = sess.run(None, {'col': col_indices, 'row': row_indices})
            return res

        onnx_preds = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                row_indices = np.array([i], dtype=np.int64)
                col_indices = np.array([j], dtype=np.int64)
                pred = predict_onnx(sess, row_indices, col_indices)[0]
                onnx_preds.append((i, j, pred[0, 0]))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
        assert max_diff <= 1e-5
"""
Now use what you've learned about NMF to decompose the digits dataset. You are again given the digit images as a 2D array samples. This time, you are also provided with a function show_as_image() that displays the image encoded by any 1D array:

def show_as_image(sample):
    bitmap = sample.reshape((13, 8))
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    plt.show()

After you are done, take a moment to look through the plots and notice how NMF has expressed the digit as a sum of the components!
"""

# Import NMF
from sklearn.decomposition import NMF

# Create an NMF model: model
model = NMF(n_components = 7)

# Apply fit_transform to samples: features
features = model.fit_transform(samples)

# Call show_as_image on each component
for component in model.components_:
    show_as_image(component)

# Assign the 0th row of features: digit_features
digit_features = features[0, :]

# Print digit_features
print(digit_features)