Example #1
def nmf_old(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # fit followed by transform
    model = NMF(n_components=n_components, init=init,
                random_state=random_state)
    # TODO: refactor the four repeated fit/transform passes into a loop
    model.fit(mut_final)
    gene_comp = model.components_.copy()
    patient_strat = np.argmax(model.transform(mut_final), axis=1).copy()

    model.fit(mut_diff)
    gene_comp_diff = model.components_.copy()
    patient_strat_diff = np.argmax(
        model.transform(mut_diff), axis=1).copy()

    model.fit(mut_mean_qn)
    gene_comp_mean_qn = model.components_.copy()
    patient_strat_mean_qn = np.argmax(
        model.transform(mut_mean_qn), axis=1).copy()

    model.fit(mut_median_qn)
    gene_comp_median_qn = model.components_.copy()
    patient_strat_median_qn = np.argmax(
        model.transform(mut_median_qn), axis=1).copy()

    return (gene_comp, patient_strat,
            gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
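A minimal sketch of the loop refactor the TODO above asks for (the name nmf_all is hypothetical; same inputs and NMF import as the original):

import numpy as np
from sklearn.decomposition import NMF

def nmf_all(matrices, n_components, init='nndsvdar', random_state=0):
    """Fit one NMF per input matrix; return (components, hard labels) pairs."""
    results = []
    for X in matrices:
        model = NMF(n_components=n_components, init=init,
                    random_state=random_state)
        W = model.fit_transform(X)                 # fit and transform in one pass
        results.append((model.components_.copy(),  # gene components
                        np.argmax(W, axis=1)))     # per-patient cluster labels
    return results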
class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models
    
    '''
    def __init__(self,folder='model',modeltype='kpca',topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder,steps=['hashing','tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf',gamma=1.,n_components=topics)
        if self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self,X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''

        # transform list of strings into sparse BoW matrix
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].fit_transform(\
        #    self.bow['count_vectorizer'].fit_transform(X))

        # depending on the model, train
        if self.modeltype == 'kpca':
            Xc = self.model.fit_transform(X)
        if self.modeltype == 'nmf':
            Xc = self.model.fit_transform(X)


    def predict(self,X):
        '''
        predicts cluster assignment from list of strings
        
        INPUT
        X   list of strings
        '''
        if not isinstance(X, list): X = [X]
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].transform(\
        #    self.bow['count_vectorizer'].transform(X))
        
        if self.modeltype == 'kpca':
            return self.model.transform(X)
        if self.modeltype == 'nmf':
            return self.model.transform(X)
Example #3
def test_nmf_transform_custom_init():
    # Smoke test that checks if NMF.transform works with custom initialization
    A = np.abs(random_state.randn(6, 5))
    n_components = 4
    avg = np.sqrt(A.mean() / n_components)
    H_init = np.abs(avg * random_state.randn(n_components, 5))
    W_init = np.abs(avg * random_state.randn(6, n_components))

    m = NMF(solver="cd", n_components=n_components, init="custom", random_state=0)
    m.fit_transform(A, W=W_init, H=H_init)
    m.transform(A)
Example #4
class MatrixFactorization:
    def __init__(self):
        self.nmf = NMF()

    def fit(self, X):
        self.nmf.fit(X)
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)

    def predict(self, X):
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)
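A quick usage sketch for the wrapper above, on toy non-negative data (inputs invented for illustration, not from the original source):

import numpy as np

X = np.abs(np.random.RandomState(0).randn(20, 6))
mf = MatrixFactorization()
X_hat = mf.fit(X)            # low-rank reconstruction W.dot(H) of the input
print(X_hat.shape)           # (20, 6)
print(mf.predict(X).shape)   # (20, 6)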
    def applyNMF(self, number_of_clusters, country_specific_tweets):
        train, feature_names = self.extractFeatures(country_specific_tweets, False)
        name = "nmf"

        # Fit the NMF model
        if self.results:
            print("Fitting the NMF model", end=" - ")

        t0 = time()
        nmf = NMF(n_components=number_of_clusters, random_state=1, alpha=.1, l1_ratio=.5).fit(train)

        if self.results:
            print("done in %0.3fs." % (time() - t0))

        if self.results:
            print("\nNMF:")

        parameters = nmf.get_params()

        if self.results:
            print("Parameter: " + str(parameters))
        topics = nmf.components_
        doc_topic = nmf.transform(train)
        top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
        labels = numpy.asarray(labels)

        if self.results:
            print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))

        return name, parameters, top10, labels
def tfidf_nmf(release_texts, n_components=10, max_features=None):
    '''
        Creates and fits tfidf and NMF models.

        INPUT:
        - release_texts: iterable of document strings to vectorize
        - n_components: number of latent features for the NMF model to find
        - max_features: max number of features (vocabulary size) for the tfidf model to consider

        OUTPUT:
        - tfidf_vectorizer: tfidf model object
        - tfidf_sparse: tfidf sparse matrix
        - nmf: NMF model object
        - W: feature matrix output from NMF factorization into W and H matrices
    '''
    # tfidf model
    custom_stop_words = make_stop_words()
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words, max_features=max_features)
    tfidf_sparse = tfidf_vectorizer.fit_transform(release_texts)

    # normalize row-wise so each row sums to one
    tfidf_sparse = normalize(tfidf_sparse, axis=1, norm='l1')

    # nmf model
    nmf = NMF(n_components=n_components, random_state=1)
    nmf.fit(tfidf_sparse)
    W = nmf.transform(tfidf_sparse)
    return tfidf_vectorizer, tfidf_sparse, nmf, W
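A hedged usage sketch for tfidf_nmf above (toy corpus; assumes make_stop_words from the same module is available, so the exact vocabulary and topics depend on it):

docs = ["grant funds research", "city council budget", "research grant awarded"]
vectorizer, tfidf_sparse, nmf, W = tfidf_nmf(docs, n_components=2)
print(W.shape)  # one topic-weight row per document, e.g. (3, 2)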
def plot_nmf_illustration():
    rnd = np.random.RandomState(5)
    X_ = rnd.normal(size=(300, 2))
    # Add 8 to make sure every point lies in the positive part of the space
    X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8

    nmf = NMF(random_state=0)
    nmf.fit(X_blob)
    X_nmf = nmf.transform(X_blob)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, s=60, cmap='viridis')
    axes[0].set_xlabel("feature 1")
    axes[0].set_ylabel("feature 2")
    axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
                  head_width=.3, color='k')
    axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1], width=.1,
                  head_width=.3, color='k')
    axes[0].set_aspect('equal')
    axes[0].set_title("NMF with two components")

    # second plot
    nmf = NMF(random_state=0, n_components=1)
    nmf.fit(X_blob)

    axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[1].set_xlabel("feature 1")
    axes[1].set_ylabel("feature 2")
    axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
                  head_width=.3, color='k')

    axes[1].set_aspect('equal')
    axes[1].set_title("NMF with one component")
Example #8
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    m = NMF(n_components=4, init="nndsvd", random_state=0)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
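These tests check closeness rather than equality because fit_transform returns the W found during joint optimization, while transform re-solves for W with the learned H held fixed. A minimal sketch of that equivalence using only scikit-learn's public API (an illustration, not the library's internals):

import numpy as np
from sklearn.decomposition import NMF, non_negative_factorization

rng = np.random.RandomState(0)
A = np.abs(rng.randn(6, 5))
m = NMF(n_components=3, init='random', random_state=0).fit(A)

# transform solves for W with H fixed, i.e. update_H=False:
W, _, _ = non_negative_factorization(
    A, H=m.components_, n_components=3, update_H=False, random_state=0)
np.testing.assert_allclose(W, m.transform(A), atol=1e-2)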
Example #9
class NMFReducer():

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/NMFReduceDigitsOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def display_reduced_iris(self):
        sys.stdout = open('out/NMFReduceIrisOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def reduce_crossvalidation_set(self, X_train, X_test):
        # transform with the fitted reducer (the original called the scaler here)
        self.reducer.fit(X_train)
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
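A usage sketch for NMFReducer with scikit-learn's digits dataset (hypothetical inputs; any object with .data and .target works, assuming the snippet's own imports are in scope):

from sklearn.datasets import load_digits

digits = load_digits()
reducer = NMFReducer(digits, "digits", num_components=10)
print(reducer.reduce().shape)  # (1797, 10)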
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='nndsvd', random_state=0)
        ft = m.fit_transform(A)
        t = m.transform(A)
        assert_array_almost_equal(ft, t, decimal=2)
    def test(cls, csv, K=3, dr='PCA'):
        '''
        csv - A csv file without header.
        '''

        from sklearn.decomposition import PCA, NMF
        from sklearn.random_projection import GaussianRandomProjection
        from sklearn.manifold import MDS, TSNE
        from sklearn.cluster import KMeans
        from sklearn.preprocessing import OneHotEncoder

        X = pd.read_csv(csv, header=None).values
        Z = None
        Xr = None

        if (dr == 'PCA'):
            pca = PCA(n_components=K)  # keep the first K components
            pca.fit(X)
            Z = pca.transform(X)
            Xr = pca.inverse_transform(Z)
        elif (dr == 'NMF'):
            # make sure X is non-negative
            Xmin = np.min(X)
            if (Xmin < 0):
                X = X - Xmin

            nmf = NMF(n_components=K)  # keep the first K components
            nmf.fit(X)
            Z = nmf.transform(X)
            Xr = nmf.inverse_transform(Z)

            if (Xmin < 0):
                Xr = Xr + Xmin
        elif (dr == 'RP'):
            grp = GaussianRandomProjection(
                n_components=K)  # keep the first K components
            Z = grp.fit_transform(X)
        elif (dr == 'VQ'):
            kmeans = KMeans(n_clusters=K).fit(X)
            Xvq = kmeans.predict(X)
            H = kmeans.cluster_centers_
            ohe = OneHotEncoder()
            Z = ohe.fit_transform(Xvq.reshape(-1, 1)).A
            Xr = Z @ H
        elif (dr == 'MDS'):
            mds = MDS(n_components=K)  # keep the first K components
            Z = mds.fit_transform(X)
        elif (dr == 'TSNE'):
            tsne = TSNE(n_components=K)  # keep the first K components
            Z = tsne.fit_transform(X)
        elif (dr == 'IDENTITY'):
            # for this case, k is not used.
            Z = X
            Xr = X
        else:
            raise Exception("Invalid DR name")

        return cls(X, Z, Xr)
def MF(filename, K):

	data_matrix = np.loadtxt(str(filename), delimiter = " ");
	dimen = data_matrix.shape;
	num_row = int(dimen[0]);
	num_col = int(dimen[1]);
	
	f_matrix = data_matrix[0:num_row, 0:num_col-2];
	class_label = np.array(data_matrix[0:num_row, num_col-2:num_col-1]);
	label_list = list(map(int, class_label.T.tolist()[0]));

	year_infor = np.array(data_matrix[0:num_row, num_col-1:num_col]);
	year_list = list(map(int, year_infor.T.tolist()[0]));

	# do NMF on f_matrix
	model = NMF(n_components = K, random_state = None);
	model.fit(f_matrix);
	W_matrix = model.transform(f_matrix);
	
	transformed_matrix = np.hstack((W_matrix, class_label)).tolist();	

	# group the papers based on temporal information
	year_datalistlist_dict = {};
	for index in range(0, len(year_list)):
		if year_datalistlist_dict.get(int(year_list[index]), -1) == -1:
			year_datalistlist_dict[year_list[index]] = [transformed_matrix[index]];
		else:
			year_datalistlist_dict[year_list[index]].append(transformed_matrix[index]);
	
	dimen = K;

	sorted_year_list = sorted(year_datalistlist_dict.keys());
	train_set_list = [];
	train_label_list = [];
	test_set_list = [];
	test_label_list = [];
	
	test_year_list = sorted_year_list[len(sorted_year_list)-4:];
	# training and test partition based on the temporal information and put more recent papers into test set
	for k, v in year_datalistlist_dict.items():
		if int(k) not in test_year_list:
			# put all the corresponding paper into training set
			for train in range(0,len(v)):
				train_set_list.append(v[train][0:dimen]);
				train_label_list.append(int(v[train][-1]));		
		else:
			# put all the corresponding paper into test set
			for test in range(0,len(v)):
				test_set_list.append(v[test][0:dimen]);
				test_label_list.append(int(v[test][-1]));

	num_cluster_test = len(list(set(test_label_list)));

	return [train_set_list, train_label_list, test_set_list, test_label_list, num_cluster_test];
    def get_first_nmf_component(X):
        corr_matrix = np.dot(X.T, X) / (X.shape[0] - 1)

        nmf = NMF(n_components=1)
        nmf.fit(corr_matrix)
        print("\t\tReconstruction error: {:.2f}".format(
            nmf.reconstruction_err_))
        print("\t\tNumber of iterations: {}".format(nmf.n_iter_))
        return nmf.transform(corr_matrix)
Example #14
def test_nmf_transform(solver):
    # Test that NMF.transform returns close values
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(6, 5))
    m = NMF(solver=solver, n_components=3, init='random',
            random_state=0, tol=1e-5)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
Example #15
def NMF_train(X_train, X_test, n):
    nmf_model = NMF(n_components=n)
    # fit_transform fits the model and returns W in a single pass
    W = nmf_model.fit_transform(X_train)
    H = nmf_model.components_
    W_test = nmf_model.transform(X_test)

    return H, W, W_test
Example #16
def get_factorization(V, num_roles):
    """ Obtains a nonnegative matrix factorization of the matrix V with num_roles intermediate roles. """
    model = NMF(n_components=num_roles, init='random', random_state=0)
    model.fit(V)
    
    node_roles = model.transform(V)
    role_features = model.components_
    
    return np.matrix(node_roles), np.matrix(role_features)
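A toy call for get_factorization above (the matrix here is random; real inputs are node-by-feature matrices):

import numpy as np

V = np.abs(np.random.RandomState(0).rand(8, 4))
node_roles, role_features = get_factorization(V, num_roles=2)
print(node_roles.shape, role_features.shape)  # (8, 2) (2, 4)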
Example #17
def performNMF(M, fragmentsLookupTable, fragmentsCount):

    if (args.verbose):
        print >> sys.stdout, "- %s START   : calculating NMF" % (timeStamp())

    t0 = time()
    model = NMF(
        n_components=args.components,
        init='nndsvd',
        beta=10000.0,
        max_iter=1000,
        tol=5e-3,
        sparseness='components')
    model.fit(M)
    train_time = (time() - t0)
    components_ = model.components_

    # print >> sys.stdout, components_
    N = model.transform(M)

    if (args.verbose):
        print >> sys.stdout, "- %s FINISH  : calculating NMF" % (timeStamp())

    if (args.verbose):
        print >> sys.stdout, "- %s START   : mapping components" % (
            timeStamp())

    # convert components into locations
    for i in xrange(args.components):
        output = gzip.open(
            args.outdir + "/NMF_component_" + str(i) + ".txt.gz", 'wb')
        if (args.verbose):
            print >> sys.stdout, "-            : processing component %d" % (i)

        try:

            for j in xrange(model.components_[i].shape[0]):
                # print >> sys.stdout, model.components_[i]
                # print >> sys.stdout, "Max value %f" (numpy.max(model.components_[i]))
                # if (model.components_[i][j] != 0):
                fragment1 = j / fragmentsCount
                fragment2 = j % fragmentsCount

                (chr1, midpoint1) = fragmentsLookupTable[fragment1]
                (chr2, midpoint2) = fragmentsLookupTable[fragment2]
                output.write(
                    "%s\t%i\t%s\t%i\t%f\n" % (chr1, midpoint1, chr2, midpoint2,
                                              model.components_[i][j]))

        finally:
            output.close()

    if (args.verbose):
        print >> sys.stdout, "- %s FINISH  : mapping components" % (
            timeStamp())

    return (N, model)
Example #18
def test_nmf_transform(solver):
    # Test that NMF.transform returns close values
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(6, 5))
    m = NMF(solver=solver, n_components=3, init='random',
            random_state=0, tol=1e-5)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
Example #19
def nmf_model(corpus_trigrams, num_topics):
	processed_corpus_str =  [' '.join(word) for word in corpus_trigrams]
	tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=5, ngram_range = (1,3), stop_words='english')
	tfidf = tfidf_vectorizer.fit_transform(processed_corpus_str)
	tfidf_feature_names = tfidf_vectorizer.get_feature_names()
	nmf = NMF(n_components=num_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
	nmf_W = nmf.transform(tfidf)
	nmf_H = nmf.components_
	return (nmf_H, nmf_W, tfidf_feature_names)
Example #20
def get_factorization(V, num_roles):
    """ Obtains a nonnegative matrix factorization of the matrix V with num_roles intermediate roles. """
    model = NMF(n_components=num_roles, init='random', random_state=0)
    model.fit(V)

    node_roles = model.transform(V)
    role_features = model.components_

    return torch.from_numpy(node_roles), torch.from_numpy(role_features)
def reduce_dim_NMF(tfidf_train, tfidf_test, k):
    model = NMF(n_components=k, init='random')
    W_train = model.fit_transform(tfidf_train)
    H_train = model.components_
    tfidf_train_hat = W_train.dot(H_train)
    distance_train = np.linalg.norm(tfidf_train - tfidf_train_hat, 'fro')**2

    W_test = model.transform(tfidf_test)
    return W_train, W_test, distance_train
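The train-side error above is the squared Frobenius norm of the residual; a hypothetical companion for the test side, under the same assumptions, would be:

import numpy as np

def nmf_test_error(model, tfidf_test):
    # reconstruct the held-out matrix from its NMF encoding and measure the residual
    W_test = model.transform(tfidf_test)
    tfidf_test_hat = W_test.dot(model.components_)
    return np.linalg.norm(tfidf_test - tfidf_test_hat, 'fro') ** 2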
Example #22
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(n_components=4, init="random", random_state=0)
    m.fit_transform(A)
    t = m.transform(A)
    A_new = m.inverse_transform(t)
    assert_array_almost_equal(A, A_new, decimal=2)
Example #23
def cluster_score_store():
    print "start new run clustering, scoring and storing result"
    conn = psycopg2.connect(database='hedda', user='******')
    c = conn.cursor()
    #get last days tweets from db
    c.execute('''SELECT id, text, url, (favcount + retweetcount), date_time FROM tweets
                 WHERE date_time::date > current_date - interval '1' day;''')
    texts = pd.DataFrame(c.fetchall())
    texts.columns = ['id','text','url','sumfavrtcount', 'date_time']
    texts['date_time'] = texts['date_time'].apply(pd.to_datetime)
    texts['hrs'] = texts['date_time'].apply(lambda x: (datetime.now()-x).total_seconds()/60/60)
    texts['favrt_hour'] = texts['sumfavrtcount']/texts['hrs']

    #prepare vectorized text
    tfidfvect = TfidfVectorizer(max_features=100, max_df=0.7, min_df=.01, tokenizer=clean_tokenized_text)
    tfidfvect.fit(texts['text'].values)
    X = tfidfvect.transform(texts['text'].values)
    feature_names = tfidfvect.get_feature_names()

    nmf = NMF(n_components = 25, max_iter = 5000).fit(X)
    topic_labels = []

    for topic_idx, topic in enumerate(nmf.components_):
        #print "Topic %s: %s" % (topic_idx,  ' '.join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))
        topic_labels.append(" ".join([feature_names[i]
                        for i in topic.argsort()[:-3 - 1:-1]]))

    y_hat = nmf.transform(X)
    y_norm = [y/ y.sum() for y in y_hat]

    df_y_hat = pd.DataFrame(y_norm, 
                            columns=topic_labels,  
                            index=texts.index.values) 
    df_y_hat = df_y_hat.fillna(0)
    df_nmf = texts.join(df_y_hat) 

    #pick winning cluster for every tweet
    df_nmf['HighScore'] = df_nmf[topic_labels].max(axis=1)
    df_nmf['Cluster'] = df_nmf[topic_labels].idxmax(axis=1)

    #keep only high scores above 0.85
    df_nmf = df_nmf[df_nmf['HighScore']>=0.85]

    #create cluster dataset with highest scoring tweets per cluster 
    #based on the amount of retweets and favorites per hour
    idx = df_nmf.groupby(['Cluster'])['favrt_hour'].transform(max) == df_nmf['favrt_hour']
    df_clus_favrt = pd.DataFrame(df_nmf[idx][['Cluster','id','date_time','text','url','HighScore','sumfavrtcount','favrt_hour']])
    df_temp = df_nmf.groupby('Cluster').sum()['sumfavrtcount'].reset_index()
    df_temp.columns=['Cluster','total-rt-fav']
    df_clus_favrt = pd.merge(df_clus_favrt, df_temp, on ='Cluster')
    df_clus_favrt.columns=['descr','id','date_time','tweettext','url','score', 'sum-rt-fav-toptweet', 'favrt_hour','total-rt-fav']
     
    #export tweet URL, tweet text, tweet date and any additional information you found useful
    #choice: best cluster is the one with highest total retweets and favorites
    export = df_clus_favrt.sort('total-rt-fav', ascending=False).head(5)[['descr','url','tweettext','date_time']]
    export.to_csv('resultnewstweets-%s.csv' % datetime.now(), index=False)
    print "new file created at: %s" % datetime.now()
Example #24
def run():
    '''
    Standard topic analysis copied from scikit learn example on 
    http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf.html
    '''
    
    t0 = time()
    n_topics = 100
    n_top_posts = 20
    n_top_words = 50
    ngram = 1
   
    dataFname = '../DSSG_unleashfootball/word_splits_stopwords'
    originalTexts = '../DSSG_unleashfootball/Original_posts'
    dat = cPickle.load(open(dataFname))
    orig = cPickle.load(open(originalTexts))
    
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,ngram_range=(1,ngram))
    tfidf = vectorizer.fit_transform([' '.join(x) for x in dat]) 
    nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
    print("Done fitting NMF in %0.3fs." % (time() - t0))

    topic_assignments = nmf.transform(tfidf).argmax(axis=1)

    feature_names = vectorizer.get_feature_names()
    
    sentimentWords = json.loads(open('sentiWords.json').read())#load_sentiment()
    sentimentTopics = get_sentiments(nmf.components_,vectorizer,sentimentWords)    
    sentiments = get_sentiments(tfidf,vectorizer,sentimentWords)
    topics = []
    keywords2Topic = {}
    for topic_idx, topic in enumerate(nmf.components_):
        topicDict = {}
        topicDict['sentiment'] = sentimentTopics[topic_idx]
        topicDict['keywords'] = [{'keyword':feature_names[i],'weight':nmf.components_[topic_idx,i]} for i in topic.argsort()[:-n_top_words - 2:-1]]
        
        topicDict['label'] = topicDict['keywords'][0]['keyword']

        # count number of posts in this topic
        topicDict['postCount'] = (topic_assignments==topic_idx).sum()

        # get some representative posts
        ranking = tfidf.dot(nmf.components_[topic_idx,:])
        ranks = ranking.argsort()[-n_top_posts:][::-1]
        topicDict['posts'] = []
        for item in ranks:
            topicDict['posts'].append({'post':orig[item],'relevance':ranking[item],'sentiment':sentiments[item]})
        
        keywords2Topic = dict(keywords2Topic.items() + [ (word['keyword'],topic_idx) for word in topicDict['keywords']])

        print("Topic #%d (Sentiment %f):" %(topic_idx,sentimentTopics[topic_idx]))
        print(" | ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        topics.append(topicDict)    
    
    open('topics.json','wb').write(json.dumps(topics))
    open('keywords2Topic.json','wb').write(json.dumps(keywords2Topic))
Example #25
def make_NMF_300_feature(row_body_path,
                         row_stance_path,
                         head_tfidf_pkl,
                         body_tfidf_pkl,
                         label_path,
                         save_nmf_model_path,
                         save_head_path,
                         save_body_path,
                         cos_dist=False):
    if not os.path.exists(head_tfidf_pkl) or not os.path.exists(body_tfidf_pkl) \
            or not os.path.exists(label_path):
        make_tfidf_feature_5000(row_body_path,
                                row_stance_path,
                                head_tfidf_pkl,
                                body_tfidf_pkl,
                                label_path,
                                model_save=True)

    X_tfidf_body = load_model(body_tfidf_pkl)
    X_tfidf_head = load_model(head_tfidf_pkl)

    if not os.path.exists(save_nmf_model_path):
        X_all = np.concatenate(
            (X_tfidf_head.toarray(), X_tfidf_body.toarray()), axis=0)
        print('fit NMF topic model')
        t0 = time()
        nmf = NMF(n_components=300, random_state=1, alpha=.1)
        nmf.fit(X_all)
        print('done in {}'.format(time() - t0))
        save_model(save_nmf_model_path, nmf)

    nmf = load_model(save_nmf_model_path)

    if not os.path.exists(save_head_path) or not os.path.exists(
            save_body_path):
        nmf_head_matrix = nmf.transform(X_tfidf_head)
        nmf_body_matrix = nmf.transform(X_tfidf_body)
        save_model(save_head_path, nmf_head_matrix)
        print('saved model {}'.format(save_head_path))
        save_model(save_body_path, nmf_body_matrix)
        print('saved model {}'.format(save_body_path))

    nmf_head_matrix = load_model(save_head_path)
    nmf_body_matrix = load_model(save_body_path)
    if not cos_dist:
        return np.concatenate((nmf_head_matrix, nmf_body_matrix), axis=1)
    else:
        X = []
        for i in range(len(nmf_head_matrix)):
            X_head = np.array(nmf_head_matrix[i]).reshape((1, -1))
            X_body = np.array(nmf_body_matrix[i]).reshape((1, -1))
            cos = cosine_distances(X_head, X_body).flatten()
            X.append(cos.tolist())
        X = np.array(X)
        X_train = np.concatenate((nmf_head_matrix, nmf_body_matrix), axis=1)
        X = np.concatenate((X_train, X), axis=1)
        return X
    def calc_nmf(self,
                 matrix,
                 vocab,
                 providers,
                 components=None,
                 hardclustering=True):
        if components is None:
            components = self.n_topics + self.soft_offset

        print("NMF: Calculating ", components, " components (topics)...")
        nmf = NMF(n_components=components,
                  random_state=1,
                  alpha=.1,
                  l1_ratio=.5).fit(matrix)
        print("NMF: reconstruction error:", nmf.reconstruction_err_)

        # soft clustering
        cluster_assignments = nmf.transform(matrix)  # samples x components

        # derive topics
        topics = {}
        for c_idx, component in enumerate(nmf.components_):
            # determine top terms
            top = component.argsort()[::-1][:self.top_n]
            top = top[component[top] > 0]
            # Store
            topics[c_idx] = {"terms": vocab[top], "weights": component[top]}

        self.__postprocess__(clusters=cluster_assignments,
                             topics=topics,
                             raw_data=matrix,
                             path=self.path + "nmf/",
                             prefix=self.prefix,
                             soft_clustering=True,
                             providers=providers)

        #cluster_assignments = self.__removeInvalid__(cluster_assignments=cluster_assignments, topics=topics)
        if hardclustering:
            print("NMF: KMeans: Calculating ", self.n_topics,
                  " clusters (topics)...")
            cluster_assignments, topics = self.__applyKMeans__(
                raw_data=matrix,
                vocab=vocab,
                soft_clustering=cluster_assignments)
            print("NMF: KMeans: ", len(cluster_assignments),
                  " cluster assignments")

            self.__postprocess__(clusters=cluster_assignments,
                                 topics=topics,
                                 raw_data=matrix,
                                 path=self.path + "nmf/kmeans/",
                                 prefix=self.prefix,
                                 soft_clustering=False,
                                 providers=providers)

        return cluster_assignments, topics
Example #27
def nmf_topic_modeling(word_matrix, vocab, n=5):

    nmf = NMF(n_components = n, max_iter = 1000)
    nmf.fit(word_matrix)

    topic_matrix = pd.DataFrame(nmf.transform(word_matrix)).add_prefix("topic_")
    word_matrix = pd.DataFrame(nmf.components_, \
        columns = vocab).T.add_prefix('topic_')

    return nmf, nmf.reconstruction_err_, topic_matrix, word_matrix
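A toy call for nmf_topic_modeling above (random counts stand in for a real document-term matrix; assumes the snippet's pandas and NMF imports are in scope):

import numpy as np

word_counts = np.abs(np.random.RandomState(0).rand(12, 6))
vocab = ["w%d" % j for j in range(6)]
model, err, topic_matrix, word_topics = nmf_topic_modeling(word_counts, vocab, n=3)
print(topic_matrix.shape, word_topics.shape)  # (12, 3) (6, 3)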
Example #28
    def plot_nmf(self, X, cell, outdir):
        corr_matrix = np.dot(X.T, X) / (X.shape[0] - 1)

        nmf = NMF(n_components=2)
        nmf.fit(corr_matrix)
        projections = pd.DataFrame(nmf.transform(corr_matrix))
        projections.columns = ["LF1", "LF2"]
        info = {"X": ("LF1", ""),
                "Y": ("LF2", "")}
        self.plot(projections, info, cell, outdir)
Example #29
def matrix_factorization(co_metrix):
    mf = NMF(n_components=500,
             init='random',
             random_state=0,
             max_iter=100,
             alpha=0.75,
             eta=0.001)
    mf.fit(co_metrix)
    word_vector = mf.transform(co_metrix)
    return word_vector
Example #30
    def fit(self):
        nmf = NMF(**self.fit_parameters)
        nmf.fit(self.input_data)

        self.output_data = nmf.transform(self.input_data)
        self.mapper_data = nmf.components_
        self.model_attributes = {"n_topics": nmf.n_components,
                                 }
        self._log_model_results()
        return self
Example #31
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='random', random_state=0)
        m.fit_transform(A)
        t = m.transform(A)
        A_new = m.inverse_transform(t)
        assert_array_almost_equal(A, A_new, decimal=2)
def run_nmf(X, asins, features, n_components=20):
    nmf = NMF(n_components)
    nmf.fit(X)
    W = nmf.transform(X)
    H = nmf.components_
    # wrap the factors in labeled DataFrames to make them interpretable
    W_df = pd.DataFrame(W, index=asins)
    H_df = pd.DataFrame(H, columns=features)
    return (W, H, W_df, H_df)
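Because run_nmf labels the factors, per-topic top terms become a one-liner; a toy sketch (inputs invented for illustration, assuming the snippet's pandas import is in scope):

import numpy as np

X = np.abs(np.random.RandomState(0).rand(30, 8))
asins = ["asin%d" % i for i in range(30)]
features = ["term%d" % j for j in range(8)]
W, H, W_df, H_df = run_nmf(X, asins, features, n_components=3)
print(H_df.loc[0].nlargest(5))  # top 5 terms for topic 0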
Example #33
def test_parameter_checking():
    A = np.ones((2, 2))
    name = 'spam'
    # FIXME : should be removed in 1.1
    init = 'nndsvda'
    msg = "Invalid solver parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(solver=name, init=init).fit(A)
    msg = "Invalid init parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(init=name).fit(A)
    msg = "Invalid regularization parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(regularization=name, init=init).fit(A)
    msg = "Invalid beta_loss parameter: got 'spam' instead of one"
    with pytest.raises(ValueError, match=msg):
        NMF(solver='mu', init=init, beta_loss=name).fit(A)
    msg = (
        "Invalid beta_loss parameter: solver 'cd' does not handle "
        "beta_loss = 1.0"
    )
    with pytest.raises(ValueError, match=msg):
        NMF(solver='cd', init=init, beta_loss=1.0).fit(A)

    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        NMF(init=init).fit(-A)
    with pytest.raises(ValueError, match=msg):
        nmf._initialize_nmf(-A, 2, 'nndsvd')
    clf = NMF(2, tol=0.1, init=init).fit(A)
    with pytest.raises(ValueError, match=msg):
        clf.transform(-A)

    for init in ['nndsvd', 'nndsvda', 'nndsvdar']:
        msg = re.escape(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)"
            .format(init)
        )
        with pytest.raises(ValueError, match=msg):
            NMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=msg):
            nmf._initialize_nmf(A, 3, init)
def nmf_applied_to_wikipedia_articles(articles):
    # Create an NMF instance: model
    model = NMF(n_components=6)
    # Fit the model to articles
    model.fit(articles)
    # Transform the articles: nmf_features
    nmf_features = model.transform(articles)
    # Print the NMF features
    print(nmf_features.round(2))
    return nmf_features
Example #35
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='random', random_state=0)
        ft = m.fit_transform(A)
        t = m.transform(A)
        A_new = m.inverse_transform(t)
        assert_array_almost_equal(A, A_new, decimal=2)
def topic_dummies(df):

    #CLEAN HTML FUNCTION
    def get_text(cell):
        return BeautifulSoup(cell, 'html.parser').get_text()

    #Parse descriptions using html function above:
    df['description'] = df['description'].apply(get_text)
    df['org_desc'] = df['org_desc'].apply(get_text)
    clean = df['description']

    #All the parameters for the topic modeling.
    n_samples = len(clean)
    n_features = 500
    n_topics = 9
    n_top_words = 30

    my_additional_stopwords = ["la", "et", "en", "le", "les", "des", 'january', 'february',
                           'march', 'april', 'may', 'june', 'july', 'august', 'september',
                           'october', 'november', 'december', 'friday', 'thursday', 'saturday']
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stopwords)


    # Use tf-idf features for NMF.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(clean)

    # Fit the NMF model
    nmf = NMF(n_components=n_topics, random_state=1,
              alpha=.1, l1_ratio=.5).fit(tfidf)

    #Leave this turned off unless you want to print.
    #tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    #print_top_words(nmf, tfidf_feature_names, n_top_words)

    '''
    #Assign topics to descriptions:
    #These are from the full data.  Do NOT use these descriptions on any subset, as they will not match.
    topic_dict = {0:'dinner_party', 1:'educational', 2:'social_networks', 3:'logistics', 4: 'business', 5:'university',
                  6:'club_logistics', 7:'workshop', 8:'club_content'}
    '''
    topic_dict = {0:'topic1', 1:'topic2', 2:'topic3', 3:'topic4', 4: 'topic5', 5:'topic6',
                  6:'topic7', 7:'topic8', 8:'topic9'}


    W = nmf.transform(tfidf)
    df['topic_index'] = np.argmax(W, axis=1)
    df['topic_index'] = df['topic_index'].replace(topic_dict)

    ###Create dummy variables to insert into model
    topic_dummies = pd.get_dummies(df['topic_index']).rename(columns = lambda x: 'topic_'+str(x))
    df = pd.concat([df,topic_dummies],axis=1)
    return df
Example #37
def test_parameter_checking():
    A = np.ones((2, 2))
    name = "spam"
    # FIXME : should be removed in 1.1
    init = "nndsvda"
    msg = "Invalid solver parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(solver=name, init=init).fit(A)
    msg = "Invalid init parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(init=name).fit(A)

    with ignore_warnings(category=FutureWarning):
        # TODO remove in 1.2
        msg = "Invalid regularization parameter: got 'spam' instead of one of"
        with pytest.raises(ValueError, match=msg):
            NMF(regularization=name, init=init).fit(A)

    msg = "Invalid beta_loss parameter: got 'spam' instead of one"
    with pytest.raises(ValueError, match=msg):
        NMF(solver="mu", init=init, beta_loss=name).fit(A)
    msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0"
    with pytest.raises(ValueError, match=msg):
        NMF(solver="cd", init=init, beta_loss=1.0).fit(A)

    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        NMF(init=init).fit(-A)
    with pytest.raises(ValueError, match=msg):
        nmf._initialize_nmf(-A, 2, "nndsvd")
    clf = NMF(2, tol=0.1, init=init).fit(A)
    with pytest.raises(ValueError, match=msg):
        clf.transform(-A)

    for init in ["nndsvd", "nndsvda", "nndsvdar"]:
        msg = re.escape(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init))
        with pytest.raises(ValueError, match=msg):
            NMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=msg):
            nmf._initialize_nmf(A, 3, init)
Example #38
def main():
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    X_train = newsgroups_train.data
    X_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_test = newsgroups_test.target

    X_train, X_test = TFIDF(X_train, X_test)

    NMF_ = NMF(n_components=2000)
    NMF_.fit(X_train)
    X_train_new = NMF_.transform(X_train)
    X_test_new = NMF_.transform(X_test)

    print("train with old features: ", np.array(X_train).shape)
    print("train with new features:", np.array(X_train_new).shape)

    print("test with old features: ", np.array(X_test).shape)
    print("test with new features:", np.array(X_test_new).shape)
Example #39
def JLL(X_train, y_train=None, X_test=None, n=100, init='random'):
    # note: despite the JL-style name, this reduces dimensionality with NMF
    mod = NMF(n_components=n, init=init, random_state=0)
    mod.fit(X_train, y_train)
    train = mod.transform(X_train)
    if X_test is None:
        out = train
    else:
        test = mod.transform(X_test)
        out = train, test
    return out
    def Nmf(self):
        data_set = pd.read_csv(self.data_set_name, header=None, index_col=None)
        data_set = data_set.T
        nmf = NMF(n_components=self.components)
        nmf.fit(data_set)
        data_set = nmf.transform(data_set)
        print("Generate Dre_data.csv.")
        #print("The interpretability of each component:")
        data_set = pd.DataFrame(data_set)
        data_set.to_csv(self.Dred_data, header=False, index=False)
        return 0
Example #41
def get_latent_vector(X):
    # for language: n_components=150
    # for repo: n_components=?
    model = NMF(n_components=150, init='nndsvd', max_iter=1000, random_state=1126)
    print('NMF', model)
    model.fit(X)
    W = model.transform(X)
    H = model.components_

    normalized_matrix = normalize(W, axis=1, norm='l2')
    return normalized_matrix
Example #42
def test_sparse_transform():
    # Test that transform works on sparse data.  Issue #2124

    A = np.abs(random_state.randn(3, 2))
    A[A > 1.0] = 0
    A = csc_matrix(A)

    model = NMF(random_state=0, tol=1e-4, n_components=2)
    A_fit_tr = model.fit_transform(A)
    A_tr = model.transform(A)
    assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
Example #43
def reduce_dimensions(total_mat, n_topics):
    """
    Fits NMF on the data matrix, shape (n_samples, n_features), and
    returns the fitted model; its transform yields W, shape
    (n_samples, n_components).
    """
    nmf = NMF(n_components=n_topics, random_state=42, alpha=.2, l1_ratio=0.5)
    nmf.fit(total_mat)
    return nmf
Example #44
def nmf(df, week, questionNb, nbTopic, n, plt):
    """
        df: dataframe contains documents/sentences
        week: which week that you want ?
        questionNb: which question in this week ?
        nbTopic: how many topics do you think these document have ?
        n: find top n sentences contribute for each topic
    """

    print('Welcome to NMF algorithm.')
    print(
        'Begin find topics for all answers of question number {i} of week {w}'.
        format(i=questionNb, w=week))

    df = df[df['week'] == week]

    df['relevant'] = df['processed_responses'].apply(
        lambda x: x[questionNb - 1] if len(x) > questionNb - 1 else '')

    df = df.reset_index().drop('index', 1)

    " Non-negative Matrix Factorization is able to use tf-idf "

    tfidf_vectorizer = TfidfVectorizer(min_df=7,
                                       max_df=18)  #(max_features=vocab_size)
    tfidf = tfidf_vectorizer.fit_transform(df['relevant'])
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    print('Shape of tfidf matrix:', tfidf.shape)

    "NMF"
    nmf = NMF(n_components=nbTopic,
              random_state=1,
              alpha=.1,
              l1_ratio=.5,
              init='nndsvd').fit(tfidf)

    print('Topic words distribution shape:', nmf.components_.shape)
    plot_topics(nmf, tfidf_feature_names, nbTopic, plt)

    sim_matrix = concepts_responses_similarity(nmf.components_,
                                               tfidf.toarray())

    "Retrieve top n responses for a specific topic"
    d = {}
    for i in range(nbTopic):
        response_scores = sim_matrix[:, i]
        top_indexes = response_scores.argsort()[-n:][::-1]
        top_responses = []
        for index in top_indexes:
            top_responses.append(df.iloc[index]['standardized_responses'])
        d['Topic #' + str(i)] = top_responses

    return df, pd.DataFrame.from_dict(d), nmf.transform(tfidf)
Example #45
    def initialize(self):
        # TfIdf vectors
        tfidf_vectors, tfidf_vectorizer = self.vectorize_as_tfidf(self.naked_docs)
        nmf = NMF(n_components=self.num_topics, random_state=1,
                  alpha=.1, l1_ratio=.5).fit(tfidf_vectors)
        self.vectors = tfidf_vectors
        self.vectorizer = tfidf_vectorizer
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()
        self.topics = self.get_topics(nmf, tfidf_feature_names, self.num_topic_words)
        self.doc_topic_distrib = nmf.transform(tfidf_vectors)
        self.model = nmf
def NMF_(X_train_tfidf, X_test_tfidf):

    model = NMF(n_components=50, init='random', random_state=0)

    W_train_r = model.fit_transform(X_train_tfidf)
    W_test_r = model.transform(X_test_tfidf)

    H = model.components_
    Err_NMF = np.sum(np.array(X_train_tfidf - W_train_r.dot(H))**2)
    return W_train_r, W_test_r, H, Err_NMF
Example #47
def test_sparse_transform():
    # Test that transform works on sparse data.  Issue #2124

    A = np.abs(random_state.randn(3, 2))
    A[A > 1.0] = 0
    A = csc_matrix(A)

    for solver in ('pg', 'cd'):
        model = NMF(solver=solver, random_state=0, tol=1e-4, n_components=2)
        A_fit_tr = model.fit_transform(A)
        A_tr = model.transform(A)
        assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
Example #48
def main():
    mat = np.zeros(shape=(0, 16 * 16 * 10))
    weightsArr = []
    arr = []
    f = open('../data/coarse.pkl', 'rb')
    while True:
        try:
            scan = pickle.load(f)
        except EOFError:  # pickle.load raises EOFError at end of stream
            break
        d = scan.data
        arr.append(d.shape[0])
        weightsArr.append(Weights(scan))
        mat = np.append(
            mat,
            np.reshape(d, (d.shape[0], d.shape[1] * d.shape[2] * d.shape[3])),
            axis=0)
        if len(arr) % 20 == 0:
            print(len(arr))

    print(mat.shape)
    np.save("../data/mat.npy", mat)

    doPCA = True
    doNMF = False
    if doPCA:
        pca = PCA(n_components=700, whiten=True)
        pca.fit(mat)

        print("PCA Fitted")
        curr = 0
        for i, x in enumerate(arr):
            weights = pca.transform(mat[curr:(curr + x), :])
            curr += x
            weightsArr[i].setWeights(weights)

        f = open('../data/pca.pkl', 'wb')
        pickle.dump(weightsArr, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(pca.components_, f, pickle.HIGHEST_PROTOCOL)
    if doNMF:
        nmf = NMF(n_components=20)
        nmf.fit(mat)

        print("NMF Fitted")
        curr = 0
        for i, x in enumerate(arr):
            weights = nmf.transform(mat[curr:(curr + x), :])
            curr += x
            weightsArr[i].setWeights(weights)

        f = open('../data/nmf.pkl', 'wb')
        pickle.dump(weightsArr, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(nmf.components_, f, pickle.HIGHEST_PROTOCOL)
def model_ratings_NMF(ratings, movies_ind, n_components):
    R = pd.DataFrame(ratings) # model assumes R ~ PQ'
    model = NMF(n_components=n_components, init='random', random_state=10)
    model.fit(R)

    P = model.components_  # Movie feature
    Q = model.transform(R)  # User features

    query = user_ratings.reshape(1,-1)  # user_ratings: module-level vector of the query user's ratings

    t=model.transform(query)
    
    # prediction movie ratings of input user
    outcome = np.dot(t,P)
    outcome=pd.DataFrame(outcome)
    outcome = outcome.transpose()
    outcome['movieId'] = movies_ind['movieId']
    outcome = outcome.rename(columns={0:'rating'})
    top = outcome.sort_values(by='rating',ascending=False).head(150) # top 150 ratings from predictions list
    
    return top
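A hedged usage sketch for model_ratings_NMF (toy data; note the function reads a module-level user_ratings vector, so it must be defined before the call):

import numpy as np
import pandas as pd

ratings = np.random.RandomState(1).randint(0, 6, size=(50, 20))
movies_ind = pd.DataFrame({'movieId': range(20)})
user_ratings = np.random.RandomState(2).randint(0, 6, size=20)  # query user's ratings
top = model_ratings_NMF(ratings, movies_ind, n_components=5)
print(top.head())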
Example #50
    def get_features(head_and_body):
        filename = "NMF_topics" + str(n_topics) + "topics"

        if include_holdout == True:
            filename += "_holdout"

        if include_unlbled_test == True:
            filename += "unlbled_test"

        if not (os.path.exists(features_dir + "/" + filename + ".pkl")):
            X_all, vocab = get_all_data(head_and_body, filename)

            # Compute the n most important topics of the bodies. Each topic ranks all
            # words by importance: the more high-ranking words of a topic a body
            # contains, the higher the body's value for that topic.
            nfm = NMF(n_components=n_topics, random_state=1, alpha=.1)

            print("NMF_topics: fit and transform body")
            t0 = time()
            nfm.fit_transform(X_all)
            print("done in %0.3fs." % (time() - t0))

            with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
                joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            vocab = get_vocab(head_and_body, filename)
            with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
                nfm = joblib.load(handle)

        vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        print("NMF_topics: transform head and body")
        # apply the NMF model trained on body topics to the headlines => if the
        # headlines and bodies share topics, their vectors should be similar
        nfm_head_matrix = nfm.transform(X_train_head)
        nfm_body_matrix = nfm.transform(X_train_body)

        if cosinus_dist == False:
            return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
        else:
            # calculate cosine distance between the body and head
            X = []
            for i in range(len(nfm_head_matrix)):
                X_head_vector = np.array(nfm_head_matrix[i]).reshape(
                    (1, -1))  # 1d array is deprecated
                X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
                cos_dist = cosine_distances(X_head_vector,
                                            X_body_vector).flatten()
                X.append(cos_dist.tolist())
            return X
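The row-by-row cosine loop above can be collapsed into one vectorized call; a sketch assuming scikit-learn's paired_cosine_distances:

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances

def head_body_cosine(nfm_head_matrix, nfm_body_matrix):
    # one distance per (headline, body) row pair, shaped like the loop's output
    return paired_cosine_distances(nfm_head_matrix, nfm_body_matrix).reshape(-1, 1)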
Example #51
def test_nmf_sparse_transform():
    # Test that transform works on sparse data.  Issue #2124
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(3, 2))
    A[1, 1] = 0
    A = csc_matrix(A)

    for solver in ('cd', 'mu'):
        model = NMF(solver=solver, random_state=0, n_components=2,
                    max_iter=400)
        A_fit_tr = model.fit_transform(A)
        A_tr = model.transform(A)
        assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
class Factorizer(MultiScalePatches):

    def fit(self, data, k=5):
        super(Factorizer, self).fit(data)
        ftr = self.compute(data)
        self.nnmf = NMF(n_components=k).fit(ftr)
        return self

    def transform(self, data):
        ftr = self.compute(data)
        return self.nnmf.transform(ftr)

    def fit_transform(self, data, k=5):
        return self.fit(data, k=k).transform(data)
def nmf_articles(df, n_topics, n_features=5000, n_top_words=20, random_state=None, max_df=1, min_df=1):
    tfidf, feature_names, reverse_lookup = create_document_vector(df, max_features=n_features, max_df=max_df, min_df=min_df)
    nmf = NMF(n_components=n_topics, random_state=random_state, alpha=.1, l1_ratio=0.25).fit(tfidf)
    W = nmf.transform(tfidf)

    # Currently the attribution for each row in W is not a percentage, but we want to assign each document to any topic which it can be at least 10% attributed to
    sums = np.sum(W, axis=1)
    W_percent = W / sums[:, None]

    # For efficient slicing we will return a boolean mask (one column per topic)
    labels = W_percent >= 0.1

    words = top_words(nmf.components_, feature_names, n_top_words)
    return nmf, tfidf, W, W_percent, labels, words, feature_names, reverse_lookup
Example #55
def test_non_negative_factorization_consistency():
    # Test that the function is called in the same way, either directly
    # or through the NMF class
    A = np.abs(random_state.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    W_nmf, H, _ = non_negative_factorization(A, random_state=1, tol=1e-2)
    W_nmf_2, _, _ = non_negative_factorization(A, H=H, update_H=False, random_state=1, tol=1e-2)

    model_class = NMF(random_state=1, tol=1e-2)
    W_cls = model_class.fit_transform(A)
    W_cls_2 = model_class.transform(A)
    assert_array_almost_equal(W_nmf, W_cls, decimal=10)
    assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
def nmf_faces(X_train, X_test):
    # Build NMF models with 10, 50, 100 and 500 components
    # this list will hold the back-transformed test data
    reduced_images = []
    for n_components in [10, 50, 100, 500]:
        # build the NMF model
        nmf = NMF(n_components=n_components, random_state=0)
        nmf.fit(X_train)
        # transform the test data (afterwards has n_components many dimensions)
        X_test_nmf = nmf.transform(X_test)
        # back-transform the transformed test-data
        # (afterwards it's in the original space again)
        X_test_back = np.dot(X_test_nmf, nmf.components_)
        reduced_images.append(X_test_back)
    return reduced_images
Example #57
    def load_religion(self, path="data/religion.DTA", k=5):
        """Return nmf features from a STATA file"""
        # http://www.thearda.com/Archive/Files/Downloads/RCMSCY10_DL2.asp
        df = pd.read_stata(path)
        id_df = df[["stcode", "cntycode"]].copy()
        id_df.columns = ["st_num", "county_num"]
        cols = [x for x in df.columns if "rate" in x]  # only take percentage cols
        nmf_data = df[cols].fillna(0)
        model = NMF(n_components=k).fit(nmf_data)
        features = model.transform(nmf_data)
        nmf_feats = pd.DataFrame(features)
        # Name columns for interpretability
        nmf_feats.columns = ["relig_nmf_feat_" + str(x) for x in list(nmf_feats.columns)]
        # Join the NMF. k = number of topics / cols to add
        output = id_df.join(nmf_feats)

        return output
Example #58
def get_topics(n_components=10, n_top_words=15, print_output=True):
	custom_stop_words = make_stop_words(new_stop_words)
	tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
	tfidf = tfidf_vectorizer.fit_transform(release_texts)
	tfidf = row_normalize_tfidf(tfidf)

	nmf = NMF(n_components=n_components, random_state=1)
	# nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
	nmf.fit(tfidf)
	W = nmf.transform(tfidf)
	

	if print_output:
		print("\nTopics in NMF model:")
		tfidf_feature_names = tfidf_vectorizer.get_feature_names()
		print_top_words(nmf, tfidf_feature_names, n_top_words)
	
	return tfidf, nmf, W