Example #1
 def fit(self, trainSamples, trainTargets):
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     #print 'train user:' + str(self.dataModel.getUsersNum())
     V = self.dataModel.getData()
     model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000, nls_max_iter=1000)
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()  # components_ from the fit_transform call above; no need to refit
    def get_cluster_membership(self):
        """ Determine the cluster number that each sample is associated with. """

        model = ProjectedGradientNMF(n_components=self._num_clusters,
                                     init='random',
                                     beta=.3,
                                     eta=.5,
                                     max_iter=5000)

        w = model.fit_transform(self._matrix)
        h = model.components_

        # Convert the H matrix, whose columns hold the per-cluster weights for each
        # sample, into an array of cluster assignments: the row index of the largest
        # value in each column of H is that sample's cluster.
        clusters = []
        model_width = len(h[0])

        for col_idx in range(model_width):
            max_val = dict()
            for row_idx in range(self._num_clusters):
                h_val = h[row_idx][col_idx]

                if not max_val or h_val > max_val['val']:
                    max_val = {'row_idx': row_idx, 'val': h_val}

            clusters.append(max_val['row_idx'])

        # clusters array, w, h
        return (clusters, w, h)
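The per-column scan above can be written more compactly with NumPy's axis-wise argmax; a minimal sketch, assuming h is the n_clusters x n_samples array returned as model.components_:

import numpy as np

# For each sample (column of H), the row index of the largest weight is that sample's cluster.
h = np.array([[0.1, 0.9, 0.4],
              [0.8, 0.2, 0.5]])
clusters = np.argmax(h, axis=0).tolist()  # -> [1, 0, 1]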
Example #3
def nmfModel(matrix, nTopics):
    t=time()
    print "Starting Factorization"
    nmf = ProjectedGradientNMF(nTopics, max_iter=220, sparseness='data', init='nndsvd')
    W = nmf.fit_transform(matrix)
    H = nmf.components_
    print "Factorization took %s minutes"%(round((time()-t)/60., 2))
    return W, H, nmf
Example #4
 def fit(self, trainSamples, trainTargets):
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     #print 'train user:' + str(self.dataModel.getUsersNum())
     V = self.dataModel.getData()
     model = ProjectedGradientNMF(n_components=self.factors,
                                  max_iter=1000,
                                  nls_max_iter=1000)
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()  # components_ from the fit_transform call above; no need to refit
 def nmf(self, k):
     
     nmf = ProjectedGradientNMF(n_components=k, max_iter=200)
     P = nmf.fit_transform(self.tdm)
     Q = nmf.components_.T
     self.P = P
     self.Q = Q
     self.er = nmf.reconstruction_err_
     #print "\tError: ", self.er
     return P, Q
def calcNMF(delta_data, components):

    data = preprocess(delta_data)
    nmf = ProjectedGradientNMF(n_components=components)
    x_nmf = nmf.fit_transform(data['cleanMatrix'])
   
    nmf_fill = np.ones((delta_data.shape[0],components))*np.nan
    nmf_fill[data['cleanind']] = x_nmf
    nmf_weights = nmf.components_.T
    delta_nmf = {'transform':nmf_fill,
                 'weights' : nmf_weights,
                }
    return delta_nmf
Example #7
    def __nmf_initialization(A, ncomms):
        try:
            from sklearn.decomposition import ProjectedGradientNMF
        except ImportError:
            print("sklearn module is missing.")
            return

        model = ProjectedGradientNMF(n_components=ncomms, init='nndsvd')
        Uin = np.asmatrix(model.fit_transform(A))
        Vin = np.asmatrix(model.components_)
        Vin = Vin.T
        init_dict = {'U': Uin, 'V': Vin}
        return init_dict
Example #8
    def reducedim_nmf(self,factors):
        print "Number of factors is "+str(factors)

        model = ProjectedGradientNMF(n_components=factors,init='random',random_state=0)
        self.reducedmatrix= model.fit_transform(self.fullmatrix)  #left factor w (n*k)
        h= model.components_ #right factor h (k*d)

        if self.testing:
            print self.fullmatrix
            print self.reducedmatrix
            print h
            v = numpy.dot(self.reducedmatrix,h)
            print v
        print "Completed NMF routine"
        for vector in self.vectordict.values():
            vector.array=sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
        print "Stored individual vectors"
Example #9
 def train_model(self):
     print 'begin'
     RATE_MATRIX = np.zeros((9238, 7973))
     for line in self.train.values:
         print line
         uid = int(float(line[1]))
         iid = int(float(line[2]))
         RATE_MATRIX[uid][iid] = int(float(line[3]))
     V = spr.csr_matrix(RATE_MATRIX)
     model = ProjectedGradientNMF(n_components=self.n_features, max_iter=1000, nls_max_iter=10000)
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()  # components_ from the fit_transform call above; no need to refit
     print model.reconstruction_err_
     self.ValidateF1()
     t = pd.DataFrame(np.array(self.pu))
     t.to_csv('50pu')
     t = pd.DataFrame(np.array(self.qi))
     t.to_csv('50qi')
     print("model generation over")
Example #10
def recommend(matrix_3filled, matrix_raw, user, numOfNeighbors=5):

    # The following lines use scikit-learn. For more information, refer to the documentation link in README.
    model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
    model.fit(matrix_3filled)

    # The transformed matrix is the result of non-negative matrix factorization; we use it for the recommendations.
    transformed = np.dot(model.fit_transform(matrix_3filled), model.components_)

    neighbors = []
    # Calculate distances from the current user to every other user.
    distances = np.sum((transformed - transformed[user])**2, axis=1)

    # Find nearest neighbors (the first argmin is the user itself and is excluded).
    for x in xrange(numOfNeighbors):
        distances[np.argmin(distances)] = sys.float_info.max
        neighbors.append(np.argmin(distances))

    # Average the neighbors' rows: 'average' holds the average rating for each item.
    average = np.zeros(transformed.shape[1])
    for x in xrange(numOfNeighbors):
        average += transformed[neighbors[x]]
    average = average / numOfNeighbors

    # Find the unrated items for the current user.
    unratedItems = []
    for x in xrange(np.shape(matrix_raw)[1]):
        if matrix_raw[user][x] == 0:
            unratedItems.append(x)

    # If there are no unrated items, just return the item with the highest average rating.
    if len(unratedItems) == 0:
        item = np.argmax(average)
        return item
    # Else, return the unrated item with the highest average rating.
    else:
        maxAverage = 0
        item = np.argmax(average)
        for x in xrange(len(unratedItems)):
            if average[unratedItems[x]] > maxAverage:
                maxAverage = average[unratedItems[x]]
                item = unratedItems[x]
        return item
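A hypothetical call to the function above, assuming ratings on a 1-5 scale with 0 marking unrated entries, missing values pre-filled with the midpoint 3, and numpy, sys and ProjectedGradientNMF already imported in the same module:

import numpy as np

raw = np.array([[5, 0, 3, 0],
                [4, 2, 0, 1],
                [0, 1, 4, 5]])
filled = np.where(raw == 0, 3, raw)  # fill unrated cells with 3
item = recommend(filled, raw, 0, numOfNeighbors=2)
print item  # index of the recommended (unrated) item for user 0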
def decomposition(V, W, H, n_components, solver='mu', update_H=True):
    if solver != 'project':
        W, H, _ = non_negative_factorization(V,
                                             W=W,
                                             H=H,
                                             n_components=n_components,
                                             update_H=update_H,
                                             max_iter=1000,
                                             solver=solver)
        #regularization='transformation', l1_ratio=0.1)
    else:
        model = ProjectedGradientNMF(n_components=n_components,
                                     init='random',
                                     random_state=0,
                                     sparseness='data',
                                     beta=0,
                                     max_iter=100000)
        W = model.fit_transform(V)  # single fit; reuse its factors below
        H = model.components_
    return W, H
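ProjectedGradientNMF was deprecated and later removed from scikit-learn, so on recent versions the 'project' branch above has no direct equivalent; a rough fallback sketch using the plain NMF estimator (the parameter mapping is an approximation, with V and n_components as in the function above):

from sklearn.decomposition import NMF

model = NMF(n_components=n_components, init='random', random_state=0, max_iter=1000)
W = model.fit_transform(V)  # coordinate-descent solver instead of projected gradient
H = model.components_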
Example #12
    def reducedim_nmf(self, factors):
        print "Number of factors is " + str(factors)

        model = ProjectedGradientNMF(n_components=factors,
                                     init='random',
                                     random_state=0)
        self.reducedmatrix = model.fit_transform(
            self.fullmatrix)  #left factor w (n*k)
        h = model.components_  #right factor h (k*d)

        if self.testing:
            print self.fullmatrix
            print self.reducedmatrix
            print h
            v = numpy.dot(self.reducedmatrix, h)
            print v
        print "Completed NMF routine"
        for vector in self.vectordict.values():
            vector.array = sparse.csc_matrix(
                self.reducedmatrix[vector.rowindex])
        print "Stored individual vectors"
Example #13
class NMFpredictor(Predictor):
    def __init__(self,model,beta=1, eta=0.1, init='nndsvd', max_iter=500,
        n_components=100, nls_max_iter=2000, random_state=0, sparseness=None,tol=0.0001):

        self.check_non_negtive(model)
        self.model = model
        super(NMFpredictor,self).__init__()
        
        self.nmf = ProjectedGradientNMF(beta=beta, eta=eta, init=init, max_iter=max_iter,
                   n_components=n_components, nls_max_iter=nls_max_iter, random_state=random_state, 
                   sparseness=sparseness,tol=tol)
        self.user_latent_M, self.item_latent_M = self.construct_latent_matrics()

    def construct_latent_matrics(self):
        start = time.time()
        data_matrix = self.model.get_data_matrix()
        user_latent_M = self.nmf.fit_transform(data_matrix)
        item_latent_M = self.nmf.components_
        print "use time: ", time.time() - start
        return user_latent_M, item_latent_M

    def predict(self,user_id, item_id):
        user_no = self.model.user_id_to_no[user_id]
        item_no = self.model.item_id_to_no[item_id]
        pref = np.dot(self.user_latent_M[user_no,:], self.item_latent_M[:,item_no])

        if pref > self.model.max_pref:
            pref = self.model.max_pref
        if pref < self.model.min_pref:
            pref = self.model.min_pref

        return pref

    def check_non_negtive(self,model):
        if model.min_pref < 0:
            raise NotImplementedError("preferences must be non-negative for NMF!")
Example #14
import numpy
from pymongo import MongoClient
from sklearn.decomposition import ProjectedGradientNMF

client = MongoClient('mongodb://localhost:27017/')
mydb = client['movie_database']


movies = mydb.movies.find()
i = 1
for movie in movies:
    print str(i)+" >> "+movie.get("title") +"--"+ movie.get("_id")
    i = i + 1
users = mydb.users.find()
i = 1
for user in users:
    print str(i) + " >>" + user.get("_id") + "--" + user.get("password")

activities = mydb.activity.find()
i = 1
for activity in activities:
    print str(i) + " >>" + str(activity)

A = numpy.random.uniform(size = [40, 30])
nmf_model = ProjectedGradientNMF(n_components = 5, init='random', random_state=0)
W = nmf_model.fit_transform(A);
H = nmf_model.components_;


print W
print H
Example #15
def filter_1sigma_nmf_new(dma, iter_date, df, header_df):
    print 'Get the 1-sigma filtered data'
    print df.shape[1]
    idx_vt = df.shape[1] - 1
    mean_viewtime = df[idx_vt].mean()
    std_viewtime = df[idx_vt].std()

    print mean_viewtime / 3600.0, std_viewtime / 3600.0

    reduced_df = df[(df[idx_vt] >= LOW_LIMIT)
                    & (df[idx_vt] <= HIGH_LIMIT)].reset_index()
    print reduced_df.shape

    reduced_df[range(1, idx_vt)] = reduced_df[range(1, idx_vt)].div(
        1.0 * reduced_df[idx_vt], 'index')
    dev_id_list = reduced_df[0]

    reduced_df_vsum = reduced_df[range(1, idx_vt)].sum()
    reduced_df_vsum = reduced_df_vsum[reduced_df_vsum > 0.00]
    idx_list = reduced_df_vsum.index.tolist()
    reduced_df_1 = reduced_df[range(1, idx_vt)][reduced_df_vsum.index.tolist()]

    # Select the header accordingly
    reduced_header_df = header_df[idx_list]

    #program_viewtime_array = np.array(reduced_df[range(1,idx_vt)].astype(np.float))
    program_viewtime_array = np.array(reduced_df_1.astype(np.float))
    program_name_array = np.array(reduced_header_df)

    t_program_viewtime_array = program_viewtime_array.transpose()

    cluster_num = 14
    # Non-negative Matrix Factorization
    model = ProjectedGradientNMF(n_components=cluster_num,
                                 sparseness='data',
                                 init='nndsvd',
                                 max_iter=400,
                                 random_state=0)
    WW = model.fit_transform(t_program_viewtime_array)
    t_WW = WW.transpose()
    HH = model.components_
    t_HH = HH.transpose()
    #print t_HH.shape
    #print pd.DataFrame(t_HH).head()
    membership = [-1 for item in range(0, t_HH.shape[0])]
    # Assign the membership
    for i in range(0, t_HH.shape[0]):
        membership[i] = np.argmax(t_HH[i])

    dd = reduced_header_df
    print dd.shape
    print program_name_array.shape
    print program_viewtime_array.shape

    file = open(
        'decompose_results_clusters_%s_%s_%s.csv' %
        (iter_date.month, iter_date.day, dma), 'w')
    file.write(
        'Cluster_id,Dev_num,Household_num,Feature_val,Feature_fraction,Program_name\n'
    )
    file.write(
        '-1,%s,%s,,,\n' %
        (len(dev_id_list), get_household_num(dma, dev_id_list.tolist())))
    cluster_num = t_WW.shape[0]

    for i in range(0, cluster_num):
        dev_indices = [index for index, v in enumerate(membership) if v == i]
        dev_in_cluster = dev_id_list[dev_indices]
        dev_num = len(dev_in_cluster)
        household_num = get_household_num(dma, dev_in_cluster.tolist())

        #print heapq.nlargest(10,t_WW[i])
        feature_val = np.sort(t_WW[i])
        feature_val = feature_val[::-1]
        #print 't_WW:',t_WW[i]
        #print 'sorted t_WW:',feature_val
        val_sum = np.sum(feature_val)
        feature_frac = feature_val * 1.0 / val_sum
        accumulated_frac = 0
        cut_ind = 0
        for frac in feature_frac:
            accumulated_frac += frac
            cut_ind += 1
            if accumulated_frac > 0.6:
                break
        idx_list = np.argsort(t_WW[i])[::-1][:cut_ind]
        program_list = program_name_array[0][idx_list]
        for j in range(0, cut_ind):
            file.write('%s,%s,%s,%s,%s,%s\n' %
                       (i, dev_num, household_num, feature_val[j],
                        feature_frac[j], program_list[j]))
        #file.write(' '.join(program_name_array[0][idx_list]))
        #file.write('\n')
    file.close()
    #income_analysis(dma, dev_id_list, cluster_num, membership)
    #child_present_analysis(dma, dev_id_list, cluster_num, membership)
    #age_analysis(dma, dev_id_list, cluster_num, membership)
    clusters_obj = all_clusters(dma, cluster_num, dev_id_list, membership)
    return clusters_obj
Example #16
def nmf(mat,latentFactorNum=50,tol=1e-8,max_iter=1000):
	model=ProjectedGradientNMF(n_components=latentFactorNum,init='nndsvd',tol=tol,max_iter=max_iter);
	print "nnmf start:",datetime.now();
	W=model.fit_transform(mat);
	H=model.components_;
	print "nnmf end:",datetime.now();
	return W,H;
Example #17
	print "nnmf end:",datetime.now();
	return W,H;
if __name__=="__main__":
	if len(argv)!=3:
		print "usage:",argv[0],"datafile_prefix threshold";
	else:
		t,users=load_index_map(argv[1]+".user");
		t,brands=load_index_map(argv[1]+".brand");
		clickMat=convert(argv[1]+".clk.lbm",len(users),len(brands));
		buyMat=convert(argv[1]+".buy.lbm",len(users),len(brands));
		testUCMat=convert("data/8.clk.lbm",len(users),len(brands)).todense();
		testUBMat=convert("data/8.clk.lbm",len(users),len(brands)).todense();
		model=ProjectedGradientNMF(n_components=50,init='nndsvd',tol=1e-8,max_iter=1000);
		print "nnmf start:",datetime.now();
		#W,H=model.fit_transform(clickMat);
		W=model.fit_transform(buyMat);
		H=model.components_;
		print "nnmf end:",datetime.now();
		Y=np.dot(W,H);   # prediction

		#cuMat=np.transpose(clickMat).todense();
		#cbMat=cuMat.dot(buyMat.todense());
		#buyPredict=np.dot(Y,cbMat);
		buyPredict=Y;
		#print "error=",norm(clickMat-Y);
		fout=open("/tmp/score","w");
		for i in range(len(users)):
			content=users[i];
			for j in range(len(brands)):
				if buyPredict[i,j]<1e-5:
					continue;
				content+="\t"+brands[j]+":"+str(buyPredict[i,j]);
Example #18
#answers_train, answers_test, cats_train, cats_test = train_test_split(answers, cats, test_size = 0.3)#, random_state=42)

# Word counts
count_vect = CountVectorizer(stop_words = 'english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
print("Fitting NMF on training word count matrix with shape" + str(answers_train.shape))
nmf = ProjectedGradientNMF(n_components = 100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Fit SVM classifier
print("Fitting SVM classifier on matrix with shape" + str(answers_train.shape))
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)

print("SVM train classification %: " + str(svc.score(answers_train, cats_train) * 100))
print("SVM test classification %: " + str(svc.score(answers_test, cats_test) * 100))
mc_label = Counter(cats_train).most_common(1)[0][0]
print("Best guess % = " + str( float(Counter(cats_test)[mc_label]) / len(cats_test) * 100))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
cats_pred = svc.predict(answers_test)
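The count, tf-idf, NMF, and LinearSVC steps could also be chained with scikit-learn's Pipeline, which keeps the fit/transform bookkeeping in one place; a minimal sketch, assuming answers_train/answers_test still hold the raw text (i.e. before the in-place reassignments above):

from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('counts', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('nmf', ProjectedGradientNMF(n_components=100, max_iter=200)),
    ('svc', svm.LinearSVC()),
])
clf.fit(answers_train, cats_train)
print("pipeline test %: " + str(clf.score(answers_test, cats_test) * 100))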
Example #19
    question_index = question_list.index(curr_q)
    user_index = expert_list.index(curr_u)

    matrix[question_index][user_index] = label
#print matrix

# In[57]:
print 'running model...'

model = ProjectedGradientNMF(n_components=50,
                             init='nndsvda',
                             random_state=0,
                             max_iter=300,
                             eta=0.01,
                             alpha=0.01)
W = model.fit_transform(matrix)
H = model.components_
rHat = np.dot(W, H)
print 'recon error: ', model.reconstruction_err_

#np.savetxt("rHat.txt",rHat)

#pickle.dump(question_list, 'qList.txt')
# np.savetxt("qList.txt",question_list)
#np.savetxt( user_list,"uList.txt")

# In[61]:

#matrix = pa.read_csv("rHat.txt")
#rHat = np.array(matrix)
Example #20
      metrics.adjusted_rand_score(labels, km.labels_))
print(
    "Silhouette Coefficient: %0.3f" %
    metrics.silhouette_score(tfidf_matrix_train, km.labels_, sample_size=1000))

print()

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

pgnmf_model = ProjectedGradientNMF(n_components=NUM_TOPICS)
pgnmf_z = pgnmf_model.fit_transform(data_vectorized)
print(pgnmf_z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
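To see what the NMF topics actually contain, the rows of components_ can be mapped back onto the vocabulary of whatever vectorizer produced data_vectorized; a sketch assuming that (not shown) vectorizer object is named vectorizer:

import numpy as np

feature_names = vectorizer.get_feature_names()  # vocabulary index -> term (vectorizer name is an assumption)
for topic_idx, topic in enumerate(nmf_model.components_):
    top = np.argsort(topic)[::-1][:10]  # indices of the 10 highest-weighted terms
    print("Topic %d: %s" % (topic_idx, ", ".join(feature_names[i] for i in top)))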
Example #21
def main():

    es_client = Elasticsearch(hosts = [{ "host" : "localhost", "port" : 9200 }])

    index_name = "slclusters"

    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index = index_name, ignore=[400, 404]))

    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index = index_name))


    import re
    rr=re.compile(r"[\w']+")
    tok=lambda a:rr.findall(a)

    ff1=open('../docker/syslog.csv').readlines()
    aa=[]
    for d in ff1:
        #print(d)
        try:
            aa.append(json.loads(d))
        except:
            continue
    print(len(aa))
    # ff='\n'.join(ff1)
    docs=[]
    other=[]
    # aa=json.loads(ff)
    #print(aa)

    for iii,row in enumerate(aa):
        if len(tok(row['syslog_message']))>3:
            doc={}
            doc['created_at']=datetime.strptime(row["@timestamp"], "%Y-%m-%dT%H:%M:%S.000Z")
            doc['text']=row['syslog_message']
            docs.append(doc['text'])
            other.append( doc['created_at'] )
            print(doc['text'])
            print(tok(doc['text']))
            print()
            if len(docs)>=100000:
                break


    cv=CountVectorizer(tokenizer=tok, max_df=0.5,min_df=5)



    # for iii,t in enumerate(tc):
    #     print(iii,t)
    #     if iii>100:
    #         break
    M=cv.fit_transform(docs).astype(np.float)
    M2=Normalizer(copy=False).fit_transform(M)

    km=KMeans(n_clusters=30, init='k-means++', max_iter=200, n_init=5,\
                verbose=True)

    km.fit_transform(M2)
    clusters=km.labels_

    sortInds=[i[0] for i in sorted(enumerate(clusters), key=lambda x:x[1])]

    nmf=ProjectedGradientNMF(n_components=30)
    M3=nmf.fit_transform(M2)
    print(M3.shape)

    tDict={}
    maxInd=0
    esDocs=[]
    for iii in sortInds:
        dd={}
        dd['message']=docs[iii]
        dd['cluster']=int(clusters[iii])

        c2=tuple(np.argsort(M3[iii,:])[-1:])
        if c2 in tDict:
            cc=tDict[c2]
        else:
            cc=maxInd
            tDict[c2]=maxInd
            maxInd=maxInd+1

        dd['cluster2']=cc
        dd['created_at']=other[iii]
        esDocs.append(dd)
        #print(clusters[iii],other[iii],other[iii])

    res = helpers.bulk(es_client, esDocs, index=index_name, doc_type='syslogmsg', refresh=True)
Example #22
feature_names = fr.columns

X = np.array(fr.astype(float))
'''for i in range(60):												#Test error as a function of number of topics

   model = ProjectedGradientNMF(n_components=i, init='nndsvda',random_state=0,max_iter=500)
   model.fit(X)

   print (i,model.reconstruction_err_);'''

model = ProjectedGradientNMF(n_components=11,
                             init='nndsvda',
                             random_state=0,
                             max_iter=500)  #Perform the NMF
Xtrans = model.fit_transform(X)

for topic_idx, topic in enumerate(
        model.components_
):  #Print the rubric items with strongest contribution in topics
    sorte = np.sort(topic)[::-1]
    sorteargs = np.argsort(topic)[::-1]
    i = 0
    print("Topic #%d:" % topic_idx)
    while (sorte[i] > 1.5
           ):  #Only show things where contribution is large (1.5 is arbitrary)
        print feature_names[sorteargs[i]], np.mean(
            np.transpose(X)[sorteargs[i]]) / ptvals[feature_names[
                sorteargs[i]]]
        i += 1
    sm = 0.
Example #23
import numpy as np
X = np.array([[1,1,2,3], [2, 1,4,5], [3, 2,4,5], [4, 1,2,1], [5, 4,3,1], [6, 1,4,3]])
from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)

print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#        n_components=2, nls_max_iter=2000, random_state=0, sparseness=None,
#        tol=0.0001)
print model.components_
#array([[ 0.77032744,  0.11118662],
#       [ 0.38526873,  0.38228063]])
print model.reconstruction_err_
#0.00746...

W = model.fit_transform(X);
H = model.components_;

print 'w: ' + str(W)
print 'h: ' + str(H)

model = ProjectedGradientNMF(n_components=2, sparseness='components', init='random', random_state=0)


print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#            n_components=2, nls_max_iter=2000, random_state=0,
#            sparseness='components', tol=0.0001)

print model.components_
#array([[ 1.67481991,  0.29614922],
Example #24
####THEIRS- not needed
# Example data matrix X

###MINE
X = DataFrame(matrix)
X_imputed = X.copy()
X = pa.DataFrame(matrix)# DataFrame(toy_vals, index = range(nrows), columns = range(ncols))
###use some way to mask only a few vals.... thst too either 0 or 1
msk = (X.values + np.random.randn(*X.shape) - X.values) < 0.8
X_imputed.values[~msk] = 0


##THEIRS

# Hiding values to test imputation
# Initializing model
nmf_model = ProjectedGradientNMF(n_components = 600, init='nndsvda', random_state=0,max_iter=300, eta=0.01, alpha = 0.01)
nmf_model.fit(X_imputed.values)

# iterate model
#while nmf_model.reconstruction_err_**2 > 10:
    #nmf_model = NMF( n_components = 600, init='nndsvda', random_state=0,max_iter=300, eta=0.01, alpha = 0.01)
W = nmf_model.fit_transform(X_imputed.values)
X_imputed.values[~msk] = W.dot(nmf_model.components_)[~msk]
print nmf_model.reconstruction_err_

H = nmf_model.components_
rHat = np.dot(W,H)
np.savetxt("rHat.txt" ,rHat) 
Example #25
def main():

    es_client = Elasticsearch(hosts = [{ "host" : "localhost", "port" : 9200 }])

    index_name = "twclusters"

    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index = index_name, ignore=[400, 404]))

    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index = index_name))


    from tokenizers import tokenize_nor,get_nor_stopwords
    tok=lambda a:tokenize_nor(a,get_nor_stopwords())

    docs=[]
    other=[]
    conn=sqlite3.connect('../data/tweets.sqlite')
    cur=conn.execute('select * from T')
    for iii,row in enumerate(cur):
        doc={}
        doc['_id']=row[3]
        doc['created_at']=datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S")
        doc['author_id']=row[1]
        doc['text']=row[4]
        doc['language']=row[5]
        if len(tok(doc['text']))>2:
            docs.append(doc['text'])
            other.append( (doc['created_at'],doc['author_id']) )
        if len(docs)>=100000:
            break

    cur.close()


    cv=CountVectorizer(tokenizer=tok, max_df=0.5,min_df=5)



    # for iii,t in enumerate(tc):
    #     print(iii,t)
    #     if iii>100:
    #         break
    M=cv.fit_transform(docs).astype(np.float)
    M2=Normalizer(copy=False).fit_transform(M)

    km=KMeans(n_clusters=20, init='k-means++', max_iter=200, n_init=5,\
                verbose=True)

    km.fit_transform(M2)
    clusters=km.labels_

    sortInds=[i[0] for i in sorted(enumerate(clusters), key=lambda x:x[1])]

    nmf=ProjectedGradientNMF(n_components=10)
    M3=nmf.fit_transform(M2)
    print(M3.shape)

    tDict={}
    maxInd=0
    esDocs=[]
    for iii in sortInds:
        dd={}
        dd['tweet']=docs[iii]
        dd['cluster']=int(clusters[iii])

        c2=tuple(np.argsort(M3[iii,:])[-2:])
        if c2 in tDict:
            cc=tDict[c2]
        else:
            cc=maxInd
            tDict[c2]=maxInd
            maxInd=maxInd+2

        dd['cluster2']=cc
        dd['created_at']=other[iii][0]
        dd['author_id']=other[iii][1]
        esDocs.append(dd)
        #print(clusters[iii],other[iii],other[iii])

    res = helpers.bulk(es_client, esDocs, index=index_name, doc_type='tweet', refresh=True)
Example #26
__author__ = 'juliewe'

import numpy as np
#import sklearn.decomposition.NMF as NMF
#implements C.J.Lin's projected gradient methods for NMF

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])  #n*d
from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
w = model.fit_transform(X)  #left factor w (n*k)
h = model.components_  #right factor h (k*d)

print w
print h
v = np.dot(w, h)
print v
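The reconstruction quality can be checked directly: model.reconstruction_err_ is the Frobenius norm of the residual X - w·h, which numpy confirms in one line:

print np.linalg.norm(X - v), model.reconstruction_err_  # both give the Frobenius residual norm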
Example #27
__author__ = 'juliewe'


import numpy as np
#import sklearn.decomposition.NMF as NMF
#implements C.J.Lin's projected gradient methods for NMF

X = np.array([[1,1],[2,1],[3,1.2],[4,1],[5,0.8],[6,1]])  #n*d
from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2,init='random',random_state=0)
w= model.fit_transform(X)  #left factor w (n*k)
h= model.components_ #right factor h (k*d)

print w
print h
v = np.dot(w,h)
print v



Example #28
    parser = argparse.ArgumentParser(description='Compute Non-negative Matrix Factorization')
    parser.add_argument('data_matrix', help='path to data file, should be readable by numpy')
    parser.add_argument('k', type=int, help='number of components to keep')
    parser.add_argument('feature_list', help='path to file containing list of feature names')
    parser.add_argument('index_file', help='path to array_index for this dataset')
    
    args = vars(parser.parse_args())
    data = np.loadtxt(args['data_matrix'])
    k = args['k']
    with open(args['feature_list']) as f:
        feature_list = map(str.rstrip, f.readlines())
    indexes = np.loadtxt(args['index_file'])

    model = ProjectedGradientNMF(n_components=k, init='random', random_state=0)
    H = model.fit_transform(data) # H is submissions(row) by factors(cols)
    W = model.components_    # W is factors(row) by features(cols)
    magnitude = np.prod([np.sum(H, axis = 0), np.sum(W, axis = 1)], axis = 0)

    savetxt_3d(np.array(sort_by_row(W))[:, 0:20, :], 'nmf/factors_and_sorted_features.np', "factor")
    show_feature_name('nmf/factors_and_sorted_features.np', feature_list)

    subs_and_sorted_factors = sort_by_row(H)
    for sub in subs_and_sorted_factors:
        for factor in sub:
            factor[0] += 1
    savetxt_3d(subs_and_sorted_factors, 'nmf/subs_and_sorted_factors.np', "submission")

    print "\n-------------- pattern of dominating factors ----------------\n"

    pattern = []