def fit(self, trainSamples, trainTargets):
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    #print 'train user:' + str(self.dataModel.getUsersNum())
    V = self.dataModel.getData()
    model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000,
                                 nls_max_iter=1000)
    self.pu = model.fit_transform(V)
    # components_ is already available after fit_transform; refitting on V
    # would just repeat the factorization.
    self.qi = model.components_.transpose()
def get_cluster_membership(self):
    """ Determine the cluster number that each sample is associated with. """
    model = ProjectedGradientNMF(n_components=self._num_clusters, init='random',
                                 beta=.3, eta=.5, max_iter=5000)
    w = model.fit_transform(self._matrix)
    h = model.components_

    # Convert the 'H' matrix, which holds the per-cluster weight of each
    # sample, into an array representing cluster membership: the row index
    # of the biggest value in each column of H is that sample's cluster.
    clusters = []
    model_width = len(h[0])
    for col_idx in range(model_width):
        max_val = dict()
        for row_idx in range(self._num_clusters):
            h_val = h[row_idx][col_idx]
            if not max_val or h_val > max_val['val']:
                max_val = {'row_idx': row_idx, 'val': h_val}
        clusters.append(max_val['row_idx'])

    # clusters array, w, h
    return (clusters, w, h)
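# The same membership computation can be written directly with NumPy; a
# minimal sketch, assuming `h` is the components_ matrix from the fit above:
#
#     import numpy as np
#     clusters = np.argmax(h, axis=0).tolist()  # argmax down each column of H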
def nmfModel(matrix, nTopics):
    t = time()
    print "Starting Factorization"
    nmf = ProjectedGradientNMF(nTopics, max_iter=220, sparseness='data',
                               init='nndsvd')
    W = nmf.fit_transform(matrix)
    H = nmf.components_
    print "Factorization took %s minutes" % (round((time() - t) / 60., 2))
    return W, H, nmf
def nmf(self, k):
    nmf = ProjectedGradientNMF(n_components=k, max_iter=200)
    P = nmf.fit_transform(self.tdm)
    Q = nmf.components_.T
    self.P = P
    self.Q = Q
    self.er = nmf.reconstruction_err_
    #print "\tError: ", self.er
    return P, Q
def calcNMF(delta_data, components):
    data = preprocess(delta_data)
    nmf = ProjectedGradientNMF(n_components=components)
    x_nmf = nmf.fit_transform(data['cleanMatrix'])
    # Re-expand to the original number of rows, leaving NaN where rows were
    # dropped during preprocessing.
    nmf_fill = np.ones((delta_data.shape[0], components)) * np.nan
    nmf_fill[data['cleanind']] = x_nmf
    nmf_weights = nmf.components_.T
    delta_nmf = {'transform': nmf_fill,
                 'weights': nmf_weights,
                 }
    return delta_nmf
def __nmf_initialization(A, ncomms):
    try:
        from sklearn.decomposition import ProjectedGradientNMF
    except ImportError:
        print("sklearn module is missing.")
        return
    model = ProjectedGradientNMF(n_components=ncomms, init='nndsvd')
    Uin = np.asmatrix(model.fit_transform(A))
    Vin = np.asmatrix(model.components_)
    Vin = Vin.T
    init_dict = {'U': Uin, 'V': Vin}
    return init_dict
def reducedim_nmf(self, factors):
    print "Number of factors is " + str(factors)
    model = ProjectedGradientNMF(n_components=factors, init='random',
                                 random_state=0)
    self.reducedmatrix = model.fit_transform(self.fullmatrix)  # left factor w (n*k)
    h = model.components_  # right factor h (k*d)
    if self.testing:
        print self.fullmatrix
        print self.reducedmatrix
        print h
        v = numpy.dot(self.reducedmatrix, h)
        print v
    print "Completed NMF routine"
    for vector in self.vectordict.values():
        vector.array = sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
    print "Stored individual vectors"
def train_model(self):
    print 'begin'
    RATE_MATRIX = np.zeros((9238, 7973))
    for line in self.train.values:
        #print line
        uid = int(float(line[1]))
        iid = int(float(line[2]))
        RATE_MATRIX[uid][iid] = int(float(line[3]))
    V = spr.csr_matrix(RATE_MATRIX)
    model = ProjectedGradientNMF(n_components=self.n_features, max_iter=1000,
                                 nls_max_iter=10000)
    self.pu = model.fit_transform(V)
    # components_ is available after fit_transform; refitting is unnecessary.
    self.qi = model.components_.transpose()
    print model.reconstruction_err_
    self.ValidateF1()
    t = pd.DataFrame(np.array(self.pu))
    t.to_csv('50pu')
    t = pd.DataFrame(np.array(self.qi))
    t.to_csv('50qi')
    print("model generation over")
def recommend(matrix_3filled, matrix_raw, user, numOfNeighbors=5):
    # The following lines use scikit-learn. For more information, refer to the
    # documentation link in README.
    model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
    # transformed is the low-rank reconstruction from non-negative matrix
    # factorization; the recommendations are computed from it.
    transformed = np.dot(model.fit_transform(matrix_3filled), model.components_)
    neighbors = []
    # Calculate distances from the current user to every other user.
    distances = np.sum((transformed - transformed[user]) ** 2, axis=1)
    # Find the nearest neighbors, masking out the current minimum (initially
    # the user itself) before each pick.
    for x in xrange(numOfNeighbors):
        distances[np.argmin(distances)] = sys.float_info.max
        neighbors.append(np.argmin(distances))
    # Average the neighbors' rows; average[i] is the mean rating for humor i.
    # (Use an ndarray so += is element-wise instead of list extension.)
    average = np.zeros(transformed.shape[1])
    for x in xrange(numOfNeighbors):
        average += transformed[neighbors[x]]
    average = average / numOfNeighbors
    # Find the unrated items for the current user.
    unratedItems = []
    for x in xrange(np.shape(matrix_raw)[1]):
        if matrix_raw[user][x] == 0:
            unratedItems.append(x)
    # If there are no unrated items, just return the item with the highest
    # average rating.
    if len(unratedItems) == 0:
        item = np.argmax(average)
        return item
    # Else, return the unrated item with the highest average rating.
    else:
        maxAverage = 0
        item = np.argmax(average)
        for x in xrange(len(unratedItems)):
            if average[unratedItems[x]] > maxAverage:
                maxAverage = average[unratedItems[x]]
                item = unratedItems[x]
        return item
def decomposition(V, W, H, n_components, solver='mu', update_H=True):
    if solver != 'project':
        W, H, _ = non_negative_factorization(V, W=W, H=H,
                                             n_components=n_components,
                                             update_H=update_H, max_iter=1000,
                                             solver=solver)
        #regularization='transformation', l1_ratio=0.1)
    else:
        model = ProjectedGradientNMF(n_components=n_components, init='random',
                                     random_state=0, sparseness='data',
                                     beta=0, max_iter=100000)
        # fit_transform both fits the model and returns W; a separate fit()
        # call would repeat the work.
        W = model.fit_transform(V)
        H = model.components_
    return W, H
class NMFpredictor(Predictor):
    def __init__(self, model, beta=1, eta=0.1, init='nndsvd', max_iter=500,
                 n_components=100, nls_max_iter=2000, random_state=0,
                 sparseness=None, tol=0.0001):
        self.check_non_negtive(model)
        self.model = model
        super(NMFpredictor, self).__init__()
        self.nmf = ProjectedGradientNMF(beta=beta, eta=eta, init=init,
                                        max_iter=max_iter,
                                        n_components=n_components,
                                        nls_max_iter=nls_max_iter,
                                        random_state=random_state,
                                        sparseness=sparseness, tol=tol)
        self.user_latent_M, self.item_latent_M = self.construct_latent_matrics()

    def construct_latent_matrics(self):
        start = time.time()
        data_matrix = self.model.get_data_matrix()
        user_latent_M = self.nmf.fit_transform(data_matrix)
        item_latent_M = self.nmf.components_
        print "use time: ", time.time() - start
        return user_latent_M, item_latent_M

    def predict(self, user_id, item_id):
        user_no = self.model.user_id_to_no[user_id]
        item_no = self.model.item_id_to_no[item_id]
        pref = np.dot(self.user_latent_M[user_no, :],
                      self.item_latent_M[:, item_no])
        # Clamp the prediction to the observed preference range.
        if pref > self.model.max_pref:
            pref = self.model.max_pref
        if pref < self.model.min_pref:
            pref = self.model.min_pref
        return pref

    def check_non_negtive(self, model):
        if model.min_pref < 0:
            raise NotImplementedError("NMF requires non-negative preferences!")
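# Minimal usage sketch for NMFpredictor. `model` here is a hypothetical data
# model exposing get_data_matrix(), user_id_to_no, item_id_to_no, and
# min_pref/max_pref, as the class above assumes:
#
#     predictor = NMFpredictor(model, n_components=50)
#     print predictor.predict(user_id, item_id)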
import numpy
from pymongo import MongoClient
from sklearn.decomposition import ProjectedGradientNMF

client = MongoClient('mongodb://localhost:27017/')
mydb = client['movie_database']

movies = mydb.movies.find()
i = 1
for movie in movies:
    print str(i) + " >> " + movie.get("title") + "--" + movie.get("_id")
    i = i + 1

users = mydb.users.find()
i = 1
for user in users:
    print str(i) + " >>" + user.get("_id") + "--" + user.get("password")
    i = i + 1

activities = mydb.activity.find()
i = 1
for activity in activities:
    print str(i) + " >>" + str(activity)
    i = i + 1

A = numpy.random.uniform(size=[40, 30])
nmf_model = ProjectedGradientNMF(n_components=5, init='random', random_state=0)
W = nmf_model.fit_transform(A)
H = nmf_model.components_
print W
print H
def filter_1sigma_nmf_new(dma, iter_date, df, header_df):
    print 'Get the 1-sigma filtered data'
    print df.shape[1]
    idx_vt = df.shape[1] - 1
    mean_viewtime = df[idx_vt].mean()
    std_viewtime = df[idx_vt].std()
    print mean_viewtime / 3600.0, std_viewtime / 3600.0

    reduced_df = df[(df[idx_vt] >= LOW_LIMIT) &
                    (df[idx_vt] <= HIGH_LIMIT)].reset_index()
    print reduced_df.shape
    # Normalize each program column by the device's total viewtime.
    reduced_df[range(1, idx_vt)] = reduced_df[range(1, idx_vt)].div(
        1.0 * reduced_df[idx_vt], 'index')
    dev_id_list = reduced_df[0]
    reduced_df_vsum = reduced_df[range(1, idx_vt)].sum()
    reduced_df_vsum = reduced_df_vsum[reduced_df_vsum > 0.00]
    idx_list = reduced_df_vsum.index.tolist()
    reduced_df_1 = reduced_df[range(1, idx_vt)][reduced_df_vsum.index.tolist()]
    # Select the header accordingly
    reduced_header_df = header_df[idx_list]
    #program_viewtime_array = np.array(reduced_df[range(1,idx_vt)].astype(np.float))
    program_viewtime_array = np.array(reduced_df_1.astype(np.float))
    program_name_array = np.array(reduced_header_df)
    t_program_viewtime_array = program_viewtime_array.transpose()

    cluster_num = 14
    # Non-negative Matrix Factorization
    model = ProjectedGradientNMF(n_components=cluster_num, sparseness='data',
                                 init='nndsvd', max_iter=400, random_state=0)
    WW = model.fit_transform(t_program_viewtime_array)
    t_WW = WW.transpose()
    HH = model.components_
    t_HH = HH.transpose()
    #print t_HH.shape
    #print pd.DataFrame(t_HH).head()

    # Assign the membership: each device goes to the component with the
    # largest weight.
    membership = [-1 for item in range(0, t_HH.shape[0])]
    for i in range(0, t_HH.shape[0]):
        membership[i] = np.argmax(t_HH[i])

    dd = reduced_header_df
    print dd.shape
    print program_name_array.shape
    print program_viewtime_array.shape

    fout = open('decompose_results_clusters_%s_%s_%s.csv' %
                (iter_date.month, iter_date.day, dma), 'w')
    fout.write('Cluster_id,Dev_num,Household_num,Feature_val,'
               'Feature_fraction,Program_name\n')
    fout.write('-1,%s,%s,,,\n' %
               (len(dev_id_list), get_household_num(dma, dev_id_list.tolist())))
    cluster_num = t_WW.shape[0]
    for i in range(0, cluster_num):
        dev_indices = [index for index, v in enumerate(membership) if v == i]
        dev_in_cluster = dev_id_list[dev_indices]
        dev_num = len(dev_in_cluster)
        household_num = get_household_num(dma, dev_in_cluster.tolist())
        #print heapq.nlargest(10,t_WW[i])
        feature_val = np.sort(t_WW[i])
        feature_val = feature_val[::-1]
        #print 't_WW:',t_WW[i]
        #print 'sorted t_WW:',feature_val
        val_sum = np.sum(feature_val)
        feature_frac = feature_val * 1.0 / val_sum
        # Keep the top programs that together account for 60% of the weight.
        accumulated_frac = 0
        cut_ind = 0
        for frac in feature_frac:
            accumulated_frac += frac
            cut_ind += 1
            if accumulated_frac > 0.6:
                break
        idx_list = np.argsort(t_WW[i])[::-1][:cut_ind]
        program_list = program_name_array[0][idx_list]
        for j in range(0, cut_ind):
            fout.write('%s,%s,%s,%s,%s,%s\n' %
                       (i, dev_num, household_num, feature_val[j],
                        feature_frac[j], program_list[j]))
        #fout.write(' '.join(program_name_array[0][idx_list]))
        #fout.write('\n')
    fout.close()
    #income_analysis(dma, dev_id_list, cluster_num, membership)
    #child_present_analysis(dma, dev_id_list, cluster_num, membership)
    #age_analysis(dma, dev_id_list, cluster_num, membership)
    clusters_obj = all_clusters(dma, cluster_num, dev_id_list, membership)
    return clusters_obj
def nmf(mat, latentFactorNum=50, tol=1e-8, max_iter=1000):
    model = ProjectedGradientNMF(n_components=latentFactorNum, init='nndsvd',
                                 tol=tol, max_iter=max_iter)
    print "nnmf start:", datetime.now()
    # fit_transform returns only W; H comes from components_.
    W = model.fit_transform(mat)
    H = model.components_
    print "nnmf end:", datetime.now()
    return W, H
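# Usage sketch with synthetic data (hypothetical shapes):
#
#     import numpy as np
#     W, H = nmf(np.random.rand(200, 50), latentFactorNum=10)
#     approx = np.dot(W, H)  # rank-10 reconstruction of the input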
print "nnmf end:",datetime.now(); return W,H; if __name__=="__main__": if len(argv)!=3: print "usage:",argv[0],"datafile_prefix threshold"; else: t,users=load_index_map(argv[1]+".user"); t,brands=load_index_map(argv[1]+".brand"); clickMat=convert(argv[1]+".clk.lbm",len(users),len(brands)); buyMat=convert(argv[1]+".buy.lbm",len(users),len(brands)); testUCMat=convert("data/8.clk.lbm",len(users),len(brands)).todense(); testUBMat=convert("data/8.clk.lbm",len(users),len(brands)).todense(); model=ProjectedGradientNMF(n_components=50,init='nndsvd',tol=1e-8,max_iter=1000); print "nnmf start:",datetime.now(); #W,H=model.fit_transform(clickMat); W,H=model.fit_transform(buyMat); print "nnmf end:",datetime.now(); Y=np.dot(W,H); # prediction #cuMat=np.transpose(clickMat).todense(); #cbMat=cuMat.dot(buyMat.todense()); #buyPredict=np.dot(Y,cbMat); buyPredict=Y; #print "error=",norm(clickMat-Y); fout=open("/tmp/score","w"); for i in range(len(users)): content=users[i]; for j in range(len(brands)): if buyPredict[i,j]<1e-5: continue; content+="\t"+brands[j]+":"+str(buyPredict[i,j]);
#answers_train, answers_test, cats_train, cats_test = train_test_split(answers, cats, test_size = 0.3)#, random_state=42)

# Word counts
count_vect = CountVectorizer(stop_words='english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
print("Fitting NMF on training word count matrix with shape " +
      str(answers_train.shape))
nmf = ProjectedGradientNMF(n_components=100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Fit SVM classifier
print("Fitting SVM classifier on matrix with shape " + str(answers_train.shape))
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)
print("SVM train classification %: " +
      str(svc.score(answers_train, cats_train) * 100))
print("SVM test classification %: " +
      str(svc.score(answers_test, cats_test) * 100))
mc_label = Counter(cats_train).most_common(1)[0][0]
print("Best guess % = " +
      str(float(Counter(cats_test)[mc_label]) / len(cats_test) * 100))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
cats_pred = svc.predict(answers_test)
question_index = question_list.index(curr_q)
user_index = expert_list.index(curr_u)
matrix[question_index][user_index] = label
#print matrix

# In[57]:

print 'running model...'
# Note: alpha is not a ProjectedGradientNMF parameter (it belongs to the
# newer NMF estimator), so it is omitted.
model = ProjectedGradientNMF(n_components=50, init='nndsvda', random_state=0,
                             max_iter=300, eta=0.01)
W = model.fit_transform(matrix)
H = model.components_
rHat = np.dot(W, H)
print 'recon error: ', model.reconstruction_err_
#np.savetxt("rHat.txt",rHat)
#pickle.dump(question_list, 'qList.txt')
# np.savetxt("qList.txt",question_list)
#np.savetxt( user_list,"uList.txt")

# In[61]:

#matrix = pa.read_csv("rHat.txt")
#rHat = np.array(matrix)
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(tfidf_matrix_train, km.labels_,
                                 sample_size=1000))
print()

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10,
                                      learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Projected Gradient NMF Model
pgnmf_model = ProjectedGradientNMF(n_components=NUM_TOPICS)
pgnmf_z = pgnmf_model.fit_transform(data_vectorized)
print(pgnmf_z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# See what the first document in the corpus looks like in the different
# topic spaces
print(lda_Z[0])
print(nmf_Z[0])
def main():
    es_client = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    index_name = "slclusters"
    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index=index_name, ignore=[400, 404]))
    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index=index_name))

    import re
    rr = re.compile(r"[\w']+")
    tok = lambda a: rr.findall(a)

    ff1 = open('../docker/syslog.csv').readlines()
    aa = []
    for d in ff1:
        #print(d)
        try:
            aa.append(json.loads(d))
        except:
            continue
    print(len(aa))
    # ff='\n'.join(ff1)

    docs = []
    other = []
    # aa=json.loads(ff)
    #print(aa)
    for iii, row in enumerate(aa):
        if len(tok(row['syslog_message'])) > 3:
            doc = {}
            doc['created_at'] = datetime.strptime(row["@timestamp"],
                                                  "%Y-%m-%dT%H:%M:%S.000Z")
            doc['text'] = row['syslog_message']
            docs.append(doc['text'])
            other.append(doc['created_at'])
            print(doc['text'])
            print(tok(doc['text']))
            print()
        if len(docs) >= 100000:
            break

    cv = CountVectorizer(tokenizer=tok, max_df=0.5, min_df=5)
    M = cv.fit_transform(docs).astype(np.float)
    M2 = Normalizer(copy=False).fit_transform(M)

    km = KMeans(n_clusters=30, init='k-means++', max_iter=200, n_init=5,
                verbose=True)
    km.fit_transform(M2)
    clusters = km.labels_
    sortInds = [i[0] for i in sorted(enumerate(clusters), key=lambda x: x[1])]

    nmf = ProjectedGradientNMF(n_components=30)
    M3 = nmf.fit_transform(M2)
    print(M3.shape)

    # Map each document's strongest NMF component to a secondary cluster id.
    tDict = {}
    maxInd = 0
    esDocs = []
    for iii in sortInds:
        dd = {}
        dd['message'] = docs[iii]
        dd['cluster'] = int(clusters[iii])
        c2 = tuple(np.argsort(M3[iii, :])[-1:])
        if c2 in tDict:
            cc = tDict[c2]
        else:
            cc = maxInd
            tDict[c2] = maxInd
            maxInd = maxInd + 1
        dd['cluster2'] = cc
        dd['created_at'] = other[iii]
        esDocs.append(dd)
        #print(clusters[iii],other[iii],other[iii])

    res = helpers.bulk(es_client, esDocs, index=index_name,
                       doc_type='syslogmsg', refresh=True)
feature_names = fr.columns
X = np.array(fr.astype(float))

'''for i in range(60):  # Test error as a function of number of topics
    model = ProjectedGradientNMF(n_components=i, init='nndsvda', random_state=0, max_iter=500)
    model.fit(X)
    print (i, model.reconstruction_err_)'''

# Perform the NMF
model = ProjectedGradientNMF(n_components=11, init='nndsvda', random_state=0,
                             max_iter=500)
Xtrans = model.fit_transform(X)

# Print the rubric items with the strongest contribution in each topic
for topic_idx, topic in enumerate(model.components_):
    sorte = np.sort(topic)[::-1]
    sorteargs = np.argsort(topic)[::-1]
    i = 0
    print("Topic #%d:" % topic_idx)
    # Only show items whose contribution is large (1.5 is arbitrary); also
    # guard against running off the end of the array.
    while i < len(sorte) and sorte[i] > 1.5:
        print feature_names[sorteargs[i]], np.mean(
            np.transpose(X)[sorteargs[i]]) / ptvals[feature_names[sorteargs[i]]]
        i += 1

sm = 0.
import numpy as np

X = np.array([[1, 1, 2, 3],
              [2, 1, 4, 5],
              [3, 2, 4, 5],
              [4, 1, 2, 1],
              [5, 4, 3, 1],
              [6, 1, 4, 3]])

from sklearn.decomposition import ProjectedGradientNMF

model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#                     n_components=2, nls_max_iter=2000, random_state=0,
#                     sparseness=None, tol=0.0001)
print model.components_
#array([[ 0.77032744,  0.11118662],
#       [ 0.38526873,  0.38228063]])
print model.reconstruction_err_
#0.00746...
W = model.fit_transform(X)
H = model.components_
print 'w: ' + str(W)
print 'h: ' + str(H)

model = ProjectedGradientNMF(n_components=2, sparseness='components',
                             init='random', random_state=0)
print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#                     n_components=2, nls_max_iter=2000, random_state=0,
#                     sparseness='components', tol=0.0001)
print model.components_
#array([[ 1.67481991,  0.29614922],
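# Note: ProjectedGradientNMF was deprecated in scikit-learn 0.17 and removed
# in 0.19. A rough modern equivalent of the example above (a sketch, assuming
# a recent scikit-learn) uses the NMF estimator instead:
#
#     from sklearn.decomposition import NMF
#     model = NMF(n_components=2, init='random', random_state=0)
#     W = model.fit_transform(X)
#     H = model.components_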
####THEIRS- not needed
# Example data matrix X

###MINE
X = pa.DataFrame(matrix)  # DataFrame(toy_vals, index = range(nrows), columns = range(ncols))
X_imputed = X.copy()

### Mask a few values to hide for the imputation test (either 0 or 1)
msk = (X.values + np.random.randn(*X.shape) - X.values) < 0.8
X_imputed.values[~msk] = 0

##THEIRS
# Hiding values to test imputation
# Initializing model
# Note: alpha is not a ProjectedGradientNMF parameter (it belongs to the
# newer NMF estimator), so it is omitted.
nmf_model = ProjectedGradientNMF(n_components=600, init='nndsvda',
                                 random_state=0, max_iter=300, eta=0.01)
nmf_model.fit(X_imputed.values)

# iterate model
#while nmf_model.reconstruction_err_**2 > 10:
#    nmf_model = NMF(n_components=600, init='nndsvda', random_state=0, max_iter=300, eta=0.01, alpha=0.01)
W = nmf_model.fit_transform(X_imputed.values)
X_imputed.values[~msk] = W.dot(nmf_model.components_)[~msk]
print nmf_model.reconstruction_err_

H = nmf_model.components_
rHat = np.dot(W, H)
np.savetxt("rHat.txt", rHat)
def main():
    es_client = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    index_name = "twclusters"
    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index=index_name, ignore=[400, 404]))
    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index=index_name))

    from tokenizers import tokenize_nor, get_nor_stopwords
    tok = lambda a: tokenize_nor(a, get_nor_stopwords())

    docs = []
    other = []
    conn = sqlite3.connect('../data/tweets.sqlite')
    cur = conn.execute('select * from T')
    for iii, row in enumerate(cur):
        doc = {}
        doc['_id'] = row[3]
        doc['created_at'] = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S")
        doc['author_id'] = row[1]
        doc['text'] = row[4]
        doc['language'] = row[5]
        if len(tok(doc['text'])) > 2:
            docs.append(doc['text'])
            other.append((doc['created_at'], doc['author_id']))
        if len(docs) >= 100000:
            break
    cur.close()

    cv = CountVectorizer(tokenizer=tok, max_df=0.5, min_df=5)
    M = cv.fit_transform(docs).astype(np.float)
    M2 = Normalizer(copy=False).fit_transform(M)

    km = KMeans(n_clusters=20, init='k-means++', max_iter=200, n_init=5,
                verbose=True)
    km.fit_transform(M2)
    clusters = km.labels_
    sortInds = [i[0] for i in sorted(enumerate(clusters), key=lambda x: x[1])]

    nmf = ProjectedGradientNMF(n_components=10)
    M3 = nmf.fit_transform(M2)
    print(M3.shape)

    # Map each document's two strongest NMF components to a secondary
    # cluster id.
    tDict = {}
    maxInd = 0
    esDocs = []
    for iii in sortInds:
        dd = {}
        dd['tweet'] = docs[iii]
        dd['cluster'] = int(clusters[iii])
        c2 = tuple(np.argsort(M3[iii, :])[-2:])
        if c2 in tDict:
            cc = tDict[c2]
        else:
            cc = maxInd
            tDict[c2] = maxInd
            maxInd = maxInd + 2
        dd['cluster2'] = cc
        # other holds (created_at, author_id) tuples; index accordingly.
        dd['created_at'] = other[iii][0]
        dd['author_id'] = other[iii][1]
        esDocs.append(dd)
        #print(clusters[iii],other[iii],other[iii])

    res = helpers.bulk(es_client, esDocs, index=index_name, doc_type='tweet',
                       refresh=True)
__author__ = 'juliewe'

import numpy as np
#import sklearn.decomposition.NMF as NMF
# Implements C.J. Lin's projected gradient methods for NMF

X = np.array([[1, 1],
              [2, 1],
              [3, 1.2],
              [4, 1],
              [5, 0.8],
              [6, 1]])  # n*d

from sklearn.decomposition import ProjectedGradientNMF

model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
w = model.fit_transform(X)  # left factor w (n*k)
h = model.components_  # right factor h (k*d)
print w
print h
v = np.dot(w, h)  # reconstruction of X from the two factors
print v
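# Sanity-check sketch: the reconstruction residual should be small if the
# factorization converged.
#
#     print np.abs(v - X).max()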
parser = argparse.ArgumentParser(
    description='Compute Non-negative Matrix Factorization')
parser.add_argument('data_matrix',
                    help='path to data file, should be readable by numpy')
parser.add_argument('k', type=int, help='number of components to keep')
parser.add_argument('feature_list',
                    help='path to file containing list of feature names')
parser.add_argument('index_file', help='path to array_index for this dataset')
args = vars(parser.parse_args())

data = np.loadtxt(args['data_matrix'])
k = args['k']
with open(args['feature_list']) as f:
    feature_list = map(str.rstrip, f.readlines())
indexes = np.loadtxt(args['index_file'])

model = ProjectedGradientNMF(n_components=k, init='random', random_state=0)
H = model.fit_transform(data)  # H is submissions(rows) by factors(cols)
W = model.components_  # W is factors(rows) by features(cols)
magnitude = np.prod([np.sum(H, axis=0), np.sum(W, axis=1)], axis=0)

savetxt_3d(np.array(sort_by_row(W))[:, 0:20, :],
           'nmf/factors_and_sorted_features.np', "factor")
show_feature_name('nmf/factors_and_sorted_features.np', feature_list)

subs_and_sorted_factors = sort_by_row(H)
for sub in subs_and_sorted_factors:
    for factor in sub:
        factor[0] += 1  # switch to 1-based factor ids
savetxt_3d(subs_and_sorted_factors, 'nmf/subs_and_sorted_factors.np',
           "submission")

print "\n-------------- pattern of dominating factors ----------------\n"
pattern = []