def return_doc_stats(doc_coll):
    """Return the number of documents and the vocabulary size of a collection.

    The file at ``doc_coll`` is read as comma-separated text. Every row counts
    as one document, and every field is expected to look like ``index:...``
    (presumably ``word_index:count`` -- only the part before the first colon
    is used here).

    Args:
        doc_coll: Path of the document-collection file.

    Returns:
        A ``(n_docs, vocab_size)`` tuple, where ``n_docs`` is the number of
        rows read and ``vocab_size`` is the largest index seen (0 if none).

    Raises:
        ValueError: If a non-empty field does not start with an integer index.
    """
    n_docs = 0
    vocab_size = 0
    with open(doc_coll) as csv_file:
        for row in csv.reader(csv_file, delimiter=","):
            for idx_ctn in row:
                # A trailing comma produces an empty field; it carries no
                # index, so skip it instead of crashing in int('').
                if not idx_ctn.strip():
                    continue
                # Parse the index once and reuse it for compare and update.
                word_idx = int(idx_ctn.split(":")[0])
                if word_idx > vocab_size:
                    vocab_size = word_idx
            n_docs += 1
    return n_docs, vocab_size


if __name__ == '__main__':
    # Factorise the document matrix and plot the objective per iteration.
    X = create_data_matrix("data/nyt_data.txt")
    model = NMF(X, 25, 100)
    results = model.get_objective()

    fig = plt.figure()
    axes = fig.add_subplot(1, 1, 1)
    # x axis runs 1..len(results), one point per recorded objective value
    axes.plot(np.linspace(1, len(results), len(results)), results)
    axes.set_xlabel("iteration number")
    axes.set_ylabel("Objective function value")
    axes.set_title("Objective function per iteration")
    plt.show()
k = int(sys.argv[4])  # number of communities to detect
output = sys.argv[5]  # output filename
metrics = dict()

# Load the graph from the two input files and record how long that took.
start = time.time()
graph = Util.Graph(ufile, cfile)
metrics["readTime"] = time.time() - start

# main algorithm: run the selected community-detection method
start = time.time()
if algorithm == 'bigclam':
    trainComm = bigClam(graph, k)
elif algorithm == 'nmf':
    trainComm = NMF(graph, k)
elif algorithm == 'lc':
    trainComm = LC(graph, k)
elif algorithm == 'cpm':
    trainComm = CPM(graph, k)
else:
    print("invalid algorithm")
    # The bare exit() helper is injected by the site module for interactive
    # sessions; sys.exit() is the supported way to terminate a program.
    sys.exit(-1)
metrics["execTime"] = time.time() - start
realComm = graph.community
print(trainComm)

# evaluation metrics: compare detected communities with the reference ones
metrics["f1score"] = Util.f1score(realComm, trainComm)
metrics["omgIdx"] = Util.omegaIndex(realComm, trainComm)
metrics["accuracy"] = Util.accuracy(realComm, trainComm)
graph = Util.Graph(ufile, cfile)
epsilon = 10**(-8)  # background edge probability in sec. 4
delta = np.sqrt(epsilon)  # threshold to determine user-community edge
# delta = np.sqrt(2.0 * graph.m / graph.n / graph.n)
metrics["algorithm"] = algorithm
metrics["readTime"] = time.time() - start
realComm = graph.community

# average number of communities per user
avgnum = Util.avgCommNum(realComm)
print("Average community per user:{}".format(avgnum))

# main algorithm: run the selected community-detection method
start = time.time()
if algorithm == 'bigclam':
    trainComm = bigClam(graph, realComm, k, delta)
elif algorithm == 'nmf':
    trainComm = NMF(graph, k)
elif algorithm == 'lc':
    trainComm = LC(graph, k)
elif algorithm == 'cpm':
    trainComm = CPM(graph, k)
else:
    print("invalid algorithm")
    # The bare exit() helper is injected by the site module for interactive
    # sessions; raising SystemExit directly has the same effect without
    # depending on it.
    raise SystemExit(-1)
metrics["execTime"] = time.time() - start

# Coerce community ids and member ids to plain ints. The loop variables are
# named so they do not reuse `k`, which holds the community count.
trainComm = {
    int(comm): [int(member) for member in members]
    for comm, members in trainComm.items()
}
print(trainComm)

# evaluation metrics
metric_time = time.time()
metrics["f1score"] = Util.f1score(realComm, trainComm)
import pandas as pd
import matplotlib.pyplot as plt
import time
from NMF import NMF
from utils import plot_gallery

# Load the tab-separated matrix (no header row) and transpose it.
data = pd.read_csv('data.txt', sep='\t', header=None).values.T
print(data.shape)

t = time.time()
W, H = NMF(data, 8, 100, 1e-4)
# Take the elapsed time right after the factorisation so the reported
# "Train time" does not include the printing below.
train_time = time.time() - t
print(H.shape, W.shape)

plot_gallery('%s - Train time %.1fs' % ('Non-negative components - NMF', train_time),
             W.T, 4, 2, (64, 64))
plt.show()
for i, comp in enumerate(images): plt.subplot(n_row, n_col, i + 1) vmax = max(comp.max(), -comp.min()) plt.imshow(comp.reshape(image_shape), cmap=cmap, interpolation='nearest', vmin=-vmax, vmax=vmax) plt.xticks(()) plt.yticks(()) plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) # ############################################################################# # Plot a sample of the input data plot_gallery("First centered Olivetti faces", faces[:n_components]) # ############################################################################# # Do the estimation and plot it name = 'Non-negative components - NMF' print("Extracting the top %d %s..." % (n_components, name)) t0 = time() data = faces W, H = NMF(data, k=n_components) train_time = (time() - t0) print("done in %0.3fs" % train_time) print('components_:', H.shape, '\n**\n', H) plot_gallery('%s - Train time %.1fs' % (name, train_time), H) plt.show()
# Validation split: columns 0 and 1 go through the user / item encoders,
# column 2 is kept as a column vector.
validation_U = usr_encoder.transform(validation[:, 0].reshape(-1, 1))
validation_I = item_encoder.transform(validation[:, 1].reshape(-1, 1))
validation_y = validation[:, 2].reshape(-1, 1)

# Parameters, taken positionally from the command line:
#   argv[1] learning rate, argv[2] batch size, argv[3] iteration count.
args = sys.argv
learning_rate, batch_size, iteration = float(args[1]), int(args[2]), int(args[3])

print('start training , learning_rate: %f, batch_size: %d, iter: %d' % (learning_rate, batch_size, iteration))
print('-------------' + 'satrt time ' + str(datetime.now()) + '--------------')

# Every model variant takes the same three hyper-parameters.
hyper_params = {
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'iteration': iteration,
}
model = NMF(**hyper_params)
# model=MLP(learning_rate=learning_rate,batch_size=batch_size,iteration=iteration)
# model=GMF(learning_rate=learning_rate,batch_size=batch_size,iteration=iteration)
model.fit(train_U, train_I, train_y)
res = model.predict(test_U, test_I)

# Evaluation settings passed to both metrics.
# NOTE(review): presumably m is the candidate-list size per user and k the
# top-k cutoff -- confirm against hating_rate / ndcg.
m, k = 101, 10
hr = hating_rate(res, m, k)
# NOTE: this rebinds the name `ndcg` from the metric function to its result,
# so the function cannot be called again after this line.
ndcg = ndcg(res, m, k)
print('hating rate: %f' % hr)
print('ndcg: %f' % ndcg)

with open('res-nmf.txt', 'w') as w:
    w.write('learning_rate: %f, batch_size: %d, iter: %d\n' % (learning_rate, batch_size, iteration))
# -*- coding: utf-8 -*-
import numpy as np
from NMF import NMF

# Test script; it also documents how the NMF class is meant to be used.

if __name__ == "__main__":
    nmf = NMF()

    # After constructing the object, call setAnalyzData first. It sets
    # k, row and column, so setDictionary does not work without it.
    nmf.setAnalyzData([[1, 2, 3, 4], [2, 3, 4, 5]], k=3)

    # Set the templates here, one per dictionary index.
    templates = ([0.0, 2.0], [1.0, 6.0], [11.0, 10.0])
    for index, template in enumerate(templates):
        nmf.setDictionary(index, template)

    # Start NMF. The algorithm and the number of update iterations are
    # passed here (the algorithm by choosing which separate_* method to call).
    dic, act = nmf.separate_euc_with_template(iter=200)
    # dic,act = nmf.separate_kl_with_template(iter=200)
    # dic,act = nmf.separate_is_with_template(iter=200)