def return_doc_stats(doc_coll):
    """Return the number of documents and the vocabulary size of a collection.

    The file is read as comma-separated text. Every row counts as one
    document, and every cell is expected to look like ``index:count``; only
    the part before the first ``:`` is used.

    Args:
        doc_coll: Path of the collection file to scan.

    Returns:
        A ``(n_docs, vocab_size)`` tuple, where ``n_docs`` is the number of
        rows read and ``vocab_size`` is the largest index seen in any cell
        (``0`` when no index was found).

    Raises:
        ValueError: If a non-empty cell does not start with an integer index.
    """
    n_docs = 0
    vocab_size = 0
    # The csv module asks for newline="" when it is handed a file object.
    with open(doc_coll, newline="") as csv_file:
        for row in csv.reader(csv_file, delimiter=","):
            for cell in row:
                index_text = cell.partition(":")[0]
                if not index_text.strip():
                    # Empty cell (e.g. left by a trailing comma): nothing to
                    # parse, and int("") would raise.
                    continue
                vocab_size = max(vocab_size, int(index_text))
            n_docs += 1

    return n_docs, vocab_size


if __name__ == '__main__':
    # Build the data matrix from the NYT file, run NMF on it, and fetch the
    # sequence returned by get_objective().
    # NOTE(review): the meaning of the positional arguments 25 and 100 is
    # defined by NMF, which is not visible here - confirm before changing.
    data_matrix = create_data_matrix("data/nyt_data.txt")
    objective = NMF(data_matrix, 25, 100).get_objective()

    # Single subplot; the x axis runs from 1 to the number of values.
    ax = plt.figure().add_subplot(1, 1, 1)
    n_values = len(objective)
    iteration_axis = np.linspace(1, n_values, n_values)
    ax.plot(iteration_axis, objective)

    ax.set_xlabel("iteration number")
    ax.set_ylabel("Objective function value")
    ax.set_title("Objective function per iteration")
    plt.show()
Esempio n. 2
0
    k = int(sys.argv[4])  # number of community to detect
    output = sys.argv[5]  # output filename

    metrics = dict()

    # load two matrix
    start = time.time()
    graph = Util.Graph(ufile, cfile)
    metrics["readTime"] = time.time() - start

    # main algorithm
    start = time.time()
    if algorithm == 'bigclam':
        trainComm = bigClam(graph, k)
    elif algorithm == 'nmf':
        trainComm = NMF(graph, k)
    elif algorithm == 'lc':
        trainComm = LC(graph, k)
    elif algorithm == 'cpm':
        trainComm = CPM(graph, k)
    else:
        print("invalid algorithm")
        exit(-1)
    metrics["execTime"] = time.time() - start

    realComm = graph.community
    print(trainComm)
    # evaluation metrics
    metrics["f1score"] = Util.f1score(realComm, trainComm)
    metrics["omgIdx"] = Util.omegaIndex(realComm, trainComm)
    metrics["accuracy"] = Util.accuracy(realComm, trainComm)
Esempio n. 3
0
    graph = Util.Graph(ufile, cfile)
    epsilon = 10**(-8)  # background edge propability in sec. 4
    delta = np.sqrt(epsilon)  # threshold to determine user-community edge
    # delta = np.sqrt(2.0 * graph.m / graph.n / graph.n)
    metrics["algorithm"] = algorithm
    metrics["readTime"] = time.time() - start
    realComm = graph.community
    # average number of communities per user
    avgnum = Util.avgCommNum(realComm)
    print("Average community per user:{}".format(avgnum))
    # main algorithm
    start = time.time()
    if algorithm == 'bigclam':
        trainComm = bigClam(graph, realComm, k, delta)
    elif algorithm == 'nmf':
        trainComm = NMF(graph, k)
    elif algorithm == 'lc':
        trainComm = LC(graph, k)
    elif algorithm == 'cpm':
        trainComm = CPM(graph, k)
    else:
        print("invalid algorithm")
        exit(-1)
    metrics["execTime"] = time.time() - start

    trainComm = {int(k): [int(i) for i in v] for k, v in trainComm.items()}
    print(trainComm)
    # evaluation metrics

    metric_time = time.time()
    metrics["f1score"] = Util.f1score(realComm, trainComm)
Esempio n. 4
0
import pandas as pd
import matplotlib.pyplot as plt
import time

from NMF import NMF
from utils import plot_gallery

# Read the tab-separated file (no header row) and work on the transposed
# array of its values.
table = pd.read_csv('data.txt', sep='\t', header=None)
data = table.values.T
print(data.shape)
t = time.time()

# NOTE(review): the positional arguments (8, 100, 1e-4) are interpreted by
# NMF, which is not visible here - confirm their meaning before changing.
W, H = NMF(data, 8, 100, 1e-4)
print(H.shape, W.shape)

# Time elapsed since `t`; shown in the gallery title.
elapsed = time.time() - t
title = '%s - Train time %.1fs' % ('Non-negative components - NMF', elapsed)
plot_gallery(title, W.T, 4, 2, (64, 64))
plt.show()
Esempio n. 5
0
    for i, comp in enumerate(images):
        plt.subplot(n_row, n_col, i + 1)
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape),
                   cmap=cmap,
                   interpolation='nearest',
                   vmin=-vmax,
                   vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


# #############################################################################
# Plot a sample of the input data

first_faces = faces[:n_components]
plot_gallery("First centered Olivetti faces", first_faces)

# #############################################################################
# Run NMF on the faces, time the call, then print and plot the H factor
name = 'Non-negative components - NMF'
banner = "Extracting the top %d %s..." % (n_components, name)
print(banner)
t0 = time()
data = faces
W, H = NMF(data, k=n_components)
train_time = time() - t0
print("done in %0.3fs" % train_time)

print('components_:', H.shape, '\n**\n', H)
gallery_title = '%s - Train time %.1fs' % (name, train_time)
plot_gallery(gallery_title, H)
plt.show()
Esempio n. 6
0
    validation_U = usr_encoder.transform(validation[:, 0].reshape(-1, 1))
    validation_I = item_encoder.transform(validation[:, 1].reshape(-1, 1))
    validation_y = validation[:, 2].reshape(-1, 1)

    # parameters
    args = sys.argv
    learning_rate = float(args[1])
    batch_size = int(sys.argv[2])
    iteration = int(sys.argv[3])
    print('start training , learning_rate: %f, batch_size: %d, iter: %d' %
          (learning_rate, batch_size, iteration))

    print('-------------' + 'satrt time ' + str(datetime.now()) +
          '--------------')
    model = NMF(learning_rate=learning_rate,
                batch_size=batch_size,
                iteration=iteration)
    # model=MLP(learning_rate=learning_rate,batch_size=batch_size,iteration=iteration)
    # model=GMF(learning_rate=learning_rate,batch_size=batch_size,iteration=iteration)

    model.fit(train_U, train_I, train_y)
    res = model.predict(test_U, test_I)
    m = 101
    k = 10
    hr = hating_rate(res, m, k)
    ndcg = ndcg(res, m, k)
    print('hating rate: %f' % hr)
    print('ndcg: %f' % ndcg)
    with open('res-nmf.txt', 'w') as w:
        w.write('learning_rate: %f, batch_size: %d, iter: %d\n' %
                (learning_rate, batch_size, iteration))
Esempio n. 7
0
# -*- coding: utf-8 -*-

import numpy as np

from NMF import NMF
"""
テスト用のスクリプト
使い方記載
"""
if __name__ == "__main__":

    nmf = NMF()

    # Call setAnalyzData first, right after constructing the object: it sets
    # up k, row and column, and setDictionary does not work without it.
    nmf.setAnalyzData([[1, 2, 3, 4], [2, 3, 4, 5]], k=3)

    # Set the templates, one per dictionary index.
    templates = ([0.0, 2.0], [1.0, 6.0], [11.0, 10.0])
    for index, template in enumerate(templates):
        nmf.setDictionary(index, template)

    # Start NMF. The number of update iterations is passed as an argument;
    # the alternative algorithms are left commented out below.
    dic, act = nmf.separate_euc_with_template(iter=200)
    # dic,act = nmf.separate_kl_with_template(iter=200)
    # dic,act = nmf.separate_is_with_template(iter=200)