Example #1
 def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2,
              num_bases=3, niterH=1, compute_h=True, compute_w=True, sstrategy='rand'):
     NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h, show_progress=show_progress, compute_w=compute_w)
 
     self._niterH = niterH
     self._nsub = nsub
     self.data = data
     self._mfmethod = mfmethod
     self._mapW = mapW
     self._sstrategy = sstrategy
     self._base_sel = base_sel
     
     # assign the subset-selection function that matches the strategy
     if self._sstrategy == 'cur':
         self._subfunc = self.curselect
                 
     elif self._sstrategy == 'kmeans':
         self._subfunc = self.kmeansselect
             
     elif self._sstrategy == 'hull':
         self._subfunc = self.hullselect
         
     elif self._sstrategy == 'laesa':
         self._subfunc = self.laesaselect
         
     elif self._sstrategy == 'sivm':
         self._subfunc = self.sivmselect
         
     else:
         self._subfunc = self.randselect
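The if/elif ladder above just maps a strategy name onto a bound method, falling back to random selection. A dict lookup expresses the same dispatch in one expression; a minimal standalone sketch of the pattern (the selector functions here are stand-ins, not the class's real methods):

def sivmselect():
    return 'sivm-selected bases'

def randselect():
    return 'randomly selected bases'

# known strategy names mapped to their selector functions
selectors = {'sivm': sivmselect}

# unknown names fall back to randselect, mirroring the else branch above
subfunc = selectors.get('rand', randselect)
print(subfunc())  # randomly selected bases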
Example #2
class TestNMF(unittest.TestCase):
    """
    Test the NMF class.
    """
    def setUp(self):
        self.description_csv = pd.read_csv("docs/description.csv")
        self.description_1000_csv = pd.read_csv("docs/description_1000.csv")
        self.dp = DocsPreprocessor()
        self.description_1000 = self.dp.process(self.description_1000_csv)
        self.nmf = NMF(self.description_1000)

    def test_type(self):
        self.assertEqual(type(self.nmf.docs), list)

    """ def test_vectorize(self):
        vect, terms = self.nmf.vectorize()
        self.assertTrue(len(terms) == 2381)
        self.assertEqual((vect.shape[0], vect.shape[1]), (1000, 2381)) """
    """ def test_create_model(self):
        self.nmf.create_model(10) """
    """ def test_run_topic_models(self):
        self.nmf.run_topic_models(10, 30, 10) """
    """ def test_create_word_embedding_model(self):
        w_model = self.nmf.create_word_embedding_model() """

    def test_process_models(self):
        self.nmf.process_models(10, 30, 10, 20)
Example #3
    def run(self):
        """
        :returns: Tuple, such as (<dict of nmis, with algorithm names as keys >)
        """

        nsc = NSpecSparse(self.X, self.k, maxiter=2000)
        nmf = NMF(self.X, self.k)
        km = KMeans(n_clusters=self.k)
        nsckm = NSpecSparseKM(self.X, self.k, maxiter=2000)

        nsc_result = nsc.predict()
        nmf_result = nmf.predict()
        km_result  = km.fit_predict(self.X)
        nsckm_result = nsckm.predict()

        w_nsc = nsc_result.matrices[0].todense()
        w_nmf = nmf_result.matrices[0]
        w_nsckm = nsckm_result.matrices # gets only the labels

        arrays = {
            'nsc': np.array(np.argmax(w_nsc, axis=1))[:,0],
            'nmf': np.array(np.argmax(w_nmf, axis=1)),
            'km': km_result,
            'nsckm': w_nsckm
        }

        nmi = {k: nmiscore(arrays[k], self.y) for k in arrays.keys()}

        return (nmi, arrays)
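nmiscore is not shown in the snippet; presumably it computes normalized mutual information between predicted and true labels. A minimal stand-in using scikit-learn (an assumption, since the snippet's own import is not visible):

import numpy as np
from sklearn.metrics import normalized_mutual_info_score

def nmiscore(labels_pred, labels_true):
    # NMI is symmetric and invariant to label permutations
    return normalized_mutual_info_score(labels_true, labels_pred)

print(nmiscore(np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0])))  # 1.0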
Example #4
    def __init__(self,
                 data,
                 num_bases=0,
                 niter=1,
                 show_progress=False,
                 compW=True,
                 center_mean=True):

        NMF.__init__(self,
                     data,
                     num_bases=num_bases,
                     niter=niter,
                     show_progress=show_progress,
                     compW=compW)

        # center the data around the mean first
        self._center_mean = center_mean

        if self._center_mean:
            # keep a reference to the original data; numpy arrays are passed
            # by reference, so centering below builds a new array instead of
            # modifying the caller's data
            self._data_orig = data
            self._meanv = self._data_orig[:, :].mean(axis=1).reshape(
                data.shape[0], -1)
            self.data = self._data_orig - self._meanv
        else:
            self.data = data
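The centering step subtracts each row's (dimension's) mean, since the data is laid out d x n; a quick numpy check of the same operation:

import numpy as np

data = np.arange(12, dtype=float).reshape(3, 4)        # d=3 dimensions, n=4 samples
meanv = data.mean(axis=1).reshape(data.shape[0], -1)   # (3, 1) column of per-dimension means
centered = data - meanv                                # broadcasts across samples
print(centered.mean(axis=1))                           # ~[0. 0. 0.]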
Example #5
    def __init__(self,
                 data,
                 num_bases=4,
                 niter=100,
                 show_progress=False,
                 compW=True):
        """ Inits Nmf class:
		
		sampleNmf = Nmf(data, num_bases=4, niter=100, show_progress=True, compW=True)
		
		Args:
			data (required)	: d x n data matrix [d - dimension, n -number of samples]
			num_bases	: number of basis vectors for W (default: 4)
			niter		: number of iterations (default: 100)
			show_progress	: (default: True)
			compW		: set to True if W and H should be optimized, set to False
					if only H should be optimized. This is usefull if W is 
					computed somewhere or if new data should be mapped on a
					given set of basis vectors W.
		"""
        # data can be either supplied by conventional numpy arrays or
        # as a numpy array within a pytables table (should be preferred for large data sets)
        NMF.__init__(self,
                     data,
                     num_bases=num_bases,
                     niter=niter,
                     show_progress=show_progress,
                     compW=compW)
Example #6
 def factorize(self, show_progress=False, compute_w=True, compute_h=True,
               compute_err=True, niter=1):
     """ Factorize s.t. WH = data
         
         Parameters
         ----------
         show_progress : bool
                 print some extra information to stdout.
         compute_h : bool
                 iteratively update values for H.
         compute_w : bool
                 iteratively update values for W.
         compute_err : bool
                 compute Frobenius norm |data-WH| after each update and store
                 it to .ferr[k].
         
         Updated Values
         --------------
         .W : updated values for W.
         .H : updated values for H.
         .ferr : Frobenius norm |data-WH|.
     """
     
     # niter is pinned to 1 here: each call performs a single update pass
     NMF.factorize(self, niter=1, show_progress=show_progress,
               compute_w=compute_w, compute_h=compute_h,
               compute_err=compute_err)
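A hedged usage sketch of this interface, assuming a pymf-style NMF class with .W, .H, and .ferr as the docstring describes:

import numpy as np
from pymf import NMF  # assumption: pymf-style NMF, as in the snippets above

data = np.random.rand(10, 50)     # d x n matrix
mdl = NMF(data, num_bases=3)
mdl.factorize(niter=20, compute_err=True)
print(mdl.W.shape, mdl.H.shape)   # (10, 3) (3, 50)
print(mdl.ferr[-1])               # Frobenius error after the last update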
Example #7
 def factorize(self, niter=10, compute_w=True, compute_h=True, 
               show_progress=False, compute_err=True):
     """ Factorize s.t. WH = data
         
         Parameters
         ----------
         niter : int
                 number of iterations.
         show_progress : bool
                 print some extra information to stdout.
         compute_h : bool
                 iteratively update values for H.
         compute_w : bool
                 iteratively update values for W.
         compute_err : bool
                 compute Frobenius norm |data-WH| after each update and store
                 it to .ferr[k].
         
         Updated Values
         --------------
         .W : updated values for W.
         .H : updated values for H.
         .ferr : Frobenius norm |data-WH| for each iteration.
     """       
           
     # init some learning parameters
     self._lamb_W = 1.0/niter
     self._lamb_H = 1.0/niter  
     
     NMF.factorize(self, niter=niter, compute_w=compute_w, 
                   compute_h=compute_h, show_progress=show_progress,
                   compute_err=compute_err)
Example #8
    def __init__(self,
                 data_1,
                 data_2,
                 lambd=0.5,
                 num_bases=4,
                 niter=100,
                 show_progress=False,
                 compH=True,
                 compW=True):

        # generate a new data set data using a weighted
        # combination of data_1 and data_2

        self._data_1 = data_1
        self._data_2 = data_2
        self._lambd = lambd

        data = np.concatenate(
            (lambd * self._data_1, (1.0 - lambd) * self._data_2), axis=0)
        NMF.__init__(self,
                     data,
                     num_bases=num_bases,
                     niter=niter,
                     show_progress=show_progress,
                     compW=compW)
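The constructor blends the two views by stacking them row-wise with weights lambd and 1 - lambd; a plain-numpy shape check:

import numpy as np

data_1 = np.random.rand(5, 100)   # view 1: 5 features x 100 samples
data_2 = np.random.rand(8, 100)   # view 2: 8 features, same 100 samples
lambd = 0.5
data = np.concatenate((lambd * data_1, (1.0 - lambd) * data_2), axis=0)
print(data.shape)                 # (13, 100): the views share the sample axis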
Example #9
def run_model(vector, features, k, max_iter):
    model = NMF(k, max_iter)
    W, H = model.fit_transform(vector)
    # print('Cost: ', model.cost(vector))
    cw = common_words(H, features, num_words=10)
    print('{} topics with {} iterations'.format(k, max_iter))
    print_topics(cw)
    return vector, features
Example #10
def main():
    origin, mask = load_data(4, 5, 0.2)
    dm = origin * mask
    mask1 = dm / origin  # recovers the mask where origin is nonzero
    print(mask1)
    print(mask)
    nmf = NMF()
    rec = nmf.predict(dm, mask)
    print(rec - dm)
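load_data is not shown; presumably it returns a complete nonnegative matrix together with a binary observation mask, so dm = origin * mask keeps only the observed entries. A hypothetical stand-in illustrating that setup (load_data's real behavior is an assumption):

import numpy as np

rng = np.random.default_rng(0)
origin = rng.random((4, 5)) + 0.1                # complete nonnegative matrix
mask = (rng.random((4, 5)) > 0.2).astype(float)  # roughly 20% of entries hidden
dm = origin * mask                               # observed entries; hidden ones are zero
print(dm)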
Example #11
    def __init__(self,
                 data,
                 num_bases=4,
                 niter=10,
                 show_progress=False,
                 compW=True):

        NMF.__init__(self,
                     data,
                     num_bases=num_bases,
                     niter=niter,
                     show_progress=show_progress,
                     compW=compW)
Example #12
 def __init__(self,
              data,
              num_bases=4,
              niter=100,
              show_progress=False,
              compW=True):
     # data can be either supplied by conventional numpy arrays or
     # as a numpy array within a pytables table (should be preferred for large data sets)
     NMF.__init__(self,
                  data,
                  num_bases=num_bases,
                  niter=niter,
                  show_progress=show_progress,
                  compW=compW)
Example #13
    def __init__(self,
                 data,
                 num_bases=4,
                 niter=100,
                 show_progress=False,
                 compW=True):

        # call inherited method
        NMF.__init__(self,
                     data,
                     num_bases=num_bases,
                     niter=niter,
                     show_progress=show_progress,
                     compW=compW)
Example #14
    def factorize(self,
                  niter=1,
                  show_progress=False,
                  compute_w=True,
                  compute_h=True,
                  compute_err=True):

        # enforce certain default values, otherwise it won't work
        NMF.factorize(self,
                      niter=1,
                      show_progress=show_progress,
                      compute_w=True,
                      compute_h=True,
                      compute_err=compute_err)
Example #15
def process_one_category(data_path):
    bird_category = int(data_path.split('/')[-1].split('.')[0])
    filenames = os.listdir(data_path)
    out_dir = 'output/bird_{0:03d}'.format(bird_category)
    os.mkdir(out_dir)

    # load images
    raw_images = [plt.imread(os.path.join(data_path, filename)) for filename in filenames]
    for i in range(len(raw_images)):
        img = raw_images[i]
        if np.array(img).shape[-1] > 3:
            raw_images[i] = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)  # drop the alpha channel
        cv2.imwrite(os.path.join(out_dir, 'raw_{0:03d}_{1}.png'.format(bird_category, i)), img)
    raw_images = [imresize(img, 224, 224) for img in raw_images]  # resize
    raw_images = np.stack(raw_images)

    # preprocess
    images = raw_images.transpose((0, 3, 1, 2)).astype('float32')  # to numpy, NxCxHxW, float32
    images -= np.array([0.485, 0.456, 0.406]).reshape((1, 3, 1, 1))  # zero mean
    images /= np.array([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1))  # unit variance

    images = torch.from_numpy(images)  # convert to pytorch tensor
    if cuda:
        images = images.cuda()

    net = models.vgg19(pretrained=True)  # load pre-trained VGG-19
    if cuda:
        net = net.cuda()
    del net.features._modules['36']  # remove max-pooling after final conv layer

    with torch.no_grad():
        features = net.features(images)
        flat_features = features.permute(0, 2, 3, 1).contiguous().view((-1, features.size(1)))  # NxCxHxW -> (N*H*W)xC

    print('Reshaped features from {0}x{1}x{2}x{3} to ({0}*{2}*{3})x{1} = {4}x{1}'.format(*features.shape,
                                                                                         flat_features.size(0)))

    for K in [15]:
        with torch.no_grad():
            W, _ = NMF(flat_features, K, random_seed=0, cuda=cuda, max_iter=50)

        heatmaps = W.cpu().view(features.size(0), features.size(2), features.size(3), K).permute(0, 3, 1, 2)
        # (N*H*W)xK -> NxKxHxW
        heatmaps = torch.nn.functional.interpolate(heatmaps, size=(224, 224), mode='bilinear', align_corners=False)
        # 14x14 -> 224x224
        heatmaps /= heatmaps.max(dim=3, keepdim=True)[0].max(dim=2, keepdim=True)[0]
        # normalize by factor (i.e., 1 of K)
        heatmaps = heatmaps.cpu().numpy()
        # print(heatmaps.shape) # (60, K, 224, 224)
        save_mask2d(heatmaps, K, out_dir)
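The only subtle step above is the shape bookkeeping: N x C x H x W feature maps are flattened to (N*H*W) x C so that NMF factors the channel dimension into K parts, and the resulting activation matrix folds back into per-part heatmaps. A pure-numpy sketch of that round trip:

import numpy as np

N, C, H, W_dim, K = 2, 512, 14, 14, 15
features = np.random.rand(N, C, H, W_dim)

# NxCxHxW -> (N*H*W)xC, matching the permute/view in the snippet
flat = features.transpose(0, 2, 3, 1).reshape(-1, C)
print(flat.shape)                 # (392, 512)

# stand-in for the NMF output W of shape (N*H*W) x K
W_act = np.random.rand(flat.shape[0], K)
heatmaps = W_act.reshape(N, H, W_dim, K).transpose(0, 3, 1, 2)
print(heatmaps.shape)             # (2, 15, 14, 14)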
Example #16
    def __init__(self, data, num_bases=0, center_mean=True,  **kwargs):

        NMF.__init__(self, data, num_bases=num_bases)
        
        # center the data around the mean first
        self._center_mean = center_mean            

        if self._center_mean:
            # copy the data before centering it
            self._data_orig = data            
            self._meanv = self._data_orig[:,:].mean(axis=1).reshape(data.shape[0],-1)                
            self.data = self._data_orig -  self._meanv
        else:
            self.data = data
Example #17
    def __init__(self, data, num_bases=0, center_mean=True):

        NMF.__init__(self, data, num_bases=num_bases)
        
        # center the data around the mean first
        self._center_mean = center_mean            

        if self._center_mean:
            # copy the data before centering it
            self._data_orig = data            
            self._meanv = self._data_orig[:,:].mean(axis=1).reshape(data.shape[0],-1)                
            self.data = self._data_orig -  self._meanv
        else:
            self.data = data
Example #18
 def update_h(self):
     print(self._method)
     if self._method == 'pca':
         self.H = np.dot(pinv(self.W), self.data)

     elif self._method == 'nmf':
         mdl = NMF(self.data, num_bases=self._num_bases)
         mdl.W = self.W
         mdl.factorize(compute_w=False, niter=50)
         self.H = mdl.H.copy()

     elif self._method == 'aa':
         mdl = AA(self.data, num_bases=self._num_bases)
         mdl.W = self.W
         mdl.factorize(compute_w=False)
         self.H = mdl.H.copy()
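For the 'pca' branch, H = pinv(W) . data is the least-squares solution of W H = data; a quick numpy check that it recovers the coefficients exactly when the data lies in the span of W:

import numpy as np
from numpy.linalg import pinv

rng = np.random.default_rng(1)
W = rng.random((10, 3))           # basis with full column rank (almost surely)
H_true = rng.random((3, 20))
data = W @ H_true

H = pinv(W) @ data                # least-squares coefficients, as in the 'pca' branch
print(np.allclose(H, H_true))     # True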
Example #19
    def __init__(self,
                 data,
                 num_bases=4,
                 niter=100,
                 show_progress=False,
                 compW=True):
        # data can be either supplied by conventional numpy arrays or
        # as a numpy array within a pytables table (should be preferred for large data sets)

        NMF.__init__(self,
                     data,
                     num_bases=num_bases,
                     niter=niter,
                     show_progress=show_progress,
                     compW=compW)

        # controls how fast lambda increases, which drives convergence to
        # binary values during the updates. A value < 1 yields non-binary
        # decompositions, since the update rule is then effectively a
        # conventional NMF update; values > 1 put more weight on pushing the
        # factorization toward binary values as the iterations proceed.
        # Setting either increase factor to 0 leaves the corresponding matrix
        # non-binary.
        self._lamb_increase_W = 1.1
        self._lamb_increase_H = 1.1
Example #20
 def factorize(self, niter=1, show_progress=False, 
               compute_w=True, compute_h=True, compute_err=True):
                   
     # enforce certain default values, otherwise it won't work
     NMF.factorize(self, niter=1, show_progress=show_progress, 
               compute_w=True, compute_h=True, compute_err=compute_err)
Example #21
 def generate_model():
     return NMF(F, K, b=args.b_div, m=args.sparsity_weight,
                robust_normalization=True, tol=args.tol, dtype=dtype,
                device=args.device, keep_history=True)
Example #22
    df = pd.read_csv('./temp/ml-100k/u.data',
                     sep='\t',
                     header=None,
                     usecols=[0, 1, 2],
                     names=['userid', 'itemid', 'rating'])
    R = pd.pivot_table(df,
                       values='rating',
                       index=['userid'],
                       columns=['itemid'])
    R.fillna(0, inplace=True)

    ans1 = R[2][1]
    R[2][1] = 0
    ans2 = R[200][940]
    R[200][940] = 0
    ans3 = R[900][931]
    R[900][931] = 0

    nmf = NMF(R.shape[0], R.shape[1])

    nmf.sess.run(tf.global_variables_initializer())
    for step in range(50000):
        _, loss, R_pred = nmf.sess.run([nmf.train_op, nmf.loss, nmf.R_pred], {
            nmf.R: R.values,
            nmf.lr: 0.001
        })
        if step % 100 == 0:
            print("[%d] loss: %.4f | " % (step, loss), end='')
            print(ans1, ':', R_pred[2][1], '|', ans2, ':', R_pred[200][940],
                  '|', ans3, ':', R_pred[900][931])
Example #23
    k = 5                       # Number of recommendations
    rank = 5                    # Rank (dimension) of the model
    eta = 1e-2                  # Learning rate
    max_iter = 500              # Number of iterations
    compute_error = True        # Track the reconstruction error
    symmetric = True            # Symmetric data: yes
    min_prior_connections = 1   # Minimum number of existing links for a node to be recommended
    alpha = 0.01                # Regularization penalty against overfitting



naslovi = csv.DictReader(open(datapath, encoding="utf-8"), delimiter=",").fieldnames  # column titles

print("Fitting the model to the data ...")
model = NMF(compute_error=compute_error, rank=rank, eta=eta, max_iter=max_iter,
            symmetric=symmetric, alpha=alpha)
model.fit(X)
Xp = model.predict_all()
print("Učenje koncano!")

Y = X.copy()
f, axes = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
axes[0].set_xlabel("Vozlisce")
axes[0].set_ylabel("Vozlisce")
axes[1].set_xlabel("Vozlisce")
axes[0].pcolor(Y, cmap='Oranges', vmin=0, vmax=X.max())
axes[0].set_title("Podatki")

axes[1].pcolor(Xp, cmap='Oranges', vmin=0, vmax=X.max())
axes[1].set_title("Model")
axes[1].set_xlim(0, X.shape[1])
Example #24
def train():

    # ----------- Load data -----------
    # load the preprocessed features (data_dict avoids shadowing the builtin dict)
    data_dict = cPickle.load(open('pre_load.p', 'rb'))
    tr_X, tr_y, tr_na_list = data_dict['tr_X'], data_dict['tr_y'], data_dict['tr_na_list']
    te_X, te_y, te_na_list = data_dict['te_X'], data_dict['te_y'], data_dict['te_na_list']

    tr_positive = np.take(tr_X, np.where(tr_y == 1)[0])
    tr_positive = [t / np.max(t) for t in tr_positive]
    tr_positive = [
        librosa.feature.stack_memory(t.transpose(), n_steps=sh_order)
        for t in tr_positive
    ]
    tr_negative = np.take(tr_X, np.where(tr_y == 0)[0])
    tr_negative = [t / np.max(t) for t in tr_negative]
    tr_negative = [
        librosa.feature.stack_memory(t.transpose(), n_steps=sh_order)
        for t in tr_negative
    ]

    # # # ----------- Do training: separate bases for each file -----------
    # nmf_model=NMF(rank_p, norm_W=1,  iterations=500, update_func = "kl", verbose=True)
    # W_positive=[]
    # for f in tr_positive:
    #     [W,H,error]=nmf_model.process(f.transpose())
    #     W_positive.append(W)
    #
    # nmf_model=NMF(rank_p, norm_W=1,  iterations=500, update_func = "kl", verbose=True)
    # W_negative=[]
    # for f in tr_negative:
    #     [W,H,error]=nmf_model.process(f.transpose())
    #     W_negative.append(W)

    tr_positive = np.hstack(tr_positive)
    tr_negative = np.hstack(tr_negative)
    train_data = np.hstack((tr_positive, tr_negative))
    print >> sys.stderr, train_data.shape
    #
    # # # ----------- Do training overcomplete dictionary -----------
    # p = decomposition.PCA(whiten=True, n_components= 0.99)
    # pca_data=p.fit_transform(train_data)

    #   #   num=500
    # num_dim=pca_data.shape[1]
    # num_training_samples=pca_data.shape[0]
    # km = spherical_kmeans.OSKmeans(num,num_dim)
    # print "Learning k-means: "+ str(num)
    # for _ in range(1000):
    #    print _
    #    for index in range(num_training_samples):
    #        km.update(pca_data[index,:])
    # codebook=km.centroids
    # cPickle.dump( [codebook, p], open( W_name, 'wb' ), protocol=cPickle.HIGHEST_PROTOCOL )

    # # # ----------- Do training -----------
    if type == '0_1':
        print >> sys.stderr, "NMF on positive examples"
        nmf_model = NMF(rank_p,
                        norm_W=1,
                        iterations=200,
                        update_func="kls",
                        verbose=False)
        [W_positive, H, error] = nmf_model.process(tr_positive, lam=lam)
        # # # a_H=np.ones(rank_n+rank_p)
        # # # b_H=np.ones(rank_n+rank_p)
        # # # [error, W_positive, H_gap] = gap_vbem(tr_positive, rank_n+rank_p, a_H, b_H, iterations=100, verbose=True)
        # #
        print >> sys.stderr, "NMF on negative examples"
        nmf_model = NMF(rank_n,
                        norm_W=1,
                        iterations=200,
                        update_func="kls",
                        verbose=False)
        [W_negative, H, error] = nmf_model.process(tr_negative, lam=lam)
        # # # [error, W_negative, H_gap] = gap_vbem(tr_negative, rank_n+rank_p, a_H, b_H, iterations=100, verbose=True)
        cPickle.dump([W_positive, W_negative],
                     open(W_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    elif type == '01':
        # # -------- Train with masking ----------
        print >> sys.stderr, "masked NMF on training files"
        mask = np.zeros((rank_p, tr_negative.shape[1]))
        V = np.hstack((tr_negative, tr_positive))
        H0 = np.random.rand(rank_n + rank_p, V.shape[1]) + eps
        H0[-mask.shape[0]:, :mask.shape[1]] = mask
        nmf_model = NMF(rank_n + rank_p,
                        norm_W=1,
                        iterations=200,
                        update_func="kls",
                        verbose=False)
        [W, H, error] = nmf_model.process(V, H0=H0, lam=lam)
        print >> sys.stderr, error
        # # a_H=np.ones(rank_n+rankwork/bird_backup/W/W_mel_01_kl_50p_50_9folds.n.p_p)
        # # b_H=np.ones(rank_n+rank_p)
        # # [error, W_gap, H_gap] = gap_vbem(V, rank_n+rank_p, a_H, b_H, H0, iterations=100, verbose=False)
        #
        cPickle.dump(W, open(W_name, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    else:
        raise ValueError('Dictionary type not recognized')

    print >> sys.stderr, "Dictionary " + W_name + " finished!"
Example #25
    def fit(self, X, y=None, features=None):
        """
        Constructs DAG according to `self.dag_method` and learns
        coexpression modules across multiple resolutions.        

        Parameters
        ----------
        X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
            Matrix with rows corresponding to all of the samples that
            define the DAG and columns corresponding to features that
            define the correlation matrices.
        y
            Ignored
        features: `numpy.ndarray` of `str`
            A list of strings with feature labels.
        """
        super(DecomposeDAG, self).fit(X, y, features)

        n_samples, n_features = X.shape

        if self.verbose:
            print('Stacking...')
            sys.stdout.flush()
        X_multi = self.multiresolution_stack(X)

        if self.verbose:
            print('Decomposing...')
            sys.stdout.flush()

        if self.decomp_method == 'nmf':
            #from sklearn.decomposition import NMF
            from nmf import NMF
            decomp = NMF(
                n_components=self.n_components,
                init=None,
                solver='cd',
                beta_loss='frobenius',
                alpha=1e-3,
                l1_ratio=1,
                random_state=69,
                tol=1e-2,
                verbose=self.verbose,
            ).fit(X_multi)
            components = decomp.components_

        elif self.decomp_method == 'lda':
            from sklearn.decomposition import (LatentDirichletAllocation as
                                               LDA)
            decomp = LDA(
                n_components=self.n_components,
                learning_method='online',
                max_iter=20,
                mean_change_tol=1e-2,
                n_jobs=self.n_jobs,
                random_state=69,
                verbose=self.verbose,
            ).fit(X_multi)
            components = decomp.components_

        elif self.decomp_method == 'hdp':
            from bnp.online_hdp import (HierarchicalDirichletProcess as HDP)
            hdp = HDP(
                n_topic_truncate=self.n_components,
                n_doc_truncate=10000,
                learning_method='online',
                n_jobs=self.n_jobs,
                random_state=69,
                verbose=self.verbose,
            ).fit(X_multi)
            components = hdp.lambda_

        else:
            raise ValueError('Invalid decomposition method {}'.format(
                self.decomp_method))

        n_components = components.shape[0]
        self.cluster_components = np.reshape(
            components, (n_components, n_features, len(self.nodes)))

        cc = np.sum(self.cluster_components, axis=1)
        cc /= cc.max()
        assert (cc.shape == (n_components, len(self.nodes)))

        for node_idx, node in enumerate(self.nodes):
            node.viz_value = list(cc[:, node_idx])

        return self
Example #26
 def __init__(self, data, k=-1, num_bases=4):
     # call inherited method
     NMF.__init__(self, data, num_bases=num_bases)
     self._k = k
     if self._k == -1:
         self._k = num_bases
Example #27
    def run_nmf(self, tr_positive, tr_negative):
        '''Extract a dictionary via NMF using the method chosen in the config file.
        Args:
           tr_positive: a numpy array containing all the positive examples
           tr_negative: a numpy array containing all the negative examples
        Returns:
            W, the learned dictionary; it is also saved to self.W_name
        '''
        print(tr_positive.shape)
        if self.type == '0_1':

            print("NMF on positive examples")
            nmf_model = NMF(self.rank_1,
                            norm_W=1,
                            iterations=self.iterations,
                            update_func=self.update_func,
                            verbose=True)
            [W_positive, H, error] = nmf_model.process(tr_positive)

            print("NMF on negative examples")
            nmf_model = NMF(self.rank_0,
                            norm_W=1,
                            iterations=self.iterations,
                            update_func=self.update_func,
                            verbose=True)
            [W_negative, H, error] = nmf_model.process(tr_negative)

            print("Saved dictionary to " + self.W_name)
            W = np.hstack((W_positive, W_negative))
            cPickle.dump([W_positive, W_negative],
                         open(self.W_name, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)

        elif self.type == 'unsupervised':

            print("Unsupervised NMF")
            V = np.hstack((tr_negative, tr_positive))
            nmf_model = NMF(self.rank_0 + self.rank_1,
                            norm_W=1,
                            iterations=self.iterations,
                            update_func=self.update_func,
                            verbose=True)
            [W, H, error] = nmf_model.process(V)

            print("Saved dictionary to " + self.W_name)
            cPickle.dump(W,
                         open(self.W_name, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)

        elif self.type == '01':
            # # -------- Train with masking ----------
            print("Masked NMF on training files")
            V = np.hstack((tr_negative, tr_positive))

            mask = np.zeros((self.rank_1, tr_negative.shape[1]))
            H0 = np.random.rand(self.rank_0 + self.rank_1, V.shape[1]) + eps
            H0[-mask.shape[0]:, :mask.shape[1]] = mask

            nmf_model = NMF(self.rank_0 + self.rank_1,
                            norm_W=1,
                            iterations=self.iterations,
                            update_func=self.update_func,
                            verbose=True)
            [W, H, error] = nmf_model.process(V, H0=H0)

            print("Saved dictionary to " + self.W_name)
            cPickle.dump(W,
                         open(self.W_name, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)

        elif self.type == '01_orth':
            # # -------- Train with masking ----------
            print("masked NMF on training files")
            V = np.hstack((tr_negative, tr_positive))

            mask = np.zeros((self.rank_1, tr_negative.shape[1]))
            H0 = np.random.rand(self.rank_0 + self.rank_1, V.shape[1]) + eps
            H0[-mask.shape[0]:, :mask.shape[1]] = mask

            nmf_model = NMF(self.rank_0 + self.rank_1,
                            norm_W=1,
                            rankW0=self.rank_0,
                            rankW1=self.rank_1,
                            len_V0=tr_negative.shape[1],
                            iterations=self.iterations,
                            update_func=self.update_func,
                            verbose=False)
            print(self.lam_orth)
            [W, H, error] = nmf_model.process(V, H0=H0, lam_orth=self.lam_orth)

            print("Saved dictionary to " + self.W_name)
            cPickle.dump(W,
                         open(self.W_name, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)
        else:
            raise ValueError('Dictionary type not recognized')

        return W
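The masking trick in the '01' branches pins the last rank_1 activation rows to zero over the negative-example columns, so those bases can only ever explain positive frames (multiplicative NMF updates preserve exact zeros). A small numpy sketch of the H0 layout; eps is assumed to be a small positive module-level constant, as in the snippet:

import numpy as np

rank_0, rank_1 = 3, 2
n_neg, n_pos = 4, 5
eps = 1e-8  # assumed small positive constant

mask = np.zeros((rank_1, n_neg))
H0 = np.random.rand(rank_0 + rank_1, n_neg + n_pos) + eps
H0[-mask.shape[0]:, :mask.shape[1]] = mask  # zero the positive bases on negative columns
print(H0[-rank_1:, :n_neg])                 # all zeros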
Example #28
    def solve(self):
        '''
        Main entry point for clustering. Several methods can be used:
            1. kmeans or kmeans++
            2. NMF - nonnegative matrix factorization
            3. ONMF - orthogonality-constrained nonnegative matrix factorization
        '''
        if self.method_name in {
                'kmeans', 'kmeans++', 'kmod', 'msd-km', 'nmf', 'dtpp', 'hals',
                'onmf-stf', 'onpmf', 'sncp1c', 'sncp2c', 'sncp4c'
        }:
            cls_assign = None
            time_used = 0
            if self.method_name == 'kmeans':
                W, H = self.data_manager.gen_inits_WH(init='random',
                                                      seed=self.seed_num,
                                                      H_ortho=True)
                initial_centroids = np.asarray(W.transpose())
                start_time = time.time()
                kmeans = KMeans(self.data_manager.get_data_mat(), self.cls_num,
                                self.seed_num)
                print('initial shape')
                print(initial_centroids.shape)
                cls_assign, _ = kmeans.solve(initial_centroids,
                                             self.data_manager, self.res_dir)
                end_time = time.time()
                time_used = end_time - start_time
            elif self.method_name == 'kmeans++':
                start_time = time.time()
                kmeans = KMeans(self.data_manager.get_data_mat(), self.cls_num,
                                self.seed_num)
                initial_centroids = kmeans.create_centroids_by_kpp()
                cls_assign, _ = kmeans.solve(initial_centroids)
                end_time = time.time()
                time_used = end_time - start_time
            elif self.method_name == 'nmf':
                # Before nmf, we should check the validity of input data
                if self.data_manager.contain_zero_rows():
                    raise ValueError(
                        'Error: the data matrix contains zero rows!')
                nmf = NMF(self.data_manager, self.res_dir, self.cls_num,
                          self.seed_num)
                cls_assign, time_used = nmf.solve()

            elif self.method_name in {
                    'dtpp', 'hals', 'onmf-stf', 'onpmf', 'sncp1c', 'sncp2c',
                    'sncp4c'
            }:
                #if self.data_manager.contain_zero_rows():
                #    raise ValueError('Error: the data matrix has negative values')
                nu = 1e-10
                mul = 0
                onmf = ONMF(self.data_manager, self.res_dir, self.cls_num,
                            self.seed_num, mul, nu)
                cls_assign, time_used, (W, H) = onmf.solve(self.method_name)

            # if the dataset is '2d#X', we need to draw a figure to show the clustering
            # result
            if self.data_name.startswith('2d'):
                dat_path = os.path.join(root_dir, 'results', self.method_name,
                                        self.data_name,
                                        'res' + str(self.seed_num) + '.pdf')
                # get the result directory where the result is stored
                self.data_manager.visualize_data(partition_idx=cls_assign,
                                                 dat_path=dat_path,
                                                 data_points=np.asarray(
                                                     W.transpose()))

            # self.data_manager.visualize_data(partition_idx=cls_assign, dat_path=dat_path)
            # save clustering performance
            true_labels = self.data_manager.get_labels()
            print(true_labels.shape)
            temp_dict = collections.OrderedDict()
            temp_dict['seed'] = self.seed_num
            temp_dict['time'] = time_used
            temp_dict['Purity'] = calculate_purity(cls_assign, true_labels)
            temp_dict['ARI'] = calculate_rand_index(cls_assign, true_labels)
            temp_dict['ACC'] = calculate_accuracy(cls_assign, true_labels)
            temp_dict['NMI'] = calculate_NMI(cls_assign, true_labels)

            return temp_dict

        elif self.method_name in {'sncp', 'sncp1', 'sncp2', 'sncp3'}:

            for nu in {1e-10}:
                for mul in {0}:
                    onmf = ONMF(self.data_manager, self.res_dir, self.cls_num,
                                self.seed_num, mul, nu)
                    #onmf = ONMF(self.data_manager.get_data_mat(), self.res_dir, 20, self.SNR, self.seed_num)
                    cls_assign, time_used, (W,
                                            H) = onmf.solve(self.method_name)

                    if self.data_name.startswith('2d'):
                        dat_path = os.path.join(
                            root_dir, 'results', self.method_name,
                            self.data_name,
                            'res' + str(self.seed_num) + '.pdf')
                        self.data_manager.visualize_data(
                            partition_idx=cls_assign,
                            dat_path=dat_path,
                            data_points=np.asarray(W.transpose()))

        elif self.method_name == 'visualize_data':
            self.data_manager.visualize_data()

        else:
            raise ValueError('Error: no other methods are supported now!')
Example #29
# encoding: utf-8
'''
Created on 2016-11-15

@author: alibaba
'''
import numpy as np
#from basenmf import BaseNMF, NMFResult
#from projective import ProjectiveNMF
from nmf import NMF

X = np.array([(1, 2, 3, 4, 5, 6), (4, 5, 6, 7, 8, 9), (1, 3, 5, 4, 2, 6)]) / 10.0
nmf = NMF(X, 2, maxiter=100)
nmf_result = nmf.predict()
w_nmf = nmf_result.matrices[0]
print(w_nmf)
Example #30
from nmf import NMF
from projective import ProjectiveNMF
from scipy import misc  # misc.lena() exists only in older SciPy; misc.face() is the modern stand-in
import pylab as pl
import numpy as np

#%% --
# 1. Lena

# train
lena = misc.lena()

result_pnmf = ProjectiveNMF(lena, 75).predict()
w = result_pnmf.matrices[0]
lena_hat_pnmf = w * w.T * lena

result_nmf = NMF(lena, 75, objective="kl").predict()
lena_hat_nmf = np.dot(result_nmf.matrices[0], result_nmf.matrices[1])

#%% show results

pl.figure(1)
pl.subplot(131)
pl.title("Original")
pl.imshow(lena, cmap="gray")

pl.subplot(132)
pl.title("NMF")
pl.imshow(lena_hat_nmf, cmap="gray")

pl.subplot(133)
pl.title("PNMF")
pl.imshow(lena_hat_pnmf, cmap="gray")

pl.show()
Example #31
 def __init__(self, data, num_bases=4, lamb=2.0):
     # call inherited method
     NMF.__init__(self, data, num_bases=num_bases)
     self._lamb = lamb