Ejemplo n.º 1
0
    def fit(self, X):
        """Fit the mixture parameters to ``X`` by alternating E- and M-steps.

        Initialization uses uniform weights, ``n_clusters`` distinct rows of
        ``X`` drawn at random as the means, and identity covariances. The loop
        stops as soon as the log-likelihood changes by less than ``self.tol``
        between two consecutive iterations, or after ``self.max_iter``
        iterations.

        The final parameters are always stored in ``cluster_centers_``,
        ``labels_``, ``covars_`` and ``w_``, including when the tolerance was
        never reached. In that case ``self.convergence`` is additionally set
        to ``-1``.

        Parameters
        ----------
        X : array indexed as ``X[rows, :]``
            ``X.shape[0]`` is the number of objects and ``X.shape[1]`` the
            number of features.
        """
        n_objects = X.shape[0]
        n_features = X.shape[1]

        # One identity covariance matrix per cluster, shape (k, d, d).
        sigma = np.tile(np.eye(n_features), (self.n_clusters, 1, 1))
        w = np.tile(1.0 / self.n_clusters, self.n_clusters)

        # Distinct random rows of X serve as the initial means.
        centers_idx = np.random.choice(n_objects, size=self.n_clusters, replace=False)
        mu = X[centers_idx, :]

        ll_prev = None  # no previous value to compare against on the first pass
        for _ in range(self.max_iter):
            ll_new = log_likelihood(X, w, mu, sigma)
            # Log the likelihood of the parameters being logged, not the
            # value left over from the previous iteration.
            self.save_logs(compute_labels(X, mu), w, mu, sigma, ll_new)

            if ll_prev is not None and abs(ll_new - ll_prev) < self.tol:
                break

            gamma = self.estep(X, w, mu, sigma)
            w, mu, sigma = self.mstep(X, gamma)
            ll_prev = ll_new
        else:
            # Iteration budget exhausted without meeting the tolerance.
            self.convergence = -1

        # Publish the result whether or not the tolerance was met, so the
        # estimator is usable after a non-converged run as well.
        self.cluster_centers_ = mu.copy()
        self.labels_ = compute_labels(X, mu)
        self.covars_ = sigma.copy()
        self.w_ = w.copy()
Ejemplo n.º 2
0
    def fit(self, X):
        """Fit the mixture parameters to ``X`` by alternating E- and M-steps.

        Initializes ``self.w_`` to uniform weights, ``self.cluster_centers_``
        to ``n_clusters`` distinct random rows of ``X`` and ``self.covars_``
        to identity matrices. It then repeats ``e_step`` / ``m_step`` until
        the log-likelihood changes by less than ``self.tol`` between two
        consecutive iterations or ``self.max_iter`` iterations have run.
        ``self.labels_`` is computed from the final centers.

        When ``self.logging`` is truthy, a snapshot of the log-likelihood,
        labels, weights, means and covariances is appended to ``self.logs``
        at the start of every iteration.

        Parameters
        ----------
        X : 2-D array
            Rows are objects, columns are features.
        """
        n_objects, n_features = X.shape

        # One identity covariance matrix per cluster, shape (k, d, d).
        self.covars_ = np.tile(np.eye(n_features), (self.n_clusters, 1, 1))
        self.w_ = np.tile(1.0 / self.n_clusters, self.n_clusters)

        # Distinct random rows of X serve as the initial means.
        centers_idx = np.random.choice(n_objects, size=self.n_clusters, replace=False)
        self.cluster_centers_ = X[centers_idx, :]

        self.ll = log_likelihood(X, self.w_, self.cluster_centers_, self.covars_)

        for i in range(self.max_iter):
            # Evaluate once per iteration and reuse for logging and the test.
            ll_new = log_likelihood(X, self.w_, self.cluster_centers_, self.covars_)

            if self.logging:
                self.logs['log_likelihood'].append(ll_new)
                self.logs['labels'].append(compute_labels(X, self.cluster_centers_))
                # Store copies: if m_step updates these arrays in place,
                # logged references would all end up showing the final state.
                self.logs['w'].append(self.w_.copy())
                self.logs['mu'].append(self.cluster_centers_.copy())
                self.logs['sigma'].append(self.covars_.copy())

            # The first pass has no previous likelihood to compare against.
            if i > 0 and abs(ll_new - self.ll) < self.tol:
                break

            g = self.e_step(X)
            self.m_step(X, g)
            self.ll = ll_new

        self.labels_ = compute_labels(X, self.cluster_centers_)
Ejemplo n.º 3
0
 def fit(self, X):
     """Keep the best of ``self.n_init`` random center initializations.

     Each trial draws ``self.n_clusters`` distinct rows of ``X`` as
     candidate centers, labels ``X`` with ``compute_labels`` and scores the
     labelling with ``log_likelihood_from_labels``. The candidate with the
     strictly highest score is stored in ``self.cluster_centers_`` and
     ``self.labels_``; on ties the earlier trial is kept.
     """
     sample_count = X.shape[0]
     top_score = -np.inf

     for _ in range(self.n_init):
         chosen_rows = np.random.choice(
             sample_count, size=self.n_clusters, replace=False
         )
         candidate = X[chosen_rows, :]
         assignment = compute_labels(X, candidate)
         score = log_likelihood_from_labels(X, assignment)

         # Written as "not greater" so a NaN score is rejected, exactly as a
         # plain "greater than" acceptance test would reject it.
         if not score > top_score:
             continue

         top_score = score
         self.cluster_centers_ = candidate.copy()
         self.labels_ = assignment
Ejemplo n.º 4
0
            docvecs_fname = "dataset/processed_data/docvecs.csv"
            docvecs.to_csv(docvecs_fname, index=False, header=False)

        end = int(round(time.time() * 1000))
        print("Process input done! - Elapsed time: %d" % (end - begin))

    # load pre-processed data
    begin = int(round(time.time() * 1000))
    print("Loading pre-processed data ...")
    train_tfidfvecs = utils.load_vecs(
        "dataset/processed_data/train/tfidfvecs.csv")
    test_tfidfvecs = utils.load_vecs(
        "dataset/processed_data/test/tfidfvecs.csv")
    train_docvecs = utils.load_vecs("dataset/processed_data/train/docvecs.csv")
    test_docvecs = utils.load_vecs("dataset/processed_data/test/docvecs.csv")
    train_labels = utils.compute_labels("dataset/textdata/train/data.csv")
    test_labels = utils.compute_labels("dataset/textdata/test/data.csv")
    train_dataset = {
        'inputs': [train_tfidfvecs, train_docvecs],
        'outputs': [train_labels]
    }
    test_dataset = {
        'inputs': [test_tfidfvecs, test_docvecs],
        'outputs': [test_labels]
    }
    end = int(round(time.time() * 1000))
    print("Compute labels done! - Elapsed time: %d" % (end - begin))

    if not context_only:
        checkpoint_dir = "pretrained/multiview/"
    else: