Example 1
    def learnStructureHelper(self, dataset, ids):
        curr_depth = self.nvariables - dataset.shape[1]
        # stop splitting when too few records or variables remain, or at max depth
        if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
            clt = CLT()
            clt.learnStructure(dataset)
            return clt
        xycounts = Util.compute_xycounts(dataset) + 1  # Laplace correction
        xcounts = Util.compute_xcounts(dataset) + 2  # Laplace correction
        # compute mutual information score for all pairs of variables
        # weights are multiplied by -1.0 because we compute the minimum spanning tree
        edgemat = Util.compute_edge_weights(xycounts, xcounts)
        np.fill_diagonal(edgemat, 0)  # zero out self-information

        # choose the cut variable with the highest total score
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)

        # condition on the chosen variable and drop its column
        new_dataset1 = np.delete(dataset[dataset[:, variable] == 1], variable, 1)
        p1 = float(new_dataset1.shape[0]) + 1.0  # smoothed branch count
        new_ids = np.delete(ids, variable, 0)

        new_dataset0 = np.delete(dataset[dataset[:, variable] == 0], variable, 1)
        p0 = float(new_dataset0.shape[0]) + 1.0

        return [variable, ids[variable], p0, p1,
                self.learnStructureHelper(new_dataset0, new_ids),
                self.learnStructureHelper(new_dataset1, new_ids)]
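The nested list returned above is a cutset-network node: [variable, original_id, count0, count1, child0, child1], with CLT objects at the leaves. As a minimal sketch of how such a structure could be consumed (the walker below is hypothetical, not part of the repo; it assumes CLT.computeLL accepts a one-row array, as the computeLL calls elsewhere in these examples suggest), one can route a sample down the tree, accumulating the normalized branch probabilities:

import numpy as np

def cnet_row_ll(node, row):
    # Hypothetical walker: descend the [var, var_id, p0, p1, child0, child1]
    # list, multiplying in the normalized branch weights, until a CLT leaf.
    ll = 0.0
    while isinstance(node, list):
        var, var_id, p0, p1, child0, child1 = node
        z = p0 + p1  # p0, p1 are smoothed counts, not probabilities
        if row[var] == 0:
            ll += np.log(p0 / z)
            node = child0
        else:
            ll += np.log(p1 / z)
            node = child1
        row = np.delete(row, var)  # children were trained without this column
    return ll + node.computeLL(row.reshape(1, -1))  # CLT leaf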
Example 2
def load_mt(in_dir, data_name):
    """Rebuild a MIXTURE_CLT from the .npz archive saved for data_name."""
    infile = in_dir + data_name + '.npz'
    reload_dict = np.load(infile)
    reload_mix_clt = MIXTURE_CLT()
    reload_mix_clt.mixture_weight = reload_dict['weights']
    reload_mix_clt.n_components = reload_mix_clt.mixture_weight.shape[0]

    reload_clt_component = reload_dict['clt_component']

    for i in range(reload_mix_clt.n_components):
        clt_c = CLT()
        curr_component = reload_clt_component[i]
        clt_c.xyprob = curr_component['xyprob']
        clt_c.xprob = curr_component['xprob']
        clt_c.topo_order = curr_component['topo_order']
        clt_c.parents = curr_component['parents']
        clt_c.log_cond_cpt = curr_component['log_cond_cpt']
        clt_c.cond_cpt = np.exp(clt_c.log_cond_cpt)  # recover linear-scale CPT

        reload_mix_clt.clt_list.append(clt_c)

    return reload_mix_clt
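load_mt implies a matching writer. Below is a minimal sketch of that counterpart, assuming the dict-per-component layout that load_mt reads; the name save_mt and the layout are assumptions, not the repo's API. Note that NumPy >= 1.16.3 requires np.load(..., allow_pickle=True) to read object arrays such as clt_component.

import numpy as np

def save_mt(out_dir, data_name, mix_clt):
    # Hypothetical writer mirroring the keys load_mt expects.
    components = []
    for clt in mix_clt.clt_list:
        components.append({'xyprob': clt.xyprob,
                           'xprob': clt.xprob,
                           'topo_order': clt.topo_order,
                           'parents': clt.parents,
                           'log_cond_cpt': clt.log_cond_cpt})
    np.savez(out_dir + data_name + '.npz',
             weights=mix_clt.mixture_weight,
             clt_component=np.array(components, dtype=object))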
Example 3
    def learnStructureP_Helper(self, dataset, ids, portion):
        curr_depth = self.nvariables - dataset.shape[1]
        # stop splitting when too few records or variables remain, or at max depth
        if (dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var
                or curr_depth >= self.depth):
            clt = CLT()
            clt.learnStructure(dataset)
            return clt
        xycounts = Util.compute_xycounts(dataset) + 1  # Laplace correction
        xcounts = Util.compute_xcounts(dataset) + 2  # Laplace correction
        # compute mutual information score for all pairs of variables
        # weights are multiplied by -1.0 because we compute the minimum spanning tree
        edgemat = Util.compute_edge_weights(xycounts, xcounts)
        np.fill_diagonal(edgemat, 0)  # zero out self-information

        scores = np.sum(edgemat, axis=0)
        # sample a random subset of candidate variables, then pick the best of them
        ind_portion = np.random.choice(ids.shape[0],
                                       int(ids.shape[0] * portion),
                                       replace=False)
        scores_portion = scores[ind_portion]
        variable = ind_portion[np.argmax(scores_portion)]

        # condition on the chosen variable and drop its column
        new_dataset1 = np.delete(dataset[dataset[:, variable] == 1], variable, 1)
        p1 = float(new_dataset1.shape[0]) + 1.0
        new_ids = np.delete(ids, variable, 0)

        new_dataset0 = np.delete(dataset[dataset[:, variable] == 0], variable, 1)
        p0 = float(new_dataset0.shape[0]) + 1.0

        return [variable, ids[variable], p0, p1,
                self.learnStructureP_Helper(new_dataset0, new_ids, portion),
                self.learnStructureP_Helper(new_dataset1, new_ids, portion)]
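The only difference from learnStructureHelper in Example 1 is how the cut variable is chosen: the argmax is taken over a random subset of int(ids.shape[0] * portion) candidates rather than all of them, so portion=1.0 recovers the greedy choice and smaller values randomize the cutset. A self-contained illustration of that selection step (pick_split is a hypothetical name):

import numpy as np

def pick_split(scores, portion):
    # Sample a candidate subset without replacement, then take its best scorer.
    cand = np.random.choice(scores.shape[0], int(scores.shape[0] * portion),
                            replace=False)
    return cand[np.argmax(scores[cand])]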
Example 4
File: JT.py Project: vibhavg1/CNxD
def main_jt():
    dataset_dir = sys.argv[2]
    data_name = sys.argv[4]

    train_name = dataset_dir + data_name + '.ts.data'
    valid_name = dataset_dir + data_name + '.valid.data'
    test_name = dataset_dir + data_name + '.test.data'
    data_train = np.loadtxt(train_name, delimiter=',', dtype=np.uint32)
    data_valid = np.loadtxt(valid_name, delimiter=',', dtype=np.uint32)
    data_test = np.loadtxt(test_name, delimiter=',', dtype=np.uint32)

    clt = CLT()
    clt.learnStructure(data_train)
    print('clt test-set log-likelihood score: ',
          clt.computeLL(data_test) / data_test.shape[0])

    n_variable = data_train.shape[1]
    clt.get_log_cond_cpt()

    # build a junction tree from the learned Chow-Liu tree
    jt = JunctionTree()
    jt.learn_structure(clt.topo_order, clt.parents, clt.cond_cpt)

    # query all variables with no evidence
    evid_list = []
    query_var = np.arange(n_variable)

    start = time.time()
    marginal = get_marginal_JT(jt, evid_list, query_var)

    print('------Marginals------')
    for i in range(query_var.shape[0]):
        print(marginal[i])
    print('running time for new: ', time.time() - start)
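main_jt reads its paths from fixed argv positions, so it presumably expects an invocation like python JT.py -dir <dataset_dir> -data <name>; the flag names are a guess, only positions 2 and 4 matter. The empty evid_list above queries unconditional marginals. Assuming get_marginal_JT uses the same np.array([variable_id, value]) evidence convention as inference_jt in Example 7, conditioning on, say, variable 3 being observed as 1 would look like this (continuing main_jt's context):

evid_list = [np.array([3, 1])]  # hypothetical evidence: variable 3 observed as 1
marginal = get_marginal_JT(jt, evid_list, np.arange(n_variable))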
Example 5
    def learnStructure(self, dataset, n_components):
        # Learn a mixture of Chow-Liu trees: shuffle the data, then fit one
        # CLT per (roughly equal) slice, with uniform mixture weights.
        self.n_components = n_components
        self.mixture_weight = np.full(n_components, 1.0 / n_components)
        data_shuffle = np.copy(dataset)
        np.random.shuffle(data_shuffle)
        n_data = data_shuffle.shape[0] // self.n_components  # integer slice size

        for c in range(self.n_components):
            if c == self.n_components - 1:  # the last slice takes the remainder
                data_slice = data_shuffle[c * n_data:, :]
            else:
                data_slice = data_shuffle[c * n_data:((c + 1) * n_data), :]

            clt = CLT()
            clt.learnStructure(data_slice)

            self.clt_list.append(clt)
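Since the mixture weights stay uniform here (this snippet has no EM step), scoring a dataset under the mixture only needs a log-sum-exp over components. A sketch, assuming each CLT's computeLL returns the total log-likelihood of the rows it is given, as in Example 4 (mixture_ll is a hypothetical helper):

import numpy as np
from scipy.special import logsumexp

def mixture_ll(mix, data):
    # Per-row component log-likelihoods, shape (n_rows, n_components).
    comp = np.array([[clt.computeLL(data[i:i + 1]) for clt in mix.clt_list]
                     for i in range(data.shape[0])])
    # log sum_c w_c * P_c(x), averaged over rows.
    return np.mean(logsumexp(comp + np.log(mix.mixture_weight), axis=1))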
Example 6
    def learn_structure_weight(self, dataset, weights, ids, smooth):
        curr_depth = self.nvariables - dataset.shape[1]

        if (dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var
                or curr_depth >= self.depth):
            clt = CLT()
            clt.learnStructure(dataset)
            clt.xyprob = np.zeros((1, 1, 2, 2))  # placeholders to save memory
            clt.xprob = np.zeros((1, 2))
            return clt

        self.xycounts = Util.compute_weighted_xycounts(dataset, weights) + smooth
        self.xcounts = Util.compute_weighted_xcounts(dataset, weights) + 2.0 * smooth
        edgemat = Util.compute_edge_weights(self.xycounts, self.xcounts)
        np.fill_diagonal(edgemat, 0)  # zero out self-information

        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)

        # split the rows on the chosen variable and drop its column
        index1 = np.where(dataset[:, variable] == 1)[0]
        index0 = np.where(dataset[:, variable] == 0)[0]

        new_dataset = np.delete(dataset, variable, axis=1)

        new_dataset1 = new_dataset[index1]
        new_weights1 = weights[index1]
        p1 = np.sum(new_weights1) + smooth

        new_dataset0 = new_dataset[index0]
        new_weights0 = weights[index0]
        p0 = np.sum(new_weights0) + smooth

        # normalize the branch probabilities
        p0 = p0 / (p0 + p1)
        p1 = 1.0 - p0

        new_ids = np.delete(ids, variable, 0)

        return [variable, ids[variable], p0, p1,
                self.learn_structure_weight(new_dataset0, new_weights0, new_ids,
                                            smooth),
                self.learn_structure_weight(new_dataset1, new_weights1, new_ids,
                                            smooth)]
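Util.compute_weighted_xcounts is not shown in these examples, but for binary data its contract can be read off from how the counts are used above. A sketch of what it plausibly computes; the internals are an assumption:

import numpy as np

def weighted_xcounts(data, weights):
    # (nvars, 2) array: entry [v, s] sums the weights of rows with data[:, v] == s.
    w1 = weights.dot(data)    # weighted count of ones per variable
    w0 = weights.sum() - w1   # remaining weight goes to zeros
    return np.stack([w0, w1], axis=1)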
Example 7
    def learnStructureHelper(self, tum, dataset, ids, lamda, beta_function,
                             evid_list, data_ind, next_id=-1,
                             next_weights=np.zeros(2)):
        # next_weights default is evaluated once but only read, never mutated
        curr_depth = self.nvariables - ids.shape[0]

        if len(evid_list) == 0:  # the first run: use the full dataset
            sub_dataset = dataset
        else:
            if data_ind.shape[0] == 0:
                sub_dataset = np.array([])
            else:
                sub_dataset = dataset[data_ind, :][:, ids]
        # blending coefficient between data statistics and the TUM model
        alpha = utilM.updata_coef(sub_dataset.shape[0], dataset.shape[0],
                                  lamda, beta_function)
        if next_id == -1:
            # model part: pairwise and univariate marginals from the TUM
            p_xy, p_x = tum.inference_jt(evid_list, ids)

            if alpha > 0:
                # data part: smoothed empirical marginals from the sub-dataset
                xycounts = Util.compute_xycounts(sub_dataset) + 1  # Laplace correction
                xcounts = Util.compute_xcounts(sub_dataset) + 2  # Laplace correction
                p_xy_d = Util.normalize2d(xycounts)
                p_x_d = Util.normalize1d(xcounts)
                # convex blend of data and model estimates
                p_xy = alpha * p_xy_d + (1 - alpha) * p_xy
                p_x = alpha * p_x_d + (1 - alpha) * p_x

            # compute mutual information score for all pairs of variables
            edgemat = Util.compute_MI_prob(p_xy, p_x)
            # reset self mutual information to be 0
            np.fill_diagonal(edgemat, 0)

            scores = np.sum(edgemat, axis=0)
            variable = np.argmax(scores)
            variable_id = ids[variable]  # the index in the original file

            p1 = p_x[variable, 1]
            p0 = p_x[variable, 0]

            evid_list.append(np.array([variable_id, -1]))  # -1 means not determined yet

            if curr_depth >= self.depth:
                clt_leaf = CLT()
                clt_leaf.learnStructure_MI(edgemat)
                clt_leaf.xyprob = p_xy
                clt_leaf.xprob = p_x
                clt_leaf.get_log_cond_cpt()
                clt_leaf.xyprob = np.zeros((1, 1, 2, 2))  # save memory

                # bookkeeping so learning can resume from this leaf later
                save_info = {}
                save_info['ids'] = ids
                save_info['next_id'] = variable_id
                save_info['next_weights'] = np.array([p0, p1])
                save_info['evid_list'] = evid_list
                save_info['data_ind'] = data_ind

                clt_leaf.save_info = save_info
                return clt_leaf

        else:
            # resume from a previously saved leaf
            variable_id = next_id
            p0 = next_weights[0]
            p1 = next_weights[1]
            variable = np.where(ids == variable_id)[0][0]

        # branch the evidence list on the chosen variable
        evid_list_0 = copy.deepcopy(evid_list)
        evid_list_1 = copy.deepcopy(evid_list)
        evid_list_0[-1][1] = 0
        evid_list_1[-1][1] = 1

        if alpha > 0:
            new_data_ind0 = data_ind[np.where(sub_dataset[:, variable] == 0)[0]]
            new_data_ind1 = data_ind[np.where(sub_dataset[:, variable] == 1)[0]]
        else:
            new_data_ind0 = np.array([])
            new_data_ind1 = np.array([])

        new_ids = np.delete(ids, variable)

        return [variable, variable_id, p0, p1,
                self.learnStructureHelper(tum, dataset, new_ids, lamda,
                                          beta_function, evid_list_0,
                                          new_data_ind0),
                self.learnStructureHelper(tum, dataset, new_ids, lamda,
                                          beta_function, evid_list_1,
                                          new_data_ind1)]
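The core idea above is the convex blend p = alpha * p_data + (1 - alpha) * p_model, with alpha supplied by utilM.updata_coef as a function of how much data reaches the current branch. One plausible such schedule, linear in the data fraction, is sketched below; the real beta_function options are not shown in these examples, so this is an assumption:

def linear_coef(n_sub, n_total, lamda):
    # Hypothetical coefficient schedule: alpha shrinks toward 0 as the
    # conditioned subset gets smaller, so deep branches lean on the model
    # rather than on sparse counts.
    return lamda * float(n_sub) / n_total

# usage, mirroring the blend in the helper above:
# alpha = linear_coef(sub_dataset.shape[0], dataset.shape[0], lamda)
# p_xy = alpha * p_xy_d + (1 - alpha) * p_xy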