Example #1
    def marginal_inference(self, evidence={}):

        messages = np.zeros((self.n_features, 2))
        logprob = 0.0
        for i in self.post_order:
            if i != 0:
                state_evidence = evidence.get(self.scope[i])
                if state_evidence is not None:
                    messages[self.tree[i], 0] += self.log_factors[
                        i, state_evidence, 0] + messages[i, state_evidence]
                    messages[self.tree[i], 1] += self.log_factors[
                        i, state_evidence, 1] + messages[i, state_evidence]
                else:
                    # marginalization
                    messages[self.tree[i], 0] += logr(
                        np.exp(self.log_factors[i, 0, 0] + messages[i, 0]) +
                        np.exp(self.log_factors[i, 1, 0] + messages[i, 1]))
                    messages[self.tree[i], 1] += logr(
                        np.exp(self.log_factors[i, 0, 1] + messages[i, 0]) +
                        np.exp(self.log_factors[i, 1, 1] + messages[i, 1]))
            else:
                state_evidence = evidence.get(self.scope[i])
                if state_evidence is not None:
                    logprob = self.log_factors[i, state_evidence,
                                               0] + messages[0, state_evidence]
                else:
                    # marginalization
                    logprob = logr(
                        np.exp(self.log_factors[i, 0, 0] + messages[0, 0]) +
                        np.exp(self.log_factors[i, 1, 0] + messages[0, 1]))
        return logprob
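
Note: logr is not defined in any of these snippets; throughout the project it behaves like a log that tolerates zero. A minimal sketch of what it presumably does (the exact handling of log(0) is an assumption):

import numpy as np

def logr(x):
    # Hypothetical reconstruction: a log that maps 0 to -inf without
    # raising a divide-by-zero warning; the real dcsn helper may
    # instead clip at a small epsilon.
    with np.errstate(divide='ignore'):
        return np.log(x)

Also note that the logr(np.exp(a) + np.exp(b)) pattern in the marginalization branches is exactly np.logaddexp(a, b), which avoids underflow when both terms are very negative.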
Example #2
 def score_sample_log_proba(self, x):
     """ WRITEME """
     prob = 0.0
     x1 = np.concatenate((x[0:self.or_feature], x[self.or_feature + 1:]))
     if x[self.or_feature] == 0:
         prob = prob + logr(self.left_weight) + self.left_child.score_sample_log_proba(x1)
     else:
         prob = prob + logr(self.right_weight) + self.right_child.score_sample_log_proba(x1)
     return prob
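
In log space this is the OR-node factorization of a cutset network: the node conditions on or_feature, splices that column out of the sample (x1), and scores the remainder with the child matching the observed value, so log P(x) = logr(branch weight) + log P_child(x without or_feature).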
Example #3
 def score_samples(self, data, n_c, out_filename):
     with open(out_filename, 'w') as out_log:
         self.compute_weights(n_c)
         mean = 0.0
         for x in data:
             prob = 0.0
             for k in range(n_c):
                 prob = prob + np.exp(self.csns[k].score_sample_log_proba(x))*self.weights[k]
             mean = mean + logr(prob)
             out_log.write('%.10f\n'%logr(prob))
     return mean / data.shape[0]
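
The per-sample mixture above is accumulated in probability space, which can underflow to 0.0 (and hence logr(prob) = -inf) when every component log-probability is very negative. A hypothetical underflow-safe variant of the inner loop, assuming the same csns and weights attributes:

import numpy as np
from scipy.special import logsumexp

def score_sample_stable(self, x, n_c):
    # log sum_k w_k * exp(ll_k), computed without leaving log space
    log_terms = [self.csns[k].score_sample_log_proba(x) + np.log(self.weights[k])
                 for k in range(n_c)]
    return logsumexp(log_terms)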
Example #4
 def score_sample_log_proba(self, x):
     """ WRITEME """
     prob = 0.0
     for i in range(len(self.tree_forest)):
         if self.or_features[i] is None:
             prob = prob + self.cltree.score_sample_scope_log_proba(x, self.tree_forest[i])
         else:
             x0 = x[self.tree_forest[i]]
             x1 = np.concatenate((x0[0:self.or_features[i]], x0[self.or_features[i] + 1:]))
             if x0[self.or_features[i]] == 0:
                 prob = prob + logr(self.left_weights[i]) + self.children_left[i].score_sample_log_proba(x1)
             else:
                 prob = prob + logr(self.right_weights[i]) + self.children_right[i].score_sample_log_proba(x1)
     return prob
Example #5
 def marginal_inference(self, evidence={}):
     log_proba = 0.0
     state_evidence = evidence.get(self.or_feature_scope)
     if state_evidence is not None:
         if state_evidence == 0:
             log_proba = self.left_child.marginal_inference(evidence)
             log_proba += logr(self.left_weight)
         else:
             log_proba = self.right_child.marginal_inference(evidence)
             log_proba += logr(self.right_weight)
     else:
         left_log_proba = self.left_child.marginal_inference(evidence)
         right_log_proba = self.right_child.marginal_inference(evidence)
         log_proba = logr(np.exp(left_log_proba)*self.left_weight + np.exp(right_log_proba)*self.right_weight)
     return log_proba
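
A general Python caveat for the evidence={} signatures used throughout these examples: a mutable default is created once and shared across calls. None of the snippets shown here mutate evidence, so they happen to be safe, but the conventional defensive form is:

def marginal_inference(self, evidence=None):
    if evidence is None:
        evidence = {}
    ...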
Example #6
 def score_sample_log_proba(self, x):
     """ WRITEME """
     prob = 0.0
     for s in range(len(self.children)):
         prob = prob + (self.weights[s] *
                        np.exp(self.children[s].score_sample_log_proba(x)))
     return logr(prob)
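
The same underflow caveat as in Example #3 applies here: summing np.exp terms can vanish to zero, and scipy.special.logsumexp over child_ll + np.log(weight) terms is the numerically safe equivalent.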
Example #7
def log_probs_numba(n_features, 
                    scope, 
                    n_samples, 
                    alpha, 
                    mpriors, 
                    priors, 
                    log_probs, 
                    log_j_probs, 
                    cond, 
                    p):
    for i in range(n_features):
        id_i = scope[i]
        prob = (p[i] + alpha*mpriors[id_i,1])/(n_samples + alpha)
        log_probs[i,0] = logr(1-prob)
        log_probs[i,1] = logr(prob)

    for i in range(n_features):
        for j in range(n_features):
            id_i = scope[i]
            id_j = scope[j]

            log_j_probs[i,j,1,1] = logr((cond[i,j] + alpha*priors[id_i,id_j,1,1]) / ( n_samples + alpha))
            log_j_probs[i,j,0,1] = logr((cond[j,j] - cond[i,j] + alpha*priors[id_i,id_j,0,1]) / ( n_samples + alpha))
            log_j_probs[i,j,1,0] = logr((cond[i,i] - cond[i,j] + alpha*priors[id_i,id_j,1,0]) / ( n_samples + alpha))
            log_j_probs[i,j,0,0] = logr((n_samples - cond[j,j] - cond[i,i] + cond[i,j] + alpha*priors[id_i,id_j,0,0]) / ( n_samples + alpha))

            log_j_probs[j,i,1,1] = log_j_probs[i,j,1,1]
            log_j_probs[j,i,1,0] = log_j_probs[i,j,0,1]
            log_j_probs[j,i,0,1] = log_j_probs[i,j,1,0]
            log_j_probs[j,i,0,0] = log_j_probs[i,j,0,0]

    return (log_probs, log_j_probs)
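
These smoothed marginal and pairwise tables are exactly the ingredients a Chow-Liu fit needs. Purely as an illustration (the project's own Cltree code is not shown here), the mutual information between two features can be read off the returned arrays like this:

import numpy as np

def mutual_information(i, j, log_probs, log_j_probs):
    # I(X_i; X_j) = sum_{a,b} P(a,b) * (log P(a,b) - log P(a) - log P(b))
    mi = 0.0
    for a in (0, 1):
        for b in (0, 1):
            log_pab = log_j_probs[i, j, a, b]
            mi += np.exp(log_pab) * (log_pab - log_probs[i, a] - log_probs[j, b])
    return mi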
Example #8
 def mpe(self, evidence={}):
     mpe_log_proba = 0.0
     state_evidence = evidence.get(self.or_feature_scope)
     if state_evidence is not None:
         if state_evidence == 0:
             (mpe_state, mpe_log_proba) = self.left_child.mpe(evidence)
             mpe_state[self.or_feature_scope] = 0
             mpe_log_proba += logr(self.left_weight)
         else:
             (mpe_state, mpe_log_proba) = self.right_child.mpe(evidence)
             mpe_state[self.or_feature_scope] = 1
             mpe_log_proba += logr(self.right_weight)
     else:
         (left_mpe_state, left_mpe_log_proba) = self.left_child.mpe(evidence)
         (right_mpe_state, right_mpe_log_proba) = self.right_child.mpe(evidence)
         if (left_mpe_log_proba + logr(self.left_weight) >
                 right_mpe_log_proba + logr(self.right_weight)):
             mpe_state = left_mpe_state
             mpe_state[self.or_feature_scope] = 0
             mpe_log_proba = left_mpe_log_proba + logr(self.left_weight)
         else:
             mpe_state = right_mpe_state
             mpe_state[self.or_feature_scope] = 1
             mpe_log_proba = right_mpe_log_proba + logr(self.right_weight)
     return (mpe_state, mpe_log_proba)
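
A hypothetical call (root is a stand-in for the root node of a fitted network; features are indexed by integers):

mpe_state, mpe_log_proba = root.mpe(evidence={3: 1})
# mpe_state: the most probable completion, with feature 3 pinned to 1
# mpe_log_proba: its log-probability, including the branch weights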
Example #9
    def naive_marginal(self, evidence = {}):
        probm = 0.0
        
        M = {}
        for i in range(self.n_features):
            if evidence.get(i) is None:
                M[i] = [0,1]

        A = [dict(zip(M,prod)) for prod in itertools.product(*(M[param] for param in M))]

        for D in A:
            D.update(evidence)
            prob = self.log_factors[0, D[0], 0]
            for i in range(1,self.n_features):
                prob = prob + self.log_factors[i, D[i], D[self.tree[i]]]
            probm += np.exp(prob)
        return logr(probm)
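
This brute-force routine sums over all 2^k completions of the k unobserved variables, so it is only practical for small scopes, but that makes it a handy oracle for the message-passing version. A hypothetical sanity check, assuming clt is a fitted Cltree:

import numpy as np

evidence = {0: 1, 2: 0}
assert np.isclose(clt.marginal_inference(evidence), clt.naive_marginal(evidence))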
Example #10
File: csn.py Project: Rhuax/dcsn
    def or_cut(self):
        """ WRITEME """
        # print(" > trying to cut ... ")
        sys.stdout.flush()

        found = False

        bestlik = self.orig_ll
        best_clt_l = None
        best_clt_r = None
        best_feature_cut = None
        best_left_weight = 0.0
        best_right_weight = 0.0
        best_right_data = None
        best_left_data = None
        best_v_ll = 0.0
        best_gain = -np.inf
        best_left_sample_weight = None
        best_right_sample_weight = None
        best_left_vdata = None
        best_right_vdata = None
        if self.sum_nodes:

            # check for clustering
            n_clusters = 2
            cov_type = 'tied'
            rand_gen = None
            n_iters = 1000
            n_restarts = 1
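            # NOTE: sklearn.mixture.GMM is the old (pre-0.20) scikit-learn
            # API; on current releases the equivalent is GaussianMixture,
            # with max_iter in place of n_iter.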
            gmm_c = sklearn.mixture.GMM(n_components=n_clusters,
                                        covariance_type=cov_type,
                                        random_state=rand_gen,
                                        n_iter=n_iters,
                                        n_init=n_restarts)

            gmm_c.fit(self.data)

            clustering = gmm_c.predict(self.data)

            # prevent a cluster from having zero instances
            cardinality = np.sum(clustering)
            # print("   - Clustering instances:",self.data.shape[0], "-", cardinality,self.data.shape[0] - cardinality, end=" ")
            if cardinality > 0 and (self.data.shape[0] - cardinality) > 0:

                cluster_0 = (clustering == 0)

                cluster_0_data = self.data[cluster_0]
                cluster_1_data = self.data[~cluster_0]

                cluster_0_tree = Cltree()
                cluster_1_tree = Cltree()

                cluster_0_weight = cluster_0_data.shape[0] / self.data.shape[0]
                cluster_1_weight = cluster_1_data.shape[0] / self.data.shape[0]

                cluster_0_tree.fit(cluster_0_data,
                                   vdata=self.vdata,
                                   m_priors=self.m_priors,
                                   j_priors=self.j_priors,
                                   scope=self.scope,
                                   alpha=self.alpha * cluster_0_weight,
                                   and_leaves=self.and_leaves,
                                   sample_weight=None)
                cluster_1_tree.fit(cluster_1_data,
                                   m_priors=self.m_priors,
                                   j_priors=self.j_priors,
                                   scope=self.scope,
                                   alpha=self.alpha * cluster_1_weight,
                                   and_leaves=self.and_leaves,
                                   sample_weight=None)

                cluster_0_ll = cluster_0_tree.score_samples_log_proba(
                    cluster_0_data, sample_weight=None)
                cluster_1_ll = cluster_1_tree.score_samples_log_proba(
                    cluster_1_data, sample_weight=None)

                # log sum exp
                clustering_ll = 0.0
                for d in self.data:
                    clustering_ll = clustering_ll + logr(
                        cluster_0_weight *
                        np.exp(cluster_0_tree.score_sample_log_proba(d)) +
                        cluster_1_weight *
                        np.exp(cluster_1_tree.score_sample_log_proba(d)))
                clustering_ll = clustering_ll / self.data.shape[0]

                # print("ll:", clustering_ll)

            else:
                clustering_ll = -np.inf
        else:
            clustering_ll = -np.inf

        if self.random_forest:
            if self.d > self.node.cltree.n_features:
                selected = range(self.node.cltree.n_features)
            else:
                selected = sorted(
                    random.sample(range(self.node.cltree.n_features), self.d))

        else:
            selected = range(self.node.cltree.n_features)

        for feature in selected:
            condition = self.data[:, feature] == 0
            new_features = np.ones(self.data.shape[1], dtype=bool)
            new_features[feature] = False
            left_data = self.data[condition, :][:, new_features]
            right_data = self.data[~condition, :][:, new_features]

            vdata_condition = self.vdata[:, feature] == 0
            left_vdata = self.vdata[vdata_condition, :][:, new_features]
            right_vdata = self.vdata[~vdata_condition, :][:, new_features]

            if self.sample_weight is not None:
                left_sample_weight = self.sample_weight[condition]
                right_sample_weight = self.sample_weight[~condition]
                left_weight = np.sum(left_sample_weight) / np.sum(
                    self.sample_weight)
                right_weight = np.sum(right_sample_weight) / np.sum(
                    self.sample_weight)
            else:
                left_sample_weight = None
                right_sample_weight = None
                left_weight = (left_data.shape[0]) / (self.data.shape[0])
                right_weight = (right_data.shape[0]) / (self.data.shape[0])

            if (left_data.shape[0] > 0 and right_data.shape[0] > 0 and
                    left_vdata.shape[0] > 0 and right_vdata.shape[0] > 0):

                left_scope = np.concatenate(
                    (self.node.cltree.scope[0:feature],
                     self.node.cltree.scope[feature + 1:]))
                right_scope = np.concatenate(
                    (self.node.cltree.scope[0:feature],
                     self.node.cltree.scope[feature + 1:]))
                CL_l = Cltree()
                CL_r = Cltree()

                CL_l.fit(left_data,
                         self.m_priors,
                         self.j_priors,
                         scope=left_scope,
                         alpha=self.alpha * left_weight,
                         and_leaves=self.and_leaves,
                         sample_weight=left_sample_weight,
                         noise=self.noise)
                CL_r.fit(right_data,
                         self.m_priors,
                         self.j_priors,
                         scope=right_scope,
                         alpha=self.alpha * right_weight,
                         and_leaves=self.and_leaves,
                         sample_weight=right_sample_weight,
                         noise=self.noise)

                l_ll = CL_l.score_samples_log_proba(
                    left_data, sample_weight=left_sample_weight)
                r_ll = CL_r.score_samples_log_proba(
                    right_data, sample_weight=right_sample_weight)

                if self.sample_weight is not None:
                    ll = ((l_ll + logr(left_weight)) *
                          np.sum(left_sample_weight) +
                          (r_ll + logr(right_weight)) *
                          np.sum(right_sample_weight)) / np.sum(
                              self.sample_weight)
                else:
                    ll = ((l_ll + logr(left_weight)) * left_data.shape[0] +
                          (r_ll + logr(right_weight)) *
                          right_data.shape[0]) / self.data.shape[0]
            else:
                ll = -np.inf

            if ll > bestlik:
                bestlik = ll
                best_clt_l = CL_l
                best_clt_r = CL_r
                best_feature_cut = feature
                best_left_weight = left_weight
                best_right_weight = right_weight
                best_right_data = right_data
                best_left_data = left_data
                best_right_vdata = right_vdata
                best_left_vdata = left_vdata
                best_l_ll = l_ll
                best_r_ll = r_ll
                best_left_sample_weight = left_sample_weight
                best_right_sample_weight = right_sample_weight

                found = True
        """
        if (self.depth+1) % 2 == 0:
            bestlik = self.orig_ll
        else:
            clustering_ll = self.orig_ll
        """

        gain = (bestlik - self.orig_ll)
        # print ("   - gain cut:", gain, end = "")

        gain_c = (clustering_ll - self.orig_ll)
        # print (" gain clustering:", gain_c)

        if (found and gain > self.min_gain) or (
                gain_c > gain and gain_c > self.min_gain):

            if (gain > gain_c):

                self.node = OrNode()
                Csn._or_nodes = Csn._or_nodes + 1
                Csn._or_edges = Csn._or_edges + 2

                self.node.or_feature = best_feature_cut
                # print("   - cutting on feature ", self.node.or_feature, "[#l:",best_left_data.shape[0],", #r:",best_right_data.shape[0],"], gain:", bestlik - self.orig_ll)

                instances = self.data.shape[0]

                self.node.left_weight = best_left_weight
                self.node.right_weight = best_right_weight

                # free memory before recursing
                self.free_memory()

                self.node.left_child = Csn(
                    data=best_left_data,
                    vdata=best_left_vdata,
                    clt=best_clt_l,
                    ll=best_l_ll,
                    min_instances=self.min_instances,
                    min_features=self.min_features,
                    alpha=self.alpha * best_left_weight,
                    d=self.d,
                    random_forest=self.random_forest,
                    m_priors=self.m_priors,
                    j_priors=self.j_priors,
                    n_original_samples=self.n_original_samples,
                    and_leaves=self.and_leaves,
                    and_inners=self.and_inners,
                    min_gain=self.min_gain,
                    depth=self.depth + 1,
                    sample_weight=best_left_sample_weight,
                    forest_approach=self.forest_approach,
                    noise=self.noise)
                self.node.right_child = Csn(
                    data=best_right_data,
                    vdata=best_right_vdata,
                    clt=best_clt_r,
                    ll=best_r_ll,
                    min_instances=self.min_instances,
                    min_features=self.min_features,
                    alpha=self.alpha * best_right_weight,
                    d=self.d,
                    random_forest=self.random_forest,
                    m_priors=self.m_priors,
                    j_priors=self.j_priors,
                    n_original_samples=self.n_original_samples,
                    and_leaves=self.and_leaves,
                    and_inners=self.and_inners,
                    min_gain=self.min_gain,
                    depth=self.depth + 1,
                    sample_weight=best_right_sample_weight,
                    forest_approach=self.forest_approach,
                    noise=self.noise)

            else:
                self.node = SumNode()
                # print("   - Adding a sum node")

                Csn._sum_nodes = Csn._sum_nodes + 1

                instances = self.data.shape[0]

                self.node.weights.append(cluster_0_weight)
                self.node.weights.append(cluster_1_weight)

                # free memory before recursing
                self.free_memory()

                self.node.children.append(
                    Csn(data=cluster_0_data,
                        vdata=None,
                        clt=cluster_0_tree,
                        ll=cluster_0_ll,
                        min_instances=self.min_instances,
                        min_features=self.min_features,
                        alpha=self.alpha * cluster_0_weight,
                        d=self.d,
                        random_forest=self.random_forest,
                        m_priors=self.m_priors,
                        j_priors=self.j_priors,
                        n_original_samples=self.n_original_samples,
                        and_leaves=self.and_leaves,
                        and_inners=self.and_inners,
                        min_gain=self.min_gain,
                        depth=self.depth + 1,
                        sample_weight=None,
                        forest_approach=self.forest_approach,
                        noise=self.noise))
                self.node.children.append(
                    Csn(data=cluster_1_data,
                        vdata=None,
                        clt=cluster_1_tree,
                        ll=cluster_1_ll,
                        min_instances=self.min_instances,
                        min_features=self.min_features,
                        alpha=self.alpha * cluster_1_weight,
                        d=self.d,
                        random_forest=self.random_forest,
                        m_priors=self.m_priors,
                        j_priors=self.j_priors,
                        n_original_samples=self.n_original_samples,
                        and_leaves=self.and_leaves,
                        and_inners=self.and_inners,
                        min_gain=self.min_gain,
                        depth=self.depth + 1,
                        sample_weight=None,
                        forest_approach=self.forest_approach,
                        noise=self.noise))

        else:
            # Make a forest
            if self.and_leaves:
                self.node.cltree.makeForest(
                    vdata=self.vdata, forest_approach=self.forest_approach)
            # print(" no cutting")
            """if self.node.cltree.is_forest():
Example #11
File: csn.py Project: Rhuax/dcsn
    def and_cut(self):
        """ WRITEME """
        n_features = self.data.shape[1]
        self.forest = np.zeros(n_features, dtype=int)
        self.roots = []

        # naive approach to build the tree_forest
        for i in range(n_features):
            if self.node.cltree.tree[i] == -1:
                self.roots.append(i)
        for i in range(n_features):
            if self.node.cltree.tree[i] != -1:
                parent = self.node.cltree.tree[i]
                while self.node.cltree.tree[parent] != -1:
                    parent = self.node.cltree.tree[parent]
                self.forest[i] = parent
            else:
                self.forest[i] = i

        self.tree_forest = []
        for r in self.roots:
            t_forest = []
            for i in range(n_features):
                if self.forest[i] == r:
                    t_forest.append(i)
            self.tree_forest.append(t_forest)
        """print ("AND node")
        print (self.tree_forest)"""

        for i in range(self.node.cltree.num_trees):

            # print(" tree", self.tree_forest[i])
            sys.stdout.flush()

            tree_n_features = len(self.tree_forest[i])

            if self.data.shape[0] > self.min_instances:
                if tree_n_features >= self.min_features:

                    tree_data = self.data[:, self.tree_forest[i]]

                    found = False

                    orig_ll = self.node.cltree.score_samples_scope_log_proba(
                        self.data, self.tree_forest[i])

                    bestlik = orig_ll
                    best_clt_l = None
                    best_clt_r = None
                    best_feature_cut = None
                    best_left_weight = 0.0
                    best_right_weight = 0.0
                    best_right_data = None
                    best_left_data = None
                    best_v_ll = 0.0
                    best_gain = -np.inf

                    if self.random_forest:
                        if self.d > tree_n_features:
                            selected = range(tree_n_features)
                        else:
                            selected = sorted(
                                random.sample(range(tree_n_features), self.d))
                    else:
                        selected = range(tree_n_features)

                    for feature in selected:
                        condition = tree_data[:, feature] == 0
                        new_features = np.ones(tree_data.shape[1], dtype=bool)
                        new_features[feature] = False
                        left_data = tree_data[condition, :][:, new_features]
                        right_data = tree_data[~condition, :][:, new_features]
                        left_weight = (left_data.shape[0]) / (
                            tree_data.shape[0])
                        right_weight = (right_data.shape[0]) / (
                            tree_data.shape[0])

                        if self.sample_weight is not None:
                            left_sample_weight = self.sample_weight[condition]
                            right_sample_weight = self.sample_weight[
                                ~condition]
                        else:
                            left_sample_weight = None
                            right_sample_weight = None

                        if left_data.shape[0] > 1 and right_data.shape[0] > 1:
                            # compute the tree features id
                            tree_scope = np.zeros(tree_n_features, dtype=int)
                            for f in range(tree_n_features):
                                tree_scope[f] = self.node.cltree.scope[
                                    self.tree_forest[i][f]]

                            left_scope = np.concatenate(
                                (tree_scope[0:feature],
                                 tree_scope[feature + 1:]))
                            right_scope = np.concatenate(
                                (tree_scope[0:feature],
                                 tree_scope[feature + 1:]))

                            CL_l = Cltree()
                            CL_r = Cltree()

                            CL_l.fit(left_data,
                                     vdata=self.vdata,
                                     m_priors=self.m_priors,
                                     j_priors=self.j_priors,
                                     scope=left_scope,
                                     alpha=self.alpha * left_weight,
                                     and_leaves=self.and_leaves,
                                     sample_weight=left_sample_weight)
                            CL_r.fit(right_data,
                                     vdata=self.vdata,
                                     m_priors=self.m_priors,
                                     j_priors=self.j_priors,
                                     scope=right_scope,
                                     alpha=self.alpha * right_weight,
                                     and_leaves=self.and_leaves,
                                     sample_weight=right_sample_weight)

                            l_ll = CL_l.score_samples_log_proba(left_data)
                            r_ll = CL_r.score_samples_log_proba(right_data)

                            ll = ((l_ll + logr(left_weight)) *
                                  left_data.shape[0] +
                                  (r_ll + logr(right_weight)) *
                                  right_data.shape[0]) / self.data.shape[0]
                        else:
                            ll = -np.inf

                        if ll > bestlik:
                            bestlik = ll
                            best_clt_l = CL_l
                            best_clt_r = CL_r
                            best_feature_cut = feature
                            best_left_weight = left_weight
                            best_right_weight = right_weight
                            best_right_data = right_data
                            best_left_data = left_data
                            best_l_ll = l_ll
                            best_r_ll = r_ll

                            best_left_sample_weight = left_sample_weight
                            best_right_sample_weight = right_sample_weight

                            found = True

                    gain = (bestlik - orig_ll)
                    # print (" gain:", gain, end = " ")
                    """if gain <= self.min_gain:
                        print("no improvement")"""

                    if found and gain > self.min_gain:

                        if not is_and_node(self.node):
                            clt = self.node.cltree
                            self.node = AndNode()
                            self.node.cltree = clt
                            self.node.children_left = [
                                None
                            ] * self.node.cltree.num_trees
                            self.node.children_right = [
                                None
                            ] * self.node.cltree.num_trees
                            self.node.or_features = [
                                None
                            ] * self.node.cltree.num_trees
                            self.node.left_weights = [
                                None
                            ] * self.node.cltree.num_trees
                            self.node.right_weights = [
                                None
                            ] * self.node.cltree.num_trees
                            self.node.tree_forest = self.tree_forest

                        Csn._or_nodes = Csn._or_nodes + 1
                        Csn._or_edges = Csn._or_edges + 2

                        self.node.or_features[i] = best_feature_cut
                        # print(" cutting on feature ", self.node.or_features[i])

                        instances = self.data.shape[0]

                        self.node.left_weights[i] = best_left_weight
                        self.node.right_weights[i] = best_right_weight

                        self.node.children_left[i] = Csn(
                            data=best_left_data,
                            vdata=self.vdata,
                            clt=best_clt_l,
                            ll=best_l_ll,
                            min_instances=self.min_instances,
                            min_features=self.min_features,
                            alpha=self.alpha * best_left_weight,
                            d=self.d,
                            random_forest=self.random_forest,
                            m_priors=self.m_priors,
                            j_priors=self.j_priors,
                            n_original_samples=self.n_original_samples,
                            and_leaves=self.and_leaves,
                            and_inners=self.and_inners,
                            min_gain=self.min_gain,
                            depth=self.depth + 1,
                            sample_weight=best_left_sample_weight)
                        self.node.children_right[i] = Csn(
                            data=best_right_data,
                            vdata=self.vdata,
                            clt=best_clt_r,
                            ll=best_r_ll,
                            min_instances=self.min_instances,
                            min_features=self.min_features,
                            alpha=self.alpha * best_right_weight,
                            d=self.d,
                            random_forest=self.random_forest,
                            m_priors=self.m_priors,
                            j_priors=self.j_priors,
                            n_original_samples=self.n_original_samples,
                            and_leaves=self.and_leaves,
                            and_inners=self.and_inners,
                            min_gain=self.min_gain,
                            depth=self.depth + 1,
                            sample_weight=best_right_sample_weight)
                """else:
                    print( " > no cutting due to few features")
            else:
                print(" > no cutting due to few instances")"""
        if is_and_node(self.node):
            Csn._and_nodes += 1

        # free memory before recursing
        self.free_memory()
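
The parent-climbing loop at the top of and_cut assigns every feature to the root of its tree (tree[i] == -1 marks a root). A hypothetical standalone equivalent of that loop:

import numpy as np

def feature_roots(tree):
    # for each feature, follow parent pointers until a root (-1) is reached
    roots = np.arange(len(tree))
    for i, parent in enumerate(tree):
        while parent != -1:
            roots[i] = parent
            parent = tree[parent]
    return roots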
Beispiel #19
0
Datei: csn.py Projekt: Rhuax/dcsn
    def and_cut(self):
        """ WRITEME """
        n_features = self.data.shape[1]
        self.forest = np.zeros(n_features, dtype=np.int)
        self.roots = []

        # naive approach to build the tree_forest
        for i in range(n_features):
            if self.node.cltree.tree[i] == -1:
                self.roots.append(i)
        for i in range(n_features):
            if self.node.cltree.tree[i] != -1:
                parent = self.node.cltree.tree[i]
                while self.node.cltree.tree[parent] != -1:
                    parent = self.node.cltree.tree[parent]
                self.forest[i] = parent
            else:
                self.forest[i] = i

        self.tree_forest = []
        for r in self.roots:
            t_forest = []
            for i in range(n_features):
                if self.forest[i] == r:
                    t_forest.append(i)
            self.tree_forest.append(t_forest)

        """print ("AND node")
        print (self.tree_forest)"""

        for i in range(self.node.cltree.num_trees):

            # print(" tree", self.tree_forest[i])
            sys.stdout.flush()

            tree_n_features = len(self.tree_forest[i])

            if self.data.shape[0] > self.min_instances:
                if tree_n_features >= self.min_features:

                    tree_data = self.data[:, self.tree_forest[i]]

                    found = False

                    orig_ll = self.node.cltree.score_samples_scope_log_proba(self.data, self.tree_forest[i])

                    bestlik = orig_ll
                    best_clt_l = None
                    best_clt_r = None
                    best_feature_cut = None
                    best_left_weight = 0.0
                    best_right_weight = 0.0
                    best_right_data = None
                    best_left_data = None
                    best_v_ll = 0.0
                    best_gain = -np.inf

                    if self.random_forest:
                        if self.d > tree_n_features:
                            selected = range(tree_n_features)
                        else:
                            selected = sorted(random.sample(range(tree_n_features), self.d))
                    else:
                        selected = range(tree_n_features)

                    for feature in selected:
                        condition = tree_data[:, feature] == 0
                        new_features = np.ones(tree_data.shape[1], dtype=bool)
                        new_features[feature] = False
                        left_data = tree_data[condition, :][:, new_features]
                        right_data = tree_data[~condition, :][:, new_features]
                        left_weight = (left_data.shape[0]) / (tree_data.shape[0])
                        right_weight = (right_data.shape[0]) / (tree_data.shape[0])

                        if self.sample_weight is not None:
                            left_sample_weight = self.sample_weight[condition]
                            right_sample_weight = self.sample_weight[~condition]
                        else:
                            left_sample_weight = None
                            right_sample_weight = None

                        if left_data.shape[0] > 1 and right_data.shape[0] > 1:
                            # compute the tree features id
                            tree_scope = np.zeros(tree_n_features, dtype=np.int)
                            for f in range(tree_n_features):
                                tree_scope[f] = self.node.cltree.scope[self.tree_forest[i][f]]

                            left_scope = np.concatenate((tree_scope[0:feature], tree_scope[feature + 1:]))
                            right_scope = np.concatenate((tree_scope[0:feature], tree_scope[feature + 1:]))

                            CL_l = Cltree()
                            CL_r = Cltree()

                            CL_l.fit(left_data, vdata=self.vdata, m_priors=self.m_priors, j_priors=self.j_priors,
                                     scope=left_scope, alpha=self.alpha * left_weight,
                                     and_leaves=self.and_leaves, sample_weight=left_sample_weight)
                            CL_r.fit(right_data, vdata=self.vdata, m_priors=self.m_priors, j_priors=self.j_priors,
                                     scope=right_scope, alpha=self.alpha * right_weight,
                                     and_leaves=self.and_leaves, sample_weight=right_sample_weight)

                            l_ll = CL_l.score_samples_log_proba(left_data)
                            r_ll = CL_r.score_samples_log_proba(right_data)

                            ll = ((l_ll + logr(left_weight)) * left_data.shape[0] + (r_ll + logr(right_weight)) *
                                  right_data.shape[0]) / self.data.shape[0]
                        else:
                            ll = -np.inf

                        if ll > bestlik:
                            bestlik = ll
                            best_clt_l = CL_l
                            best_clt_r = CL_r
                            best_feature_cut = feature
                            best_left_weight = left_weight
                            best_right_weight = right_weight
                            best_right_data = right_data
                            best_left_data = left_data
                            best_l_ll = l_ll
                            best_r_ll = r_ll

                            best_left_sample_weight = left_sample_weight
                            best_right_sample_weight = right_sample_weight

                            found = True

                    gain = (bestlik - orig_ll)
                    # print (" gain:", gain, end = " ")

                    """if gain <= self.min_gain:
                        print("no improvement")"""

                    if found == True and gain > self.min_gain:

                        if not is_and_node(self.node):
                            clt = self.node.cltree
                            self.node = AndNode()
                            self.node.cltree = clt
                            self.node.children_left = [None] * self.node.cltree.num_trees
                            self.node.children_right = [None] * self.node.cltree.num_trees
                            self.node.or_features = [None] * self.node.cltree.num_trees
                            self.node.left_weights = [None] * self.node.cltree.num_trees
                            self.node.right_weights = [None] * self.node.cltree.num_trees
                            self.node.tree_forest = self.tree_forest

                        Csn._or_nodes += 1
                        Csn._or_edges += 2

                        self.node.or_features[i] = best_feature_cut
                        # print(" cutting on feature ", self.node.or_features[i])

                        instances = self.data.shape[0]

                        self.node.left_weights[i] = best_left_weight
                        self.node.right_weights[i] = best_right_weight

                        self.node.children_left[i] = Csn(data=best_left_data, vdata=self.vdata,
                                                         clt=best_clt_l, ll=best_l_ll,
                                                         min_instances=self.min_instances,
                                                         min_features=self.min_features,
                                                         alpha=self.alpha * best_left_weight,
                                                         d=self.d, random_forest=self.random_forest,
                                                         m_priors=self.m_priors, j_priors=self.j_priors,
                                                         n_original_samples=self.n_original_samples,
                                                         and_leaves=self.and_leaves, and_inners=self.and_inners,
                                                         min_gain=self.min_gain, depth=self.depth + 1,
                                                         sample_weight=best_left_sample_weight)
                        self.node.children_right[i] = Csn(data=best_right_data, vdata=self.vdata,
                                                          clt=best_clt_r, ll=best_r_ll,
                                                          min_instances=self.min_instances,
                                                          min_features=self.min_features,
                                                          alpha=self.alpha * best_right_weight, d=self.d,
                                                          random_forest=self.random_forest,
                                                          m_priors=self.m_priors, j_priors=self.j_priors,
                                                          n_original_samples=self.n_original_samples,
                                                          and_leaves=self.and_leaves, and_inners=self.and_inners,
                                                          min_gain=self.min_gain, depth=self.depth + 1,
                                                          sample_weight=best_right_sample_weight)

                """else:
                    print( " > no cutting due to few features")
            else:
                print(" > no cutting due to few instances")"""
        if is_and_node(self.node):
            Csn._and_nodes += 1

        # free memory before recursing
        self.free_memory()
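
The cut evaluation above conditions a component tree on a single binary feature: instances with value 0 go to the left branch, the rest go right, the feature's column is dropped, and each branch is weighted by its empirical proportion. A minimal standalone sketch of that bookkeeping (toy data; split_on_feature is a hypothetical helper, not part of the original classes):

import numpy as np

def split_on_feature(data, feature):
    """Partition binary data on one feature and drop its column."""
    condition = data[:, feature] == 0
    keep = np.ones(data.shape[1], dtype=bool)
    keep[feature] = False
    left, right = data[condition][:, keep], data[~condition][:, keep]
    # branch weights are the empirical proportions of the two partitions
    return left, right, left.shape[0] / len(data), right.shape[0] / len(data)

data = np.array([[0, 1, 1], [1, 0, 1], [0, 0, 0], [1, 1, 0]])
left, right, lw, rw = split_on_feature(data, feature=0)
# the score of a cut combines the per-branch mean log-likelihoods l_ll, r_ll:
#   ll = ((l_ll + log(lw)) * len(left) + (r_ll + log(rw)) * len(right)) / len(data)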
Example #20
0
    def or_cut(self):
        """ WRITEME """
        print(" > trying to cut ... ")
        sys.stdout.flush()

        found = False

        bestlik = self.orig_ll
        best_clt_l = None
        best_clt_r = None
        best_feature_cut = None
        best_left_weight = 0.0
        best_right_weight = 0.0
        best_right_data = None
        best_left_data = None
        best_l_ll = 0.0
        best_r_ll = 0.0
        best_left_sample_weight = None
        best_right_sample_weight = None
                            

        if self.sum_nodes:

            # check for clustering
            n_clusters = 2
            cov_type = 'tied'
            rand_gen = None
            n_iters = 1000
            n_restarts = 1
            # sklearn.mixture.GMM was removed in newer scikit-learn;
            # GaussianMixture is the current equivalent
            gmm_c = sklearn.mixture.GaussianMixture(n_components=n_clusters, covariance_type=cov_type,
                                                    random_state=rand_gen, max_iter=n_iters, n_init=n_restarts)

            gmm_c.fit(self.data)

            clustering = gmm_c.predict(self.data)

            # prevent ending up with an empty cluster
            cardinality = np.sum(clustering)
            print("   - Clustering instances:", self.data.shape[0], "-", cardinality,
                  self.data.shape[0] - cardinality, end=" ")
            if cardinality > 0 and (self.data.shape[0] - cardinality) > 0:

                cluster_0 = (clustering == 0)

                cluster_0_data = self.data[cluster_0]
                cluster_1_data = self.data[~cluster_0]

                cluster_0_tree = Cltree()
                cluster_1_tree = Cltree()

                cluster_0_weight = cluster_0_data.shape[0] / self.data.shape[0]
                cluster_1_weight = cluster_1_data.shape[0] / self.data.shape[0]

                cluster_0_tree.fit(cluster_0_data, self.m_priors, self.j_priors, scope=self.scope,
                                   alpha=self.alpha * cluster_0_weight,
                                   and_leaves=self.and_leaves, sample_weight=None)
                cluster_1_tree.fit(cluster_1_data, self.m_priors, self.j_priors, scope=self.scope,
                                   alpha=self.alpha * cluster_1_weight,
                                   and_leaves=self.and_leaves, sample_weight=None)

                cluster_0_ll = cluster_0_tree.score_samples_log_proba(cluster_0_data, sample_weight=None)
                cluster_1_ll = cluster_1_tree.score_samples_log_proba(cluster_1_data, sample_weight=None)

                # log sum exp
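                # mean log-likelihood of the two-component mixture:
                # log p(x) = logr(w0 * p0(x) + w1 * p1(x)), averaged over the data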
                clustering_ll = 0.0
                for d in self.data:
                    clustering_ll += logr(cluster_0_weight * np.exp(cluster_0_tree.score_sample_log_proba(d)) +
                                          cluster_1_weight * np.exp(cluster_1_tree.score_sample_log_proba(d)))
                clustering_ll = clustering_ll / self.data.shape[0]

                print("ll:", clustering_ll)

            else:
                clustering_ll = -np.inf
        else:
            clustering_ll = -np.inf

        cutting_features = []
        for f in range(self.node.cltree.n_features):
            if self.scope[f] not in self.leaf_vars:
                cutting_features.append(f)
        

        if self.random_forest:
            if self.d > len(cutting_features):
                selected = cutting_features
            else:
                selected = sorted(random.sample(cutting_features, self.d))

        else:
            selected = cutting_features

        PQ = []
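        # PQ keeps up to the three best candidate cuts, sorted by decreasing log-likelihood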
        ll = 0.0
        CL_l = None 
        CL_r = None
        feature = None
        left_weight = 0.0
        right_weight = 0.0
        left_data = None
        right_data = None
        l_ll = 0.0 
        r_ll = 0.0
        left_sample_weight = None
        right_sample_weight = None
            
        for feature in selected:
            condition = self.data[:, feature] == 0
            new_features = np.ones(self.data.shape[1], dtype=bool)
            new_features[feature] = False
            left_data = self.data[condition, :][:, new_features]
            right_data = self.data[~condition, :][:, new_features]

            if self.sample_weight is not None:
                left_sample_weight = self.sample_weight[condition]
                right_sample_weight = self.sample_weight[~condition]
                left_weight = np.sum(left_sample_weight) / np.sum(self.sample_weight)
                right_weight = np.sum(right_sample_weight) / np.sum(self.sample_weight)
            else:
                left_sample_weight = None
                right_sample_weight = None
                left_weight = left_data.shape[0] / self.data.shape[0]
                right_weight = right_data.shape[0] / self.data.shape[0]

            if left_data.shape[0] > 0 and right_data.shape[0] > 0:          

                left_scope = np.concatenate((self.node.cltree.scope[0:feature], self.node.cltree.scope[feature + 1:]))
                right_scope = np.concatenate((self.node.cltree.scope[0:feature], self.node.cltree.scope[feature + 1:]))
                CL_l = Cltree()
                CL_r = Cltree()

                CL_l.fit(left_data, self.m_priors, self.j_priors, scope=left_scope, alpha=self.alpha * left_weight,
                         and_leaves=self.and_leaves, sample_weight=left_sample_weight,
                         multilabel=self.multilabel, n_labels=self.n_labels, ml_tree_structure=self.ml_tree_structure)
                CL_r.fit(right_data, self.m_priors, self.j_priors, scope=right_scope, alpha=self.alpha * right_weight,
                         and_leaves=self.and_leaves, sample_weight=right_sample_weight,
                         multilabel=self.multilabel, n_labels=self.n_labels, ml_tree_structure=self.ml_tree_structure)

                l_ll = CL_l.score_samples_log_proba(left_data, sample_weight=left_sample_weight)
                r_ll = CL_r.score_samples_log_proba(right_data, sample_weight=right_sample_weight)

                if self.sample_weight is not None:
                    ll = ((l_ll + logr(left_weight)) * np.sum(left_sample_weight) +
                          (r_ll + logr(right_weight)) * np.sum(right_sample_weight)) / np.sum(self.sample_weight)
                else:
                    ll = ((l_ll + logr(left_weight)) * left_data.shape[0] +
                          (r_ll + logr(right_weight)) * right_data.shape[0]) / self.data.shape[0]
            else:
                ll = -np.inf

            candidate = (ll, CL_l, CL_r, feature, left_weight, right_weight,
                         left_data, right_data, l_ll, r_ll,
                         left_sample_weight, right_sample_weight)
            # keep PQ sorted by decreasing log-likelihood, bounded to three entries;
            # the original dropped a candidate worse than all current entries even
            # when PQ was not yet full
            inserted = False
            for e in range(len(PQ)):
                if PQ[e][0] < ll:
                    PQ.insert(e, candidate)
                    inserted = True
                    break
            if not inserted:
                PQ.append(candidate)
            if len(PQ) > 3:
                PQ.pop()


            if ll > bestlik:

                bestlik = ll
                best_clt_l = CL_l
                best_clt_r = CL_r
                best_feature_cut = feature
                best_left_weight = left_weight
                best_right_weight = right_weight
                best_right_data = right_data
                best_left_data = left_data
                best_l_ll = l_ll
                best_r_ll = r_ll

                best_left_sample_weight = left_sample_weight
                best_right_sample_weight = right_sample_weight
                
                found = True

        gain = bestlik - self.orig_ll
        print("   - gain cut:", gain, end="")

        gain_c = clustering_ll - self.orig_ll
        print(" gain clustering:", gain_c)


        if (found and gain > self.min_gain) or (gain_c > gain and gain_c > self.min_gain):

            PQ = []
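            # NOTE: PQ was just cleared, so the len(PQ) > 1 branch below never
            # runs and the multi-cut sum-node variant is effectively disabled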
            if self.depth < 4 and len(PQ) > 1:

                instances = self.data.shape[0]

                # normalize the candidates' likelihoods into mixture weights
                sum_w = 0.0
                for i in range(len(PQ)):
                    sum_w += np.exp(PQ[i][0])
                for i in range(len(PQ)):
                    self.node.weights.append(np.exp(PQ[i][0]) / sum_w)
                    self.node.children.append(OrNode())

                    (pq_ll, pq_CL_l, pq_CL_r, pq_feature, pq_left_weight, pq_right_weight, 
                     pq_left_data, pq_right_data, pq_l_ll, pq_r_ll, 
                     pq_left_sample_weight, pq_right_sample_weight) = PQ[i]

                    self.node.children[i].or_feature_scope = self.scope[pq_feature]
                    self.node.children[i].or_feature = pq_feature

                    instances = self.data.shape[0]

                    self.node.children[i].left_weight = pq_left_weight
                    self.node.children[i].right_weight = pq_right_weight

                    self.node.children[i].left_child = Csn(data=pq_left_data,
                                                           clt=pq_CL_l, ll=pq_l_ll,
                                                           min_instances=self.min_instances,
                                                           min_features=self.min_features,
                                                           alpha=self.alpha * pq_left_weight,
                                                           d=self.d, random_forest=self.random_forest,
                                                           leaf_vars=self.leaf_vars,
                                                           m_priors=self.m_priors, j_priors=self.j_priors,
                                                           n_original_samples=self.n_original_samples,
                                                           and_leaves=self.and_leaves,
                                                           and_inners=self.and_inners,
                                                           min_gain=self.min_gain,
                                                           depth=self.depth + 1,
                                                           sample_weight=pq_left_sample_weight,
                                                           multilabel=self.multilabel,
                                                           n_labels=self.n_labels,
                                                           ml_tree_structure=self.ml_tree_structure)
                    self.node.children[i].right_child = Csn(data=pq_right_data,
                                                            clt=pq_CL_r, ll=pq_r_ll,
                                                            min_instances=self.min_instances,
                                                            min_features=self.min_features,
                                                            alpha=self.alpha * pq_right_weight, d=self.d,
                                                            random_forest=self.random_forest,
                                                            leaf_vars=self.leaf_vars,
                                                            m_priors=self.m_priors, j_priors=self.j_priors,
                                                            n_original_samples=self.n_original_samples,
                                                            and_leaves=self.and_leaves,
                                                            and_inners=self.and_inners,
                                                            min_gain=self.min_gain,
                                                            depth=self.depth + 1,
                                                            sample_weight=pq_right_sample_weight,
                                                            multilabel=self.multilabel,
                                                            n_labels=self.n_labels,
                                                            ml_tree_structure=self.ml_tree_structure)
              

            elif gain > gain_c:
            
                self.node = OrNode()
                self.node.or_feature_scope = self.scope[best_feature_cut]
                Csn._or_nodes += 1
                Csn._or_edges += 2

                self.node.or_feature = best_feature_cut
                print("   - cutting on feature ", self.node.or_feature,
                      "[#l:", best_left_data.shape[0], ", #r:", best_right_data.shape[0],
                      "], gain:", bestlik - self.orig_ll)

                instances = self.data.shape[0]

                self.node.left_weight = best_left_weight
                self.node.right_weight = best_right_weight

                # free memory before recursing
                self.free_memory()

                self.node.left_child = Csn(data=best_left_data,
                                           clt=best_clt_l, ll=best_l_ll,
                                           min_instances=self.min_instances,
                                           min_features=self.min_features, alpha=self.alpha * best_left_weight,
                                           d=self.d, random_forest=self.random_forest,
                                           leaf_vars=self.leaf_vars,
                                           m_priors=self.m_priors, j_priors=self.j_priors,
                                           n_original_samples=self.n_original_samples,
                                           and_leaves=self.and_leaves, and_inners=self.and_inners,
                                           min_gain=self.min_gain, depth=self.depth + 1,
                                           sample_weight=best_left_sample_weight,
                                           multilabel=self.multilabel, n_labels=self.n_labels,
                                           ml_tree_structure=self.ml_tree_structure)
                self.node.right_child = Csn(data=best_right_data,
                                            clt=best_clt_r, ll=best_r_ll,
                                            min_instances=self.min_instances,
                                            min_features=self.min_features, alpha=self.alpha * best_right_weight,
                                            d=self.d, random_forest=self.random_forest,
                                            leaf_vars=self.leaf_vars,
                                            m_priors=self.m_priors, j_priors=self.j_priors,
                                            n_original_samples=self.n_original_samples,
                                            and_leaves=self.and_leaves, and_inners=self.and_inners,
                                            min_gain=self.min_gain, depth=self.depth + 1,
                                            sample_weight=best_right_sample_weight,
                                            multilabel=self.multilabel, n_labels=self.n_labels,
                                            ml_tree_structure=self.ml_tree_structure)

            else:
                self.node = SumNode()
                print("   - Adding a sum node")

                Csn._sum_nodes += 1

                instances = self.data.shape[0]

                self.node.weights.append(cluster_0_weight)
                self.node.weights.append(cluster_1_weight)

                # free memory before recursing
                self.free_memory()

                self.node.children.append(Csn(data=cluster_0_data,
                                              clt=cluster_0_tree, ll=cluster_0_ll,
                                              min_instances=self.min_instances,
                                              min_features=self.min_features, alpha=self.alpha * cluster_0_weight,
                                              d=self.d, random_forest=self.random_forest,
                                              m_priors=self.m_priors, j_priors=self.j_priors,
                                              n_original_samples=self.n_original_samples,
                                              and_leaves=self.and_leaves, and_inners=self.and_inners,
                                              min_gain=self.min_gain, depth=self.depth + 1,
                                              sample_weight=None))
                self.node.children.append(Csn(data=cluster_1_data,
                                              clt=cluster_1_tree, ll=cluster_1_ll,
                                              min_instances=self.min_instances,
                                              min_features=self.min_features, alpha=self.alpha * cluster_1_weight,
                                              d=self.d, random_forest=self.random_forest,
                                              m_priors=self.m_priors, j_priors=self.j_priors,
                                              n_original_samples=self.n_original_samples,
                                              and_leaves=self.and_leaves, and_inners=self.and_inners,
                                              min_gain=self.min_gain, depth=self.depth + 1,
                                              sample_weight=None))

        else:
            print(" no cutting")
            if self.node.cltree.is_forest():
                print("   -> Forest with",self.node.cltree.num_trees, "trees")
            else:
                print("   -> Tree")
Example #21
0
    def fit(self, X, m_priors, j_priors, alpha=1.0, sample_weight=None, scope=None,
            and_leaves=False, multilabel=False, n_labels=0, ml_tree_structure=0):
        """Fit the model to the data.

        Parameters
        ----------
        X : ndarray, shape=(n, m)
        The data array.

        m_priors:
        the marginal priors for each feature

        j_priors:
        the joint priors for each pair of features

        alpha: float, default=1.0
        the smoothing constant

        sample_weight: ndarray, shape=(n,)
        The weight of each sample.

        scope:
        unique identifiers for the features

        and_leaves: boolean, default=False
        when True, edges whose mutual information falls below an MDL-style
        penalty are pruned, so the learned model may become a forest of trees

        multilabel: boolean, default=False
        whether the Cltree is used for multilabel classification problems
        (when imported by mlcsn.py)

        n_labels: integer, default=0
        for multilabel classification problems, the number of labels, assumed
        to be the last n_labels columns of X

        ml_tree_structure: integer, default=0
        for multilabel classification problems, the structure of the tree to
        be learned. The feature set F is the union of A (the attributes) and
        Y (the labels):
        - 0, no constraint on the resulting tree
        - 1, the parent of each variable in Y must be in Y, while the parent
        of each variable in A can be in A or in Y: a label depends on a label;
        an attribute can depend on a label or on an attribute
        - 2, the parent of each variable in Y must be in Y, and the parent of
        each variable in A must be in Y: a label depends on a label; an
        attribute depends on a label
        - 3, attribute-attribute mutual information is zeroed as in 2, but
        label-label links are not boosted
        
        """


        self.alpha = alpha
        self.and_leaves = and_leaves
        self.n_features = X.shape[1]

        rootTree = False
        if scope is None:
            self.scope = np.array([i for i in range(self.n_features)])
            rootTree = True
        else:
            self.scope = scope

        if sample_weight is None:
            self.n_samples = X.shape[0]
        else:
            self.n_samples = np.sum(sample_weight)


        (log_probs, log_j_probs) = self.compute_log_probs(X, sample_weight, m_priors, j_priors)


        MI = self.cMI(log_probs, log_j_probs)


        if multilabel:
            if ml_tree_structure == 1:
                MI[-n_labels:, -n_labels:] += np.max(MI)
            elif ml_tree_structure == 2:
                MI[-n_labels:, -n_labels:] += np.max(MI)
                MI[:-n_labels, :-n_labels] = 0
            elif ml_tree_structure == 3:
                MI[:-n_labels, :-n_labels] = 0
        
        " the tree is represented as a sequence of parents"

        mst = minimum_spanning_tree(-(MI))
        dfs_tree = depth_first_order(mst, directed=False, i_start=0)

        self.df_order = dfs_tree[0]
        self.post_order = dfs_tree[0][::-1]
        self.tree = np.zeros(self.n_features, dtype=int)
        self.tree[0] = -1
        for p in range(1, self.n_features):
            self.tree[p] = dfs_tree[1][p]

        
        penalization = logr(X.shape[0]) / (2 * X.shape[0])
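        # MDL-style penalty log(N) / (2N): edges whose mutual information falls
        # below this threshold are pruned in the loop below, yielding a forest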

        if self.and_leaves:
            for p in range(1, self.n_features):
                if MI[self.tree[p], p] < penalization:
                    self.tree[p] = -1
                    self.num_trees = self.num_trees + 1
            if self.num_trees > 1:
                self._forest = True

        """
        selected_MI = []
        for p in range(1,self.n_features):
            selected_MI.append((p,MI[self.tree[p],p]))
        selected_MI.sort(key=lambda mi: mi[1], reverse=True)
        for p in range(10,self.n_features-1):
            self.tree[selected_MI[p][0]]=-1
        """

        if multilabel and rootTree:
            pX = 0
            for i in range(self.n_features - n_labels):
                if self.tree[i] >= (self.n_features - n_labels):
                    pX += 1
            pY = 0
            for i in range(self.n_features - n_labels, self.n_features):
                if self.tree[i] >= (self.n_features - n_labels):
                    pY += 1

            print("Xs with Y parent: ", pX)
            print("Ys with Y parent: ", pY)

        self.num_edges = self.n_features - self.num_trees
        # computing the factored representation
        self.log_factors = np.zeros((self.n_features, 2, 2))
        self.log_factors = compute_log_factors(self.tree, self.n_features, log_probs, log_j_probs, self.log_factors)
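
The structure-learning core of fit is the classic Chow-Liu construction: since scipy offers no maximum spanning tree, the mutual-information matrix is negated before calling minimum_spanning_tree, and depth_first_order then roots the undirected tree at feature 0, producing the parent vector used for message passing. A self-contained sketch with an illustrative MI matrix (the values are made up):

import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_order

# toy symmetric mutual-information matrix over 4 features
MI = np.array([[0.0, 0.5, 0.1, 0.2],
               [0.5, 0.0, 0.3, 0.1],
               [0.1, 0.3, 0.0, 0.4],
               [0.2, 0.1, 0.4, 0.0]])

# maximum-weight spanning tree == minimum spanning tree of -MI
mst = minimum_spanning_tree(-MI)
df_order, predecessors = depth_first_order(mst, directed=False, i_start=0)

tree = predecessors.astype(int)
tree[0] = -1      # feature 0 is the root; every other entry is its parent
print(df_order)   # visit order; reversed, it gives the post-order for messages
print(tree)       # here: [-1  0  1  2]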
Example #23
0
    def score_sample_log_proba(self, x):
        """ WRITEME """
        prob = 0.0
        for s in range(len(self.children)):
            prob = prob + (self.weights[s] * np.exp(self.children[s].score_sample_log_proba(x)))
        return logr(prob)
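
This mixture score leaves log space to sum the weighted children probabilities, which underflows once the children's log-probabilities are strongly negative. A hedged alternative, a standalone sketch rather than part of the original class, keeps the computation in log space with scipy's logsumexp:

import numpy as np
from scipy.special import logsumexp

def mixture_log_proba(children_log_probas, weights):
    """log sum_k w_k * exp(l_k), computed stably as logsumexp(l_k + log w_k)."""
    lls = np.asarray(children_log_probas, dtype=float)
    return logsumexp(lls + np.log(weights))

# exp(-800) underflows to 0.0 in float64, yet the log-space version still works:
print(mixture_log_proba([-800.0, -805.0], [0.6, 0.4]))  # ~ -800.51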