def MM_distMat(models):
    print("Calculating JS_distMat")
    start = time.time()
    distMat = np.zeros((models.shape[0],models.shape[0]))
    nrow = models[0].shape[0]
    ncol = models[0].shape[1] - 1 # the last column is for marginal distribution
    for index_a,mat_a in enumerate(models):
        for index_b,mat_b in enumerate(models[0:index_a]):
            M = 0.5*(mat_a + mat_b)
            Dist_a_M = 0
            Dist_b_M = 0
            for i in range(nrow):
                if mat_a[i,ncol]>0:
                    Dist_a_M += mat_a[i,ncol]*stats.entropy(mat_a[i,0:ncol],M[i,0:ncol]) # conditioned KL
                if mat_b[i,ncol]>0:
                    Dist_b_M += mat_b[i,ncol]*stats.entropy(mat_b[i,0:ncol],M[i,0:ncol]) # conditioned KL

            Dist_a_M += stats.entropy(mat_a[:,ncol],M[:,ncol]) # conditioning KL
            Dist_b_M += stats.entropy(mat_b[:,ncol],M[:,ncol]) # conditioning KL
                      
            distMat[index_a,index_b] = np.sqrt(0.5*(Dist_a_M + Dist_b_M)) # according to j-s
            distMat[index_b,index_a] = distMat[index_a,index_b] # symmetric
        print("calculated " + str(int((index_a*(index_a+1))*100/(models.shape[0]*(models.shape[0]+1)))) + "% in " + str(int((time.time()-start)/60)) + " minutes")
    distMat = distMat/distMat.max()
    print("JS_distMat calculation time is %s minutes" % str(int((time.time()-start)/60)))
    #print "normalization OFF"
    return distMat
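A minimal usage sketch (not from the original source): it assumes `models` is a 3-D numpy array of row-stochastic transition matrices, each with an extra last column holding the marginal distribution over rows, and that numpy, scipy.stats and time are imported as in the snippet above.
import numpy as np

rng = np.random.default_rng(0)
n_models, n_states = 3, 4
models = np.empty((n_models, n_states, n_states + 1))
for m in range(n_models):
    trans = rng.random((n_states, n_states))
    models[m, :, :n_states] = trans / trans.sum(axis=1, keepdims=True)  # conditional rows
    marginal = rng.random(n_states)
    models[m, :, n_states] = marginal / marginal.sum()                  # marginal column
D = MM_distMat(models)  # symmetric matrix of conditional JS distances, scaled to max 1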
Example #2
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance metric between two probability distributions.
    Returns a distance value in range [0, 1] where values closer to 0 mean less distance (and higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence value.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray() # convert both vectors to dense in case they were sparse
    if isbow(vec1) and isbow(vec2): # if they are in bag-of-words format we make them dense
        if num_features is not None: # if not None, make as large as the documents drawing from
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
        # the scipy implementation of Kullback-Leibler fails in such a case, so we pick out only the nested list.
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
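A minimal usage sketch (not part of gensim itself): two dense distributions should fall through to the plain scipy.stats.entropy branch, while bag-of-words input (lists of (id, value) pairs) goes through sparse2full.
p = [0.1, 0.4, 0.5]
q = [0.2, 0.3, 0.5]
print(kullback_leibler(p, q))  # KL(p || q); >= 0 and asymmetric
bow_p = [(0, 0.5), (1, 0.5)]
bow_q = [(0, 0.3), (1, 0.7)]
print(kullback_leibler(bow_p, bow_q, num_features=2))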
def average_prob(pk, qk=[], val=[]):
    """It is used to compute the average value of the distribution and the
    dissimilarity to the mean distribution. It can be used to measure how
    prone an element is to generating bursts.

    pk: array_like, shape (N,) or shape (N, M)
        distributions to operate with.
    qk: array_like, shape (N,)
        representative distribution or mean distribution.
    val: list or array_like
        values of each bin of the distribution.

    """

    # Initial variables
    m = len(pk.shape)
    n = pk.shape[0] if m == 1 else pk.shape[1]
    val = np.arange(n) if len(val) == 0 else val
    qk = 1./n*np.ones(n) if len(qk) == 0 else qk

    # Position value
    if m == 1:
        val_avg = np.mean(np.multiply(pk-qk, val))
    else:
        val_avg = np.mean(np.multiply(pk-qk, val), axis=1)

    # Relative entropy over the average
    if m == 1:
        rel_dis = entropy(pk, qk)
    else:
        rel_dis = np.array([entropy(pk[i], qk) for i in range(pk.shape[0])])

    return val_avg, rel_dis
def entropy_rate(weighted_adj_matrix, stat_dist=None, base=2, print_prefix=''):
    print(print_prefix + 'calc entropy rate')
    if stat_dist is None:
        stat_dist = stationary_dist(weighted_adj_matrix)
    assert not np.any(stat_dist < 0)
    assert np.isclose(stat_dist.sum(), 1.)
    # assert np.all(weighted_adj_matrix.sum(axis=0) > 0)
    if scipy.sparse.issparse(weighted_adj_matrix):
        if not isinstance(weighted_adj_matrix, csc_matrix):
            weighted_adj_matrix = weighted_adj_matrix.tocsc()
        get_col = weighted_adj_matrix.getcol
        col_entropy = (get_col(i).data for i in range(weighted_adj_matrix.shape[0]))
        col_entropy = np.array([stats.entropy(x, base=base) for x in col_entropy]).flatten()
    else:
        col_entropy = np.array(stats.entropy(weighted_adj_matrix, base=base)).flatten()
    stat_dist = np.array(stat_dist).flatten()
    assert stat_dist.shape == col_entropy.shape
    col_entropy *= stat_dist
    finite_elements = np.isfinite(col_entropy)
    if not all(finite_elements):
        print(print_prefix + 'WARN: entropy rate contains not finite elements. (inf, nan)')
    rate = np.sum(col_entropy[finite_elements])
    if not np.isfinite(rate):
        print(print_prefix + 'entropy rate not finite')
        exit()
    return rate
def are_imgs_natural(orig_image, image_bounds):
  """Determines natural images based on the entropy of hue and luminance."""

  rgb_to_hsv = np.vectorize(colorsys.rgb_to_hsv)
  is_natural = []
  for bound in image_bounds:
    image = orig_image.copy()
    width, height = image.size
    subimage = image.crop(bound)
    s_width, s_height = subimage.size
    arr = np.array(np.asarray(subimage).astype('float'))
    r, g, b = np.rollaxis(arr, axis=-1)
    h = rgb_to_hsv(r, g, b)[0]
    hist_h = np.histogram(h, bins=32, range=(0.0, 1.0))
    hist_h = list(hist_h[0])
    hist_h = [h/float(sum(hist_h)) for h in hist_h]
    entropy_h = stats.entropy(hist_h)

    subimage_l = image.crop(bound).convert('L')
    hist_l = subimage_l.histogram()
    hist_l = [h/float(sum(hist_l)) for h in hist_l]
    entropy_l = stats.entropy(hist_l)

    # Images larger than a certain size are assumed to be natural images.
    if (float(s_width)/width < ICON_MAX_SIZE and
        float(s_height)/height < ICON_MAX_SIZE):
      is_natural.append(entropy_l > LUMINANCE_THRESHOLD or
                        entropy_h > HUE_THRESHOLD)
    else:
      is_natural.append(True)
    image.close()
    subimage.close()
    subimage_l.close()
  return is_natural
Example #6
    def eval_wrapper(self, seq):
        """Evaluates current sequence for models based on different modes.
           Returns boolean of success."""

        if self.eval_mode == "None":
            return True
        elif self.eval_mode == "Rank Offset":
            seq_len = len(seq)

            if seq_len < self.eval_param:
                return False

            for name,model in self.models.items():
                prob_seq = model.eval_seq(seq[:self.eval_param])
                for i in range(self.eval_param):
                    self.eval_output[name][i] += [prob_seq[i]]

        elif self.eval_mode == "Kullback-Leibler":
            for name,model in self.models.items():
                #NOTE: KL doesn't deal well with 0s so need to perturb by a very small number to avoid NaNs
                perturb_obs = [1e-15]*model.num_obs
                perturb_trans = [1e-15]*model.num_states
                # observation matrix
                out = 0
                for i in range(1,model.num_states): #normalized across rows, skip start state since no obs
                    out+=stats.entropy(model.obs[i,:],self.eval_param['obs'][i,:] + perturb_obs)/model.num_states
                self.eval_output[name]['obs'].append(out)
                # transition matrix
                out = 0
                for i in range(model.num_states-1): #normalized across rows, skip end state since no transitions
                    out+=stats.entropy(model.trans[i,:],self.eval_param['trans'][i,:] + perturb_trans)/model.num_states
                self.eval_output[name]['trans'].append(out)
        return True
def Jensen_Shannon_divergence(list_p, list_w=[]):
    """Compute the Jensen-Shannon divergence generically.

    Parameters
    ----------
    list_p: list, array_like
        the list of probability distributions.
    list_w: list
        the list of weights for each probability distribution.

    Returns
    -------
    div: float
        JS divergence value.

    """

    # Check and format inputs
    assert(len(list_p) > 1)
    list_w = [1. / len(list_p)] * len(list_p) if len(list_w) == 0 else list_w
    w = np.array(list_w)[np.newaxis]
    probs = np.array(list_p) if type(list_p) == list else list_p

    # Compute measure
    div = entropy(np.sum(np.multiply(w.T, probs), axis=0)) -\
        np.sum(np.multiply(w, entropy(probs.T)))
    return div
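A quick sanity check (not from the original source): with equal weights the generic formula above should reduce to the familiar 0.5*(KL(p,m) + KL(q,m)) with m = (p+q)/2.
import numpy as np
from scipy.stats import entropy

p = np.array([0.1, 0.4, 0.5])
q = np.array([0.3, 0.3, 0.4])
m = 0.5 * (p + q)
reference = 0.5 * (entropy(p, m) + entropy(q, m))
print(np.isclose(Jensen_Shannon_divergence([p, q]), reference))  # expected: True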
Example #8
    def dist(ft1, ft2, method='bh'):
        """
        Computes the distance between two sets of histogram features.

        Args:
            ft1, ft2: numpy.ndarray (vector)
            histograms as returned by compute()

            method: string
            the method used for computing the distance between the two sets of features:
            'kl' - Kullback-Leibler divergence (symmetrized by 0.5*(KL(p,q)+KL(q,p))
            'js' - Jensen-Shannon divergence: 0.5*(KL(p,m)+KL(q,m)) where m=(p+q)/2
            'bh' - Bhattacharyya distance: -log(sqrt(sum_i (p_i*q_i)))
            'ma' - Matusita distance: sqrt(sum_i (sqrt(p_i)-sqrt(q_i))**2)
        """
        # distance methods
        dm = {'kl': lambda x_, y_: 0.5 * (entropy(x_, y_) + entropy(y_, x_)),
              'js': lambda x_, y_: 0.5 * (entropy(x_, 0.5 * (x_ + y_)) + entropy(y_, 0.5 * (x_ + y_))),
              'bh': lambda x_, y_: -np.log(np.sum(np.sqrt(x_ * y_))),
              'ma': lambda x_, y_: np.sqrt(np.sum((np.sqrt(x_) - np.sqrt(y_)) ** 2))
              }

        method = method.lower()
        if method not in dm.keys():
            raise ValueError('Unknown method')

        return dm[method](ft1, ft2)
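A minimal usage sketch with hypothetical histograms, assuming `dist` is exposed as a static method of its class; scipy.stats.entropy normalizes its inputs, while the 'bh' and 'ma' branches expect histograms that already sum to 1.
import numpy as np

h1 = np.array([0.2, 0.3, 0.5])
h2 = np.array([0.25, 0.25, 0.5])
for m in ('kl', 'js', 'bh', 'ma'):
    print(m, dist(h1, h2, method=m))  # four different histogram distances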
Example #9
def plot_KL(data):
    """Kullback-Leibler divergence, given a Dataset object.
    The data distribution is taken as the 'true' distribution."""
    frequencies = data.frequencies
    Ncat = data.Ncat
    fiducial = data.generate_mc(100)

    sh, loc, sc = data.lognorm_par()
    freq_ln = [np.sort(stats.lognorm.rvs(sh, scale=sc, size=Ncat, random_state=s))[::-1]
               for s in range(1, 1001)]
    kl_ln = [stats.entropy(frequencies, r) for r in freq_ln]

    lengths = [min(Ncat, len(mc)) for mc in fiducial]  # Cut to the minimum Ncat
    kl_data = [stats.entropy(frequencies[:lengths[i]], mc[:lengths[i]]) for i, mc in enumerate(fiducial)]

    # Plot KL divergence. Use kdeplot instead of histogram
    fig = plt.figure(figsize=[10, 6.18])
    plt.title('Kullback-Leibler divergence')
    # plt.hist(kl_data, bins=10, normed=True, label='MC', alpha=0.5)
    # plt.hist(kl_ln, bins=10, normed=True, label='Lognormal', alpha=0.5, color='Blue')
    sns.kdeplot(np.array(kl_data), label='MC', alpha=0.6, color='Blue')
    sns.kdeplot(np.array(kl_ln), label='Lognormal', alpha=0.6, color='Orange')
    plt.xlim(xmin=0.)
    # plt.axvline(ks_tree[0], c='Purple', label = 'Tree model')
    # plt.axvline(kl_ln, c='Orange', label = 'Lognormal')
    plt.legend(loc='best')
    # plt.savefig(os.path.join('all_data', 'KL_'+data.name+'.png'))
    return
Example #10
def jensen_shannon(vec1, vec2, num_features=None):
    """Calculate Jensen-Shannon distance between two probability distributions using `scipy.stats.entropy`.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    num_features : int, optional
        Number of features in vector.

    Returns
    -------
    float
        Jensen-Shannon distance between `vec1` and `vec2`.

    Notes
    -----
    This is symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`.

    """
    vec1, vec2 = convert_vec(vec1, vec2, num_features=num_features)
    avg_vec = 0.5 * (vec1 + vec2)
    return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
Example #11
def distFunc(ys,xs):
    '''
    Calculate the distance between two empirical distributions.
    Input: ys, data generated from the model; xs, the observed data.
    Output: KL divergence between the kernel density estimates of xs and ys, evaluated over the range of xs.
    '''
    if (np.sum(ys)==0):
        return np.inf
    else:
        if xs.ndim == 1:
            kernely = stats.gaussian_kde(ys)
            kernelx = stats.gaussian_kde(xs)
            xx = np.linspace(np.min(xs),np.max(xs)) #range over data.
            return stats.entropy(kernelx(xx),qk=kernely(xx)) #KL-divergence.
        else:
            #dimensions are (npoints,nparams) to keep consistent with sci-kit
            #learn
            kernely = stats.gaussian_kde(ys.T)
            kernelx = stats.gaussian_kde(xs.T)
            #range over n-dimensional data (npoints,nparams)
            mesh = [np.linspace(np.min(xs[:,i]),np.max(xs[:,i]))
                    for i in range(xs.shape[1])]
            xx = np.meshgrid(*mesh)
            xx = np.array([x.ravel() for x in xx]).T
            return stats.entropy(kernelx(xx.T),qk=kernely(xx.T)) #KL-divergence.
Example #12
def sj_divergence(p, q):
    """
    Compute a symmetrized KL divergence: 0.5 * (KL(p, q) + KL(q, p)).
    :param p: a probability distribution
    :param q: another probability distribution
    :return: SJ divergence measure
    """
    return 0.5 * (entropy(p, q) + entropy(q, p))
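A short property check (not from the original source): the symmetrized KL is symmetric in its arguments, but unlike the Jensen-Shannon distance it is unbounded when one distribution puts mass where the other is (near) zero.
import numpy as np

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.7])
print(np.isclose(sj_divergence(p, q), sj_divergence(q, p)))  # expected: True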
Example #13
def jensen_shannon(vec1, vec2, num_features=None):
    """
    A method of measuring the similarity between two probability distributions.
    It is a symmetrized and finite version of the Kullback–Leibler divergence.
    """
    vec1, vec2 = convert_vec(vec1, vec2, num_features=num_features)
    avg_vec = 0.5 * (vec1 + vec2)
    return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
 def graph_JSD(src,edge,dst):
     P = src['X1']
     Q = dst['X1']
     _P = P / norm(P, ord=1)
     _Q = Q / norm(Q, ord=1)
     _M = 0.5 * (_P + _Q)
     edge['distance'] = 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
     return (src, edge, dst)
Example #15
 def test_entropy_positive(self):
     """See ticket #497"""
     pk = [0.5, 0.2, 0.3]
     qk = [0.1, 0.25, 0.65]
     eself = stats.entropy(pk, pk)
     edouble = stats.entropy(pk, qk)
     assert 0.0 == eself
     assert edouble >= 0.0
Example #16
 def test_entropy_positive(self):
     # See ticket #497
     pk = [0.5,0.2,0.3]
     qk = [0.1,0.25,0.65]
     eself = stats.entropy(pk,pk)
     edouble = stats.entropy(pk,qk)
     assert_(0.0 == eself)
     assert_(edouble >= 0.0)
Example #17
def entropy(X, axis=0):
    if axis==1:
        Y = [stats.entropy(x) for x in X]
    elif axis==0:
        Y = [stats.entropy(x) for x in X.T]
    else:
        raise ValueError('Error: Choose axis=0 or axis=1')
    return np.array(Y)
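A minimal usage sketch (not from the original source): axis=1 yields one entropy per row, axis=0 one per column; note this wrapper shadows scipy's entropy, so scipy must be imported as `stats` as in the snippet.
import numpy as np

X = np.array([[0.5, 0.5],
              [1.0, 0.0]])
print(entropy(X, axis=1))  # per-row entropies: [ln 2, 0]
print(entropy(X, axis=0))  # per-column entropies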
Example #18
def norm_entropy(count_list):
    """compute entropy from coverage"""

    count_vector = np.array(count_list)
    prob_vector = count_vector / float(count_vector.sum())
    prob_uniform = np.array([1.0/len(prob_vector)] * len(prob_vector))
    H_norm = entropy(prob_vector) / entropy(prob_uniform)
    return H_norm 
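A minimal usage sketch with hypothetical counts: dividing by the entropy of the uniform distribution bounds the result to [0, 1], with 1 meaning perfectly even coverage.
print(norm_entropy([10, 10, 10, 10]))  # 1.0 (perfectly even)
print(norm_entropy([97, 1, 1, 1]))     # much lower (heavily skewed)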
Example #19
 def done(self):
     import numpy as np
     vals = np.array(self.vals) / np.sum(self.vals)
     from scipy.stats import entropy
     if args.pad is None or args.pad <= len(vals):
         e = entropy(vals, base = args.base)
     else:
         e = entropy(np.append(vals, [0.0] * (args.pad - len(vals))), base = args.base)
     args.outfile.write(self.tup + [e])
def calc_entropy(AT, sigma=None, weights=None, known_nodes=None, entropy_base=2):
    if weights is not None:
        weights = np.array(weights)
        if weights.sum() != 1.0:
            weights /= weights.sum()
    if sigma is None:
        entropy = stats.entropy(AT.T, base=entropy_base)
        if weights is not None:
            entropy *= weights
        return entropy.mean()

    selection_range = set(range(AT.shape[0]))
    # print 'sigma:\n', sigma
    if known_nodes is not None:
        ones_mat = np.ones(AT.shape, dtype=float)
        # sigma *= sigma_in.min()
        # print 'known nodes:', known_nodes
        #print 'orig sigma\n', sigma
        for v in map(int, known_nodes):
            ones_mat[v, :] = sigma[v, :]
            ones_mat[:, v] = sigma[:, v]
        sigma = ones_mat
        #sigma[0, 2] = 100
        #print 'sigma:\n', sigma
        #print 'max:', sigma.max(axis=1).reshape(sigma.shape[1], 1)
        sigma /= sigma.mean(axis=1).reshape(sigma.shape[1], 1)
        #print 'norm sigma:\n', sigma
    #print 'mean:', sigma.mean(axis=1)
    # print 'sigma:\n', sigma
    total_entropy = 0
    for v in selection_range:
        # exclude loop
        current_selection = list(selection_range - {v})
        # stack the katz row of the target vertex N-1 times
        #print sigma
        row_sigma = sigma[v, :]
        #print row_sigma
        #print 'AT\n',AT
        #print '@:',current_selection
        #print AT[:,current_selection]
        # multiply katz with transposed AT -> only katz values on real links
        res = np.multiply(row_sigma, AT[current_selection, :])
        #print res
        # calc entropy per row and add it to the overall entropy
        ent = stats.entropy(res.T, base=entropy_base)
        if weights is not None:
            ent *= weights[current_selection]
        #print ent
        total_entropy += ent.sum()
    num_v = AT.shape[0]
    total_entropy /= (num_v * (num_v - 1))
    print('total entropy:', total_entropy)

    #print 'total entropy:', total_entropy
    if known_nodes is not None:
        return total_entropy, int(len(known_nodes) / AT.shape[0] * 100)
    return total_entropy
Example #21
def KLDivergenceSim(a,b,topics):
    from scipy.stats import entropy
    import math
    a = fill_list_from_dict(a,topics)
    b = fill_list_from_dict(b,topics)
    entropyOf_A_to_B = entropy(a,b)
    entropyOf_B_to_A = entropy(b,a)
    minusSummedEntropy = -(entropyOf_A_to_B+entropyOf_B_to_A)
    return math.exp(minusSummedEntropy)
 def _get_max_entropy(options):
     if 'max_entropy' not in Config.RESTRICTIONS['diversity']:
         pdf = DiversityMaintenance._pdf([0], options)
         other_pdf = DiversityMaintenance._pdf([2], options)
         e1 = stats.entropy(pdf, other_pdf)
         e2 = stats.entropy(other_pdf, pdf)
         total = e1+e2
         Config.RESTRICTIONS['diversity']['max_entropy'] = total
     return Config.RESTRICTIONS['diversity']['max_entropy']
Example #23
	def make_nb_plot(self):
		self._get_nb_estimate()
		p = self.nb_prob
		n = self.nb_size
		x = np.arange(0, 100)
		pmf = nbinom.pmf(x, n, p)
		self.line_neg_binomial, = plt.plot(x, pmf, ls=":", linewidth=2)

		self.nb_real_kl = entropy(self.probs, pmf)
		self.nb_grammar_kl = entropy(self.b_prob[:100], pmf[:100])
    def test_entropy_base(self):
        pk = np.ones(16, float)
        S = stats.entropy(pk, base=2.)
        assert_(abs(S - 4.) < 1.e-5)

        qk = np.ones(16, float)
        qk[:8] = 2.
        S = stats.entropy(pk, qk)
        S2 = stats.entropy(pk, qk, base=2.)
        assert_(abs(S/S2 - np.log(2.)) < 1.e-5)
def approximate_mixture_data():
    num_loc_proposals = 2
    num_imp_samp = 1000
    n_comp = 2
    p_comp = np.array([0.7, 0.3])
    dim = 1
    num_obs = 100
    obs = None
    
    means = []
    
    for i in range(n_comp):
        means.append([20*i]*dim)
        if obs is None:
            obs = dist.mvt(means[-1], np.eye(dim),30).rvs(int(np.round(num_obs*p_comp[i])))
        else:
            obs = np.vstack([obs, dist.mvt(means[-1], np.eye(dim),30).rvs(int(np.round(num_obs*p_comp[i])))])

    count = {"local_lpost" :0, "local_llhood" :0, "naive_lpost" :0 ,"naive_llhood" :0,"standard_lpost" :0 ,"standard_llhood" :0}
    print(means)
    #return
    def count_closure(name):
        def rval():
            count[name] = count[name] + 1
        return rval
    
    initial_samples = []
    for _ in range(10):
        initial_samples.append(DirCatTMM(obs, [1]*n_comp, dist.mvt(np.mean(means,0), np.eye(dim)*5, dim),
                                  dist.invwishart(np.eye(dim) * 5, dim+1 ),
                                  stats.gamma(1,scale=1)))
#    (naive_samp, naive_lpost) = pmc.sample(num_imp_samp, initial_samples,
#                               DirCatTMMProposal(naive_multi_proposals = num_loc_proposals,
#                                                     lpost_count = count_closure("naive_lpost"),
#                                                     llhood_count =  count_closure("naive_llhood")),
#                               population_size = 4)
    (infl_samp, infl_lpost) = pmc.sample(num_imp_samp, initial_samples,
                               DirCatTMMProposal(num_local_proposals = num_loc_proposals,
                                                     lpost_count = count_closure("local_lpost"),
                                                     llhood_count =  count_closure("local_llhood")),
                               population_size = 4)
                               
    (stand_samp, stand_lpost) = pmc.sample(num_imp_samp * num_loc_proposals, initial_samples,
                               DirCatTMMProposal(lpost_count = count_closure("standard_lpost"),
                                                     llhood_count =  count_closure("standard_llhood")),
                               population_size = 4)

    print("===============\n",p_comp, means,
#          "\n\n--NAIVE--\n",
#          naive_samp[-1].comp_indic.sum(0), stats.entropy(p_comp, naive_samp[-1].comp_indic.sum(0))+1, count["naive_llhood"], count["naive_lpost"],
          "\n\n--LOCAL--\n",
          infl_samp[-1].comp_indic.sum(0), stats.entropy(p_comp, infl_samp[-1].comp_indic.sum(0))+1, count["local_llhood"], count["local_lpost"],
          "\n\n--STANDARD--\n",
          stand_samp[-1].comp_indic.sum(0), stats.entropy(p_comp, stand_samp[-1].comp_indic.sum(0))+1, count["standard_llhood"], count["standard_lpost"],"\n\n")   
    return {"infl":(infl_samp, infl_lpost), "standard":(stand_samp, stand_lpost)}
def JSD(P, Q):
    """
    Calculates the Jensen-Shannon divergence as a metric (sq_root)
    See: http://www.researchgate.net/publication/3084774_A_new_metric_for_probability_distributions
    """
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return math.sqrt(0.5 * (entropy(_P, _M) + entropy(_Q, _M)))
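A quick property check (not from the original source), assuming the snippet's module-level imports (numpy.linalg.norm as norm, scipy.stats.entropy, math): the square-rooted Jensen-Shannon divergence is symmetric and, with natural-log entropy, bounded above by sqrt(ln 2), attained for distributions with disjoint support.
import numpy as np

P = np.array([1.0, 0.0, 0.0])
Q = np.array([0.0, 1.0, 0.0])
print(JSD(P, Q), np.sqrt(np.log(2)))  # both approximately 0.8326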
    
    
Example #27
def kl_distance(pk,qk=None):
    '''
    KL divergence is defined between two probability distributions, not raw data series,
    so first build normalized histograms (or otherwise estimate the distributions) and feed those in.
    '''
    if qk is not None:
        min_len = min(len(pk),len(qk))
        return entropy(pk[:min_len], qk[:min_len],base=2)
    else:    
        return entropy(pk, qk,base=2)
Example #28
	def make_lognorm_plot(self):
		#mu, sigma = self.m_log, self.s_log
		#sigma, _, mu  = lognorm.fit(self.len_list, floc=0)
		mu, sigma = norm.fit(np.log(self.len_list))
		#mu, sigma = 3.45, 0.45
		print(mu, sigma)
		x = np.linspace(1, 100, 100)
		pdf = (np.exp(-(np.log(x) - mu)**2 / (2 * sigma**2)) / (x * sigma * np.sqrt(2 * np.pi)))
		self.line_lognorm, = plt.plot(x, pdf, linewidth=2, color='r')

		self.lognorm_real_kl = entropy(self.probs, pdf)
		self.lognorm_grammar_kl = entropy(self.b_prob[:100], pdf[:100])
def MM_distMat(models):
    start = time.time()
    distMat = np.zeros((models.shape[0],models.shape[0]))
    nrow = models[0].shape[0]
    ncol = models[0].shape[1] - 1 # the last column is for marginal distribution
    for index_a,mat_a in enumerate(models):
        print("calculating row " + str(index_a) + " out of " + str(models.shape[0]))
        print("time passed so far: " + str(int((time.time()-start)/60)) + " minutes")
        for index_b,mat_b in enumerate(models):
            distMat[index_a,index_b] = sum(mat_a[i,ncol]*stats.entropy(mat_a[i,0:ncol],mat_b[i,0:ncol]) for i in range(nrow)) # conditioned KL
            distMat[index_a,index_b] += stats.entropy(mat_a[:,ncol],mat_b[:,ncol]) # conditioning KL
    distMat = distMat/distMat.max()
    return distMat
def get_information_gain(df):
    values = df["attr"].unique()
    entropy_values = []
    for value in values:
        target = df[df["attr"] == value]["target"]
        positive_cases = sum(target)
        negative_cases = len(target) - sum(target)
        entropy = stats.entropy([positive_cases, negative_cases], base=2)
        weighted_entropy = len(df[df["attr"] == value])/len(df["attr"]) * entropy
        entropy_values.append(weighted_entropy)

    return stats.entropy([sum(df["target"]), len(df["target"]) - sum(df["target"])], base=2) - sum(entropy_values)
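A minimal usage sketch with a hypothetical two-column DataFrame ('attr' is the candidate attribute, 'target' a binary label); the function returns the label entropy minus the attribute-weighted conditional entropy.
import pandas as pd

df = pd.DataFrame({
    "attr":   ["sunny", "sunny", "rain", "rain", "rain", "overcast"],
    "target": [0, 0, 1, 1, 0, 1],
})
print(get_information_gain(df))  # information gain of splitting on "attr"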
Example #31
def empirical_entropy(dist, bins=10):
    # Histogram of counts across the bins
    hist_counts, _ = np.histogramdd(dist,
                                    bins=bins,
                                    range=[(0, 1) for _ in dist[0]])
    return entropy(hist_counts.flatten())  # Scipy will normalise
Example #32
 def pd_entropy(s):
     distribution = s.value_counts(normalize=True, dropna=self.dropna)
     return stats.entropy(distribution, base=self.base)
Example #33
            if opts.compute_CIS:
                cur_preds.append(pred)
            # path = os.path.join(opts.output_folder, 'input{:03d}_output{:03d}.jpg'.format(i, j))
            basename = os.path.basename(names[1])
            path = os.path.join(opts.output_folder + "_%02d" % j, basename)
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            vutils.save_image(outputs.data, path, padding=0, normalize=True)
        if opts.compute_CIS:
            cur_preds = np.concatenate(cur_preds, 0)
            py = np.sum(
                cur_preds, axis=0
            )  # prior is computed from outputs given a specific input
            for j in range(cur_preds.shape[0]):
                pyx = cur_preds[j, :]
                CIS.append(entropy(pyx, py))
        if not opts.output_only:
            # also save input images
            vutils.save_image(images.data,
                              os.path.join(opts.output_folder,
                                           'input{:03d}.jpg'.format(i)),
                              padding=0,
                              normalize=True)
    if opts.compute_IS:
        all_preds = np.concatenate(all_preds, 0)
        py = np.sum(all_preds, axis=0)  # prior is computed from all outputs
        for j in range(all_preds.shape[0]):
            pyx = all_preds[j, :]
            IS.append(entropy(pyx, py))

    if opts.compute_IS:
Example #34
        ax.set_xlabel(r'$\alpha$')
        ax.set_ylabel('test accuracy (%)')
        ax.set_title('Effect of different Dirichlet priors on test accuracy for naive bayes document classification')
        ## format for less visual cluster
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        plt.savefig('img/hw2_-_alpha,test_accuracy.png', format='png')
        # plt.show()



    ## feature selection
    size_vocab = len(df_vocab)
    alpha = 1./size_vocab
    p_y, p_x_given_y_tab = train_naive_bayes(df_train_data, df_train_labels, df_vocab, df_ng_labels, alpha)
    ## calc p(x) = \sum_y p(x,y) = \sum_y p(x|y)p(y)
    p_xy = np.power(10, p_x_given_y_tab).mul(p_y['p'])
    p_x = p_xy.sum(axis=1)

    i_x = -1.*np.log2(p_x) # information content of a particular word (not an ensemble) (aka. self-information)

    h_x_given_Y = np.power(10, p_x_given_y_tab).apply(lambda r: entropy(r, base=2), axis=1) # note that H(X|Y=y) = \sum_{x \in X} p(x|y) \log_2 p(x|y), but I did \sum_{y \in Y} p(x|y) \log_2 p(x|y), summing over the condition (labels) rather than summing over possible outcomes (words)

    ## why does "*" work? see notes
    tmp_df_res = (i_x * h_x_given_Y).to_frame(name='metric').join(df_vocab).sort_values('metric', ascending=True).join(np.log10(p_x).to_frame('log_p_x'))
    ## print out the top 100 words
    pd.set_option('display.max_rows', None)
    print(tmp_df_res.iloc[:100])
Example #35
 def get_uniform_entropy(self):
     uniform_probability = 1.0 / len(self.diagnoses)
     return entropy(
         list(map(lambda diag: uniform_probability, self.diagnoses)))
def distOf2Dist(dist1, dist2):
    p = dist1.values()
    q = dist2.values()
    d1 = entropy(p, q)
    d2 = entropy(q, p)
    return (d1 + d2) / 2
Example #37
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
    # Calculate statistics on df, saving to alldf_dict
    # Deletion positions

    df = _lib.mh_del_subset(orig_df)
    df = _lib.indels_without_mismatches_subset(df)
    if sum(df['Count']) <= 1000:
        return

    df = orig_df

    # Get observed frameshift rates
    obs_fs = {'+0': 0, '+1': 0, '+2': 0}
    all_ins_lens = set(df[df['Category'].isin(['ins',
                                               'ins_notatcut'])]['Length'])
    for ins_len in all_ins_lens:
        crit = (df['Category'].isin(['ins', 'ins_notatcut'])) & (df['Length']
                                                                 == ins_len)
        fs = ins_len % 3
        count = sum(df[crit]['Count'])
        key = '+%s' % (int(fs))
        obs_fs[key] += count

    all_del_lens = set(df[df['Category'].isin(['del',
                                               'del_notatcut'])]['Length'])
    for del_len in all_del_lens:
        crit = (df['Category'].isin(['del', 'del_notatcut'])) & (df['Length']
                                                                 == del_len)
        fs = (-1 * del_len) % 3
        count = sum(df[crit]['Count'])
        key = '+%s' % (int(fs))
        obs_fs[key] += count

    tot = sum(obs_fs.values())
    for key in obs_fs:
        obs_fs[key] /= tot

    # Predict
    _predict2.init_model()

    seq, cutsite = _lib.get_sequence_cutsite(orig_df)

    # Predict rate of 1 bp insertions
    # Featurize first
    del_score = _predict2.total_deletion_score(seq, cutsite)
    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    ohmapper = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    fivebase = seq[cutsite - 1]
    onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
    onebp_features = np.array(onebp_features).reshape(1, -1)
    rate_1bpins = float(rate_model.predict(onebp_features))

    # Predict 1 bp frequency
    freq = rate_1bpins / (1 - rate_1bpins)
    pred = list(dlpred)
    pred.insert(0, freq)
    pred = np.array(pred) / sum(pred)

    pred_fs = {'+0': 0, '+1': 0, '+2': 0}
    pred_fs['+1'] += pred[0]
    for idx in range(1, len(pred)):
        del_len = idx
        fs = (-1 * del_len) % 3
        key = '+%s' % (int(fs))
        pred_fs[key] += pred[idx]

    # Bae predict
    bae_fs = {'+0': 0, '+1': 0, '+2': 0}
    bae_dlpred = bae_prediction(seq, cutsite)
    for idx in range(len(bae_dlpred)):
        del_len = idx + 1
        fs = (-1 * del_len) % 3
        key = '+%s' % (int(fs))
        bae_fs[key] += bae_dlpred[idx]

    for fs in ['+0', '+1', '+2']:
        alldf_dict['Frame'].append(fs)
        alldf_dict['Bae'].append(bae_fs[fs])
        alldf_dict['inDelphi'].append(pred_fs[fs])
        alldf_dict['Obs'].append(obs_fs[fs])

        alldf_dict['_Experiment'].append(exp)
        alldf_dict['rs'].append(rs)

    return alldf_dict
Example #38
def get_inception_score(imgs,
                        cuda=True,
                        batch_size=32,
                        resize=False,
                        splits=1):
    """
        Computes the inception score of the generated images imgs
        imgs -- Torch dataset of (3xHxW) numpy images normalized in the range [-1, 1]
        cuda -- whether or not to run on GPU
        batch_size -- batch size for feeding into Inception v3
        splits -- number of splits
    """
    N = len(imgs)

    assert batch_size > 0
    assert N > batch_size

    # Set up dtype
    if cuda:
        dtype = torch.cuda.FloatTensor
    else:
        if torch.cuda.is_available():
            print(
                "WARNING: You have a CUDA device, so you should probably set cuda=True"
            )
        dtype = torch.FloatTensor

    # Set up dataloader
    dataloader = torch.utils.data.DataLoader(imgs, batch_size=batch_size)

    # Load inception model
    inception_model = inception_v3(pretrained=True,
                                   transform_input=False).type(dtype)
    inception_model.eval()
    up = nn.Upsample(size=(299, 299), mode='bilinear').type(dtype)

    def get_pred(x):
        if resize:
            x = up(x)
        x = inception_model(x)
        return F.softmax(x, dim=1).data.cpu().numpy()

    # Get predictions
    preds = np.zeros((N, 1000))

    for i, batch in enumerate(dataloader, 0):
        batch = batch.type(dtype)
        batchv = Variable(batch)
        batch_size_i = batch.size()[0]

        preds[i * batch_size:i * batch_size + batch_size_i] = get_pred(batchv)

    # Now compute the mean kl-div
    split_scores = []

    for k in range(splits):
        part = preds[k * (N // splits):(k + 1) * (N // splits), :]
        py = np.mean(part, axis=0)
        scores = []
        for i in range(part.shape[0]):
            pyx = part[i, :]
            scores.append(entropy(pyx, py))
        split_scores.append(np.exp(np.mean(scores)))

    return np.mean(split_scores), np.std(split_scores)
Example #39
def agreement(arr):
    return 1 - entropy(arr, base=2) / np.log2(len(arr))
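A short usage note (not from the original source): `arr` is treated as a probability vector over categories, so agreement is 1 for a point mass and 0 for a uniform distribution.
print(agreement([1.0, 0.0, 0.0, 0.0]))      # 1.0
print(agreement([0.25, 0.25, 0.25, 0.25]))  # 0.0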
Example #40
def get_llrs(X, y, features, label=1, binary=True, debug_word_set=None):
    '''
    Computes Log-Likelihood Ratio (LLR) for the input features.
    Parameters:
        -----------
        X: Numpy matrix (m x n)
            Consisting of training instances as rows and features as columns
        y: numpy matrix
            The labels of the instances
        label: int (Default 1)
            consider this label as positive label and compute LLR. This value should be one of the values in y
        binary: boolean (Default True)
            If true then features are counted as binary. ie either the feature occurs in that document or not. Term freq within the document is ignored.
        debug_word_set: set()
            Prints the debug information if the feature is in this set
    Returns:
        -----------
        list of tuples of the form [(a,b), ...] where a: the feature, b: the llr score of the feature
    Usage:
        -----------
        X = count_vect.transform(data.get_texts())
        print('feat_X.shape %d,%d' % X.shape)
        y = data.target
        features = np.array(count_vect.get_feature_names())
        llrs = get_llrs(X, y, features, label=1, binary=True)
        features = np.array([item[0] for item in llrs])
        weights = np.array([item[1] for item in llrs])
    '''
    if binary:
        nc = (y == label).sum()
        nc_ = (y != label).sum()
        X = binarize(X)
    else:
        nc = X[y == label].sum()
        nc_ = X[y != label].sum()
    print('nc, nc_ = %d,%d' % (nc, nc_))
    counts = X[y == label, :]
    counts_ = X[y != label, :]
    k1h = counts.sum(axis=0) + 1
    k2h = counts_.sum(axis=0) + 1
    k1t = nc - k1h + 1
    k2t = nc_ - k2h + 1
    llrs = []
    for i, word in enumerate(features):
        if debug_word_set is not None and word not in debug_word_set:
            continue
        mat = np.matrix([[k1h[0, i], k1t[0, i]], [k2h[0, i], k2t[0, i]]])
        if mat[0, 0] == 1:
            continue
        # llr = 2 * mat.sum() * (-entropy(mat.A1) - -entropy(mat.sum(axis=1)) - -entropy(mat.T.sum(axis=1)))
        Hmat = entropy(mat.A1)  # entropy of matrix
        Hrow = entropy(mat.sum(axis=1))[0]  # entropy of row sums
        Hcol = entropy(mat.T.sum(axis=1))[0]  # entropy of col sums
        llr = -2 * mat.sum() * (Hmat - Hrow - Hcol)
        llrs.append((word, llr))
        if debug_word_set is not None and word in debug_word_set:
            print(word)
            print(mat)
            print(mat.sum(), Hmat, Hrow, Hcol)
            print(llr)

    return llrs
Example #41
    def add_computed_columns(self):

        # probs and entropy
        self.df['prob_final_tote_odds'] = self.df.groupby(
            'race_id')['final_tote_odds'].transform(compute_probs_from_odds)

        self.df['entropy_final_tote_odds'] = self.df.groupby(
            'race_id')['prob_final_tote_odds'].transform(
                lambda x: entropy(x, base=len(x)))
        self.df['entropy_final_tote_odds'] = self.df[
            'entropy_final_tote_odds'].map(lambda x: nan if isneginf(x) else x)

        self.df['prob_morning_line_odds'] = self.df.groupby(
            'race_id')['morning_line'].transform(compute_probs_from_odds)
        self.df['rank_prob_morning_line_odds'] = self.df.groupby(
            'race_id')['prob_morning_line_odds'].rank(ascending=False)

        self.df['entropy_morning_line_odds'] = self.df.groupby(
            'race_id')['prob_morning_line_odds'].transform(
                lambda x: entropy(x, base=len(x)))
        self.df['entropy_morning_line_odds'] = self.df[
            'entropy_morning_line_odds'].map(lambda x: nan
                                             if isneginf(x) else x)

        self.df[
            'num_effective_starters_morning_line'] = self.df.entropy_morning_line_odds * self.df.num_starters_post
        self.df[
            'num_effective_starters_final_tote_odds'] = self.df.entropy_final_tote_odds * self.df.num_starters_post
        #self.df['drop_morning_line_odds'] = (self.df['num_starters_post'] - self.df['num_effective_starters_morning_line']).map(round)
        self.df['diff_logprob_final_tote_morning_line'] = self.df[
            'prob_final_tote_odds'].map(lambda x: math.log(x)) - self.df[
                'prob_morning_line_odds'].map(lambda x: math.log(x))

        self.df['rank_prob_final_tote_odds'] = self.df.groupby(
            'race_id')['prob_final_tote_odds'].rank(ascending=False)
        self.df['rank_diff_logprob_final_tote_morning_line'] = self.df.groupby(
            'race_id')['diff_logprob_final_tote_morning_line'].transform(
                lambda x: x.rank(ascending=False))

        # sprint if 1759 yards (1 mile) or less, route if more
        self.df['is_route'] = self.df['distance'].map(lambda x: int(x > 1759))
        self.df['num_starters_post'] = self.df['num_starters_post'].map(
            lambda x: int(x))
        self.df['cost_exacta_from_win_show'] = self.df[
            'num_starters_post'].map(lambda x: (x - 1) * 1)
        self.df['cost_trifecta_from_place_wc'] = self.df[
            'num_starters_post'].map(lambda x: (x - 1) * (x - 2) * 2)
        self.df['cost_superfecta_from_show_a1'] = self.df[
            'num_starters_post'].map(lambda x: (x - 1) * (x - 2) * (x - 3) * 3)
        self.df['cost_synth_place_tri'] = self.df['num_starters_post'].map(
            lambda x: (x - 1) * (x - 2) * 2)
        #self.df['cost_synth_'] = self.df['num_starters_post'].map(lambda x: (x - 1) * (x - 2) * 2)
        self.df['log_ratio_effectivestarters_morningline'] = -1.0 * log(
            self.df.num_effective_starters_morning_line /
            self.df.num_starters_post)

        self.df['max_prob_morning_line_odds'] = self.df.groupby(
            'race_id')['prob_morning_line_odds'].transform(lambda x: x.max())
        self.df['max_prob_final_tote_odds'] = self.df.groupby(
            'race_id')['prob_final_tote_odds'].transform(lambda x: x.max())
        self.df['underperformance_weighted'] = (
            self.df['rank_prob_final_tote_odds'] -
            self.df['official_finish_position']
        ) * self.df['prob_final_tote_odds']
Example #42
def Entropy2(labels, base=2):
    data = labels.value_counts()
    en = stats.entropy(data, base=base)
    return en
Example #43
    def kmeans(self, s, winbids, losebids):
        if len(s) == 0:
            return 0

        leafSize = self.LEAF_SIZE
        s1 = {}
        s2 = {}
        winbids1 = {}
        winbids2 = {}
        losebids1 = {}
        losebids2 = {}
        len1 = 0
        len2 = 0
        lenk = {}
        # random split s into s1 and s2, and calculate minPrice,maxPrice
        for i in range(int(len(s) / 2)):
            k = list(s.keys())[i]
            s1[k] = s[k]
            winbids1[k] = winbids[k]
            losebids1[k] = losebids[k]
            lenk[k] = sum(s[k])
            len1 += lenk[k]
        for i in range(int(len(s) / 2), len(s)):
            k = list(s.keys())[i]
            s2[k] = s[k]
            winbids2[k] = winbids[k]
            losebids2[k] = losebids[k]
            lenk[k] = sum(s[k])
            len2 += lenk[k]
        # EM-step
        KLD1 = 0.0
        KLD2 = 0.0
        KLD = 0.0
        pr = []
        count = 0
        isBreak = 0
        not_converged = 1
        while not_converged:
            count += 1
            # begin
            not_converged = 0
            # E-step:
            q1 = self.calProbDistribution(winbids1, losebids1)
            q2 = self.calProbDistribution(winbids2, losebids2)
            KLD = entropy(q1, q2)
            if count > 8 and KLD < KLD1:
                isBreak = 1
            if count > 3 and KLD < KLD1 and KLD == KLD2:
                isBreak = 1
            KLD2 = KLD1
            KLD1 = KLD
            # M-step:
            for k in s.keys():
                mk = self.calProbDistribution({k: winbids[k]},
                                              {k: losebids[k]})
                k1 = entropy(mk, q1)
                k2 = entropy(mk, q2)
                if k1 < k2:
                    if k in s1:
                        continue
                    if len2 - lenk[k] < leafSize:
                        continue
                    not_converged = 1
                    s1[k] = s[k]
                    winbids1[k] = winbids[k]
                    losebids1[k] = losebids[k]
                    len1 += lenk[k]
                    if k in s2:
                        len2 -= lenk[k]
                        s2.pop(k)
                        winbids2.pop(k)
                        losebids2.pop(k)
                elif k1 > k2:
                    if k in s2:
                        continue
                    if len1 - lenk[k] < leafSize:
                        continue
                    not_converged = 1
                    s2[k] = s[k]
                    winbids2[k] = winbids[k]
                    losebids2[k] = losebids[k]
                    len2 += lenk[k]
                    if k in s1:
                        len1 -= lenk[k]
                        s1.pop(k)
                        winbids1.pop(k)
                        losebids1.pop(k)
            if isBreak == 1:
                break
        return s1, s2, winbids1, winbids2, losebids1, losebids2
Example #44
def entropy1(labels, base=2):
    value, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=2)
Example #45
def Entropy(input_vec):
    value, counts = np.unique(input_vec, return_counts=True)
    entropy_val = entropy(counts, base=2)
    return entropy_val
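A minimal usage sketch (not from the original source): the entropy, in bits, of a label vector computed from its unique-value counts.
print(Entropy(["a", "a", "b", "b"]))  # 1.0 bit (two equally likely labels)
print(Entropy(["a", "a", "a", "a"]))  # 0.0 (a single label)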
Example #46
    for i in range(51):
        prob_node_send_SYN.append(node_send_SYN[i]/total_SYN)
        prob_node_recv_SYN.append(node_recv_SYN[i]/total_SYN)
        prob_node_send_ENC.append(node_send_ENC[i]/total_ENC)
        prob_node_recv_ENC.append(node_recv_ENC[i]/total_ENC)

#prob_node_send_SYN --- probability of a node sending a SYN
#prob_node_recv_SYN --- probability of a node receiving a SYN.
#prob_node_send_ENC
#prob_node_recv_ENC

    return prob_node_send_SYN, prob_node_recv_SYN, prob_node_send_ENC, prob_node_recv_ENC


## Now, to calculate KL divergence, calculate the same probabilities as above for all test traces.

if __name__ == "__main__":
    f = open("normal.tr","r")
    p_send_SYN, p_recv_SYN, p_send_ENC, p_recv_ENC = probabilities(f)
    print("Entropies for normal trace file:\n")
    print("Send SYN:",entropy(p_send_SYN)/math.log(51))
    print("Send ENC:",entropy(p_send_ENC)/math.log(51))
    for i in range (1,4):
        fname = "trace"+str(i)+".tr"
        f1 = open(fname, "r")
        print("TEST trace: ",fname)
        t_send_SYN, t_recv_SYN, t_send_ENC, t_recv_ENC = probabilities(f1)
        print("KL divergence of send SYN:", entropy(p_send_SYN,t_send_SYN, base=10))
        print("KL divergence of send ENC:", entropy(p_send_ENC, t_send_ENC, base=10))
        
Example #47
 def _get_bugs_scores(self):
     bugs = self.get_bugs()
     comps_prob = dict(self.get_components_probabilities())
     bugs_prob = list(map(lambda x: comps_prob.get(x, 0), bugs))
     return np.mean(bugs_prob), np.std(bugs_prob), entropy(bugs_prob)
Example #48
def l_diversity(df_train, group_grid_list):
    grid_dtr = np.zeros(len(group_grid_list)) 
    for i in range(len(group_grid_list)):
        grid_dtr[i] = df_train.loc[df_train['grid'] == group_grid_list[i]].shape[0]
    grid_dtr_norm = grid_dtr / norm(grid_dtr, ord=1)
    return entropy(grid_dtr_norm)
Example #49
 def calc_component_entropy(self):
     return entropy(
         list(map(lambda x: x[1], self.get_components_probabilities())))
Example #50
def correction_experiment(dataset_name=None, 
                     tweak_train=None, 
                     p_P=None, tweak_test=None, p_Q=None, 
                     num_train_samples=None,
                     num_val_samples=None,
                     num_test_samples=None,
                     num_hidden=None, 
                     epochs=None,
                     batch_size=None):

    # set the context for compute
    ctx = mx.gpu()
    
    # set the context for data
    data_ctx = mx.gpu()

    # load the dataset
    X, y, Xtest, ytest = load_data(dataset_name)

    n = X.shape[0]
    dfeat = np.prod(X.shape[1:])

    # NOTE FOR IMPROVEMENT: eventually this should be returned by the data library
    num_labels = 10

    ################################################
    # Random permutation of the data
    ################################################

    rand_idx = np.random.permutation(n)
    X = X[rand_idx,...]
    y = y[rand_idx]

    ################################################
    #  First split examples between train and validation
    ################################################
    num = 2  
    Xtrain_source = X[:(n//num),:,:,:]
    ytrain_source = y[:(n//num)]
    Xval_source = X[(n//num):(2*n//num),:,:,:]
    yval_source = y[(n//num):(2*n//num):]

    ################################################
    #  Set the label distribution at train time
    ################################################
    if tweak_train:
#         print("Sampling training and validation data from p_P")
#         print("Current p_P: ", p_P)
        Xtrain, ytrain = tweak_dist(Xtrain_source, ytrain_source, num_labels, num_train_samples, p_P)
        Xval, yval = tweak_dist(Xval_source, yval_source, num_labels, num_val_samples, p_P)
    else:
        Xtrain, ytrain = Xtrain_source, ytrain_source
        Xval, yval = Xval_source, yval_source

    ################################################
    #  Set the label distribution for test data
    ################################################
    if tweak_test:
#         print("Sampling test data from p_Q")
#         print("Current p_Q: ", p_Q)
        Xtest, ytest = tweak_dist(Xtest, ytest, num_labels, num_test_samples, p_Q)
          
    ####################################
    # Train on p_P
    ####################################
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(num_hidden, activation="relu"))
        net.add(gluon.nn.Dense(num_hidden, activation="relu"))
        net.add(gluon.nn.Dense(num_labels))

    net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})
    net.hybridize()
    
    # Training
    weighted_train(net, softmax_cross_entropy, trainer, Xtrain, ytrain, Xval, yval, ctx, dfeat, epoch=epochs, weightfunc=None, data_ctx=data_ctx)


    # Prediction
    ypred_s, ypred_s_soft = predict_all(Xval, net, ctx, dfeat)
    ypred_t, ypred_t_soft = predict_all(Xtest, net, ctx, dfeat)


    # Converting to numpy array for later convenience
    ypred_s= ypred_s.asnumpy()
    ypred_s_soft = ypred_s_soft.asnumpy()
    ypred_t = ypred_t.asnumpy()
    ypred_t_soft = ypred_t_soft.asnumpy()
    
    ####################################
    # Estimate Wt and Py
    ####################################
    wt = estimate_labelshift_ratio(yval, ypred_s, ypred_t,num_labels)

    Py_est = estimate_target_dist(wt, yval,num_labels)

    Py_true = calculate_marginal(ytest,num_labels)
    Py_base = calculate_marginal(yval,num_labels)

    wt_true = Py_true/Py_base

    print(np.concatenate((wt,wt_true),axis=1))
    print(np.concatenate((Py_est,Py_true),axis=1))

#     print("||wt - wt_true||^2  = " + repr(np.sum((wt-wt_true)**2)/np.linalg.norm(wt_true)**2))
#     print("KL(Py_est|| Py_true) = " + repr(stats.entropy(Py_est,Py_base)))
    
    
    ####################################
    # Solve weighted ERM and compare to previously trained models
    ####################################
    data_test = mx.io.NDArrayIter(Xtest, ytest, batch_size, shuffle=False)

    acc_unweighted =  evaluate_accuracy(data_test, net, ctx, dfeat) # in fact, drawing confusion matrix maybe more informative

    print("Accuracy unweighted", acc_unweighted)

    training_weights=np.maximum(wt, 0)
    wt_ndarray = nd.array(training_weights,ctx=ctx)


    weightfunc = lambda x,y: wt_ndarray[y.asnumpy().astype(int)]

    # Train a model using the following!
    net2 = gluon.nn.HybridSequential()
    with net2.name_scope():
        net2.add(gluon.nn.Dense(num_hidden, activation="relu"))
        net2.add(gluon.nn.Dense(num_hidden, activation="relu"))
        net2.add(gluon.nn.Dense(num_labels))

    net2.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    trainer2 = gluon.Trainer(net2.collect_params(), 'sgd', {'learning_rate': .1})
    net2.hybridize()
    
    # NOTE WE ASSUME SAME NUMBER OF EPOCHS IN PERIOD 1 and PERIOD 2
    
    # Training
    weighted_train(net2, softmax_cross_entropy, trainer2, Xtrain, ytrain, 
                   Xval, yval, ctx, dfeat, epoch=epochs, weightfunc=weightfunc, data_ctx=data_ctx)

    data_test.reset()
    acc_weighted = evaluate_accuracy(data_test, net2, ctx, dfeat)

    print("Accuracy weighted", acc_weighted)
    
    return {"acc_unweighted": acc_unweighted, 
            "acc_weighted": acc_weighted,
            "wt": wt, 
            "wt_true": wt_true, 
            "wt_l2": np.sum((wt-wt_true)**2)/np.linalg.norm(wt_true)**2, 
            "kl_div": stats.entropy(Py_est,Py_base),
            "ypred_s": ypred_s,
            "ypred_s_soft": ypred_s_soft,
            "ypred_t:": ypred_t,
            "ypred_t_soft": ypred_t_soft,
            }
Example #51
 def calc_entropy(self):
     return entropy(list(map(lambda diag: diag.probability,
                             self.diagnoses)))
def evaluate(model, z_dim, N=1000, cuda=True, batch_size=32, resize=True, splits=1):
    """
    adapted from: https://github.com/sbarratt/inception-score-pytorch/blob/master/inception_score.py
    Computes the inception score of images generated by model
    model -- Pretrained Generator
    N -- Number of samples to test
    cuda -- whether or not to run on GPU
    batch_size -- batch size for feeding into Inception v3
    splits -- number of splits
    """

    assert batch_size > 0
    assert N > batch_size

    # Set up dtype
    if cuda:
        dtype = torch.cuda.FloatTensor
    else:
        if torch.cuda.is_available():
            print("WARNING: You have a CUDA device, so you should probably set cuda=True")
        dtype = torch.FloatTensor

    # Load inception model
    inception_model = inception_v3(pretrained=True, transform_input=False).type(dtype)
    inception_model.eval()
    up = nn.Upsample(size=(299, 299), mode='bilinear').type(dtype)

    def get_pred(N_s):

        z_ = torch.randn(N_s, z_dim).view(-1, z_dim, 1, 1)

        if cuda:
            z_ = z_.cuda()

        z_ = Variable(z_)

        x = model.forward(z_)

        if resize:
            x = up(x)
        x = inception_model(x)
        return F.softmax(x, dim=1).data.cpu().numpy()

    indexes = strided_app(np.arange(N), batch_size, batch_size)

    N = indexes[-1][-1] + 1

    # Get predictions
    preds = np.zeros((N, 1000))

    for i, idx in enumerate(indexes, 0):
        batch_size_i = idx.shape[0]

        preds[i * batch_size:i * batch_size + batch_size_i] = get_pred(batch_size_i)

    # Now compute the mean kl-div
    split_scores = []

    for k in range(splits):
        part = preds[k * (N // splits): (k + 1) * (N // splits), :]
        py = np.mean(part, axis=0)
        scores = []
        for i in range(part.shape[0]):
            pyx = part[i, :]
            scores.append(entropy(pyx, py))
        split_scores.append(np.exp(np.mean(scores)))

    return np.mean(split_scores) #, np.std(split_scores)
def KL(p, q):
    KLD = entropy(p, q)
    return KLD
Example #54
def KL_Classify(freq_dists):
    # A vs A
    AvsA_matrix = []
    for i in range(0, len(freq_dists[0])):
        AxVsAy = []
        for j in range(0, len(freq_dists[0])):
            d = entropy(freq_dists[0][i], freq_dists[0][j])
            AxVsAy.append(d)
        AvsA_matrix.append(AxVsAy)

    # A vs B
    AvsB_matrix = []
    for i in range(0, len(freq_dists[0])):
        AxVsBy = []
        for j in range(0, len(freq_dists[1])):
            d = entropy(freq_dists[0][i], freq_dists[1][j])
            AxVsBy.append(d)
        AvsB_matrix.append(AxVsBy)

    # B vs B
    BvsB_matrix = []
    for i in range(0, len(freq_dists[1])):
        BxVsBy = []
        for j in range(0, len(freq_dists[1])):
            d = entropy(freq_dists[1][i], freq_dists[1][j])
            BxVsBy.append(d)
        BvsB_matrix.append(BxVsBy)

    # B vs A
    BvsA_matrix = []
    for i in range(0, len(freq_dists[1])):
        BxVsAy = []
        for j in range(0, len(freq_dists[0])):
            d = entropy(freq_dists[1][i], freq_dists[0][j])
            BxVsAy.append(d)
        BvsA_matrix.append(BxVsAy)

    ##########################
    #Compute success metric
    #Set A - YouTube
    #Set B - CovertCast
    #TP = Correctly identify CovertCast
    #TN = Correctly identify YouTube
    ##########################

    total_KL_distances = 0
    success = 0
    TrueNegatives = 0
    TruePositives = 0

    #A - B
    for i in range(0, len(freq_dists[0])):
        for j in range(0, len(AvsA_matrix[i])):
            for k in range(0, len(AvsB_matrix[i])):
                total_KL_distances += 1
                if (AvsA_matrix[i][j] < AvsB_matrix[i][k]):
                    success += 1
                    TrueNegatives += 1
    # B - A
    for i in range(0, len(freq_dists[1])):
        for j in range(0, len(BvsB_matrix[i])):
            for k in range(0, len(BvsA_matrix[i])):
                total_KL_distances += 1
                if (BvsB_matrix[i][j] < BvsA_matrix[i][k]):
                    success += 1
                    TruePositives += 1

    print("Total Accuracy: " + str(success / float(total_KL_distances)))
    print("TruePositives: " + str(
        TruePositives / float(total_KL_distances / 2.0)))
    print("TrueNegatives: " + str(
        TrueNegatives / float(total_KL_distances / 2.0)))
Example #55
 def entropy(self, arr):
     '''calculates the entropy of the normalized degree distribution'''
     return entropy(arr)
Example #56
def map_ref_sites(routed: xr.Dataset, gauge_reference: xr.Dataset,
                    gauge_sites=None, route_var = 'IRFroutedRunoff',
                    fill_method='kldiv'):
    """
    Assigns each seg within routed a boolean 'is_gauge' identifier and
    records each seg's upstream and downstream reference seg designations.
    
    Parameters
    ----------
    routed: xr.Dataset
        Contains the input flow timeseries data.
    gauge_reference: xr.Dataset
        Contains reference flow timeseries data for the same watershed 
        as the routed dataset.
    gauge_sites: list, optional
        If None, gauge_sites will be taken as all those listed in
        gauge_reference.
    route_var: str
        Name of the flow variable in routed used by fill_method.
        Defaults to 'IRFroutedRunoff'.
    fill_method: str
        Finding some upstream/downstream reference segs is straightforward:
        segs with 'is_gauge' = True are their own reference segs, and others
        can often be found by walking directly up or downstream. However, some
        river networks offer multiple choices of gauge site, or leave segs
        without any upstream/downstream reference seg at all. 'fill_method'
        specifies how such segs are assigned upstream/downstream reference
        segs for bias correction when the walk misses them.

        Currently supported methods:
            'leave_null'
                missing reference segs are left unfilled; np.nan values are
                simply replaced with a -1 seg designation
            'forward_fill'
                xarray's ffill method is used to fill in any np.nan values
            'r2'
                each seg is assigned the reference site whose flows have the
                greatest r2 value with that seg's flows
            'kldiv'
                each seg is assigned the reference site whose flows have the
                smallest KL divergence with that seg's flows
            'kge'
                each seg is assigned the reference site whose flows have the
                greatest KGE value with that seg's flows
    Returns
    -------
    routed: xr.Dataset
        Routed timeseries with a reference gauge site river segment assigned to
        each river segment in the original routed.
    """
    if gauge_sites is None:
        gauge_sites = gauge_reference['site'].values
    else:
        # need to typecheck since we do a for loop later and don't
        # want to end up iterating through a string by accident
        assert isinstance(gauge_sites, list)

    gauge_segs = gauge_reference.sel(site=gauge_sites)['seg'].values

    routed['is_gauge'] = False * routed['seg']
    routed['down_ref_seg'] = np.nan * routed['seg']
    routed['up_ref_seg'] = np.nan * routed['seg']
    routed['up_seg'] = 0 * routed['is_headwaters']
    routed['up_seg'].values = [find_up(routed, s) for s in routed['seg'].values]
    for s in routed['seg']:
        if s in list(gauge_segs):
            routed['is_gauge'].loc[{'seg':s}] = True
            routed['down_ref_seg'].loc[{'seg': s}] = s
            routed['up_ref_seg'].loc[{'seg': s}] = s

    for seg in routed['seg']:
        cur_seg = seg.values[()]
        while cur_seg in routed['seg'].values and np.isnan(routed['down_ref_seg'].sel(seg=cur_seg)):
            cur_seg = routed['down_seg'].sel(seg=cur_seg).values[()]
        if cur_seg in routed['seg'].values:
            routed['down_ref_seg'].loc[{'seg':seg}] = routed['down_ref_seg'].sel(seg=cur_seg).values[()]

    for seg in routed['seg']:
        cur_seg = seg.values[()]
        while cur_seg in routed['seg'].values and np.isnan(routed['up_ref_seg'].sel(seg=cur_seg)):
            cur_seg = routed['up_seg'].sel(seg=cur_seg).values[()]
        if cur_seg in routed['seg'].values:
            routed['up_ref_seg'].loc[{'seg':seg}] = routed['up_ref_seg'].sel(seg=cur_seg).values[()]

    # Fill in any remaining nulls (head/tailwaters)
    if fill_method == 'leave_null':
        # since there should be no -1 segs from mizuroute, we can set nan's to -1 to acknowledge
        # that they have been addressed and still set them apart from the rest of the data
        routed['up_ref_seg'] = (routed['up_ref_seg'].where(~np.isnan(routed['up_ref_seg']), other=-1))
        routed['down_ref_seg'] = (routed['down_ref_seg'].where(~np.isnan(routed['down_ref_seg']), other=-1))
    elif fill_method == 'forward_fill':
        routed['up_ref_seg'] = (routed['up_ref_seg'].where(
            ~np.isnan(routed['up_ref_seg']), other=routed['down_ref_seg'])).ffill('seg')
        routed['down_ref_seg'] = (routed['down_ref_seg'].where(
            ~np.isnan(routed['down_ref_seg']), other=routed['up_ref_seg'])).ffill('seg')
    elif fill_method == 'r2':

        fill_up_isegs = np.where(np.isnan(routed['up_ref_seg'].values))[0]
        fill_down_isegs = np.where(np.isnan(routed['down_ref_seg'].values))[0]

        routed['r2_up_gauge'] = 0 * routed['is_gauge']
        routed['r2_down_gauge'] = 0 * routed['is_gauge']

        for curr_seg in routed['seg'].values:
            up_ref_seg = np.nan
            curr_seg_flow = routed[route_var].sel(seg=curr_seg).values
            if np.isnan(routed['up_ref_seg'].sel(seg=curr_seg).values):
                up_ref_r2, up_ref_seg = find_max_r2(routed[route_var].sel(seg=gauge_segs), curr_seg_flow)
                routed['r2_up_gauge'].loc[{'seg':curr_seg}] = up_ref_r2
                routed['up_ref_seg'].loc[{'seg':curr_seg}] = up_ref_seg
            else:
                # this seg has already been filled in, but r2 still needs to be calculated
                ref_flow = routed[route_var].sel(seg=routed['up_ref_seg'].sel(seg=curr_seg)).values
                up_ref_r2 = np.corrcoef(curr_seg_flow, ref_flow)[0, 1]**2
                routed['r2_up_gauge'].loc[{'seg':curr_seg}] = up_ref_r2

        for curr_seg in routed['seg'].values:
            down_ref_seg = np.nan
            curr_seg_flow = routed[route_var].sel(seg=curr_seg).values
            if np.isnan(routed['down_ref_seg'].sel(seg=curr_seg).values):
                down_ref_r2, down_ref_seg = find_max_r2(routed[route_var].sel(seg=gauge_segs), curr_seg_flow)
                routed['r2_down_gauge'].loc[{'seg':curr_seg}] = down_ref_r2
                routed['down_ref_seg'].loc[{'seg':curr_seg}] = down_ref_seg
            else:
                # this seg has already been filled in, but r2 still needs to be calculated
                ref_flow = routed[route_var].sel(seg=routed['down_ref_seg'].sel(seg=curr_seg)).values
                down_ref_r2 = np.corrcoef(curr_seg_flow, ref_flow)[0, 1]**2
                routed['r2_down_gauge'].loc[{'seg':curr_seg}] = down_ref_r2


    elif fill_method == 'kldiv':
        fill_up_isegs = np.where(np.isnan(routed['up_ref_seg'].values))[0]
        fill_down_isegs = np.where(np.isnan(routed['down_ref_seg'].values))[0]

        routed['kldiv_up_gauge'] = 0 * routed['is_gauge']
        routed['kldiv_down_gauge'] = 0 * routed['is_gauge']
        for curr_seg in routed['seg'].values:
            curr_seg_flow = routed[route_var].sel(seg=curr_seg).values
            if np.isnan(routed['up_ref_seg'].sel(seg=curr_seg).values):
                up_ref_kldiv, up_ref_seg = find_min_kldiv(routed[route_var].sel(seg=gauge_segs), curr_seg_flow)
                routed['kldiv_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kldiv
                routed['up_ref_seg'].loc[{'seg':curr_seg}] = up_ref_seg
            else:
                # this seg has already been filled in, but kldiv still needs to be calculated
                # kldiv computation could probably be refactored out in the future ...
                TINY_VAL = 1e-6
                total_bins = int(np.sqrt(len(curr_seg_flow)))
                curr_seg_flow_pdf, curr_seg_flow_edges = np.histogram(
                    curr_seg_flow, bins=total_bins, density=True)
                curr_seg_flow_pdf[curr_seg_flow_pdf == 0] = TINY_VAL

                ref_flow = routed[route_var].sel(seg=routed['up_ref_seg'].sel(seg=curr_seg).values).values
                ref_flow_pdf = np.histogram(ref_flow, bins=curr_seg_flow_edges, density=True)[0]
                ref_flow_pdf[ref_flow_pdf == 0] = TINY_VAL

                up_ref_kldiv = entropy(pk=ref_flow_pdf, qk=curr_seg_flow_pdf)
                routed['kldiv_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kldiv

        for curr_seg in routed['seg'].values:
            curr_seg_flow = routed[route_var].sel(seg=curr_seg).values
            if np.isnan(routed['down_ref_seg'].sel(seg=curr_seg).values):
                down_ref_kldiv, down_ref_seg = find_min_kldiv(routed[route_var].sel(seg=gauge_segs), curr_seg_flow)
                routed['kldiv_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kldiv
                routed['down_ref_seg'].loc[{'seg':curr_seg}] = down_ref_seg
            else:
                # this seg has already been filled in, but kldiv still needs to be calculated
                # kldiv computation could probably be refactored out in the future ...
                TINY_VAL = 1e-6
                total_bins = int(np.sqrt(len(curr_seg_flow)))
                curr_seg_flow_pdf, curr_seg_flow_edges = np.histogram(
                    curr_seg_flow, bins=total_bins, density=True)
                curr_seg_flow_pdf[curr_seg_flow_pdf == 0] = TINY_VAL

                ref_flow = routed[route_var].sel(seg=routed['down_ref_seg'].sel(seg=curr_seg).values).values
                ref_flow_pdf = np.histogram(ref_flow, bins=curr_seg_flow_edges, density=True)[0]
                ref_flow_pdf[ref_flow_pdf == 0] = TINY_VAL

                down_ref_kldiv = entropy(pk=ref_flow_pdf, qk=curr_seg_flow_pdf)
                routed['kldiv_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kldiv

    elif fill_method == 'kge':

        fill_up_isegs = np.where(np.isnan(routed['up_ref_seg'].values))[0]
        fill_down_isegs = np.where(np.isnan(routed['down_ref_seg'].values))[0]

        routed['kge_up_gauge'] = 0 * routed['is_gauge']
        routed['kge_down_gauge'] = 0 * routed['is_gauge']

        for curr_seg in routed['seg'].values:
            up_ref_seg = np.nan
            curr_seg_flow = routed[route_var].sel(seg=curr_seg).values
            if np.isnan(routed['up_ref_seg'].sel(seg=curr_seg).values):
                up_ref_kge, up_ref_seg = find_max_kge(routed[route_var].sel(seg=gauge_segs), curr_seg_flow)
                routed['kge_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kge
                routed['up_ref_seg'].loc[{'seg':curr_seg}] = up_ref_seg
            else:
                # this seg has already been filled in, but kge still needs to be calculated
                ref_flow = routed[route_var].sel(seg=routed['up_ref_seg'].sel(seg=curr_seg)).values
                up_ref_kge = kling_gupta_efficiency(curr_seg_flow, ref_flow)
                routed['kge_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kge

        for curr_seg in routed['seg'].values:
            down_ref_seg = np.nan
            curr_seg_flow = routed[route_var].sel(seg=curr_seg).values
            if np.isnan(routed['down_ref_seg'].sel(seg=curr_seg).values):
                down_ref_kge, down_ref_seg = find_max_kge(routed[route_var].sel(seg=gauge_segs), curr_seg_flow)
                routed['kge_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kge
                routed['down_ref_seg'].loc[{'seg':curr_seg}] = down_ref_seg
            else:
                # this seg has already been filled in, but kge still needs to be calculated
                ref_flow = routed[route_var].sel(seg=routed['down_ref_seg'].sel(seg=curr_seg)).values
                down_ref_kge = kling_gupta_efficiency(curr_seg_flow, ref_flow)
                routed['kge_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kge
    else:
        raise ValueError('Invalid method provided for "fill_method"')

    return routed
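The 'kldiv' branch above relies on find_min_kldiv, which is not shown here. A plausible sketch of such a helper, mirroring the inline histogram-based KL computation (square-root rule for bin count, densities floored at TINY_VAL, KL via scipy's entropy), might look like the following. This is an assumption about its behavior, not the original implementation:

import numpy as np
from scipy.stats import entropy

def find_min_kldiv_sketch(ref_flows, curr_seg_flow, tiny_val=1e-6):
    """Return (smallest KL divergence, seg of the best-matching reference site).

    ref_flows is assumed to be an xarray.DataArray of reference flows with a
    'seg' dimension; curr_seg_flow is a 1-D numpy array of flows for one seg.
    """
    total_bins = int(np.sqrt(len(curr_seg_flow)))
    curr_pdf, edges = np.histogram(curr_seg_flow, bins=total_bins, density=True)
    curr_pdf[curr_pdf == 0] = tiny_val

    best_kldiv, best_seg = np.inf, -1
    for seg in ref_flows['seg'].values:
        ref_pdf = np.histogram(ref_flows.sel(seg=seg).values,
                               bins=edges, density=True)[0]
        ref_pdf[ref_pdf == 0] = tiny_val
        kldiv = entropy(pk=ref_pdf, qk=curr_pdf)  # KL(ref || current), as in the branch above
        if kldiv < best_kldiv:
            best_kldiv, best_seg = kldiv, seg
    return best_kldiv, best_seg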
Example #57
def empirical_entropy_finite_support(dist, decimals=2):
    # Histogram of counts across the bins
    _, counts = np.unique(np.around(dist, decimals=decimals),
                          return_counts=True,
                          axis=0)
    return entropy(counts)  # Scipy will normalise
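A quick illustration of the rounding-then-counting estimator above, using a handful of toy samples:

import numpy as np

samples = np.array([0.101, 0.099, 0.104, 0.497, 0.503, 0.902])
# Rounding to 2 decimals groups nearby samples into the same bin before counting,
# giving counts [3, 2, 1]; scipy normalizes them before computing the entropy.
print(empirical_entropy_finite_support(samples, decimals=2))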
Example #58
    # z_dimension = 301  # include z=0
    campaign = sys.argv[1].split("/")[-1]

    win = b_train_origin > z_train_origin
    win_rate = win.sum() / record_size
    print("winning rate {0:.2f}%".format(win_rate * 100))

    zs = list(range(z_dimension))
    # calculate truth_pdf
    truth_pdf = []
    unique_z, counts_z = np.unique(z_test_origin,
                                   return_counts=True)  # np.unique returns sorted values
    unique_z = unique_z.tolist()

    for i in range(z_dimension):
        # z values that never appear in the test set get a count of 0
        count = counts_z[unique_z.index(i)] if i in unique_z else 0
        truth_pdf.append(count / test_size)

    # KMDT
    print("==========start to train KMDT==========")
    kmdt = KMDT(IFROOT=sys.argv[2],
                result_root=sys.argv[1],
                OFROOT=OFROOT,
                max_market_price=z_dimension)
    kmdt.train()
    mse_kmdt, anlp_kmdt, pdf_kmdt = kmdt.evaluate(x_test, z_test_origin)
    kl_pdf_kmdt = entropy(truth_pdf, pdf_kmdt)
    wd_pdf_kmdt = wasserstein_distance(truth_pdf, pdf_kmdt)
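The last two lines compare the learned price distribution to the empirical one with both KL divergence and Wasserstein distance. A tiny self-contained illustration of that pairing on toy pdfs (names and values are illustrative); note that scipy's wasserstein_distance treats its positional arguments as sample values, so one way to compare pdfs over a shared price grid is to pass the prices as values and the pdfs as weights:

import numpy as np
from scipy.stats import entropy, wasserstein_distance

zs = np.arange(5)                                    # hypothetical market-price values
truth_pdf = np.array([0.10, 0.20, 0.40, 0.20, 0.10])
model_pdf = np.array([0.05, 0.25, 0.35, 0.25, 0.10])

print("KL:", entropy(truth_pdf, model_pdf))          # KL(truth || model)
print("Wasserstein:", wasserstein_distance(zs, zs,
                                            u_weights=truth_pdf,
                                            v_weights=model_pdf))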
Example #59
def Entropy(labels, base=2):
    probs = pd.Series(labels).value_counts() / len(labels)
    en = stats.entropy(probs, base=base)
    return en
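Example usage of the label-entropy helper above (assuming pandas and scipy.stats are imported as in the function):

labels = ['a', 'a', 'b', 'b', 'c', 'c', 'c', 'c']
print(Entropy(labels))           # 1.5 bits: probabilities are [0.5, 0.25, 0.25]
print(Entropy(labels, base=10))  # same distribution, log base 10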
Example #60
    def train(self):
        dataset = self.getTrainData()
        # priceSet = [int(data[self.PAY_PRICE_INDEX]) for data in dataset]

        iStack = []
        iStack.append(1)
        dataStack = []
        dataStack.append(dataset.copy())
        while len(iStack) != 0:
            nodeIndex = iStack.pop()
            dataset = dataStack.pop()
            print("nodeIndex = " + str(nodeIndex))
            if 2 * nodeIndex >= 2**self.TREE_DEPTH:
                self.nodeData[nodeIndex] = dataset.copy()
                continue

            maxKLD = -1.0
            bestFeat = 0
            count = 0  # detect if there's no feature to split
            for featIndex in range(len(dataset[0])):
                if featIndex not in self.FEATURE_LIST:
                    continue
                s, winbids, losebids = self.dataset2s(dataset, featIndex)
                if len(s.keys()) <= 1:
                    continue
                count += 1
                tmpS1, tmpS2, winbids1, winbids2, losebids1, losebids2 = self.kmeans(
                    s, winbids, losebids)
                q1 = self.calProbDistribution(winbids1, losebids1)
                q2 = self.calProbDistribution(winbids2, losebids2)
                KLD = entropy(q1, q2)
                if count == 1:
                    maxKLD = KLD
                    bestFeat = featIndex
                    s1 = tmpS1.copy()
                    s2 = tmpS2.copy()
                if maxKLD < KLD and len(tmpS1) != 0 and len(tmpS2) != 0:
                    maxKLD = KLD
                    bestFeat = featIndex
                    s1 = tmpS1.copy()
                    s2 = tmpS2.copy()

            if count == 0 or len(s1.keys()) == 0 or len(
                    s2.keys()) == 0:  # no feature can split
                self.nodeData[nodeIndex] = dataset.copy()
                continue
            dataset1 = self.s2dataset(s1, dataset, bestFeat)
            dataset2 = self.s2dataset(s2, dataset, bestFeat)

            if len(dataset1) < self.LEAF_SIZE or len(
                    dataset2) < self.LEAF_SIZE:
                self.nodeData[nodeIndex] = dataset.copy()
                continue

            self.nodeInfos[nodeIndex] = NodeInfo(nodeIndex, bestFeat, maxKLD,
                                                 list(s1.keys()).copy(),
                                                 list(s2.keys()).copy())

            if len(dataset2) > 2 * self.LEAF_SIZE:
                iStack.append(2 * nodeIndex + 1)
                dataStack.append(dataset2.copy())
            else:
                self.nodeData[2 * nodeIndex + 1] = dataset2.copy()
            if len(dataset1) > 2 * self.LEAF_SIZE:
                iStack.append(2 * nodeIndex)
                dataStack.append(dataset1.copy())
            else:
                self.nodeData[2 * nodeIndex] = dataset1.copy()

        return
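The split criterion above chooses the feature whose k-means partition maximizes the KL divergence between the two children's estimated winning-price distributions (q1 vs q2). A stripped-down, hypothetical sketch of that criterion on plain dicts of candidate child pdfs (not the original tree code):

import numpy as np
from scipy.stats import entropy

def best_split_by_kl(candidate_partitions):
    """candidate_partitions maps a feature index to the pair of price pdfs
    (q1, q2) its split would produce in the two child nodes (hypothetical input)."""
    best_feat, best_kld = None, -1.0
    for feat, (q1, q2) in candidate_partitions.items():
        kld = entropy(q1, q2)  # KL(q1 || q2) between the children's price pdfs
        if kld > best_kld:
            best_feat, best_kld = feat, kld
    return best_feat, best_kld

# Toy example: feature 3 separates the price distributions more sharply than feature 7.
parts = {
    3: (np.array([0.7, 0.2, 0.1]), np.array([0.1, 0.2, 0.7])),
    7: (np.array([0.4, 0.3, 0.3]), np.array([0.3, 0.3, 0.4])),
}
print(best_split_by_kl(parts))  # expected to pick feature 3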