Example #1
0
    def test_kurtosis(self):
        self.assertEqual(bc.helper.tools.kurtosis([]), None)
        self.assertAlmostEqual(bc.helper.tools.kurtosis([1, 2, 3, 4, 5]), stats.kurtosis([1, 2, 3, 4, 5], fisher=False))
        self.assertAlmostEqual(bc.helper.tools.kurtosis([1, 6, 6, 6, 9, 17]), stats.kurtosis([1, 6, 6, 6, 9, 17], fisher=False))

        self.assertAlmostEqual(bc.helper.tools.kurtosis(self.list_1), stats.kurtosis(self.list_1, fisher=False))
        self.assertAlmostEqual(bc.helper.tools.kurtosis(self.list_2), stats.kurtosis(self.list_2, fisher=False))
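A minimal sketch of what these assertions appear to assume the helper does (the real bc.helper.tools.kurtosis may differ): biased Pearson, i.e. non-excess, kurtosis matching scipy's fisher=False convention, and None for an empty sequence.

import numpy as np

def kurtosis_sketch(values):
    # hypothetical stand-in for bc.helper.tools.kurtosis:
    # biased Pearson kurtosis m4 / m2**2, None for empty input
    if len(values) == 0:
        return None
    x = np.asarray(values, dtype=float)
    m2 = np.mean((x - x.mean()) ** 2)
    m4 = np.mean((x - x.mean()) ** 4)
    return m4 / m2 ** 2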
Example #2
0
 def best_rp_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl, y_train)
     X_test_transformed = rp.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
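Note that the snippet above ranks the training and test projections by their own kurtosis values, so the two splits can end up keeping different components. A hedged alternative sketch (reusing the variable names from the snippet, taken before the top-2 selection) that derives the ordering from the training data only:

# rank components by kurtosis of the training projection only,
# then apply the same column order to both splits
kurt = kurtosis(X_train_transformed)
order = kurt.argsort()[::-1]
X_train_transformed = X_train_transformed[:, order][:, 0:2]
X_test_transformed = X_test_transformed[:, order][:, 0:2]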
Example #3
0
 def best_ica_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ica = FastICA(n_components=X_train_scl.shape[1])
     X_train_transformed = ica.fit_transform(X_train_scl, y_train)
     X_test_transformed = ica.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/wine_ica_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #4
0
def test_kurtosis():
    """ test equation of kurtosis in scipy """
    n = 100
    x = np.random.rand(n)

    # biased estimator
    b_kurt = spstats.kurtosis(x, bias=True)

    k4 = sum((v-x.mean())**4 for v in x)/n
    k2 = sum((v-x.mean())**2 for v in x)/n
    b_kurt2 = k4/k2**2 - 3
    print ("biased kurtosis:", b_kurt2)
    np.testing.assert_allclose(b_kurt, b_kurt2)

    # unbiased estimator
    ub_kurt = spstats.kurtosis(x, bias=False)

    k4 = sum((v-x.mean())**4 for v in x)/n
    k2 = sum((v-x.mean())**2 for v in x)/n
    ub_kurt2 = 1.0/(n-2)/(n-3) * ((n**2-1.0)*k4/k2**2.0 - 3*(n-1)**2.0)
    print ("ubbiased kurtosis:", ub_kurt2)

    k2 = sum((v-x.mean())**2 for v in x)/(n-1)
    ub_kurt3 = 1/(n-2)/(n-3) *((n**2-1.0)*(n/(n-1))**2. *k4/k2**2.0- 3*(
        n-1)**2.0)
    print ("ubbiased kurtosis:", ub_kurt3)
    np.testing.assert_allclose(ub_kurt, ub_kurt2)
    np.testing.assert_allclose(ub_kurt, ub_kurt3)
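For reference, with $m_k = \frac{1}{n}\sum_i (x_i - \bar{x})^k$, the two quantities the test above verifies are the biased excess kurtosis and Fisher's unbiased estimator:

$$g_2 = \frac{m_4}{m_2^2} - 3, \qquad G_2 = \frac{(n-1)\,\big[(n+1)\,g_2 + 6\big]}{(n-2)(n-3)},$$

which is algebraically identical to the ub_kurt2 expression computed above.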
Example #5
0
    def calc_rejection(self, trg_data, W, H, H2):
        diffs = np.zeros(H2.shape[1])
        for c in range(self.src.num_cluster):
            inds = np.where(self.cluster_labels == c)[0]
            if inds.size > 0:
                min_h2 = np.min(H[:, inds])
                max_h2 = np.max(H[:, inds])
                foo = (H[:, inds] - min_h2) / (max_h2 - min_h2)
                foo = np.max(foo, axis=0) - np.min(foo, axis=0)
                diffs[inds] = foo

        kurts = stats.kurtosis(H, fisher=False, axis=0)
        K1 = trg_data.T.dot(trg_data)
        K2 = W.dot(H).T.dot(W.dot(H))
        K3 = W.dot(H2).T.dot(W.dot(H2))

        reject = list()
        reject.append(('Kurtosis', stats.kurtosis(H, fisher=False, axis=0)))
        reject.append(('Entropy', -stats.entropy(H)))
        reject.append(('KTA kurt1', self.reject_classifier(K1, diffs)))
        reject.append(('KTA kurt2', self.reject_classifier(K2, kurts)))
        reject.append(('KTA kurt3', self.reject_classifier(K3, kurts)))
        reject.append(('Diffs', diffs))
        reject.append(('Dist L2 H', -np.sum((np.abs(trg_data - W.dot(H))**2. ), axis=0)))
        reject.append(('Dist L2 H2', -np.sum((np.abs(trg_data - W.dot(H2))**2. ), axis=0)))
        reject.append(('Dist L1 H', -np.sum(np.abs(trg_data - W.dot(H)), axis=0)))
        reject.append(('Dist L1 H2', -np.sum(np.abs(trg_data - W.dot(H2)), axis=0)))
        return reject
Example #6
0
def smerodatna_odchylka(data, min=0, max=0, plot=True):
    #
    #   Computes the standard deviation. If min and max are not set, it is
    #       computed over the whole array; otherwise only values between min and max are used.
    #
    #   in 'data'   - array with the data
    #   in 'min'    - minimum value in the array to consider
    #   in 'max'    - maximum value in the array to consider
    #   in 'plot'   - whether to draw the plot
    #
    #   out 'out'   - the standard deviation
    #


    data = np.array(data)

    if min == 0 and max == 0:
        average = np.mean(data)
        median = np.median(data)
        standardDeviation=np.std(data)
        kurtosis = stats.kurtosis(data)
        skewness = stats.skew(data)
    else:
        crop = np.array([])
        for x in data:
            if min < x < max:
                crop=np.append(crop,x)
        average = np.mean(crop)
#        modus = stats.mode(crop)
#        modus = statistics.mode(crop)         !!!!!
        median = np.median(crop)
        standardDeviation=np.std(crop)
        kurtosis = stats.kurtosis(crop)
        skewness = stats.skew(crop)

    if plot:

        plt.figure()
        plt.axvspan(float(min), float(max), alpha=0.3, color='k')
        plt.axvspan(average-standardDeviation, average+standardDeviation, alpha=0.4, color='b')
        plt.axvspan(average+standardDeviation, average+standardDeviation+standardDeviation, alpha=0.4, color='r')
        plt.axvspan(average-standardDeviation, average-standardDeviation-standardDeviation, alpha=0.4, color='r')
        plt.axvline(x=median, linewidth=2, color='r')
        plt.axvline(x=average, linewidth=2, color='g')
        #plt.axvline(x=modus[0], linewidth=2, color='b')
        plt.hist(data, int(1.0 + 3.3 * math.log(np.shape(data)[0])), facecolor='green', alpha=0.75)
        plt.text(average, 10, "std: "+ str(standardDeviation),
                bbox={'facecolor':'green', 'alpha':0.75, 'pad':10})
        plt.show(block=False)


    print "___________________________________________________________"
    print "výběr hodnot od ", float(min), " po ", float(max)
    print "průměr: ", average
    print "median: ", median
    print "smerodatn odchylka je: ", standardDeviation
    print "spicatost: ", kurtosis
    print "sikmost: ", skewness

    return standardDeviation
Example #7
0
def test_unbiased_HMM(precision=2):
    n_rv, n_sample = 10, 100
    n_scenario = 500
    data = np.random.rand(n_rv, n_sample)

    # original statistics
    tgt_moments = np.zeros((n_rv, 4))
    tgt_moments[:, 0] = data.mean(axis=1)
    tgt_moments[:, 1] = data.std(axis=1, ddof=1)
    tgt_moments[:, 2] = spstats.skew(data, axis=1, bias=False)
    tgt_moments[:, 3] = spstats.kurtosis(data, axis=1, bias=False)
    tgt_corrs = np.corrcoef(data)

    t0 = time()
    py_scenarios = HMM(tgt_moments, tgt_corrs, n_scenario, bias=False)
    print ("python unbiased HMM (n_rv, n_scenario):({}, {}) {:.4f} secs".format(
        n_rv, n_scenario, time()-t0))

    t1 = time()
    c_scenarios = c_HMM(tgt_moments, tgt_corrs, n_scenario, bias=False)
    print ("c unbiased HMM (n_rv, n_scenario):({}, {}) {:.4f} secs".format(
        n_rv, n_scenario, time()-t1))

    for scenarios in (py_scenarios,  c_scenarios):
        # scenarios statistics
        res_moments = np.zeros((n_rv, 4))
        res_moments[:, 0] = scenarios.mean(axis=1)
        res_moments[:, 1] = scenarios.std(axis=1, ddof=1)
        res_moments[:, 2] = spstats.skew(scenarios, axis=1, bias=False)
        res_moments[:, 3] = spstats.kurtosis(scenarios, axis=1, bias=False)
        res_corrs = np.corrcoef(scenarios)

        np.testing.assert_array_almost_equal(tgt_moments, res_moments, precision)
        np.testing.assert_array_almost_equal(tgt_corrs, res_corrs, precision)
Example #8
0
def main():
    start_time = time.time()
    files = [DATA_DIR + file for file in os.listdir(DATA_DIR) if fnmatch.fnmatch(file, '*.csv')]
    bad_codes = [0, 7, 8, 9]
    kt = []
    cong = list(range(102, 114))

    for i in files:
        print('processing', i, '...')
        table, dem, rep = get_data(i)
        # for j in bad_codes:
        #     table[table == j] = np.nan
        #     dem[dem == j] = np.nan
        #     rep[rep == j] = np.nan
        total_pol = 10-np.mean(sp.kurtosis(table, fisher=True, nan_policy='omit'))
        dem_pol = 10-np.mean(sp.kurtosis(dem, fisher=True, nan_policy='omit'))
        rep_pol = 10-np.mean(sp.kurtosis(rep, fisher=True, nan_policy='omit'))
        print('total polarization:', total_pol)
        print('democrat only polarization:', dem_pol)
        print('republican only polarization:', rep_pol, '\n')
        kt.append(total_pol)

    plt.plot(cong, kt)
    plt.title('Polarization timeline (original data)')
    plt.xlabel('x-th Congress')
    plt.ylabel('10 - kurtosis')
    # plt.show()
    plt.savefig('polarization.pdf')
    print('time taken:', time.time()-start_time, 'seconds')
Example #9
0
def getstats_base(X, linds):
    sval = {}
    for l_ind in linds:
        print(l_ind, X['model_state']['layers'][l_ind]['name'])
        layer = X['model_state']['layers'][l_ind]
        w = layer['weights'][0]
        karray = stats.kurtosis(w)
        kall = stats.kurtosis(w.ravel())
        cf0 = np.corrcoef(w)
        cf0t = np.corrcoef(w.T)
        wmean = w.mean(1)
        w2mean = (w**2).mean(1)
        lname = X['model_state']['layers'][l_ind]['name']
        sval[lname] = {'karray': karray, 'kall': kall, 'corr0': cf0, 'corr0_t': cf0t,
                            'wmean': wmean, 'w2mean': w2mean}

        if 'filterSize' in X['model_state']['layers'][l_ind]:
            fs = X['model_state']['layers'][l_ind]['filterSize'][0]
            ws = w.shape
            w = w.reshape((ws[0] // (fs**2), fs, fs, ws[1]))
            mat = np.row_stack([np.row_stack([w[i, j, :, :] for i in range(w.shape[0])]).T for j in range(w.shape[1])] )
            cf = np.corrcoef(mat.T)
            cft = np.corrcoef(mat)
            mat2 = np.row_stack([np.row_stack([w[i, :, :, j] for i in range(w.shape[0])]).T for j in range(w.shape[3])] )
            cf2 = np.corrcoef(mat2.T)
            cf2t = np.corrcoef(mat2)
            sval[lname].update({'corr': cf, 'corr2': cf2, 'corr_t': cft, 'corr2_t': cf2t})

    return sval
Example #10
0
 def test_kurtosis(self):
     
     # Using the scipy.stats definition which is optimized and unittested
     data = [[0,1,2,3,4,45,18,56,24,56], [1,1,1,1,56,78,23,23]]
     expt = []
     expt.append(stats.kurtosis(data[0]))
     expt.append(stats.kurtosis(data[1]))
     resulting_vals = kurtosis(data)
     self.assertTrue(np.array_equal(np.array(expt),
                                                 np.array(resulting_vals)))
Example #11
0
File: hw3.py Project: jezlax/python
def print_kurtosis(scaled_data):
    # print the kurtosis of the scaled data
    print "Kurtosis of original DF:", kurtosis(scaled_data)

    #print the kurtosis of the ICA transformed columns 
    for i in range(1,len(scaled_data[0])+1):
        ica = FastICA(n_components=i)
        ica_fit = ica.fit_transform(scaled_data)

        print "Kurtosis of ICA Transformed data when i=" + str(i) + ":", kurtosis(ica_fit)
Example #12
0
    def test_kurtosis(self):
        for n in self.get_n():
            x, y, xm, ym = self.generate_xy_sample(n)
            r = stats.kurtosis(x)
            rm = stats.mstats.kurtosis(xm)
            assert_almost_equal(r, rm, 10)

            r = stats.kurtosis(y)
            rm = stats.mstats.kurtosis(ym)
            assert_almost_equal(r, rm, 10)
Example #13
0
def stats_plots(V, labelsin, title=None):
  """
  4 plots of basic statistical properties. IC = intraclass correlation, 
  or the noise sources between the groups.
  """
  import scipy.stats as stats
  colors = ['darkkhaki', 'royalblue', 'forestgreen','tomato']
  var = [np.var(i) for i in V]
  skew = [stats.skew(i) for i in V]
  kurt = [stats.kurtosis(i) for i in V]
  uniq = list(set(labelsin))
  v_sort = [[] for u in uniq] # Make a blank list, preparing for IC
  v_means = [[] for u in uniq] # v_means is a list of list of means for each cell of each type
  v_var, v_skew, v_kurt = [[] for u in uniq], [[] for u in uniq], [[] for u in uniq]
  for v in range(len(V)):
    i = uniq.index(labelsin[v])
    v_sort[i].append(V[v])
    v_means[i].append(np.mean(V[v]))
    v_var[i].append(np.var(V[v]))
    v_skew[i].append(stats.skew(V[v]))
    v_kurt[i].append(stats.kurtosis(V[v]))
  # ic = var_between^2 / (var_between^2 + var_within^2)  
  ic = []
  for v in range(len(uniq)):
    I = np.var(v_means[v])**2 / \
        (np.var(v_means[v])**2 + sum([np.var(i) for i in v_sort[v]])**2)
    ic.append([I])
  print(ic)
  group_means = [np.mean(k) for k in v_means] # group_means are the master means (only 4)
  master_ic = np.var(group_means)**2 / \
              (np.var(group_means)**2 + sum([np.var(i) for i in v_means])**2)
  print('Master IC for this set: %.5f' %master_ic)
  ## Plotting stuff
  fig = plt.figure()
  axs = [fig.add_subplot(221), fig.add_subplot(222), 
         fig.add_subplot(223), fig.add_subplot(224)]
  titles = ['Variance', 'Skew', 'Kurtosis', 'Intraclass correlation']
  plot_vars = [v_var, v_skew, v_kurt, ic]
  for a in axs: # For each plot
    for u in range(len(uniq)): # For each cell type
      a.scatter(np.ones(len(plot_vars[axs.index(a)][u]))*u, plot_vars[axs.index(a)][u], 
                c=colors[u], s=80, edgecolor='k', alpha=0.6)
      if axs.index(a) == 3:
        a.set_yticks([0,0.12,0.24])
      else:
        a.locator_params(axis='y', nbins=4)
      a.set_xticks([])
      a.set_title(titles[axs.index(a)])
  # Legend and title
  #patches = [mpatches.Patch(color=colors[u], label=uniq[u]) for u in range(len(uniq))]
  #plt.legend(handles=patches, loc=5)
  if title is not None:
    plt.suptitle(title, fontsize=20)
  plt.show()
Example #14
0
    def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence interval for kurtosis.

        Parameters
        ----------

        sig : float
            The significance level.  Default is .05

        upper_bound : float
            Maximum value of kurtosis the upper limit can be.
            Default is .99 confidence limit assuming normality.

        lower_bound : float
            Minimum value of kurtosis the lower limit can be.
            Default is .99 confidence limit assuming normality.

        Returns
        --------
        Interval : tuple
            Lower and upper confidence limit

        Notes
        -----
        For small n, upper_bound and lower_bound may have to be
        provided by the user.  Consider using test_kurt to find
        values close to the desired significance level.

        If function returns f(a) and f(b) must have different signs, consider
        expanding the bounds.
        """
        endog = self.endog
        nobs = self.nobs
        if upper_bound is None:
            upper_bound = kurtosis(endog) + \
            (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
              ((nobs - 2.) * (nobs + 1.) * \
               (nobs + 3.))) ** .5) * \
               (((nobs ** 2.) - 1.) / ((nobs - 3.) *\
                 (nobs + 5.))) ** .5)
        if lower_bound is None:
            lower_bound = kurtosis(endog) - \
            (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
              ((nobs - 2.) * (nobs + 1.) * \
               (nobs + 3.))) ** .5) * \
               (((nobs ** 2.) - 1.) / ((nobs - 3.) *\
                 (nobs + 5.))) ** .5)
        self.r0 = chi2.ppf(1 - sig, 1)
        llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \
                             kurtosis(endog))
        ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \
                             upper_bound)
        return   llim, ulim
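Reading the default bounds off the expression above: they are the sample excess kurtosis plus or minus $2.5 \cdot \mathrm{SE}(g_2)$, where the normal-theory standard error of the excess kurtosis is

$$\mathrm{SE}(g_2) = 2\,\sqrt{\frac{6\,n\,(n-1)}{(n-2)(n+1)(n+3)}}\;\sqrt{\frac{n^2-1}{(n-3)(n+5)}},$$

and 2.5 is roughly the two-sided 99% normal quantile, matching the ".99 confidence limit assuming normality" note in the docstring.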
Example #15
0
def kurtosis_da(resp):
    dims = resp.coords.dims   
    
    if ('x' in dims) and ('y' in dims):
        resp = resp.transpose('unit', 'shapes', 'x', 'y')
    elif ('x' in dims):
        resp = resp.transpose('unit', 'shapes', 'x')
    elif ('y' in dims):
        resp = resp.transpose('unit', 'shapes', 'y')
        
    stim_resp = np.array([(unit**2).sum((1, 2)) for unit in resp.values])
    pos_resp = np.array([(unit**2).sum(0).ravel() for unit in resp.values])
    k_stim = kurtosis(stim_resp, axis=1, fisher=False)
    k_pos = kurtosis(pos_resp, axis=1, fisher=False)
    return k_pos, k_stim
Example #16
0
def kurto_improved(x,dt,LENwin):
    """
    Determines the kurtosis of a timeseries. 
    
    dt = sampling interval in seconds.
    LENwin = time window (in secs) over which the kurtosis is determined

    Described by Kuperkoch et al. 2010: calculate the kurtosis recursively.
    Results are not satisfying, but one may want to improve this... it might
    save some time!
   
    """

    # find number of samples in averaging windows
    nLEN=int(LENwin/dt)+1
    #xabs=abs(x)
    kurtos=[]
    first_window=ss.kurtosis(x[0:(0+nLEN)],fisher=False)
    kurtos.append(first_window)
    i=1
    while i<(len(x) - nLEN +1):
      new_value=kurtos[i-1]-(x[i-1])**4+(x[i-1+nLEN])**4
      kurtos.append(new_value)
      i+=1
    return(kurtos)
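For comparison, a minimal non-recursive sliding-window kurtosis with the same window convention (a sketch, assuming scipy.stats is imported as ss as in the function above):

def kurto_windowed(x, dt, LENwin):
    # exact kurtosis of each window; slower than the recursive update, but correct
    nLEN = int(LENwin / dt) + 1
    return [ss.kurtosis(x[i:i + nLEN], fisher=False) for i in range(len(x) - nLEN + 1)]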
Example #17
0
 def test_rolling_kurt(self):
     try:
         from scipy.stats import kurtosis
     except ImportError:
         raise nose.SkipTest('no scipy')
     self._check_moment_func(moments.rolling_kurt,
                             lambda x: kurtosis(x, bias=False))
Example #18
0
    def compute_profile(self):
        self.rec.label_contours(self.ji_intervals)
        distributions = {}
        for key, segments in self.rec.contour_labels.items():
            distributions[key] = []
            for indices in segments:
                distributions[key].extend(self.pitch_obj.pitch[indices[0]:indices[1]])

        parameters = {}
        for interval, distribution in distributions.items():
            distribution = np.array(distribution)
            #TODO: replace -10000 with whatever the bound is for invalid pitch values in cent scale
            distribution = distribution[distribution >= -10000]
            [n, be] = np.histogram(distribution, bins=1200)
            bc = (be[1:] + be[:-1])/2.0
            peak_pos = bc[np.argmax(n)]
            peak_mean = float(np.mean(distribution))
            peak_variance = float(variation(distribution))
            peak_skew = float(skew(distribution))
            peak_kurtosis = float(kurtosis(distribution))
            pearson_skew = float(3.0 * (peak_mean - peak_pos) / np.sqrt(abs(peak_variance)))
            parameters[interval] = {"position": float(peak_pos),
                                    "mean": peak_mean,
                                    "amplitude": float(max(n)),
                                    "variance": peak_variance,
                                    "skew1": peak_skew,
                                    "skew2": pearson_skew,
                                    "kurtosis": peak_kurtosis}
        all_amps = [parameters[interval]["amplitude"] for interval in parameters.keys()]
        peak_amp_sum = sum(all_amps)
        for interval in parameters.keys():
            parameters[interval]["amplitude"] = parameters[interval]["amplitude"]/peak_amp_sum

        self.intonation_profile = parameters
Example #19
0
    def _get_grid_size(data, use_default_square=False):
        """
        Calculate the size of the grid.

        Parameters
        ----------
        data: array-like
            The normalized data.
        use_default_square: bool
            Define the grid as the minimal possible square.

        Returns
        -------
        int, int
            The width and height of the grid.

        """

        # if the grid would be square, this is the minimum size
        sqr_size = int(np.ceil(np.sqrt(len(data))))
        size_x = size_y = sqr_size

        if not use_default_square:
            kurt = kurtosis(data)
            kurt_x, kurt_y = np.int32(np.abs(np.ceil(kurt * 2)))
            size_x += kurt_x
            size_y += kurt_y

        return size_x, size_y
Example #20
0
def jarque_bera(resids):
    """
    Calculate residual skewness, kurtosis, and do the JB test for normality

    Parameters
    -----------
    resids : array-like

    Returns
    -------
    JB, JBpv, skew, kurtosis

    JB = n/6*(S^2 + (K-3)^2/4)

    JBpv is the Chi^2 two-tail probability value

    skew is the measure of skewness

    kurtosis is the measure of kurtosis

    """
    resids = np.asarray(resids)
    # Calculate residual skewness and kurtosis
    skew = stats.skew(resids)
    kurtosis = 3 + stats.kurtosis(resids)

    # Calculate the Jarque-Bera test for normality
    JB = (resids.shape[0]/6) * (skew**2 + (1/4)*(kurtosis-3)**2)
    JBpv = stats.chi2.sf(JB,2);

    return JB, JBpv, skew, kurtosis
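A small usage sketch (assuming the jarque_bera function above is in scope); scipy.stats.jarque_bera should produce essentially the same statistic and p-value:

import numpy as np
from scipy import stats

resids = np.random.standard_normal(500)
JB, JBpv, skew, kurt = jarque_bera(resids)
print(JB, JBpv)
print(stats.jarque_bera(resids))   # (statistic, pvalue) for comparison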
Example #21
0
    def _find_high_kurtosis(self, pcas, memory):
        random_state = check_random_state(self.random_state)

        if not self.kurtosis_thr:
            kurtosis_thr = -np.inf
        else:
            kurtosis_thr = self.kurtosis_thr
        n_components = self.n_components

        while n_components < 3 * self.n_components:
            group_maps = memory.cache(
                randomized_svd)(pcas, n_components)[0]
            group_maps = group_maps[:, :n_components]

            ica_maps = memory.cache(fastica)(group_maps, whiten=False,
                                             fun='cube',
                                             random_state=random_state)[2]
            ica_maps = ica_maps.T
            kurtosis = stats.kurtosis(ica_maps, axis=1)
            kurtosis_mask = kurtosis > kurtosis_thr
            if np.sum(kurtosis_mask) >= n_components:
                order = np.argsort(kurtosis)[::-1]
                ica_maps = ica_maps[order[:n_components]]
                break
            n_components += 1

            del group_maps
        else:
            raise ValueError('Could not find components with high-enough'
                             ' kurtosis')
        self.n_components_ = n_components
        return ica_maps
Example #22
0
def test_cont_basic_slow():
    # same as above for slow distributions
    for distname, arg in distcont[:]:
        if distname not in distslow: continue
        distfn = getattr(stats, distname)
        np.random.seed(765456)
        sn = 1000
        rvs = distfn.rvs(size=sn,*arg)
        sm = rvs.mean()
        sv = rvs.var()
        skurt = stats.kurtosis(rvs)
        sskew = stats.skew(rvs)
        m,v = distfn.stats(*arg)
        yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, distname + \
              'sample mean test'
        # the sample skew kurtosis test has known failures, not very good distance measure
        #yield check_sample_skew_kurt, distfn, arg, sskew, skurt, distname
        yield check_moment, distfn, arg, m, v, distname
        yield check_cdf_ppf, distfn, arg, distname
        yield check_sf_isf, distfn, arg, distname
        yield check_pdf, distfn, arg, distname
        yield check_pdf_logpdf, distfn, arg, distname
        yield check_cdf_logcdf, distfn, arg, distname
        yield check_sf_logsf, distfn, arg, distname
        #yield check_oth, distfn, arg # is still missing
        if distname in distmissing:
            alpha = 0.01
            yield check_distribution_rvs, distname, arg, alpha, rvs
Example #23
0
 def computeProfileStatScores(self):
     """
     Builds the scores using raw profile intensity data only. Returns the scores.
     
     Parameters:
     N/A
     
     Returns:
     An array of profile intensities as floating point values.
     """
     
     try:
         
         bins =[] 
         for intensity in self.profile:
             bins.append(float(intensity))
         
         mn = mean(bins)
         stdev = std(bins)
         skw = skew(bins)
         kurt = kurtosis(bins)
         
         stats = [mn,stdev,skw,kurt]
         return stats
     
     except Exception as e: # catch *all* exceptions
         print "Error getting Profile stat scores from PHCX file\n\t", sys.exc_info()[0]
         print self.format_exception(e)
         raise Exception("Profile stat score extraction exception")
         return []
Example #24
0
 def computeDMCurveStatScores(self):
     """
     Returns a list of integer data points representing the candidate DM curve.
     
     Parameters:
     N/A
     
     Returns:
     A list data type containing data points.
     
     """
     
     try:
         bins=[]
         bins=self.profileOps.getDMCurveData(self.rawdata,self.profileIndex)
         
         mn = mean(bins)
         stdev = std(bins)
         skw = skew(bins)
         kurt = kurtosis(bins)
         
         stats = [mn,stdev,skw,kurt]
         return stats  
     
     except Exception as e: # catch *all* exceptions
         print "Error getting DM curve stat scores from PHCX file\n\t", sys.exc_info()[0]
         print self.format_exception(e)
         raise Exception("DM curve stat score extraction exception")
         return []
Example #25
0
def get_stat_feature(fname):
    b, _ = librosa.load(fname, res_type='kaiser_fast')
    try:
        #basic statistical features
        length = len(b)
        mean = np.mean(b)
        minimum = np.min(b)
        maximum = np.max(b)
        std = np.std(b)
        rms = np.sqrt(np.mean(b**2))
        kurt = kurtosis(b)
        Skew = skew(b)
        #Audio length feature
        data,samp_rate = librosa.effects.trim(b,top_db = 40)
        len_init = len(data) 
        ratio_init = len_init/length
        splits = librosa.effects.split(b, top_db=40)
        if len(splits) > 1:
            b = np.concatenate([b[x[0]:x[1]] for x in splits]) 
        len_final = len(b) 
        ratio_final = len_final/length
        #return pd.Series([mean,minimum,maximum,std,rms,kurt,Skew,len_init,ratio_init,len_final,ratio_final])
        return pd.Series(np.hstack((mean,minimum,maximum,std,rms,kurt,Skew,len_init,ratio_init,len_final,ratio_final)))
    except:
        print("Bad file at {}".format(fname))
        return pd.Series([0]*11)      
Example #26
0
 def grid_color_stat(patient_grid_1_color):
     shape_stats = np.zeros(4)
     shape_stats[0] = np.mean(patient_grid_1_color.flatten())
     shape_stats[1] = np.std(patient_grid_1_color.flatten())
     shape_stats[2] = skew(patient_grid_1_color.flatten())
     shape_stats[3] = kurtosis(patient_grid_1_color.flatten())
     return shape_stats
Example #27
0
def calc_statistics(x):
    n = x.shape[0]  # number of samples

    # manual computation
    m = 0
    m2 = 0
    m3 = 0
    m4 = 0
    for t in x:
        m += t
        m2 += t*t
        m3 += t**3
        m4 += t**4
    m /= n
    m2 /= n
    m3 /= n
    m4 /= n

    mu = m
    sigma = np.sqrt(m2 - mu*mu)
    skew = (m3 - 3*mu*m2 + 2*mu**3) / sigma**3
    kurtosis = (m4 - 4*mu*m3 + 6*mu*mu*m2 - 4*mu**3*mu + mu**4) / sigma**4 - 3
    print 'manually computed mean, std, skewness, kurtosis:', mu, sigma, skew, kurtosis

    # verify with library functions
    mu = np.mean(x, axis=0)
    sigma = np.std(x, axis=0)
    skew = stats.skew(x)
    kurtosis = stats.kurtosis(x)
    return mu, sigma, skew, kurtosis
Example #28
0
    def test_kurt(self):
        from scipy.stats import kurtosis

        string_series = tm.makeStringSeries().rename('series')

        alt = lambda x: kurtosis(x, bias=False)
        self._check_stat_op('kurt', alt, string_series)

        index = pd.MultiIndex(
            levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
            codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]
        )
        s = Series(np.random.randn(6), index=index)
        tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar'])

        # test corner cases, kurt() returns NaN unless there's at least 4
        # values
        min_N = 4
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.kurt())
                assert np.isnan(df.kurt()).all()
            else:
                assert 0 == s.kurt()
                assert (df.kurt() == 0).all()
Example #29
0
def mcnoise(data, noise_std, n, noise_scaling=1.):
    """
    Parameters
    ----------
    data : ndarray
        Array of data.
    noise_std : float
        Standard deviation of the noise
    n : int
        Number of repetition
    noise_scaling: float
        Scaling factor for noise

    Returns
    -------
    variance, variance error, skewness, skewness error, kurtosis, kurtosis error

    """
    noise_arr = np.random.normal(0, noise_std, (n, data.size)) * noise_scaling
    var_sample = np.var(data + noise_arr, axis=1)
    skew_sample = skew(data + noise_arr, axis=1)
    kurt_sample = kurtosis(data + noise_arr, axis=1)
    var_val = np.mean(var_sample)
    skew_val = np.mean(skew_sample)
    kurt_val = np.mean(kurt_sample)
    var_err = np.std(var_sample)
    skew_err = np.std(skew_sample)
    kurt_err = np.std(kurt_sample)
    return var_val, var_err, skew_val, skew_err, kurt_val, kurt_err
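A short usage sketch, assuming numpy is imported as np and skew/kurtosis come from scipy.stats as in the function above:

rng = np.random.default_rng(0)
data = rng.normal(size=2048)
var_val, var_err, skew_val, skew_err, kurt_val, kurt_err = mcnoise(data, noise_std=0.1, n=100)
print('kurtosis: %.4f +/- %.4f' % (kurt_val, kurt_err))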
Example #30
0
def perf_stats(
        returns,
        returns_style='compound',
        return_as_dict=False,
        period=DAILY):
    """Calculates various performance metrics of a strategy, for use in
    plotting.show_perf_stats.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    returns_style : str, optional
       See annual_returns' style
    return_as_dict : boolean, optional
       If True, returns the computed metrics in a dictionary.
    period : str, optional
        - defines the periodicity of the 'returns' data for purposes of
        annualizing. Can be 'monthly', 'weekly', or 'daily'
        - defaults to 'daily'.

    Returns
    -------
    dict / pd.DataFrame
        Performance metrics.

    """

    all_stats = OrderedDict()
    all_stats['annual_return'] = annual_return(
        returns,
        style=returns_style, period=period)
    all_stats['annual_volatility'] = annual_volatility(returns, period=period)
    all_stats['sharpe_ratio'] = sharpe_ratio(
        returns,
        returns_style=returns_style, period=period)
    all_stats['calmar_ratio'] = calmar_ratio(
        returns,
        returns_style=returns_style, period=period)
    all_stats['stability'] = stability_of_timeseries(returns)
    all_stats['max_drawdown'] = max_drawdown(returns)
    all_stats['omega_ratio'] = omega_ratio(returns)
    all_stats['sortino_ratio'] = sortino_ratio(returns)
    # TODO: The information_ratio method requires
    # a second argument for benchmark returns.
    # Setting information_ratio to NaN until
    # benchmark returns are added as an argument
    # to this method.
    all_stats['information_ratio'] = np.nan
    all_stats['skewness'] = stats.skew(returns)
    all_stats['kurtosis'] = stats.kurtosis(returns)
    if return_as_dict:
        return all_stats
    else:
        all_stats_df = pd.DataFrame(
            index=list(all_stats.keys()),
            data=list(all_stats.values()))
        all_stats_df.columns = ['perf_stats']
        return all_stats_df
Example #31
0
        def kurt(x):
            from scipy.stats import kurtosis  # noqa:F811

            if len(x) < 4:
                return np.nan
            return kurtosis(x, bias=False)
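A short usage sketch, assuming the kurt helper above is in scope; with raw=True pandas hands each rolling window to it as a plain ndarray:

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(100))
print(s.rolling(10).apply(kurt, raw=True).tail())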
Example #32
0
       start_date = _start_date, 
       end_date = _end_date, 
       start_t = _start_t, 
       end_t = _end_t, 
    )

# print(df)

max_value = max(list(df[_col_name]))
average_value = float(round(np.mean(list(df[_col_name])), 2))
area_value = round(sum(list(df[_col_name])), 2)
median_value = round(float(np.median(list(df[_col_name]))), 2)
var_value = round( np.var(list(df[_col_name])), 2)
std_value = round( np.std(list(df[_col_name])), 2)
skew_value = round( skew(list(df[_col_name])), 2)
kurtosis_value = round( kurtosis(list(df[_col_name])), 2)
q1 = round( np.quantile(list(df[_col_name]), .25), 2)
q3 = round( np.quantile(list(df[_col_name]), .75), 2)
iqr_value = round( iqr(list(df[_col_name])), 2)
percentile10 = round( np.percentile(list(df[_col_name]), 10), 2)
percentile40 = round( np.percentile(list(df[_col_name]), 40), 2)
percentile60 = round( np.percentile(list(df[_col_name]), 60), 2)
percentile90 = round( np.percentile(list(df[_col_name]), 90), 2)
trim_mean10 = round( trim_mean(list(df[_col_name]), 0.1), 2)
trim_mean20 = round( trim_mean(list(df[_col_name]), 0.2), 2)

features = {
    'max': max_value,
    'average': average_value,
    'area': area_value,
    'median': median_value,
Example #33
0
    def process_results_run(self):

        #########
        # Prepare variables for results processing
        #########

        true_detected_positions = []
        temp_list = []
        for i_detector in range(self.n_drift_detectors) :
            true_detected_positions.append([])
            temp_list.append(copy(self.true_positions))

        #########
        # Process results from drift detection
        #########

        for i_detector in range(self.n_drift_detectors) :
            # Get a list of TP detected drifts
            for j in range(len(self.detected_positions[i_detector])) :
                try :
                    true_position = min([num for num in temp_list[i_detector] if num<self.detected_positions[i_detector][j]], key=lambda x:abs(x-self.detected_positions[i_detector][j]))
                    true_detected_positions[i_detector].append(true_position)
                    self.list_TP[i_detector].append(self.detected_positions[i_detector][j])

                    self.delays[i_detector].append(self.detected_positions[i_detector][j]-true_position)

                    ind = temp_list[i_detector].index(true_position)

                    if ind > 0:
                        del temp_list[i_detector][0:ind+1]
                    else :
                        del temp_list[i_detector][ind]

                except ValueError :
                    pass


            self.n_TP[i_detector] = len(self.list_TP[i_detector])
            self.n_FP[i_detector] = len(self.detected_positions[i_detector]) - self.n_TP[i_detector]

            self.list_n_detected[i_detector].append(self.n_detected_drifts[i_detector])
            self.list_n_TP[i_detector].append(self.n_TP[i_detector])
            self.list_n_FP[i_detector].append(self.n_FP[i_detector])
            self.list_delays[i_detector].append(np.mean(self.delays[i_detector]))

            # Exceptions are raised if not enough drifts were detected to compute the stats; in that case NaN is appended to the results
            try :
                # Stats of meta-features : median, kurtosis, skewness, perc10, perc90
                self.stats_severity_list[i_detector].append([np.median(self.severity_list[i_detector]),
                                                            kurtosis(self.severity_list[i_detector]),
                                                            skew(self.severity_list[i_detector]),
                                                            np.percentile(self.severity_list[i_detector],10),
                                                            np.percentile(self.severity_list[i_detector],90)])
            except:
                #Debug
#                print('Severity')
#                print('Detector : '+str(self.list_names_drifts_detectors[i_detector]))
#                print('Stream : '+str(self.name_file))
#                print('Number of drifts detected : '+str(len(self.detected_positions[i_detector])))

                self.stats_severity_list[i_detector].append([np.nan,
                                                            np.nan,
                                                            np.nan,
                                                            np.nan,
                                                            np.nan])

            try :
                self.stats_magnitude_list[i_detector].append([np.median(self.magnitude_list[i_detector]),
                                                            kurtosis(self.magnitude_list[i_detector]),
                                                            skew(self.magnitude_list[i_detector]),
                                                            np.percentile(self.magnitude_list[i_detector],10),
                                                            np.percentile(self.magnitude_list[i_detector],90)])
            except:
                #Debug
#                print('Magnitude')
#                print('Detector : '+str(self.list_names_drifts_detectors[i_detector]))
#                print('Stream : '+str(self.name_file))
#                print('Number of drifts detected : '+str(len(self.detected_positions[i_detector])))

                self.stats_magnitude_list[i_detector].append([np.nan,
                                                            np.nan,
                                                            np.nan,
                                                            np.nan,
                                                            np.nan])
            try :
                self.stats_interval_list[i_detector].append([np.median(self.interval_list[i_detector]),
                                                            kurtosis(self.interval_list[i_detector]),
                                                            skew(self.interval_list[i_detector]),
                                                            np.percentile(self.interval_list[i_detector],10),
                                                            np.percentile(self.interval_list[i_detector],90)])
            except:
                #Debug
#                print('Interval')
#                print('Detector : '+str(self.list_names_drifts_detectors[i_detector]))
#                print('Stream : '+str(self.name_file))
#                print('Number of drifts detected : '+str(len(self.detected_positions[i_detector])))

                self.stats_interval_list[i_detector].append([np.nan,
                                                            np.nan,
                                                            np.nan,
                                                            np.nan,
                                                            np.nan])


        # Reset for next run
        self.reset_run()
Example #34
0
def extract_feature(X, y, fs):
    """ 特征提取
    
        @param X: 数据样本
        @param y:数据标签
        @param fs:原始数据采样频率
        @return FX: 特征向量
        @return Fy: 标签
    
    example:

    from utils.augment import preprocess
    from utils.feature import extract_feature
    # -1- load the data
    path = r"./data/0HP"
    data_mark = "FE"
    len_data = 1024
    overlap_rate = 50 # 50%
    random_seed = 1 
    fs = 12000

    X, y = preprocess(path, 
                      data_mark, 
                      fs, 
                      len_data/fs, 
                      overlap_rate, 
                      random_seed ) 
    # -2- extract features
    FX, Fy = extract_feature(X, y, fs)

    """
    def skewness(s) -> float:
        """ 
        Compute the skewness.
        """
        N = len(s)
        s = np.ravel(s)
        mean = np.mean(s)
        rms = np.sqrt(np.dot(s, s) / N)
        return np.sum(np.power(np.abs(s) - mean, 3)) / (N * rms**3)

    def maxf_in_env_spectrum(data, fs) -> float:
        """ 
        Frequency at the maximum amplitude of the envelope spectrum.
        """
        data = np.ravel(data)
        N = len(data)
        T = 1 / fs
        analytic_signal = hilbert(data)
        am_enve = np.abs(analytic_signal).reshape(N, )
        yf = fft(am_enve - np.mean(am_enve))
        y_envsp = 2.0 / N * np.abs(yf[0:N // 2]).reshape(N // 2, 1)
        xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)
        # return the frequency with the maximum amplitude
        maxf = xf[np.argwhere(y_envsp == np.max(y_envsp))[0][0]]
        return maxf

    def hist_for_entropy(s):
        """ 
        Histogram of a signal.
            
            @param s: one-dimensional sequence
            @return res: height of each histogram bin
            @return s_min: minimum of s
            @return s_max: maximum of s
            @return ncell: number of histogram bins
            
        """
        s = np.ravel(s)
        N = len(s)
        s_max = np.max(s)
        s_min = np.min(s)
        delt = (s_max - s_min) / N
        c_0 = s_min - delt / 2
        c_N = s_max + delt / 2
        ncell = int(np.ceil(np.sqrt(N)))

        # c = f(s)
        c = np.round((s - c_0) / (c_N - c_0) * ncell + 1 / 2)

        # count how many samples fall into each bin
        res = np.zeros(ncell)
        for i in range(0, N):
            ind = int(c[i])
            if ind >= 1 and ind <= ncell:
                res[ind - 1] = res[ind - 1] + 1

        return res, s_min, s_max, ncell

    def shannom_entropy_for_hist(s) -> float:
        """ 
        Shannon entropy of a one-dimensional sequence.
        
            @param s: one-dimensional sequence
            @return estimate: unbiased estimate of the Shannon entropy
        """
        h, s_min, s_max, ncell = hist_for_entropy(s)
        # unbiased estimate
        h = h[h != 0]
        N = np.sum(h)
        estimate = -np.sum(h * np.log(h)) / N
        sigma = np.sum(h * np.log2(h)**2)
        sigma = np.sqrt((sigma / N - estimate**2) / (N - 1))
        estimate = estimate + np.log(N) + np.log((s_max - s_min) / ncell)
        nbias = -(ncell - 1) / (2 * N)
        estimate = estimate - nbias

        return estimate

    def pdf_for_median_am(s) -> float:
        """ 
        Probability density estimate at the median amplitude of a one-dimensional signal.
            
            @param s: one-dimensional signal
            @return probability density estimate at the median amplitude
        """
        N = len(s)
        res, s_min, s_max, ncell = hist_for_entropy(s)
        # normalize to obtain the probability density
        pdf = res / N / (s_max - s_min) * ncell

        # map the median amplitude to its histogram bin index
        delt = (s_max - s_min) / N
        c_min = s_min - delt / 2
        c_max = s_max + delt / 2

        s_median = np.median(s)
        s_median_icell = int(
            np.round((s_median - c_min) / (c_max - c_min) * ncell + 1 / 2))

        return pdf[s_median_icell]

    feature = {}
    N = len(X[0])
    feature['mean'] = [np.mean(x) for x in X]
    feature['rms'] = [np.sqrt(np.dot(np.ravel(x), np.ravel(x)) / N) for x in X]
    feature['std'] = [np.std(x) for x in X]
    feature['skewness'] = [skewness(x) for x in X]
    feature['kurtosis'] = [kurtosis(x, fisher=False) for x in X]
    feature['maxf'] = [maxf_in_env_spectrum(x, fs) for x in X]
    feature['signal_entropy'] = [shannom_entropy_for_hist(x) for x in X]
    feature['am_median_pdf'] = [pdf_for_median_am(x) for x in X]
    feature['label'] = [int(la) for la in y]

    # return the feature matrix as a pandas.DataFrame
    f_datafram = pd.DataFrame([feature[k] for k in feature.keys()],
                              index=list(feature.keys())).T

    # return FX, Fy
    features = [
        'mean', 'rms', 'std', 'skewness', 'kurtosis', 'maxf', 'signal_entropy',
        'am_median_pdf'
    ]
    FX, Fy = f_datafram[features], f_datafram['label']

    return FX, Fy
Example #35
0
def normalized_kurtosis(x, tx):
    y = normalize(x, tx)
    return kurtosis(y)
Example #36
0
result['clustersCOrdered'] = clustersOrdered1

### Algorithm for selecting the first feature of each cluster
# Distance
check = 0
featSelDist = []
bestKurtosis = 100
bestKurtosisID = {}
i = 0
for val in clustersOrdered:
    if val != check:
        check = val
        bestKurtosis = 100

    featureKurtosis = np.abs(
        kurtosis(featureData[:, result['orderedDistanceMatrixFeaturesID'][i]]))
    if (featureKurtosis < bestKurtosis):
        bestKurtosis = featureKurtosis
        bestKurtosisID[check] = result['orderedDistanceMatrixFeaturesID'][i]

    i = i + 1

for k in range(1, len(bestKurtosisID) + 1):
    featSelDist.append(bestKurtosisID[k])
result['featSelDist'] = featSelDist

# Correlation
check = 0
featSelCorr = []
bestKurtosis = 100
bestKurtosisID = {}
Example #37
0
    def _get_opinion_score_2darray_with_preprocessing(dataset_reader,
                                                      **kwargs):

        s_es = dataset_reader.opinion_score_2darray

        # dscore_mode: True - do differential-scoring
        #              False - don't do differential-scoring
        dscore_mode = kwargs[
            'dscore_mode'] if 'dscore_mode' in kwargs else False

        # zscore_mode: True - do z-scoring (normalizing to 0-mean 1-std)
        #              False - don't do z-scoring
        zscore_mode = kwargs[
            'zscore_mode'] if 'zscore_mode' in kwargs else False

        # subject_rejection: True - do subject rejection
        #              False - don't do subject rejection
        subject_rejection = kwargs[
            'subject_rejection'] if 'subject_rejection' in kwargs else False

        if dscore_mode is True:
            E, S = s_es.shape
            s_e = pd.DataFrame(s_es).mean(axis=1)  # mean along s
            s_e_ref = DmosModel._get_ref_mos(dataset_reader, s_e)
            s_es = s_es + dataset_reader.ref_score - np.tile(s_e_ref, (S, 1)).T

        if zscore_mode is True:
            E, S = s_es.shape
            mu_s = pd.DataFrame(s_es).mean(axis=0)  # mean along e
            simga_s = pd.DataFrame(s_es).std(ddof=1, axis=0)  # std along e
            s_es = (s_es - np.tile(mu_s, (E, 1))) / np.tile(simga_s, (E, 1))

        if subject_rejection is True:
            E, S = s_es.shape

            ps = np.zeros(S)
            qs = np.zeros(S)

            for s_e in s_es:
                s_e_notnan = s_e[~np.isnan(s_e)]
                mu = np.mean(s_e_notnan)
                sigma = np.std(s_e_notnan)
                kurt = stats.kurtosis(s_e_notnan, fisher=False)

                if 2 <= kurt and kurt <= 4:
                    for idx_s, s in enumerate(s_e):
                        if not np.isnan(s):
                            if s >= mu + 2 * sigma:
                                ps[idx_s] += 1
                            if s <= mu - 2 * sigma:
                                qs[idx_s] += 1
                else:
                    for idx_s, s in enumerate(s_e):
                        if not np.isnan(s):
                            if s >= mu + np.sqrt(20) * sigma:
                                ps[idx_s] += 1
                            if s <= mu - np.sqrt(20) * sigma:
                                qs[idx_s] += 1
            rejections = []
            acceptions = []
            for idx_s, subject in zip(range(S), range(S)):
                if (ps[idx_s] + qs[idx_s]) / E > 0.05 and np.abs(
                    (ps[idx_s] - qs[idx_s]) / (ps[idx_s] + qs[idx_s])) < 0.3:
                    rejections.append(subject)
                else:
                    acceptions.append(subject)

            s_es = s_es[:, acceptions]

        return s_es
Example #38
0
    def create_feature_with_distribution(dataframe_column,
                                         number_observations):
        """
            Calculates the distribution of a feature, categorical or numerical.
            For numerical features with few distinct values, a categorical
            distribution is also computed.

            Params:

                @dataframe_column must be a dataframe column

            Returns a Feature
        """
        try:
            data_type = dataframe_column.dtypes
            data_category = FeatureHelper.get_data_category(data_type)

            categories = []
            num_statistics = None

            # print(len(feature))
            unique_values = len(dataframe_column.unique())
            missing_values_NA = dataframe_column.isna().sum()
            missing_values_NULL = dataframe_column.isnull().sum()

            can_be_seen_as_category = FeatureHelper.can_be_seen_as_category(
                data_type, unique_values, number_observations)
            can_be_seen_as_index = FeatureHelper.can_be_seen_as_index(
                data_type, unique_values, number_observations)

            # CHECK 1
            # Categorical or Numerical can be seen as Category
            if data_category == DATA_CATEGORY_CATEGORICAL or can_be_seen_as_category:

                value_counts = dataframe_column.value_counts()
                keys = value_counts.keys()

                # Total
                #TODO: optimize this calculation, avoid to calculate all times (total, get the dataset rows)
                #total = 0
                #for i in range(0, len(keys) ):
                #    total += value_counts[keys[i]]

                for i in range(0, len(keys)):
                    count = value_counts[keys[i]]

                    categories.append(
                        Category(value=keys[i],
                                 frequency=count,
                                 proportion=count / number_observations))

            # CHECK 2
            # Numerical Statistics
            if data_category == DATA_CATEGORY_NUMERICAL:

                num_statistics = NumericalStatistics(
                    mean_value=mean(dataframe_column),
                    median_value=median(dataframe_column),
                    standard_deviation=stdev(dataframe_column),
                    mode_value=stats.mode(dataframe_column),
                    max_value=max(dataframe_column),
                    min_value=min(dataframe_column),
                    kurtosis=stats.kurtosis(dataframe_column),
                    skewness=stats.skew(dataframe_column))

            feature = Feature(name=dataframe_column.name,
                              label=dataframe_column.name,
                              data_type=data_type,
                              data_category=data_category,
                              can_be_seen_as_category=can_be_seen_as_category,
                              can_be_seen_as_index=can_be_seen_as_index,
                              unique_values=unique_values,
                              missing_values_NA=missing_values_NA,
                              missing_values_NULL=missing_values_NULL,
                              statistics=num_statistics,
                              categories=categories)

            return feature

        except Exception as e:
            print(
                'Error: data_tabular - calc_distribution() \nException Message: ',
                e)
Example #39
0
print("男孩身高標準差=", std_boy)

statistics_stdev_boy = statistics.stdev(boys)
print("statistics_mean_boy=", statistics_stdev_boy)

# Python percentiles
# np
print("90百分位數=", np.percentile(boys, 90))
print("50百分位數=", np.percentile(boys, 50))
print("20百分位數=", np.percentile(boys, 20))
#stat
print("20百分位數=", stats.scoreatpercentile(boys, 20))

# compute kurtosis and skewness
print(stats.skew(boys))
print(stats.kurtosis(boys))

# pandas and scipy.stats give similar values
# Python kurtosis

# finally, plot the distribution
plt.hist(boys, alpha=.4, bins=40)
plt.title('boy,skewness={0},kurtosis={1}'.format(
    round(stats.skew(boys), 2), round(stats.kurtosis(boys), 2)))
plt.axvline(x=mean_boy)
plt.show()

# Today we learned the properties of different statistics.
# Try analyzing the boys' and girls' height data
# and answer the question below:
# Q1: Using what was taught today, how would you describe the shape of these two datasets?
Example #40
0
def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True):
    """
    Calculates the four kurtosis measures in Kim & White

    Parameters
    ----------
    y : array-like
    axis : int or None, optional
        Axis along which the kurtoses are computed.  If `None`, the
        entire array is used.
    ab: iterable, optional
        Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
        quantile cut-off for measuring the extreme tail and beta is the central
        quantile cutoff for the standardization of the measure
    dg: iterable, optional
        Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
        quantile for measuring extreme values and gamma is the central quantile
        used in the standardization of the measure
    excess : bool, optional
        If true (default), computed values are excess of those for a standard
        normal distribution.

    Returns
    -------
    kr1 : ndarray
          The standard kurtosis estimator.
    kr2 : ndarray
          Kurtosis estimator based on octiles.
    kr3 : ndarray
          Kurtosis estimator based on exceedance expectations.
    kr4 : ndarray
          Kurtosis measure based on the spread between high and low quantiles.

    Notes
    -----
    The robust kurtosis measures are defined

    .. math::

        KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right)
        +\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)}
        {\\hat{q}_{.75}-\\hat{q}_{.25}}

    .. math::

        KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right)
        -\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)}
        {\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right)
        -\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)}

    .. math::

        KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}}
        {\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}}

    where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`.

    .. [1] Tae-Hwan Kim and Halbert White, "On more robust estimation of
       skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
       March 2004.
    """
    if (axis is None or (y.squeeze().ndim == 1 and y.ndim != 1)):
        y = y.ravel()
        axis = 0

    alpha, beta = ab
    delta, gamma = dg

    perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5, delta, 100.0 - delta, gamma,
            100.0 - gamma)
    e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y,
                                                               perc,
                                                               axis=axis)

    expected_value = expected_robust_kurtosis(ab,
                                              dg) if excess else np.zeros(4)

    kr1 = stats.kurtosis(y, axis, False) - expected_value[0]
    kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1]
    if y.ndim == 1:
        kr3 = _kr3(y, alpha, beta)
    else:
        kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta)
    kr3 -= expected_value[2]
    kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3]
    return kr1, kr2, kr3, kr4
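A usage sketch, assuming the companion helpers expected_robust_kurtosis and _kr3 referenced above are importable (in statsmodels these live alongside this function in statsmodels.stats.stattools); for a large standard-normal sample all four excess measures should be close to zero:

import numpy as np

y = np.random.standard_normal(100000)
kr1, kr2, kr3, kr4 = robust_kurtosis(y)
print(kr1, kr2, kr3, kr4)   # each should be near 0 for normal data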
Example #41
0
def dataClean(alltrigs, opt, flag=1):
    """
    Examine triggers and weed out spikes and calibration pulses using kurtosis and
    outlier ratios
    
    alltrigs: triggers output from triggering
    opt: opt from config
    flag: 1 if defining window to check, 0 if want to check whole waveform for spikes
        (note that different threshold values should be used for different window lengths)
    
    Returns good trigs (trigs) and several junk types (junk, junkFI, junkKurt)
    """

    trigs = Stream()
    junkFI = Stream()
    junkKurt = Stream()
    junk = Stream()
    for i in range(len(alltrigs)):

        njunk = 0
        ntele = 0

        for n in range(opt.nsta):

            dat = alltrigs[i].data[n * opt.wshape:(n + 1) * opt.wshape]
            if flag == 1:
                datcut = dat[range(
                    int((opt.ptrig - opt.kurtwin / 2) * opt.samprate),
                    int((opt.ptrig + opt.kurtwin / 2) * opt.samprate))]
            else:
                datcut = dat

            if np.sum(np.abs(dat)) != 0.0:
                # Calculate kurtosis in window
                k = stats.kurtosis(datcut)
                # Compute kurtosis of frequency amplitude spectrum next
                datf = np.absolute(fft(dat))
                kf = stats.kurtosis(datf)
                # Calculate outlier ratio using z ((data-median)/mad)
                mad = np.nanmedian(np.absolute(dat - np.nanmedian(dat)))
                z = (dat - np.median(dat)) / mad
                # Outliers have z > 4.45
                orm = len(z[z > 4.45]) / np.array(len(z)).astype(float)

                if k >= opt.kurtmax or orm >= opt.oratiomax or kf >= opt.kurtfmax:
                    njunk += 1

                winstart = int(opt.ptrig * opt.samprate - opt.winlen / 10)
                winend = int(opt.ptrig * opt.samprate - opt.winlen / 10 +
                             opt.winlen)
                fftwin = np.reshape(fft(dat[winstart:winend]), (opt.winlen, ))
                if np.median(np.abs(dat[winstart:winend])) != 0:
                    fi = np.log10(
                        np.mean(
                            np.abs(
                                np.real(
                                    fftwin[int(opt.fiupmin * opt.winlen /
                                               opt.samprate
                                               ):int(opt.fiupmax * opt.winlen /
                                                     opt.samprate)]))) /
                        np.mean(
                            np.abs(
                                np.real(
                                    fftwin[int(opt.filomin * opt.winlen /
                                               opt.samprate
                                               ):int(opt.filomax * opt.winlen /
                                                     opt.samprate)]))))
                    if fi < opt.telefi:
                        ntele += 1

        # Allow if there are enough good stations to correlate
        if njunk <= (opt.nsta - opt.ncor) and ntele <= opt.teleok:
            trigs.append(alltrigs[i])
        else:
            if njunk > 0:
                if ntele > 0:
                    junk.append(alltrigs[i])
                else:
                    junkKurt.append(alltrigs[i])
            else:
                junkFI.append(alltrigs[i])

    return trigs, junk, junkFI, junkKurt
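
A self-contained sketch of the per-station spike test used inside the loop above (time-domain kurtosis, spectral kurtosis, and the robust outlier ratio). The threshold constants stand in for opt.kurtmax, opt.kurtfmax and opt.oratiomax and are illustrative, not the project's defaults.

import numpy as np
from numpy.fft import fft
from scipy import stats

KURTMAX, KURTFMAX, ORATIOMAX = 80.0, 80.0, 0.06      # placeholder thresholds

def looks_like_spike(dat):
    """Return True if a trace looks like a spike or calibration pulse."""
    k = stats.kurtosis(dat)                           # time-domain kurtosis
    kf = stats.kurtosis(np.abs(fft(dat)))             # kurtosis of the amplitude spectrum
    mad = np.nanmedian(np.abs(dat - np.nanmedian(dat)))
    z = (dat - np.median(dat)) / mad                  # robust z-score
    orm = np.count_nonzero(z > 4.45) / float(len(z))  # outlier ratio
    return k >= KURTMAX or kf >= KURTFMAX or orm >= ORATIOMAX

rng = np.random.default_rng(1)
noise = rng.normal(size=2000)
spike = noise.copy()
spike[1000] += 200.0                                  # a single huge sample
print(looks_like_spike(noise), looks_like_spike(spike))   # typically False, True
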
Example #42
0
def getKURT(vector):
    vector = np.asarray(vector)
    return stats.kurtosis(vector)
Example #43
0
"""
  Name     : c8_14_mean_std_skew_kurt.py
  Book     : Python for Finance (2nd ed.)
  Publisher: Packt Publishing Ltd. 
  Author   : Yuxing Yan
  Date     : 6/6/2017
  email    : [email protected]
             [email protected]
"""

from scipy import stats, random
import numpy as np
np.random.seed(12345)
ret = random.normal(0, 1, 500000)

print('mean    =', np.mean(ret))
print('std     =', np.std(ret))
print('skewness=', stats.skew(ret))
print('kurtosis=', stats.kurtosis(ret))
Example #44
0
scaler = StandardScaler()
X= scaler.fit_transform(X)
print("PCA analysis")
from sklearn.decomposition import FastICA
from scipy.stats import kurtosis
KURTOSIS = []
N_COMPS = np.arange(2,20,1)

for n_comps in N_COMPS:
    X = np.copy(X_safe)
    Y = np.copy(Y_safe)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    transformer = FastICA(n_components=n_comps, random_state=111, tol=0.001)
    X = transformer.fit_transform(X)
    kurt = kurtosis(X, axis=0)      # kurtosis of each independent component (over samples)
    kurt = np.mean(np.abs(kurt))
    KURTOSIS.append(kurt)
    print(" n_components = ", n_comps, " Kurtosis :", kurt)

fig, ax = plt.subplots()
plt.plot(N_COMPS, KURTOSIS , 'o', color = 'steelblue')
plt.plot(N_COMPS, KURTOSIS , '-', color = 'steelblue' , alpha = 0.5)
plt.title("Kurtosis, Independent component analysis")
plt.xlabel('Number of components') , plt.ylabel('Average kurtosis')
plt.savefig("plots/diabetes_ICA_kurtosis.png")

X = np.copy(X_safe)
Y = np.copy(Y_safe)
scaler = StandardScaler()
X = scaler.fit_transform(X)
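
A small sketch clarifying the axis convention used when the loop above scores components by kurtosis: for the (n_samples, n_components) matrix returned by FastICA, per-component kurtosis is taken over samples (axis=0). The array below is an arbitrary stand-in, not the dataset used above.

import numpy as np
from scipy.stats import kurtosis

S = np.random.default_rng(2).laplace(size=(1000, 3))   # three super-Gaussian "sources"
print(kurtosis(S, axis=0).shape)    # (3,)    one value per component
print(kurtosis(S, axis=1).shape)    # (1000,) one value per sample
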
Example #45
0
def dohistogram(que, Y, **kwargs):
    '''
    Return a histogram of Y-values and a gaussian
    fit of the histogram, excluding values that
    exceed either the compliance limit (for current
    or current-density) or the ceiling for R. We
    would like to include all data in the histogram,
    but outliers sometimes confuse the fitting
    routine, which defeats the purpose of machine-fitting
    '''

    defaultKwargs = {'label': '', 'density': False, 'warnings': False}
    kwargs = {**defaultKwargs, **kwargs}
    logger = logging.getLogger(__package__ + ".dohistogram")
    logger.addHandler(QueueHandler(que))

    def __handlematherror(msg):
        # TODO we can now split out the file name with the bad data in it!
        logger.warning(
            "Encountered this error while constructing histogram: %s",
            str(msg),
            exc_info=False)
        bins = np.array([0., 0., 0., 0.])
        freq = np.array([0., 0., 0., 0.])
        return bins, freq

    try:
        yrange = (Y.min(), Y.max())
    except ValueError as msg:
        logger.error("Error ranging data for histogram: %s", str(msg))
        yrange = (0, 0)

    if kwargs['label'] == "J" or kwargs['label'] == "lag":
        Y = Y[Y <= opts.compliance]
        if yrange != (0, 0):
            yrange = (Y.min() - 1, Y.max() + 1)
    if kwargs['label'] == "R":
        Y = Y[Y <= opts.maxr]
    if kwargs['label'] in ('DJDV', 'NDC'):
        nbins = opts.heatmapbins
    else:
        nbins = opts.bins
    if len(Y) < 10 and kwargs['warnings']:
        logger.warning("Histogram with only %d points.", len(Y))
    try:
        freq, bins = np.histogram(Y,
                                  range=yrange,
                                  bins=nbins,
                                  density=kwargs['density'])
    except ValueError as msg:
        bins, freq = __handlematherror(msg)
    except FloatingPointError as msg:
        bins, freq = __handlematherror(msg)

    if len(Y):
        Ym = signedgmean(Y)
        Ys = abs(Y.std())
    else:
        Ym, Ys = 0.0, 0.0

    p0 = [1., Ym, Ys]
    bin_centers = (bins[:-1] + bins[1:]) / 2
    coeff = p0
    covar = None
    assert (covar is None)
    hist_fit = np.array([x * 0 for x in range(0, len(bin_centers))])
    try:
        if opts.lorenzian:
            coeff, covar = curve_fit(lorenz,
                                     bin_centers,
                                     freq,
                                     p0=p0,
                                     maxfev=opts.maxfev)
            hist_fit = lorenz(bin_centers, *coeff)
        else:
            coeff, covar = curve_fit(gauss,
                                     bin_centers,
                                     freq,
                                     p0=p0,
                                     maxfev=opts.maxfev)
            hist_fit = gauss(bin_centers, *coeff)
    except RuntimeError:
        if opts.maxfev > 100 and kwargs['warnings']:
            logger.warning("|%s| Fit did not converge",
                           kwargs['label'],
                           exc_info=False)
    except ValueError as msg:
        if kwargs['warnings']:
            logger.warning(
                "|%s| Skipping data with ridiculous numbers in it (%s)",
                kwargs['label'],
                str(msg),
                exc_info=False)
    except FloatingPointError as msg:
        logger.error(
            "|%s| Encountered floating point error fitting Guasian: %s",
            kwargs['label'],
            str(msg),
            exc_info=False)

    try:
        skewstat, skewpval = skewtest(freq)
        kurtstat, kurtpval = kurtosistest(freq)
    except ValueError as msg:
        logger.error("|%s| Could not perform skewtest: %s",
                     kwargs['label'],
                     str(msg),
                     exc_info=False)
        skewstat, skewpval, kurtstat, kurtpval = 0.0, 0.0, 0.0, 0.0
    return {
        "bin": bin_centers,
        "freq": freq,
        "mean": coeff[1],
        "std": coeff[2],
        "var": coeff[2],
        "bins": bins,
        "fit": hist_fit,
        "Gmean": Ym,
        "Gstd": Ys,
        "skew": skew(freq),
        "kurtosis": kurtosis(freq),
        "skewstat": skewstat,
        "skewpval": skewpval,
        "kurtstat": kurtstat,
        "kurtpval": kurtpval
    }
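
A stripped-down sketch of the histogram-plus-Gaussian-fit idea used above, without the compliance/ceiling filtering, logging, or Lorentzian branch; gauss here is a local stand-in, not the module's own helper.

import numpy as np
from scipy.optimize import curve_fit

def gauss(x, A, mu, sigma):
    return A * np.exp(-(x - mu) ** 2 / (2.0 * sigma ** 2))

Y = np.random.default_rng(3).normal(loc=2.0, scale=0.5, size=5000)
freq, bins = np.histogram(Y, bins=50)
bin_centers = (bins[:-1] + bins[1:]) / 2
p0 = [freq.max(), Y.mean(), Y.std()]            # initial guess: amplitude, mean, std
coeff, _ = curve_fit(gauss, bin_centers, freq, p0=p0)
print("fitted mean %.3f, std %.3f" % (coeff[1], abs(coeff[2])))
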
Example #46
0
def handle_dimredux(X, outpath, PCA_cut=0.95, SVD_cut=0.95):
    # PCA
    pca = PCA(PCA_cut, whiten=True, svd_solver='auto', random_state=SEED)
    pcaRes = pca.fit_transform(X)
    # pcaRes = (pca.fit_transform(X), pca)
    plt.plot(pca.explained_variance_)
    plt.xlabel('Component')
    plt.ylabel('Eigenvalues')
    plt.title(
        f'Distribution of Eigenvalues over PCA components \n Explains {PCA_cut * 100}% of Variance, k={len(pca.explained_variance_)}'
    )
    plt.savefig(os.path.join(outpath, 'PCAEigenDist.png'),
                dpi=400,
                format='png')
    plt.close()

    # ICA
    ica = FastICA(whiten=True, random_state=SEED, max_iter=10000, tol=0.001)
    ica.fit(X)
    kvals = []
    xv = np.arange(2, ica.components_.shape[0])
    for i in xv:
        kvals.append(
            np.mean(kurtosis(np.dot(X, ica.components_[:i].T))**2)
        )  # transform X with increasingly more ICA components and calculate the kurtosis of the transformation
    ica_k = xv[np.argmax(kvals)]
    icaRes = np.dot(X, ica.components_[:ica_k].T
                    )  # Take the X transform with greatest kurtosis
    # icaRes = (np.dot(X, ica.components_[:ica_k].T), ica)  # Take the X transform with greatest kurtosis
    plt.plot(xv, kvals, label='Kurtosis')
    plt.vlines(ica_k, 0, np.max(kvals), label=f'Best K: {ica_k}')
    plt.yscale('log')
    plt.xlabel('ICA Components')
    plt.ylabel('Mean Squared Kurtosis')
    plt.title('Kurtosis of ICA Components')
    plt.legend()
    plt.savefig(os.path.join(outpath, 'ICAKurtosis.png'),
                dpi=400,
                format='png')
    plt.close()

    # RCA
    reconScore = []
    X_ts = []
    Xvals = np.arange(2, X.shape[1])
    for i in Xvals:
        rca = SRP(i, dense_output=True)
        X_t = rca.fit_transform(X)
        reverse = np.linalg.pinv(rca.components_.toarray())
        l = 0
        for j in range(9):
            rca = SRP(i, dense_output=True)
            X_t += rca.fit_transform(X)
            reverse += np.linalg.pinv(rca.components_.toarray())
            l += 1
        reconScore.append(
            ((X - np.dot(X_t / (1 + l), reverse.T / (1 + l)))**2).mean())
        X_ts.append(X_t / (1 + l))
    rca_k = Xvals[np.argmin(reconScore)]
    if rca_k > len(reconScore):
        rca_k = len(reconScore) - 1
    minError = reconScore[rca_k]
    rcaRes = X_ts[rca_k]
    # rcaRes = (X_ts[rca_k], None)
    plt.plot(Xvals, reconScore, label='Recon. Score')
    plt.vlines(rca_k, 0, max(reconScore), label=f'Best K: {rca_k}')
    plt.title('Reconstruction Scores (MSE) for Randomized Projections')
    plt.xlabel('Components')
    plt.ylabel('MSE')
    plt.legend()
    plt.savefig(os.path.join(outpath, 'RCARecon.png'), dpi=400, format='png')
    plt.close()

    # SVD
    svd = TruncatedSVD(X.shape[1] - 1, random_state=SEED)
    svd.fit(X)
    evr_Cumm = np.cumsum(svd.explained_variance_ratio_)
    svd_k = (evr_Cumm <= SVD_cut).sum()
    svdRes = svd.transform(X)
    # svdRes = (svd.transform(X), svd)
    plt.plot(evr_Cumm, label='Cumm. Ratio')
    plt.plot(svd.explained_variance_ratio_, label='Ratio of exp. var.')
    plt.vlines(svd_k, 0, 1, label=f'Best K: {svd_k}')
    plt.title(
        f'Choosing best k components for Truncated SVD \n Explains {SVD_cut * 100}% of variance'
    )
    plt.xlabel('Components')
    plt.ylabel('Ratio/Percentage of Explained Variance')
    plt.legend()
    plt.savefig(os.path.join(outpath, 'SVDChooseK.png'), dpi=400, format='png')
    plt.close()

    return pcaRes, icaRes, rcaRes, svdRes
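
A compact sketch of the reconstruction-error criterion used in the RCA block above: project with a SparseRandomProjection, map back through the pseudo-inverse of its components, and score by mean squared error (no averaging over restarts here; the data is a random stand-in).

import numpy as np
from sklearn.random_projection import SparseRandomProjection as SRP

X = np.random.default_rng(4).normal(size=(200, 20))
for k in (2, 5, 10, 15):
    rp = SRP(n_components=k, dense_output=True, random_state=0)
    X_t = rp.fit_transform(X)                            # (200, k)
    reverse = np.linalg.pinv(rp.components_.toarray())   # (20, k) pseudo-inverse
    X_rec = X_t @ reverse.T                              # back to feature space
    print(k, ((X - X_rec) ** 2).mean())
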
Example #47
0
def CalcateDQF(fy4data, exdata, bias, dqf):
    fy4data0 = fy4data[np.where(dqf == 0)]
    exdata0 = exdata[np.where(dqf == 0)]
    bias0 = bias[np.where(dqf == 0)]
    fy4data0 = fy4data0[~fy4data0.mask]
    exdata0 = exdata0[~exdata0.mask]
    bias0 = bias0[~bias0.mask]

    fy4data1 = fy4data[np.where(dqf == 1)]
    exdata1 = exdata[np.where(dqf == 1)]
    bias1 = bias[np.where(dqf == 1)]
    fy4data1 = fy4data1[~fy4data1.mask]
    exdata1 = exdata1[~exdata1.mask]
    bias1 = bias1[~bias1.mask]

    fy4data2 = fy4data[np.where(dqf == 2)]
    exdata2 = exdata[np.where(dqf == 2)]
    bias2 = bias[np.where(dqf == 2)]
    fy4data2 = fy4data2[~fy4data2.mask]
    exdata2 = exdata2[~exdata2.mask]
    bias2 = bias2[~bias2.mask]

    fy4data3 = fy4data[~fy4data.mask]
    exdata3 = exdata[~exdata.mask]
    bias3 = bias[~bias.mask]
    # Compute metrics
    slope0, intercept0, r_value0, p_value0, std_err0 = stats.linregress(
        fy4data0, exdata0)
    slope1, intercept1, r_value1, p_value1, std_err1 = stats.linregress(
        fy4data1, exdata1)
    slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(
        fy4data2, exdata2)
    slope3, intercept3, r_value3, p_value3, std_err3 = stats.linregress(
        fy4data3, exdata3)

    metrics = collections.OrderedDict()
    metrics['QualID0_NUM'] = fy4data0.size  # total count
    metrics['QualID0_MAX'] = round(bias0.max(), 4)  # maximum
    metrics['QualID0_MIN'] = round(bias0.min(), 4)  # minimum
    metrics['QualID0_MEDIAN'] = round(np.median(bias0), 4)  # median
    metrics['QualID0_MEAN'] = round(bias0.mean(), 4)  # mean
    metrics['QualID0_AE'] = round(np.abs(bias0).mean(), 4)  # mean of absolute values
    metrics['QualID0_STD'] = round(
        np.sqrt(np.square(bias0).sum() / (fy4data0.size - 1)), 4)
    metrics['QualID0_RMSE'] = round(
        np.sqrt(np.square(bias0).sum() / fy4data0.size), 4)  # root mean square error
    metrics['QualID0_SKEW'] = round(stats.skew(bias0), 4)  # skewness coefficient
    metrics['QualID0_KURT'] = round(stats.kurtosis(bias0), 4)  # kurtosis coefficient
    metrics['QualID0_CORR'] = round(r_value0, 4)

    metrics['QualID0_slope'] = round(slope0, 4)
    metrics['QualID0_intercept'] = round(intercept0, 4)

    metrics['QualID1_NUM'] = fy4data1.size  # total count
    metrics['QualID1_MAX'] = round(bias1.max(), 4)  # maximum
    metrics['QualID1_MIN'] = round(bias1.min(), 4)  # minimum
    metrics['QualID1_MEDIAN'] = round(np.median(bias1), 4)  # median
    metrics['QualID1_MEAN'] = round(bias1.mean(), 4)  # mean
    metrics['QualID1_AE'] = round(np.abs(bias1).mean(), 4)  # mean of absolute values
    metrics['QualID1_STD'] = round(
        np.sqrt(np.square(bias1).sum() / (fy4data1.size - 1)), 4)
    metrics['QualID1_RMSE'] = round(
        np.sqrt(np.square(bias1).sum() / fy4data1.size), 4)  # root mean square error
    metrics['QualID1_SKEW'] = round(stats.skew(bias1), 4)  # skewness coefficient
    metrics['QualID1_KURT'] = round(stats.kurtosis(bias1), 4)  # kurtosis coefficient
    metrics['QualID1_CORR'] = round(r_value1, 4)

    metrics['QualID1_slope'] = round(slope1, 4)
    metrics['QualID1_intercept'] = round(intercept1, 4)

    metrics['QualID2_NUM'] = fy4data2.size  # total count
    metrics['QualID2_MAX'] = round(bias2.max(), 4)  # maximum
    metrics['QualID2_MIN'] = round(bias2.min(), 4)  # minimum
    metrics['QualID2_MEDIAN'] = round(np.median(bias2), 4)  # median
    metrics['QualID2_MEAN'] = round(bias2.mean(), 4)  # mean
    metrics['QualID2_AE'] = round(np.abs(bias2).mean(), 4)  # mean of absolute values
    metrics['QualID2_STD'] = round(
        np.sqrt(np.square(bias2).sum() / (fy4data2.size - 1)), 4)
    metrics['QualID2_RMSE'] = round(
        np.sqrt(np.square(bias2).sum() / fy4data2.size), 4)  # root mean square error
    metrics['QualID2_SKEW'] = round(stats.skew(bias2), 4)  # skewness coefficient
    metrics['QualID2_KURT'] = round(stats.kurtosis(bias2), 4)  # kurtosis coefficient
    metrics['QualID2_CORR'] = round(r_value2, 4)

    metrics['QualID2_slope'] = round(slope2, 4)
    metrics['QualID2_intercept'] = round(intercept2, 4)

    metrics['QualID3_NUM'] = fy4data3.size  # total count
    metrics['QualID3_MAX'] = round(bias3.max(), 4)  # maximum
    metrics['QualID3_MIN'] = round(bias3.min(), 4)  # minimum
    metrics['QualID3_MEDIAN'] = round(np.median(bias3), 4)  # median
    metrics['QualID3_MEAN'] = round(bias3.mean(), 4)  # mean
    metrics['QualID3_AE'] = round(np.abs(bias3).mean(), 4)  # mean of absolute values
    metrics['QualID3_STD'] = round(
        np.sqrt(np.square(bias3).sum() / (fy4data3.size - 1)), 4)
    metrics['QualID3_RMSE'] = round(
        np.sqrt(np.square(bias3).sum() / fy4data3.size), 4)  # root mean square error
    metrics['QualID3_SKEW'] = round(stats.skew(bias3), 4)  # skewness coefficient
    metrics['QualID3_KURT'] = round(stats.kurtosis(bias3), 4)  # kurtosis coefficient
    metrics['QualID3_CORR'] = round(r_value3, 4)

    metrics['QualID3_slope'] = round(slope3, 4)
    metrics['QualID3_intercept'] = round(intercept3, 4)
    return metrics
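
The four blocks above repeat the same computation for each quality flag; a hedged refactoring sketch of that computation for a single flag (the helper name and arguments are placeholders, assuming unmasked 1-D arrays as produced above).

import collections
import numpy as np
from scipy import stats

def quality_metrics(prefix, fy4, ex, bias):
    slope, intercept, r_value, p_value, std_err = stats.linregress(fy4, ex)
    m = collections.OrderedDict()
    m[prefix + '_NUM'] = fy4.size                                     # total count
    m[prefix + '_MAX'] = round(bias.max(), 4)                         # maximum
    m[prefix + '_MIN'] = round(bias.min(), 4)                         # minimum
    m[prefix + '_MEDIAN'] = round(np.median(bias), 4)                 # median
    m[prefix + '_MEAN'] = round(bias.mean(), 4)                       # mean
    m[prefix + '_AE'] = round(np.abs(bias).mean(), 4)                 # mean of absolute values
    m[prefix + '_STD'] = round(np.sqrt(np.square(bias).sum() / (fy4.size - 1)), 4)
    m[prefix + '_RMSE'] = round(np.sqrt(np.square(bias).sum() / fy4.size), 4)
    m[prefix + '_SKEW'] = round(stats.skew(bias), 4)                  # skewness coefficient
    m[prefix + '_KURT'] = round(stats.kurtosis(bias), 4)              # kurtosis coefficient
    m[prefix + '_CORR'] = round(r_value, 4)
    m[prefix + '_slope'] = round(slope, 4)
    m[prefix + '_intercept'] = round(intercept, 4)
    return m
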
Example #48
0
    #     trad_date_from = dt.date(2011, 2, 1)
    #     trad_date_to = dt.date(2014, 1, 31)
    # get simple returns dataframe and dates
    dates, returns_df = simulate_trading(close_df, trad_date_from,
                                         trad_date_to, args.trad_freq,
                                         args.est_per_trad_days,
                                         args.trad_per_trad_days,
                                         args.no_pairs)
    # convert returns to log returns
    log_returns_df = np.log(returns_df + 1)
    # sum up returns and calculate cumulative sum of log returns
    cum_log_returns_df = np.cumsum(log_returns_df)
    # convert to simple returns
    cum_returns_df = np.exp(cum_log_returns_df)
    #
    daily_ret = np.mean(log_returns_df)
    daily_vol = np.std(log_returns_df)
    ann_ret = np.mean(log_returns_df) * 252
    ann_vol_ret = np.std(log_returns_df) * np.sqrt(252)
    skew = stats.skew(log_returns_df)
    kurt = stats.kurtosis(log_returns_df)
    min_daily_ret = np.min(log_returns_df)
    max_daily_ret = np.max(log_returns_df)
    cum_ret = cum_returns_df[-1] - 1
    # plot
    plt.plot(dates, cum_returns_df, label='Growth of 1$')
    plt.legend()
    plt.grid(True)
    plt.xlabel('Date')
pass
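
The conversion steps above rely on the usual simple/log return identities; a tiny sketch verifying them on toy numbers (the array is arbitrary).

import numpy as np

r_simple = np.array([0.01, -0.02, 0.03])
r_log = np.log(r_simple + 1)                 # simple -> log returns
cum_log = np.cumsum(r_log)                   # cumulative log return
growth = np.exp(cum_log)                     # growth of $1
assert np.allclose(growth, np.cumprod(1 + r_simple))
print(growth[-1] - 1)                        # cumulative simple return
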
Example #49
0
    minkowski(x, y, 3)
    for (x, y) in zip(np.nan_to_num(question1_vectors_train),
                      np.nan_to_num(question2_vectors_train))
]
train_df['braycurtis_distance2'] = [
    braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors_train),
                                       np.nan_to_num(question2_vectors_train))
]
train_df['skew_q1vec2'] = [
    skew(x) for x in np.nan_to_num(question1_vectors_train)
]
train_df['skew_q2vec2'] = [
    skew(x) for x in np.nan_to_num(question2_vectors_train)
]
train_df['kur_q1vec2'] = [
    kurtosis(x) for x in np.nan_to_num(question1_vectors_train)
]
train_df['kur_q2vec2'] = [
    kurtosis(x) for x in np.nan_to_num(question2_vectors_train)
]

question1_vectors_test = np.zeros((test.shape[0], 300))
question2_vectors_test = np.zeros((test.shape[0], 300))
error_count_test = 0
for i, q in tqdm(enumerate(test.question1.values)):
    question1_vectors_test[i, :] = sent2vec(q)
for i, q in tqdm(enumerate(test.question2.values)):
    question2_vectors_test[i, :] = sent2vec(q)
test_df['cosine_distance2'] = [
    cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors_test),
                                   np.nan_to_num(question2_vectors_test))
]
Example #50
0
def calc_kurtosis(data):
    return kurtosis(data)
Example #51
0
def nankurtosis(arr, axis=0):
    return stats.kurtosis(arr, axis=axis, nan_policy="omit")
Example #52
0
def jump_diffusion(S=1,
                   X=0.5,
                   T=1,
                   mu=0.12,
                   sigma=0.3,
                   Lambda=0.25,
                   a=0.2,
                   b=0.2,
                   Nsteps=252,
                   Nsim=100,
                   alpha=0.05,
                   seed=None):
    '''
    Monte Carlo simulation [1] of Merton's Jump Diffusion Model [2].
    The model is specified through the stochastic differential equation (SDE):

                        dS(t)
                        ----- = mu*dt + sigma*dW(t) + dJ(t)
                        S(t-)

    with:

    mu, sigma: constants, the drift and volatility coefficients of the stock
               price process;
    W: a standard one-dimensional Brownian motion;
    J: a jump process, independent of W, with piecewise constant sample paths.
       It is defined as the sum of multiplicative jumps Y(j).

    Input
    ---------------------------------------------------------------------------
    S: float. The current asset price.
    X: float. The strike price, i.e. the price at which the asset may be bought
       (call) or sold (put) in an option contract [3].
    T: int or float. The maturity of the option contract, i.e. the final
       monitoring date.
    mu, sigma: float. Respectively, the drift and volatility coefficients of
               the asset price process.
    Lambda: float. The intensity of the Poisson process in the jump diffusion
            model ('lambda' is a reserved keyword in Python).
    a, b: float. Parameters required to calculate, respectively, the mean and
          variance of a standard lognormal distribution, log(x) ~ N(a, b**2).
          (see code).
    Nsteps: int. The number of monitoring dates, i.e. the time steps.
    Nsim: int. The number of Monte Carlo simulations (at least 10,000 required
          to generate stable results).
    alpha: float. The confidence interval significance level, in [0, 1].
    seed: int. Set random seed, for reproducibility of the results. Default
          value is None (the best seed available is used, but outcome will vary
          in each experiment).

    References
    ---------------------------------------------------------------------------
    [1] Glasserman, P. (2003): 'Monte Carlo Methods in Financial Engineering',
        Springer Applications of Mathematics, Vol. 53
    [2] Merton, R.C. (1976): 'Option Pricing when Underlying Stock Returns are
        Discontinuous', Journal of Financial Economics, 3:125-144.
    [3] Hull, J.C. (2017): 'Options, Futures, and Other Derivatives', 10th
        Edition, Pearson.
    '''

    # Import required libraries
    import time
    import numpy as np
    from scipy import stats
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Set random seed
    np.random.seed(seed)
    '''
    Time the whole path-generating process, using a tic-toc method familiar
    to MATLAB users
    '''
    tic = time.time()

    # Calculate the length of the time step
    Delta_t = T / Nsteps
    '''
    Compute mean and variance of a standard lognormal distribution from user
    defined parameters a and b. The latter are useful to simulate the jump
    component in Monte Carlo.
    a and b are chosen such that log(Y(j)) ~ N(a, b**2). This implies that the
    mean and variance of the multiplicative jumps will be:

     * mean_Y = np.exp(a + 0.5*(b**2))
     * variance_Y = np.exp(2*a + b**2) * (np.exp(b**2)-1)

    '''
    mean_Y = np.exp(a + 0.5 * (b**2))
    variance_Y = np.exp(2 * a + b**2) * (np.exp(b**2) - 1)
    '''
    Calculate the theoretical drift (M) and volatility (V) of the stock price
    process under Merton's jump diffusion model. These values can be used to
    monitor the rate of convergence of Monte Carlo estimates as the number of
    simulated experiments increases, and can help spot errors, if any, in
    implementing the model.
    '''
    M = S * np.exp(mu * T + Lambda * T * (mean_Y - 1))
    V = S**2 * (np.exp((2*mu + sigma**2)*T \
        + Lambda*T*(variance_Y + mean_Y**2 - 1)) \
        - np.exp(2*mu*T + 2*Lambda*T*(mean_Y - 1)))
    '''
    Generate an Nsim x (Nsteps+1) array of zeros to preallocate the simulated
    paths of the Monte Carlo simulation. Each row of the matrix represents a
    full, possible path for the stock, each column all values of the asset at
    a particular instant in time.
    '''
    simulated_paths = np.zeros([Nsim, Nsteps + 1])

    # Replace the first column of the array with the vector of initial price S
    simulated_paths[:, 0] = S
    '''
    To account for the multiple sources of uncertainty in the jump diffusion
    process, generate three arrays of random variables.

     - The first one is related to the standard Brownian motion, the component
       epsilon(0,1) in epsilon(0,1) * np.sqrt(dt);
     - The second and third ones model the jump, a compound Poisson process:
       the former (a Poisson process with intensity Lambda) causes the asset
       price to jump randomly (random timing); the latter (a Gaussian variable)
       defines both the direction (sign) and intensity (magnitude) of the jump.
    '''
    Z_1 = np.random.normal(size=[Nsim, Nsteps])
    Z_2 = np.random.normal(size=[Nsim, Nsteps])
    Poisson = np.random.poisson(Lambda * Delta_t, [Nsim, Nsteps])

    # Populate the matrix with Nsim randomly generated paths of length Nsteps
    for i in range(Nsteps):
        simulated_paths[:,i+1] = simulated_paths[:,i]*np.exp((mu
                               - sigma**2/2)*Delta_t + sigma*np.sqrt(Delta_t) \
                               * Z_1[:,i] + a*Poisson[:,i] \
                               + np.sqrt(b**2) * np.sqrt(Poisson[:,i]) \
                               * Z_2[:,i])

    # Single out array of simulated prices at maturity T
    final_prices = simulated_paths[:, -1]

    # Compute mean, variance, standard deviation, skewness, excess kurtosis
    mean_jump = np.mean(final_prices)
    var_jump = np.var(final_prices)
    std_jump = np.std(final_prices)
    skew_jump = stats.skew(final_prices)
    kurt_jump = stats.kurtosis(final_prices)

    # Calculate confidence interval for the mean
    ci_low = mean_jump - std_jump / np.sqrt(Nsim) * stats.norm.ppf(1 -
                                                                   0.5 * alpha)
    ci_high = mean_jump + std_jump / np.sqrt(Nsim) * stats.norm.ppf(1 - 0.5 *
                                                                    alpha)

    # Print statistics, align results
    print("Merton's Jump Diffusion Model")
    print('-----------------------------')
    print('Theoretical Moments')
    print('-----------------------------')
    print('Mean (M){:>21.4f}'.format(M))
    print('Variance (V){:>17.4f}'.format(V))
    print('\nMonte Carlo Estimates')
    print('-----------------------------')
    print('Mean {:>24.4f}'.format(mean_jump))
    print('Variance {:>20.4f}'.format(var_jump))
    print('Standard deviation {:>10.4f}'.format(std_jump))
    print('Skewness {:>20.4f}'.format(skew_jump))
    print('Excess kurtosis {:>13.4f}'.format(kurt_jump))
    print('\nConfidence interval, Mean')
    print('-----------------------------')
    print('Alpha {:>23.2f}'.format(alpha))
    print('Lower bound {:>17.4f}'.format(ci_low))
    print('Upper bound {:>17.4f}'.format(ci_high))

    # Choose palette, figure size, and define figure axes
    sns.set(palette='viridis')
    plt.figure(figsize=(10, 8))
    ax = plt.axes()

    # Generate t, the time variable on the abscissae
    t = np.linspace(0, T, Nsteps + 1) * Nsteps

    # Plot the Monte Carlo simulated stock price paths
    jump_diffusion = ax.plot(t, simulated_paths.transpose())

    # Make drawn paths thinner by decreasing line width
    plt.setp(jump_diffusion, linewidth=1)

    # Set title (LaTeX notation) and x- and y- labels
    ax.set(title="Monte Carlo simulated stock price paths in Merton's jump \
diffusion model\n$S_0$ = {}, $\mu$ = {}, $\sigma$ = {}, $a$ = {}, $b$ = {}, \
$\lambda$ = {}, $T$ = {}, Nsteps = {}, Nsim = {}"\
           .format(S, mu, sigma, a, b, Lambda, T, Nsteps, Nsim), \
           xlabel='Time (days)', ylabel='Stock price')

    # Display figure in a Python environment
    plt.show()

    # Time and print the elapsed time
    toc = time.time()
    elapsed_time = toc - tic
    print('Total running time: {:.2f} ms'.format(elapsed_time * 1000))
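
A plot-free sketch that checks the theoretical terminal mean M quoted in the comments above against a direct simulation of S(T) for the same jump-diffusion dynamics; the parameter values are arbitrary and only the terminal price is simulated.

import numpy as np

S0, mu, sigma, Lambda, a, b, T = 1.0, 0.12, 0.3, 0.25, 0.2, 0.2, 1.0
Nsim = 200_000
rng = np.random.default_rng(5)

mean_Y = np.exp(a + 0.5 * b**2)
M = S0 * np.exp(mu * T + Lambda * T * (mean_Y - 1))   # theoretical E[S(T)]

N = rng.poisson(Lambda * T, Nsim)                     # number of jumps per path
W = rng.normal(0.0, np.sqrt(T), Nsim)                 # Brownian increment over [0, T]
J = rng.normal(a * N, b * np.sqrt(N))                 # summed log-jumps ~ N(a*N, b**2*N)
ST = S0 * np.exp((mu - 0.5 * sigma**2) * T + sigma * W + J)

print(M, ST.mean())                                   # close for large Nsim
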
Example #53
0
#print (s)
np.all(s > 1)
np.all(s < 1)
count, bins, ignored = plt.hist(s, 15, density=True)
print(count)
print(bins)
print(ignored)
plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
plt.show()
x = np.random.normal(0.75)

mu, sigma = 0, 0.1  # mean and standard deviation
n = np.random.normal(mu, sigma, 1000)
a = np.random.normal(0.75, size=1000)
np.std(a)
stats.kurtosis(a)
stats.skew(a)
# # Chi squared distributions
chi_squared = np.random.chisquare(2, size=1000)
stats.skew(chi_squared)
chi_squared = np.random.chisquare(6, size=1000)
stats.skew(chi_squared)

pyplot.pie([1, 2, 3])
pyplot.show()
df = pd.read_csv(r'C:\Users\Ashish\Desktop\Test\Grades\Grades.csv')
len(df)
df.head(5)
early = df[df['assignment1_submission'] <= '2015-12-31']
late = df[df['assignment1_submission'] > '2015-12-31']
early['assignment1_grade'].mean()
Example #54
0
plt.show()

# This plot is tall. It is leptokurtic.
# Most students performed similarly.
plt.hist(test_scores_lepto)
plt.show()

# The height of this plot is neither short nor tall. It is mesokurtic.
plt.hist(test_scores_meso)
plt.show()

# We can measure kurtosis with the kurtosis function.
# Negative values indicate platykurtic distributions, positive values indicate leptokurtic distributions, and values near 0 are mesokurtic.
from scipy.stats import kurtosis

kurt_platy = kurtosis(test_scores_platy)
kurt_lepto = kurtosis(test_scores_lepto)
kurt_meso = kurtosis(test_scores_meso)

## 10. Modality ##

import matplotlib.pyplot as plt

# This plot has one mode. It is unimodal.
plt.hist(test_scores_uni)
plt.show()

# This plot has two peaks. It is bimodal.
# This could happen if one group of students learned the material and another learned something else, for example.
plt.hist(test_scores_bi)
plt.show()
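
The test_scores_* arrays are not defined in this excerpt; a sketch of how comparable platykurtic, mesokurtic and leptokurtic samples could be generated and scored with the same sign convention described above (names and parameters are illustrative).

import numpy as np
from scipy.stats import kurtosis

rng = np.random.default_rng(6)
scores_platy = rng.uniform(0, 100, 5000)      # flat-topped: negative excess kurtosis
scores_meso = rng.normal(70, 10, 5000)        # normal-like: excess kurtosis near 0
scores_lepto = rng.laplace(70, 10, 5000)      # peaked, heavy-tailed: positive

print(kurtosis(scores_platy), kurtosis(scores_meso), kurtosis(scores_lepto))
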
Example #55
0
    def run(self):

        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(
            self.transaction.persistent_model_metadata.train_metadata)

        header = self.transaction.input_data.columns
        origData = {}

        for column in header:
            origData[column] = []

        empty_count = {}
        column_count = {}

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(self.transaction.input_data.data_array)
        sample_size = int(
            sampleSize(population_size=population_size,
                       margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                       confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size),
                                                  sample_size)
        self.logging.info(
            'population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'
            .format(population_size=population_size,
                    sample_size=sample_size,
                    percent=(sample_size / population_size) * 100))

        for sample_i in input_data_sample_indexes:
            row = self.transaction.input_data.data_array[sample_i]
            for i, val in enumerate(row):
                column = header[i]
                value = self.cast(val)
                if not column in empty_count:
                    empty_count[column] = 0
                    column_count[column] = 0
                if value == None:
                    empty_count[column] += 1
                else:
                    origData[column].append(value)
                column_count[column] += 1
        stats = {}

        for i, col_name in enumerate(origData):
            col_data = origData[col_name]  # all rows in just one column
            data_type = self.getColumnDataType(col_data)

            # NOTE: Enable this if you want to assume that some numeric values can be text
            # We noticed that by default this should not be the behavior
            # TODO: Evaluate if we want to specify the problem type on predict statement as regression or classification
            #
            # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC:
            #     unique_count = len(set(col_data))
            #     if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN:
            #         data_type = DATA_TYPES.TEXT

            if data_type == DATA_TYPES.DATE:
                for i, element in enumerate(col_data):
                    if str(element) in [
                            str(''),
                            str(None),
                            str(False),
                            str(np.nan), 'NaN', 'nan', 'NA'
                    ]:
                        col_data[i] = None
                    else:
                        try:
                            col_data[i] = int(parseDate(element).timestamp())
                        except:
                            logging.warning(
                                'Could not convert string to date and it was expected, current value {value}'
                                .format(value=element))
                            col_data[i] = None

            if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
                newData = []

                for value in col_data:
                    if value != '' and value != '\r' and value != '\n':
                        newData.append(value)

                col_data = [
                    float(i) for i in newData if str(i) not in [
                        '',
                        str(None),
                        str(False),
                        str(np.nan), 'NaN', 'nan', 'NA'
                    ]
                ]

                y, x = np.histogram(col_data, 50, density=False)
                x = (x + np.roll(x, -1))[:-1] / 2.0
                x = x.tolist()
                y = y.tolist()

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)

                    inc_rate = 0.05
                    initial_step_size = abs(max_value - min_value) / 100

                    xp += [min_value]
                    i = min_value + initial_step_size

                    while i < max_value:

                        xp += [i]
                        i_inc = abs(i - min_value) * inc_rate
                        i = i + i_inc

                    # TODO: Solve inc_rate for N
                    #    min*inx_rate + (min+min*inc_rate)*inc_rate + (min+(min+min*inc_rate)*inc_rate)*inc_rate ....
                    #
                    #      x_0 = 0
                    #      x_i = (min+x_(i-1)) * inc_rate = min*inc_rate + x_(i-1)*inc_rate
                    #
                    #      sum of x_i_{i=1}^n (x_i) = max_value = inc_rate ( n * min + sum(x_(i-1)) )
                    #
                    #      mx_value/inc_rate = n*min + inc_rate ( n * min + sum(x_(i-2)) )
                    #
                    #     mx_value = n*min*in_rate + inc_rate^2*n*min + inc_rate^2*sum(x_(i-2))
                    #              = n*min(inc_rate+inc_rate^2) + inc_rate^2*sum(x_(i-2))
                    #              = n*min(inc_rate+inc_rate^2) + inc_rate^2*(inc_rate ( n * min + sum(x_(i-3)) ))
                    #              = n*min(sum_(i=1)^(i=n)(inc_rate^i))
                    #    =>  sum_(i=1)^(i=n)(inc_rate^i)) = max_value/(n*min(sum_(i=1)^(i=n))
                    #
                    # # i + i*x

                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                is_float = True if max(
                    [1 if int(i) != i else 0
                     for i in col_data]) == 1 else False

                col_stats = {
                    "column":
                    col_name,
                    KEYS.DATA_TYPE:
                    data_type,
                    # "distribution": best_fit_name,
                    # "distributionParams": distribution_params,
                    "mean":
                    mean,
                    "median":
                    median,
                    "variance":
                    var,
                    "skewness":
                    skew,
                    "kurtosis":
                    kurtosis,
                    "emptyColumns":
                    empty_count[col_name],
                    "emptyPercentage":
                    empty_count[col_name] / column_count[col_name] * 100,
                    "max":
                    max_value,
                    "min":
                    min_value,
                    "is_float":
                    is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets":
                    xp
                }
                stats[col_name] = col_stats
            # else if its text
            else:

                # see if its a sentence or a word
                is_full_text = True if data_type == DATA_TYPES.FULL_TEXT else False
                dictionary, histogram = self.getWordsDictionary(
                    col_data, is_full_text)

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(dictionary) / len(
                        col_data) * 100
                    # if the number of unique values is too large then treat it as text
                    if dictionary_lenght_percentage > 10 and len(
                            col_data) > 50 and is_full_text == False:
                        dictionary = []
                        dictionary_available = False
                col_stats = {
                    "column":
                    col_name,
                    KEYS.DATA_TYPE:
                    DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                    "dictionary":
                    dictionary,
                    "dictionaryAvailable":
                    dictionary_available,
                    "dictionaryLenghtPercentage":
                    dictionary_lenght_percentage,
                    "emptyColumns":
                    empty_count[col_name],
                    "emptyPercentage":
                    empty_count[col_name] / column_count[col_name] * 100,
                    "histogram":
                    histogram
                }
                stats[col_name] = col_stats

        total_rows = len(self.transaction.input_data.data_array)
        test_rows = len(self.transaction.input_data.test_indexes)
        validation_rows = len(self.transaction.input_data.validation_indexes)
        train_rows = len(self.transaction.input_data.train_indexes)

        self.transaction.persistent_model_metadata.column_stats = stats
        self.transaction.persistent_model_metadata.total_row_count = total_rows
        self.transaction.persistent_model_metadata.test_row_count = test_rows
        self.transaction.persistent_model_metadata.train_row_count = train_rows
        self.transaction.persistent_model_metadata.validation_row_count = validation_rows

        self.transaction.persistent_model_metadata.update()

        return stats
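
The sampleSize helper used above is not shown in this excerpt; a hedged sketch of the standard Cochran sample-size formula (with finite-population correction) that such a helper typically implements. The default arguments below are placeholders, not the project's CONFIG values.

from scipy import stats

def sample_size(population_size, margin_error=0.02, confidence_level=0.98, p=0.5):
    z = stats.norm.ppf(1 - (1 - confidence_level) / 2)   # two-sided z value
    n0 = (z ** 2) * p * (1 - p) / (margin_error ** 2)    # infinite-population size
    n = n0 / (1 + (n0 - 1) / population_size)            # finite-population correction
    return int(round(n))

print(sample_size(1_000_000), sample_size(500))
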
Example #56
0
index = {}    # mean/std feature index (used below; not initialized elsewhere in this snippet)
index_1 = {}

# loop over the image paths
for imagePath in imagePaths:
    # load the image and extract the filename
    image = cv2.imread(imagePath)
    filename = imagePath[imagePath.rfind("\\") + 1:]
    print(filename)
    #dinos/trex_01.png
    # extract the mean and standard deviation from each channel of the
    # BGR image, then update the index with the feature vector
    (means, stds) = cv2.meanStdDev(image)
    features = np.concatenate([means, stds]).flatten()
    index[filename] = features

    kurtosis_scipy = stats.kurtosis(image)
    skew_scipy = stats.skew(image)
    features1 = np.concatenate([skew_scipy, kurtosis_scipy]).flatten()
    index_1[filename] = features1

print(index["trex_01.png"])
print(index_1["trex_01.png"])

# display the query image and grab the sorted keys of the index dictionary
query = cv2.imread(imagePaths[0])
cv2.imshow("Query (trex_01.png)", query)
keys = sorted(index.keys())

# loop over the filenames in the dictionary
for (i, k) in enumerate(keys):
    # if this is the query image, ignore it
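
The comparison loop above is cut off in this excerpt; a hedged sketch of how the remaining ranking step is commonly done, sorting the stored feature vectors by Euclidean distance to the query (the helper name is hypothetical).

import numpy as np

def rank_by_distance(index, query_key):
    query_features = np.asarray(index[query_key])
    distances = {
        k: float(np.linalg.norm(np.asarray(v) - query_features))
        for k, v in index.items() if k != query_key      # skip the query image itself
    }
    return sorted(distances.items(), key=lambda kv: kv[1])

# e.g. rank_by_distance(index, "trex_01.png")
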
Example #57
0
    def run(self, input_data, modify_light_metadata, hmd=None, print_logs=True):
        """
        # Runs the stats generation phase
        # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
        # A lot of information about the data distribution and quality will  also be logged to the server in this phase
        """

        no_processes = multiprocessing.cpu_count() - 2
        if no_processes < 1:
            no_processes = 1
        pool = multiprocessing.Pool(processes=no_processes)

        if print_logs == False:
            self.log = logging.getLogger('null-logger')
            self.log.propagate = False

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(input_data.data_frame)

        if population_size < 50:
            sample_size = population_size
        else:
            sample_size = int(calculate_sample_size(population_size=population_size, margin_error=self.transaction.lmd['sample_margin_of_error'], confidence_level=self.transaction.lmd['sample_confidence_level']))
            #if sample_size > 3000 and sample_size > population_size/8:
            #    sample_size = min(round(population_size/8),3000)

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size), sample_size)
        self.log.info('population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size/population_size)*100))

        all_sampled_data = input_data.data_frame.iloc[input_data_sample_indexes]

        stats = {}
        col_data_dict = {}

        for col_name in all_sampled_data.columns.values:
            col_data = all_sampled_data[col_name].dropna()
            full_col_data = all_sampled_data[col_name]

            data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type(col_data, input_data.data_frame, col_name)


            if column_status == 'Column empty':
                if modify_light_metadata:
                    self.transaction.lmd['malformed_columns']['names'].append(col_name)
                    self.transaction.lmd['malformed_columns']['indices'].append(i)
                continue

            new_col_data = []
            if curr_data_subtype == DATA_SUBTYPES.TIMESTAMP: #data_type == DATA_TYPES.DATE:
                for element in col_data:
                    if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']:
                        new_col_data.append(None)
                    else:
                        try:
                            new_col_data.append(int(parse_datetime(element).timestamp()))
                        except:
                            self.log.warning(f'Could not convert string from col "{col_name}" to date and it was expected, instead got: {element}')
                            new_col_data.append(None)
                col_data = new_col_data
            if data_type == DATA_TYPES.NUMERIC or curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:
                histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
                x = histogram['x']
                y = histogram['y']

                col_data = StatsGenerator.clean_int_and_date_data(col_data)
                # This means the column is all nulls, which we don't handle at the moment
                if len(col_data) < 1:
                    return None

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)


                    inc_rate = 0.1
                    initial_step_size = abs(max_value-min_value)/100

                    xp += [min_value]
                    i = min_value + initial_step_size

                    while i < max_value:

                        xp += [i]
                        i_inc = abs(i-min_value)*inc_rate
                        i = i + i_inc
                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                is_float = True if max([1 if int(i) != i else 0 for i in col_data]) == 1 else False

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": xp
                }
            elif data_type == DATA_TYPES.CATEGORICAL or curr_data_subtype == DATA_SUBTYPES.DATE:
                histogram, _ = StatsGenerator.get_histogram(input_data.data_frame[col_name], data_type=data_type, data_subtype=curr_data_subtype)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "histogram": histogram,
                    "percentage_buckets": histogram['x']
                }

            elif curr_data_subtype == DATA_SUBTYPES.IMAGE:
                histogram, percentage_buckets = StatsGenerator.get_histogram(col_data, data_subtype=curr_data_subtype)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    'percentage_buckets': percentage_buckets,
                    'histogram': histogram
                }

            # @TODO This is probably wrong, look into it a bit later
            else:
                # see if its a sentence or a word
                histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
                dictionary = list(histogram.keys())

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(
                        dictionary) / len(col_data) * 100
                    # if the number of unique values is too large then treat it as text
                    is_full_text = True if curr_data_subtype == DATA_SUBTYPES.TEXT else False
                    if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text==False:
                        dictionary = []
                        dictionary_available = False

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "histogram": histogram
                }
            stats[col_name] = col_stats
            stats[col_name]['data_type_dist'] = data_type_dist
            stats[col_name]['data_subtype_dist'] = data_subtype_dist
            stats[col_name]['column'] = col_name

            empty_count = len(full_col_data) - len(col_data)

            stats[col_name]['empty_cells'] = empty_count
            stats[col_name]['empty_percentage'] = empty_count * 100 / len(full_col_data)
            if 'separator' in additional_info:
                stats[col_name]['separator'] = additional_info['separator']
            col_data_dict[col_name] = col_data

        for col_name in all_sampled_data.columns:
            if col_name in self.transaction.lmd['malformed_columns']['names']:
                continue

            # Use the multiprocessing pool for computing scores which take a very long time to compute
            # For now there's only one and computing it takes way too long, so this is not enabled
            scores = []

            '''
            scores.append(pool.apply_async(compute_clf_based_correlation_score, args=(stats, all_sampled_data, col_name)))
            '''
            for score_promise in scores:
                # Wait for function on process to finish running
                score = score_promise.get()
                stats[col_name].update(score)

            for score_func in [compute_duplicates_score, compute_empty_cells_score, compute_data_type_dist_score, compute_z_score, compute_lof_score, compute_similariy_score, compute_value_distribution_score]:
                start_time = time.time()
                if 'compute_z_score' in str(score_func) or 'compute_lof_score' in str(score_func):
                    stats[col_name].update(score_func(stats, col_data_dict, col_name))
                else:
                    stats[col_name].update(score_func(stats, all_sampled_data, col_name))

                fun_name = str(score_func)
                run_duration = round(time.time() - start_time, 2)
                #print(f'Running scoring function "{fun_name}" took {run_duration} seconds !')

            stats[col_name].update(compute_consistency_score(stats, col_name))
            stats[col_name].update(compute_redundancy_score(stats, col_name))
            stats[col_name].update(compute_variability_score(stats, col_name))

            stats[col_name].update(compute_data_quality_score(stats, col_name))

        total_rows = len(input_data.data_frame)

        if modify_light_metadata:
            self.transaction.lmd['column_stats'] = stats

            self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']

            self.transaction.lmd['data_preparation']['total_row_count'] = total_rows
            self.transaction.lmd['data_preparation']['used_row_count'] = sample_size
            self.transaction.lmd['data_preparation']['test_row_count'] = len(input_data.test_indexes[KEY_NO_GROUP_BY])
            self.transaction.lmd['data_preparation']['train_row_count'] = len(input_data.train_indexes[KEY_NO_GROUP_BY])
            self.transaction.lmd['data_preparation']['validation_row_count'] = len(input_data.validation_indexes[KEY_NO_GROUP_BY])

        pool.close()
        pool.join()

        self._log_interesting_stats(stats)

        return stats
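
Both stats-generation examples above build percentage_buckets with the same geometric stepping loop; a self-contained sketch of just that bucket construction, isolated as a small function (the name is a placeholder).

def percentage_buckets(min_value, max_value, inc_rate=0.1):
    xp = [min_value]
    step = abs(max_value - min_value) / 100        # initial step size
    i = min_value + step
    while i < max_value:
        xp.append(i)
        i += abs(i - min_value) * inc_rate         # steps grow geometrically away from the minimum
    return xp

buckets = percentage_buckets(0.0, 100.0)
print(len(buckets), buckets[:5])
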
Example #58
0
def dict_learning(X, dictionary=None, P_cum=None, eta=0.02, n_dictionary=2, l0_sparseness=10, fit_tol=None, n_iter=100,
                       eta_homeo=0.01, alpha_homeo=0.02,
                       batch_size=100, record_each=0, record_num_batches = 1000, verbose=False,
                       method='mp', C=0., nb_quant=100, do_sym=True, random_state=None):
    """
    Solves a dictionary learning matrix factorization problem online.

    Finds the best dictionary and the corresponding sparse code for
    approximating the data matrix X by solving the optimization problem::

        (U^*, V^*) = argmin_{(U,V)} 0.5 || X - V^T * U ||_2^2
                                    + alpha * S( U )
                                    + alpha_homeo * H(V)

                     s. t. || U ||_0 = k

                    where S is a sparse representation cost,
                    and H a homeostatic representation cost.

    where V is the dictionary and U is the sparse code. This is
    accomplished by repeatedly iterating over mini-batches by slicing
    the input data.

    For instance,

        H(V) = \sum_{0 <= k < n_dictionary} (|| V_k ||_2^2 -1)^2

    Parameters
    ----------
    X: array of shape (n_samples, n_pixels)
        Data matrix.

    n_dictionary : int,
        Number of dictionary atoms to extract.

    eta : float
        Gives the learning rate for the dictionary update.

    n_iter : int,
        total number of iterations to perform

    eta_homeo : float
        Gives the learning parameter for the homeostatic gain.

    alpha_homeo : float
        Gives the smoothing exponent for the homeostatic gain.
        If equal to 1, the homeostatic learning rule learns a linear relation to
        variance.
        If equal to zero, we use COMP.

    nb_quant : int,
        number of bins for the quantification used in the homeostasis

    C : float
        characteristic scale for the quantization.
        Use C=0. to have an adaptive scaling.

    dictionary : array of shape (n_dictionary, n_pixels),
        initial value of the dictionary for warm restart scenarios

    method : {'mp', 'omp', 'comp', 'lars', 'cd'}
        see sparse_encode

    batch_size : int,
        The number of samples to take in each batch.

    l0_sparseness : int, ``0.1 * n_pixels`` by default
        Number of nonzero coefficients to target in each column of the
        solution. This is only used by `algorithm='lars'`, `algorithm='mp'`  and
        `algorithm='omp'`.

    fit_tol : float, 1. by default
        If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `fit_tol` is the
        penalty applied to the L1 norm.
        If `algorithm='threshold'`, `fit_tol` is the absolute value of the
        threshold below which coefficients will be squashed to zero.
        If `algorithm='mp'` or `algorithm='omp'`, `fit_tol` is the tolerance
        parameter: the value of the reconstruction error targeted. In this case,
        it overrides `l0_sparseness`.

    record_each : int
        if set to 0, it does nothing. Otherwise, every ``record_each`` steps it
        records statistics from the learning phase (variance and kurtosis of the
        coefficients).

    record_num_batches : int
        number of batches used to compute the statistics (if -1, uses the whole training set)

    verbose :
        degree of verbosity of the printed output

    Returns
    -------

    dictionary : array of shape (n_dictionary, n_pixels),
        the solutions to the dictionary learning problem

    """

    if record_each>0:
        import pandas as pd
        record = pd.DataFrame()

    if n_dictionary is None:
        n_dictionary = X.shape[1]

    t0 = time.time()
    n_samples, n_pixels = X.shape

    if dictionary is None:
        dictionary = np.random.randn(n_dictionary, n_pixels)
    norm = np.sqrt(np.sum(dictionary**2, axis=1))
    dictionary /= norm[:, np.newaxis]
    norm = np.sqrt(np.sum(dictionary**2, axis=1))

    if verbose == 1:
        print('[dict_learning]', end=' ')

    # print(alpha_homeo, eta_homeo, alpha_homeo==0, eta_homeo==0, alpha_homeo==0 or eta_homeo==0, 'P_cum', P_cum)

    # splits the whole dataset into batches
    n_batches = n_samples // batch_size
    X_train = X.copy()
    np.random.shuffle(X_train)
    batches = np.array_split(X_train, n_batches)

    if alpha_homeo==0:
        # do the equalitarian homeostasis
        if P_cum is None:
            P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((n_dictionary, 1))
            if C == 0.:
                # initialize the rescaling vector
                from shl_scripts.shl_encode import get_rescaling
                corr = (batches[0] @ dictionary.T)
                C_vec = get_rescaling(corr, nb_quant=nb_quant, do_sym=do_sym, verbose=verbose)
                # and stack it to P_cum array for convenience
                P_cum = np.vstack((P_cum, C_vec))
    else:
        # classical homeostasis: a gain derived from the variance of the coefficients
        gain = np.ones(n_dictionary)
        mean_var = np.ones(n_dictionary)
        P_cum = None

    import itertools
    # Return elements from list of batches until it is exhausted. Then repeat the sequence indefinitely.
    batches = itertools.cycle(batches)
    # cycle over all batches
    for ii, this_X in zip(range(n_iter), batches):
        dt = (time.time() - t0)
        if verbose > 0:
            if ii % int(n_iter//verbose + 1) == 0:
                print("Iteration % 3i /  % 3i (elapsed time: % 3is, % 4.1fmn)"
                      % (ii, n_iter, dt, dt//60))

        # Sparse coding
        sparse_code = sparse_encode(this_X, dictionary, algorithm=method, fit_tol=fit_tol,
                                  P_cum=P_cum, C=C, do_sym=do_sym, l0_sparseness=l0_sparseness)

        # Update dictionary
        residual = this_X - sparse_code @ dictionary
        residual /= n_dictionary  # divide by the number of dictionary atoms
        dictionary += eta * sparse_code.T @ residual

        # homeostasis
        # re-normalize each dictionary atom to unit L2 norm
        norm = np.sqrt(np.sum(dictionary**2, axis=1))
        dictionary /= norm[:, np.newaxis]

        if eta_homeo > 0.:
            if P_cum is None:
                # Update and apply gain
                mean_var = update_gain(mean_var, sparse_code, eta_homeo, verbose=verbose)
                gain = mean_var**alpha_homeo
                gain /= gain.mean()
                dictionary /= gain[:, np.newaxis]
            else:
                if C == 0.:
                    corr = (this_X @ dictionary.T)
                    C_vec = get_rescaling(corr, nb_quant=nb_quant, do_sym=do_sym, verbose=verbose)
                    P_cum[-1, :] = (1 - eta_homeo) * P_cum[-1, :] + eta_homeo * C_vec
                    P_cum[:-1, :] = update_P_cum(P_cum=P_cum[:-1, :],
                                                 code=sparse_code, eta_homeo=eta_homeo,
                                                 C=P_cum[-1, :], nb_quant=nb_quant, do_sym=do_sym,
                                                 verbose=verbose)
                else:
                    P_cum = update_P_cum(P_cum, sparse_code, eta_homeo,
                                         nb_quant=nb_quant, verbose=verbose, C=C, do_sym=do_sym)

        if record_each > 0:
            if ii % int(record_each) == 0:
                from scipy.stats import kurtosis
                indx = np.random.permutation(X_train.shape[0])[:record_num_batches]
                sparse_code_rec = sparse_encode(X_train[indx, :], dictionary, algorithm=method, fit_tol=fit_tol,
                                          P_cum=P_cum, do_sym=do_sym, C=C, l0_sparseness=l0_sparseness)
                # calculation of relative entropy
                # fraction of samples in which each atom is active, normalized to a probability
                p = np.count_nonzero(sparse_code_rec, axis=0) / sparse_code_rec.shape[0]
                p /= p.sum()
                # relative entropy of atom usage (1 = all atoms used equally often)
                p_nz = p[p > 0]
                rel_ent = np.sum(-p_nz * np.log(p_nz)) / np.log(sparse_code_rec.shape[1])
                error = np.linalg.norm(X_train[indx, :] - sparse_code_rec @ dictionary)/record_num_batches

                record_one = pd.DataFrame([{'kurt': kurtosis(sparse_code_rec, axis=0),
                                            'prob_active': np.mean(np.abs(sparse_code_rec) > 0, axis=0),
                                            'var': np.mean(sparse_code_rec**2, axis=0),
                                            'error': error,
                                            'entropy': rel_ent}],
                                            index=[ii])
                record = pd.concat([record, record_one])

    if verbose > 1:
        print('Learning code...', end=' ')
    elif verbose == 1:
        print('|', end=' ')

    if verbose > 1:
        dt = (time.time() - t0)
        print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))

    if record_each == 0:
        return dictionary, P_cum
    else:
        return dictionary, P_cum, record
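
Each row of the returned `record` DataFrame stores simple reductions over the matrix of sparse coefficients. The sketch below reproduces those computations on a synthetic sparse code, independently of the learning loop above; the shape of the code matrix and the Laplacian stand-in for the coefficients are illustrative assumptions only.

import numpy as np
from scipy.stats import kurtosis

# synthetic stand-in: 200 samples coded over 16 dictionary atoms (assumed shapes)
rng = np.random.default_rng(0)
sparse_code = rng.laplace(size=(200, 16))
sparse_code[np.abs(sparse_code) < 1.] = 0.              # enforce sparsity

kurt = kurtosis(sparse_code, axis=0)                    # per-atom kurtosis of the coefficients
prob_active = np.mean(np.abs(sparse_code) > 0, axis=0)  # fraction of samples using each atom
var = np.mean(sparse_code**2, axis=0)                   # per-atom energy of the coefficients

# relative entropy of atom usage: 1 means all atoms are used equally often
p = prob_active / prob_active.sum()
p = p[p > 0]
rel_ent = np.sum(-p * np.log(p)) / np.log(sparse_code.shape[1])

print(kurt.round(2), prob_active.round(2), var.round(2), rel_ent)
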
Example #59
0
from scipy.stats import kurtosis

if __name__ == "__main__":
    a = [
        -6, -6, -34, -10, -2, 6, 26, 6, -6, 22, 2, 26, -26, 10, 10, 10, -2, 30,
        -14, -14, 10, -22, -2, -14, 6, -6, -14, -10, -10, 6, -10, 2, 14, -10,
        -34, 2, -18, -6, -2, 22, -30, 18, 10, -22, -2, 18, -34, 6, 10, 10, 18,
        -22, 2, 2, 6, 6, 14, 6, -14, 2, 14, -2, 10, 22, 22, -2, 10, -14, 6, 6,
        6, 10, 14, -14, 10, 30, -42, 10, 2, 6, -22, 22, -14, 2, 30, 2, -18, 2,
        26, -6, -18, 30, 18, -2, 2, 10, -10, 2, 10, -6, 26, -38, -10, -2, -18,
        38, 2, 30, -14, -18, -26, -10, 6, 2, 14, -6, 2, -18, -18, -2, 2, 14,
        10, -14, 10, -34, -22, 2, 18, -14, 42, -18, -14, -10, 6, 14, 10, 14,
        18, 14, -2, 18, 38, 6, 22, -18, -10, 2, 6, 6, 14, 2, -18, -14, 2, -18,
        14, -6, 26, 2, -10, -14, 2, -6, 10, 18, -30, -10, -26, -2, -6, 14, 10,
        -14, 6, -14, -14, 6, -18, -30, 42, -6, -6, -6, -10, -2, 18, 14, -14,
        -2, 14, -30, 2, 14, 10, 2, 14, 2, -6, -14, -6, 6, -2, -14, 22, -10, -6,
        14, -10, -18, 18, -2, -6, 18, 2, 26, 2, 14, -10, -2, -2, -2, 14, 30,
        -2, 2, -18, 6, -18, -14, -18, 10, 18, -2, -30, 14, -10, 6, 2, 2, -2,
        10, -34, -14, -18, 22, -10, -10, 2, -10, 26, -2, 2, -18, -14, 26, -6
    ]

    print(kurtosis(a))  # Fisher (excess) kurtosis by default
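
For reference, `scipy.stats.kurtosis` returns the Fisher (excess) kurtosis by default; passing `fisher=False` switches to the Pearson definition, which is larger by exactly 3. A minimal, self-contained check (the sample below is arbitrary):

from scipy.stats import kurtosis

sample = [1, 2, 3, 4, 5, 10]
excess = kurtosis(sample)                 # Fisher definition (default): 0 for a normal distribution
pearson = kurtosis(sample, fisher=False)  # Pearson definition: 3 for a normal distribution
assert abs(pearson - (excess + 3)) < 1e-12
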
# NOTE: the imports and the data-loading line below are assumptions added so that this
# snippet is self-contained; the original data file name is not given in the source.
import numpy as np
import pylab
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew, kurtosis

sal_churn = pd.read_csv("salary_churn.csv")  # hypothetical file with columns Salary_hike and Churn_out_rate

sal_churn.shape

# Measures of Central Tendency
np.mean(sal_churn)                # column-wise means
np.median(sal_churn.Salary_hike)
np.median(sal_churn.Churn_out_rate)

# Measures of Dispersion
np.var(sal_churn)                 # column-wise variances
np.std(sal_churn)                 # column-wise standard deviations

# Skewness and Kurtosis
skew(sal_churn.Salary_hike)
skew(sal_churn.Churn_out_rate)

kurtosis(sal_churn.Salary_hike)
kurtosis(sal_churn.Churn_out_rate)


x = np.array(sal_churn.Salary_hike)
y = np.array(sal_churn.Churn_out_rate)

# Line plots of the two variables
plt.plot(sal_churn.Salary_hike)
plt.plot(sal_churn.Churn_out_rate)

plt.plot(sal_churn); plt.legend(['Salary_hike', 'Churn_out_rate'])

# Normal Q-Q plots
stats.probplot(x, dist='norm', plot=pylab)
stats.probplot(y, dist='norm', plot=pylab)
plt.show()
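
As a point of comparison for interpreting the values above, the sketch below applies the same functions to synthetic normally distributed data (used here instead of `sal_churn`, whose data file is not included): skewness and excess kurtosis come out close to zero, and the Q-Q plot points fall close to a straight line.

import numpy as np
import pylab
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew, kurtosis

rng = np.random.default_rng(0)
normal_sample = rng.normal(loc=100, scale=10, size=1000)

print(skew(normal_sample))      # close to 0 for a symmetric sample
print(kurtosis(normal_sample))  # excess kurtosis, close to 0 for a normal sample

stats.probplot(normal_sample, dist='norm', plot=pylab)  # points fall near a straight line
plt.show()
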