def test_kurtosis(self):
    self.assertEqual(bc.helper.tools.kurtosis([]), None)
    self.assertAlmostEqual(bc.helper.tools.kurtosis([1, 2, 3, 4, 5]),
                           stats.kurtosis([1, 2, 3, 4, 5], fisher=False))
    self.assertAlmostEqual(bc.helper.tools.kurtosis([1, 6, 6, 6, 9, 17]),
                           stats.kurtosis([1, 6, 6, 6, 9, 17], fisher=False))
    self.assertAlmostEqual(bc.helper.tools.kurtosis(self.list_1),
                           stats.kurtosis(self.list_1, fisher=False))
    self.assertAlmostEqual(bc.helper.tools.kurtosis(self.list_2),
                           stats.kurtosis(self.list_2, fisher=False))
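# The implementation of bc.helper.tools.kurtosis is not shown in this collection;
# the assertions above imply it returns None for an empty sequence and the Pearson
# (non-excess) kurtosis otherwise. A minimal stand-in consistent with those checks
# (a sketch, not the actual helper) could be:
import numpy as np

def pearson_kurtosis(values):
    """Population (Pearson) kurtosis m4 / m2**2, or None for empty input.

    Matches scipy.stats.kurtosis(values, fisher=False) on non-empty data.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return None
    centered = values - values.mean()
    m2 = np.mean(centered ** 2)
    m4 = np.mean(centered ** 4)
    return m4 / m2 ** 2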
def best_rp_nba(self): dh = data_helper() X_train, X_test, y_train, y_test = dh.get_nba_data() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) rp = GaussianRandomProjection(n_components=X_train_scl.shape[1]) X_train_transformed = rp.fit_transform(X_train_scl, y_train) X_test_transformed = rp.transform(X_test_scl) ## top 2 kurt = kurtosis(X_train_transformed) i = kurt.argsort()[::-1] X_train_transformed_sorted = X_train_transformed[:, i] X_train_transformed = X_train_transformed_sorted[:,0:2] kurt = kurtosis(X_test_transformed) i = kurt.argsort()[::-1] X_test_transformed_sorted = X_test_transformed[:, i] X_test_transformed = X_test_transformed_sorted[:,0:2] # save filename = './' + self.save_dir + '/nba_rp_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_rp_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_rp_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_rp_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_ica_wine(self): dh = data_helper() X_train, X_test, y_train, y_test = dh.get_wine_data() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ica = FastICA(n_components=X_train_scl.shape[1]) X_train_transformed = ica.fit_transform(X_train_scl, y_train) X_test_transformed = ica.transform(X_test_scl) ## top 2 kurt = kurtosis(X_train_transformed) i = kurt.argsort()[::-1] X_train_transformed_sorted = X_train_transformed[:, i] X_train_transformed = X_train_transformed_sorted[:,0:2] kurt = kurtosis(X_test_transformed) i = kurt.argsort()[::-1] X_test_transformed_sorted = X_test_transformed[:, i] X_test_transformed = X_test_transformed_sorted[:,0:2] # save filename = './' + self.save_dir + '/wine_ica_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_ica_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_ica_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_ica_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def test_kurtosis(): """ test equation of kurtosis in scipy """ n = 100 x = np.random.rand(n) # biased estimator b_kurt = spstats.kurtosis(x, bias=True) k4 = sum((v-x.mean())**4 for v in x)/n k2 = sum((v-x.mean())**2 for v in x)/n b_kurt2 = k4/k2**2 - 3 print ("biased kurtosis:", b_kurt2) np.testing.assert_allclose(b_kurt, b_kurt2) # unbiased estimator ub_kurt = spstats.kurtosis(x, bias=False) k4 = sum((v-x.mean())**4 for v in x)/n k2 = sum((v-x.mean())**2 for v in x)/n ub_kurt2 = 1.0/(n-2)/(n-3) * ((n**2-1.0)*k4/k2**2.0 - 3*(n-1)**2.0) print ("ubbiased kurtosis:", ub_kurt2) k2 = sum((v-x.mean())**2 for v in x)/(n-1) ub_kurt3 = 1/(n-2)/(n-3) *((n**2-1.0)*(n/(n-1))**2. *k4/k2**2.0- 3*( n-1)**2.0) print ("ubbiased kurtosis:", ub_kurt3) np.testing.assert_allclose(ub_kurt, ub_kurt2) np.testing.assert_allclose(ub_kurt, ub_kurt3)
def calc_rejection(self, trg_data, W, H, H2):
    diffs = np.zeros(H2.shape[1])
    for c in range(self.src.num_cluster):
        inds = np.where(self.cluster_labels == c)[0]
        if inds.size > 0:
            min_h2 = np.min(H[:, inds])
            max_h2 = np.max(H[:, inds])
            # min-max normalize before taking the per-column range
            foo = (H[:, inds] - min_h2) / (max_h2 - min_h2)
            foo = np.max(foo, axis=0) - np.min(foo, axis=0)
            diffs[inds] = foo

    kurts = stats.kurtosis(H, fisher=False, axis=0)
    K1 = trg_data.T.dot(trg_data)
    K2 = W.dot(H).T.dot(W.dot(H))
    K3 = W.dot(H2).T.dot(W.dot(H2))

    reject = list()
    reject.append(('Kurtosis', stats.kurtosis(H, fisher=False, axis=0)))
    reject.append(('Entropy', -stats.entropy(H)))
    reject.append(('KTA kurt1', self.reject_classifier(K1, diffs)))
    reject.append(('KTA kurt2', self.reject_classifier(K2, kurts)))
    reject.append(('KTA kurt3', self.reject_classifier(K3, kurts)))
    reject.append(('Diffs', diffs))
    reject.append(('Dist L2 H', -np.sum(np.abs(trg_data - W.dot(H))**2., axis=0)))
    reject.append(('Dist L2 H2', -np.sum(np.abs(trg_data - W.dot(H2))**2., axis=0)))
    reject.append(('Dist L1 H', -np.sum(np.abs(trg_data - W.dot(H)), axis=0)))
    reject.append(('Dist L1 H2', -np.sum(np.abs(trg_data - W.dot(H2)), axis=0)))
    return reject
def smerodatna_odchylka(data, min=0, max=0, plot=True):
    #
    # Computes the standard deviation. If min and max are not set, it is
    # computed over the whole array; otherwise only values between min and
    # max are considered.
    #
    # in  'data' - array with the data
    # in  'min'  - minimum value in the array to consider
    # in  'max'  - maximum value in the array to consider
    # in  'plot' - whether to draw the plot
    #
    # out 'out'  - standard deviation
    #
    data = np.array(data)
    if min == 0 and max == 0:
        average = np.mean(data)
        median = np.median(data)
        standardDeviation = np.std(data)
        kurtosis = stats.kurtosis(data)
        skewness = stats.skew(data)
    else:
        crop = np.array([])
        for x in data:
            if min < x < max:
                crop = np.append(crop, x)
        average = np.mean(crop)
        # modus = stats.mode(crop)
        # modus = statistics.mode(crop) !!!!!
        median = np.median(crop)
        standardDeviation = np.std(crop)
        kurtosis = stats.kurtosis(crop)
        skewness = stats.skew(crop)
    if plot:
        plt.figure()
        plt.axvspan(float(min), float(max), alpha=0.3, color='k')
        plt.axvspan(average-standardDeviation, average+standardDeviation, alpha=0.4, color='b')
        plt.axvspan(average+standardDeviation, average+standardDeviation+standardDeviation, alpha=0.4, color='r')
        plt.axvspan(average-standardDeviation, average-standardDeviation-standardDeviation, alpha=0.4, color='r')
        plt.axvline(x=median, linewidth=2, color='r')
        plt.axvline(x=average, linewidth=2, color='g')
        #plt.axvline(x=modus[0], linewidth=2, color='b')
        # Sturges' rule; the bin count must be an integer
        plt.hist(data, int(1.0+3.3*math.log(np.shape(data)[0])), facecolor='green', alpha=0.75)
        plt.text(average, 10, "std: " + str(standardDeviation), bbox={'facecolor': 'green', 'alpha': 0.75, 'pad': 10})
        plt.show(block=False)
    print "___________________________________________________________"
    print "selected values from ", float(min), " to ", float(max)
    print "mean: ", average
    print "median: ", median
    print "standard deviation: ", standardDeviation
    print "kurtosis: ", kurtosis
    print "skewness: ", skewness
    return standardDeviation
def test_unbiased_HMM(precision=2): n_rv, n_sample = 10, 100 n_scenario = 500 data = np.random.rand(n_rv, n_sample) # original statistics tgt_moments = np.zeros((n_rv, 4)) tgt_moments[:, 0] = data.mean(axis=1) tgt_moments[:, 1] = data.std(axis=1, ddof=1) tgt_moments[:, 2] = spstats.skew(data, axis=1, bias=False) tgt_moments[:, 3] = spstats.kurtosis(data, axis=1, bias=False) tgt_corrs = np.corrcoef(data) t0 = time() py_scenarios = HMM(tgt_moments, tgt_corrs, n_scenario, bias=False) print ("python unbiased HMM (n_rv, n_scenario):({}, {}) {:.4f} secs".format( n_rv, n_scenario, time()-t0)) t1 = time() c_scenarios = c_HMM(tgt_moments, tgt_corrs, n_scenario, bias=False) print ("c unbiased HMM (n_rv, n_scenario):({}, {}) {:.4f} secs".format( n_rv, n_scenario, time()-t1)) for scenarios in (py_scenarios, c_scenarios): # scenarios statistics res_moments = np.zeros((n_rv, 4)) res_moments[:, 0] = scenarios.mean(axis=1) res_moments[:, 1] = scenarios.std(axis=1, ddof=1) res_moments[:, 2] = spstats.skew(scenarios, axis=1, bias=False) res_moments[:, 3] = spstats.kurtosis(scenarios, axis=1, bias=False) res_corrs = np.corrcoef(scenarios) np.testing.assert_array_almost_equal(tgt_moments, res_moments, precision) np.testing.assert_array_almost_equal(tgt_corrs, res_corrs, precision)
def main(): start_time = time.time() files = [DATA_DIR + file for file in os.listdir(DATA_DIR) if fnmatch.fnmatch(file, '*.csv')] bad_codes = [0, 7, 8, 9] kt = [] cong = list(range(102, 114)) for i in files: print('processing', i, '...') table, dem, rep = get_data(i) # for j in bad_codes: # table[table == j] = np.nan # dem[dem == j] = np.nan # rep[rep == j] = np.nan total_pol = 10-np.mean(sp.kurtosis(table, fisher=True, nan_policy='omit')) dem_pol = 10-np.mean(sp.kurtosis(dem, fisher=True, nan_policy='omit')) rep_pol = 10-np.mean(sp.kurtosis(rep, fisher=True, nan_policy='omit')) print('total polarization:', total_pol) print('democrat only polarization:', dem_pol) print('republican only polarization:', rep_pol, '\n') kt.append(total_pol) plt.plot(cong, kt) plt.title('Polarization timeline (original data)') plt.xlabel('x-th Congress') plt.ylabel('10 - kurtosis') # plt.show() plt.savefig('polarization.pdf') print('time taken:', time.time()-start_time, 'seconds')
def getstats_base(X, linds): sval = {} for l_ind in linds: print(l_ind, X['model_state']['layers'][l_ind]['name']) layer = X['model_state']['layers'][l_ind] w = layer['weights'][0] karray = stats.kurtosis(w) kall = stats.kurtosis(w.ravel()) cf0 = np.corrcoef(w) cf0t = np.corrcoef(w.T) wmean = w.mean(1) w2mean = (w**2).mean(1) lname = X['model_state']['layers'][l_ind]['name'] sval[lname] = {'karray': karray, 'kall': kall, 'corr0': cf0, 'corr0_t': cf0t, 'wmean': wmean, 'w2mean': w2mean} if 'filterSize' in X['model_state']['layers'][l_ind]: fs = X['model_state']['layers'][l_ind]['filterSize'][0] ws = w.shape w = w.reshape((ws[0] / (fs**2), fs, fs, ws[1])) mat = np.row_stack([np.row_stack([w[i, j, :, :] for i in range(w.shape[0])]).T for j in range(w.shape[1])] ) cf = np.corrcoef(mat.T) cft = np.corrcoef(mat) mat2 = np.row_stack([np.row_stack([w[i, :, :, j] for i in range(w.shape[0])]).T for j in range(w.shape[3])] ) cf2 = np.corrcoef(mat2.T) cf2t = np.corrcoef(mat2) sval[lname].update({'corr': cf, 'corr2': cf2, 'corr_t': cft, 'corr2_t': cf2t}) return sval
def test_kurtosis(self): # Using the scipy.stats definition which is optimized and unittested data = [[0,1,2,3,4,45,18,56,24,56], [1,1,1,1,56,78,23,23]] expt = [] expt.append(stats.kurtosis(data[0])) expt.append(stats.kurtosis(data[1])) resulting_vals = kurtosis(data) self.assertTrue(np.array_equal(np.array(expt), np.array(resulting_vals)))
def print_kurtosis(scaled_data):
    # print the kurtosis of the scaled data
    print "Kurtosis of original DF:", kurtosis(scaled_data)

    # print the kurtosis of the ICA-transformed columns
    for i in range(1, len(scaled_data[0]) + 1):
        ica = FastICA(n_components=i)
        ica_fit = ica.fit_transform(scaled_data)
        print "Kurtosis of ICA transformed data when i=" + str(i) + ":", kurtosis(ica_fit)
def test_kurtosis(self): for n in self.get_n(): x, y, xm, ym = self.generate_xy_sample(n) r = stats.kurtosis(x) rm = stats.mstats.kurtosis(xm) assert_almost_equal(r, rm, 10) r = stats.kurtosis(y) rm = stats.mstats.kurtosis(ym) assert_almost_equal(r, rm, 10)
def stats_plots(V, labelsin, title=None):
    """
    4 plots of basic statistical properties. IC = intraclass correlation,
    or the noise sources between the groups.
    """
    import scipy.stats as stats
    colors = ['darkkhaki', 'royalblue', 'forestgreen', 'tomato']
    var = [np.var(i) for i in V]
    skew = [stats.skew(i) for i in V]
    kurt = [stats.kurtosis(i) for i in V]
    uniq = list(set(labelsin))
    v_sort = [[] for u in uniq]   # Make a blank list, preparing for IC
    v_means = [[] for u in uniq]  # v_means is a list of lists of means for each cell of each type
    v_var, v_skew, v_kurt = [[] for u in uniq], [[] for u in uniq], [[] for u in uniq]
    for v in range(len(V)):
        i = uniq.index(labelsin[v])
        v_sort[i].append(V[v])
        v_means[i].append(np.mean(V[v]))
        v_var[i].append(np.var(V[v]))
        v_skew[i].append(stats.skew(V[v]))
        v_kurt[i].append(stats.kurtosis(V[v]))
    # ic = var_between^2 / (var_between^2 + var_within^2)
    ic = []
    for v in range(len(uniq)):
        I = np.var(v_means[v])**2 / \
            (np.var(v_means[v])**2 + sum([np.var(i) for i in v_sort[v]])**2)
        ic.append([I])
    print(ic)
    group_means = [np.mean(k) for k in v_means]  # group_means are the master means (only 4)
    master_ic = np.var(group_means)**2 / \
        (np.var(group_means)**2 + sum([np.var(i) for i in v_means])**2)
    print('Master IC for this set: %.5f' % master_ic)
    ## Plotting stuff
    fig = plt.figure()
    axs = [fig.add_subplot(221), fig.add_subplot(222),
           fig.add_subplot(223), fig.add_subplot(224)]
    titles = ['Variance', 'Skew', 'Kurtosis', 'Intraclass correlation']
    plot_vars = [v_var, v_skew, v_kurt, ic]
    for a in axs:  # For each plot
        for u in range(len(uniq)):  # For each cell type
            a.scatter(np.ones(len(plot_vars[axs.index(a)][u]))*u,
                      plot_vars[axs.index(a)][u],
                      c=colors[u], s=80, edgecolor='k', alpha=0.6)
        if axs.index(a) == 3:
            a.set_yticks([0, 0.12, 0.24])
        else:
            a.locator_params(axis='y', nbins=4)
        a.set_xticks([])
        a.set_title(titles[axs.index(a)])
    # Legend and title
    #patches = [mpatches.Patch(color=colors[u], label=uniq[u]) for u in range(len(uniq))]
    #plt.legend(handles=patches, loc=5)
    if title is not None:
        plt.suptitle(title, fontsize=20)
    plt.show()
def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None): """ Returns the confidence interval for kurtosis. Parameters ---------- sig : float The significance level. Default is .05 upper_bound : float Maximum value of kurtosis the upper limit can be. Default is .99 confidence limit assuming normality. lower_bound : float Minimum value of kurtosis the lower limit can be. Default is .99 confidence limit assuming normality. Returns -------- Interval : tuple Lower and upper confidence limit Notes ----- For small n, upper_bound and lower_bound may have to be provided by the user. Consider using test_kurt to find values close to the desired significance level. If function returns f(a) and f(b) must have different signs, consider expanding the bounds. """ endog = self.endog nobs = self.nobs if upper_bound is None: upper_bound = kurtosis(endog) + \ (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \ ((nobs - 2.) * (nobs + 1.) * \ (nobs + 3.))) ** .5) * \ (((nobs ** 2.) - 1.) / ((nobs - 3.) *\ (nobs + 5.))) ** .5) if lower_bound is None: lower_bound = kurtosis(endog) - \ (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \ ((nobs - 2.) * (nobs + 1.) * \ (nobs + 3.))) ** .5) * \ (((nobs ** 2.) - 1.) / ((nobs - 3.) *\ (nobs + 5.))) ** .5) self.r0 = chi2.ppf(1 - sig, 1) llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \ kurtosis(endog)) ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \ upper_bound) return llim, ulim
def kurtosis_da(resp): dims = resp.coords.dims if ('x' in resp) and ('y' in dims): resp = resp.transpose('unit', 'shapes', 'x', 'y') elif ('x' in dims): resp = resp.transpose('unit', 'shapes', 'x') elif ('y' in dims): resp = resp.transpose('unit', 'shapes', 'y') stim_resp = np.array([(unit**2).sum((1, 2)) for unit in resp.values]) pos_resp = np.array([(unit**2).sum(0).ravel() for unit in resp.values]) k_stim = kurtosis(stim_resp, axis=1, fisher=False) k_pos = kurtosis(pos_resp, axis=1, fisher=False) return k_pos, k_stim
def kurto_improved(x, dt, LENwin):
    """
    Determines the kurtosis of a timeseries.
    dt = sampling interval in seconds.
    LENwin = time window (in secs) over which the kurtosis is determined.

    Described by Kuperkoch et al. 2010: calculate the kurtosis recursively.
    Results are not satisfying, but one may want to improve this...
    it might save some time!
    """
    # find number of samples in averaging windows
    nLEN = int(LENwin / dt) + 1
    #xabs = abs(x)
    kurtos = []
    first_window = ss.kurtosis(x[0:(0 + nLEN)], fisher=False)
    kurtos.append(first_window)
    i = 1
    while i < (len(x) - nLEN + 1):
        new_value = kurtos[i-1] - (x[i-1])**4 + (x[i-1+nLEN])**4
        kurtos.append(new_value)
        i += 1
    return kurtos
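# Since the docstring above concedes that the recursive update is not satisfactory
# (it only carries a running sum of fourth powers, not the window mean or variance),
# a plain sliding-window version built directly on scipy.stats.kurtosis is a useful
# reference point. A minimal sketch assuming the same x, dt and LENwin arguments:
import numpy as np
import scipy.stats as ss

def kurto_sliding(x, dt, LENwin):
    """Window-by-window kurtosis (fisher=False), recomputed exactly for each window."""
    x = np.asarray(x, dtype=float)
    nLEN = int(LENwin / dt) + 1
    return [ss.kurtosis(x[i:i + nLEN], fisher=False)
            for i in range(len(x) - nLEN + 1)]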
def test_rolling_kurt(self): try: from scipy.stats import kurtosis except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(moments.rolling_kurt, lambda x: kurtosis(x, bias=False))
def compute_profile(self): self.rec.label_contours(self.ji_intervals) distributions = {} for key, segments in self.rec.contour_labels.items(): distributions[key] = [] for indices in segments: distributions[key].extend(self.pitch_obj.pitch[indices[0]:indices[1]]) parameters = {} for interval, distribution in distributions.items(): distribution = np.array(distribution) #TODO: replace -10000 with whatever the bound is for invalid pitch values in cent scale distribution = distribution[distribution >= -10000] [n, be] = np.histogram(distribution, bins=1200) bc = (be[1:] + be[:-1])/2.0 peak_pos = bc[np.argmax(n)] peak_mean = float(np.mean(distribution)) peak_variance = float(variation(distribution)) peak_skew = float(skew(distribution)) peak_kurtosis = float(kurtosis(distribution)) pearson_skew = float(3.0 * (peak_mean - peak_pos) / np.sqrt(abs(peak_variance))) parameters[interval] = {"position": float(peak_pos), "mean": peak_mean, "amplitude": float(max(n)), "variance": peak_variance, "skew1": peak_skew, "skew2": pearson_skew, "kurtosis": peak_kurtosis} all_amps = [parameters[interval]["amplitude"] for interval in parameters.keys()] peak_amp_sum = sum(all_amps) for interval in parameters.keys(): parameters[interval]["amplitude"] = parameters[interval]["amplitude"]/peak_amp_sum self.intonation_profile = parameters
def _get_grid_size(data, use_default_square=False): """ Calculate the size of the grid. Parameters ---------- data: array-like The normalized data. use_default_square: bool Define the grid as the minimal possible square. Returns ------- int, int The width and height of the grid. """ # if the grid would be square, this is the minimum size sqr_size = int(np.ceil(np.sqrt(len(data)))) size_x = size_y = sqr_size if not use_default_square: kurt = kurtosis(data) kurt_x, kurt_y = np.int32(np.abs(np.ceil(kurt * 2))) size_x += kurt_x size_y += kurt_y return size_x, size_y
def jarque_bera(resids):
    """
    Calculate residual skewness, kurtosis, and do the JB test for normality

    Parameters
    ----------
    resids : array-like

    Returns
    -------
    JB, JBpv, skew, kurtosis

    JB = n/6*(S^2 + (K-3)^2/4)

    JBpv is the Chi^2 two-tail probability value
    skew is the measure of skewness
    kurtosis is the measure of kurtosis
    """
    resids = np.asarray(resids)
    # Calculate residual skewness and kurtosis
    skew = stats.skew(resids)
    kurtosis = 3 + stats.kurtosis(resids)

    # Calculate the Jarque-Bera test for normality
    JB = (resids.shape[0] / 6) * (skew**2 + (1 / 4) * (kurtosis - 3)**2)
    JBpv = stats.chi2.sf(JB, 2)

    return JB, JBpv, skew, kurtosis
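# The same statistic is available directly in scipy (stats.jarque_bera returns the
# statistic and its chi-square p-value), so the hand-rolled version above can be
# cross-checked against it. A quick sketch, assuming the function above is in scope:
import numpy as np
from scipy import stats

resids = np.random.standard_normal(1000)
JB, JBpv, s, k = jarque_bera(resids)          # function defined above
ref_stat, ref_pv = stats.jarque_bera(resids)  # scipy reference implementation
np.testing.assert_allclose(JB, ref_stat)
np.testing.assert_allclose(JBpv, ref_pv)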
def _find_high_kurtosis(self, pcas, memory): random_state = check_random_state(self.random_state) if not self.kurtosis_thr: kurtosis_thr = -np.inf else: kurtosis_thr = self.kurtosis_thr n_components = self.n_components while n_components < 3 * self.n_components: group_maps = memory.cache( randomized_svd)(pcas, n_components)[0] group_maps = group_maps[:, :n_components] ica_maps = memory.cache(fastica)(group_maps, whiten=False, fun='cube', random_state=random_state)[2] ica_maps = ica_maps.T kurtosis = stats.kurtosis(ica_maps, axis=1) kurtosis_mask = kurtosis > kurtosis_thr if np.sum(kurtosis_mask) >= n_components: order = np.argsort(kurtosis)[::-1] ica_maps = ica_maps[order[:n_components]] break n_components += 1 del group_maps else: raise ValueError('Could not find components with high-enough' ' kurtosis') self.n_components_ = n_components return ica_maps
def test_cont_basic_slow(): # same as above for slow distributions for distname, arg in distcont[:]: if distname not in distslow: continue distfn = getattr(stats, distname) np.random.seed(765456) sn = 1000 rvs = distfn.rvs(size=sn,*arg) sm = rvs.mean() sv = rvs.var() skurt = stats.kurtosis(rvs) sskew = stats.skew(rvs) m,v = distfn.stats(*arg) yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, distname + \ 'sample mean test' # the sample skew kurtosis test has known failures, not very good distance measure #yield check_sample_skew_kurt, distfn, arg, sskew, skurt, distname yield check_moment, distfn, arg, m, v, distname yield check_cdf_ppf, distfn, arg, distname yield check_sf_isf, distfn, arg, distname yield check_pdf, distfn, arg, distname yield check_pdf_logpdf, distfn, arg, distname yield check_cdf_logcdf, distfn, arg, distname yield check_sf_logsf, distfn, arg, distname #yield check_oth, distfn, arg # is still missing if distname in distmissing: alpha = 0.01 yield check_distribution_rvs, distname, arg, alpha, rvs
def computeProfileStatScores(self): """ Builds the scores using raw profile intensity data only. Returns the scores. Parameters: N/A Returns: An array of profile intensities as floating point values. """ try: bins =[] for intensity in self.profile: bins.append(float(intensity)) mn = mean(bins) stdev = std(bins) skw = skew(bins) kurt = kurtosis(bins) stats = [mn,stdev,skw,kurt] return stats except Exception as e: # catch *all* exceptions print "Error getting Profile stat scores from PHCX file\n\t", sys.exc_info()[0] print self.format_exception(e) raise Exception("Profile stat score extraction exception") return []
def computeDMCurveStatScores(self): """ Returns a list of integer data points representing the candidate DM curve. Parameters: N/A Returns: A list data type containing data points. """ try: bins=[] bins=self.profileOps.getDMCurveData(self.rawdata,self.profileIndex) mn = mean(bins) stdev = std(bins) skw = skew(bins) kurt = kurtosis(bins) stats = [mn,stdev,skw,kurt] return stats except Exception as e: # catch *all* exceptions print "Error getting DM curve stat scores from PHCX file\n\t", sys.exc_info()[0] print self.format_exception(e) raise Exception("DM curve stat score extraction exception") return []
def get_stat_feature(fname):
    b, _ = librosa.load(fname, res_type='kaiser_fast')
    try:
        # basic statistical features
        length = len(b)
        mean = np.mean(b)
        minimum = np.min(b)
        maximum = np.max(b)
        std = np.std(b)
        rms = np.sqrt(np.mean(b**2))
        kurt = kurtosis(b)
        Skew = skew(b)

        # audio length features
        data, samp_rate = librosa.effects.trim(b, top_db=40)
        len_init = len(data)
        ratio_init = len_init / length
        splits = librosa.effects.split(b, top_db=40)
        if len(splits) > 1:
            b = np.concatenate([b[x[0]:x[1]] for x in splits])
        len_final = len(b)
        ratio_final = len_final / length
        # return pd.Series([mean, minimum, maximum, std, rms, kurt, Skew, len_init, ratio_init, len_final, ratio_final])
        return pd.Series(np.hstack((mean, minimum, maximum, std, rms, kurt, Skew,
                                    len_init, ratio_init, len_final, ratio_final)))
    except:
        print("Bad file at {}".format(fname))
        return pd.Series([0] * 11)
def grid_color_stat(patient_grid_1_color): shape_stats = np.zeros(4) shape_stats[0] = np.mean(patient_grid_1_color.flatten()) shape_stats[1] = np.std(patient_grid_1_color.flatten()) shape_stats[2] = skew(patient_grid_1_color.flatten()) shape_stats[3] = kurtosis(patient_grid_1_color.flatten()) return shape_stats
def calc_statistics(x):
    n = x.shape[0]  # number of samples

    # manual calculation of the first four moments
    m = 0
    m2 = 0
    m3 = 0
    m4 = 0
    for t in x:
        m += t
        m2 += t*t
        m3 += t**3
        m4 += t**4
    m /= n
    m2 /= n
    m3 /= n
    m4 /= n

    mu = m
    sigma = np.sqrt(m2 - mu*mu)
    skew = (m3 - 3*mu*m2 + 2*mu**3) / sigma**3
    kurtosis = (m4 - 4*mu*m3 + 6*mu*mu*m2 - 4*mu**3*mu + mu**4) / sigma**4 - 3
    print 'manually computed mean, std, skewness, kurtosis:', mu, sigma, skew, kurtosis

    # verify with the library functions
    mu = np.mean(x, axis=0)
    sigma = np.std(x, axis=0)
    skew = stats.skew(x)
    kurtosis = stats.kurtosis(x)
    return mu, sigma, skew, kurtosis
def test_kurt(self): from scipy.stats import kurtosis string_series = tm.makeStringSeries().rename('series') alt = lambda x: kurtosis(x, bias=False) self._check_stat_op('kurt', alt, string_series) index = pd.MultiIndex( levels=[['bar'], ['one', 'two', 'three'], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]] ) s = Series(np.random.randn(6), index=index) tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) # test corner cases, kurt() returns NaN unless there's at least 4 # values min_N = 4 for i in range(1, min_N + 1): s = Series(np.ones(i)) df = DataFrame(np.ones((i, i))) if i < min_N: assert np.isnan(s.kurt()) assert np.isnan(df.kurt()).all() else: assert 0 == s.kurt() assert (df.kurt() == 0).all()
def mcnoise(data, noise_std, n, noise_scaling=1.): """ Parameters ---------- data : ndarray Array of data. noise_std : float Standard deviation of the noise n : int Number of repetition noise_scaling: float Scaling factor for noise Returns ------- variance, variance error, skewness, skewness error, kurtosis, kurtosis error """ noise_arr = np.random.normal(0, noise_std, (n, data.size)) * noise_scaling var_sample = np.var(data + noise_arr, axis=1) skew_sample = skew(data + noise_arr, axis=1) kurt_sample = kurtosis(data + noise_arr, axis=1) var_val = np.mean(var_sample) skew_val = np.mean(skew_sample) kurt_val = np.mean(kurt_sample) var_err = np.std(var_sample) skew_err = np.std(skew_sample) kurt_err = np.std(kurt_sample) return var_val, var_err, skew_val, skew_err, kurt_val, kurt_err
def perf_stats( returns, returns_style='compound', return_as_dict=False, period=DAILY): """Calculates various performance metrics of a strategy, for use in plotting.show_perf_stats. Parameters ---------- returns : pd.Series Daily returns of the strategy, noncumulative. - See full explanation in tears.create_full_tear_sheet. returns_style : str, optional See annual_returns' style return_as_dict : boolean, optional If True, returns the computed metrics in a dictionary. period : str, optional - defines the periodicity of the 'returns' data for purposes of annualizing. Can be 'monthly', 'weekly', or 'daily' - defaults to 'daily'. Returns ------- dict / pd.DataFrame Performance metrics. """ all_stats = OrderedDict() all_stats['annual_return'] = annual_return( returns, style=returns_style, period=period) all_stats['annual_volatility'] = annual_volatility(returns, period=period) all_stats['sharpe_ratio'] = sharpe_ratio( returns, returns_style=returns_style, period=period) all_stats['calmar_ratio'] = calmar_ratio( returns, returns_style=returns_style, period=period) all_stats['stability'] = stability_of_timeseries(returns) all_stats['max_drawdown'] = max_drawdown(returns) all_stats['omega_ratio'] = omega_ratio(returns) all_stats['sortino_ratio'] = sortino_ratio(returns) # TODO: The information_ratio method requires # a second argument for benchmark returns. # Setting information_ratio to NaN until # benchmark returns are added as an argument # to this method. all_stats['information_ratio'] = np.nan all_stats['skewness'] = stats.skew(returns) all_stats['kurtosis'] = stats.kurtosis(returns) if return_as_dict: return all_stats else: all_stats_df = pd.DataFrame( index=list(all_stats.keys()), data=list(all_stats.values())) all_stats_df.columns = ['perf_stats'] return all_stats_df
def kurt(x): from scipy.stats import kurtosis # noqa:F811 if len(x) < 4: return np.nan return kurtosis(x, bias=False)
start_date = _start_date, end_date = _end_date, start_t = _start_t, end_t = _end_t, ) # print(df) max_value = max(list(df[_col_name])) average_value = float(round(np.mean(list(df[_col_name])), 2)) area_value = round(sum(list(df[_col_name])), 2) median_value = round(float(np.median(list(df[_col_name]))), 2) var_value = round( np.var(list(df[_col_name])), 2) std_value = round( np.std(list(df[_col_name])), 2) skew_value = round( skew(list(df[_col_name])), 2) kurtosis_value = round( kurtosis(list(df[_col_name])), 2) q1 = round( np.quantile(list(df[_col_name]), .25), 2) q3 = round( np.quantile(list(df[_col_name]), .75), 2) iqr_value = round( iqr(list(df[_col_name])), 2) percentile10 = round( np.percentile(list(df[_col_name]), 10), 2) percentile40 = round( np.percentile(list(df[_col_name]), 40), 2) percentile60 = round( np.percentile(list(df[_col_name]), 60), 2) percentile90 = round( np.percentile(list(df[_col_name]), 90), 2) trim_mean10 = round( trim_mean(list(df[_col_name]), 0.1), 2) trim_mean20 = round( trim_mean(list(df[_col_name]), 0.2), 2) features = { 'max': max_value, 'average': average_value, 'area': area_value, 'median': median_value,
def process_results_run(self): ######### # Prepare variables for results processing ######### true_detected_positions = [] temp_list = [] for i_detector in range(self.n_drift_detectors) : true_detected_positions.append([]) temp_list.append(copy(self.true_positions)) ######### # Process results from drift detection ######### for i_detector in range(self.n_drift_detectors) : # Get a list of TP detected drifts for j in range(len(self.detected_positions[i_detector])) : try : true_position = min([num for num in temp_list[i_detector] if num<self.detected_positions[i_detector][j]], key=lambda x:abs(x-self.detected_positions[i_detector][j])) true_detected_positions[i_detector].append(true_position) self.list_TP[i_detector].append(self.detected_positions[i_detector][j]) self.delays[i_detector].append(self.detected_positions[i_detector][j]-true_position) ind = temp_list[i_detector].index(true_position) if ind > 0: del temp_list[i_detector][0:ind+1] else : del temp_list[i_detector][ind] except ValueError : pass self.n_TP[i_detector] = len(self.list_TP[i_detector]) self.n_FP[i_detector] = len(self.detected_positions[i_detector]) - self.n_TP[i_detector] self.list_n_detected[i_detector].append(self.n_detected_drifts[i_detector]) self.list_n_TP[i_detector].append(self.n_TP[i_detector]) self.list_n_FP[i_detector].append(self.n_FP[i_detector]) self.list_delays[i_detector].append(np.mean(self.delays[i_detector])) # Exceptions raised if not enought drift detected to calculate stats, we decide to add Nan to the results then try : # Stats of meta-features : median, kurtosis, skewness, perc10, perc90 self.stats_severity_list[i_detector].append([np.median(self.severity_list[i_detector]), kurtosis(self.severity_list[i_detector]), skew(self.severity_list[i_detector]), np.percentile(self.severity_list[i_detector],10), np.percentile(self.severity_list[i_detector],90)]) except: #Debug # print('Severity') # print('Detector : '+str(self.list_names_drifts_detectors[i_detector])) # print('Stream : '+str(self.name_file)) # print('Nombre de drifts detected : '+str(len(self.detected_positions[i_detector]))) self.stats_severity_list[i_detector].append([np.nan, np.nan, np.nan, np.nan, np.nan]) try : self.stats_magnitude_list[i_detector].append([np.median(self.magnitude_list[i_detector]), kurtosis(self.magnitude_list[i_detector]), skew(self.magnitude_list[i_detector]), np.percentile(self.magnitude_list[i_detector],10), np.percentile(self.magnitude_list[i_detector],90)]) except: #Debug # print('Magnitude') # print('Detector : '+str(self.list_names_drifts_detectors[i_detector])) # print('Stream : '+str(self.name_file)) # print('Nombre de drifts detected : '+str(len(self.detected_positions[i_detector]))) self.stats_magnitude_list[i_detector].append([np.nan, np.nan, np.nan, np.nan, np.nan]) try : self.stats_interval_list[i_detector].append([np.median(self.interval_list[i_detector]), kurtosis(self.interval_list[i_detector]), skew(self.interval_list[i_detector]), np.percentile(self.interval_list[i_detector],10), np.percentile(self.interval_list[i_detector],90)]) except: #Debug # print('Interval') # print('Detector : '+str(self.list_names_drifts_detectors[i_detector])) # print('Stream : '+str(self.name_file)) # print('Nombre de drifts detected : '+str(len(self.detected_positions[i_detector]))) self.stats_interval_list[i_detector].append([np.nan, np.nan, np.nan, np.nan, np.nan]) # Reset for next run self.reset_run()
def extract_feature(X, y, fs): """ 特征提取 @param X: 数据样本 @param y:数据标签 @param fs:原始数据采样频率 @return FX: 特征向量 @return Fy: 标签 example: from utils.augment import preprocess from utils.feature import extract_feature # -1- 载入数据 path = r"./data/0HP" data_mark = "FE" len_data = 1024 overlap_rate = 50 # 50% random_seed = 1 fs = 12000 X, y = preprocess(path, data_mark, fs, len_data/fs, overlap_rate, random_seed ) # -2- 提取特征 FX, Fy = extract_feature(X, y, fs) """ def skewness(s) -> float: """ 偏度计算 """ N = len(s) s = np.ravel(s) mean = np.mean(s) rms = np.sqrt(np.dot(s, s) / N) return np.sum(np.power(np.abs(s) - mean, 3)) / (N * rms**3) def maxf_in_env_spectrum(data, fs) -> float: """ 包络谱最大幅值处的频率 """ data = np.ravel(data) N = len(data) T = 1 / fs analytic_signal = hilbert(data) am_enve = np.abs(analytic_signal).reshape(N, ) yf = fft(am_enve - np.mean(am_enve)) y_envsp = 2.0 / N * np.abs(yf[0:N // 2]).reshape(N // 2, 1) xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2) # 返回最大幅值的频率 maxf = xf[np.argwhere(y_envsp == np.max(y_envsp))[0][0]] return maxf def hist_for_entropy(s): """ 对信号的直方图计算 @param s:一维序列数据 @return res: 直方图每个组对应的高度 @return s_min:s的最小值 @return s_max:s的最大值 @return ncell:直方图的分组数目 """ s = np.ravel(s) N = len(s) s_max = np.max(s) s_min = np.min(s) delt = (s_max - s_min) / N c_0 = s_min - delt / 2 c_N = s_max + delt / 2 ncell = int(np.ceil(np.sqrt(N))) # c = f(s) c = np.round((s - c_0) / (c_N - c_0) * ncell + 1 / 2) # 计算分组数组出现的频次 res = np.zeros(ncell) for i in range(0, N): ind = int(c[i]) if ind >= 1 and ind <= ncell: res[ind - 1] = res[ind - 1] + 1 return res, s_min, s_max, ncell def shannom_entropy_for_hist(s) -> float: """ 一维序列的香农信号熵 @param x: 一维序列数据 @return estimate: 香农信号熵的无偏估计值 """ h, s_min, s_max, ncell = hist_for_entropy(s) # 无偏估计 h = h[h != 0] N = np.sum(h) estimate = -np.sum(h * np.log(h)) / N sigma = np.sum(h * np.log2(h)**2) sigma = np.sqrt((sigma / N - estimate**2) / (N - 1)) estimate = estimate + np.log(N) + np.log((s_max - s_min) / ncell) nbias = -(ncell - 1) / (2 * N) estimate = estimate - nbias return estimate def pdf_for_median_am(s) -> float: """ 一维序列信号幅值中位数处的概率密度估计 @param s: 一维序列信号 @return 幅值中位数处的概率密度估计 """ N = len(s) res, s_min, s_max, ncell = hist_for_entropy(s) # 归一化的到概率密度 pdf = res / N / (s_max - s_min) * ncell # 幅值中位数 映射 到直方图的组号 delt = (s_max - s_min) / N c_min = s_min - delt / 2 c_max = s_max + delt / 2 s_median = np.median(s) s_median_icell = int( np.round((s_median - c_min) / (c_max - c_min) * ncell + 1 / 2)) return pdf[s_median_icell] feature = {} N = len(X[0]) feature['mean'] = [np.mean(x) for x in X] feature['rms'] = [np.sqrt(np.dot(np.ravel(x), np.ravel(x)) / N) for x in X] feature['std'] = [np.std(x) for x in X] feature['skewness'] = [skewness(x) for x in X] feature['kurtosis'] = [kurtosis(x, fisher=False) for x in X] feature['maxf'] = [maxf_in_env_spectrum(x, fs) for x in X] feature['signal_entropy'] = [shannom_entropy_for_hist(x) for x in X] feature['am_median_pdf'] = [pdf_for_median_am(x) for x in X] feature['label'] = [int(la) for la in y] # 返回pandas.DataFrame类型的特征矩阵 f_datafram = pd.DataFrame([feature[k] for k in feature.keys()], index=list(feature.keys())).T # 返回 FX,Fy features = [ 'mean', 'rms', 'std', 'skewness', 'kurtosis', 'maxf', 'signal_entropy', 'am_median_pdf' ] FX, Fy = f_datafram[features], f_datafram['label'] return FX, Fy
def normalized_kurtosis(x, tx): y = normalize(x, tx) return kurtosis(y)
result['clustersCOrdered'] = clustersOrdered1

### Algorithm for selecting the first feature of each cluster

# Distance
check = 0
featSelDist = []
bestKurtosis = 100
bestKurtosisID = {}
i = 0
for val in clustersOrdered:
    if val != check:
        check = val
        bestKurtosis = 100
    featureKurtosis = np.abs(
        kurtosis(featureData[:, result['orderedDistanceMatrixFeaturesID'][i]]))
    if (featureKurtosis < bestKurtosis):
        bestKurtosis = featureKurtosis
        bestKurtosisID[check] = result['orderedDistanceMatrixFeaturesID'][i]
    i = i + 1
for k in range(1, len(bestKurtosisID) + 1):
    featSelDist.append(bestKurtosisID[k])
result['featSelDist'] = featSelDist

# Correlation
check = 0
featSelCorr = []
bestKurtosis = 100
bestKurtosisID = {}
def _get_opinion_score_2darray_with_preprocessing(dataset_reader, **kwargs): s_es = dataset_reader.opinion_score_2darray # dscore_mode: True - do differential-scoring # False - don't do differential-scoring dscore_mode = kwargs[ 'dscore_mode'] if 'dscore_mode' in kwargs else False # zscore_mode: True - do z-scoring (normalizing to 0-mean 1-std) # False - don't do z-scoring zscore_mode = kwargs[ 'zscore_mode'] if 'zscore_mode' in kwargs else False # subject_rejection: True - do subject rejection # False - don't do subject rejection subject_rejection = kwargs[ 'subject_rejection'] if 'subject_rejection' in kwargs else False if dscore_mode is True: E, S = s_es.shape s_e = pd.DataFrame(s_es).mean(axis=1) # mean along s s_e_ref = DmosModel._get_ref_mos(dataset_reader, s_e) s_es = s_es + dataset_reader.ref_score - np.tile(s_e_ref, (S, 1)).T if zscore_mode is True: E, S = s_es.shape mu_s = pd.DataFrame(s_es).mean(axis=0) # mean along e simga_s = pd.DataFrame(s_es).std(ddof=1, axis=0) # std along e s_es = (s_es - np.tile(mu_s, (E, 1))) / np.tile(simga_s, (E, 1)) if subject_rejection is True: E, S = s_es.shape ps = np.zeros(S) qs = np.zeros(S) for s_e in s_es: s_e_notnan = s_e[~np.isnan(s_e)] mu = np.mean(s_e_notnan) sigma = np.std(s_e_notnan) kurt = stats.kurtosis(s_e_notnan, fisher=False) if 2 <= kurt and kurt <= 4: for idx_s, s in enumerate(s_e): if not np.isnan(s): if s >= mu + 2 * sigma: ps[idx_s] += 1 if s <= mu - 2 * sigma: qs[idx_s] += 1 else: for idx_s, s in enumerate(s_e): if not np.isnan(s): if s >= mu + np.sqrt(20) * sigma: ps[idx_s] += 1 if s <= mu - np.sqrt(20) * sigma: qs[idx_s] += 1 rejections = [] acceptions = [] for idx_s, subject in zip(range(S), range(S)): if (ps[idx_s] + qs[idx_s]) / E > 0.05 and np.abs( (ps[idx_s] - qs[idx_s]) / (ps[idx_s] + qs[idx_s])) < 0.3: rejections.append(subject) else: acceptions.append(subject) s_es = s_es[:, acceptions] return s_es
def create_feature_with_distribution(dataframe_column, number_observations): """ Calculates the distribution of a Feature, Categorical or Numerical. Some Numerical features, if they have few distinct values, a categorical distribution also will be applied. Params: @dataframe_column must be a dataframe column Returns a Feature """ try: data_type = dataframe_column.dtypes data_category = FeatureHelper.get_data_category(data_type) categories = [] num_statistics = None # print(len(feature)) unique_values = len(dataframe_column.unique()) missing_values_NA = dataframe_column.isna().sum() missing_values_NULL = dataframe_column.isnull().sum() can_be_seen_as_category = FeatureHelper.can_be_seen_as_category( data_type, unique_values, number_observations) can_be_seen_as_index = FeatureHelper.can_be_seen_as_index( data_type, unique_values, number_observations) # CHECK 1 # Categorical or Numerical can be seen as Category if data_category == DATA_CATEGORY_CATEGORICAL or can_be_seen_as_category: value_counts = dataframe_column.value_counts() keys = value_counts.keys() # Total #TODO: optimize this calculation, avoid to calculate all times (total, get the dataset rows) #total = 0 #for i in range(0, len(keys) ): # total += value_counts[keys[i]] for i in range(0, len(keys)): count = value_counts[keys[i]] categories.append( Category(value=keys[i], frequency=count, proportion=count / number_observations)) # CHECK 2 # Numerical Statistics if data_category == DATA_CATEGORY_NUMERICAL: num_statistics = NumericalStatistics( mean_value=mean(dataframe_column), median_value=median(dataframe_column), standard_deviation=stdev(dataframe_column), mode_value=stats.mode(dataframe_column), max_value=max(dataframe_column), min_value=min(dataframe_column), kurtosis=stats.kurtosis(dataframe_column), skewness=stats.skew(dataframe_column)) feature = Feature(name=dataframe_column.name, label=dataframe_column.name, data_type=data_type, data_category=data_category, can_be_seen_as_category=can_be_seen_as_category, can_be_seen_as_index=can_be_seen_as_index, unique_values=unique_values, missing_values_NA=missing_values_NA, missing_values_NULL=missing_values_NULL, statistics=num_statistics, categories=categories) return feature except Exception as e: print( 'Error: data_tabular - calc_distribution() \nException Message: ', e)
print("男孩身高標準差=", std_boy) statistics_stdev_boy = statistics.stdev(boys) print("statistics_mean_boy=", statistics_stdev_boy) # python 百分位數 # np print("90百分位數=", np.percentile(boys, 90)) print("50百分位數=", np.percentile(boys, 50)) print("20百分位數=", np.percentile(boys, 20)) #stat print("20百分位數=", stats.scoreatpercentile(boys, 20)) #計算峰度和偏度 print(stats.skew(boys)) print(stats.kurtosis(boys)) # pandas和 stat 接近 # python的峰帶 #最後,畫圖看分布 plt.hist(boys, alpha=.4, bins=40) plt.title('boy,skewness={0},kurtosis={1}'.format( round(stats.skew(boys), 2), round(stats.kurtosis(boys), 2))) plt.axvline(x=mean_boy) plt.show() # 今天學到不同統計量之間特性, # 試著分析男生女生身高資料, # 試著回答下面的問題: # Q1:試著用今天所教的內容,如何描述這兩組資料的樣態?
def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True): """ Calculates the four kurtosis measures in Kim & White Parameters ---------- y : array-like axis : int or None, optional Axis along which the kurtoses are computed. If `None`, the entire array is used. ab: iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db: iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure excess : bool, optional If true (default), computed values are excess of those for a standard normal distribution. Returns ------- kr1 : ndarray The standard kurtosis estimator. kr2 : ndarray Kurtosis estimator based on octiles. kr3 : ndarray Kurtosis estimators based on exceedence expectations. kr4 : ndarray Kurtosis measure based on the spread between high and low quantiles. Notes ----- The robust kurtosis measures are defined .. math:: KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right) +\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)} {\\hat{q}_{.75}-\\hat{q}_{.25}} .. math:: KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)} {\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)} .. math:: KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}} {\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}} where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`. .. [1] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if (axis is None or (y.squeeze().ndim == 1 and y.ndim != 1)): y = y.ravel() axis = 0 alpha, beta = ab delta, gamma = dg perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5, delta, 100.0 - delta, gamma, 100.0 - gamma) e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc, axis=axis) expected_value = expected_robust_kurtosis(ab, dg) if excess else np.zeros(4) kr1 = stats.kurtosis(y, axis, False) - expected_value[0] kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1] if y.ndim == 1: kr3 = _kr3(y, alpha, beta) else: kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta) kr3 -= expected_value[2] kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3] return kr1, kr2, kr3, kr4
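# As a small illustration of the quantile-based measures documented above, the
# octile-based KR2 of Kim & White can be computed directly with np.percentile.
# This sketch is self-contained and does not rely on the statsmodels helpers
# _kr3 / expected_robust_kurtosis, which are not shown here:
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
y = rng.standard_t(df=5, size=100_000)   # heavy-tailed sample

# KR2 = ((q_.875 - q_.625) + (q_.375 - q_.125)) / (q_.75 - q_.25)
e1, e2, e3, e5, e6, e7 = np.percentile(y, [12.5, 25.0, 37.5, 62.5, 75.0, 87.5])
kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2)

print("standard excess kurtosis:", stats.kurtosis(y))
print("octile-based KR2:", kr2)   # about 1.23 for a normal distribution, larger here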
def dataClean(alltrigs, opt, flag=1): """ Examine triggers and weed out spikes and calibration pulses using kurtosis and outlier ratios alltrigs: triggers output from triggering opt: opt from config flag: 1 if defining window to check, 0 if want to check whole waveform for spikes (note that different threshold values should be used for different window lengths) Returns good trigs (trigs) and several junk types (junk, junkFI, junkKurt) """ trigs = Stream() junkFI = Stream() junkKurt = Stream() junk = Stream() for i in range(len(alltrigs)): njunk = 0 ntele = 0 for n in range(opt.nsta): dat = alltrigs[i].data[n * opt.wshape:(n + 1) * opt.wshape] if flag == 1: datcut = dat[range( int((opt.ptrig - opt.kurtwin / 2) * opt.samprate), int((opt.ptrig + opt.kurtwin / 2) * opt.samprate))] else: datcut = dat if np.sum(np.abs(dat)) != 0.0: # Calculate kurtosis in window k = stats.kurtosis(datcut) # Compute kurtosis of frequency amplitude spectrum next datf = np.absolute(fft(dat)) kf = stats.kurtosis(datf) # Calculate outlier ratio using z ((data-median)/mad) mad = np.nanmedian(np.absolute(dat - np.nanmedian(dat))) z = (dat - np.median(dat)) / mad # Outliers have z > 4.45 orm = len(z[z > 4.45]) / np.array(len(z)).astype(float) if k >= opt.kurtmax or orm >= opt.oratiomax or kf >= opt.kurtfmax: njunk += 1 winstart = int(opt.ptrig * opt.samprate - opt.winlen / 10) winend = int(opt.ptrig * opt.samprate - opt.winlen / 10 + opt.winlen) fftwin = np.reshape(fft(dat[winstart:winend]), (opt.winlen, )) if np.median(np.abs(dat[winstart:winend])) != 0: fi = np.log10( np.mean( np.abs( np.real( fftwin[int(opt.fiupmin * opt.winlen / opt.samprate ):int(opt.fiupmax * opt.winlen / opt.samprate)]))) / np.mean( np.abs( np.real( fftwin[int(opt.filomin * opt.winlen / opt.samprate ):int(opt.filomax * opt.winlen / opt.samprate)])))) if fi < opt.telefi: ntele += 1 # Allow if there are enough good stations to correlate if njunk <= (opt.nsta - opt.ncor) and ntele <= opt.teleok: trigs.append(alltrigs[i]) else: if njunk > 0: if ntele > 0: junk.append(alltrigs[i]) else: junkKurt.append(alltrigs[i]) else: junkFI.append(alltrigs[i]) return trigs, junk, junkFI, junkKurt
def getKURT(vector): vector = np.asarray(vector) return stats.kurtosis(vector)
""" Name : c8_14_mean_std_skew_kurt.py Book : Python for Finance (2nd ed.) Publisher: Packt Publishing Ltd. Author : Yuxing Yan Date : 6/6/2017 email : [email protected] [email protected] """ from scipy import stats, random import numpy as np np.random.seed(12345) ret = random.normal(0, 1, 500000) print('mean =', np.mean(ret)) print('std =', np.std(ret)) print('skewness=', stats.skew(ret)) print('kurtosis=', stats.kurtosis(ret))
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("PCA analysis")
from sklearn.decomposition import FastICA
from scipy.stats import kurtosis

KURTOSIS = []
N_COMPS = np.arange(2, 20, 1)
for n_comps in N_COMPS:
    X = np.copy(X_safe)
    Y = np.copy(Y_safe)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    transformer = FastICA(n_components=n_comps, random_state=111, tol=0.001)
    X = transformer.fit_transform(X)
    kurt = kurtosis(X, axis=1)
    kurt = np.mean(np.abs(kurt))
    KURTOSIS.append(kurt)
    print(" n_components =", n_comps, " kurtosis:", kurt)

fig, ax = plt.subplots()
plt.plot(N_COMPS, KURTOSIS, 'o', color='steelblue')
plt.plot(N_COMPS, KURTOSIS, '-', color='steelblue', alpha=0.5)
plt.title("Kurtosis, Independent component analysis")
plt.xlabel('Number of components')
plt.ylabel('Average kurtosis')
plt.savefig("plots/diabetes_ICA_kurtosis.png")

X = np.copy(X_safe)
Y = np.copy(Y_safe)
scaler = StandardScaler()
X = scaler.fit_transform(X)
def dohistogram(que, Y, **kwargs): ''' Return a histogram of Y-values and a gaussian fit of the histogram, excluding values that exceed either the compliance limit (for current or current-density) or the ceiling for R. We would like to include all data in the histogram, but outliers sometimes confuse the fitting routine, which defeats the purpose of machine-fitting ''' defaultKwargs = {'label': '', 'density': False, 'warnings': False} kwargs = {**defaultKwargs, **kwargs} logger = logging.getLogger(__package__ + ".dohistogram") logger.addHandler(QueueHandler(que)) def __handlematherror(msg): # TODO we can now split out the file name with the bad data in it! logger.warning( "Encountered this error while constructing histogram: %s", str(msg), exc_info=False) bins = np.array([0., 0., 0., 0.]) freq = np.array([0., 0., 0., 0.]) return bins, freq try: yrange = (Y.min(), Y.max()) except ValueError as msg: logger.error("Error ranging data for histogram: %s", str(msg)) yrange = (0, 0) if kwargs['label'] == "J" or kwargs['label'] == "lag": Y = Y[Y <= opts.compliance] if yrange != (0, 0): yrange = (Y.min() - 1, Y.max() + 1) if kwargs['label'] == "R": Y = Y[Y <= opts.maxr] if kwargs['label'] in ('DJDV', 'NDC'): nbins = opts.heatmapbins else: nbins = opts.bins if len(Y) < 10 and kwargs['warnings']: logger.warning("Histogram with only %d points.", len(Y)) try: freq, bins = np.histogram(Y, range=yrange, bins=nbins, density=kwargs['density']) except ValueError as msg: bins, freq = __handlematherror(msg) except FloatingPointError as msg: bins, freq = __handlematherror(msg) if len(Y): Ym = signedgmean(Y) Ys = abs(Y.std()) else: Ym, Ys = 0.0, 0.0 p0 = [1., Ym, Ys] bin_centers = (bins[:-1] + bins[1:]) / 2 coeff = p0 covar = None assert (covar is None) hist_fit = np.array([x * 0 for x in range(0, len(bin_centers))]) try: if opts.lorenzian: coeff, covar = curve_fit(lorenz, bin_centers, freq, p0=p0, maxfev=opts.maxfev) hist_fit = lorenz(bin_centers, *coeff) else: coeff, covar = curve_fit(gauss, bin_centers, freq, p0=p0, maxfev=opts.maxfev) hist_fit = gauss(bin_centers, *coeff) except RuntimeError: if opts.maxfev > 100 and kwargs['warnings']: logger.warning("|%s| Fit did not converge", kwargs['label'], exc_info=False) except ValueError as msg: if kwargs['warnings']: logger.warning( "|%s| Skipping data with ridiculous numbers in it (%s)", kwargs['label'], str(msg), exc_info=False) except FloatingPointError as msg: logger.error( "|%s| Encountered floating point error fitting Guasian: %s", kwargs['label'], str(msg), exc_info=False) try: skewstat, skewpval = skewtest(freq) kurtstat, kurtpval = kurtosistest(freq) except ValueError as msg: logger.error("|%s| Could not perform skewtest: %s", kwargs['label'], str(msg), exc_info=False) skewstat, skewpval, kurtstat, kurtpval = 0.0, 0.0, 0.0, 0.0 return { "bin": bin_centers, "freq": freq, "mean": coeff[1], "std": coeff[2], "var": coeff[2], "bins": bins, "fit": hist_fit, "Gmean": Ym, "Gstd": Ys, "skew": skew(freq), "kurtosis": kurtosis(freq), "skewstat": skewstat, "skewpval": skewpval, "kurtstat": kurtstat, "kurtpval": kurtpval }
def handle_dimredux(X, outpath, PCA_cut=0.95, SVD_cut=0.95): # PCA pca = PCA(PCA_cut, whiten=True, svd_solver='auto', random_state=SEED) pcaRes = pca.fit_transform(X) # pcaRes = (pca.fit_transform(X), pca) plt.plot(pca.explained_variance_) plt.xlabel('Component') plt.ylabel('Eigenvalues') plt.title( f'Distribution of Eigenvalues over PCA components \n Explains {PCA_cut * 100}% of Variance, k={len(pca.explained_variance_)}' ) plt.savefig(os.path.join(outpath, 'PCAEigenDist.png'), dpi=400, format='png') plt.close() # ICA ica = FastICA(whiten=True, random_state=SEED, max_iter=10000, tol=0.001) ica.fit(X) kvals = [] xv = np.arange(2, ica.components_.shape[0]) for i in xv: kvals.append( np.mean(kurtosis(np.dot(X, ica.components_[:i].T))**2) ) # transform X with increasingly more ICA components and calculate the kurtosis of the transformation ica_k = xv[np.argmax(kvals)] icaRes = np.dot(X, ica.components_[:ica_k].T ) # Take the X transform with greatest kurtosis # icaRes = (np.dot(X, ica.components_[:ica_k].T), ica) # Take the X transform with greatest kurtosis plt.plot(xv, kvals, label='Kurtosis') plt.vlines(ica_k, 0, np.max(kvals), label=f'Best K: {ica_k}') plt.yscale('log') plt.xlabel('ICA Components') plt.ylabel('Mean Squared Kurtosis') plt.title('Kurtosis of ICA Components') plt.legend() plt.savefig(os.path.join(outpath, 'ICAKurtosis.png'), dpi=400, format='png') plt.close() # RCA reconScore = [] X_ts = [] Xvals = np.arange(2, X.shape[1]) for i in Xvals: rca = SRP(i, dense_output=True) X_t = rca.fit_transform(X) reverse = np.linalg.pinv(rca.components_.toarray()) l = 0 for j in range(9): rca = SRP(i, dense_output=True) X_t += rca.fit_transform(X) reverse += np.linalg.pinv(rca.components_.toarray()) l += 1 reconScore.append( ((X - np.dot(X_t / (1 + l), reverse.T / (1 + l)))**2).mean()) X_ts.append(X_t / (1 + l)) rca_k = Xvals[np.argmin(reconScore)] if rca_k > len(reconScore): rca_k = len(reconScore) - 1 minError = reconScore[rca_k] rcaRes = X_ts[rca_k] # rcaRes = (X_ts[rca_k], None) plt.plot(Xvals, reconScore, label='Recon. Score') plt.vlines(rca_k, 0, max(reconScore), label=f'Best K: {rca_k}') plt.title('Reconstruction Scores (MSE) for Randomized Projections') plt.xlabel('Components') plt.ylabel('MSE') plt.legend() plt.savefig(os.path.join(outpath, 'RCARecon.png'), dpi=400, format='png') plt.close() # SVD svd = TruncatedSVD(X.shape[1] - 1, random_state=SEED) svd.fit(X) evr_Cumm = np.cumsum(svd.explained_variance_ratio_) svd_k = (evr_Cumm <= SVD_cut).sum() svdRes = svd.transform(X) # svdRes = (svd.transform(X), svd) plt.plot(evr_Cumm, label='Cumm. Ratio') plt.plot(svd.explained_variance_ratio_, label='Ratio of exp. var.') plt.vlines(svd_k, 0, 1, label=f'Best K: {svd_k}') plt.title( f'Choosing best k components for Truncated SVD \n Explains {SVD_cut * 100}% of variance' ) plt.xlabel('Components') plt.ylabel('Ratio/Percentage of Explained Variance') plt.legend() plt.savefig(os.path.join(outpath, 'SVDChooseK.png'), dpi=400, format='png') plt.close() return pcaRes, icaRes, rcaRes, svdRes
def CalcateDQF(fy4data, exdata, bias, dqf): fy4data0 = fy4data[np.where(dqf == 0)] exdata0 = exdata[np.where(dqf == 0)] bias0 = bias[np.where(dqf == 0)] fy4data0 = fy4data0[~fy4data0.mask] exdata0 = exdata0[~exdata0.mask] bias0 = bias0[~bias0.mask] fy4data1 = fy4data[np.where(dqf == 1)] exdata1 = exdata[np.where(dqf == 1)] bias1 = bias[np.where(dqf == 1)] fy4data1 = fy4data1[~fy4data1.mask] exdata1 = exdata1[~exdata1.mask] bias1 = bias1[~bias1.mask] fy4data2 = fy4data[np.where(dqf == 2)] exdata2 = exdata[np.where(dqf == 2)] bias2 = bias[np.where(dqf == 2)] fy4data2 = fy4data2[~fy4data2.mask] exdata2 = exdata2[~exdata2.mask] bias2 = bias2[~bias2.mask] fy4data3 = fy4data[~fy4data.mask] exdata3 = exdata[~exdata.mask] bias3 = bias[~bias.mask] # 指标计算 slope0, intercept0, r_value0, p_value0, std_err0 = stats.linregress( fy4data0, exdata0) slope1, intercept1, r_value1, p_value1, std_err1 = stats.linregress( fy4data1, exdata1) slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress( fy4data2, exdata2) slope3, intercept3, r_value3, p_value3, std_err3 = stats.linregress( fy4data3, exdata3) metrics = collections.OrderedDict() metrics['QualID0_NUM'] = fy4data0.size # 总数 metrics['QualID0_MAX'] = round(bias0.max(), 4) # 最大值 metrics['QualID0_MIN'] = round(bias0.min(), 4) # 最小值 metrics['QualID0_MEDIAN'] = round(np.median(bias0), 4) # 中位数 metrics['QualID0_MEAN'] = round(bias0.mean(), 4) # 平均值 metrics['QualID0_AE'] = round(np.abs(bias0).mean(), 4) # 绝对值平均数 metrics['QualID0_STD'] = round( np.sqrt(np.square(bias0).sum() / (fy4data0.size - 1)), 4) metrics['QualID0_RMSE'] = round( np.sqrt(np.square(bias0).sum() / fy4data0.size), 4) # 均方根误差 metrics['QualID0_SKEW'] = round(stats.skew(bias0), 4) # 偏度系数 metrics['QualID0_KURT'] = round(stats.kurtosis(bias0), 4) # 峰度系数 metrics['QualID0_CORR'] = round(r_value0, 4) metrics['QualID0_slope'] = round(slope0, 4) metrics['QualID0_intercept'] = round(intercept0, 4) metrics['QualID1_NUM'] = fy4data1.size # 总数 metrics['QualID1_MAX'] = round(bias1.max(), 4) # 最大值 metrics['QualID1_MIN'] = round(bias1.min(), 4) # 最小值 metrics['QualID1_MEDIAN'] = round(np.median(bias1), 4) # 中位数 metrics['QualID1_MEAN'] = round(bias1.mean(), 4) # 平均值 metrics['QualID1_AE'] = round(np.abs(bias1).mean(), 4) # 绝对值平均数 metrics['QualID1_STD'] = round( np.sqrt(np.square(bias1).sum() / (fy4data1.size - 1)), 4) metrics['QualID1_RMSE'] = round( np.sqrt(np.square(bias1).sum() / fy4data1.size), 4) # 均方根误差 metrics['QualID1_SKEW'] = round(stats.skew(bias1), 4) # 偏度系数 metrics['QualID1_KURT'] = round(stats.kurtosis(bias1), 4) # 峰度系数 metrics['QualID1_CORR'] = round(r_value1, 4) metrics['QualID1_slope'] = round(slope1, 4) metrics['QualID1_intercept'] = round(intercept1, 4) metrics['QualID2_NUM'] = fy4data2.size # 总数 metrics['QualID2_MAX'] = round(bias2.max(), 4) # 最大值 metrics['QualID2_MIN'] = round(bias2.min(), 4) # 最小值 metrics['QualID2_MEDIAN'] = round(np.median(bias2), 4) # 中位数 metrics['QualID2_MEAN'] = round(bias2.mean(), 4) # 平均值 metrics['QualID2_AE'] = round(np.abs(bias2).mean(), 4) # 绝对值平均数 metrics['QualID2_STD'] = round( np.sqrt(np.square(bias2).sum() / (fy4data2.size - 1)), 4) metrics['QualID2_RMSE'] = round( np.sqrt(np.square(bias2).sum() / fy4data2.size), 4) # 均方根误差 metrics['QualID2_SKEW'] = round(stats.skew(bias2), 4) # 偏度系数 metrics['QualID2_KURT'] = round(stats.kurtosis(bias2), 4) # 峰度系数 metrics['QualID2_CORR'] = round(r_value2, 4) metrics['QualID2_slope'] = round(slope2, 4) metrics['QualID2_intercept'] = round(intercept2, 4) metrics['QualID3_NUM'] = fy4data3.size # 总数 metrics['QualID3_MAX'] = 
round(bias3.max(), 4) # 最大值 metrics['QualID3_MIN'] = round(bias3.min(), 4) # 最小值 metrics['QualID3_MEDIAN'] = round(np.median(bias3), 4) # 中位数 metrics['QualID3_MEAN'] = round(bias3.mean(), 4) # 平均值 metrics['QualID3_AE'] = round(np.abs(bias3).mean(), 4) # 绝对值平均数 metrics['QualID3_STD'] = round( np.sqrt(np.square(bias3).sum() / (fy4data3.size - 1)), 4) metrics['QualID3_RMSE'] = round( np.sqrt(np.square(bias3).sum() / fy4data3.size), 4) # 均方根误差 metrics['QualID3_SKEW'] = round(stats.skew(bias3), 4) # 偏度系数 metrics['QualID3_KURT'] = round(stats.kurtosis(bias3), 4) # 峰度系数 metrics['QualID3_CORR'] = round(r_value3, 4) metrics['QualID3_slope'] = round(slope3, 4) metrics['QualID3_intercept'] = round(intercept3, 4) return metrics
# trad_date_from = dt.date(2011, 2, 1)
# trad_date_to = dt.date(2014, 1, 31)

# get simple returns dataframe and dates
dates, returns_df = simulate_trading(close_df, trad_date_from, trad_date_to,
                                     args.trad_freq, args.est_per_trad_days,
                                     args.trad_per_trad_days, args.no_pairs)

# convert simple returns to log returns
log_returns_df = np.log(returns_df + 1)

# cumulative sum of log returns
cum_log_returns_df = np.cumsum(log_returns_df)

# convert back to a cumulative simple (gross) return
cum_returns_df = np.exp(cum_log_returns_df)

# daily_ret = np.mean(log_returns_df)
daily_vol = np.std(log_returns_df)
ann_ret = np.mean(log_returns_df) * 252
ann_vol_ret = np.std(log_returns_df) * np.sqrt(252)
skew = stats.skew(log_returns_df)
kurt = stats.kurtosis(log_returns_df)
min_daily_ret = np.min(log_returns_df)
max_daily_ret = np.max(log_returns_df)
cum_ret = cum_returns_df[-1] - 1

# plot
plt.plot(dates, cum_returns_df, label='Growth of 1$')
plt.legend()
plt.grid(True)
plt.xlabel('Date')
pass
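# A self-contained sketch with synthetic prices (not the original close_df) of the return
# statistics computed above: log returns, annualised mean and volatility, skewness, kurtosis
# and the growth of 1$.
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
prices = 100 * np.exp(np.cumsum(rng.normal(0.0003, 0.01, size=756)))  # ~3 years of daily closes
log_returns = np.diff(np.log(prices))
ann_ret = log_returns.mean() * 252
ann_vol = log_returns.std() * np.sqrt(252)
print(ann_ret, ann_vol, stats.skew(log_returns), stats.kurtosis(log_returns))
print('growth of 1$:', np.exp(log_returns.sum()))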
minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors_train), np.nan_to_num(question2_vectors_train)) ] train_df['braycurtis_distance2'] = [ braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors_train), np.nan_to_num(question2_vectors_train)) ] train_df['skew_q1vec2'] = [ skew(x) for x in np.nan_to_num(question1_vectors_train) ] train_df['skew_q2vec2'] = [ skew(x) for x in np.nan_to_num(question2_vectors_train) ] train_df['kur_q1vec2'] = [ kurtosis(x) for x in np.nan_to_num(question1_vectors_train) ] train_df['kur_q2vec2'] = [ kurtosis(x) for x in np.nan_to_num(question2_vectors_train) ] question1_vectors_test = np.zeros((test.shape[0], 300)) question2_vectors_test = np.zeros((test.shape[0], 300)) error_count_test = 0 for i, q in tqdm(enumerate(test.question1.values)): question1_vectors_test[i, :] = sent2vec(q) for i, q in tqdm(enumerate(test.question2.values)): question2_vectors_test[i, :] = sent2vec(q) test_df['cosine_distance2'] = [ cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors_test), np.nan_to_num(question2_vectors_test))
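# Hedged sketch of the distance and shape features built above, for one hypothetical pair of
# 300-dimensional sentence vectors (sent2vec itself is not reproduced here).
import numpy as np
from scipy.spatial.distance import braycurtis, cosine, minkowski
from scipy.stats import kurtosis, skew

rng = np.random.default_rng(0)
q1_vec = np.nan_to_num(rng.normal(size=300))
q2_vec = np.nan_to_num(rng.normal(size=300))
features = {
    'cosine_distance': cosine(q1_vec, q2_vec),
    'minkowski_distance': minkowski(q1_vec, q2_vec, 3),
    'braycurtis_distance': braycurtis(q1_vec, q2_vec),
    'skew_q1vec': skew(q1_vec),
    'skew_q2vec': skew(q2_vec),
    'kur_q1vec': kurtosis(q1_vec),
    'kur_q2vec': kurtosis(q2_vec),
}
print(features)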
def calc_kurtosis(data): return kurtosis(data)
def nankurtosis(arr, axis=0): return stats.kurtosis(arr, axis=axis, nan_policy="omit")
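# Quick check of nankurtosis: with nan_policy="omit" the NaN is dropped instead of propagating.
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, np.nan, 100.0])
print(stats.kurtosis(x))   # nan (default nan_policy='propagate')
print(nankurtosis(x))      # excess kurtosis of the five finite values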
def jump_diffusion(S=1, X=0.5, T=1, mu=0.12, sigma=0.3, Lambda=0.25, a=0.2, b=0.2, Nsteps=252, Nsim=100, alpha=0.05, seed=None): ''' Monte Carlo simulation [1] of Merton's Jump Diffusion Model [2]. The model is specified through the stochastic differential equation (SDE): dS(t) ----- = mu*dt + sigma*dW(t) + dJ(t) S(t-) with: mu, sigma: constants, the drift and volatility coefficients of the stock price process; W: a standard one-dimensional Brownian motion; J: a jump process, independent of W, with piecewise constant sample paths. It is defined as the sum of multiplicative jumps Y(j). Input --------------------------------------------------------------------------- S: float. The current asset price. X: float. The strike price, i.e. the price at which the asset may be bought (call) or sold (put) in an option contract [3]. T: int or float. The maturity of the option contract, i.e. the final monitoring date. mu, sigma: float. Respectively, the drift and volatility coefficients of the asset price process. Lambda: float. The intensity of the Poisson process in the jump diffusion model ('lambda' is a protected keyword in Python). a, b: float. Parameters required to calculate, respectively, the mean and variance of a standard lognormal distribution, log(x) ~ N(a, b**2). (see code). Nsteps: int. The number of monitoring dates, i.e. the time steps. Nsim: int. The number of Monte Carlo simulations (at least 10,000 required to generate stable results). alpha: float. The confidence interval significance level, in [0, 1]. seed: int. Set random seed, for reproducibility of the results. Default value is None (the best seed available is used, but outcome will vary in each experiment). References --------------------------------------------------------------------------- [1] Glasserman, P. (2003): 'Monte Carlo Methods in Financial Engineering', Springer Applications of Mathematics, Vol. 53 [2] Merton, R.C. (1976): 'Option Pricing when Underlying Stock Returns are Discontinuous', Journal of Financial Economics, 3:125-144. [3] Hull, J.C. (2017): 'Options, Futures, and Other Derivatives', 10th Edition, Pearson. ''' # Import required libraries import time import numpy as np from scipy import stats import matplotlib.pyplot as plt import seaborn as sns # Set random seed np.random.seed(seed) ''' Time the whole path-generating process, using a tic-toc method familiar to MATLAB users ''' tic = time.time() # Calculate the length of the time step Delta_t = T / Nsteps ''' Compute mean and variance of a standard lognormal distribution from user defined parameters a and b. The latter are useful to simulate the jump component in Monte Carlo. a and b are chosen such that log(Y(j)) ~ N(a, b**2). This implies that the mean and variance of the multiplicative jumps will be: * mean_Y = np.exp(a + 0.5*(b**2)) * variance_Y = np.exp(2*a + b**2) * (np.exp(b**2)-1) ''' mean_Y = np.exp(a + 0.5 * (b**2)) variance_Y = np.exp(2 * a + b**2) * (np.exp(b**2) - 1) ''' Calculate the theoretical drift (M) and volatility (V) of the stock price process under Merton's jump diffusion model. These values can be used to monitor the rate of convergence of Monte Carlo estimates as the number of simulated experiments increases, and can help spot errors, if any, in implementing the model. 
''' M = S * np.exp(mu * T + Lambda * T * (mean_Y - 1)) V = S**2 * (np.exp((2*mu + sigma**2)*T \ + Lambda*T*(variance_Y + mean_Y**2 - 1)) \ - np.exp(2*mu*T + 2*Lambda*T*(mean_Y - 1))) ''' Generate an Nsim x (Nsteps+1) array of zeros to preallocate the simulated paths of the Monte Carlo simulation. Each row of the matrix represents a full, possible path for the stock, each column all values of the asset at a particular instant in time. ''' simulated_paths = np.zeros([Nsim, Nsteps + 1]) # Replace the first column of the array with the vector of initial price S simulated_paths[:, 0] = S ''' To account for the multiple sources of uncertainty in the jump diffusion process, generate three arrays of random variables. - The first one is related to the standard Brownian motion, the component epsilon(0,1) in epsilon(0,1) * np.sqrt(dt); - The second and third ones model the jump, a compound Poisson process: the former (a Poisson process with intensity Lambda) causes the asset price to jump randomly (random timing); the latter (a Gaussian variable) defines both the direction (sign) and intensity (magnitude) of the jump. ''' Z_1 = np.random.normal(size=[Nsim, Nsteps]) Z_2 = np.random.normal(size=[Nsim, Nsteps]) Poisson = np.random.poisson(Lambda * Delta_t, [Nsim, Nsteps]) # Populate the matrix with Nsim randomly generated paths of length Nsteps for i in range(Nsteps): simulated_paths[:,i+1] = simulated_paths[:,i]*np.exp((mu - sigma**2/2)*Delta_t + sigma*np.sqrt(Delta_t) \ * Z_1[:,i] + a*Poisson[:,i] \ + np.sqrt(b**2) * np.sqrt(Poisson[:,i]) \ * Z_2[:,i]) # Single out array of simulated prices at maturity T final_prices = simulated_paths[:, -1] # Compute mean, variance, standard deviation, skewness, excess kurtosis mean_jump = np.mean(final_prices) var_jump = np.var(final_prices) std_jump = np.std(final_prices) skew_jump = stats.skew(final_prices) kurt_jump = stats.kurtosis(final_prices) # Calculate confidence interval for the mean ci_low = mean_jump - std_jump / np.sqrt(Nsim) * stats.norm.ppf(1 - 0.5 * alpha) ci_high = mean_jump + std_jump / np.sqrt(Nsim) * stats.norm.ppf(1 - 0.5 * alpha) # Print statistics, align results print("Merton's Jump Diffusion Model") print('-----------------------------') print('Theoretical Moments') print('-----------------------------') print('Mean (M){:>21.4f}'.format(M)) print('Variance (V){:>17.4f}'.format(V)) print('\nMonte Carlo Estimates') print('-----------------------------') print('Mean {:>24.4f}'.format(mean_jump)) print('Variance {:>20.4f}'.format(var_jump)) print('Standard deviation {:>10.4f}'.format(std_jump)) print('Skewness {:>20.4f}'.format(skew_jump)) print('Excess kurtosis {:>13.4f}'.format(kurt_jump)) print('\nConfidence interval, Mean') print('-----------------------------') print('Alpha {:>23.2f}'.format(alpha)) print('Lower bound {:>17.4f}'.format(ci_low)) print('Upper bound {:>17.4f}'.format(ci_high)) # Choose palette, figure size, and define figure axes sns.set(palette='viridis') plt.figure(figsize=(10, 8)) ax = plt.axes() # Generate t, the time variable on the abscissae t = np.linspace(0, T, Nsteps + 1) * Nsteps # Plot the Monte Carlo simulated stock price paths jump_diffusion = ax.plot(t, simulated_paths.transpose()) # Make drawn paths thinner by decreasing line width plt.setp(jump_diffusion, linewidth=1) # Set title (LaTeX notation) and x- and y- labels ax.set(title="Monte Carlo simulated stock price paths in Merton's jump \ diffusion model\n$S_0$ = {}, $\mu$ = {}, $\sigma$ = {}, $a$ = {}, $b$ = {}, \ $\lambda$ = {}, $T$ = {}, Nsteps = {}, Nsim = 
{}"\ .format(S, mu, sigma, a, b, Lambda, T, Nsteps, Nsim), \ xlabel='Time (days)', ylabel='Stock price') # Display figure in a Python environment plt.show() # Time and print the elapsed time toc = time.time() elapsed_time = toc - tic print('Total running time: {:.2f} ms'.format(elapsed_time * 1000))
# print(s)
np.all(s > 1)
np.all(s < 1)

# normalise the histogram so the bar areas sum to 1
count, bins, ignored = plt.hist(s, 15, density=True)
print(count)
print(bins)
print(ignored)
plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
plt.show()

x = np.random.normal(0.75)

mu, sigma = 0, 0.1  # mean and standard deviation
n = np.random.normal(mu, sigma, 1000)

a = np.random.normal(0.75, size=1000)
np.std(a)
stats.kurtosis(a)
stats.skew(a)

# Chi-squared distributions
chi_squared = np.random.chisquare(2, size=1000)
stats.skew(chi_squared)
chi_squared = np.random.chisquare(6, size=1000)
stats.skew(chi_squared)

pyplot.pie([1, 2, 3])
pyplot.show()

df = pd.read_csv(r'C:\Users\Ashish\Desktop\Test\Grades\Grades.csv')
len(df)
df.head(5)
early = df[df['assignment1_submission'] <= '2015-12-31']
late = df[df['assignment1_submission'] > '2015-12-31']
early['assignment1_grade'].mean()
plt.show()

# This plot is tall. It is leptokurtic.
# Most students performed similarly.
plt.hist(test_scores_lepto)
plt.show()

# The height of this plot is neither short nor tall. It is mesokurtic.
plt.hist(test_scores_meso)
plt.show()

# We can measure kurtosis with the kurtosis function.
# Negative values indicate platykurtic distributions, positive values indicate
# leptokurtic distributions, and values near 0 are mesokurtic.
from scipy.stats import kurtosis

kurt_platy = kurtosis(test_scores_platy)
kurt_lepto = kurtosis(test_scores_lepto)
kurt_meso = kurtosis(test_scores_meso)

## 10. Modality ##

import matplotlib.pyplot as plt

# This plot has one mode. It is unimodal.
plt.hist(test_scores_uni)
plt.show()

# This plot has two peaks. It is bimodal.
# This could happen if one group of students learned the material and another
# learned something else, for example.
plt.hist(test_scores_bi)
plt.show()
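# Hedged sketch with synthetic samples (not the course's test_scores_* arrays) showing the
# three regimes: a uniform sample is platykurtic (negative), a normal sample is mesokurtic
# (near 0) and a Laplace sample is leptokurtic (positive). scipy reports excess kurtosis.
import numpy as np
from scipy.stats import kurtosis

rng = np.random.default_rng(0)
print(kurtosis(rng.uniform(0, 100, 10000)))   # roughly -1.2
print(kurtosis(rng.normal(50, 10, 10000)))    # roughly 0
print(kurtosis(rng.laplace(50, 10, 10000)))   # roughly 3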
def run(self): self.train_meta_data = TransactionMetadata() self.train_meta_data.setFromDict( self.transaction.persistent_model_metadata.train_metadata) header = self.transaction.input_data.columns origData = {} for column in header: origData[column] = [] empty_count = {} column_count = {} # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error population_size = len(self.transaction.input_data.data_array) sample_size = int( sampleSize(population_size=population_size, margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR, confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL)) # get the indexes of randomly selected rows given the population size input_data_sample_indexes = random.sample(range(population_size), sample_size) self.logging.info( 'population_size={population_size}, sample_size={sample_size} {percent:.2f}%' .format(population_size=population_size, sample_size=sample_size, percent=(sample_size / population_size) * 100)) for sample_i in input_data_sample_indexes: row = self.transaction.input_data.data_array[sample_i] for i, val in enumerate(row): column = header[i] value = self.cast(val) if not column in empty_count: empty_count[column] = 0 column_count[column] = 0 if value == None: empty_count[column] += 1 else: origData[column].append(value) column_count[column] += 1 stats = {} for i, col_name in enumerate(origData): col_data = origData[col_name] # all rows in just one column data_type = self.getColumnDataType(col_data) # NOTE: Enable this if you want to assume that some numeric values can be text # We noticed that by default this should not be the behavior # TODO: Evaluate if we want to specify the problem type on predict statement as regression or classification # # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC: # unique_count = len(set(col_data)) # if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN: # data_type = DATA_TYPES.TEXT if data_type == DATA_TYPES.DATE: for i, element in enumerate(col_data): if str(element) in [ str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA' ]: col_data[i] = None else: try: col_data[i] = int(parseDate(element).timestamp()) except: logging.warning( 'Could not convert string to date and it was expected, current value {value}' .format(value=element)) col_data[i] = None if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE: newData = [] for value in col_data: if value != '' and value != '\r' and value != '\n': newData.append(value) col_data = [ float(i) for i in newData if str(i) not in [ '', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA' ] ] y, x = np.histogram(col_data, 50, density=False) x = (x + np.roll(x, -1))[:-1] / 2.0 x = x.tolist() y = y.tolist() xp = [] if len(col_data) > 0: max_value = max(col_data) min_value = min(col_data) mean = np.mean(col_data) median = np.median(col_data) var = np.var(col_data) skew = st.skew(col_data) kurtosis = st.kurtosis(col_data) inc_rate = 0.05 initial_step_size = abs(max_value - min_value) / 100 xp += [min_value] i = min_value + initial_step_size while i < max_value: xp += [i] i_inc = abs(i - min_value) * inc_rate i = i + i_inc # TODO: Solve inc_rate for N # min*inx_rate + (min+min*inc_rate)*inc_rate + (min+(min+min*inc_rate)*inc_rate)*inc_rate .... 
# # x_0 = 0 # x_i = (min+x_(i-1)) * inc_rate = min*inc_rate + x_(i-1)*inc_rate # # sum of x_i_{i=1}^n (x_i) = max_value = inc_rate ( n * min + sum(x_(i-1)) ) # # mx_value/inc_rate = n*min + inc_rate ( n * min + sum(x_(i-2)) ) # # mx_value = n*min*in_rate + inc_rate^2*n*min + inc_rate^2*sum(x_(i-2)) # = n*min(inc_rate+inc_rate^2) + inc_rate^2*sum(x_(i-2)) # = n*min(inc_rate+inc_rate^2) + inc_rate^2*(inc_rate ( n * min + sum(x_(i-3)) )) # = n*min(sum_(i=1)^(i=n)(inc_rate^i)) # => sum_(i=1)^(i=n)(inc_rate^i)) = max_value/(n*min(sum_(i=1)^(i=n)) # # # i + i*x else: max_value = 0 min_value = 0 mean = 0 median = 0 var = 0 skew = 0 kurtosis = 0 xp = [] is_float = True if max( [1 if int(i) != i else 0 for i in col_data]) == 1 else False col_stats = { "column": col_name, KEYS.DATA_TYPE: data_type, # "distribution": best_fit_name, # "distributionParams": distribution_params, "mean": mean, "median": median, "variance": var, "skewness": skew, "kurtosis": kurtosis, "emptyColumns": empty_count[col_name], "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100, "max": max_value, "min": min_value, "is_float": is_float, "histogram": { "x": x, "y": y }, "percentage_buckets": xp } stats[col_name] = col_stats # else if its text else: # see if its a sentence or a word is_full_text = True if data_type == DATA_TYPES.FULL_TEXT else False dictionary, histogram = self.getWordsDictionary( col_data, is_full_text) # if no words, then no dictionary if len(col_data) == 0: dictionary_available = False dictionary_lenght_percentage = 0 dictionary = [] else: dictionary_available = True dictionary_lenght_percentage = len(dictionary) / len( col_data) * 100 # if the number of uniques is too large then treat is a text if dictionary_lenght_percentage > 10 and len( col_data) > 50 and is_full_text == False: dictionary = [] dictionary_available = False col_stats = { "column": col_name, KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type, "dictionary": dictionary, "dictionaryAvailable": dictionary_available, "dictionaryLenghtPercentage": dictionary_lenght_percentage, "emptyColumns": empty_count[col_name], "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100, "histogram": histogram } stats[col_name] = col_stats total_rows = len(self.transaction.input_data.data_array) test_rows = len(self.transaction.input_data.test_indexes) validation_rows = len(self.transaction.input_data.validation_indexes) train_rows = len(self.transaction.input_data.train_indexes) self.transaction.persistent_model_metadata.column_stats = stats self.transaction.persistent_model_metadata.total_row_count = total_rows self.transaction.persistent_model_metadata.test_row_count = test_rows self.transaction.persistent_model_metadata.train_row_count = train_rows self.transaction.persistent_model_metadata.validation_row_count = validation_rows self.transaction.persistent_model_metadata.update() return stats
index_1 = {}

# loop over the image paths
for imagePath in imagePaths:
    # load the image and extract the filename
    image = cv2.imread(imagePath)
    filename = imagePath[imagePath.rfind("\\") + 1:]
    print(filename)  # dinos/trex_01.png

    # extract the mean and standard deviation from each channel of the
    # BGR image, then update the index with the feature vector
    (means, stds) = cv2.meanStdDev(image)
    features = np.concatenate([means, stds]).flatten()
    index[filename] = features

    # skewness and kurtosis of the image, computed along axis 0
    # (a per-channel variant of these shape features is sketched after this snippet)
    kurtosis_scipy = stats.kurtosis(image)
    skew_scipy = stats.skew(image)
    features1 = np.concatenate([skew_scipy, kurtosis_scipy]).flatten()
    index_1[filename] = features1

print(index["trex_01.png"])
print(index_1["trex_01.png"])

# display the query image and grab the sorted keys of the index dictionary
query = cv2.imread(imagePaths[0])
cv2.imshow("Query (trex_01.png)", query)
keys = sorted(index.keys())

# loop over the filenames in the dictionary
for (i, k) in enumerate(keys):
    # if this is the query image, ignore it
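# Hedged variant (assumes numpy as np and scipy.stats as stats, as above): stats.kurtosis(image)
# reduces along axis 0, i.e. down the image rows, giving one value per (column, channel) pair.
# Flattening each channel first yields a compact three-values-per-statistic colour descriptor.
def channel_shape_features(image):
    # reshape to (n_pixels, 3) so axis 0 runs over all pixels within each BGR channel
    pixels = image.reshape(-1, 3).astype(np.float64)
    return np.concatenate([stats.skew(pixels, axis=0),
                           stats.kurtosis(pixels, axis=0)])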
def run(self, input_data, modify_light_metadata, hmd=None, print_logs=True): """ # Runs the stats generation phase # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column # A lot of information about the data distribution and quality will also be logged to the server in this phase """ no_processes = multiprocessing.cpu_count() - 2 if no_processes < 1: no_processes = 1 pool = multiprocessing.Pool(processes=no_processes) if print_logs == False: self.log = logging.getLogger('null-logger') self.log.propagate = False # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error population_size = len(input_data.data_frame) if population_size < 50: sample_size = population_size else: sample_size = int(calculate_sample_size(population_size=population_size, margin_error=self.transaction.lmd['sample_margin_of_error'], confidence_level=self.transaction.lmd['sample_confidence_level'])) #if sample_size > 3000 and sample_size > population_size/8: # sample_size = min(round(population_size/8),3000) # get the indexes of randomly selected rows given the population size input_data_sample_indexes = random.sample(range(population_size), sample_size) self.log.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size/population_size)*100)) all_sampled_data = input_data.data_frame.iloc[input_data_sample_indexes] stats = {} col_data_dict = {} for col_name in all_sampled_data.columns.values: col_data = all_sampled_data[col_name].dropna() full_col_data = all_sampled_data[col_name] data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type(col_data, input_data.data_frame, col_name) if column_status == 'Column empty': if modify_light_metadata: self.transaction.lmd['malformed_columns']['names'].append(col_name) self.transaction.lmd['malformed_columns']['indices'].append(i) continue new_col_data = [] if curr_data_subtype == DATA_SUBTYPES.TIMESTAMP: #data_type == DATA_TYPES.DATE: for element in col_data: if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']: new_col_data.append(None) else: try: new_col_data.append(int(parse_datetime(element).timestamp())) except: self.log.warning(f'Could not convert string from col "{col_name}" to date and it was expected, instead got: {element}') new_col_data.append(None) col_data = new_col_data if data_type == DATA_TYPES.NUMERIC or curr_data_subtype == DATA_SUBTYPES.TIMESTAMP: histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype) x = histogram['x'] y = histogram['y'] col_data = StatsGenerator.clean_int_and_date_data(col_data) # This means the column is all nulls, which we don't handle at the moment if len(col_data) < 1: return None xp = [] if len(col_data) > 0: max_value = max(col_data) min_value = min(col_data) mean = np.mean(col_data) median = np.median(col_data) var = np.var(col_data) skew = st.skew(col_data) kurtosis = st.kurtosis(col_data) inc_rate = 0.1 initial_step_size = abs(max_value-min_value)/100 xp += [min_value] i = min_value + initial_step_size while i < max_value: xp += [i] i_inc = abs(i-min_value)*inc_rate i = i + i_inc else: max_value = 0 min_value = 0 mean = 0 median = 0 var = 0 skew = 0 kurtosis = 0 xp = [] is_float = True if max([1 if int(i) != i else 0 for i in col_data]) == 1 
else False col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "mean": mean, "median": median, "variance": var, "skewness": skew, "kurtosis": kurtosis, "max": max_value, "min": min_value, "is_float": is_float, "histogram": { "x": x, "y": y }, "percentage_buckets": xp } elif data_type == DATA_TYPES.CATEGORICAL or curr_data_subtype == DATA_SUBTYPES.DATE: histogram, _ = StatsGenerator.get_histogram(input_data.data_frame[col_name], data_type=data_type, data_subtype=curr_data_subtype) col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "histogram": histogram, "percentage_buckets": histogram['x'] } elif curr_data_subtype == DATA_SUBTYPES.IMAGE: histogram, percentage_buckets = StatsGenerator.get_histogram(col_data, data_subtype=curr_data_subtype) col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, 'percentage_buckets': percentage_buckets, 'histogram': histogram } # @TODO This is probably wrong, look into it a bit later else: # see if its a sentence or a word histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype) dictionary = list(histogram.keys()) # if no words, then no dictionary if len(col_data) == 0: dictionary_available = False dictionary_lenght_percentage = 0 dictionary = [] else: dictionary_available = True dictionary_lenght_percentage = len( dictionary) / len(col_data) * 100 # if the number of uniques is too large then treat is a text is_full_text = True if curr_data_subtype == DATA_SUBTYPES.TEXT else False if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text==False: dictionary = [] dictionary_available = False col_stats = { 'data_type': data_type, 'data_subtype': curr_data_subtype, "dictionary": dictionary, "dictionaryAvailable": dictionary_available, "dictionaryLenghtPercentage": dictionary_lenght_percentage, "histogram": histogram } stats[col_name] = col_stats stats[col_name]['data_type_dist'] = data_type_dist stats[col_name]['data_subtype_dist'] = data_subtype_dist stats[col_name]['column'] = col_name empty_count = len(full_col_data) - len(col_data) stats[col_name]['empty_cells'] = empty_count stats[col_name]['empty_percentage'] = empty_count * 100 / len(full_col_data) if 'separator' in additional_info: stats[col_name]['separator'] = additional_info['separator'] col_data_dict[col_name] = col_data for col_name in all_sampled_data.columns: if col_name in self.transaction.lmd['malformed_columns']['names']: continue # Use the multiprocessing pool for computing scores which take a very long time to compute # For now there's only one and computing it takes way too long, so this is not enabled scores = [] ''' scores.append(pool.apply_async(compute_clf_based_correlation_score, args=(stats, all_sampled_data, col_name))) ''' for score_promise in scores: # Wait for function on process to finish running score = score_promise.get() stats[col_name].update(score) for score_func in [compute_duplicates_score, compute_empty_cells_score, compute_data_type_dist_score, compute_z_score, compute_lof_score, compute_similariy_score, compute_value_distribution_score]: start_time = time.time() if 'compute_z_score' in str(score_func) or 'compute_lof_score' in str(score_func): stats[col_name].update(score_func(stats, col_data_dict, col_name)) else: stats[col_name].update(score_func(stats, all_sampled_data, col_name)) fun_name = str(score_func) run_duration = round(time.time() - start_time, 2) #print(f'Running scoring function "{run_duration}" took {run_duration} seconds !') 
stats[col_name].update(compute_consistency_score(stats, col_name)) stats[col_name].update(compute_redundancy_score(stats, col_name)) stats[col_name].update(compute_variability_score(stats, col_name)) stats[col_name].update(compute_data_quality_score(stats, col_name)) total_rows = len(input_data.data_frame) if modify_light_metadata: self.transaction.lmd['column_stats'] = stats self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error'] self.transaction.lmd['data_preparation']['total_row_count'] = total_rows self.transaction.lmd['data_preparation']['used_row_count'] = sample_size self.transaction.lmd['data_preparation']['test_row_count'] = len(input_data.test_indexes[KEY_NO_GROUP_BY]) self.transaction.lmd['data_preparation']['train_row_count'] = len(input_data.train_indexes[KEY_NO_GROUP_BY]) self.transaction.lmd['data_preparation']['validation_row_count'] = len(input_data.validation_indexes[KEY_NO_GROUP_BY]) pool.close() pool.join() self._log_interesting_stats(stats) return stats
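# Illustrative stand-alone sketch (not the project's actual API) of the per-column numeric
# statistics the phase above collects: central tendency, spread, skewness/kurtosis and a
# 50-bin histogram with bin centres, for one hypothetical pandas column.
import numpy as np
import pandas as pd
from scipy import stats as st

col = pd.Series(np.random.default_rng(1).lognormal(0.0, 0.5, size=500)).dropna()
y, edges = np.histogram(col, 50, density=False)
col_stats = {
    'mean': np.mean(col), 'median': np.median(col), 'variance': np.var(col),
    'skewness': st.skew(col), 'kurtosis': st.kurtosis(col),
    'max': col.max(), 'min': col.min(),
    'histogram': {'x': ((edges + np.roll(edges, -1))[:-1] / 2.0).tolist(),
                  'y': y.tolist()},
}
print({k: round(col_stats[k], 4) for k in ('mean', 'variance', 'skewness', 'kurtosis')})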
def dict_learning(X, dictionary=None, P_cum=None, eta=0.02, n_dictionary=2, l0_sparseness=10, fit_tol=None, n_iter=100, eta_homeo=0.01, alpha_homeo=0.02, batch_size=100, record_each=0, record_num_batches = 1000, verbose=False, method='mp', C=0., nb_quant=100, do_sym=True, random_state=None): """ Solves a dictionary learning matrix factorization problem online. Finds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: Solves the optimization problem:: (U^*, V^*) = argmin_{(U,V)} 0.5 || X - V^T * U ||_2^2 + alpha * S( U ) + alpha_homeo * H(V) s. t. || U ||_0 = k where S is a sparse representation cost, and H a homeostatic representation cost. where V is the dictionary and U is the sparse code. This is accomplished by repeatedly iterating over mini-batches by slicing the input data. For instance, H(V) = \sum_{0 <= k < n_dictionary} (|| V_k ||_2^2 -1)^2 Parameters ---------- X: array of shape (n_samples, n_pixels) Data matrix. n_dictionary : int, Number of dictionary atoms to extract. eta : float Gives the learning parameter for the homeostatic gain. n_iter : int, total number of iterations to perform eta_homeo : float Gives the learning parameter for the homeostatic gain. alpha_homeo : float Gives the smoothing exponent for the homeostatic gain If equal to 1 the homeostatic learning rule learns a linear relation to variance. If equal to zero, we use COMP nb_quant : int, number of bins for the quantification used in the homeostasis C : float characteristic scale for the quantization. Use C=0. to have an adaptive scaling. dictionary : array of shape (n_dictionary, n_pixels), initial value of the dictionary for warm restart scenarios fit_algorithm : {'mp', 'omp', 'comp', 'lars', 'cd'} see sparse_encode batch_size : int, The number of samples to take in each batch. l0_sparseness : int, ``0.1 * n_pixels`` by default Number of nonzero coefficients to target in each column of the solution. This is only used by `algorithm='lars'`, `algorithm='mp'` and `algorithm='omp'`. fit_tol : float, 1. by default If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `fit_tol` is the penalty applied to the L1 norm. If `algorithm='threshold'`, `fit_tol` is the absolute value of the threshold below which coefficients will be squashed to zero. If `algorithm='mp'` or `algorithm='omp'`, `fit_tol` is the tolerance parameter: the value of the reconstruction error targeted. In this case, it overrides `l0_sparseness`. record_each : if set to 0, it does nothing. Else it records every record_each step the statistics during the learning phase (variance and kurtosis of coefficients). 
record_num_batches : number of batches used to make statistics (if -1, uses the whole training set) verbose : degree of verbosity of the printed output Returns ------- dictionary : array of shape (n_dictionary, n_pixels), the solutions to the dictionary learning problem """ if record_each>0: import pandas as pd record = pd.DataFrame() if n_dictionary is None: n_dictionary = X.shape[1] t0 = time.time() n_samples, n_pixels = X.shape if dictionary is None: dictionary = np.random.randn(n_dictionary, n_pixels) norm = np.sqrt(np.sum(dictionary**2, axis=1)) dictionary /= norm[:, np.newaxis] norm = np.sqrt(np.sum(dictionary**2, axis=1)) if verbose == 1: print('[dict_learning]', end=' ') # print(alpha_homeo, eta_homeo, alpha_homeo==0, eta_homeo==0, alpha_homeo==0 or eta_homeo==0, 'P_cum', P_cum) # splits the whole dataset into batches n_batches = n_samples // batch_size X_train = X.copy() np.random.shuffle(X_train) batches = np.array_split(X_train, n_batches) if alpha_homeo==0: # do the equalitarian homeostasis if P_cum is None: P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((n_dictionary, 1)) if C == 0.: # initialize the rescaling vector from shl_scripts.shl_encode import get_rescaling corr = (batches[0] @ dictionary.T) C_vec = get_rescaling(corr, nb_quant=nb_quant, do_sym=do_sym, verbose=verbose) # and stack it to P_cum array for convenience P_cum = np.vstack((P_cum, C_vec)) else: # do the classical homeostasis gain = np.ones(n_dictionary) mean_var = np.ones(n_dictionary) P_cum = None import itertools # Return elements from list of batches until it is exhausted. Then repeat the sequence indefinitely. batches = itertools.cycle(batches) # cycle over all batches for ii, this_X in zip(range(n_iter), batches): dt = (time.time() - t0) if verbose > 0: if ii % int(n_iter//verbose + 1) == 0: print ("Iteration % 3i / % 3i (elapsed time: % 3is, % 4.1fmn)" % (ii, n_iter, dt, dt//60)) # Sparse coding sparse_code = sparse_encode(this_X, dictionary, algorithm=method, fit_tol=fit_tol, P_cum=P_cum, C=C, do_sym=do_sym, l0_sparseness=l0_sparseness) # Update dictionary residual = this_X - sparse_code @ dictionary residual /= n_dictionary # divide by the number of features dictionary += eta * sparse_code.T @ residual # homeostasis norm = np.sqrt(np.sum(dictionary**2, axis=1)).T dictionary /= norm[:, np.newaxis] if eta_homeo>0.: if P_cum is None: # Update and apply gain mean_var = update_gain(mean_var, sparse_code, eta_homeo, verbose=verbose) gain = mean_var**alpha_homeo gain /= gain.mean() dictionary /= gain[:, np.newaxis] else: if C==0.: corr = (this_X @ dictionary.T) C_vec = get_rescaling(corr, nb_quant=nb_quant, do_sym=do_sym, verbose=verbose) P_cum[-1, :]= (1 - eta_homeo) * P_cum[-1, :] + eta_homeo * C_vec P_cum[:-1, :] = update_P_cum(P_cum=P_cum[:-1, :], code=sparse_code, eta_homeo=eta_homeo, C=P_cum[-1, :], nb_quant=nb_quant, do_sym=do_sym, verbose=verbose) else: P_cum = update_P_cum(P_cum, sparse_code, eta_homeo, nb_quant=nb_quant, verbose=verbose, C=C, do_sym=do_sym) if record_each>0: if ii % int(record_each) == 0: from scipy.stats import kurtosis indx = np.random.permutation(X_train.shape[0])[:record_num_batches] sparse_code_rec = sparse_encode(X_train[indx, :], dictionary, algorithm=method, fit_tol=fit_tol, P_cum=P_cum, do_sym=do_sym, C=C, l0_sparseness=l0_sparseness) # calculation of relative entropy p = np.count_nonzero(sparse_code_rec,axis=0)/ (sparse_code_rec.shape[1]) p /= p.sum() rel_ent = np.sum(-p * np.log(p)) / np.log(sparse_code_rec.shape[1]) error = 
np.linalg.norm(X_train[indx, :] - sparse_code_rec @ dictionary)/record_num_batches record_one = pd.DataFrame([{'kurt':kurtosis(sparse_code_rec, axis=0), 'prob_active':np.mean(np.abs(sparse_code_rec)>0, axis=0), 'var':np.mean(sparse_code_rec**2, axis=0), 'error':error, 'entropy':rel_ent}], index=[ii]) record = pd.concat([record, record_one]) if verbose > 1: print('Learning code...', end=' ') elif verbose == 1: print('|', end=' ') if verbose > 1: dt = (time.time() - t0) print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60)) if record_each==0: return dictionary, P_cum else: return dictionary, P_cum, record
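# Hedged illustration of why the learning loop above records the kurtosis of the coefficients:
# sparse, heavy-tailed codes have large excess kurtosis, while dense Gaussian codes sit near 0.
# Synthetic coefficients only; no dictionary or sparse_encode call here.
import numpy as np
from scipy.stats import kurtosis

rng = np.random.default_rng(0)
dense_code = rng.normal(size=10000)
sparse_code = dense_code * (rng.random(10000) < 0.05)  # ~95% of coefficients exactly zero
print(kurtosis(dense_code))    # close to 0
print(kurtosis(sparse_code))   # large and positive (heavier tails than a Gaussian)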
from scipy.stats import kurtosis if __name__ == "__main__": a = [ -6, -6, -34, -10, -2, 6, 26, 6, -6, 22, 2, 26, -26, 10, 10, 10, -2, 30, -14, -14, 10, -22, -2, -14, 6, -6, -14, -10, -10, 6, -10, 2, 14, -10, -34, 2, -18, -6, -2, 22, -30, 18, 10, -22, -2, 18, -34, 6, 10, 10, 18, -22, 2, 2, 6, 6, 14, 6, -14, 2, 14, -2, 10, 22, 22, -2, 10, -14, 6, 6, 6, 10, 14, -14, 10, 30, -42, 10, 2, 6, -22, 22, -14, 2, 30, 2, -18, 2, 26, -6, -18, 30, 18, -2, 2, 10, -10, 2, 10, -6, 26, -38, -10, -2, -18, 38, 2, 30, -14, -18, -26, -10, 6, 2, 14, -6, 2, -18, -18, -2, 2, 14, 10, -14, 10, -34, -22, 2, 18, -14, 42, -18, -14, -10, 6, 14, 10, 14, 18, 14, -2, 18, 38, 6, 22, -18, -10, 2, 6, 6, 14, 2, -18, -14, 2, -18, 14, -6, 26, 2, -10, -14, 2, -6, 10, 18, -30, -10, -26, -2, -6, 14, 10, -14, 6, -14, -14, 6, -18, -30, 42, -6, -6, -6, -10, -2, 18, 14, -14, -2, 14, -30, 2, 14, 10, 2, 14, 2, -6, -14, -6, 6, -2, -14, 22, -10, -6, 14, -10, -18, 18, -2, -6, 18, 2, 26, 2, 14, -10, -2, -2, -2, 14, 30, -2, 2, -18, 6, -18, -14, -18, 10, 18, -2, -30, 14, -10, 6, 2, 2, -2, 10, -34, -14, -18, 22, -10, -10, 2, -10, 26, -2, 2, -18, -14, 26, -6 ] print(kurtosis(a))
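# Note on conventions (assumes the list `a` and the kurtosis import above): scipy returns
# excess (Fisher) kurtosis by default, while fisher=False gives the Pearson definition,
# which is exactly 3 larger.
print(kurtosis(a, fisher=False))        # Pearson kurtosis
print(kurtosis(a, fisher=False) - 3)    # equals kurtosis(a)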
sal_churn.shape

# Measures of Central Tendency
np.mean(sal_churn)
np.median(sal_churn.Salary_hike)
np.median(sal_churn.Churn_out_rate)

# Measures of Dispersion
np.var(sal_churn)
np.std(sal_churn)

# Skewness and Kurtosis
skew(sal_churn.Salary_hike)
skew(sal_churn.Churn_out_rate)
kurtosis(sal_churn.Salary_hike)
kurtosis(sal_churn.Churn_out_rate)

x = np.array(sal_churn.Salary_hike)
y = np.array(sal_churn.Churn_out_rate)

# Normal Q-Q plot
plt.plot(sal_churn.Salary_hike)
plt.plot(sal_churn.Churn_out_rate)
plt.plot(sal_churn)
plt.legend(['Salary_hike', 'Churn_out_rate'])
stats.probplot(x, dist='norm', plot=pylab)
stats.probplot(y, dist='norm', plot=pylab)