def test_unbiased_HMM(precision=2):
    n_rv, n_sample = 10, 100
    n_scenario = 500
    data = np.random.rand(n_rv, n_sample)

    # original statistics
    tgt_moments = np.zeros((n_rv, 4))
    tgt_moments[:, 0] = data.mean(axis=1)
    tgt_moments[:, 1] = data.std(axis=1, ddof=1)
    tgt_moments[:, 2] = spstats.skew(data, axis=1, bias=False)
    tgt_moments[:, 3] = spstats.kurtosis(data, axis=1, bias=False)
    tgt_corrs = np.corrcoef(data)

    t0 = time()
    py_scenarios = HMM(tgt_moments, tgt_corrs, n_scenario, bias=False)
    print("python unbiased HMM (n_rv, n_scenario):({}, {}) {:.4f} secs".format(
        n_rv, n_scenario, time() - t0))

    t1 = time()
    c_scenarios = c_HMM(tgt_moments, tgt_corrs, n_scenario, bias=False)
    print("c unbiased HMM (n_rv, n_scenario):({}, {}) {:.4f} secs".format(
        n_rv, n_scenario, time() - t1))

    for scenarios in (py_scenarios, c_scenarios):
        # scenario statistics
        res_moments = np.zeros((n_rv, 4))
        res_moments[:, 0] = scenarios.mean(axis=1)
        res_moments[:, 1] = scenarios.std(axis=1, ddof=1)
        res_moments[:, 2] = spstats.skew(scenarios, axis=1, bias=False)
        res_moments[:, 3] = spstats.kurtosis(scenarios, axis=1, bias=False)
        res_corrs = np.corrcoef(scenarios)

        np.testing.assert_array_almost_equal(tgt_moments, res_moments, precision)
        np.testing.assert_array_almost_equal(tgt_corrs, res_corrs, precision)
def main():
    parser = argparse.ArgumentParser(
        description='Simulate and plot graphs of human turnover steps.')
    parser.add_argument('step', type=int, help='number of steps to simulate')
    parser.add_argument('p', type=float, help='model parameter p')
    parser.add_argument('K', type=float, help='model parameter K')
    parser.add_argument('stage', type=str, help='wake or sleep')
    parser.add_argument('--stepchart', default=False, action="store_true",
                        help='record and plot the step chart')
    args = parser.parse_args()

    sim = TurnoverModel(args.step, args.p, args.K, args.stage,
                        record_step_chart=args.stepchart)
    sim.save_fluctuation()
    sim.save_interval_angle_dist()
    sim.save_interval_ccdf()
    if args.stepchart:
        sim.save_step_chart()

    print('===== turnover intervals =====')
    print(sim.turnover_intervals)
    print('===== turnover times,intervals,angles =====')
    print(list(zip(sim.turnover_times, sim.turnover_intervals, sim.turnover_angles)))
    print('===== alpha s,l =====')
    print(sim.calced_alpha_s.get(), sim.calced_alpha_l.get())
    print('===== skew of log10(tau) =====')
    print(skew([log10(i) for i in sim.turnover_intervals]))
def iskew(i):
    print(i)
    ikmap_NL = kmapNL(i)
    ikmap_NOISY = kmapNOISY(i)
    skewness_NL = [skew(WLanalysis.smooth(ikmap_NL, ismooth).flatten())
                   for ismooth in sigmaG_arr * PPA_NL]
    skewness_NOISY = [skew(WLanalysis.smooth(ikmap_NOISY, ismooth).flatten())
                      for ismooth in sigmaG_arr * PPA_NOISY]
    return [skewness_NL, skewness_NOISY]
def smerodatna_odchylka(data, min=0, max=0, plot=True):
    #
    # Computes the standard deviation. If min and max are not set, the whole
    # array is used; otherwise only values between min and max are considered.
    #
    # in 'data' - array of data
    # in 'min'  - minimum value in the array to consider
    # in 'max'  - maximum value in the array to consider
    # in 'plot' - whether to draw the plot
    #
    # out - standard deviation
    #
    data = np.array(data)
    if min == 0 and max == 0:
        average = np.mean(data)
        median = np.median(data)
        standardDeviation = np.std(data)
        kurtosis = stats.kurtosis(data)
        skewness = stats.skew(data)
    else:
        crop = np.array([])
        for x in data:
            if min < x < max:
                crop = np.append(crop, x)
        average = np.mean(crop)
        # modus = stats.mode(crop)
        # modus = statistics.mode(crop)  !!!!!
        median = np.median(crop)
        standardDeviation = np.std(crop)
        kurtosis = stats.kurtosis(crop)
        skewness = stats.skew(crop)

    if plot:
        plt.figure()
        plt.axvspan(float(min), float(max), alpha=0.3, color='k')
        plt.axvspan(average - standardDeviation, average + standardDeviation,
                    alpha=0.4, color='b')
        plt.axvspan(average + standardDeviation,
                    average + 2 * standardDeviation, alpha=0.4, color='r')
        plt.axvspan(average - standardDeviation,
                    average - 2 * standardDeviation, alpha=0.4, color='r')
        plt.axvline(x=median, linewidth=2, color='r')
        plt.axvline(x=average, linewidth=2, color='g')
        # plt.axvline(x=modus[0], linewidth=2, color='b')
        # Sturges' rule for the number of bins; hist() needs an integer
        plt.hist(data, int(1.0 + 3.3 * math.log(np.shape(data)[0])),
                 facecolor='green', alpha=0.75)
        plt.text(average, 10, "std: " + str(standardDeviation),
                 bbox={'facecolor': 'green', 'alpha': 0.75, 'pad': 10})
        plt.show(block=False)

    print("___________________________________________________________")
    print("values selected from ", float(min), " to ", float(max))
    print("mean: ", average)
    print("median: ", median)
    print("standard deviation: ", standardDeviation)
    print("kurtosis: ", kurtosis)
    print("skewness: ", skewness)

    return standardDeviation
def getLabelImFeats(lsim, center, orgim):
    """Compute object geometry features.

    Parameters
    ----------
    lsim : Segmented binary image
    center : Center coordinate (x, y) of the object
    orgim : Original image
    """
    label_img = skimage.measure.label(lsim)
    regions = regionprops(label_img)
    index = label_img[center[0], center[1]] - 1

    # direct features
    Area = regions[index].area
    CentralMoments = regions[index].moments_central
    Eccentricity = regions[index].eccentricity
    Perimeter = regions[index].perimeter
    skewx = np.mean(stats.skew(lsim, axis=0, bias=True))
    skewy = np.mean(stats.skew(lsim, axis=1, bias=True))

    # derived features
    compact = Area / Perimeter**2
    skewness = np.sqrt(skewx**2 + skewy**2)
    cen_skew = getCentSkewness(label_img, Area, index, regions[index].centroid)
    numBranch = getRBSTim(label_img, orgim)

    return np.hstack((Area, Eccentricity, Perimeter, compact, skewness,
                      cen_skew, numBranch))
def get_feature(fname):
    b, _ = librosa.load(fname, res_type='kaiser_fast')
    try:
        mfcc = np.mean(librosa.feature.mfcc(y=b, n_mfcc=60).T, axis=0)
        mels = np.mean(librosa.feature.melspectrogram(b, sr=SAMPLE_RATE).T, axis=0)
        stft = np.abs(librosa.stft(b))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=SAMPLE_RATE).T, axis=0)
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=SAMPLE_RATE).T, axis=0)
        tonnetz = np.mean(librosa.feature.tonnetz(librosa.effects.harmonic(b), sr=SAMPLE_RATE).T, axis=0)
        ft2 = librosa.feature.zero_crossing_rate(b)[0]
        ft3 = librosa.feature.spectral_rolloff(b)[0]
        ft4 = librosa.feature.spectral_centroid(b)[0]
        ft5 = librosa.feature.spectral_contrast(b)[0]
        ft6 = librosa.feature.spectral_bandwidth(b)[0]
        ft2_trunc = np.hstack([np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.min(ft2)])
        ft3_trunc = np.hstack([np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.min(ft3)])
        ft4_trunc = np.hstack([np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.min(ft4)])
        ft5_trunc = np.hstack([np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.min(ft5)])
        ft6_trunc = np.hstack([np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.min(ft6)])
        return pd.Series(np.hstack((mfcc, mels, chroma, contrast, tonnetz,
                                    ft2_trunc, ft3_trunc, ft4_trunc,
                                    ft5_trunc, ft6_trunc)))
    except Exception:
        print('bad file')
        return pd.Series([0] * 238)
def colorMoment(im):
    """Calculates the 2nd and 3rd color moments of the input image and
    returns the values in a list."""
    # The first color moment is the mean. This is already considered as a
    # metric for the red, green, and blue channels, so it is not included
    # here. Only the 2nd and 3rd moments are calculated.
    newIm = matplotlib.colors.rgb_to_hsv(im)  # convert to HSV space

    # Pull out each channel from the image to analyze separately.
    HChannel = newIm[:, :, 0]
    SChannel = newIm[:, :, 1]
    VChannel = newIm[:, :, 2]

    # 2nd moment = standard deviation.
    Hstd = numpy.std(HChannel)
    Sstd = numpy.std(SChannel)
    Vstd = numpy.std(VChannel)

    # 3rd moment = "skewness". Calculate the skew, which gives an array,
    # then take the mean of that array to get a single value for each channel.
    Hskew = numpy.mean(skew(HChannel))
    Sskew = numpy.mean(skew(SChannel))
    Vskew = numpy.mean(skew(VChannel))

    return [Hstd, Sstd, Vstd, Hskew, Sskew, Vskew]  # return all of the metrics
def skewness_sqrt(ny_housing):
    skewness_SpLiv1 = skew(ny_housing['SalePrice'])
    skewness_grLiv1 = skew(ny_housing['GrLivArea'])
    ny_housing['SalePrice'] = np.sqrt(ny_housing['SalePrice'])
    ny_housing['GrLivArea'] = np.sqrt(ny_housing['GrLivArea'])
    skewness_SpLiv2 = skew(ny_housing['SalePrice'])
    skewness_grLiv2 = skew(ny_housing['GrLivArea'])
    return skewness_grLiv2, skewness_SpLiv2
def skewness_log(data):
    skewness_SpLiv1 = skew(data['SalePrice'])
    skewness_grLiv1 = skew(data['GrLivArea'])
    data['SalePrice'] = np.log(data['SalePrice'])
    data['GrLivArea'] = np.log(data['GrLivArea'])
    skewness_SpLiv2 = skew(data['SalePrice'])
    skewness_grLiv2 = skew(data['GrLivArea'])
    return skewness_grLiv2, skewness_SpLiv2
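# A minimal usage sketch for skewness_log above, on synthetic data. The
# lognormal draws and column names are made up for illustration, and the
# sketch assumes the snippet's own imports (numpy as np, scipy.stats.skew).
import numpy as np
import pandas as pd
from scipy.stats import skew

rng = np.random.default_rng(0)
demo = pd.DataFrame({'SalePrice': rng.lognormal(12, 0.5, 1000),
                     'GrLivArea': rng.lognormal(7, 0.4, 1000)})
print('before:', skew(demo['SalePrice']), skew(demo['GrLivArea']))
print('after: ', skewness_log(demo))  # near-zero skews after the log transform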
def single_file_featurization(wavfile):
    '''
    INPUT: row of dataframe with 'audio_slice_name' as the filename of the audio sample
    OUTPUT: feature vector for audio sample

    Function for dataframe apply for extracting each audio sample into a
    feature vector of mfcc coefficients
    '''
    # print statements to update the progress of the processing
    try:
        # load the raw audio .wav file as a matrix using librosa
        wav_mat, sr = lr.load(wavfile, sr=sample_rate)

        # create the spectrogram using the predefined variables for mfcc extraction
        S = lr.feature.melspectrogram(wav_mat, sr=sr, n_mels=n_filters,
                                      fmax=sr / 2, n_fft=window, hop_length=hop)

        # using the pre-defined spectrogram, extract the mfcc coefficients
        mfcc = lr.feature.mfcc(S=lr.logamplitude(S), n_mfcc=25)

        # calculate the first and second derivatives of the mfcc coefficients
        # to detect changes and patterns
        mfcc_delta = lr.feature.delta(mfcc)
        mfcc_delta = mfcc_delta.T
        mfcc_delta2 = lr.feature.delta(mfcc, order=2)
        mfcc_delta2 = mfcc_delta2.T
        mfcc = mfcc.T

        # combine the mfcc coefficients and their derivatives in a column stack
        total_mfcc = np.column_stack((mfcc, mfcc_delta, mfcc_delta2))

        # use the average of each column to condense into a feature vector;
        # this makes each sample uniform regardless of the length of the
        # original audio sample. The following features are extracted:
        #   - avg of mfcc, first derivative, second derivative
        #   - var of mfcc, first derivative, second derivative
        #   - max of mfcc
        #   - min of mfcc
        #   - median of mfcc
        #   - skew of mfcc
        #   - kurtosis of mfcc
        avg_mfcc = np.mean(total_mfcc, axis=0)
        var_mfcc = np.var(total_mfcc, axis=0)
        max_mfcc = np.max(mfcc, axis=0)
        min_mfcc = np.min(mfcc, axis=0)
        med_mfcc = np.median(mfcc, axis=0)
        skew_mfcc = skew(mfcc, axis=0)
        kurt_mfcc = kurtosis(mfcc, axis=0)  # was skew(); kurtosis matches the comment above

        # combine into one vector and append to the total feature matrix
        return np.concatenate((avg_mfcc, var_mfcc, max_mfcc, min_mfcc,
                               med_mfcc, skew_mfcc, kurt_mfcc))
    except Exception:
        print("Uhmmm something bad happened")
        return np.zeros(7)
def test_skew(self):
    # Using the scipy.stats definition which is optimized and unittested
    data = [[0, 1, 2, 3, 4, 45, 18, 56, 24, 56], [1, 1, 1, 1, 56, 78, 23, 23]]
    expt = []
    expt.append(stats.skew(data[0]))
    expt.append(stats.skew(data[1]))
    resulting_vals = skew(data)
    self.assertTrue(np.array_equal(np.array(expt), np.array(resulting_vals)))
def test_skew(self):
    for n in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(n)
        r = stats.skew(x)
        rm = stats.mstats.skew(xm)
        assert_almost_equal(r, rm, 10)
        r = stats.skew(y)
        rm = stats.mstats.skew(ym)
        assert_almost_equal(r, rm, 10)
def stats_plots(V, labelsin, title=None):
    """
    4 plots of basic statistical properties. IC = intraclass correlation,
    or the noise sources between the groups.
    """
    import scipy.stats as stats
    colors = ['darkkhaki', 'royalblue', 'forestgreen', 'tomato']
    var = [np.var(i) for i in V]
    skew = [stats.skew(i) for i in V]
    kurt = [stats.kurtosis(i) for i in V]
    uniq = list(set(labelsin))
    v_sort = [[] for u in uniq]   # Make a blank list, preparing for IC
    v_means = [[] for u in uniq]  # v_means is a list of lists of means for each cell of each type
    v_var, v_skew, v_kurt = [[] for u in uniq], [[] for u in uniq], [[] for u in uniq]

    for v in range(len(V)):
        i = uniq.index(labelsin[v])
        v_sort[i].append(V[v])
        v_means[i].append(np.mean(V[v]))
        v_var[i].append(np.var(V[v]))
        v_skew[i].append(stats.skew(V[v]))
        v_kurt[i].append(stats.kurtosis(V[v]))

    # ic = var_between^2 / (var_between^2 + var_within^2)
    ic = []
    for v in range(len(uniq)):
        I = np.var(v_means[v])**2 / \
            (np.var(v_means[v])**2 + sum([np.var(i) for i in v_sort[v]])**2)
        ic.append([I])
    print(ic)
    group_means = [np.mean(k) for k in v_means]  # group_means are the master means (only 4)
    master_ic = np.var(group_means)**2 / \
        (np.var(group_means)**2 + sum([np.var(i) for i in v_means])**2)
    print('Master IC for this set: %.5f' % master_ic)

    ## Plotting stuff
    fig = plt.figure()
    axs = [fig.add_subplot(221), fig.add_subplot(222),
           fig.add_subplot(223), fig.add_subplot(224)]
    titles = ['Variance', 'Skew', 'Kurtosis', 'Intraclass correlation']
    plot_vars = [v_var, v_skew, v_kurt, ic]
    for a in axs:  # For each plot
        for u in range(len(uniq)):  # For each cell type
            a.scatter(np.ones(len(plot_vars[axs.index(a)][u])) * u,
                      plot_vars[axs.index(a)][u],
                      c=colors[u], s=80, edgecolor='k', alpha=0.6)
        if axs.index(a) == 3:
            a.set_yticks([0, 0.12, 0.24])
        else:
            a.locator_params(axis='y', nbins=4)
        a.set_xticks([])
        a.set_title(titles[axs.index(a)])

    # Legend and title
    # patches = [mpatches.Patch(color=colors[u], label=uniq[u]) for u in range(len(uniq))]
    # plt.legend(handles=patches, loc=5)
    if title is not None:
        plt.suptitle(title, fontsize=20)
    plt.show()
def test_skewness(self):
    """
    sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0) /
        ((sqrt(var(testmathworks)*4/5))**3)/5
    """
    y = stats.skew(self.testmathworks)
    assert_approx_equal(y, -0.29322304336607, 10)
    y = stats.skew(self.testmathworks, bias=0)
    assert_approx_equal(y, -0.437111105023940, 10)
    y = stats.skew(self.testcase)
    assert_approx_equal(y, 0.0, 10)
def feature_skewness(svmfile):
    X, y = load_svmlight_file(svmfile, zero_based=False, query_id=False)
    m, n = X.shape
    for i in range(n):
        x = np.array(X[:, i].todense())[:, 0]
        ecdf = ECDF(x)
        s1 = skew(x)
        s2 = skew(np.log2(x + 1))
        s3 = skew(ecdf(x))
        if np.abs(s1) < np.abs(s2):
            print("%d %f -> %f or %f" % (i + 1, s1, s2, s3))
        else:
            print("[!] %d %f -> %f or %f" % (i + 1, s1, s2, s3))
def mcnoise(data, noise_std, n, noise_scaling=1.):
    """
    Parameters
    ----------
    data : ndarray
        Array of data.
    noise_std : float
        Standard deviation of the noise
    n : int
        Number of repetitions
    noise_scaling : float
        Scaling factor for noise

    Returns
    -------
    variance, variance error, skewness, skewness error, kurtosis, kurtosis error
    """
    noise_arr = np.random.normal(0, noise_std, (n, data.size)) * noise_scaling
    var_sample = np.var(data + noise_arr, axis=1)
    skew_sample = skew(data + noise_arr, axis=1)
    kurt_sample = kurtosis(data + noise_arr, axis=1)
    var_val = np.mean(var_sample)
    skew_val = np.mean(skew_sample)
    kurt_val = np.mean(kurt_sample)
    var_err = np.std(var_sample)
    skew_err = np.std(skew_sample)
    kurt_err = np.std(kurt_sample)
    return var_val, var_err, skew_val, skew_err, kurt_val, kurt_err
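# Hedged usage sketch for mcnoise above: the signal and noise level are
# made-up values purely for illustration, and the sketch assumes the
# snippet's imports (numpy as np, scipy.stats skew/kurtosis).
import numpy as np
from scipy.stats import skew, kurtosis

demo_signal = np.random.default_rng(1).normal(size=500)
out = mcnoise(demo_signal, noise_std=0.1, n=200)
print('var = %.3f +/- %.3f, skew = %.3f +/- %.3f, kurt = %.3f +/- %.3f' % out)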
def _fitstart(self, x):
    '''example method, method of moments estimator as starting values

    Parameters
    ----------
    x : array
        data for which the parameters are estimated

    Returns
    -------
    est : tuple
        preliminary estimates used as starting value for fitting, not
        necessarily a consistent estimator

    Notes
    -----
    This needs to be written and attached to each individual distribution.

    This example was written for the gamma distribution, but not verified
    with literature.
    '''
    loc = np.min([x.min(), 0])
    a = 4 / stats.skew(x)**2
    scale = np.std(x) / np.sqrt(a)
    return (a, loc, scale)
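# For the gamma distribution, skewness = 2/sqrt(a) and var = a*scale**2,
# which is where a = 4/skew(x)**2 and scale = std(x)/sqrt(a) above come
# from. A quick check of the moment estimator against known parameters
# (the shape/scale values here are arbitrary):
import numpy as np
from scipy import stats

x = stats.gamma.rvs(a=3.0, loc=0.0, scale=2.0, size=100000, random_state=0)
a_hat = 4 / stats.skew(x)**2
scale_hat = np.std(x) / np.sqrt(a_hat)
print(a_hat, scale_hat)  # roughly (3.0, 2.0)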
def compute_profile(self):
    self.rec.label_contours(self.ji_intervals)
    distributions = {}
    for key, segments in self.rec.contour_labels.items():
        distributions[key] = []
        for indices in segments:
            distributions[key].extend(self.pitch_obj.pitch[indices[0]:indices[1]])

    parameters = {}
    for interval, distribution in distributions.items():
        distribution = np.array(distribution)
        # TODO: replace -10000 with whatever the bound is for invalid pitch
        # values in cent scale
        distribution = distribution[distribution >= -10000]
        [n, be] = np.histogram(distribution, bins=1200)
        bc = (be[1:] + be[:-1]) / 2.0
        peak_pos = bc[np.argmax(n)]
        peak_mean = float(np.mean(distribution))
        # NOTE: scipy's variation() is the coefficient of variation
        # (std/mean), not the variance
        peak_variance = float(variation(distribution))
        peak_skew = float(skew(distribution))
        peak_kurtosis = float(kurtosis(distribution))
        pearson_skew = float(3.0 * (peak_mean - peak_pos) / np.sqrt(abs(peak_variance)))
        parameters[interval] = {"position": float(peak_pos),
                                "mean": peak_mean,
                                "amplitude": float(max(n)),
                                "variance": peak_variance,
                                "skew1": peak_skew,
                                "skew2": pearson_skew,
                                "kurtosis": peak_kurtosis}

    all_amps = [parameters[interval]["amplitude"] for interval in parameters.keys()]
    peak_amp_sum = sum(all_amps)
    for interval in parameters.keys():
        parameters[interval]["amplitude"] = parameters[interval]["amplitude"] / peak_amp_sum

    self.intonation_profile = parameters
def test_cont_basic_slow():
    # same as above for slow distributions
    for distname, arg in distcont[:]:
        if distname not in distslow:
            continue
        distfn = getattr(stats, distname)
        np.random.seed(765456)
        sn = 1000
        rvs = distfn.rvs(size=sn, *arg)
        sm = rvs.mean()
        sv = rvs.var()
        skurt = stats.kurtosis(rvs)
        sskew = stats.skew(rvs)
        m, v = distfn.stats(*arg)
        yield check_sample_meanvar_, distfn, arg, m, v, sm, sv, sn, distname + \
            'sample mean test'
        # the sample skew/kurtosis test has known failures; not a very good
        # distance measure
        # yield check_sample_skew_kurt, distfn, arg, sskew, skurt, distname
        yield check_moment, distfn, arg, m, v, distname
        yield check_cdf_ppf, distfn, arg, distname
        yield check_sf_isf, distfn, arg, distname
        yield check_pdf, distfn, arg, distname
        yield check_pdf_logpdf, distfn, arg, distname
        yield check_cdf_logcdf, distfn, arg, distname
        yield check_sf_logsf, distfn, arg, distname
        # yield check_oth, distfn, arg  # is still missing
        if distname in distmissing:
            alpha = 0.01
            yield check_distribution_rvs, distname, arg, alpha, rvs
def jarque_bera(resids):
    """
    Calculate residual skewness, kurtosis, and do the JB test for normality

    Parameters
    ----------
    resids : array-like

    Returns
    -------
    JB, JBpv, skew, kurtosis

    JB = n/6*(S^2 + (K-3)^2/4)

    JBpv is the Chi^2 two-tail probability value
    skew is the measure of skewness
    kurtosis is the measure of kurtosis
    """
    resids = np.asarray(resids)
    # Calculate residual skewness and kurtosis
    skew = stats.skew(resids)
    kurtosis = 3 + stats.kurtosis(resids)

    # Calculate the Jarque-Bera test for normality
    JB = (resids.shape[0] / 6) * (skew**2 + (1 / 4) * (kurtosis - 3)**2)
    JBpv = stats.chi2.sf(JB, 2)

    return JB, JBpv, skew, kurtosis
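# Usage sketch for jarque_bera above; under normality JB is asymptotically
# chi-squared with 2 degrees of freedom, so the p-value should be large for
# Gaussian residuals and tiny for heavy-tailed ones. Assumes the snippet's
# own imports (numpy as np, scipy stats); the sample sizes are arbitrary.
import numpy as np

rng = np.random.default_rng(42)
JB, JBpv, s, k = jarque_bera(rng.normal(size=2000))
print(JB, JBpv)   # small statistic, large p-value
JB, JBpv, s, k = jarque_bera(rng.standard_t(df=3, size=2000))
print(JB, JBpv)   # large statistic, p-value near zero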
def computeDMCurveStatScores(self):
    """
    Computes summary statistics (mean, standard deviation, skew, kurtosis)
    of the candidate DM curve.

    Parameters:
    N/A

    Returns:
    A list containing the four statistics.
    """
    try:
        bins = self.profileOps.getDMCurveData(self.rawdata, self.profileIndex)
        mn = mean(bins)
        stdev = std(bins)
        skw = skew(bins)
        kurt = kurtosis(bins)
        stats = [mn, stdev, skw, kurt]
        return stats
    except Exception as e:  # catch *all* exceptions
        print("Error getting DM curve stat scores from PHCX file\n\t", sys.exc_info()[0])
        print(self.format_exception(e))
        raise Exception("DM curve stat score extraction exception")
def grid_color_stat(patient_grid_1_color):
    shape_stats = np.zeros(4)
    shape_stats[0] = np.mean(patient_grid_1_color.flatten())
    shape_stats[1] = np.std(patient_grid_1_color.flatten())
    shape_stats[2] = skew(patient_grid_1_color.flatten())
    shape_stats[3] = kurtosis(patient_grid_1_color.flatten())
    return shape_stats
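# Usage sketch for grid_color_stat; the random 2-D "color grid" stands in
# for real patient data, and the sketch assumes the snippet's own imports
# (numpy as np, scipy.stats skew/kurtosis).
import numpy as np
from scipy.stats import skew, kurtosis

grid = np.random.default_rng(0).random((64, 64))
print(grid_color_stat(grid))  # [mean, std, skew, kurtosis]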
def test_rolling_skew(self):
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest('no scipy')
    self._check_moment_func(moments.rolling_skew,
                            lambda x: skew(x, bias=False))
def perf_stats(returns, returns_style='compound', return_as_dict=False,
               period=DAILY):
    """Calculates various performance metrics of a strategy, for use in
    plotting.show_perf_stats.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
        - See full explanation in tears.create_full_tear_sheet.
    returns_style : str, optional
        See annual_returns' style
    return_as_dict : boolean, optional
        If True, returns the computed metrics in a dictionary.
    period : str, optional
        - defines the periodicity of the 'returns' data for purposes of
          annualizing. Can be 'monthly', 'weekly', or 'daily'
        - defaults to 'daily'.

    Returns
    -------
    dict / pd.DataFrame
        Performance metrics.
    """
    all_stats = OrderedDict()
    all_stats['annual_return'] = annual_return(
        returns, style=returns_style, period=period)
    all_stats['annual_volatility'] = annual_volatility(returns, period=period)
    all_stats['sharpe_ratio'] = sharpe_ratio(
        returns, returns_style=returns_style, period=period)
    all_stats['calmar_ratio'] = calmar_ratio(
        returns, returns_style=returns_style, period=period)
    all_stats['stability'] = stability_of_timeseries(returns)
    all_stats['max_drawdown'] = max_drawdown(returns)
    all_stats['omega_ratio'] = omega_ratio(returns)
    all_stats['sortino_ratio'] = sortino_ratio(returns)
    # TODO: The information_ratio method requires a second argument for
    # benchmark returns. Setting information_ratio to NaN until benchmark
    # returns are added as an argument to this method.
    all_stats['information_ratio'] = np.nan
    all_stats['skewness'] = stats.skew(returns)
    all_stats['kurtosis'] = stats.kurtosis(returns)

    if return_as_dict:
        return all_stats
    else:
        all_stats_df = pd.DataFrame(
            index=list(all_stats.keys()),
            data=list(all_stats.values()))
        all_stats_df.columns = ['perf_stats']
        return all_stats_df
def computeProfileStatScores(self):
    """
    Builds the scores using raw profile intensity data only, and returns
    summary statistics (mean, standard deviation, skew, kurtosis) of the
    profile as floating point values.

    Parameters:
    N/A

    Returns:
    A list containing the four statistics.
    """
    try:
        bins = []
        for intensity in self.profile:
            bins.append(float(intensity))
        mn = mean(bins)
        stdev = std(bins)
        skw = skew(bins)
        kurt = kurtosis(bins)
        stats = [mn, stdev, skw, kurt]
        return stats
    except Exception as e:  # catch *all* exceptions
        print("Error getting Profile stat scores from PHCX file\n\t", sys.exc_info()[0])
        print(self.format_exception(e))
        raise Exception("Profile stat score extraction exception")
def calc_statistics(x):
    n = x.shape[0]  # number of samples

    # Manual computation of the first four raw moments
    m = 0
    m2 = 0
    m3 = 0
    m4 = 0
    for t in x:
        m += t
        m2 += t * t
        m3 += t**3
        m4 += t**4
    m /= n
    m2 /= n
    m3 /= n
    m4 /= n

    mu = m
    sigma = np.sqrt(m2 - mu * mu)
    skew = (m3 - 3 * mu * m2 + 2 * mu**3) / sigma**3
    kurtosis = (m4 - 4 * mu * m3 + 6 * mu * mu * m2 - 4 * mu**3 * mu + mu**4) / sigma**4 - 3
    print('manual mean, std, skewness, kurtosis:', mu, sigma, skew, kurtosis)

    # Verify using library functions
    mu = np.mean(x, axis=0)
    sigma = np.std(x, axis=0)
    skew = stats.skew(x)
    kurtosis = stats.kurtosis(x)
    return mu, sigma, skew, kurtosis
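# scipy's skew() and kurtosis() default to the biased (population)
# definitions, with kurtosis reported as excess (Fisher) kurtosis, which is
# why they match the manual moment formulas in calc_statistics. A quick
# check on arbitrary sample data, assuming the snippet's imports
# (numpy as np, scipy.stats as stats):
import numpy as np

d = np.random.RandomState(0).randn(10000)
print(calc_statistics(d))  # manual and library values agree to rounding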
def get_stat_feature(fname):
    b, _ = librosa.load(fname, res_type='kaiser_fast')  # was librosa.load(i, ...)
    try:
        # basic statistical features
        length = len(b)
        mean = np.mean(b)
        minimum = np.min(b)
        maximum = np.max(b)
        std = np.std(b)
        rms = np.sqrt(np.mean(b**2))
        kurt = kurtosis(b)
        Skew = skew(b)

        # audio length features
        data, samp_rate = librosa.effects.trim(b, top_db=40)
        len_init = len(data)
        ratio_init = len_init / length
        splits = librosa.effects.split(b, top_db=40)
        if len(splits) > 1:
            b = np.concatenate([b[x[0]:x[1]] for x in splits])
        len_final = len(b)
        ratio_final = len_final / length

        return pd.Series(np.hstack((mean, minimum, maximum, std, rms, kurt, Skew,
                                    len_init, ratio_init, len_final, ratio_final)))
    except Exception:
        print("Bad file at {}".format(fname))
        return pd.Series([0] * 11)
def test_distribution(data, mask=None):
    logger.info("Testing distribution.")
    data = data.reshape(data.shape[0],
                        reduce(lambda x, y: x * y, data.shape[1:4]))
    if mask is not None:
        mask_idx = np.where(mask.flatten() == 1)[0].tolist()
        data = data[:, mask_idx]
    k = kurtosis(data, axis=0)
    s = skew(data, axis=0)

    logger.info("Proportion voxels k <= -1: %.2f"
                % (len(np.where(k <= -1)[0].tolist()) * 1. / data.shape[1]))
    logger.info("Proportion voxels -1 < k < 1: %.2f"
                % (len(np.where(np.logical_and(k > -1, k < 1))[0].tolist()) * 1. / data.shape[1]))
    logger.info("Proportion voxels 1 <= k < 2: %.2f"
                % (len(np.where(np.logical_and(k >= 1, k < 2))[0].tolist()) * 1. / data.shape[1]))
    logger.info("Proportion voxels 2 <= k < 3: %.2f"
                % (len(np.where(np.logical_and(k >= 2, k < 3))[0].tolist()) * 1. / data.shape[1]))
    logger.info("Proportion voxels k >= 3: %.2f"
                % (len(np.where(k >= 3)[0].tolist()) * 1. / data.shape[1]))

    values = len(np.unique(data))
    if (values * 1. / reduce(lambda x, y: x * y, data.shape) < 10e-4):
        logger.warn("Quantization probable (%d unique values out of %d)."
                    % (values, reduce(lambda x, y: x * y, data.shape)))
    logger.info("Number of unique values in data: %d" % values)

    logger.info("Kurtosis k: %.2f (%.2f std) and skew s: %.2f (%.2f std)"
                % (k.mean(), k.std(), s.mean(), s.std()))
def statistical_spectrum_descriptors(spectrogram):
    """
    Statistical Spectrum Descriptors of the STFT.

    Parameters
    ----------
    spectrogram : numpy array
        Magnitude spectrogram.

    Returns
    -------
    statistical_spectrum_descriptors : dict
        Statistical spectrum descriptors of the spectrogram.

    References
    ----------
    .. [1] Thomas Lidy and Andreas Rauber,
           "Evaluation of Feature Extractors and Psycho-acoustic
           Transformations for Music Genre Classification",
           Proceedings of the 6th International Conference on Music
           Information Retrieval (ISMIR), 2005.
    """
    from scipy.stats import skew, kurtosis
    return {'mean': np.mean(spectrogram, axis=0),
            'median': np.median(spectrogram, axis=0),
            'variance': np.var(spectrogram, axis=0),
            'skewness': skew(spectrogram, axis=0),
            'kurtosis': kurtosis(spectrogram, axis=0),
            'min': np.min(spectrogram, axis=0),
            'max': np.max(spectrogram, axis=0)}
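# Minimal usage sketch for statistical_spectrum_descriptors: the random
# matrix stands in for a real magnitude spectrogram (frames x frequency
# bins), and the shape is arbitrary. Since the statistics run over axis 0,
# each descriptor comes back as one value per frequency bin.
import numpy as np

spec = np.abs(np.random.default_rng(0).normal(size=(100, 1025)))
desc = statistical_spectrum_descriptors(spec)
print(desc['mean'].shape, desc['skewness'].shape)  # (1025,) each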
def get_skews(_list):
    skews = []
    for i in _list:
        skews.append(stats.skew(i))
    return skews
def plot_trace(self, wplot, proctype, wroi, color):
    if wplot == 1:
        wp = self.p1
    else:
        wp = self.p2
    if proctype == 0 or proctype == 2:
        # motSVD
        if proctype == 0:
            ir = 0
        else:
            ir = wroi + 1
        cmap = cm.get_cmap("hsv")
        nc = min(10, self.motSVDs[ir].shape[1])
        cmap = (255 * cmap(np.linspace(0, 0.2, nc))).astype(int)
        norm = (self.motSVDs[ir][:, 0]).std()
        tr = (self.motSVDs[ir][:, :10]**2).sum(axis=1)**0.5 / norm
        for c in np.arange(0, nc, 1, int)[::-1]:
            pen = pg.mkPen(tuple(cmap[c, :]), width=1)  # , style=QtCore.Qt.DashLine)
            tr2 = self.motSVDs[ir][:, c] / norm
            tr2 *= np.sign(skew(tr2))
            wp.plot(tr2, pen=pen)
        pen = pg.mkPen(color)
        wp.plot(tr, pen=pen)
        wp.setRange(yRange=(-3, 3))
    elif proctype == 1:
        pup = self.pupil[wroi]
        pen = pg.mkPen(color, width=2)
        pp = wp.plot(zscore(pup['area_smooth']) * 2, pen=pen)
        if 'com_smooth' in pup:
            pupcom = pup['com_smooth'].copy()
        else:
            pupcom = pup['com'].copy()
        pupcom -= pupcom.mean(axis=0)
        norm = pupcom.std()
        pen = pg.mkPen((155, 255, 155), width=1, style=QtCore.Qt.DashLine)
        py = wp.plot(pupcom[:, 0] / norm * 2, pen=pen)
        pen = pg.mkPen((0, 100, 0), width=1, style=QtCore.Qt.DashLine)
        px = wp.plot(pupcom[:, 1] / norm * 2, pen=pen)
        tr = np.concatenate((zscore(pup['area_smooth'])[np.newaxis, :] * 2,
                             pupcom[:, 0][np.newaxis, :] / norm * 2,
                             pupcom[:, 1][np.newaxis, :] / norm * 2), axis=0)
        lg = wp.addLegend(offset=(0, 0))
        lg.addItem(pp, "<font color='white'><b>area</b></font>")
        lg.addItem(py, "<font color='white'><b>ypos</b></font>")
        lg.addItem(px, "<font color='white'><b>xpos</b></font>")
    elif proctype == 3:
        tr = zscore(self.blink[wroi])
        pen = pg.mkPen(color, width=2)
        wp.plot(tr, pen=pen)
    elif proctype == 4:
        running = self.running[wroi]
        running *= np.sign(running.mean(axis=0))
        running -= running.min()
        running /= running.max()
        running *= 16
        running -= 8
        wp.plot(running[:, 0], pen=color)
        wp.plot(running[:, 1], pen=color)
        tr = running.T
    return tr
plt.show()
sns.distplot(df_train['TotalBsmtSF'], bins=50, fit=norm)
plt.show()
sns.distplot(df_train['GarageArea'], bins=50, fit=norm)
plt.show()
sns.distplot(df_train[:train_len]['SalePrice'], bins=50, fit=norm)
plt.show()

# Some of the variables and the target show pronounced skewness.
# To normalize this and make model training easier, we apply a log
# transform to the variables exhibiting that characteristic.
df_train['SalePrice'][:train_len] = np.log1p(df_train['SalePrice'][:train_len])
skewness = df_train.select_dtypes(
    exclude='object').apply(lambda x: stats.skew(x))
skewness = skewness[abs(skewness) > 0.6]
skewed_features = skewness.index
df_train[skewed_features] = np.log1p(df_train[skewed_features])

# Generate dummies for the categorical variables
df_train = pd.get_dummies(df_train)

# First training run
from sklearn.linear_model import LinearRegression, LassoCV

train_set = df_train[:train_len]
test_set = df_train[train_len:].drop('SalePrice', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(
    train_set.drop('SalePrice', axis=1), train_set.SalePrice)
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))

print('Shape all_data: {}'.format(all_data.shape))

##### Add a new, additional variable #####
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

##### Find the indices that hold numeric types #####
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(
    lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)

##### Checking which measures have to be Box-Cox transformed #####
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(
    skewness.shape[0]))

from scipy.special import boxcox1p

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    # all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)
df2 = pd.read_csv(path2)

# Impute missing values with mean
df2 = df2.replace("?", "NaN")
mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
df2['normalized-losses'] = mean_imputer.fit_transform(df2[['normalized-losses']])
df2['horsepower'] = mean_imputer.fit_transform(df2[['horsepower']])

# Skewness of numeric features
num_cols = df2._get_numeric_data().columns
for num_col in num_cols:
    if skew(df2[num_col].values) > 1:
        print(num_col)
        df2[num_col] = np.sqrt(df2[num_col])
print(df2.head())

cat_cols = list(set(df2.columns) - set(num_cols))

# Label encode
label_encoder = LabelEncoder()
for cat_col in cat_cols:
    df2[cat_col] = label_encoder.fit_transform(df2[cat_col])

df2['area'] = df2['height'] * df2['width']
    upper_bound = np.mean(data) + (np.percentile(data, confidence_level) * standard_error)
    lower_bound = np.mean(data) - (np.percentile(data, confidence_level) * standard_error)
    return lower_bound, upper_bound


data = read_data('nerve.txt')
bootstrap_sample = bootstrap(data, num_of_simulation=10000)

median = np.median(bootstrap_sample, axis=1)
plt.figure(0)
plt.hist(median, bins=10, label='Median (bootstrap)')
plt.savefig('median.png')

skewness = stats.skew(bootstrap_sample, axis=1)
plt.figure(1)
plt.hist(skewness, bins=50, label='Skewness (bootstrap)')
plt.savefig('skewness.png')

# Basic bootstrap confidence interval
print("Median CI (Basic) =", confidence_interval(median, confidence_level=0.95))
print("Skewness CI (Basic) =", confidence_interval(skewness, confidence_level=0.95))

# Bootstrap-t confidence interval
print("Median CI (Bootstrap-t) =", bootstrap_t(median, confidence_level=0.95))
print("Skewness CI (Bootstrap-t) =", bootstrap_t(skewness, confidence_level=0.95))
alldata.shape

# create new data
train_new = alldata[alldata['SalePrice'].notnull()]
test_new = alldata[alldata['SalePrice'].isnull()]
print(Train, train_new.shape)
print('----------------')
print(Test, test_new.shape)

# get numeric features
numeric_features = [f for f in train_new.columns if train_new[f].dtype != object]

# transform the numeric features using log(x + 1)
from scipy.stats import skew

skewed = train_new[numeric_features].apply(lambda x: skew(x.dropna().astype(float)))
skewed = skewed[skewed > 0.75]
skewed = skewed.index
train_new[skewed] = np.log1p(train_new[skewed])
test_new[skewed] = np.log1p(test_new[skewed])
del test_new['SalePrice']

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_new[numeric_features])
scaled = scaler.transform(train_new[numeric_features])
for i, col in enumerate(numeric_features):
    train_new[col] = scaled[:, i]
numeric_features.remove('SalePrice')
plt.title('Area distribution')
fig = plt.figure()
res = stats.probplot(train['area'], plot=plt)
plt.show()

y_train = train.area.values
print("Skewness: %f" % train['area'].skew())
print("Kurtosis: %f" % train['area'].kurt())

numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(
    lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skewed Features': skewed_feats})
print(skewness.head())

skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(
    skewness.shape[0]))

from scipy.special import boxcox1p

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
    all_data[feat] += 1

all_data = pd.get_dummies(all_data)
def getBorderFeatures(img, nim):
    """Generate edge-based descriptors: number of straight lines and amount
    of edges. img is a 256x256 PNG image loaded with cv2.

    :param img: 256x256 RGB intensity-normalized image read with OpenCV
    :param nim: image flattened into a one-dimensional vector
    :return: dictionary with some border features: number of lines, number
             of edge pixels in the image
    """
    # First, the image without downscaling
    imgColor = nim
    img2 = remove_transparency(imgColor, 0)

    # Detect edges using Canny
    img_edged = cv2.Canny(img2, 100, 200)
    bordes = img_edged.sum()

    # Count straight lines using HoughLines; the threshold should normally
    # be between a quarter and half of the pixel width W.
    lines = cv2.HoughLines(img_edged, 1, np.pi / 180, 64)
    try:
        if lines.any():
            lines = len(lines)
        else:
            lines = 0
    except:
        lines = 0

    # Then for the shrunken image
    img = cv2.GaussianBlur(img, (3, 3), 0)
    img = cv2.resize(img, (32, 32), interpolation=cv2.INTER_AREA)
    imgColor = cv2.normalize(img, None, alpha=0, beta=256,
                             norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    imgGray = cv2.cvtColor(imgColor, cv2.COLOR_BGR2GRAY)
    imgGray = imgGray.astype('uint8')
    std = imgGray.std(axis=0).std(axis=0)
    media = imgGray.mean(axis=0).std(axis=0)
    ventana = int(math.sqrt(imgGray.size) / 4 + 1)

    # Detect edges using Canny
    img2 = remove_transparency(imgColor, 0)
    img_edged = cv2.Canny(img2, 100, 200)
    bordes32 = img_edged.sum()

    # Binarize the image using an adaptive threshold; this leaves an "image
    # edge" with a simplified texture
    umbral = cv2.adaptiveThreshold(imgGray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, ventana, 2 * std)

    # Binarize the image using Otsu
    blur = cv2.GaussianBlur(imgGray, (3, 3), 0)
    ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Detect and count straight lines
    lines32 = cv2.HoughLines(img_edged, 1, np.pi / 180, 16)
    try:
        if lines32.any():
            lines32 = len(lines32)
        else:
            lines32 = 0
    except:
        lines32 = 0

    simetriaImagenBinarizada = skew(th3).sum()
    curtosisImagenBinarizada = kurtosis(th3).sum()
    sumaUmbral = umbral.sum()

    return {
        'lines': lines,
        'bordes': bordes,
        'lines32': lines32,
        'bordes32': bordes32,
        'simetriaImagenBinarizada': simetriaImagenBinarizada,
        'curtosisImagenBinarizada': curtosisImagenBinarizada,
        'sumaUmbral': sumaUmbral,
    }
def section_action(request):
    __analysis = []
    context = {}
    __grades = ''
    __domain = request.get_host()
    __location = ''
    __teachers = ''
    __college = ''
    __department = ''
    __course = ''
    __course_code = ''
    __section = ''
    __message = ''
    __mean = 0
    __std = 0
    __min = 0
    __max = 0
    __skewness = 0
    __correlation = 0
    __show__result = 0
    __histogramfile = ''
    __mids = []
    __finals = []
    __totals = []
    __line = 7
    try:
        sem.acquire()
        print("The semaphore is locked")
        __grades = request.FILES['grades']
        if request.method == 'GET':
            raise Exception('Internal Error')
        if request.method == 'POST':
            # upload the file
            _grades_uploaded_file = 'data/upload/sections/' + str(__grades)
            if not os.path.exists('data/upload/sections/'):
                os.makedirs('data/upload/sections/')
            with open(_grades_uploaded_file, 'wb+') as destination:
                for chunk in __grades.chunks():
                    destination.write(chunk)

            # read the content of the uploaded file
            workbook = xl.open_workbook(_grades_uploaded_file, on_demand=True)
            worksheet = workbook.sheet_by_index(0)
            try:
                ____tmp = worksheet.cell_value(6, 5)
            except IndexError:
                ____tmp = ''
            __section = int(worksheet.cell_value(4, 1))
            __location = worksheet.cell_value(0, 1)
            if __section == '':
                raise Exception('Unable to read the section from the excel file !!!')
            if __location == '':
                raise Exception('Unable to read the location from the excel file !!!')
            __newfilename = ('data/upload/sections/section_' + str(__section)
                             + _grades_uploaded_file[-4:])
            os.rename(_grades_uploaded_file, __newfilename)

            __section_obj = None
            __actualSemester = Semester.objects.get(semester_isInUse=True)
            for _mytest in Section.objects.all():
                if _mytest.section_department.department_location.college_location.location_name_ar == __location \
                        and _mytest.section_code == __section \
                        and _mytest.section_semester == __actualSemester:
                    __section_obj = _mytest
                    print('Section found with id = ' + str(_mytest.section_id))
                    __location = _mytest.section_department.department_location.college_location.location_name
                    __college = _mytest.section_department.department_location.college_name
                    __department = _mytest.section_department.department_name
                    __course = _mytest.section_course.course_name
                    __course_code = _mytest.section_course.course_code
                    for _teach in _mytest.section_teachers.all():
                        __teachers = __teachers + ' ' + _teach.teacher_name_ar
                    break

            if ____tmp == '':
                # grades without mids
                while True:
                    try:
                        __student = worksheet.cell_value(__line, 0)
                        if worksheet.cell_value(__line, 2) != '':
                            __finals.append(int(worksheet.cell_value(__line, 2)))
                        if worksheet.cell_value(__line, 3) != '':
                            __totals.append(int(worksheet.cell_value(__line, 3)))
                        __line += 1
                    except IndexError:
                        break
            else:
                while True:
                    try:
                        __student = worksheet.cell_value(__line, 0)
                        if worksheet.cell_value(__line, 2) != '':
                            __mids.append(int(worksheet.cell_value(__line, 2)))
                        if worksheet.cell_value(__line, 3) != '':
                            __finals.append(int(worksheet.cell_value(__line, 3)))
                        if worksheet.cell_value(__line, 4) != '':
                            __totals.append(int(worksheet.cell_value(__line, 4)))
                        __line += 1
                    except IndexError:
                        break

            # debug data
            # print('grades = ' + str(__grades))
            # print('Section = ' + str(__section))
            # print('MIDs = ' + str(__mids))
            # print('Finals = ' + str(__finals))
            # print('Totals = ' + str(__totals))

            # compute statistics about the course grades
            if __section_obj is None:
                raise Exception('Unable to recognise the section in the database !!!')
            if __section != '' and __section_obj is not None:
                __message = 'The grade Excel file was well loaded'
                __mean = float("{0:.4f}".format(statistics.mean(__totals)))
                __std = float("{0:.4f}".format(statistics.stdev(__totals)))
                __skewness = float("{0:.4f}".format(skew(__totals, bias=False)))
                if len(__mids) == 0:
                    __correlation = -99.99
                else:
                    # pearsonr returns (coefficient, p-value); index 0 is the
                    # correlation coefficient (the original indexed [1], the p-value)
                    __correlation = float("{0:.4f}".format(pearsonr(__mids, __finals)[0]))
                __min = min(__totals)
                __max = max(__totals)

                # plot the histogram
                a = np.array(__totals)
                # Fit a normal distribution to the data:
                mu, std = norm.fit(a)
                number = a.size
                # Plot the histogram.
                plt.hist(a, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                         density=True, color='#607c8e', edgecolor='black',
                         rwidth=0.8)
                # Plot the PDF.
                x = np.linspace(0, 100, 100)
                p = norm.pdf(x, mu, std)
                plt.plot(x, p, 'k', linewidth=3)
                title = "Histogram: Section = %d, Number of Students = %d" % (__section, number)
                plt.title(title)
                __histogramfile = "data/media/histogram_section" + str(__section) + ".png"
                plt.savefig(__histogramfile)
                plt.close()
                print("mean : " + str(__mean))
                print("std : " + str(__std))
                print("skewness : " + str(__skewness))
                print("correlation : " + str(__correlation))
                __show__result = 1

                # save the section analysis
                __grades_data = {}
                __grades_data['mids'] = __mids
                __grades_data['finals'] = __finals
                __grades_data['totals'] = __totals
                str_grades = str(__grades_data)  # inverse: dict2 = eval(str1)
                try:
                    obj = SectionDocRequest.objects.get(section=__section_obj)
                    print("--------> Updating the section data with id= " + str(obj.section_doc_id))
                except SectionDocRequest.DoesNotExist:
                    obj = SectionDocRequest()
                    print("--------> Creating new section data")
                obj.doc_correlation = __correlation
                obj.doc_explanation = ''
                obj.doc_max = __max
                obj.doc_mean = __mean
                obj.doc_min = __min
                obj.doc_skewness = __skewness
                obj.doc_std_deviation = __std
                obj.histogram = __histogramfile
                obj.student_grades = str_grades
                obj.section = __section_obj
                obj.save()

            # get the server domain
            if request.is_secure():
                __domain = 'https://' + request.get_host()
            else:
                __domain = 'http://' + request.get_host()
            __security = request.is_secure()
    except MultiValueDictKeyError:
        __message = 'Please fill all the form.'
    except ValueError as e:
        __message = ('Please use the grade file provided by the registration '
                     'portal (Academia) without any change.')
        print(str(e))
    except Exception as e:
        __message = str(e)
        print(str(e))
    finally:
        sem.release()
        print("The semaphore was released")

    if len(__mids) == 0:
        __correlation = 'N/A'
    context = {
        'show_result': __show__result,
        'message': __message,
        'mean': __mean,
        'std': __std,
        'skewness': __skewness,
        'correlation': __correlation,
        'min': __min,
        'max': __max,
        'histogram': __histogramfile,
        'domain': __domain,
        'section': __section,
        'location': __location,
        'college': __college,
        'department': __department,
        'course': __course,
        'course_code': __course_code,
        'teachers': __teachers,
    }
    __results = analysis(context)
    context['analysis'] = __results
    del __grades
    del __section
    del __message
    del __mean
    del __std
    del __min
    del __max
    del __skewness
    del __correlation
    del __show__result
    del __histogramfile
    del __domain
    return render(request, 'section_result.html', context=context)
def skew(X):
    '''skewness for each variable in a segmented time series'''
    return stats.skew(X, axis=1)
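# Usage sketch for the skew wrapper above: rows are segments of a time
# series, so axis=1 yields one skewness per segment. The shape is arbitrary
# and the sketch assumes the snippet's import of scipy.stats as stats.
import numpy as np

segments = np.random.default_rng(0).normal(size=(5, 128))  # 5 segments of 128 samples
print(skew(segments))  # array of 5 per-segment skewness values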
def get_features(df_, fs):
    """
    Calculates features to be used for ECG signal quality classification

    Parameters
    ----------
    df_ : pandas dataframe
        Dataframe of ECG data; must contain the following columns:
        processed, r_peaks, and beats
    fs : float
        Sampling rate of signal (must be in Hertz)

    Returns
    -------
    df : pandas dataframe
        Dataframe appended with computed features
    """
    df = pd.DataFrame.copy(df_)
    print('Computing features...')

    # features from statistics of magnitude of ECG signal
    df.loc[:, 'f_stddev'] = df.processed.apply(lambda x: np.nanstd(x))
    df.loc[:, 'f_kurtosis'] = df.processed.apply(lambda x: kurtosis(x))
    df.loc[:, 'f_skewness'] = df.processed.apply(lambda x: skew(x))
    df.loc[:, 'f_rms'] = df.processed.apply(lambda x: rms(x))
    df.loc[:, 'f_energy'] = df.processed.apply(lambda x: sig_energy(x))

    # features from power spectrum of signal
    df.loc[:, 'f_relpower'] = df.processed.apply(lambda x: rel_power(x, fs))
    df.loc[:, 'f_relbasepower'] = df.processed.apply(lambda x: rel_power(
        x, fs, num_freqbounds=(1, 40), denom_freqbounds=(0, 40)))
    fbins, fmax = 10, 10
    powspec_vals = np.vstack(
        df.processed.apply(
            lambda x: power_spec(x, fs, bins=fbins, fmax=fmax)).values)
    for i in range(fbins):
        df.loc[:, 'f_powspec' + str(i)] = list(powspec_vals[:, i])

    # features from physiological parameters
    df.loc[:, 'f_rpeakcount'] = df.r_peaks.map(len)
    df.loc[:, 'f_nhr'] = df.processed.apply(lambda x: normal_hr(x, fs))
    df.loc[:, 'f_hrv'] = df.r_peaks.apply(lambda x: heart_rate_var(x, fs))
    df.loc[:, 'f_rtor'] = df.r_peaks.apply(lambda x: rtor_duration(x, fs))
    df.loc[:, 'f_sumbe'] = df.beats.apply(lambda x: sum_beat_energy(np.array(x)))
    df.loc[:, 'f_pca'] = 0
    df.loc[df.beats.map(len) > 0, 'f_pca'] = df.beats[
        df.beats.map(len) > 0].apply(lambda x: pca_feature(np.array(x)))
    # df.loc[:, 'f_mbe'] = 0
    df.loc[df.beats.map(len) > 0, 'f_mbe'] = df.beats[
        df.beats.map(len) > 0].apply(lambda x: mean_beat_energy(np.array(x)))
    df.loc[:, 'f_maxminbeat'] = 0
    df.loc[df.beats.map(len) > 0, 'f_maxminbeat'] = df.beats[
        df.beats.map(len) > 0].apply(lambda x: maxmin_beat(np.array(x)))

    print('Done!')
    return df
def myskew(l):
    if len(l) == 0:
        return 0
    ret = skew(l)
    return ret
def skewness(RR):
    return skew(RR)
def compute_var_skew(data):
    data = data[data.notnull()]
    skewness = skew(data)
    return skewness
import numpy as np
import matplotlib.pyplot as plt
import statistics
from scipy.stats import skew, kurtosis

randomNums = np.random.normal(scale=3, size=1000)
randomInts = np.round(randomNums)

axis = np.arange(start=min(randomInts), stop=max(randomInts) + 1)
plt.hist(randomInts, bins=axis)

srednia = np.mean(randomInts)
mediana = np.median(randomInts)
dominanta = statistics.mode(randomInts)
od_st = np.std(randomInts)
wariancja = statistics.variance(randomInts)
skosnosc = skew(randomInts)
kurtoza = kurtosis(randomInts)

print("Mean: ", srednia)
print("Median: ", mediana)
print("Mode: ", dominanta)
print("Standard deviation: ", od_st)
print("Variance: ", wariancja)
print("Skewness: ", skosnosc)
print("Kurtosis: ", kurtoza)
def variableProfile(df, colName, varType="cat", outlierChk=True):
    df = df.copy()
    if (varType == "num"):
        print(df[colName].describe(), end="\n\n")

        plt.figure(figsize=(20, 15))
        plt.subplot(3, 1, 1)
        plt.hist(df[colName], color='lightblue', edgecolor='black', alpha=0.7)
        plt.xlabel(colName)

        plt.figure(figsize=(20, 15))
        plt.subplot(3, 1, 2)
        sns.kdeplot(df[colName])
        plt.xlabel(colName)

        plt.figure(figsize=(20, 15))
        plt.subplot(3, 1, 3)
        sns.boxplot(x=df[colName], color='lightblue')

        print(colName + " Skewness = " +
              str(round(stats.skew(df[colName][pd.notnull(df[colName])]), 3)),
              end="\n\n")
        plt.show()

        Q1, Q2, IQR, Lower_Whisker, Upper_Whisker = outlierIdenti(df[colName])
        upperOutCnt = sum(df[colName] > Upper_Whisker)
        lowerOutCnt = sum(df[colName] < Lower_Whisker)
        if (outlierChk == True):
            if ((upperOutCnt > 0) | (lowerOutCnt > 0)):
                print(printFormat.RED + "Outliers present in " + colName
                      + printFormat.END, end="\n\n")
                print(colName + " IQR = " + str(round(IQR, 3)), end="\n\n")
                print(colName + " Lower outlier threshold = "
                      + str(round(Lower_Whisker, 3)), end="\n\n")
                print(colName + " Count of observations below lower outlier threshold = "
                      + str(round(lowerOutCnt, 3)), end="\n\n")
                print(colName + " Percent of observations below lower outlier threshold = "
                      + str(round((lowerOutCnt / df.shape[0]) * 100, 2)), end="\n\n")
                print(colName + " Upper outlier threshold = "
                      + str(round(Upper_Whisker, 3)), end="\n\n")
                print(colName + " Count of observations over upper outlier threshold = "
                      + str(round(upperOutCnt, 3)), end="\n\n")
                print(colName + " Percent of observations over upper outlier threshold = "
                      + str(round((upperOutCnt / df.shape[0]) * 100, 2)), end="\n\n\n\n")
                outlierTreatOptions(df, colName, "Quantile-based Flooring and Capping")
                outlierTreatOptions(df, colName, "median")
                outlierTreatOptions(df, colName, "mean")
            else:
                print(printFormat.GREEN + "No outliers present in " + colName
                      + printFormat.END, end="\n\n")

    if (varType == "cat"):
        print(freqTab(df, colName), end="\n\n")
        plt.figure(figsize=(15, 7))
        plt.subplot(1, 1, 1)
        plt.hist(df[colName], color='lightblue', edgecolor='black', alpha=0.7)
        plt.xlabel(colName)
        plt.show()
def stats_calculate_all(x, stat_config):
    """Compute the statistics of the given list x, based on the stat_config values.

    :param x: list holding the time series data
    :type x: list(float)
    :param stat_config: list of names of the statistics that should be computed
    :type stat_config: list(str)
    :return: list of the computed statistics, in the order given by stat_config
    :rtype: list
    """
    assert len(set(stat_config).difference(
        ['len', 'min', 'max', 'range', 'mean', 'hmean', 'gmean', 'var', 'std',
         'skew', 'kurtosis', 'median', 'mode', 'energy', 'energy_sample',
         'snr'])) == 0

    x_array = np.array(x)
    n = len(x)
    if n == 0:
        # return zeros for every requested statistic (previously returned a
        # (values, stat_config) tuple here, inconsistent with the non-empty case)
        return [0 for i in range(len(stat_config))]

    min_value = np.min(x_array)
    if min_value < 1:
        # hmean/gmean need strictly positive values, so shift by an offset
        offset = 1 + np.abs(min_value)
    else:
        offset = 0
    max_value = np.max(x_array)

    values = []
    for stat_name in stat_config:
        if stat_name == 'len':
            values.append(n)
        elif stat_name == 'min':
            values.append(min_value)
        elif stat_name == 'max':
            values.append(max_value)
        elif stat_name == 'range':
            range_value = max_value - min_value
            values.append(range_value)
        elif stat_name == 'mean':
            mean_value = np.mean(x_array)
            values.append(mean_value)
        elif stat_name == 'hmean':
            hmean_value = sp.hmean(x_array + offset)
            values.append(hmean_value)
        elif stat_name == 'gmean':
            gmean_value = sp.gmean(x_array + offset)
            values.append(gmean_value)
        elif stat_name == 'var':
            std_value = np.std(x_array)
            var_value = std_value ** 2
            values.append(var_value)
        elif stat_name == 'std':
            std_value = np.std(x_array)
            values.append(std_value)
        elif stat_name == 'skew':
            skew_value = sp.skew(x_array)
            values.append(skew_value)
        elif stat_name == 'kurtosis':
            kurtosis_value = sp.kurtosis(x_array)
            values.append(kurtosis_value)
        elif stat_name == 'median':
            median_value = np.median(x_array)
            values.append(median_value)
        elif stat_name == 'mode':
            mode_value = sp.mode(x_array)[0][0]
            values.append(mode_value)
        elif stat_name == 'energy':
            energy_value = np.sum(x_array ** 2)
            values.append(energy_value)
        elif stat_name == 'energy_sample':
            energy_sample_value = np.sum(x_array ** 2) / n
            values.append(energy_sample_value)
        elif stat_name == 'snr':
            mean_value = np.mean(x_array)
            std_value = np.std(x_array)
            snr_value = 0.0
            if std_value != 0:
                snr_value = mean_value / std_value
            values.append(snr_value)
    return values
def scanpy_hubness_analysis(
    adata,
    do_norm,
    norm_scale,
    do_log,
    do_pca,
    n_neighbors,
    metric,
    weighted,  # weighted adjmat for louvain/leiden clustering?
    seed,
    n_comps=50,
    retained_cells_idx=None,
):
    results_dict = {}
    results_dict["params"] = dict(
        do_norm=do_norm,
        norm_scale=norm_scale,
        do_log=do_log,
        do_pca=do_pca,
        n_neighbors=n_neighbors,
        metric=metric,
        weighted=weighted,
        seed=seed,
        n_comps=n_comps,
    )

    start = time.time()

    ### preprocess, prepare clustering input ###
    if retained_cells_idx is None:
        retained_cells_idx = range(len(adata.X))
    if type(do_norm) is str:
        adata.X = scipy.sparse.csr_matrix(adata.X)
        if do_norm == "seurat":
            recipe_seurat(adata, do_log, norm_scale)
            print(f"\t\tseurat norm retained {adata.X.shape[1]} genes")
        elif do_norm == "zheng17":
            recipe_zheng17(adata, do_log, norm_scale, n_top_genes=5000)
            print(f"\t\tzheng norm retained {adata.X.shape[1]} genes")
        elif do_norm == "duo":
            recipe_duo(adata, do_log, renorm=norm_scale)
            print(f"\t\tduo norm retained {adata.X.shape[1]} genes")
        else:
            raise ValueError("do_norm not in 'duo', seurat', 'zheng17'")
    if scipy.sparse.issparse(adata.X):
        adata.X = adata.X.toarray()
    if do_log and not (type(do_norm) is str):
        print("\t\tlog_transformed data")
        sc.pp.log1p(adata)
    if do_pca:
        use_rep = "X_pca"
        sc.pp.pca(adata, n_comps=min(adata.X.shape[1] - 1,
                                     min(len(adata.X) - 1, n_comps)))
        X = adata.obsm["X_pca"]
        res_key = results_dict["X_pca"] = {}
    else:
        # already computed pca
        use_rep = "X_pca"
        X = adata.obsm["X_pca"]
        res_key = results_dict["X_pca"] = {}

    skews = {}
    # scanpy
    for method in ["umap", "gauss"]:
        # compute neighbors
        try:
            sc.pp.neighbors(
                adata,
                n_neighbors=n_neighbors + 1,
                metric=metric,
                use_rep=use_rep,
                method=method,
            )
        except Exception:
            sc.pp.neighbors(
                adata,
                n_neighbors=n_neighbors + 1,
                metric=metric,
                use_rep=use_rep,
                method=method,
                knn=False,
            )
        skews[method] = skew(adata.obsp["connectivities"].sum(axis=0).flat)
    print("\t\t\tScoring:", round((time.time() - start) / 60, 2), "mn")
    return skews
def get_statistical_features(v, field_type, field_general_type):
    r = OrderedDict([(f['name'], None) for f in
                     field_c_statistical_features_list + field_q_statistical_features_list])
    if not len(v):
        return r

    if field_general_type == 'c':
        r['list_entropy'] = list_entropy(v)

        value_lengths = [len(x) for x in v]
        r['mean_value_length'] = np.mean(value_lengths)
        r['median_value_length'] = np.median(value_lengths)
        r['min_value_length'] = np.min(value_lengths)
        r['max_value_length'] = np.max(value_lengths)
        r['std_value_length'] = np.std(value_lengths)
        r['percentage_of_mode'] = (pd.Series(v).value_counts().max() / len(v))

    if field_general_type in 'q':
        sample_mean = np.mean(v)
        sample_median = np.median(v)
        sample_var = np.var(v)
        sample_min = np.min(v)
        sample_max = np.max(v)
        sample_std = np.std(v)
        # np.percentile expects percentages in [0, 100]; the original passed
        # fractions [0.01, 0.25, 0.75, 0.99]
        q1, q25, q75, q99 = np.percentile(v, [1, 25, 75, 99])
        iqr = q75 - q25

        r['mean'] = sample_mean
        r['normalized_mean'] = sample_mean / sample_max
        r['median'] = sample_median
        r['normalized_median'] = sample_median / sample_max
        r['var'] = sample_var
        r['std'] = sample_std
        r['coeff_var'] = (sample_mean / sample_var) if sample_var else None
        r['min'] = sample_min
        r['max'] = sample_max
        r['range'] = r['max'] - r['min']
        r['normalized_range'] = (r['max'] - r['min']) / \
            sample_mean if sample_mean else None
        r['entropy'] = entropy(v)
        r['gini'] = gini(v)
        r['q25'] = q25
        r['q75'] = q75
        r['med_abs_dev'] = np.median(np.absolute(v - sample_median))
        r['avg_abs_dev'] = np.mean(np.absolute(v - sample_mean))
        r['quant_coeff_disp'] = (q75 - q25) / (q75 + q25)
        r['coeff_var'] = sample_var / sample_mean
        r['skewness'] = skew(v)
        r['kurtosis'] = kurtosis(v)
        r['moment_5'] = moment(v, moment=5)
        r['moment_6'] = moment(v, moment=6)
        r['moment_7'] = moment(v, moment=7)
        r['moment_8'] = moment(v, moment=8)
        r['moment_9'] = moment(v, moment=9)
        r['moment_10'] = moment(v, moment=10)

        # Outliers
        outliers_15iqr = np.logical_or(v < (q25 - 1.5 * iqr), v > (q75 + 1.5 * iqr))
        outliers_3iqr = np.logical_or(v < (q25 - 3 * iqr), v > (q75 + 3 * iqr))
        outliers_1_99 = np.logical_or(v < q1, v > q99)
        outliers_3std = np.logical_or(v < (sample_mean - 3 * sample_std),
                                      v > (sample_mean + 3 * sample_std))
        r['percent_outliers_15iqr'] = np.sum(outliers_15iqr) / len(v)
        r['percent_outliers_3iqr'] = np.sum(outliers_3iqr) / len(v)
        r['percent_outliers_1_99'] = np.sum(outliers_1_99) / len(v)
        r['percent_outliers_3std'] = np.sum(outliers_3std) / len(v)
        r['has_outliers_15iqr'] = np.any(outliers_15iqr)
        r['has_outliers_3iqr'] = np.any(outliers_3iqr)
        r['has_outliers_1_99'] = np.any(outliers_1_99)
        r['has_outliers_3std'] = np.any(outliers_3std)

        # Statistical Distribution
        if len(v) >= 8:
            normality_k2, normality_p = normaltest(v)
            r['normality_statistic'] = normality_k2
            r['normality_p'] = normality_p
            r['is_normal_5'] = (normality_p < 0.05)
            r['is_normal_1'] = (normality_p < 0.01)
    return r
def condition(X):
    return abs(skew(X)) > threshold
for i, q in enumerate(tqdm_notebook(df.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                        np.nan_to_num(question2_vectors))]
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                            np.nan_to_num(question2_vectors))]
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]
df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                 np.nan_to_num(question2_vectors))]
df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                np.nan_to_num(question2_vectors))]
df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

df['is_duplicate'].value_counts()
df.isnull().sum()

df.drop(['question1', 'question2'], axis=1, inplace=True)
df = df[pd.notnull(df['cosine_distance'])]
df = df[pd.notnull(df['jaccard_distance'])]

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
def main(test, folder, scan, reduced_intensity, reduced_q, temperature=None,
         structure_factor=None, y=None, ramping=False, scatter=None,
         background=None, pooled=False):
    """
    Processes data sets, creates statistical fits, and outputs plots.
    =============
    --VARIABLES--
    test:              Type of liquid: "Water", "Ethanol", "Dodecane"
    folder:            Save location of the processed data set.
    scan:              Specific scan under the type of test.
    reduced_intensity: Intensity profiles reduced down to the cropped q.
    reduced_q:         Cropped q range.
    temperature:       Nozzle temperature.
    structure_factor:  Structure factor profiles (water only).
    y:                 Vertical location in spray (IJ only).
    ramping:           Ramping IJ case (True/False).
    scatter:           Scatter images to save (optional).
    background:        Background images to save (optional).
    pooled:            Ramping IJ pooled case (True/False).
    """
    prfl_fld = create_folder('{0}/{1}/Profiles/'.format(folder, scan))
    stats_fld = create_folder('{0}/{1}/Statistics/'.format(folder, scan))
    plt_fld = create_folder('{0}/{1}/Plots/'.format(folder, scan))
    tests_fld = create_folder('{0}/{1}/Tests/'.format(folder, scan))
    curves_fld = create_folder('{0}/{1}/Curves/'.format(folder, scan))

    if structure_factor is not None:
        sf_fld = create_folder('{0}/{1}/Structure Factor/'.format(
            folder, scan))

    if 'IJ' in str(scan) and 'Ethanol' in test:
        pinned_pts = np.abs(reduced_q - 1.40).argmin()
    elif 'IJ' in str(scan) and 'Water' in test:
        pinned_pts = np.abs(reduced_q - 2.79).argmin()
    else:
        # Find pinned points in the curves (least variation)
        intensity_std = np.std(reduced_intensity, axis=0)
        pinned_pts = find_peaks(-intensity_std)[0]
        # Find the minimum peak only (throw away every other valley)
        pinned_pts = pinned_pts[np.argmin(intensity_std[pinned_pts])]

    pinned_q = reduced_q[pinned_pts]

    # Designate fit_var
    if pooled:
        index = np.linspace(1, len(reduced_intensity), len(reduced_intensity))
        fit_var = index
        data_label = ''
    else:
        if y is not None:
            fit_var = y
            data_label = ' mm'
            np.savetxt('{0}/positions.txt'.format(prfl_fld), y)
            np.savetxt('{0}/positions.txt'.format(stats_fld), y)
        elif temperature is not None:
            fit_var = temperature
            data_label = ' K'
            np.savetxt('{0}/temperature.txt'.format(prfl_fld), temperature)
            np.savetxt('{0}/temperature.txt'.format(stats_fld), temperature)

    # Save images if scatter and background arrays are passed
    if scatter is not None:
        img_fld = create_folder('{0}/{1}/Images/'.format(folder, scan))
        saveimage(img_fld, fit_var, scatter, background)

    # Save intensities in tests_fld
    for i, x in enumerate(reduced_intensity):
        np.savetxt('{0}/{1:03.0f}.txt'.format(tests_fld, i), x)

    profile('peak', fit_var, [np.max(x) for x in reduced_intensity],
            prfl_fld, stats_fld, test, plt_fld)
    profile('peakq', fit_var,
            [reduced_q[np.argmax(x)] for x in reduced_intensity],
            prfl_fld, stats_fld, test, plt_fld)
    profile('aratio', fit_var, [
        np.trapz(x[:pinned_pts], reduced_q[:pinned_pts]) /
        np.trapz(x[pinned_pts:], reduced_q[pinned_pts:])
        for x in reduced_intensity
    ], prfl_fld, stats_fld, test, plt_fld)
    profile('mean', fit_var, [np.mean(x) for x in reduced_intensity],
            prfl_fld, stats_fld, test, plt_fld)
    profile('var', fit_var, [np.var(x) for x in reduced_intensity],
            prfl_fld, stats_fld, test, plt_fld)
    profile('skew', fit_var, [stats.skew(x) for x in reduced_intensity],
            prfl_fld, stats_fld, test, plt_fld)
    profile('kurt', fit_var, [stats.kurtosis(x) for x in reduced_intensity],
            prfl_fld, stats_fld, test, plt_fld)
    profile('pca', fit_var, pca(reduced_intensity),
            prfl_fld, stats_fld, test, plt_fld)

    profile_peakq = np.loadtxt('{0}/profile_peakq.txt'.format(prfl_fld))

    # Color ramp: red fraction rises and blue fraction falls with fit_var
    rr = np.array([(x - min(fit_var)) / (max(fit_var) - min(fit_var))
                   for x in fit_var])
    bb = np.array([1 - (x - min(fit_var)) / (max(fit_var) - min(fit_var))
                   for x in fit_var])

    for i, _ in enumerate(reduced_intensity):
        # Create intensity plots with q values of interest highlighted
        plt.figure()
        plt.plot(reduced_q, reduced_intensity[i], linestyle='-',
                 color=(rr[i], 0, bb[i]), linewidth=2.0,
                 label='{0:0.1f}{1}'.format(fit_var[i], data_label))
        plt.axvline(x=profile_peakq[i], linestyle='--', color='C1',
                    label='peakq = {0:0.2f}'.format(profile_peakq[i]))
        plt.legend(loc='upper right')
        plt.xlabel('q (Å$^{-1}$)')
        plt.ylabel('Intensity (a.u.)')
        plt.autoscale(enable=True, axis='x', tight=True)
        plt.minorticks_on()
        plt.tick_params(which='both', direction='in')
        plt.title(test + ' Curves')
        plt.tight_layout()
        plt.savefig('{0}/curves_{1:0.1f}.png'.format(curves_fld, fit_var[i]))
        plt.close()

    if structure_factor is not None:
        for i, _ in enumerate(structure_factor):
            np.savetxt('{0}/{1:02d}_{2:0.2f}K.txt'
                       .format(tests_fld, i, temperature[i]).replace('.', 'p'),
                       structure_factor[i])

            plt.figure()
            plt.plot(reduced_q, structure_factor[i], linestyle='-',
                     color=(rr[i], 0, bb[i]), linewidth=2.0,
                     label='{0:0.1f}{1}'.format(temperature[i], data_label))
            plt.legend(loc='upper right')
            plt.xlabel('q (Å$^{-1}$)')
            plt.ylabel('Structure Factor (a.u.)')
            plt.autoscale(enable=True, axis='x', tight=True)
            plt.minorticks_on()
            plt.tick_params(which='both', direction='in')
            plt.title(test + ' Curves')
            plt.tight_layout()
            plt.savefig('{0}/{1}curves_{2:0.1f}.png'.format(
                sf_fld, test, temperature[i]))
            plt.close()

    np.savetxt('{0}/{1}/q_range.txt'.format(folder, scan), reduced_q)

    if pooled:
        np.savetxt('{0}/{1}/temperature.txt'.format(folder, scan), temperature)
        np.savetxt('{0}/{1}/positions.txt'.format(folder, scan), y)
    else:
        if temperature is not None:
            np.savetxt('{0}/{1}/temperature.txt'.format(folder, scan),
                       temperature)
        elif y is not None:
            np.savetxt('{0}/{1}/positions.txt'.format(folder, scan), y)

    if 'IJ' not in str(scan):
        # Standard deviation plot of all intensities
        plt.figure()
        plt.plot(reduced_q, intensity_std, linewidth=2.0)
        plt.xlabel('q (Å$^{-1}$)')
        plt.ylabel('SD(Intensity) (a.u.)')
        plt.axvline(x=pinned_q, color='k', linestyle='--')
        plt.text(pinned_q, 0.6 * np.mean(intensity_std),
                 'q = {0:02.2f}'.format(pinned_q),
                 horizontalalignment='center',
                 bbox=dict(facecolor='white', alpha=1.0))
        plt.title('Scan {0}'.format(scan))
        plt.tight_layout()
        plt.savefig('{0}/stdev.png'.format(plt_fld))
        plt.close()

        # Superimposed intensity plot
        plt.figure()
        for i, x in enumerate(reduced_intensity):
            plt.plot(reduced_q, x, color=(rr[i], 0, bb[i]))
        plt.xlabel('q (Å$^{-1}$)')
        plt.ylabel('Intensity (a.u.)')
        plt.axvline(x=pinned_q, color='k', linestyle='--')
        plt.text(pinned_q, 0.5, 'q = {0:02.2f}'.format(pinned_q),
                 horizontalalignment='center',
                 bbox=dict(facecolor='white', alpha=1.0))
        plt.title('Scan {0}'.format(scan))
        plt.tight_layout()
        plt.savefig('{0}/superimposedcurves.png'.format(plt_fld))
        plt.close()

    # Save the calibration data sets and log the date/time processing was done
    if temperature is not None and ramping is False and 'IJ' not in str(scan):
        with open('{0}/{1}/{1}_data.pckl'.format(folder, scan), 'wb') as f:
            pickle.dump([temperature, reduced_q, reduced_intensity], f)
        with open('{0}/{1}/{1}_log.txt'.format(folder, scan), 'a+') as f:
            f.write(datetime.now().strftime("\n%d-%b-%Y %I:%M:%S %p"))
    # Save the ethanol (cold/ambient/hot) and water impinging jet data sets
    # and log the date/time processing was done
    elif y is not None and ramping is False:
        with open('{0}/{1}/{1}_data.pckl'.format(folder, scan), 'wb') as f:
            pickle.dump([y, reduced_q, reduced_intensity], f)
        with open('{0}/{1}/{1}_log.txt'.format(folder, scan), 'a+') as f:
            f.write(datetime.now().strftime("\n%d-%b-%Y %I:%M:%S %p"))
    # Save the ethanol ramping impinging jet data set and log the date/time
    # processing was done
    elif ramping is True:
        with open(folder + '/' + str(scan) + '/' +
                  str(scan).rsplit('/')[-1] + '_data.pckl', 'wb') as f:
            pickle.dump([temperature, y, reduced_q, reduced_intensity], f)
        with open(folder + '/' + str(scan) + '/' +
                  str(scan).rsplit('/')[-1] + '_log.txt', 'a+') as f:
            f.write(datetime.now().strftime("\n%d-%b-%Y %I:%M:%S %p"))
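The list comprehensions above call stats.skew and stats.kurtosis once per curve. Since reduced_intensity is rectangular, the same four shape statistics can be computed in one vectorized pass; a minimal sketch of that alternative is below (curve_moments is a hypothetical helper, not part of the original module, and profile() itself is not shown here):

import numpy as np
from scipy import stats

def curve_moments(reduced_intensity):
    """Vectorized version of the per-curve statistics fed to profile():
    mean, variance, skewness, and kurtosis of each intensity profile."""
    arr = np.asarray(reduced_intensity)  # shape (n_curves, n_q)
    return {
        'mean': arr.mean(axis=1),
        'var': arr.var(axis=1),
        'skew': stats.skew(arr, axis=1),
        'kurt': stats.kurtosis(arr, axis=1),
    }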
def add_pheno_group(self, ct, mask, chest_region, chest_type, pheno_name):
    """For a given mask, this function computes all phenotypes corresponding
    to the masked structure and adds them to the dataframe with the
    'add_pheno' method

    Parameters
    ----------
    ct : array, shape ( X, Y, Z )
        The 3D CT image array

    mask : boolean array, shape ( X, Y, Z )
        Boolean mask where True values indicate presence of the structure
        of interest

    chest_region : string
        Name of the chest region in the (region, type) key used to populate
        the dataframe

    chest_type : string
        Name of the chest type in the (region, type) key used to populate
        the dataframe

    pheno_name : string
        Name of the phenotype used to populate the dataframe

    References
    ----------
    1. Schneider et al, 'Correlation between CT numbers and tissue
       parameters needed for Monte Carlo simulations of clinical dose
       distributions'
    """
    assert pheno_name in self.pheno_names_, "Invalid phenotype name"

    pheno_val = None
    mask_sum = np.sum(mask)
    if pheno_name == 'LAA950':
        pheno_val = float(np.sum(ct[mask] <= -950.)) / mask_sum
    elif pheno_name == 'LAA910':
        pheno_val = float(np.sum(ct[mask] <= -910.)) / mask_sum
    elif pheno_name == 'LAA856':
        pheno_val = float(np.sum(ct[mask] <= -856.)) / mask_sum
    elif pheno_name == 'HAA700':
        pheno_val = float(np.sum(ct[mask] >= -700.)) / mask_sum
    elif pheno_name == 'HAA600':
        pheno_val = float(np.sum(ct[mask] >= -600.)) / mask_sum
    elif pheno_name == 'HAA500':
        pheno_val = float(np.sum(ct[mask] >= -500.)) / mask_sum
    elif pheno_name == 'HAA250':
        pheno_val = float(np.sum(ct[mask] >= -250.)) / mask_sum
    elif pheno_name == 'Perc15':
        pheno_val = np.percentile(ct[mask], 15)
    elif pheno_name == 'Perc10':
        pheno_val = np.percentile(ct[mask], 10)
    elif pheno_name == 'HUMean':
        pheno_val = np.mean(ct[mask])
    elif pheno_name == 'HUStd':
        pheno_val = np.std(ct[mask])
    elif pheno_name == 'HUKurtosis':
        pheno_val = kurtosis(ct[mask], bias=False, fisher=True)
    elif pheno_name == 'HUSkewness':
        pheno_val = skew(ct[mask], bias=False)
    elif pheno_name == 'HUMode':
        # Shift the (assumed integer) HU values to be non-negative so
        # np.bincount can find the most frequent value, then shift back
        min_val = np.min(ct[mask])
        pheno_val = np.argmax(np.bincount(ct[mask] + np.abs(min_val))) - \
            np.abs(min_val)
    elif pheno_name == 'HUMedian':
        pheno_val = np.median(ct[mask])
    elif pheno_name == 'HUMin':
        pheno_val = np.min(ct[mask])
    elif pheno_name == 'HUMax':
        pheno_val = np.max(ct[mask])
    elif pheno_name == 'HUMean500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = np.mean(hus)
    elif pheno_name == 'HUStd500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = np.std(hus)
    elif pheno_name == 'HUKurtosis500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = kurtosis(hus, bias=False, fisher=True)
    elif pheno_name == 'HUSkewness500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = skew(hus, bias=False)
    elif pheno_name == 'HUMode500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            min_val = np.min(hus)
            pheno_val = np.argmax(np.bincount(hus + np.abs(min_val))) - \
                np.abs(min_val)
    elif pheno_name == 'HUMedian500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = np.median(hus)
    elif pheno_name == 'HUMin500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = np.min(hus)
    elif pheno_name == 'HUMax500':
        hus = ct[np.logical_and(mask, ct <= -500)]
        if hus.shape[0] > 0:
            pheno_val = np.max(hus)
    elif pheno_name == 'Volume':
        pheno_val = np.prod(self._spacing) * float(mask_sum)
    elif pheno_name == 'Mass':
        # This quantity is computed in a piecewise linear form according
        # to the prescription presented in ref. [1]. Mass is computed in
        # grams. First compute the contribution in the HU interval from
        # -98 and below.
        pheno_val = 0.0
        HU_tmp = ct[np.logical_and(mask, ct < -98)].clip(-1000)
        if HU_tmp.shape[0] > 0:
            m = (1.21e-3 - 0.93) / (-1000 + 98)
            b = 1.21e-3 + 1000 * m
            pheno_val += np.sum((m * HU_tmp + b) *
                                np.prod(self._spacing) * 0.001)

        # Now compute the mass contribution in the interval [-98, 18] HU.
        # Note that in the original paper, the interval is defined from
        # -98 HU to 14 HU, but we extend it slightly here so there are no
        # gaps in coverage. The values we report in the interval [14, 23]
        # should be viewed as approximate.
        HU_tmp = ct[np.logical_and(np.logical_and(mask, ct >= -98),
                                   ct <= 18)]
        if HU_tmp.shape[0] > 0:
            pheno_val += np.sum((1.018 + 0.893 * HU_tmp / 1000.0) *
                                np.prod(self._spacing) * 0.001)

        # Compute the mass contribution in the interval (18, 100]
        HU_tmp = ct[np.logical_and(np.logical_and(mask, ct > 18),
                                   ct <= 100)]
        if HU_tmp.shape[0] > 0:
            pheno_val += np.sum((1.003 + 1.169 * HU_tmp / 1000.0) *
                                np.prod(self._spacing) * 0.001)

        # Compute the mass contribution in the interval > 100
        HU_tmp = ct[np.logical_and(mask, ct > 100)]
        if HU_tmp.shape[0] > 0:
            pheno_val += np.sum((1.017 + 0.592 * HU_tmp / 1000.0) *
                                np.prod(self._spacing) * 0.001)

    if pheno_val is not None:
        self.add_pheno([chest_region, chest_type], pheno_name, pheno_val)
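The 'Mass' branch above applies a piecewise-linear HU-to-density mapping from Schneider et al. The same mapping can be factored into a standalone helper, which makes the four segments easier to inspect; a minimal sketch follows (hu_to_density is a hypothetical name, with the coefficients copied from the branch above):

import numpy as np

def hu_to_density(hu):
    """Piecewise-linear HU -> density (g/ml) mapping used by the 'Mass'
    phenotype (coefficients per Schneider et al.)."""
    hu = np.asarray(hu, dtype=float).clip(-1000, None)
    m = (1.21e-3 - 0.93) / (-1000 + 98)
    b = 1.21e-3 + 1000 * m
    return np.piecewise(
        hu,
        [hu < -98,
         (hu >= -98) & (hu <= 18),
         (hu > 18) & (hu <= 100),
         hu > 100],
        [lambda x: m * x + b,
         lambda x: 1.018 + 0.893 * x / 1000.0,
         lambda x: 1.003 + 1.169 * x / 1000.0,
         lambda x: 1.017 + 0.592 * x / 1000.0])

# Mass in grams = density (g/ml) * voxel volume (mm^3) * 0.001 (mm^3 -> ml);
# 'spacing' here stands in for self._spacing:
# mass = np.sum(hu_to_density(ct[mask]) * np.prod(spacing) * 0.001)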
from scipy.stats import skew
import numpy

# 'model' and 'structure' come from Bio.PDB parsing (not shown here)

# idea 1: average the per-axis skewness of each chain's CA coordinates
skewchain = []
for chain in model.get_list():
    xvalues = []
    yvalues = []
    zvalues = []
    for residue in chain.get_list():
        if residue.has_id("CA"):
            ca = residue["CA"]
            temp = ca.get_coord()
            xvalues.append(temp[0])
            yvalues.append(temp[1])
            zvalues.append(temp[2])
    xskew = skew(xvalues)
    yskew = skew(yvalues)
    zskew = skew(zvalues)
    avg = (xskew + yskew + zskew) / 3
    skewchain.append(avg)

print("chainwise average : ", numpy.mean(skewchain))

# idea 1.2: ((xi - meanx)^3 + (yi - meany)^3 + (zi - meanz)^3) / (sdx + sdy + sdz)

# idea 2: measure skewness for all CA atoms present
xvalues = []
yvalues = []
zvalues = []
for model in structure.get_list():
    for chain in model.get_list():
        for residue in chain.get_list():
            if residue.has_id("CA"):
                temp = residue["CA"].get_coord()
                xvalues.append(temp[0])
                yvalues.append(temp[1])
                zvalues.append(temp[2])
import numpy as np
from scipy.stats import skew

def this_skew(x):
    # The unbiased sample skewness is undefined for fewer than 3 points
    if len(x) < 3:
        return np.nan
    return skew(x, bias=False)
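The length guard makes this helper safe as an aggregation function over short or rolling windows, where the unbiased estimator would otherwise fail. A minimal usage sketch with pandas (the series and window size are illustrative, not from the original):

import numpy as np
import pandas as pd

s = pd.Series(np.random.default_rng(0).exponential(size=100))

# Rolling 20-sample skewness; windows shorter than 3 points yield NaN
rolling_skew = s.rolling(window=20, min_periods=1).apply(this_skew, raw=True)
print(rolling_skew.tail())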
- First I'll transform the skewed numeric features by taking log(feature + 1) - this will make the features more normal
- Create dummy variables for the categorical features
- Replace the numeric missing values (NaN's) with the mean of their respective columns

matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price": train["SalePrice"],
                       "log(price + 1)": np.log1p(train["SalePrice"])})
prices.hist()

# log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

# log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

all_data = pd.get_dummies(all_data)

# filling NA's with the mean of the column:
all_data = all_data.fillna(all_data.mean())

# creating matrices for sklearn:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
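Why log1p works here: for right-skewed positive features, the log compresses the long upper tail and pulls the sample skewness toward zero, which is exactly what the 0.75 threshold above screens for. A quick self-contained check on synthetic data (illustrative only):

import numpy as np
from scipy.stats import skew

rng = np.random.default_rng(42)
x = rng.lognormal(mean=0.0, sigma=1.0, size=10_000)  # right-skewed sample

print('raw skew:   {:.2f}'.format(skew(x)))            # strongly positive
print('log1p skew: {:.2f}'.format(skew(np.log1p(x))))  # much closer to 0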
# Check remaining missing values, if any
data_features.isnull().sum()[data_features.isnull().sum() > 0].sort_values(
    ascending=False)

data_features['MSSubClass'].unique()

# These numeric columns are really categorical codes, so cast them to str
data_features['YrSold'] = data_features['YrSold'].astype(str)
data_features['OverallCond'] = data_features['OverallCond'].astype(str)
data_features['MSSubClass'] = data_features['MSSubClass'].astype(str)
data_features['MoSold'] = data_features['MoSold'].astype(str)

aa = list(data_features.select_dtypes(include=['object']).columns)

numerical_features = data_features.select_dtypes(exclude=["object"]).columns
num_feat = data_features[numerical_features]
print("Numerical features : " + str(len(numerical_features)))

skewness = num_feat.apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 1]
skewness.sort_values(ascending=False)

from scipy.special import boxcox1p

skewed_features = skewness.index
# Note: lam = 0.15 is a common fixed choice, but the loop below instead
# estimates lambda per feature with boxcox_normmax, so lam goes unused
lam = 0.15
for feat in skewed_features:
    num_feat[feat] = boxcox1p(num_feat[feat],
                              stats.boxcox_normmax(num_feat[feat] + 1))
    data_features[feat] = boxcox1p(
        data_features[feat], stats.boxcox_normmax(data_features[feat] + 1))

# label encoding for some ordered categorical variables
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
import numpy as np
from scipy.stats import skew

def skewness_log(data):
    # Log-transform the two heavily right-skewed columns and return
    # their skewness after the transform
    data_trans = data.copy()
    data_trans['GrLivArea'] = np.log(data_trans['GrLivArea'])
    data_trans['SalePrice'] = np.log(data_trans['SalePrice'])
    return skew(data_trans['GrLivArea']), skew(data_trans['SalePrice'])
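A minimal usage sketch, assuming `data` is a housing DataFrame with positive GrLivArea and SalePrice columns (the values below are illustrative):

import pandas as pd

df = pd.DataFrame({
    'GrLivArea': [856, 1262, 1786, 2198, 1362],
    'SalePrice': [208500, 181500, 223500, 140000, 250000],
})

grliv_skew, price_skew = skewness_log(df)
print('skew after log: GrLivArea={:.2f}, SalePrice={:.2f}'.format(
    grliv_skew, price_skew))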
def masks_and_traces(ops, stat_manual, stat_orig):
    ''' Main extraction function.

    inputs: ops and stat
    creates cell and neuropil masks and extracts traces

    returns: F (ROIs x time), Fneu (ROIs x time), F_chan2, Fneu_chan2,
    ops, stat
    F_chan2 and Fneu_chan2 will be empty if no second channel
    '''
    if 'aspect' in ops:
        dy, dx = int(ops['aspect'] * 10), 10
    else:
        d0 = ops['diameter']
        dy, dx = (d0, d0) if isinstance(d0, int) else d0

    t0 = time.time()
    # Concatenate stat so a good neuropil function can be formed
    stat_all = stat_manual.copy()
    for n in range(len(stat_orig)):
        stat_all.append(stat_orig[n])
    stat_all = roi_stats(stat_all, dy, dx, ops['Ly'], ops['Lx'])
    cell_masks = [
        masks.create_cell_mask(stat, Ly=ops['Ly'], Lx=ops['Lx'],
                               allow_overlap=ops['allow_overlap'])
        for stat in stat_all
    ]
    cell_pix = masks.create_cell_pix(stat_all, Ly=ops['Ly'], Lx=ops['Lx'])
    manual_roi_stats = stat_all[:len(stat_manual)]
    manual_cell_masks = cell_masks[:len(stat_manual)]
    manual_neuropil_masks = masks.create_neuropil_masks(
        ypixs=[stat['ypix'] for stat in manual_roi_stats],
        xpixs=[stat['xpix'] for stat in manual_roi_stats],
        cell_pix=cell_pix,
        inner_neuropil_radius=ops['inner_neuropil_radius'],
        min_neuropil_pixels=ops['min_neuropil_pixels'],
    )
    print('Masks made in %0.2f sec.' % (time.time() - t0))

    F, Fneu, F_chan2, Fneu_chan2, ops = extract_traces_from_masks(
        ops, manual_cell_masks, manual_neuropil_masks)

    # compute activity statistics for classifier
    npix = np.array([stat_orig[n]['npix']
                     for n in range(len(stat_orig))]).astype('float32')
    for n in range(len(manual_roi_stats)):
        manual_roi_stats[n]['npix_norm'] = (
            manual_roi_stats[n]['npix'] / np.mean(npix[:100]))
        # What if there are fewer than 100 cells?
        manual_roi_stats[n]['compact'] = 1
        manual_roi_stats[n]['footprint'] = 2
        manual_roi_stats[n]['manual'] = 1  # Add manual key

    # subtract neuropil and compute skew, std from dF
    dF = F - ops['neucoeff'] * Fneu
    sk = stats.skew(dF, axis=1)
    sd = np.std(dF, axis=1)

    for n in range(F.shape[0]):
        manual_roi_stats[n]['skew'] = sk[n]
        manual_roi_stats[n]['std'] = sd[n]
        manual_roi_stats[n]['med'] = [
            np.mean(manual_roi_stats[n]['ypix']),
            np.mean(manual_roi_stats[n]['xpix'])
        ]

    # dF is already neuropil-corrected above; reuse it for deconvolution
    spks = oasis(F=dF, batch_size=ops['batch_size'], tau=ops['tau'],
                 fs=ops['fs'])

    return F, Fneu, F_chan2, Fneu_chan2, spks, ops, manual_roi_stats
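Skew earns its place among the "activity statistics for classifier" above because calcium transients put a heavy positive tail on a neuropil-corrected trace, while a quiet ROI stays roughly symmetric around its baseline. An illustrative check on synthetic traces (not suite2p code; the transient model is an assumption):

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
noise = rng.normal(size=5000)  # quiet ROI: symmetric noise
# active ROI: same noise plus sparse positive-going transients
active = noise + (rng.random(5000) < 0.01) * rng.exponential(5.0, 5000)

print('quiet skew : {:+.2f}'.format(stats.skew(noise)))   # near 0
print('active skew: {:+.2f}'.format(stats.skew(active)))  # clearly > 0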
def course_action(request):
    context = {}
    __course_id = int(request.POST['course'])
    __course_obj = None
    __section_objects = []
    __sections = []
    __message = ''
    __mean = 0
    __std = 0
    __min = 0
    __max = 0
    __skewness = 0
    __ttest_annova_type = ''
    __ttest_annova_value = 0
    __ttest_annova_sig = 0
    __correlation = 0
    __show__result = 0
    __histogramfile = ''
    __mids = []
    __finals = []
    __totals = []
    __domain = ''
    __course = ''
    __nbr_sections = 0
    __found_section = 0
    try:
        sem.acquire()
        print("The semaphore is locked")
        if request.method == 'GET':
            raise Exception('Internal Error')
        if request.method == 'POST':
            __course_obj = Course.objects.get(course_id=__course_id)
            print('Dealing with course ' + __course_obj.course_name_ar)
            __course = __course_obj.course_name_ar
            for _section in Section.objects.all():
                if _section.section_course.course_id == __course_id:
                    __section_objects.append(_section)
                    __nbr_sections += 1
            print('Dealing with ' + str(__nbr_sections) + ' sections : ' +
                  str(__section_objects))

            for _section in __section_objects:
                for _report in SectionDocRequest.objects.all():
                    if _report.section.section_code == _section.section_code:
                        __sections.append(_report)
                        __found_section += 1
                        # Grades are stored as a stringified dict; eval()
                        # assumes the stored data is trusted
                        __data = eval(_report.student_grades)
                        __mids.extend(__data['mids'])
                        __finals.extend(__data['finals'])
                        __totals.extend(__data['totals'])
                        break
            print('Dealing with ' + str(__found_section) +
                  ' section reports: ' + str(__sections))
            if __nbr_sections != __found_section:
                raise Exception('Some sections need to be analysed first')

            # compute statistics about the course grades
            __mean = float("{0:.4f}".format(statistics.mean(__totals)))
            __std = float("{0:.4f}".format(statistics.stdev(__totals)))
            __skewness = float("{0:.4f}".format(skew(__totals, bias=False)))
            if len(__mids) == 0:
                __correlation = -99.99
            else:
                # pearsonr returns (r, p-value); index 0 is the
                # correlation coefficient
                __correlation = float("{0:.4f}".format(
                    pearsonr(__mids, __finals)[0]))
            __min = min(__totals)
            __max = max(__totals)

            if __nbr_sections == 2:
                # T-Test
                __ttest_annova_type = 'T-Test'
                _total1 = eval(__sections[0].student_grades)['totals']
                _total2 = eval(__sections[1].student_grades)['totals']
                res = scipy.stats.ttest_ind(_total1, _total2)
                __ttest_annova_value = float("{0:.4f}".format(res.statistic))
                __ttest_annova_sig = float("{0:.4f}".format(res.pvalue))
            else:
                # ANOVA across all sections; f_oneway accepts any number of
                # groups, so no per-count branching is needed
                __ttest_annova_type = 'ANOVA'
                _totals_per_section = [
                    eval(_report.student_grades)['totals']
                    for _report in __sections
                ]
                res = scipy.stats.f_oneway(*_totals_per_section)
                __ttest_annova_value = float("{0:.4f}".format(res[0]))
                __ttest_annova_sig = float("{0:.4f}".format(res[1]))

            print("mean : " + str(__mean))
            print("std : " + str(__std))
            print("skewness : " + str(__skewness))
            print("correlation : " + str(__correlation))
            print("__ttest_annova_type : " + str(__ttest_annova_type))
            print("__ttest_annova_value : " + str(__ttest_annova_value))
            print("__ttest_annova_sig : " + str(__ttest_annova_sig))

            # plot the histogram
            a = np.array(__totals)

            # Fit a normal distribution to the data:
            mu, std = norm.fit(a)
            number = a.size

            # Plot the histogram.
            plt.hist(a, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                     density=True, color='#607c8e', edgecolor='black',
                     rwidth=0.8)

            # Plot the PDF.
            x = np.linspace(0, 100, 100)
            p = norm.pdf(x, mu, std)
            plt.plot(x, p, 'k', linewidth=3)
            title = ("Histogram for course: " + __course_obj.course_name +
                     ", N=%d" % number)
            plt.title(title)
            __histogramfile = ("data/media/histogram_course_" +
                               str(__course) + ".png")
            plt.savefig(__histogramfile)
            plt.close()

            __show__result = 1

            try:
                obj = CourseDocRequest.objects.get(course=__course_obj)
                print("--------> Updating the course report data with id= " +
                      str(obj.course_doc_id))
            except CourseDocRequest.DoesNotExist:
                obj = CourseDocRequest()
                obj.course = __course_obj
                print("--------> Creating new course report data")
            obj.doc_correlation = __correlation
            obj.doc_explanation = ''
            obj.doc_max = __max
            obj.doc_mean = __mean
            obj.doc_min = __min
            obj.doc_skewness = __skewness
            obj.doc_std_deviation = __std
            obj.histogram = __histogramfile
            obj.doc_ttest_annova_sig = __ttest_annova_sig
            obj.doc_ttest_annova_value = __ttest_annova_value
            obj.doc_ttest_annova_type = __ttest_annova_type
            obj.save()

            # get the server domain
            if request.is_secure():
                __domain = 'https://' + request.get_host()
            else:
                __domain = 'http://' + request.get_host()
    except Exception as e:
        __message = e.__str__()
    finally:
        sem.release()
        print("The semaphore was released")

    if len(__mids) == 0:
        __correlation = 'N/A'
    context = {
        'ttest_annova_sig': __ttest_annova_sig,
        'ttest_annova_type': __ttest_annova_type,
        'ttest_annova_value': __ttest_annova_value,
        'show_result': __show__result,
        'message': __message,
        'mean': __mean,
        'std': __std,
        'skewness': __skewness,
        'correlation': __correlation,
        'min': __min,
        'max': __max,
        'histogram': __histogramfile,
        'domain': __domain,
        'course': __course,
        'sections': __sections,
    }
    __results = analysis(context)
    context['analysis'] = __results
    return render(request, 'course_result.html', context=context)
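One detail worth pinning down in this view: scipy.stats.pearsonr returns the pair (correlation, p-value), so indexing the result with [1] yields the significance rather than the correlation; the code above therefore reads index 0. A quick illustration (grade values are made up):

from scipy.stats import pearsonr

mids = [55, 60, 72, 80, 90]
finals = [58, 65, 70, 85, 88]

r, p = pearsonr(mids, finals)
print('correlation r = {:.4f}, p-value = {:.4f}'.format(r, p))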