def test_kde():
    """Tests for kernel density estimation.

    To test standard KDE, we use a random normal to learn the density,
    generate 1000 samples, and then check that the means / standard
    deviations / kurtosis figures are similar to within a margin ``eps``.

    TODO: Need a test that adjusts the bandwidth factor a.
    """
    eps = 0.1
    kde = KDE()

    # Test standard KDE.
    X = pandas.DataFrame(np.random.multivariate_normal([0, 0],
                                                       np.eye(2), (1000, )))
    S = kde.run(X, n_samples=1000)
    # Bug fix: compare ABSOLUTE deviations. The original signed means let
    # arbitrarily large negative deviations pass the check.
    assert np.mean(np.abs(S.std(0) - X.std(0))) < eps
    assert np.mean(np.abs(S.mean(0) - X.mean(0))) < eps
    assert np.mean(np.abs(kurtosis(S, 0) - kurtosis(X, 0))) < eps

    # Test partitioned KDE.
    counts = {'nGood': 250, 'nCritical': 100, 'nFail': 100}
    columns = ['A', 'B']
    spec_lims = pandas.DataFrame({columns[0]: np.array([-2.0, 2.0]),
                                  columns[1]: np.array([-2.0, 2.0])})
    specs = Specs(specs=spec_lims).gen_crit_region(5.5 / 6, 6.5 / 6)
    A = pandas.DataFrame(np.random.multivariate_normal([0, 0],
                                                       np.eye(2), (1000, )),
                         columns=columns)
    S = kde.run(A, specs=specs, counts=counts)
    assert np.mean(np.abs(S.std(0) - 1.3 * A.std(0))) < eps
    assert np.mean(np.abs(S.mean(0) - A.mean(0))) < eps
def getdata(data, samplerate=44100):
    """Build a flat feature vector for an audio signal.

    Features: min/max/mean/std/median/skew/kurtosis of the zero-crossing
    rate, RMS, spectral centroid, spectral rolloff and spectral flux
    tracks, followed by the flattened MFCC matrix.

    :param data: audio samples (iterable of numbers).
    :param samplerate: sampling rate in Hz.
    :return: list of features.
    """
    # Bug fix: the original kept ``data`` as a plain list but later read
    # ``.shape[0]`` from it (lists have no .shape); convert once up front.
    data = np.array([float(i) for i in data])
    sound_list["samplerate"] = samplerate
    sound_list["wavedata"] = data
    sound_list["number_of_samples"] = sound_list["wavedata"].shape[0]
    sound_list["song_length"] = int(sound_list["number_of_samples"] / samplerate)

    def _summary(values):
        # Seven summary statistics of one feature track.
        return [np.min(values), np.max(values), np.mean(values),
                np.std(values), np.median(values),
                st.skew(values), st.kurtosis(values)]

    ans = []
    for extractor in (zero_crossing_rate, root_mean_square, spectral_centroid,
                      spectral_rolloff, spectral_flux):
        track, ts = extractor(data, 1024, sound_list["samplerate"])
        ans += _summary(track)
    # Append every MFCC coefficient of every frame.
    x.set_input_data(data)
    for frame in x.MFCCs:
        for coeff in frame:
            ans.append(coeff)
    return ans
def get_mfcc_features(filename):
    """Extract MFCC summary features from a wav file.

    For coefficients 0..13 (0 = frame energy) this stores the mean,
    variance, skewness and kurtosis of the coefficient itself, of its
    velocity (frame-to-frame delta / 2) and of its acceleration
    (velocity delta / 2), plus the skewness/kurtosis across all
    per-coefficient means.

    :param filename: path to the wav file.
    :return: dict mapping feature name to value.
    """
    feature_dict = {}
    (rate, sig) = wav.read(filename)
    if sig.ndim == 2:  # wav is stereo so average over both channels
        try:
            mfcc_feat_chan0 = mfcc(sig[:, 0], rate, numcep=15, appendEnergy=True)
            mfcc_feat_chan1 = mfcc(sig[:, 1], rate, numcep=15, appendEnergy=True)
            mfcc_feat = (mfcc_feat_chan0 + mfcc_feat_chan1) / 2
        except IndexError:
            # NOTE(review): fallback treats the signal as mono; it is
            # unclear when a 2-D signal raises IndexError here — confirm.
            print('Index error')
            mfcc_feat = mfcc(sig, rate, numcep=15, appendEnergy=True)
    else:
        mfcc_feat = mfcc(sig, rate, numcep=15, appendEnergy=True)
    # Velocity is the difference between timestep t+1 and t for each mfcc_feat / 2
    vel = (mfcc_feat[:-1, :] - mfcc_feat[1:, :]) / 2.0
    # Acceleration is the difference between timestep t+1 and t for each velocity / 2
    acc = (vel[:-1, :] - vel[1:, :]) / 2.0
    mfcc_means = []
    # NOTE(review): only coefficients 0..13 of the 15 requested are used.
    for i in range(0, 14):
        key = "energy" if i == 0 else "mfcc" + str(i)
        # mfcc
        feature_dict[key + "_mean"] = mfcc_feat[:, i].mean()
        feature_dict[key + "_var"] = mfcc_feat[:, i].var()
        feature_dict[key + "_skewness"] = st.skew(mfcc_feat[:, i])
        feature_dict[key + "_kurtosis"] = st.kurtosis(mfcc_feat[:, i])
        # Vel
        feature_dict[key + "_vel_mean"] = vel[:, i].mean()
        feature_dict[key + "_vel_var"] = vel[:, i].var()
        feature_dict[key + "_vel_skewness"] = st.skew(vel[:, i])
        feature_dict[key + "_vel_kurtosis"] = st.kurtosis(vel[:, i])
        # Accel
        feature_dict[key + "_accel_mean"] = acc[:, i].mean()
        feature_dict[key + "_accel_var"] = acc[:, i].var()
        feature_dict[key + "_accel_skewness"] = st.skew(acc[:, i])
        feature_dict[key + "_accel_kurtosis"] = st.kurtosis(acc[:, i])
        # Need the skewness and kurtosis of all mfcc means
        if i > 0:
            mfcc_means.append(feature_dict[key + "_mean"])
    feature_dict["mfcc_skewness"] = st.skew(mfcc_means)
    feature_dict["mfcc_kurtosis"] = st.kurtosis(mfcc_means)
    return feature_dict
def AAcal(seqcont):
    """Compute a 12-dimensional statistical descriptor per sequence.

    For every numeric sequence in *seqcont*: [mean, kurtosis, variance,
    skew] of the sequence itself, then [mean, variance, kurtosis, skew]
    of the lag-1 differences and of the lag-2 differences
    (sequence-order-correlation terms).

    :param seqcont: iterable of numeric sequences.
    :return: list of 12-element feature lists, one per input sequence.
    """
    # Cleanup: removed the unused ``pd.Series`` conversion, the dead
    # vtar7/vtar8/vtar9 accumulators and the commented-out PAFIG code.
    v = []
    for vtar in seqcont:
        # Relevant statistical terms of the raw sequence.
        descriptor = [np.mean(vtar), st.kurtosis(vtar),
                      np.var(vtar), st.skew(vtar)]
        # Sequence-order-correlation terms: lag-1 and lag-2 differences.
        vcf1 = [vtar[j] - vtar[j + 1] for j in range(len(vtar) - 1)]
        vcf2 = [vtar[k] - vtar[k + 2] for k in range(len(vtar) - 2)]
        for vcf in (vcf1, vcf2):
            descriptor.extend([np.mean(vcf), np.var(vcf),
                               st.kurtosis(vcf), st.skew(vcf)])
        v.append(descriptor)
    return v
def __extract_features(self, mfcc_data: dict) -> dict:
    """
    Extracts the features from the MFCC data

    :param mfcc_data: MFCC data for an audio chunk
    :return: the extracted features from the input MFCC data
    """
    features = {}
    mfcc_means = []
    for idx in range(14):
        label = "energy" if idx == 0 else "mfcc_" + str(idx)
        # Summary stats of the coefficient itself plus its two deltas.
        for suffix, source in (("", "mfcc_features"),
                               ("_velocity", "velocity"),
                               ("_acceleration", "acceleration")):
            features.update(
                self.__get_summary_stats(label + suffix,
                                         mfcc_data[source], idx))
        if idx > 0:
            mfcc_means.append(features[label + "_mean"])
    features["mfcc_skewness"] = st.skew(np.array(mfcc_means))
    features["mfcc_kurtosis"] = st.kurtosis(mfcc_means)
    return features
def get_data(column, np_values, alpha):
    """Assemble presented summary statistics for a single column."""
    mvs = bayes_mvs(np_values, alpha)
    lo, hi = np_values.min(), np_values.max()
    # (name, value) pairs in report order.
    metrics = [
        ("Column", column),
        ("Length", len(np_values)),
        ("Unique", len(np.unique(np_values))),
        ("Min", lo),
        ("Max", hi),
        ("Mid-Range", (hi - lo) / 2),
        ("Range", hi - lo),
        ("Mean", np_values.mean()),
        ("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        ("Variance", mvs[1][0]),
        ("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        ("StdDev", mvs[2][0]),
        ("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        ("Mode", stats.mode(np_values)[0][0]),
        ("Q1", stats.scoreatpercentile(np_values, 25)),
        ("Q2", stats.scoreatpercentile(np_values, 50)),
        ("Q3", stats.scoreatpercentile(np_values, 75)),
        ("Trimean", trimean(np_values)),
        ("Minhinge", midhinge(np_values)),
        ("Skewness", stats.skew(np_values)),
        ("Kurtosis", stats.kurtosis(np_values)),
        ("StdErr", sem(np_values)),
        ("Normal-P-value", normaltest(np_values)[1]),
    ]
    return [present(name, value) for name, value in metrics]
def get_data(column, np_values, alpha):
    """Build the list of presented summary metrics for one column."""
    mvs = bayes_mvs(np_values, alpha)
    output = []
    add = output.append
    add(present("Column", column))
    add(present("Length", len(np_values)))
    add(present("Unique", len(np.unique(np_values))))
    add(present("Min", np_values.min()))
    add(present("Max", np_values.max()))
    add(present("Mid-Range", (np_values.max() - np_values.min()) / 2))
    add(present("Range", np_values.max() - np_values.min()))
    add(present("Mean", np_values.mean()))
    # Bayesian confidence intervals from bayes_mvs.
    add(present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])))
    add(present("Variance", mvs[1][0]))
    add(present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])))
    add(present("StdDev", mvs[2][0]))
    add(present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])))
    add(present("Mode", stats.mode(np_values)[0][0]))
    add(present("Q1", stats.scoreatpercentile(np_values, 25)))
    add(present("Q2", stats.scoreatpercentile(np_values, 50)))
    add(present("Q3", stats.scoreatpercentile(np_values, 75)))
    add(present("Trimean", trimean(np_values)))
    add(present("Minhinge", midhinge(np_values)))
    add(present("Skewness", stats.skew(np_values)))
    add(present("Kurtosis", stats.kurtosis(np_values)))
    add(present("StdErr", sem(np_values)))
    add(present("Normal-P-value", normaltest(np_values)[1]))
    return output
def _extract_one(self, sourcepc, neighborhood):
    # Kurtosis of the z attribute over one neighborhood of points.
    # NOTE(review): ``point`` is not defined in this method or its
    # parameters — presumably a module/class-level name, otherwise this
    # raises NameError; confirm against the enclosing module.
    if neighborhood:
        z = sourcepc[point][self.data_key]['data'][neighborhood]
        kurtosis_z = stat.kurtosis(z)
    else:
        # No neighbors: the feature is undefined.
        kurtosis_z = np.NaN
    return kurtosis_z
def _get_reward(self, real_values: dict, i: int):
    """ Get the reward returned after previous action """
    # The simulator wrote its result to output.csv; read the last price
    # and turn it into a return relative to the initial price.
    df = pd.read_csv('output.csv', skiprows=[0], sep=';')
    last_return = df['price'].values[-1] / self.init_price - 1
    reward = {'return': last_return}
    # Not enough history yet for distribution statistics.
    if i < 100:  # + 1
        return reward
    # NOTE(review): ``returns`` is used below but its assignment is
    # commented out — as written this raises NameError; confirm which
    # definition of ``returns`` is intended.
    # returns = self.sim_df.tail(99)['return'].dropna().values + [ last_return ]
    mu, sigma = norm.fit(returns)
    skew, kurtosis = st.skew(returns), st.kurtosis(returns)
    # autocorr = f_autocorr(np.abs(returns))[0, 1]
    reward.update({
        'mu': mu,
        'sigma': sigma,
        'skew': skew,
        'kurtosis': kurtosis,
        # 'autocorr': autocorr,
    })
    # error = {
    #     k: np.abs((reward[k] - real_values[k])**2 / real_values[k])
    #     for k, v in reward.items() if k != 'return'
    # }
    # Squared z-score of each statistic against the trailing 100-row window.
    sub_df = self.df.iloc[i - 100:i]
    error = {
        k: ((reward[k] - sub_df[k].mean()) / sub_df[k].std())**2
        for k, v in reward.items() if k != 'return'
    }
    # Reward is the negative total error (closer to history = better).
    reward['error'] = -sum(error.values())
    os.remove('output.csv')
    return reward
def get_stats_numpy(data, zero):
    """Summary statistics of a 1-D signal plus silence-run features.

    :param data: 1-D sequence of samples.
    :param zero: the sample value that counts as "silence".
    :return: np.hstack of [mean, median, std, var, skew, kurt,
        percentiles 25/50/75/90, #silent samples, mean silence-run length,
        longest silence run, shortest silence run].
    """
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)
    var = np.var(data)
    skew = stats.skew(data)
    kurt = stats.kurtosis(data)
    percentiles = np.array(np.percentile(data, [25, 50, 75, 90]))
    silences = np.count_nonzero(np.asarray(data) == zero)
    # Lengths of consecutive runs of the silence value.
    # Bug fix: the run scans previously compared against the literal 0
    # instead of ``zero``, disagreeing with ``silences`` whenever
    # zero != 0 (and max()/min() over an empty generator raised ValueError).
    run_lengths = [sum(1 for _ in g) for k, g in groupby(data) if k == zero]
    silence_mean = np.mean(run_lengths)
    longest_silence = max(run_lengths) if silences > 0 else 0
    shortest_silence = min(run_lengths) if silences > 0 else 0
    features = np.hstack(
        (mean, median, std, var, skew, kurt, percentiles, silences,
         silence_mean, longest_silence, shortest_silence))
    return features
def computeFeatureVector(data):
    """Compute a small statistical feature vector from a sample sequence.

    :param data: iterable of numeric samples.
    :return: ft.cFeatures(standard deviation, coefficient of variation,
        kurtosis).
    """
    print("computeFeatureVector")  # parenthesized: works on Py2 and Py3
    # Convert once instead of three separate np.array() calls.
    arr = np.array(data)
    stDeviation = np.std(arr)          # Standard Deviation
    coVariant = st.variation(arr)      # Coefficient of variation
    kurtosis = st.kurtosis(arr)        # Kurtosis
    features = ft.cFeatures(stDeviation, coVariant, kurtosis)
    return features
def get_mean_var_skew_kurt(np_array):
    """Return the first four moment-style statistics of *np_array*."""
    summary = {}
    summary["mean"] = np_array.mean()
    summary["var"] = np_array.var()
    summary["skewness"] = st.skew(np_array)
    summary["kurtosis"] = st.kurtosis(np_array)
    return summary
def get_stats_json(data):
    """Summary statistics of a 1-D signal as a JSON-friendly dict.

    :param data: 1-D sequence of samples.
    :return: dict with mean, median, std, var, skew, kurt and the
        25/50/75 percentiles.
    """
    # Cleanup: the original also computed silence counts and the
    # longest/shortest silence runs but never returned them; that dead
    # work (and its groupby scans) is removed.
    percentiles = np.array(np.percentile(data, [25, 50, 75]))
    statistics = {
        'mean': np.mean(data),
        'median': np.median(data),
        'std': np.std(data),
        'var': np.var(data),
        'skew': stats.skew(data),
        'kurt': stats.kurtosis(data),
        'pc25': percentiles[0],
        'pc50': percentiles[1],
        'pc75': percentiles[2],
    }
    return statistics
def get_launch_feature(row):
    """Per-user features describing gaps between consecutive launch days."""
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    # feature['launch_count'] = len(row)
    gaps = np.diff(row['day'])
    if len(gaps) == 0:
        # Single launch day: every gap statistic degenerates to zero.
        for suffix in ('mean', 'std', 'max', 'min', 'kur', 'ske', 'last'):
            feature['launch_day_diff_' + suffix] = 0
    else:
        feature['launch_day_diff_mean'] = np.mean(gaps)
        feature['launch_day_diff_std'] = np.std(gaps)
        feature['launch_day_diff_max'] = np.max(gaps)
        feature['launch_day_diff_min'] = np.min(gaps)
        feature['launch_day_diff_kur'] = stats.kurtosis(gaps)
        feature['launch_day_diff_ske'] = stats.skew(gaps)
        feature['launch_day_diff_last'] = gaps[-1]
    # feature['launch_day_cut_max_day'] = day_cut_max_day(row['day'])
    # Days between the horizon day and the user's latest launch.
    feature['launch_sub_register'] = np.subtract(np.max(row['max_day']),
                                                 np.max(row['day']))
    # Launches-per-day statistics over the days that saw any launch.
    counts = np.bincount(row['day'])
    day_counts = counts[np.nonzero(counts)[0]]
    feature['launch_day_count_mean'] = np.mean(day_counts)
    feature['launch_day_count_max'] = np.max(day_counts)
    feature['launch_day_count_std'] = np.std(day_counts)
    return feature
def base_stats(data_1):
    """Per-row [skew, kurtosis, max, std]; skew/kurtosis bias-corrected."""
    out = np.zeros((data_1.shape[0], 4))
    for row_idx, row in enumerate(data_1):
        out[row_idx] = (st.skew(row, bias=False),
                        st.kurtosis(row, bias=False),
                        np.max(row),
                        np.std(row))
    return out
def __get_summary_stats(key: str, data: np.array, coefficient: int) -> dict:
    """Mean/variance/skewness/kurtosis of one coefficient column of *data*."""
    column = data[:, coefficient]
    return {
        key + "_mean": column.mean(),
        key + "_variance": column.var(),
        key + "_skewness": st.skew(column),
        key + "_kurtosis": st.kurtosis(column),
    }
def extract(self, sourcepc, neighborhood, targetpc, targetindex,
            volume_description):
    # Kurtosis of the z attribute over one neighborhood of points.
    # targetpc/targetindex/volume_description are accepted for interface
    # compatibility but unused here.
    # NOTE(review): ``point`` is not defined in this method or its
    # parameters — presumably a module/class-level name, otherwise this
    # raises NameError; confirm against the enclosing module.
    if neighborhood:
        z = sourcepc[point][self.data_key]['data'][neighborhood]
        kurtosis_z = stat.kurtosis(z)
    else:
        # No neighbors: the feature is undefined.
        kurtosis_z = np.NaN
    return kurtosis_z
def kurtosis_normal_distribution(df, features, crypto_name, output_path):
    """Write the kurtosis of each feature column of *df* to a CSV file.

    :param df: pandas DataFrame with the feature columns.
    :param features: iterable of column names to evaluate.
    :param crypto_name: basename of the output CSV (``<name>.csv``).
    :param output_path: directory/prefix the CSV path is built from.
    """
    res = {'feature': [], 'kurtosis_of_n_distrib': []}
    for feature in features:
        df = df.dropna(subset=[feature])
        # Bug fix: scipy.stats.kurtosis returns a scalar, so the original
        # tuple-unpacking ``stat, p = ...`` raised a TypeError.
        # (If a test statistic/p-value was intended, that would be
        # stats.kurtosistest — confirm; the column name suggests kurtosis.)
        stat = stats.kurtosis(df[feature].values)
        res['feature'].append(feature)
        res['kurtosis_of_n_distrib'].append(stat)
    pd.DataFrame(data=res).to_csv(output_path + crypto_name + ".csv",
                                  sep=",", index=False)
def get_mfcc_features(filename):
    """Extract MFCC summary features from a wav file.

    For coefficients 0..13 (0 = frame energy) this stores the mean,
    variance, skewness and kurtosis of the coefficient, its velocity and
    its acceleration, plus skewness/kurtosis across the per-coefficient
    means.

    :param filename: path to the wav file.
    :return: dict mapping feature name to value.
    """
    feature_dict = {}
    (rate, sig) = wav.read(filename)
    if sig.ndim == 2:  # wav is stereo so average over both channels
        mfcc_feat_chan0 = mfcc(sig[:, 0], rate, numcep=15, appendEnergy=True)
        mfcc_feat_chan1 = mfcc(sig[:, 1], rate, numcep=15, appendEnergy=True)
        mfcc_feat = (mfcc_feat_chan0 + mfcc_feat_chan1) / 2
    else:
        mfcc_feat = mfcc(sig, rate, numcep=15, appendEnergy=True)
    # Velocity is the difference between timestep t+1 and t for each mfcc_feat / 2
    vel = (mfcc_feat[:-1, :] - mfcc_feat[1:, :]) / 2.0
    # Acceleration is the difference between timestep t+1 and t for each velocity / 2
    acc = (vel[:-1, :] - vel[1:, :]) / 2.0
    mfcc_means = []
    for i in range(0, 14):  # range works on Py2 and Py3 (was xrange)
        key = "energy" if i == 0 else "mfcc" + str(i)
        # mfcc
        feature_dict[key + "_mean"] = mfcc_feat[:, i].mean()
        feature_dict[key + "_var"] = mfcc_feat[:, i].var()
        feature_dict[key + "_skewness"] = st.skew(mfcc_feat[:, i])
        feature_dict[key + "_kurtosis"] = st.kurtosis(mfcc_feat[:, i])
        # Vel
        feature_dict[key + "_vel_mean"] = vel[:, i].mean()
        feature_dict[key + "_vel_var"] = vel[:, i].var()
        feature_dict[key + "_vel_skewness"] = st.skew(vel[:, i])
        feature_dict[key + "_vel_kurtosis"] = st.kurtosis(vel[:, i])
        # Accel
        feature_dict[key + "_accel_mean"] = acc[:, i].mean()
        feature_dict[key + "_accel_var"] = acc[:, i].var()
        feature_dict[key + "_accel_skewness"] = st.skew(acc[:, i])
        feature_dict[key + "_accel_kurtosis"] = st.kurtosis(acc[:, i])
        # Need the skewness and kurtosis of all mfcc means
        if i > 0:
            mfcc_means.append(feature_dict[key + "_mean"])
    feature_dict["mfcc_skewness"] = st.skew(mfcc_means)
    # Bug fix: key was misspelled "mfcc_kurtostis" (the sibling
    # implementation of this function spells it "mfcc_kurtosis").
    feature_dict["mfcc_kurtosis"] = st.kurtosis(mfcc_means)
    return feature_dict
def get_feature(region_props, n_region, feature_name):
    """Summary statistics (max/mean/variance/skewness/kurtosis) of one
    region property, formatted to two decimals; all zeros when no regions."""
    feature = [0] * 5
    if n_region > 0:
        values = np.array([region[feature_name] for region in region_props])
        feature[MAX] = format_2f(values.max())
        feature[MEAN] = format_2f(values.mean())
        feature[VARIANCE] = format_2f(values.var())
        feature[SKEWNESS] = format_2f(st.skew(values))
        feature[KURTOSIS] = format_2f(st.kurtosis(values))
    return feature
def get_feature(region_props, n_region, feature_name):
    """Summary statistics of one region property, formatted to 2 decimals.

    Returns a 5-slot list [max, mean, variance, skewness, kurtosis];
    all zeros when there are no regions."""
    feature = [0] * 5
    if n_region <= 0:
        return feature
    values = np.array([region[feature_name] for region in region_props])
    feature[MAX] = utils.format_2f(values.max())
    feature[MEAN] = utils.format_2f(values.mean())
    feature[VARIANCE] = utils.format_2f(values.var())
    feature[SKEWNESS] = utils.format_2f(st.skew(values))
    feature[KURTOSIS] = utils.format_2f(st.kurtosis(values))
    return feature
def aggregate_ftr_matrix(self, ftr_matrix):
    """Collapse each feature row into five summary statistics.

    :param ftr_matrix: iterable of per-feature value sequences.
    :return: flat list [median, mean, std, skew, kurtosis] per row,
        concatenated in input order.
    """
    # NOTE(review): stats.nanmedian/nanmean/nanstd were removed from
    # modern SciPy (np.nanmedian etc. are the replacements) — this code
    # requires an old SciPy release; confirm the pinned version.
    sig = []
    for ftr in ftr_matrix:
        median = stats.nanmedian(ftr)
        mean = stats.nanmean(ftr)
        std = stats.nanstd(ftr)
        # Invalid double scalars warning appears here
        # Skew of an all-zero row is forced to 0.0 (skew would be NaN).
        skew = stats.skew(ftr) if any(ftr) else 0.0
        kurtosis = stats.kurtosis(ftr)
        sig.extend([median, mean, std, skew, kurtosis])
    return sig
def extract_features_for_pqrst(row, pqrsts):
    """Build a fixed-length feature vector from up to NB_RR PQRST complexes.

    :param row: raw ECG samples for one record.
    :param pqrsts: list of (p, q, r, s, t) sample indices per complex.
    :return: list of 17 features per complex, zero-padded to NB_RR complexes.
    """
    features = []
    p = [x[0] for x in pqrsts]
    q = [x[1] for x in pqrsts]
    r = [x[2] for x in pqrsts]
    s = [x[3] for x in pqrsts]
    t = [x[4] for x in pqrsts]
    pqrsts = pqrsts[:min(NB_RR, len(pqrsts))]
    # Band-pass the signal before measuring wave amplitudes.
    row = low_pass_filtering(row)
    row = high_pass_filtering(row)
    for i in range(len(pqrsts)):
        pq = row[p[i]:q[i]]
        st = row[s[i]:t[i]]
        pt = row[p[i]:t[i]]
        pmax = np.amax(pq)
        # Bug fix: the minima were computed with np.amax (copy-paste
        # error), which made the pmin/pmax and tmin/tmax ratios constant 1.
        pmin = np.amin(pq)
        tmax = np.amax(st)
        tmin = np.amin(st)
        p_mean = np.mean(pq)
        t_mean = np.mean(st)
        features += [
            # features for PQ interval
            pmax,
            pmax / row[r[i]],
            pmin / pmax,
            p_mean,
            p_mean / pmax,
            np.std(pq),
            common.mode(pq),
            # feature for ST interval
            tmax,
            tmax / row[r[i]],
            tmin / tmax,
            t_mean,
            t_mean / tmax,
            np.std(st),
            common.mode(st),
            p_mean / t_mean,
            # features for whole PQRST interval
            stats.skew(pt),
            stats.kurtosis(pt)
        ]
    # Zero-pad so the vector length is always NB_RR * 17.
    for i in range(NB_RR - len(pqrsts)):
        features += [0 for x in range(17)]
    return features
def get_feature(region_props, n_region, feature_name):
    """Summary statistics of one region property as a feature_tuple;
    an all-zero tuple when there are no regions."""
    if n_region <= 0:
        return feature_tuple(*([0] * 5))
    values = np.array([region[feature_name] for region in region_props])
    return feature_tuple(
        MAX=format_2f(np.max(values)),
        MEAN=format_2f(np.mean(values)),
        VARIANCE=format_2f(np.var(values)),
        SKEWNESS=format_2f(st.skew(values)),
        KURTOSIS=format_2f(st.kurtosis(values)))
def getFourMoments(sequence, ax=1):
    """Mean, variance, skewness, kurtosis, standard error and quantiles.

    :param sequence: array-like input.
    :param ax: axis the statistics are taken over; None flattens, in
        which case a flat 1-D array of stats + quantiles is returned.
    :return: per-row stats concatenated with mquantiles (axis mode), or a
        flat array (ax=None).
    """
    moments = [
        np.mean(sequence, axis=ax),
        np.var(sequence, axis=ax),
        skew(sequence, axis=ax),
        kurtosis(sequence, axis=ax),
        sem(sequence, axis=ax),
    ]
    # Idiom fix: compare against None with ``is not``, not ``!=``.
    if ax is not None:
        stacked = np.array(moments).T
        return np.concatenate(
            (stacked, np.array(mquantiles(sequence, axis=ax))), axis=ax)
    moments.extend(mquantiles(sequence, axis=ax))
    return np.array(moments)
def _calculateStatistics(self, img, haralick=False, zernike=False):
    """Quantile histogram and first four moments of an image, with
    optional Haralick texture features and Zernike moments."""
    # 3-bin histogram (quartiles)
    result = list(mquantiles(img))
    # First four moments
    result += [img.mean(), img.var(),
               skew(img, axis=None), kurtosis(img, axis=None)]
    # Haralick features
    if haralick:
        integerImage = dtype.img_as_ubyte(img)
        result.extend(texture.haralick(integerImage).flatten())
    # Zernike moments
    if zernike:
        result.extend(zernike_moments(img, int(self.rows) / 2 + 1))
    return result
def get_feature(region_props, n_region, feature_name):
    """
    Returns:
        feature: list of [max, mean, variance, skewness, kurtosis]
    """
    feature = [0] * 5
    if n_region > 0:
        values = np.array([region[feature_name] for region in region_props])
        # Fill each named slot with its two-decimal formatted statistic.
        stats_by_slot = ((MAX, np.max(values)),
                         (MEAN, np.mean(values)),
                         (VARIANCE, np.var(values)),
                         (SKEWNESS, st.skew(values)),
                         (KURTOSIS, st.kurtosis(values)))
        for slot, value in stats_by_slot:
            feature[slot] = utils.format_2f(value)
    return feature
def getFourMoments(sequence, ax=1):
    """Mean, variance, skewness, kurtosis, standard error and quantiles.

    :param sequence: array-like input.
    :param ax: axis for the statistics; None flattens the input and
        returns a flat 1-D array of stats followed by the quantiles.
    :return: per-row stats concatenated with mquantiles (axis mode), or a
        flat array (ax=None).
    """
    finalArray = [
        np.mean(sequence, axis=ax),
        np.var(sequence, axis=ax),
        skew(sequence, axis=ax),
        kurtosis(sequence, axis=ax),
        sem(sequence, axis=ax)
    ]
    # Idiom fix: compare against None with ``is not``, not ``!=``.
    if ax is not None:
        finalArray = np.array(finalArray).T
        return np.concatenate(
            (finalArray, np.array(mquantiles(sequence, axis=ax))), axis=ax)
    finalArray.extend(mquantiles(sequence, axis=ax))
    return np.array(finalArray)
def generate_moment(dataset, NO_OF_PROPERTIES, NO_MOMENTS):
    """First four statistical moments of each property of each element.

    :param dataset: sequence of 2-D arrays (properties x samples).
    :param NO_OF_PROPERTIES: number of leading property rows to use.
    :param NO_MOMENTS: number of moments (expected 4: mean, std, skew,
        kurtosis).
    :return: array of shape (len(dataset), NO_OF_PROPERTIES, NO_MOMENTS).
    """
    element_count = len(dataset)
    moments = np.zeros((element_count, NO_OF_PROPERTIES, NO_MOMENTS))
    for row in range(element_count):
        props = dataset[row][0:NO_OF_PROPERTIES, :]
        # Bug fix: scipy.mean/scipy.std were deprecated and removed from
        # SciPy; the NumPy equivalents give identical results.
        moments[row, :, :] = np.array([
            np.mean(props, axis=1),
            np.std(props, axis=1),
            stats.skew(props, axis=1),
            stats.kurtosis(props, axis=1)
        ]).transpose()
    return moments
def compute_features(dataframe, columns, bins, model, model_type="KMeans"):
    """
    Compute the features of the specified columns from a Pandas dataframe
    using the given model.

    :param dataframe: Pandas dataframe.
    :param columns: List of the columns name.
    :param bins: Number of bins (one entry per column).
    :param model: Per-column model mapping, or None to use raw values.
    :param model_type: Type of the model ("KMeans" or "PolynomialFeatures").
    :return: Flat list of features.
    """
    import numpy as np
    # Bug fix: ``scipy.stats.stats`` is a removed private SciPy submodule;
    # the public ``scipy.stats`` exposes the same kurtosis/skew functions.
    import scipy.stats as st
    row = []
    for j, column in enumerate(columns):
        column_df = dataframe[column]
        X = column_df.values
        if model is not None:
            if model_type == "KMeans":
                r = model[column].predict(X.reshape(-1, 1))
            if model_type == "PolynomialFeatures":
                r = model[column].transform(X.reshape(-1, 1)).tolist()
        else:
            r = X
        # Normalized feature histogram: bar areas sum to 1.
        counts, bin_edges = np.histogram(r, bins=bins[j], density=True)
        row.extend(counts * np.diff(bin_edges))
        # Extra distribution features of the raw column.
        kurtosis = st.kurtosis(X.reshape(-1, 1))[0]
        skew = st.skew(X.reshape(-1, 1))[0]
        row.extend([kurtosis, skew, column_df.min(), column_df.max(),
                    column_df.mean(), column_df.median()])
    return row
def extract_features(data, y, window_len, task2=False):  #num_windows):
    """Slide a window over *data* and compute per-column statistics.

    :param data: pandas DataFrame of signals (the output is reshaped to
        120 columns, i.e. assumes 12 columns x 10 stats — confirm).
    :param y: label series aligned with *data*.
    :param window_len: window length in samples.
    :param task2: if True, stride 1 and take the label at the window
        start; otherwise stride window_len/2 and take the modal label.
    :return: (features, targets) numpy arrays.

    NOTE(review): ``len(data) / (window_len / 2)`` and ``i += window_len / 2``
    rely on integer division — Python 2 semantics; on Python 3 these
    produce floats and range()/indexing would fail. Confirm interpreter.
    """
    i = 0
    #window_len = len(data)/(num_windows/2)
    if task2:
        num_windows = len(data) - window_len + 1
    else:
        num_windows = len(data) / (window_len / 2)
    #print 'num_windows = 208, window_len = ' , str(len(data)/(208/2))
    #print 'now num_windows = '+ str(num_windows)+', window_len = '+str(window_len)
    features = []
    targets = []
    for n in range(num_windows):
        win = data[i:i + window_len]
        # Window label: label at the start (task2) or the most common
        # label, falling back to the scalar value when mode() fails.
        if task2:
            target = y.iloc[i]
        else:
            try:
                target = int(y[i:i + window_len].mode())
            except:
                target = int(y[i:i + window_len])
        targets.append(target)
        for c in data.columns:
            s = np.array(win[c])
            rms_val = rms(s)
            (min_max, peak, peaknum) = min_max_mean(s)
            mean = s.mean()
            std = s.std()
            skew = st.skew(s)
            kurtosis = st.kurtosis(s)
            coefficients = std / mean  # coefficient of variation
            logpower = np.log10((s**2)).sum()
            new_features = [
                rms_val, min_max, mean, std, skew, kurtosis, peak, peaknum,
                coefficients, logpower
            ]
            #new_features = [rms_val, min_max, mean, std]
            features.append(new_features)
        # Advance by 1 (task2) or half a window (50% overlap).
        if (task2):
            i += 1
        else:
            i += window_len / 2
    features = np.array(features)
    features.shape = num_windows, 120  #48#72
    targets = np.array(targets)
    return features, targets
def get_create_feature(row):
    """Per-user features describing gaps between creation days.

    NOTE(review): the original formatting was lost; the indentation below
    assumes the final three assignments run unconditionally (mirroring
    get_launch_feature). When a user has a single creation day the
    ``create_day_diff_*`` keys are absent — confirm downstream consumers
    tolerate missing keys.
    """
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    # feature['create_count'] = len(row)
    diff_day = np.diff(row['day'])
    if len(diff_day) != 0:
        # feature['create_day_diff_mean'] = np.mean(diff_day)
        # feature['create_day_diff_std'] = np.std(diff_day)
        # feature['create_day_diff_min'] = np.min(diff_day)
        # feature['create_day_diff_mode'] = stats.mode(interval_data)[0][0]
        feature['create_day_diff_ske'] = stats.skew(diff_day)
        feature['create_day_diff_kur'] = stats.kurtosis(diff_day)
        # feature['create_day_diff_max'] = np.max(diff_day)
        feature['create_day_last'] = diff_day[-1]
    # Days between the horizon day and the user's latest creation day.
    feature['create_sub_register'] = np.subtract(np.max(row['max_day']),
                                                 np.max(row['day']))
    # Most frequent creation day.
    feature['create_mode'] = stats.mode(row['day'])[0][0]
    return feature
def __init__(self):
    """
    Init the environment, this is a `_reset` function however we don't
    need the `reset` function, so I put the code here
    """
    input_path = 'data/vnindex.csv'
    df = pd.read_csv(input_path)
    df['return'] = df['close'].pct_change()
    # Rolling distribution statistics of the last 100 daily returns.
    for index, row in df.iterrows():
        if index < 100:
            continue
        data = df.iloc[index - 100:index]['return']
        mu, sigma = norm.fit(data)
        skew, kurtosis = st.skew(data), st.kurtosis(data)
        # Lag-1 autocorrelation of the absolute returns.
        autocorr = f_autocorr(data.abs())[0, 1]
        df.loc[index, 'mu'] = mu
        df.loc[index, 'sigma'] = sigma
        df.loc[index, 'skew'] = skew
        df.loc[index, 'kurtosis'] = kurtosis
        df.loc[index, 'autocorr'] = autocorr
    # df.to_csv(input_path, index=False)
    self.df = df
    # self.df = pd.read_csv(input_path)
    # Rolling simulation results accumulated during an episode.
    self.sim_df = pd.DataFrame()
    # init parameters for fms
    self.total_number = 10000
    self.init_price = 100000
    # The Space object corresponding to valid observations
    # NOTE(review): these bin edges look empirically derived from the
    # historical data — confirm before swapping the input file.
    self.obs_mu = [-0.0102, -0.0011, 0.0001, 0.0016, 0.0140]
    self.obs_sigma = [0.0028, 0.0084, 0.0120, 0.0159, 0.0492]
    self.obs_skew = [-2.0660, -0.2824, 0.0388, 0.3409, 2.6633]
    self.obs_kurtosis = [-1.47, -0.24, 0.34, 1.40, 16.19]
    self.observation_space = None
    # Observation count = product of the per-statistic bin counts.
    self.observation_space_n = \
        len(self.obs_mu) * \
        len(self.obs_sigma) * \
        len(self.obs_skew) * \
        len(self.obs_kurtosis)
    # A tuple corresponding to the min and max possible rewards
    self.reward_range = (-np.inf, 0)
    # self.zero_pct = 0.3
    self.herding_pct = 0.3
def _calculateStatistics(self, img, haralick=False, zernike=False):
    """3-bin quantile histogram plus first four moments of an image, with
    optional Haralick texture features and Zernike moments."""
    result = []
    # 3-bin histogram (quartile-based)
    result.extend(mquantiles(img))
    # First four moments
    for value in (img.mean(), img.var(),
                  skew(img, axis=None), kurtosis(img, axis=None)):
        result.append(value)
    # Haralick texture features
    if haralick:
        result.extend(texture.haralick(dtype.img_as_ubyte(img)).flatten())
    # Zernike moments
    if zernike:
        result.extend(zernike_moments(img, int(self.rows) / 2 + 1))
    return result
def compute_features(sub_df, columns, bins, model, model_type="KMeans"):
    """Compute histogram + distribution features per column of *sub_df*.

    :param sub_df: Pandas dataframe.
    :param columns: list of column names.
    :param bins: per-column bin counts for the histograms.
    :param model: per-column model mapping, or None to use raw values.
    :param model_type: "KMeans" or "PolynomialFeatures".
    :return: flat list of features.
    """
    # Bug fix: ``scipy.stats.stats`` is a removed private SciPy submodule;
    # the public ``scipy.stats`` exposes the same kurtosis/skew functions.
    import scipy.stats as st
    row = []
    for j, column in enumerate(columns):
        column_df = sub_df[column]
        X = column_df.values
        if model is not None:
            if model_type == "KMeans":
                result = model[column].predict(X.reshape(-1, 1))
            if model_type == "PolynomialFeatures":
                result = model[column].transform(X.reshape(-1, 1)).tolist()
        else:
            result = X
        # compute normalized feature histogram (bar areas sum to 1)
        counts, bin_edges = np.histogram(result, bins=bins[j], density=True)
        row.extend(counts * np.diff(bin_edges))
        # add extra features
        kurtosis = st.kurtosis(X.reshape(-1, 1))[0]
        skew = st.skew(X.reshape(-1, 1))[0]
        row.extend([kurtosis, skew, column_df.min(), column_df.max(),
                    column_df.mean(), column_df.median()])
    return row
def wavelet_transform(data_1, type_w, level_w):
    """Wavelet-decompose each row and factor-analyse the coefficient stats.

    :param data_1: 2-D array; each row is one signal.
    :param type_w: wavelet name passed to ``pywt.wavedec``.
    :param level_w: decomposition level (yields level_w + 1 coefficient sets).
    :return: four (n_rows, 1) arrays — first FactorAnalysis component of
        the per-level mean, std, skew and kurtosis, respectively.
    """
    coeff_mean = np.zeros((data_1.shape[0], level_w + 1))
    coeff_std = np.zeros((data_1.shape[0], level_w + 1))
    coeff_skew = np.zeros((data_1.shape[0], level_w + 1))
    coeff_kurt = np.zeros((data_1.shape[0], level_w + 1))
    for i in range(data_1.shape[0]):
        """ Wavelet decomposition """
        w = pywt.wavedec(data_1[i], wavelet=type_w, level=level_w)
        # One summary statistic per decomposition level.
        for j in range(0, len(w)):
            coeff_mean[i, j] = np.mean(w[j])
            coeff_std[i, j] = np.std(w[j])
            coeff_skew[i, j] = st.skew(w[j], bias=False)
            coeff_kurt[i, j] = st.kurtosis(w[j], bias=False)
    """ Factor analysis on the wavelet coefficients Taking the first component"""
    fa_mean_coeff = FactorAnalysis(
        n_components=1).fit(coeff_mean).transform(coeff_mean)
    fa_std_coeff = FactorAnalysis(
        n_components=1).fit(coeff_std).transform(coeff_std)
    fa_skew_coeff = FactorAnalysis(
        n_components=1).fit(coeff_skew).transform(coeff_skew)
    fa_kurt_coeff = FactorAnalysis(
        n_components=1).fit(coeff_kurt).transform(coeff_kurt)
    return fa_mean_coeff, fa_std_coeff, fa_skew_coeff, fa_kurt_coeff
def noiseMeter(data=None):
    """Per-column signal statistics used to assess noise.

    Returns a DataFrame with one row per column of *data* and the columns
    Max (absolute amplitude), Std (unbiased), Max/Std, Kurt and Skew
    (both bias-corrected; Skew is reported as an absolute value).
    """
    amp_max = np.abs(data).max(axis=0)
    sigma = data.std(ddof=1, axis=0)
    frame = {
        'Max': amp_max,
        'Std': sigma,
        'Max/Std': amp_max / sigma,
        'Kurt': stats.kurtosis(data, bias=False, axis=0),
        'Skew': np.abs(stats.skew(data, bias=False, axis=0)),
    }
    return pandas.DataFrame(frame,
                            columns=['Max', 'Std', 'Max/Std', 'Kurt', 'Skew'],
                            dtype='float64')
def get_mean_var_skew_kurt(np_array):
    """First four moment-style statistics of *np_array* as a dict."""
    return dict(
        mean=np_array.mean(),
        var=np_array.var(),
        skewness=st.skew(np_array),
        kurtosis=st.kurtosis(np_array),
    )
def contrast(img):
    """Global contrast: variance over the fourth root of the (Pearson,
    fisher=False) kurtosis of all pixels."""
    flat_kurt = kurtosis(img, axis=None, fisher=False)
    return img.var() / np.power(flat_kurt, 0.25)
def signal_stats(signal=None):
    """Compute various metrics describing the signal.

    Parameters
    ----------
    signal : array
        Input signal.

    Returns
    -------
    mean : float
        Mean of the signal.
    median : float
        Median of the signal.
    max : float
        Maximum signal amplitude.
    var : float
        Signal variance (unbiased).
    std_dev : float
        Standard signal deviation (unbiased).
    abs_dev : float
        Absolute signal deviation.
    kurtosis : float
        Signal kurtosis (unbiased).
    skew : float
        Signal skewness (unbiased).

    Raises
    ------
    TypeError
        If no input signal is given.
    """
    # check inputs
    if signal is None:
        raise TypeError("Please specify an input signal.")
    # ensure numpy
    signal = np.array(signal)
    mean = np.mean(signal)
    median = np.median(signal)
    args = (
        mean,
        median,
        np.abs(signal - mean).max(),         # maximum amplitude
        signal.var(ddof=1),                  # unbiased variance
        signal.std(ddof=1),                  # unbiased std deviation
        np.sum(np.abs(signal - median)),     # absolute deviation
        stats.kurtosis(signal, bias=False),  # unbiased kurtosis
        stats.skew(signal, bias=False),      # unbiased skewness
    )
    names = ('mean', 'median', 'max', 'var', 'std_dev', 'abs_dev',
             'kurtosis', 'skewness')
    return utils.ReturnTuple(args, names)
# Summarize segment lengths from a segments file; columns 2 and 3 hold the
# per-segment start/end times in seconds.
args = parser.parse_args()
segment_boundaries = np.loadtxt(args.segments_filename, usecols=(2, 3))
segment_lengths = segment_boundaries[:, 1] - segment_boundaries[:, 0]
count = len(segment_lengths)
mean = np.mean(segment_lengths)
median = np.median(segment_lengths)
print("num segments read: {:d}".format(count))
print("total time (h): {:.2f}".format(np.sum(segment_lengths) / 3600))
print("mean (s): {:.2f}".format(mean))
print("median (s): {:.2f}".format(median))
print("skew: {:.2f}".format(st.skew(segment_lengths, bias=True)))
print("skew [corrected]: {:.2f}".format(st.skew(segment_lengths, bias=False)))
print("skewtest: {}".format(st.skewtest(segment_lengths)))
print("kurtosis: {:.2f}".format(st.kurtosis(segment_lengths)))
# Figure out how many segments would fill the desired number of hours,
# then round up to the nearest 10k.
# NOTE(review): round(x, -4) rounds to the NEAREST 10k, not up — confirm
# that matches the stated intent.
possible_num_hours_segmentations = (100, 300, 500, 1000, 1500, 3000)
print("=== from mean ===")
for num_hours in possible_num_hours_segmentations:
    num_segments = int(num_hours * 3600 / mean)
    print("{:d} h: {:d} ({:d}) segments".format(num_hours,
                                                round(num_segments, -4),
                                                num_segments))
print("=== from median ===")
for num_hours in possible_num_hours_segmentations:
    num_segments = int(num_hours * 3600 / median)
    print("{:d} h: {:d} ({:d}) segments".format(num_hours,
                                                round(num_segments, -4),
                                                num_segments))
# Frequency histogram of segment lengths (1-second bins) via gnuplot.
gp.plot((segment_lengths, {"histogram": "freq", "binwidth": 1}))
def evaluate(self, t):
    """Return the short-term kurtosis of the asset at time *t*.

    (Docstring translated from Japanese.)
    """
    window = self.asset.getPreviousData(t, self.__length)
    return stats.kurtosis(window, fisher=self.__fisher)
if maxInterim > maxValue: maxValue = maxInterim minInterim = min(my_data[:, x]) if minInterim < minValue: minValue = minInterim binWidth = (maxValue - minValue) / (numBins) newBins = np.arange(minValue, maxValue, binWidth) # TODO process array only once for speedup? for x in range (0, numModels): frequency = plt.hist(my_data[:, x], bins=newBins, histtype='step', normed=True, label=labels[x]); b[x, 0] = mean(my_data[:, x]); b[x, 1] = var(my_data[:, x]); b[x, 2] = skew(my_data[:, x]); b[x, 3] = kurtosis(my_data[:, x]); b[x, 4] = entropy(frequency[0]) plt.title(csvString + " Frequency") plt.legend() deg = u'\N{DEGREE SIGN}' plt.xlabel("Airflow Rate (cfm)") plt.ylabel("Frequency") for i in range (0, 5): print(b[:, i]) plt.show()