def filter_data(file_names):  # pragma: no cover
    """Select the spectra whose fit parameters are not 3-MAD outliers.

    Every file is opened as a ``Spectrum`` and its ``D2CCD``, ``PIXSHIFT``,
    ``PSF_REG`` (as log10) and, when present, ``CHI2_FIT`` header values are
    collected.  A file is kept when each collected quantity lies strictly
    within three scaled median absolute deviations of its median.  Diagnostic
    figures are shown with ``plt.show()`` along the way.

    Parameters
    ----------
    file_names : sequence
        File names handed to ``Spectrum(name, fast_load=True)``.

    Returns
    -------
    numpy.ndarray
        The entries of ``file_names`` that pass every cut.
    """
    # ``median_absolute_deviation`` was removed in SciPy 1.9; the replacement
    # needs ``scale="normal"`` to keep the old normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    def mad(values):
        return median_abs_deviation(values, scale="normal")

    def within_3_mad(values):
        """Boolean mask of the entries strictly inside median +/- 3 MAD."""
        center = np.median(values)
        half_width = 3 * mad(values)
        return np.logical_and(values > center - half_width,
                              values < center + half_width)

    def show_selection(values, title):
        """Plot a quantity and overlay the selected files as black dots."""
        plt.plot(k, values)
        plt.title(title)
        plt.plot(k[filter_indices], values[filter_indices], "ko")
        plt.show()

    D, chi2, dx, amplitude, regs = [], [], [], [], []
    for name in file_names:
        spectrum = Spectrum(name, fast_load=True)
        D.append(spectrum.header["D2CCD"])
        dx.append(spectrum.header["PIXSHIFT"])
        regs.append(np.log10(spectrum.header["PSF_REG"]))
        amplitude.append(np.sum(spectrum.data[300:]))
        if "CHI2_FIT" in spectrum.header:
            chi2.append(spectrum.header["CHI2_FIT"])
    D = np.array(D)
    dx = np.array(dx)
    regs = np.array(regs)
    chi2 = np.array(chi2)
    k = np.arange(len(D))

    plt.plot(k, amplitude)
    plt.show()

    # D2CCD together with its median and the +/- 3 MAD acceptance band.
    plt.plot(k, D)
    plt.axhline(np.median(D))
    plt.axhline(np.median(D) + 3 * mad(D))
    plt.axhline(np.median(D) - 3 * mad(D))
    plt.grid()
    plt.title("D2CCD")
    plt.show()

    filter_indices = within_3_mad(D)
    # NOTE(review): chi2 only has one entry per file when every header
    # carries CHI2_FIT; otherwise the shapes below do not match.
    if len(chi2) > 0:
        filter_indices &= within_3_mad(chi2)
    filter_indices &= within_3_mad(dx)
    filter_indices &= within_3_mad(regs)

    show_selection(D, "D2CCD")
    show_selection(dx, "dx")
    show_selection(regs, "regs")
    if len(chi2) > 0:
        show_selection(chi2, "chi2")
    return np.array(file_names)[filter_indices]
def get_statistics(self, vec):
    """Return position and spread statistics of ``vec``, rounded to 2 decimals.

    Parameters
    ----------
    vec : array_like
        One-dimensional sample.

    Returns
    -------
    dict
        Keys ``N``, ``mean``, ``median``, ``q1``, ``q3`` (position) and
        ``range``, ``variance``, ``std``, ``iqr``, ``mad``, ``cv``,
        ``mad/median``, ``iqr/median`` (spread).  ``mad`` is the median
        absolute deviation scaled to be consistent with the standard
        deviation of normal data.
    """
    median = np.median(vec)
    iqr = stats.iqr(vec)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    mad = stats.median_abs_deviation(vec, scale="normal")

    summary = {
        # position
        "N": len(vec),
        "mean": np.mean(vec),
        "median": median,
        "q1": np.percentile(vec, 25),
        "q3": np.percentile(vec, 75),
        # spread
        "range": np.ptp(vec),
        "variance": np.var(vec),
        "std": np.std(vec),
        "iqr": iqr,
        "mad": mad,
        "cv": stats.variation(vec),
        "mad/median": mad / median,
        "iqr/median": iqr / median,
    }
    return {name: round(value, 2) for name, value in summary.items()}
def getROIs(obj, cobj, filename='brain.mat', atlas='language'):
    """Summarise each ROI of one atlas and save the ceiled values as a volume.

    For every ROI belonging to ``atlas`` the values of ``obj`` are divided by
    the median of the matching ROI in ``cobj`` ("ceiling"), summary
    statistics are computed, and the ceiled values are written into a
    NaN-initialised (79, 95, 69) array at the 1-based coordinates given by
    ``col_to_coord_1..3``.  The array is saved with ``sio.savemat`` under the
    key ``'brain_matrix'``.

    Parameters
    ----------
    obj, cobj : assembly-like objects indexed by ``neuroid``
        Must expose ``roi``, ``atlas`` and ``col_to_coord_*`` coordinates.
        NOTE(review): presumably xarray-style assemblies - confirm.
    filename : str
        Output ``.mat`` file.
    atlas : str
        Atlas label used to select the ROIs.

    Returns
    -------
    dict
        ``{roiID + '_save': [roi_mean, roi_med, roi_std, roi_sem, roi_mad,
        roi_mad_m, croi_med, croi_mad, croi_mad_m, cobj_roi_med, n, c_vals]}``.

    Raises
    ------
    AssertionError
        If the two objects do not share the same ROI set, or an ROI spans
        more than one atlas.
    """
    atlas_mask = [label == atlas for label in obj['atlas'].values]
    roi_atlas_u = np.unique(obj.roi[{'neuroid': atlas_mask}].values)

    # Empty brain volume; voxels that receive no value stay NaN.
    SPM_dim = (79, 95, 69)
    brain = np.full(SPM_dim, np.nan)

    assert set(obj.roi.values) == set(cobj.roi.values)

    def mad(values):
        # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        return stats.median_abs_deviation(values, scale="normal")

    d = {}
    for roiID in roi_atlas_u:
        roi = obj[{'neuroid': [name == roiID for name in obj['roi'].values]}]
        cobj_roi = cobj[{'neuroid': [name == roiID for name in cobj['roi'].values]}]

        unique_atlas = np.unique(roi.atlas.values)
        assert len(unique_atlas) == 1

        # Ceil by the median value of the ceiling ROI.
        cobj_roi_med = cobj_roi.median().values
        c_vals = roi / cobj_roi_med

        # Statistics of the raw (not ceiled) values.
        n_roi = len(roi.values)
        roi_mean = roi.mean().values
        roi_med = roi.median().values
        roi_std = roi.std().values
        roi_sem = roi_std / np.sqrt(n_roi)
        roi_mad = mad(roi.values)
        roi_mad_m = roi_mad / np.sqrt(n_roi)

        # Statistics of the ceiled values.
        n_ceiled = len(c_vals.values)
        croi_med = c_vals.median().values
        croi_mad = mad(c_vals.values)
        croi_mad_m = croi_mad / np.sqrt(n_ceiled)

        # Coordinates are 1-based, hence the -1 when indexing the volume.
        coord_1 = c_vals.col_to_coord_1.values
        coord_2 = c_vals.col_to_coord_2.values
        coord_3 = c_vals.col_to_coord_3.values
        for idx, element in enumerate(c_vals.values):
            brain[coord_1[idx] - 1, coord_2[idx] - 1, coord_3[idx] - 1] = element

        d[roiID + '_save'] = [roi_mean, roi_med, roi_std, roi_sem, roi_mad,
                              roi_mad_m, croi_med, croi_mad, croi_mad_m,
                              cobj_roi_med, n_ceiled, c_vals]

    # Save the brain matrix.
    sio.savemat(filename, {'brain_matrix': brain})
    return d
def sample_trend(self, t, hyper=False):
    """Sample the piecewise-linear trend at times ``t``.

    Parameters
    ----------
    t : array_like
        Times; scaled with ``self.scalers['t']`` before use.
    hyper : bool
        If true, draw new ``m``, ``k`` and ``delta`` - from the
        hyper-parameters in ``self.pe`` when the trend is hierarchical,
        otherwise from distributions fitted to the existing estimates.
        If false, reuse the estimates stored in ``self.pe``.

    Returns
    -------
    tuple
        ``(t, g_t)``: the scaled times and the trend evaluated at them.
    """
    def scaled_mad(values, **kwargs):
        # ``st.median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        # NOTE(review): the Laplace scale MLE is the *mean* absolute
        # deviation from the median; the scaled MAD is kept here only to
        # preserve the existing behaviour.
        return st.median_abs_deviation(values, scale="normal", **kwargs)

    t = self.scalers['t'].transform(t)
    s, A = changepoints(t, self.n_changepoints, self.changepoint_range)

    if hyper:
        if self.trend_hierarchical:
            m = np.random.normal(self.pe['m_mu'], self.pe['m_sigma'])
            k = np.random.normal(self.pe['k_mu'], self.pe['k_sigma'])
            delta = np.random.laplace(0, self.pe['delta_b'], (A.shape[1], 1))
        else:
            warnings.warn('hyper=True but trend not hierarchical, using MLEs')
            # Normal distributions fitted to the existing estimates.
            m = np.random.normal(np.mean(self.pe['m']), np.std(self.pe['m']))
            k = np.random.normal(np.mean(self.pe['k']), np.std(self.pe['k']))
            # Laplace distribution located at the median, scaled by the MAD.
            delta_mu = np.median(self.pe['delta'], axis=0)
            delta_b = scaled_mad(self.pe['delta'], axis=0)
            delta = np.random.laplace(delta_mu, delta_b,
                                      (self.n_changepoints, 1))
    else:
        delta = self.pe['delta'].T
        # Fix dimensions: one copy of the estimates per time step.
        m = np.repeat(self.pe['m'][None, :], t.shape[0], axis=0)
        k = np.repeat(self.pe['k'][None, :], t.shape[0], axis=0)

    # Times beyond the training range introduce changepoints that have no
    # estimated delta yet; draw them.  The row-count guard skips this when
    # delta already covers every column of A (hierarchical draw above).
    if any(t > 1) and delta.shape[0] < A.shape[1]:
        delta_mu = np.median(self.pe['delta'])
        delta_b = scaled_mad(np.ravel(self.pe['delta']))
        n_future_changepoints = A.shape[1] - self.n_changepoints
        future_delta = np.random.laplace(
            delta_mu, delta_b, (n_future_changepoints, delta.shape[1]))
        delta = np.r_[delta, future_delta]

    g_t = m
    g_t += (k + A @ delta) * t[:, None]
    g_t += A @ (-s[:, None] * delta)
    return t, g_t
def tensor_function(self, tensor):
    """Robustly standardise ``tensor`` as ``(x - median) / (MAD + eps)``.

    When ``self.ignore_value`` is set, entries equal to it are excluded from
    the median/MAD and left untouched; the remaining entries are overwritten
    in place.  The result is finally clipped to ``[self.min, self.max]`` when
    either bound is set.

    Parameters
    ----------
    tensor : numpy.ndarray
        Data to normalise.

    Returns
    -------
    numpy.ndarray
        The normalised (and optionally clipped) data.
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    if self.ignore_value is not None:
        mask = (tensor != self.ignore_value)
        kept = tensor[mask]
        median = np.median(kept)
        mad = median_abs_deviation(kept, axis=None, scale="normal")
        tensor[mask] = (kept - median) / (mad + self.eps)
    else:
        median = np.median(tensor)
        mad = median_abs_deviation(tensor, axis=None, scale="normal")
        tensor = (tensor - median) / (mad + self.eps)

    if self.min is not None or self.max is not None:
        tensor = np.clip(tensor, a_min=self.min, a_max=self.max)
    return tensor
def remove_outliers_based_on_mad(x_data, y_data):
    """Drop the (x, y) pairs in which either coordinate is a MAD outlier.

    A pair is kept when both ``|x - median_x|`` and ``|y - median_y|`` are
    smaller than ``OUTLIER_THRESHOLD_MAD`` times the respective scaled
    median absolute deviation.

    Parameters
    ----------
    x_data, y_data : sequences of numbers
        Paired observations.

    Returns
    -------
    tuple of list
        ``(filtered_x, filtered_y)``, the kept pairs in their input order.
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    median_x = np.median(x_data)
    median_y = np.median(y_data)
    # Comparing against threshold * MAD instead of dividing by the MAD gives
    # the same decision without dividing by zero when a MAD is 0.
    limit_x = OUTLIER_THRESHOLD_MAD * median_abs_deviation(x_data, scale="normal")
    limit_y = OUTLIER_THRESHOLD_MAD * median_abs_deviation(y_data, scale="normal")

    filtered_x = []
    filtered_y = []
    for x, y in zip(x_data, y_data):
        if np.fabs(x - median_x) < limit_x and np.fabs(y - median_y) < limit_y:
            filtered_x.append(x)
            filtered_y.append(y)
    return filtered_x, filtered_y
def get_statistics(raw_data):
    """
    Return statistics from all the fitness values found after running a
    metaheuristic several times. The statistics are ``nob`` (number of
    observations), ``Min`` (minimum), ``Max`` (maximum), ``Avg`` (average),
    ``Std`` (standard deviation), ``Skw`` (skewness), ``Kur`` (kurtosis),
    ``IQR`` (interquartile range), ``Med`` (median), and ``MAD`` (median
    absolute deviation, scaled to be consistent with the standard deviation
    of normally distributed data).

    :param list raw_data:
        List of the fitness values.
    :return: dict
    """
    # Get descriptive statistics
    dst = st.describe(raw_data)
    minimum, maximum = dst.minmax

    # ``st.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    mad = st.median_abs_deviation(raw_data, scale="normal")

    # Store statistics
    return dict(nob=dst.nobs,
                Min=minimum,
                Max=maximum,
                Avg=dst.mean,
                Std=np.std(raw_data),
                Skw=dst.skewness,
                Kur=dst.kurtosis,
                IQR=st.iqr(raw_data),
                Med=np.median(raw_data),
                MAD=mad)
def extract_features(train_sequences):
    """Build time- and frequency-domain summary features for each sequence.

    A magnitude channel computed by ``cal_sq_root`` is inserted into the last
    axis, then statistics are taken along axis 1 of both the signal and the
    magnitude of its FFT and concatenated.

    Parameters
    ----------
    train_sequences : numpy.ndarray
        Three-dimensional array; statistics are taken along axis 1 and
        ``cal_sq_root`` is applied along axis 2.
        NOTE(review): presumably (sequence, time, channel) - confirm.

    Returns
    -------
    numpy.ndarray
        Features concatenated along axis 1 in the order: var, mean, min, max,
        min+max, IQR, MAD, max-min (time domain), then kurtosis, integral,
        skewness, min, max, min+max, var, mean, max-min (frequency domain).
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    # Add the magnitude channel m = sqrt(x^2 + y^2 + z^2).
    # NOTE(review): index -1 inserts *before* the last channel, it does not
    # append; kept as is because the feature order depends on it.
    root = np.apply_along_axis(cal_sq_root, 2, train_sequences)
    train_sequences = np.insert(train_sequences, -1, root, axis=2)

    # Magnitude spectrum (absolute value removes the complex part).
    frequency_domain = np.absolute(np.fft.fft(train_sequences, axis=1))

    # Frequency-domain features.
    kur = kurtosis(frequency_domain, axis=1)
    integral = np.trapz(frequency_domain, axis=1)  # trapezoidal integration
    skewness = skew(frequency_domain, axis=1)
    min_fd = np.min(frequency_domain, axis=1)
    max_fd = np.max(frequency_domain, axis=1)
    min_max_sum_fd = np.sum([min_fd, max_fd], axis=0)
    var_fd = np.var(frequency_domain, axis=1)
    mean_fd = np.mean(frequency_domain, axis=1)
    min_max_sub_fd = np.subtract(max_fd, min_fd)

    # Time-domain features (names chosen so the builtins are not shadowed).
    var_td = np.var(train_sequences, axis=1)
    mean_td = np.mean(train_sequences, axis=1)
    min_td = np.min(train_sequences, axis=1)
    max_td = np.max(train_sequences, axis=1)
    min_max_sum = np.sum([min_td, max_td], axis=0)
    qr = iqr(train_sequences, axis=1)  # interquartile range
    # Median (not mean) absolute deviation.
    mad = median_abs_deviation(train_sequences, axis=1, scale="normal")
    min_max_sub = np.subtract(max_td, min_td)

    feature = np.concatenate(
        (var_td, mean_td, min_td, max_td, min_max_sum, qr, mad, min_max_sub,
         kur, integral, skewness, min_fd, max_fd, min_max_sum_fd, var_fd,
         mean_fd, min_max_sub_fd),
        axis=1)
    return feature
def stat_summarizer(figure):
    """Summarise a performance series, ignoring NaNs.

    Parameters
    ----------
    figure : array_like
        One-dimensional series of performance figures; NaNs are skipped.

    Returns
    -------
    dict
        ``avg_perf``, ``med_perf``, ``stdev``, ``medianabdev`` (scaled median
        absolute deviation), ``min_perf``, ``max_perf``, ``q1_perf``,
        ``q3_perf``, ``sharpe`` (mean / stdev) and ``sharpemad``
        (median / MAD).
    """
    avg_perf = np.nanmean(figure)
    med_perf = np.nanmedian(figure)
    stdev = np.nanstd(figure)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale='normal'`` keeps its normal-consistent scaling.
    medianabdev = stats.median_abs_deviation(
        figure, nan_policy='omit', scale='normal')

    return {
        'avg_perf': avg_perf,
        'med_perf': med_perf,
        'stdev': stdev,
        'medianabdev': medianabdev,
        'min_perf': np.nanmin(figure),
        'max_perf': np.nanmax(figure),
        'q1_perf': np.nanquantile(figure, 0.25),
        'q3_perf': np.nanquantile(figure, 0.75),
        'sharpe': avg_perf / stdev,
        'sharpemad': med_perf / medianabdev,
    }
def stat_summarizer_old(figure):
    """Summarise a performance series with quartile-based spreads.

    Parameters
    ----------
    figure : array_like
        One-dimensional series of performance figures; NaNs are skipped.

    Returns
    -------
    dict
        ``avg_perf``, ``min_perf``, ``q1_perf``, ``med_perf``, ``q3_perf``,
        ``max_perf``, ``iqr_perf`` (q3 - q1), ``max_min`` (max - min),
        ``maxq3`` (max - q3), ``q1min`` (q1 - min), ``stdev`` and
        ``medianabdev`` (scaled median absolute deviation).
    """
    min_perf = np.nanmin(figure)
    q1_perf = np.nanquantile(figure, 0.25)
    q3_perf = np.nanquantile(figure, 0.75)
    max_perf = np.nanmax(figure)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale='normal'`` keeps its normal-consistent scaling.
    medianabdev = stats.median_abs_deviation(
        figure, nan_policy='omit', scale='normal')

    return {
        'avg_perf': np.nanmean(figure),
        'min_perf': min_perf,
        'q1_perf': q1_perf,
        'med_perf': np.nanmedian(figure),
        'q3_perf': q3_perf,
        'max_perf': max_perf,
        'iqr_perf': q3_perf - q1_perf,
        'max_min': max_perf - min_perf,
        'maxq3': max_perf - q3_perf,
        'q1min': q1_perf - min_perf,
        'stdev': np.nanstd(figure),
        'medianabdev': medianabdev,
    }
def t4_rule(dataframe, df_quote=None):
    """Keep the trades whose price lies within 5 MAD of a rolling median.

    The median and scaled median absolute deviation are taken over a
    51-observation window around each trade; ``roll_delete`` post-processes
    every window row (NOTE(review): presumably it removes the observation
    itself - confirm).  ``'median'`` and ``'mad'`` columns are added to
    ``dataframe`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Trade data with a ``'price'`` column.
    df_quote : optional
        Quote data.  Not supported yet: a message is printed and ``None``
        is returned.

    Returns
    -------
    pandas.DataFrame or None
        The rows of ``dataframe`` that satisfy the rule.
    """
    if df_quote is not None:
        # Quote-aware filtering is not implemented; return explicitly so no
        # undefined result is touched.
        print('Part not yet done. Please remove the quote data')
        return None

    np_price = dataframe['price'].to_numpy()
    roll = rolling_window(np_price, 51)
    # Pad 25 copies of the first/last window so every trade has one window.
    roll = np.insert(roll, [0] * 25, roll[0], axis=0)
    roll = np.insert(roll, [-1] * 25, roll[-1], axis=0)
    # Prepend the row number, which ``roll_delete`` receives with the row.
    roll = np.insert(roll, 0, np.arange(len(roll)), axis=1)
    dat = np.apply_along_axis(roll_delete, 1, roll, len(roll))

    med_list = np.median(dat, -1)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    mad_list = stats.median_abs_deviation(dat, axis=-1, scale="normal")

    dataframe['median'] = pd.Series(med_list, index=dataframe.index)
    dataframe['mad'] = pd.Series(mad_list, index=dataframe.index)

    upper = dataframe['median'] + 5 * dataframe['mad']
    lower = dataframe['median'] - 5 * dataframe['mad']
    condition = (dataframe['price'] <= upper) & (dataframe['price'] >= lower)
    dat_out = dataframe[condition]
    return dat_out
def get_all_feature(self, inputs):
    """Compute twelve time- and frequency-domain features of a signal.

    Parameters
    ----------
    inputs : array_like
        One-dimensional signal.

    Returns
    -------
    list
        ``[min, max, mean, median, mad, std, skew, kurtosis, iqr, energy,
        wskew, wkurtosis]`` where ``std`` uses ``ddof=1``, ``energy`` comes
        from ``self.energy`` and the ``w*`` entries are the skewness and
        kurtosis of the scaled FFT magnitude.
    """
    inputs = np.array(inputs)

    # Names chosen so the ``min``/``max`` builtins are not shadowed.
    minimum = np.min(inputs)
    maximum = np.max(inputs)
    mean = np.mean(inputs)
    median = np.median(inputs)
    # Median absolute deviation.  ``stats.median_absolute_deviation`` was
    # removed in SciPy 1.9; ``scale="normal"`` keeps its scaling.
    mad = stats.median_abs_deviation(inputs, scale="normal")
    # Sample standard deviation.
    std = np.std(inputs, ddof=1)
    skew = stats.skew(inputs)
    kurtosis = stats.kurtosis(inputs)
    # Interquartile range.
    iqr = stats.iqr(inputs)
    # Energy measure.
    energy = self.energy(inputs)

    # FFT magnitude, scaled by the signal length and halved.
    process = np.abs(fft(inputs)) / len(inputs) / 2
    # Skewness and kurtosis in the frequency domain.
    wskew = stats.skew(process)
    wkurtosis = stats.kurtosis(process)

    # Collect all features into one list.
    return [
        minimum, maximum, mean, median, mad, std, skew, kurtosis, iqr,
        energy, wskew, wkurtosis
    ]
def div_signal(self, data_in):
    """Return the scaled MAD of each of ``self.num_divs`` equal segments.

    The signal is cut into ``self.num_divs`` consecutive segments of
    ``floor(size / num_divs)`` samples; samples left over at the end are
    ignored.

    Parameters
    ----------
    data_in : numpy.ndarray
        One-dimensional signal.

    Returns
    -------
    numpy.ndarray
        One median absolute deviation per segment.
    """
    div_len = int(np.floor(data_in.size / self.num_divs))
    div_mads = np.zeros(self.num_divs)
    for i in range(self.num_divs):
        segment = data_in[i * div_len:(i + 1) * div_len]
        # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        div_mads[i] = stats.median_abs_deviation(segment, scale="normal")
    return div_mads
def remove_outliers_based_on_mad(feature):
    """Return the values of ``feature`` that are not MAD outliers.

    A value is kept when ``|value - median|`` is smaller than
    ``OUTLIER_THRESHOLD_MAD`` times the scaled median absolute deviation.

    Parameters
    ----------
    feature : sequence of numbers

    Returns
    -------
    list
        The kept values in their input order.
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    median = np.median(feature)
    # Comparing against threshold * MAD instead of dividing by the MAD gives
    # the same decision without dividing by zero when the MAD is 0.
    limit = OUTLIER_THRESHOLD_MAD * median_abs_deviation(feature, scale="normal")
    return [value for value in feature if np.fabs(value - median) < limit]
def calculate_statistics(list_values):
    """Return five dispersion statistics of ``list_values``.

    Parameters
    ----------
    list_values : array_like
        One-dimensional sample.

    Returns
    -------
    tuple
        ``(coefficient of variation, interquartile range, second
        k-statistic, standard error of the mean, scaled median absolute
        deviation)``.
    """
    coefficient_of_variation = scipy.stats.variation(list_values)
    inter_quartile_range = scipy.stats.iqr(list_values)
    kstat = scipy.stats.kstat(list_values)
    standard_error_of_mean = stats.sem(list_values)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    mad = stats.median_abs_deviation(list_values, scale="normal")
    return (coefficient_of_variation, inter_quartile_range, kstat,
            standard_error_of_mean, mad)
def advance(self, delt):
    """Save, move ``delt`` rows (wrapping around), refit and redraw.

    After fitting, lines whose residual against the polynomial solution
    exceeds five robust sigmas (scaled median absolute deviation) have their
    fit entries zeroed and the residuals are fitted again.

    Parameters
    ----------
    delt : int
        Number of rows to move; may be negative.
    """
    self.save()
    self.row = self.row + delt
    print(self.row)
    # Rows are numbered 1..nRows and wrap around at both ends.
    if self.row < 1:
        self.row = self.nRows
    if self.row > self.nRows:
        self.row = 1

    self.setSpec()
    self.fitAll()
    self.fitResiduals()

    dW = self.lines_fit['dW'] - getpolyfit(self.lines_fit['wave'], self.poly)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    sigma_robust = stats.median_abs_deviation(dW, scale="normal")
    print('robust_sigma = {}'.format(sigma_robust))

    # Zero every fit entry of the rejected lines.
    index = np.where(np.abs(dW) > 5.0 * sigma_robust)
    for field in ('a', 'b', 'wave_fit', 'amplitude', 'sigma'):
        self.lines_fit[field][index] = 0.0

    self.fitResiduals()
    self.draw()
def robust_std(im, method='biweight'):
    """Return a (robust) estimate of the standard deviation of ``im``.

    Parameters
    ----------
    im : numpy.ndarray
        Input data; NaNs are ignored by every method.
    method : str
        ``'biweight'`` - square root of astropy's biweight midvariance;
        ``'mad'`` - median absolute deviation scaled to the normal sigma;
        anything else - plain ``np.nanstd``.

    Returns
    -------
    float
        The estimate computed over the flattened array.
    """
    # Reference implementation ("Mike's code") kept for comparison:
    #   m = np.nanmedian(im)         # median value
    #   d = im - m                   # deviation
    #   ad = np.abs(d)               # absolute deviation
    #   mad = np.nanmedian(ad)       # median absolute deviation
    #   if mad == 0:
    #       std = 0.                 # no deviation -> zero stdev
    #   else:
    #       wt = biweight(d/1.483/mad)      # weights
    #       sum_wt = wt.sum()
    #       sum_wt2 = (wt**2).sum()
    #       m = (im*wt).sum() / sum_wt      # weighted mean
    #       d = im-m                        # deviation from weighted mean
    #       var = (d**2 * wt).sum() / (sum_wt-sum_wt2/sum_wt)  # weighted var
    #       std = n.sqrt(var)               # weighted stdev
    if method == 'biweight':
        # Imported lazily so the other methods work without astropy.
        from astropy.stats import biweight_midvariance
        var = biweight_midvariance(im, axis=None, ignore_nan=True)
        return np.sqrt(var)

    if method == 'mad':
        # ``median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale='normal'`` keeps its normal-consistent scaling.
        from scipy.stats import median_abs_deviation
        return median_abs_deviation(im, axis=None, nan_policy='omit',
                                    scale='normal')

    return np.nanstd(im, axis=None)
def cal_numerical(target_df_1, numeric_feature, numerical_df):
    '''
    Calculate metrics for a numerical feature: non-missing count, missing
    count, median, MAD and range.

    The results are written into the ``numeric_feature`` row of
    ``numerical_df`` (columns ``count``, ``missing_count``, ``median``,
    ``mad`` and ``range``).  ``mad`` is the scaled median absolute deviation
    and is recorded as 0 when the feature contains missing values.

    :param target_df_1: DataFrame holding the feature column.
    :param numeric_feature: name of the column to summarise.
    :param numerical_df: DataFrame that receives the metrics (modified in
        place).
    :return: ``numerical_df``
    '''
    column = target_df_1[numeric_feature]

    # counts of non-NA and of missing values
    missing_count_log = column.isna().sum()
    numerical_df.loc[numeric_feature, 'count'] = column.count()
    numerical_df.loc[numeric_feature, 'missing_count'] = missing_count_log

    # distribution: median and MAD
    numerical_df.loc[numeric_feature, 'median'] = column.median()
    if missing_count_log == 0:
        # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        mad_log = stats.median_abs_deviation(column, scale="normal")
    else:
        mad_log = 0
    numerical_df.loc[numeric_feature, 'mad'] = mad_log

    # range / scaling
    numerical_df.loc[numeric_feature, 'range'] = column.max() - column.min()
    return numerical_df
def testing():
    """Print ceiled/unceiled scores of a cached result and reproduce them.

    Loads a cached score from a fixed path, selects the best layer, prints
    the ceiled, unceiled and ceiling values, then recomputes the ceiled
    center/error from the raw per-subject scores - once with the standard
    error of the mean and once with the scaled median absolute deviation.
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    # NOTE: unpickling executes arbitrary code; this path must only ever
    # point at a trusted, locally produced cache file.
    with open(
            '/braintree/home/msch/.result_caching/neural_nlp.score/'
            'benchmark=Pereira2018-encoding,model=gpt2-xl,subsample=None.pkl',
            'rb') as f:
        ceiled_score = pickle.load(f)['data']
    best_layer = ceiled_score.sel(aggregation='center').argmax('layer')
    ceiled_score = ceiled_score.isel(layer=best_layer.values)

    # overview
    ceiled_center = ceiled_score.sel(aggregation='center')
    ceiled_error = ceiled_score.sel(aggregation='error')
    print(f"ceiled: {ceiled_center.values:.2f}-+{ceiled_error.values:.2f}")

    unceiled_score = ceiled_score.raw
    unceiled_center = unceiled_score.sel(aggregation='center')
    unceiled_error = unceiled_score.sel(aggregation='error')
    print(
        f"unceiled: {unceiled_center.values:.2f}-+{unceiled_error.values:.2f}")

    ceiling_score = ceiled_score.ceiling
    ceiling_center = ceiling_score.sel(aggregation='center').values
    print(f"ceiling: {ceiling_center:.2f}-+["
          f"{ceiling_score.sel(aggregation='error_low').values:.2f},"
          f"{ceiling_score.sel(aggregation='error_high').values:.2f}]")

    # reproduce
    raw = unceiled_score.raw
    subject_scores = raw.groupby('subject').median('neuroid')
    repr_center = subject_scores.median() / ceiling_center
    repr_error = standard_error_of_the_mean(
        subject_scores, dim='subject') / ceiling_center
    print(f"reproduce: {repr_center.values:.2f}-+{repr_error.values:.2f}")

    # MAD
    mad_error = median_abs_deviation(subject_scores.values, scale="normal")
    mad_error /= ceiling_center
    print(f"MAD: {repr_center.values:.2f}-+{mad_error:.2f}")
def smad_plotter(freq_time, sigma=5.0, clip=True):
    """
    Spectral Median Absolute Deviation clipper.

    For every column of ``freq_time`` the median and the median absolute
    deviation (scaled to the normal standard deviation) are computed;
    samples further than ``sigma`` such deviations from the median are
    clipped or zeroed.

    Args:
        freq_time: two-dimensional frequency-time data; statistics are taken
            along axis 0.

        sigma (float): number of robust standard deviations at which to clip

        clip (bool): if true, clip outlying samples to the threshold;
            otherwise replace them with zero (modifies ``freq_time`` in
            place)

    Returns:
        np.ndarray: clipped/flagged data
    """
    medians = np.median(freq_time, axis=0)
    # ``scale="normal"`` already converts the MAD into a sigma estimate.  The
    # removed ``median_absolute_deviation`` applied that factor by default
    # too, so the extra explicit 1.4826 scaled the threshold twice.
    sigs = sigma * stats.median_abs_deviation(freq_time, axis=0, scale="normal")

    if clip:
        return np.clip(freq_time, a_min=medians - sigs, a_max=medians + sigs)

    # Broadcasting compares each column against its own median/threshold.
    freq_time[np.absolute(freq_time - medians) >= sigs] = 0.0
    return freq_time
def calibrate(self, win_len = 0.5, win_overlap = 0.5, k=5):
    """Derive the mixing matrix and rejection threshold from clean data.

    Uses the calibration data in ``self.x_c`` and stores ``self.mixing`` and
    ``self.threshold``.  When no calibration data is set a message is
    printed and nothing else happens.

    Parameters
    ----------
    win_len : float
        Window length, multiplied by ``self.sf`` to obtain samples.
    win_overlap : float
        Fractional overlap between consecutive windows.
    k : float
        Number of robust deviations added to the median RMS to form the
        per-component threshold.
    """
    if self.x_c is None:
        print("First use clean_windows() or set_clean_windows() to set calibration data")
        return

    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    # Covariance matrix and its geometric median.
    cov_x = np.cov(self.x_c)
    l1_mean = geometric_median(
        cov_x.reshape((-1, self.n_channels * self.n_channels)))
    C = l1_mean.reshape((self.n_channels, self.n_channels))
    self.mixing = sqrtm(np.real(C))

    # PCA of the mixing matrix, components sorted by ascending eigenvalue.
    evals, evecs = np.linalg.eig(self.mixing)
    evecs = evecs[:, np.argsort(evals)]

    # Projection of the data into component space.
    y_c = np.dot(evecs.T, self.x_c)

    # RMS of every component over sliding windows.
    n_samples = y_c.shape[1]
    win_samples = int(win_len * self.sf)
    offsets = np.int_(np.arange(0, n_samples - win_samples,
                                np.round(win_samples * (1 - win_overlap))))
    # The mean is taken before the square root; the previous form
    # ``np.sqrt(x ** 2).mean()`` computed the mean absolute value instead.
    rms_scores = [
        np.sqrt((y_c[:, o:o + win_samples] ** 2).mean(axis=1))
        for o in offsets
    ]

    # Threshold per component; median/MAD are more robust than mean/std.
    sig = median_abs_deviation(rms_scores, axis=0, scale="normal")
    mu = np.median(rms_scores, axis=0)
    self.threshold = mu + k * sig
    self.threshold = np.diag(self.threshold.dot(np.transpose(evecs)))
def get_noise(self, method="default", rmbkgd=True):
    """ get an estimation of the image's noise

    Parameters
    ----------
    method: [string/None] -optional-
        - None/default: becomes sep if a sourcebackground has been loaded,
                        nmad otherwise.
        - nmad: normalised median absolute deviation of the non-NaN values
                of self.data
        - sep: (float) global scatter estimated by sep (python Sextractor),
                i.e. rms for background subs image
        - std: (float) estimated as half of the counts difference between the
                16 and 84 percentiles

    rmbkgd: [bool]
        // ignored if method != std //
        shall the std method be measured on background subtracted image ?

    Return
    ------
    float (see method)

    Raises
    ------
    NotImplementedError
        If ``method`` is not one of the supported names.
    """
    if method is None or method in ["default"]:
        method = "sep" if hasattr(self, "_sourcebackground") else "nmad"

    if method in ["nmad"]:
        # ``median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        from scipy.stats import median_abs_deviation
        finite = self.data[~np.isnan(self.data)]
        return median_abs_deviation(finite, scale="normal")

    if method in ["std", "16-84", "84-16"]:
        data_ = self.get_data(rmbkgd=rmbkgd, applymask=True, alltrue=True)
        # data_ == data_ is False for NaN, which cleans the NaNs out
        lowersigma, upsigma = np.percentile(data_[data_ == data_], [16, 84])
        return 0.5 * (upsigma - lowersigma)

    if method in ["sep", "sextractor", "globalrms"]:
        return self.sourcebackground.globalrms

    raise NotImplementedError(f"method {method} has not been implemented. Use: 'std'")
def fold(self, pos, accept_lim=0.2, spread=0.1):
    """
    Fold low acceptance walkers into main distribution

    At the end of the burn-in, some walkers appear stuck with low
    acceptance fraction. These can be selected using a threshold, and
    folded back into the main distribution, estimated based on the median
    of the walkers with an acceptance fraction above the threshold.

    The stuck walkers are relocated with independent Gaussian draws per
    dimension, with mean equal to the median of the high acceptance
    walkers, and a standard deviation equal to ``spread`` times their
    (normal-consistent) median absolute deviation.

    Parameters
    ----------
    pos : array
        The final position of the walkers after the burn-in phase.
        Modified in place.
    accept_lim: float
        The value below which walkers will be labelled as bad and/or hence
        stuck.
    spread : float
        Factor applied to the median absolute deviation of the good walkers
        to set the width of the relocation distribution.

    Returns
    -------
    array
        ``pos`` with the stuck walkers relocated.
    """
    idx = self.sampler.acceptance_fraction < accept_lim
    nbad = np.shape(pos[idx, :])[0]
    if nbad > 0:
        flatchains = self.sampler.chain[~idx, :, :].reshape((-1, self.ndim))
        good_med = np.median(flatchains, axis=0)
        # ``st.median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        good_mad = st.median_abs_deviation(
            flatchains, axis=0, scale="normal") * spread
        pos[idx, :] = np.array([
            np.random.randn(self.ndim) * good_mad + good_med
            for _ in range(nbad)
        ])
    return pos
def q4_rule(dataframe):
    """Keep the quotes whose midquote lies within 5 MAD of a rolling median.

    ``'midquote'`` ((ofr + bid) / 2) and ``'spread'`` (ofr - bid) columns are
    added to ``dataframe``, followed by ``'median'`` and ``'mad'`` columns
    taken over a 51-observation window around each quote; ``roll_delete``
    post-processes every window row (NOTE(review): presumably it removes the
    observation itself - confirm).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Quote data with ``'ofr'`` and ``'bid'`` columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` that satisfy the rule.
    """
    # Add midquote and spread to the dataframe.
    dataframe['midquote'] = (dataframe['ofr'] + dataframe['bid']) / 2
    dataframe['spread'] = (dataframe['ofr'] - dataframe['bid'])

    np_price = dataframe['midquote'].to_numpy()
    roll = rolling_window(np_price, 51)
    # Pad 25 copies of the first/last window so every quote has one window.
    roll = np.insert(roll, [0] * 25, roll[0], axis=0)
    roll = np.insert(roll, [-1] * 25, roll[-1], axis=0)
    # Prepend the row number, which ``roll_delete`` receives with the row.
    roll = np.insert(roll, 0, np.arange(len(roll)), axis=1)
    dat = np.apply_along_axis(roll_delete, 1, roll, len(roll))

    med_list = np.median(dat, -1)
    # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    mad_list = stats.median_abs_deviation(dat, axis=-1, scale="normal")

    dataframe['median'] = pd.Series(med_list, index=dataframe.index)
    dataframe['mad'] = pd.Series(mad_list, index=dataframe.index)

    upper = dataframe['median'] + 5 * dataframe['mad']
    lower = dataframe['median'] - 5 * dataframe['mad']
    condition = (dataframe['midquote'] <= upper) & (dataframe['midquote'] >= lower)
    dat_out = dataframe[condition]
    return dat_out
def calc_rowwise_medmaxmad(self, column=''):
    """
    Compute median, maximum, and median absolute deviation from an array of
    values specified by the string of the input column name and add columns
    to hold the results. Input values might be from a filtered raster of
    iceberg pixel drafts or a series of measurements.

    NaNs inside each row's array are ignored.  Results are stored in
    ``<column>_med``, ``<column>_max`` and ``<column>_mad`` (float64); the
    MAD is scaled to be consistent with the standard deviation of normal
    data.

    Parameters
    ---------
    column: str, default ''
        Column name on which to compute median, maximum, and median absolute
        deviation
    """
    # e.g. 'draft' for iceberg water depths, 'depth' for measured depths
    req_cols = [column]
    self._validate(self._gdf, req_cols)

    result_cols = [column + '_med', column + '_max', column + '_mad']
    for key in result_cols:
        if key not in self._gdf.columns:
            # NaN placeholder (instead of the ``float`` type object) so the
            # column starts out numeric.
            self._gdf[key] = np.nan

    for row_label, indata in self._gdf[column].items():
        self._gdf.at[row_label, column + '_med'] = np.nanmedian(indata)
        self._gdf.at[row_label, column + '_max'] = np.nanmax(indata)
        # ``stats.median_absolute_deviation`` was removed in SciPy 1.9;
        # ``scale="normal"`` keeps its normal-consistent scaling.
        self._gdf.at[row_label, column + '_mad'] = stats.median_abs_deviation(
            indata, nan_policy='omit', scale="normal")

    # make sure the result columns end up as float64
    for key in result_cols:
        self._gdf[key] = self._gdf[key].astype('float64')
def cassm_clock_correct(gmug_tr, vbox_tr, trig_tr, which=0, debug=0,
                        name=None):
    """
    Find first CASSM shot in common and use cross correlation to estimate
    the clock offset between the two systems.

    Will be done for each GMuG file, but not each Vibbox file

    :param gmug_tr: Trace of B81 on gmug
    :param vbox_tr: Trace of B81 on vbox
    :param trig_tr: Trace of the CASSM trigger
    :param which: 0 for first or -1 for last trigger
    :param debug: Debug flag for correlation plot
    :param name: Name of output h5 file for plot naming if debug > 0
    :return: (lag of the correlation maximum in seconds, correlation output
        of ``normxcorr2``, trigger time); ``(0., array([0.]),
        UTCDateTime())`` when resampling the GMuG trace raises
        AttributeError
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    # Use derivative of the trigger signal to find pulse start
    dt = np.diff(trig_tr.data)
    # Threshold: mean + 70 * MAD
    threshold = np.mean(dt) + 70 * median_abs_deviation(dt, scale="normal")
    samp_to_trig = np.where(dt > threshold)[0][which]
    trig1_time = vbox_tr.stats.starttime + (float(samp_to_trig) /
                                            vbox_tr.stats.sampling_rate)
    print(' Trigger: {}'.format(trig1_time))

    # Short vbox template, longer gmug search window, both demeaned.
    cc_vbox = vbox_tr.copy().trim(
        trig1_time, endtime=trig1_time + 0.01).detrend('demean')
    cc_gmug = gmug_tr.copy().trim(
        trig1_time, endtime=trig1_time + 0.2).detrend('demean')
    print(' Vbox {}--{}'.format(vbox_tr.stats.starttime,
                                vbox_tr.stats.endtime))
    print(' GMuG {}--{}'.format(gmug_tr.stats.starttime,
                                gmug_tr.stats.endtime))
    try:
        cc_gmug.resample(cc_vbox.stats.sampling_rate)
    except AttributeError:
        # Outside range of gmug waveform
        return 0., np.array([0.0]), UTCDateTime()

    ccc = normxcorr2(cc_vbox.data, cc_gmug.data)
    max_cc = np.argmax(ccc[0])
    max_cc_sec = float(max_cc) / cc_vbox.stats.sampling_rate

    if debug > 0:
        fig, axes = plt.subplots(nrows=2)
        vbox_x = np.arange(start=max_cc, stop=max_cc + cc_vbox.data.shape[0])
        axes[0].plot(cc_gmug.data / np.max(cc_gmug.data), color='k',
                     linewidth=0.7)
        axes[1].axvline(x=max_cc, linestyle=':', color='gray')
        axes[0].axvline(x=max_cc, linestyle=':', color='gray')
        axes[0].plot(vbox_x, cc_vbox.data / np.max(cc_vbox.data), color='r',
                     linewidth=0.7)
        axes[1].plot(ccc[0], color='b', linewidth=0.7)
        plt.savefig(name.replace('.h5', 'time_cc.png'))
        plt.close('all')
    return max_cc_sec, ccc, trig1_time
def calculate_IQR(data, column_name):
    """Return the interquartile range of a column, or its raw MAD if that is 0.

    Parameters
    ----------
    data : pandas.DataFrame
    column_name : str
        Column to summarise.

    Returns
    -------
    float
        ``Q3 - Q1``; when that is zero, the unscaled median absolute
        deviation of the column instead.
    """
    column = data[column_name]
    IQR = column.quantile(0.75) - column.quantile(0.25)
    if IQR == 0:
        # ``stats.median_absolute_deviation(..., scale=1)`` was removed in
        # SciPy 1.9; ``median_abs_deviation`` is unscaled by default.
        IQR = stats.median_abs_deviation(column.values)
    return IQR
def robust_scale(x):
    """Centre ``x`` on its median and scale it by its MAD.

    Computes ``(x - median(x)) / (MAD(x) + eps)`` where the median absolute
    deviation is scaled to be consistent with the standard deviation of
    normal data and ``eps`` is the float machine epsilon, which avoids a
    division by zero for constant input.

    The computation only needs NumPy and SciPy, so the previous rpy2
    requirement (whose error message referred to ``bw_SJr``) and the unused
    R package imports were dropped.

    Parameters
    ----------
    x : numpy.ndarray
        One-dimensional sample.

    Returns
    -------
    numpy.ndarray
        The scaled sample.
    """
    # ``median_absolute_deviation`` was removed in SciPy 1.9;
    # ``scale="normal"`` keeps its normal-consistent scaling.
    mad = scipystats.median_abs_deviation(x, scale="normal")
    return (x - np.median(x)) / (mad + np.finfo(float).eps)
def detect_peaks_one_median(x, n=3):
    """Replace samples further than ``n`` MAD from the median with noise.

    Samples outside ``median +/- n * MAD`` (MAD scaled to the normal sigma)
    are replaced in a copy of ``x`` by uniform random values drawn from that
    same interval.

    Parameters
    ----------
    x : array_like
        One-dimensional signal; left unmodified.
    n : float
        Half-width of the acceptance interval in MAD units.

    Returns
    -------
    tuple
        ``(x_clean, (lower, upper))``: the cleaned copy and the acceptance
        limits.
    """
    # Imported locally: ``median_absolute_deviation`` was removed in
    # SciPy 1.9 and ``scale="normal"`` keeps its normal-consistent scaling.
    from scipy.stats import median_abs_deviation

    median = np.median(x)
    mad = median_abs_deviation(x, scale="normal")
    lower, upper = median - n * mad, median + n * mad

    x_clean = np.copy(x)
    # Measure the deviation from the median.  The previous test
    # ``abs(x) > abs(median + n * mad)`` was centred on zero, so for a
    # non-zero median it disagreed with the limits returned below.
    peaks = np.abs(x_clean - median) > n * mad
    x_clean[peaks] = np.random.uniform(lower, upper, np.count_nonzero(peaks))
    return x_clean, (lower, upper)
def ZRscore_outlier(df):
    """Collect and print the modified z-score outliers of ``df``.

    The modified z-score is ``0.6745 * (x - median) / MAD`` with the raw
    (unscaled) median absolute deviation; values whose absolute score
    exceeds 3 are appended to ``out`` and the list is printed.

    Parameters
    ----------
    df : iterable of numbers
        One-dimensional data.
    """
    # NOTE(review): ``out`` is not defined in this function; presumably a
    # module-level list that accumulates across calls - confirm.
    med = np.median(df)
    # The 0.6745 factor expects the raw MAD.  The removed
    # ``median_absolute_deviation`` already multiplied by 1.4826
    # (~1 / 0.6745), which shrank every score to ~0.455 of its value;
    # ``median_abs_deviation`` is unscaled by default.
    ma = stats.median_abs_deviation(df)
    for i in df:
        z = (0.6745 * (i - med)) / ma
        if np.abs(z) > 3:
            out.append(i)
    print("Outliers:",out)