def adjBoxplotStats(x, coeff=1.5, a=-4., b=3.):
    """
    Create skew-adjusted boxplot fences for outlier detection.

    The classical fences ``Q1 - coeff * IQR`` and ``Q3 + coeff * IQR`` are
    each rescaled by an exponential function of the medcouple (MC), so the
    fence on the long-tail side moves outwards and the one on the short-tail
    side moves inwards. Points outside the fences can be flagged as outliers.

    The method is described in: An adjusted boxplot for skewed
    distributions, Vanderviere and Huber, COMPSTAT 2004 Symposium.

    :param x: array-like of observations
    :param coeff: scalar multiplier applied to the interquartile range
    :param a: scalar; exponent coefficient of the lower fence when MC >= 0
    :param b: scalar; exponent coefficient of the upper fence when MC >= 0
    :return: dict with keys:
        fence: list ``[lower, upper]`` holding the adjusted fences
        IQR: unadjusted interquartile range
        medcouple: medcouple of ``x`` (a robust skew measure)
    """
    x = np.array(x)
    MC = medcouple(x)
    Q1, Q3 = np.percentile(x, [25, 75])
    IQR = Q3 - Q1

    # For MC < 0 the two coefficients swap roles with flipped signs, which
    # mirrors the adjustment used for MC >= 0.
    if MC >= 0:
        lower_exp, upper_exp = a, b
    else:
        lower_exp, upper_exp = -b, -a

    fence = [
        Q1 - coeff * np.exp(lower_exp * MC) * IQR,
        Q3 + coeff * np.exp(upper_exp * MC) * IQR,
    ]
    return {'fence': fence, 'IQR': IQR, 'medcouple': MC}
def calculate_outlier_resistant_mean_and_st_dev(data, number_of_outliers_allowed):
    """Return ``(mean, std)`` of ``data`` after dropping skew-adjusted outliers.

    Values outside the medcouple-adjusted boxplot fences are discarded.
    If more than ``number_of_outliers_allowed`` values would be discarded,
    nothing is removed and the statistics are computed on the full input.
    """
    skew = stattools.medcouple(data)
    # NOTE(review): NumPy 1.22 renamed ``interpolation=`` to ``method=``;
    # the original keyword is kept here so behaviour is unchanged.
    first_quartile = np.percentile(data, 25, interpolation='midpoint')
    third_quartile = np.percentile(data, 75, interpolation='midpoint')
    spread = third_quartile - first_quartile

    # Exponent coefficients depend on the direction of the skew.
    lower_exp, upper_exp = (-4, 3) if skew > 0 else (-3, 4)
    lowest_allowed = first_quartile - 1.5 * math.exp(lower_exp * skew) * spread
    highest_allowed = third_quartile + 1.5 * math.exp(upper_exp * skew) * spread

    kept = [value for value in data
            if lowest_allowed <= value <= highest_allowed]

    too_many_dropped = len(kept) < len(data) - number_of_outliers_allowed
    sample = data if too_many_dropped else kept
    return np.mean(sample), np.std(sample)
def adjusted_boxplot(self, x):
    """
    Adjusted boxplot fences for skewed distributions.

    Parameters
    ----------
    x : numpy.array
        Values of a quantitative feature.

    Return
    ------
    li : float
        Lower boundary; values below it are not considered normal.
    ls : float
        Upper boundary; values above it are not considered normal.
    """
    skew = medcouple(x)
    q1 = np.quantile(x, q=0.25)
    q3 = np.quantile(x, q=0.75)
    iqr = q3 - q1

    # The exponent coefficients swap depending on the sign of the medcouple.
    if skew >= 0:
        lower_rate, upper_rate = -4, 3
    else:
        lower_rate, upper_rate = -3, 4

    li = q1 - 1.5 * np.exp(lower_rate * skew) * iqr
    ls = q3 + 1.5 * np.exp(upper_rate * skew) * iqr
    return li, ls
def adjBoxplotStats(self, x, coeff=1.5, a=-4., b=3.):
    """Create skew-adjusted boxplot bounds for outlier detection.

    :param x: array-like of observations
    :param coeff: scalar multiplier applied to the interquartile range
    :param a: scalar; exponent coefficient of the lower fence when MC >= 0
    :param b: scalar; exponent coefficient of the upper fence when MC >= 0
    :return: dict with keys ``'fence'`` (``[lower, upper]``), ``'IQR'`` and
        ``'medcouple'``
    """
    values = np.array(x)
    skew = medcouple(values)
    # The median is computed alongside the quartiles but is not used.
    q1, _, q3 = np.percentile(values, [25, 50, 75])
    spread = q3 - q1

    if skew >= 0:
        lower = q1 - coeff * np.exp(a * skew) * spread
        upper = q3 + coeff * np.exp(b * skew) * spread
    else:
        # Negative skew: the coefficients swap roles with flipped signs.
        lower = q1 - coeff * np.exp(-b * skew) * spread
        upper = q3 + coeff * np.exp(-a * skew) * spread

    return {'fence': [lower, upper], 'IQR': spread, 'medcouple': skew}
def fx_adjusted_boxplot_Rule(series_id, series_data):
    '''
    Flag outliers with the medcouple-adjusted boxplot rule.

    A value ``y`` is reported when ``y > ur`` or ``y < lr``, where ``lr`` and
    ``ur`` are the quartiles shifted by ``1.5 * iqr`` scaled with an
    exponential of the medcouple. Both bounds are printed.

    Usage:
        df_outliers = fx_adjusted_boxplot_Rule(df['myid'], df['series1'])

    Notes carried over from the original author: less sensitive than the
    3-sigma test but more sensitive than the MAD test; does not depend on
    the mean or median; better for moderately asymmetric distributions;
    drawback: too aggressive.

    Returns the rows (columns ``'id'`` and ``'data'``) that fall outside the
    bounds. When there are none, a message is printed and ``None`` is
    returned.
    '''
    frame = pd.DataFrame({})
    frame['id'] = series_id
    frame['data'] = series_data

    q1 = np.percentile(frame['data'], 25)
    q3 = np.percentile(frame['data'], 75)
    iqr = q3 - q1
    mc = medcouple(frame['data'])

    # Exponent coefficients depend on the sign of the medcouple.
    lower_rate, upper_rate = (-4, 3) if mc >= 0 else (-3, 4)
    lr = q1 - 1.5 * iqr * np.exp(lower_rate * mc)
    ur = q3 + 1.5 * iqr * np.exp(upper_rate * mc)
    print(str(lr))
    print(str(ur))

    flagged = frame[(frame['data'] > ur) | (frame['data'] < lr)]
    if len(flagged) == 0:
        print("Three are No Outliers")
        return None
    return flagged
def adjusted_whisker_skew(ys: List[Union[float, int]]) -> float:
    """Return the IQR whisker multiplier adjusted for skew (medcouple).

    Inputs with one element or fewer get the plain multiplier ``1.5``
    because the medcouple cannot be computed for them.
    """
    if len(ys) <= 1:
        return 1.5

    skew = float(medcouple(ys))
    # The exponent coefficient depends on the sign of the medcouple.
    rate = 3 if skew >= 0 else 4
    return 1.5 * np.exp(rate * skew)
def range_adjust_box(vec):
    """Return ``[lower, upper]`` skew-adjusted boxplot limits for ``vec``.

    The whisker lengths are ``1.5 * IQR`` scaled by an exponential of the
    medcouple; the exponent coefficients depend on the medcouple's sign.
    """
    upper_quartile = np.percentile(vec, 75)
    lower_quartile = np.percentile(vec, 25)
    spread = upper_quartile - lower_quartile
    skew = float(sss.medcouple(vec))

    if skew > 0:
        up_rate, down_rate = 3, -4
    else:
        up_rate, down_rate = 4, -3

    upper_whisker = 1.5 * spread * np.exp(up_rate * skew)
    lower_whisker = 1.5 * spread * np.exp(down_rate * skew)
    return [lower_quartile - lower_whisker, upper_quartile + upper_whisker]
def _correlation_bound(self, values, iqr_coef):
    """Computes the modified Tukey bound.

    The bound is the first quartile minus ``iqr_coef`` times the
    interquartile range, scaled by ``exp(-_SKEWNESS_COEF * medcouple)``.

    Args:
      values: A list of correlation values.
      iqr_coef: (float) Coefficient to apply to the interquartile range.

    Returns:
      The correlation bound (a scalar).
    """
    q1, q3 = np.percentile(values, (25, 75))
    skew = medcouple(np.array(values), axis=None)
    skew_factor = math.exp(-self._SKEWNESS_COEF * skew)
    return q1 - iqr_coef * (q3 - q1) * skew_factor
def remove_outliers(self, df, a=1.5, n_sample=10000):
    """Zero the ``weight`` of rows whose ``ret`` lies outside adjusted fences.

    Based on
    https://wis.kuleuven.be/stat/robust/papers/2008/outlierdetectionskeweddata-revision.pdf

    ``df`` is modified in place and also returned. ``a`` is the multiplier
    applied to the interquartile range. ``n_sample`` is accepted but not
    used by this method.
    """
    returns = df.ret
    skew = medcouple(returns)
    q1, q3 = np.percentile(returns, [25, 75])
    spread = q3 - q1

    # Exponent coefficients depend on the sign of the medcouple.
    if skew > 0:
        low_rate, high_rate = -4, 3
    else:
        low_rate, high_rate = -3, 4
    lo = q1 - a * np.exp(low_rate * skew) * spread
    up = q3 + a * np.exp(high_rate * skew) * spread

    below = df.ret < lo
    above = df.ret > up
    df.loc[below, 'weight'] = 0
    df.loc[above, 'weight'] = 0
    return df
def boxplot(data):
    """Clip a Series to its skew-adjusted boxplot fences.

    Values below the lower fence are replaced by the lower fence and values
    above the upper fence by the upper fence. The fences are the quartiles
    shifted by ``1.5 * IQR`` scaled with an exponential of the medcouple.

    The input is not modified; a new ``pandas.Series`` is returned.

    :param data: pandas Series of numeric values (NaNs are ignored when the
        medcouple is computed)
    :return: pandas Series with out-of-fence values replaced by the fences
    """
    # mc can be computed directly with the medcouple function from the
    # statsmodels package.
    mc = medcouple(data.dropna())
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1

    if mc >= 0:
        lower = q1 - 1.5 * np.exp(-3.5 * mc) * iqr
        upper = q3 + 1.5 * np.exp(4 * mc) * iqr
    else:
        lower = q1 - 1.5 * np.exp(-4 * mc) * iqr
        upper = q3 + 1.5 * np.exp(3.5 * mc) * iqr

    # clip() builds a new Series. The previous masked assignments wrote
    # through pd.Series(data), which shares the caller's buffer, so the
    # caller's data could be overwritten.
    return pd.Series(data).clip(lower=lower, upper=upper)
def outliers(self, column):
    """Return the rows of ``self.data`` that are outliers in ``column``.

    The boxplot fences are corrected for skewed data using the medcouple.
    see : Mia Hubert & al 2007 - Outlier detection for skewed data
    """
    series = self.data[column]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    skew = medcouple(series)

    # Exponent coefficients depend on the sign of the medcouple.
    low_rate, high_rate = (-4, 3) if skew > 0 else (-3, 4)
    lower_bound = q1 - (spread * 1.5 * np.exp(low_rate * skew))
    upper_bound = q3 + (spread * 1.5 * np.exp(high_rate * skew))

    too_low = self.data[column] < lower_bound
    too_high = self.data[column] > upper_bound
    return self.data[too_low | too_high]
def test_medcouple_symmetric(self):
    """The evenly spaced sample 0..4 should give a medcouple of zero."""
    sample = np.arange(5.0)
    assert_almost_equal(medcouple(sample), 0)
def test_medcouple_no_axis(self):
    """``axis=None`` on a 2-D input should match the flattened input."""
    grid = np.arange(100.0).reshape((50, 2))
    whole = medcouple(grid, axis=None)
    flattened = medcouple(grid.ravel())
    assert_almost_equal(whole, flattened)
return WM remaining = np.array([]) for i in range(self.p): for j in range(L[i], R[i] + 1): remaining = np.append(remaining, self.H(i, j)) find_index = medcouple_index - Ltotal k_minimum_element = remaining[np.argpartition(remaining, find_index)] # print(find_index,'tim trong mang ',sorted(remaining)) return k_minimum_element[find_index] def naive_algorithm_testing(self): result = [self.H(i, j) for i in range(self.p) for j in range(self.q)] return np.median(result) if __name__ == '__main__': sum = 0 for i in range(1000): data = np.random.randint(low=0, high=200000, size=1000) A = Med_couple(data) sum += abs(medcouple(data) - A.kth_pair_algorithm()) # print(skew(data)) # print("kth",A.kth_pair_algorithm()) # print("naive my code",A.naive_algorithm_testing()) # print("naive",medcouple(data)) print(sum)
def test_medcouple_ties(self, reset_randomstate):
    """A sample with a repeated value should give a medcouple of 1/6."""
    sample = np.array([1, 2, 2, 3, 4])
    expected = 1.0 / 6.0
    assert_almost_equal(medcouple(sample), expected)
def test_medcouple_symmetry(self):
    """Negating the sample should negate the medcouple."""
    draws = np.random.standard_normal(100)
    forward = medcouple(draws)
    mirrored = medcouple(-draws)
    assert_almost_equal(forward + mirrored, 0)
def test_medcouple_int(self):
    """Integer and float inputs with equal values should agree (GH 4243)."""
    as_int = np.array([1, 2, 7, 9, 10])
    as_float = np.array([1, 2, 7, 9, 10.0])
    assert_equal(medcouple(as_int), medcouple(as_float))
def test_medcouple_nonzero(self):
    """A sample with expected medcouple -1/3 should give about -0.3333333."""
    sample = np.array([1, 2, 7, 9, 10.0])
    assert_almost_equal(medcouple(sample), -0.3333333)
def test_medcouple_symmetry(self, reset_randomstate):
    """Negating the sample should negate the medcouple."""
    draws = np.random.standard_normal(100)
    forward = medcouple(draws)
    mirrored = medcouple(-draws)
    assert_almost_equal(forward + mirrored, 0)
else: dictData[key] = [] for j in range(1, index): dictData[key].append(0) dictData[key].append(value) data = {} for key in dictData: values = dictData[key] if (numpy.mean(values) > 100) or ((max(values) - min(values)) > 60): data[key] = [] for index in range(0, len(values) - window - 1, 1): stdDataSet.append(numpy.std(values[index:index + window])) data[key].append(numpy.std(values[index:index + window])) medc = medcouple(stdDataSet) q75, q25 = numpy.percentile(stdDataSet, [75, 25]) iqr = q75 - q25 innerFence = q75 + 1.5 * math.exp(4 * medc) * iqr outerFence = q75 + 3 * math.exp(4 * medc) * iqr return [innerFence, outerFence] def block_delay_mean(date, honeyNodeLocation): """ Compute the average delay between GETDATA and BLOCK messages on a given date. If no date is given, compute the average over the entire available data. :param date: :param honeyNodeLocation: :return: """
def outlier_num_low(count_series, m=1.5):
    """Return the skew-adjusted lower boxplot fence of ``count_series``.

    The fence is ``Q1 - exp(k * MC) * m * IQR`` where MC is the medcouple.
    Following the adjusted boxplot of Hubert & Vandervieren, ``k`` is -4
    when MC >= 0 and -3 when MC < 0. The -4 coefficient used to be applied
    for either sign, which placed the fence too far out for left-skewed data.

    :param count_series: pandas Series of numeric values
    :param m: multiplier applied to the interquartile range
    :return: the lower fence (scalar)
    """
    q3 = count_series.quantile(q=.75)
    q1 = count_series.quantile(q=.25)
    mc = stattools.medcouple(count_series)
    exponent = -4 if mc >= 0 else -3
    return q1 - np.exp(exponent * mc) * m * (q3 - q1)