def adjBoxplotStats(x,
                    coeff=1.5,
                    a=-4.,
                    b=3.):
    """Compute skew-adjusted boxplot fences for outlier detection.

    The usual ``Q1 - coeff * IQR`` / ``Q3 + coeff * IQR`` fences are scaled by
    an exponential function of the medcouple (MC) of the sample, so points
    outside the returned fences can be identified as outliers.

    Calculation and default parameters are described in: "An adjusted boxplot
    for skewed distributions", Vandervieren and Hubert, COMPSTAT 2004
    Symposium.

    :param x: array-like sample
    :param coeff: scalar multiplier applied to the IQR on both sides
    :param a: scalar exponent factor for the lower fence when MC >= 0
        (used, negated, for the upper fence when MC < 0)
    :param b: scalar exponent factor for the upper fence when MC >= 0
        (used, negated, for the lower fence when MC < 0)
    :return: dict with keys:
        fence: list ``[lower, upper]`` of the adjusted fences
        IQR: unadjusted interquartile range (Q3 - Q1)
        medcouple: skew of the distribution as returned by ``medcouple``,
            a robust skew measure
    """
    x = np.array(x)
    MC = medcouple(x)
    # Only the outer quartiles are needed; the median is not used.
    Q1, Q3 = np.percentile(x, [25, 75])
    IQR = Q3 - Q1
    if MC >= 0:
        lower_exponent, upper_exponent = a * MC, b * MC
    else:
        # For negative skew the roles of ``a`` and ``b`` are mirrored.
        lower_exponent, upper_exponent = -b * MC, -a * MC
    fence = [
        Q1 - coeff * np.exp(lower_exponent) * IQR,
        Q3 + coeff * np.exp(upper_exponent) * IQR,
    ]
    return {'fence': fence, 'IQR': IQR, 'medcouple': MC}
Exemple #2
0
def calculate_outlier_resistant_mean_and_st_dev(data,
                                                number_of_outliers_allowed):
    """Return ``(mean, std)`` of *data* after optionally dropping outliers.

    Values outside the medcouple-adjusted boxplot fences (1.5 * IQR scaled by
    an exponential of the medcouple) are treated as outliers. They are only
    discarded when at most ``number_of_outliers_allowed`` values would be
    removed; otherwise the statistics are computed on the full *data*.
    """
    skew = stattools.medcouple(data)
    first_quartile = np.percentile(data, 25, interpolation='midpoint')
    third_quartile = np.percentile(data, 75, interpolation='midpoint')
    spread = third_quartile - first_quartile

    # Exponent constants swap (and change sign) with the direction of skew.
    lower_constant, upper_constant = (-4, 3) if skew > 0 else (-3, 4)

    lowest_inlier = first_quartile - 1.5 * math.exp(
        lower_constant * skew) * spread
    highest_inlier = third_quartile + 1.5 * math.exp(
        upper_constant * skew) * spread

    inliers = [
        value for value in data if lowest_inlier <= value <= highest_inlier
    ]

    # Too many rejected points: fall back to the untouched input.
    too_many_rejected = len(inliers) < len(data) - number_of_outliers_allowed
    clean_data = data if too_many_rejected else inliers

    return np.mean(clean_data), np.std(clean_data)
Exemple #3
0
    def adjusted_boxplot(self, x):
        """
        Boundaries of the adjusted boxplot for skewed distributions.

        The 1.5 * IQR whiskers are scaled by an exponential of the
        medcouple of ``x``; the exponents used depend on the sign of the
        medcouple.

        Parameters
        ----------
        x : numpy.array
            Array that contains the values of a quantitative feature

        Return
        ------
        li : float
            lower boundary down to which a point is considered normal
        ls : float
            upper boundary up to which a point is considered normal
        """
        skew = medcouple(x)
        lower_quartile = np.quantile(x, q=0.25)
        upper_quartile = np.quantile(x, q=0.75)
        spread = upper_quartile - lower_quartile

        # Pick the exponent rates for the lower / upper whisker.
        if skew >= 0:
            lower_rate, upper_rate = -4, 3
        else:
            lower_rate, upper_rate = -3, 4

        lower_factor = 1.5 * np.exp(lower_rate * skew)
        upper_factor = 1.5 * np.exp(upper_rate * skew)

        li = lower_quartile - lower_factor * spread
        ls = upper_quartile + upper_factor * spread
        return li, ls
 def adjBoxplotStats(self, x, coeff=1.5, a=-4., b=3.):
     """Compute skew-adjusted boxplot fences for outlier detection.

     The ``coeff * IQR`` whiskers are scaled by an exponential of the
     medcouple (MC) of ``x``: ``exp(a * MC)`` / ``exp(b * MC)`` for the
     lower / upper fence when MC >= 0, and ``exp(-b * MC)`` /
     ``exp(-a * MC)`` when MC < 0.

     :param x: array-like sample
     :param coeff: scalar multiplier applied to the IQR on both sides
     :param a: scalar exponent factor (lower fence for MC >= 0)
     :param b: scalar exponent factor (upper fence for MC >= 0)
     :return: dict with keys ``fence`` (list ``[lower, upper]``), ``IQR``
         (unadjusted Q3 - Q1) and ``medcouple`` (the MC value)
     """
     x = np.array(x)
     MC = medcouple(x)
     # Only the outer quartiles are needed; the median is not used.
     Q1, Q3 = np.percentile(x, [25, 75])
     IQR = Q3 - Q1
     if MC >= 0:
         lower_exponent, upper_exponent = a * MC, b * MC
     else:
         # For negative skew the roles of ``a`` and ``b`` are mirrored.
         lower_exponent, upper_exponent = -b * MC, -a * MC
     fence = [
         Q1 - coeff * np.exp(lower_exponent) * IQR,
         Q3 + coeff * np.exp(upper_exponent) * IQR,
     ]
     return {'fence': fence, 'IQR': IQR, 'medcouple': MC}
Exemple #5
0
def fx_adjusted_boxplot_Rule(series_id, series_data):
    '''
    Flag outliers with the medcouple-adjusted boxplot rule.

    A value y is reported as an outlier when it lies outside [lr, ur], where
    (mc is the medcouple of the data, iqr = q3 - q1):

        mc >= 0:  lr = q1 - 1.5 * iqr * exp(-4 * mc)
                  ur = q3 + 1.5 * iqr * exp( 3 * mc)
        mc <  0:  lr = q1 - 1.5 * iqr * exp(-3 * mc)
                  ur = q3 + 1.5 * iqr * exp( 4 * mc)

    Both bounds are printed before the result is returned.

    Usage :
    df_outliers = fx_adjusted_boxplot_Rule(df['myid'], df['series1'])

    Original author's notes (not verified here): less sensitive than the
    3 sigma test but more sensitive than the MAD test; no dependence on
    median and mean; better for moderately asymmetric distributions;
    drawback: too aggressive.

    :param series_id: identifiers, stored in the ``id`` column of the result
    :param series_data: values to test, stored in the ``data`` column
    :return: DataFrame with columns ``id`` and ``data`` holding the outlying
        rows, or None (after printing a message) when there are none
    '''
    v_df = pd.DataFrame({})
    v_df['id'] = series_id
    v_df['data'] = series_data

    q1 = np.percentile(v_df['data'], 25)
    q3 = np.percentile(v_df['data'], 75)
    iqr = q3 - q1

    mc = medcouple(v_df['data'])

    if mc >= 0:
        lr = q1 - 1.5 * iqr * np.exp(-4 * mc)
        ur = q3 + 1.5 * iqr * np.exp(3 * mc)
    else:
        lr = q1 - 1.5 * iqr * np.exp(-3 * mc)
        ur = q3 + 1.5 * iqr * np.exp(4 * mc)

    print(str(lr))
    print(str(ur))

    above_upper = v_df['data'] > ur
    below_lower = v_df['data'] < lr
    v_df_outliers_final = v_df[above_upper | below_lower]

    if len(v_df_outliers_final) > 0:
        return v_df_outliers_final

    # Message typo fixed: the old text ("Three are ...") read like a count.
    print("There are No Outliers")
    return None
Exemple #6
0
def adjusted_whisker_skew(ys: List[Union[float, int]]) -> float:
    """Return the IQR whisker multiplier adjusted for skew (medcouple).

    For inputs with at most one element the plain multiplier 1.5 is
    returned, because the medcouple cannot be computed for them. Otherwise
    the result is ``1.5 * exp(3 * mc)`` for a non-negative medcouple ``mc``
    and ``1.5 * exp(4 * mc)`` for a negative one.
    """
    if len(ys) > 1:
        skew = float(medcouple(ys))
        rate = 3 if skew >= 0 else 4
        return 1.5 * np.exp(rate * skew)

    # Too few points to measure skew: fall back to the unadjusted whisker.
    return 1.5
Exemple #7
0
def range_adjust_box(vec):
    """Return ``[lower, upper]`` limits of the skew-adjusted boxplot of *vec*.

    The 1.5 * IQR whiskers are multiplied by an exponential of the medcouple
    of *vec*; the exponent rates depend on the sign of the medcouple.
    """
    upper_quartile = np.percentile(vec, 75)
    lower_quartile = np.percentile(vec, 25)
    spread = upper_quartile - lower_quartile
    skew = float(sss.medcouple(vec))

    # (upper rate, lower rate) for the exponential whisker correction.
    upper_rate, lower_rate = (3, -4) if skew > 0 else (4, -3)

    upper_whisker = 1.5 * spread * np.exp(upper_rate * skew)
    lower_whisker = 1.5 * spread * np.exp(lower_rate * skew)
    return [lower_quartile - lower_whisker, upper_quartile + upper_whisker]
Exemple #8
0
    def _correlation_bound(self, values, iqr_coef):
        """Computes the modified Tukey bound.

        The bound is the first quartile minus ``iqr_coef`` times the
        interquartile range, scaled by ``exp(-self._SKEWNESS_COEF * mc)``
        where ``mc`` is the medcouple of all values.

        Args:
          values: A list of correlation values.
          iqr_coef: (float) Coefficient to apply to the interquartile range.

        Returns:
          The correlation bound (a scalar).
        """
        lower_quartile, upper_quartile = np.percentile(values, (25, 75))
        spread = upper_quartile - lower_quartile
        skewness = medcouple(np.array(values), axis=None)
        skew_factor = math.exp(-self._SKEWNESS_COEF * skewness)
        return lower_quartile - iqr_coef * spread * skew_factor
Exemple #9
0
 def remove_outliers(self, df, a=1.5, n_sample=10000):
     """Zero the ``weight`` of rows whose ``ret`` is outside the adjusted fences.

     Fences follow the skew-adjusted boxplot, see
     https://wis.kuleuven.be/stat/robust/papers/2008/outlierdetectionskeweddata-revision.pdf

     ``df`` is modified in place and also returned. ``a`` is the multiplier
     applied to the IQR; ``n_sample`` is accepted but not used here.
     """
     skew = medcouple(df.ret)
     first_quartile, third_quartile = np.percentile(df.ret, [25, 75])
     spread = third_quartile - first_quartile

     # Exponent rates for the lower / upper fence depend on the skew sign.
     lower_rate, upper_rate = (-4, 3) if skew > 0 else (-3, 4)
     lo = first_quartile - a * np.exp(lower_rate * skew) * spread
     up = third_quartile + a * np.exp(upper_rate * skew) * spread

     df.loc[df.ret < lo, 'weight'] = 0
     df.loc[df.ret > up, 'weight'] = 0
     return df
Exemple #10
0
def boxplot(data):
    """Clamp *data* to its skew-adjusted boxplot fences.

    Fences are ``q1 - 1.5 * exp(k_low * mc) * iqr`` and
    ``q3 + 1.5 * exp(k_up * mc) * iqr`` with ``(k_low, k_up)`` equal to
    ``(-3.5, 4)`` when the medcouple ``mc`` is non-negative and ``(-4, 3.5)``
    otherwise. Values below the lower fence are replaced by it, values above
    the upper fence by the upper fence.

    :param data: pandas Series of values (missing values are ignored when
        computing the medcouple)
    :return: a new Series with the clamped values; the input is left
        unmodified
    """
    # mc can be computed directly with the medcouple function from the
    # statsmodels package.
    mc = medcouple(data.dropna())
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    if mc >= 0:
        lower = q1 - 1.5 * np.exp(-3.5 * mc) * iqr
        upper = q3 + 1.5 * np.exp(4 * mc) * iqr
    else:
        lower = q1 - 1.5 * np.exp(-4 * mc) * iqr
        upper = q3 + 1.5 * np.exp(3.5 * mc) * iqr
    # clip() returns a new object, so the caller's Series is never written to
    # (pd.Series(series) may share data with its argument), and float fences
    # are handled without assigning into an integer-typed Series.
    return pd.Series(data).clip(lower=lower, upper=upper)
Exemple #11
0
    def outliers(self, column):
        """Return the rows of ``self.data`` that are outliers in ``column``.

        The boxplot bounds are corrected for skewed data, see:
        Mia Hubert & al 2007 - Outlier detection for skewed data
        """
        q1 = self.data[column].quantile(0.25)
        q3 = self.data[column].quantile(0.75)
        spread = q3 - q1
        skew = medcouple(self.data[column])

        # Exponent rates for the lower / upper bound depend on the skew sign.
        if skew > 0:
            lower_rate, upper_rate = -4, 3
        else:
            lower_rate, upper_rate = -3, 4

        lower_bound = q1 - (spread * 1.5 * np.exp(lower_rate * skew))
        upper_bound = q3 + (spread * 1.5 * np.exp(upper_rate * skew))

        too_low = self.data[column] < lower_bound
        too_high = self.data[column] > upper_bound
        return self.data[too_low | too_high]
Exemple #12
0
 def test_medcouple_symmetric(self):
     """The evenly spaced sample 0..4 should have a medcouple of (almost) 0."""
     evenly_spaced = np.arange(5.0)
     assert_almost_equal(medcouple(evenly_spaced), 0)
Exemple #13
0
 def test_medcouple_no_axis(self):
     """``axis=None`` on a 2-D array should match the flattened-array result."""
     grid = np.arange(100.0).reshape((50, 2))
     over_everything = medcouple(grid, axis=None)
     assert_almost_equal(over_everything, medcouple(grid.ravel()))
Exemple #14
0
                    return WM
        remaining = np.array([])

        for i in range(self.p):
            for j in range(L[i], R[i] + 1):
                remaining = np.append(remaining, self.H(i, j))

        find_index = medcouple_index - Ltotal

        k_minimum_element = remaining[np.argpartition(remaining, find_index)]

        # print(find_index,'tim trong mang ',sorted(remaining))
        return k_minimum_element[find_index]

    def naive_algorithm_testing(self):
        """Return the median of ``self.H(i, j)`` over every (i, j) pair.

        ``i`` runs over ``range(self.p)`` and ``j`` over ``range(self.q)``;
        all values are materialised before taking the median.
        """
        kernel_values = []
        for i in range(self.p):
            for j in range(self.q):
                kernel_values.append(self.H(i, j))
        return np.median(kernel_values)


if __name__ == '__main__':
    # Accumulate the absolute difference between ``medcouple(data)`` and
    # ``Med_couple(data).kth_pair_algorithm()`` over 1000 random integer
    # samples and print the total.
    total_abs_diff = 0  # renamed from ``sum`` to stop shadowing the builtin
    for _ in range(1000):
        data = np.random.randint(low=0, high=200000, size=1000)

        A = Med_couple(data)
        total_abs_diff += abs(medcouple(data) - A.kth_pair_algorithm())
    print(total_abs_diff)
 def test_medcouple_ties(self, reset_randomstate):
     """A sample with a repeated value (1, 2, 2, 3, 4) has medcouple 1/6."""
     tied_sample = np.array([1, 2, 2, 3, 4])
     assert_almost_equal(medcouple(tied_sample), 1.0 / 6.0)
 def test_medcouple_symmetry(self):
     """Negating the sample should flip the sign of its medcouple."""
     draws = np.random.standard_normal(100)
     forward = medcouple(draws)
     mirrored = medcouple(-draws)
     assert_almost_equal(forward + mirrored, 0)
 def test_medcouple_int(self):
     """Integer and float arrays with equal values give equal medcouples."""
     # GH 4243
     from_ints = medcouple(np.array([1, 2, 7, 9, 10]))
     from_floats = medcouple(np.array([1, 2, 7, 9, 10.0]))
     assert_equal(from_ints, from_floats)
 def test_medcouple_nonzero(self):
     """The sample (1, 2, 7, 9, 10) has a medcouple of about -1/3."""
     sample = np.array([1, 2, 7, 9, 10.0])
     assert_almost_equal(medcouple(sample), -0.3333333)
 def test_medcouple_symmetric(self):
     """Medcouple of the evenly spaced values 0.0 .. 4.0 is (almost) zero."""
     ramp = np.arange(5.0)
     assert_almost_equal(medcouple(ramp), 0)
Exemple #20
0
 def test_medcouple_symmetry(self, reset_randomstate):
     """medcouple(x) and medcouple(-x) should cancel out for a random sample."""
     sample = np.random.standard_normal(100)
     positive_side = medcouple(sample)
     negative_side = medcouple(-sample)
     assert_almost_equal(positive_side + negative_side, 0)
Exemple #21
0
 def test_medcouple_nonzero(self):
     """Check a sample whose medcouple is expected to be about -0.3333333."""
     skewed = np.array([1, 2, 7, 9, 10.0])
     assert_almost_equal(medcouple(skewed), -0.3333333)
Exemple #22
0
            else:
                dictData[key] = []
                for j in range(1, index):
                    dictData[key].append(0)
                dictData[key].append(value)

    data = {}
    for key in dictData:
        values = dictData[key]
        if (numpy.mean(values) > 100) or ((max(values) - min(values)) > 60):
            data[key] = []
            for index in range(0, len(values) - window - 1, 1):
                stdDataSet.append(numpy.std(values[index:index + window]))
                data[key].append(numpy.std(values[index:index + window]))

    medc = medcouple(stdDataSet)
    q75, q25 = numpy.percentile(stdDataSet, [75, 25])
    iqr = q75 - q25
    innerFence = q75 + 1.5 * math.exp(4 * medc) * iqr
    outerFence = q75 + 3 * math.exp(4 * medc) * iqr
    return [innerFence, outerFence]


def block_delay_mean(date, honeyNodeLocation):
    """
    Compute the average delay between GETDATA and BLOCK messages on a given date. If no date is given, compute the average
    over the entire available data.

    NOTE(review): only the signature and this docstring are visible here; the parameter notes below are assumptions
    to be confirmed against the implementation and its callers.

    :param date: date to compute the average for. The signature declares no default, so "no date" is presumably
        signalled by a sentinel value such as None -- TODO confirm.
    :param honeyNodeLocation: presumably selects which honey node's data is used -- TODO confirm.
    :return: the average delay described above; type and units are not visible here -- TODO confirm.
    """
Exemple #23
0
 def test_medcouple_int(self):
     """Same values as an integer array and as a float array: same medcouple."""
     # GH 4243
     integer_result = medcouple(np.array([1, 2, 7, 9, 10]))
     float_result = medcouple(np.array([1, 2, 7, 9, 10.0]))
     assert_equal(integer_result, float_result)
Exemple #24
0
def outlier_num_low(count_series, m=1.5):
    """Return the lower skew-adjusted boxplot fence of *count_series*.

    Computed as ``q1 - exp(-4 * mc) * m * (q3 - q1)`` where ``mc`` is the
    medcouple of the series and ``m`` the whisker multiplier.
    """
    upper_quartile = count_series.quantile(q=.75)
    lower_quartile = count_series.quantile(q=.25)
    skew_factor = np.exp(-4 * stattools.medcouple(count_series))
    spread = upper_quartile - lower_quartile
    return lower_quartile - skew_factor * m * spread
Exemple #25
0
 def test_medcouple_ties(self, reset_randomstate):
     """Tied observations: medcouple of (1, 2, 2, 3, 4) should be 1/6."""
     with_ties = np.array([1, 2, 2, 3, 4])
     expected = 1.0 / 6.0
     assert_almost_equal(medcouple(with_ties), expected)
 def test_medcouple_no_axis(self):
     """With ``axis=None`` a (50, 2) array is treated like its raveled form."""
     matrix = np.reshape(np.arange(100.0), (50, 2))
     all_values_result = medcouple(matrix, axis=None)
     assert_almost_equal(all_values_result, medcouple(matrix.ravel()))