Ejemplo n.º 1
def test_validate_median_initial():
    ser = Series([1, 2])
    msg = (r"the 'overwrite_input' parameter is not "
           r"supported in the pandas "
           r"implementation of median\(\)")
    with pytest.raises(ValueError, match=msg):
        # It seems like np.median doesn't dispatch, so we use the
        # method instead of the ufunc.
Ejemplo n.º 2
    def impute(self,
               value: float,
               sample: pd.Series,
               conf: float = 0.95) -> float:
        Imputes outlier values using Auto Regressive method with two lags


        * **:param value:** (float)
        * **:param sample:** (pd.Series)
        * **:param conf:** (float)


        * **value:** (float)
        qq = 1 - (1 - conf) / 2
        sample = sample.copy()
        sample.reset_index(drop=True, inplace=True)

        loc = np.where(np.asanyarray(~np.isnan(sample[sample == value])))[0][0]
        sample.iloc[loc, :] = np.nan
        sample.fillna(sample.median(), inplace=True)

        model = AutoReg(sample.values, lags=2, trend='n',
        ss = np.std(model.resid)

        predictions = model.predict(start=0, end=len(sample) + 1)

        percent = stats.t.ppf(q=qq, df=len(sample) - 1)
        max_lim = predictions[loc] + percent * ss * np.sqrt(1 +
                                                            1 / len(sample))
        min_lim = predictions[loc] - percent * ss * np.sqrt(1 +
                                                            1 / len(sample))

        if Utils.is_between(min_lim, value, max_lim):

            return np.array([])

        elif Utils.is_between(min_lim, predictions[loc], max_lim):

            return predictions[loc]


            return sample.median()
Ejemplo n.º 3
def global_filter(X: pd.Series,
                  no_change_window: int = 3,
                  min_value: float = None,
                  max_value: float = None,
                  allow_zero: bool = False,
                  allow_negative: bool = False,
                  copy=True) -> pd.Series:
    """Replace implausible readings of a time-indexed series with NaN.

    The filters are applied in this order:

    1. values whose absolute changes over the trailing ``no_change_window``
       hours sum to less than 1e-3 (a flat line), skipping the first
       ``no_change_window`` hours of samples;
    2. values below ``min_value`` / above ``max_value`` (when given);
    3. values not above float32 machine epsilon, unless ``allow_zero``;
    4. negative values, unless ``allow_negative``;
    5. values whose magnitude exceeds ten times the magnitude of the median
       of what is left.

    :param X: series to clean; the time-based rolling window requires a
        datetime-like index.
    :param no_change_window: length, in hours, of the flat-line window.
    :param min_value: lower bound, or None to disable.
    :param max_value: upper bound, or None to disable.
    :param allow_zero: keep values at or below float32 epsilon.
    :param allow_negative: keep negative values.
    :param copy: work on a copy instead of modifying ``X`` in place.
    :return: the filtered series.
    :raises ValueError: if ``X`` is not a ``pd.Series``.
    """
    if not isinstance(X, pd.Series):
        raise ValueError('Input data is expected of pd.Series type')

    if copy:
        X = X.copy()

    # Smallest spacing between samples -> number of samples per hour.
    time_step = X.index.to_series().diff().min()
    steps_per_hour = math.ceil(pd.Timedelta(hours=1) / time_step)
    start = int(no_change_window * steps_per_hour)

    # A Timedelta window avoids the deprecated upper-case 'H' alias.
    changes = X.diff().abs()
    window = pd.Timedelta(hours=no_change_window)
    stale = changes.rolling(window).sum() < 1e-3
    X[start:] = X[start:].mask(stale, np.nan)

    if min_value is not None:
        X.loc[X < min_value] = np.nan
    if max_value is not None:
        X.loc[X > max_value] = np.nan
    if not allow_zero:
        X.loc[X <= np.finfo(np.float32).eps] = np.nan
    if not allow_negative:
        X.loc[X < 0] = np.nan

    # Compare magnitudes on both sides: with a negative median the previous
    # ``X.abs() > 10 * median`` test was true for every value.
    median = X.median()
    X.loc[X.abs() > 10 * abs(median)] = np.nan
    return X
Ejemplo n.º 4
def test_nat_operations():
    # GH 8617
    # The NaT entry must not influence the result: each reduction has to
    # come back as the single valid element.
    td_series = Series([0, pd.NaT], dtype="m8[ns]")
    only_valid = td_series[0]
    for reduce_name in ("median", "min", "max"):
        assert getattr(td_series, reduce_name)() == only_valid
Ejemplo n.º 5
def fix_missing(df: pd.DataFrame, col: pd.Series,
                target_label: str, na_dict: dict = None) -> dict:
    """Fill the NaNs of a numeric column and flag the rows that were filled.

    When ``col`` is numeric and either contains NaNs or already has an entry
    in ``na_dict``, two columns are written to ``df`` in place:

    * ``<target_label>_na`` -- boolean, True where ``col`` was NaN;
    * ``<target_label>``    -- ``col`` with NaNs replaced by the filler.

    The filler is ``na_dict[target_label]`` when present, otherwise the
    median of ``col``; it is stored back in ``na_dict``.
    Non-numeric columns are left untouched.

    Example::

        df = pd.DataFrame({'col1': [1, np.nan, 3], 'col2': [5, 2, 2]})
        na_dict = fix_missing(df, df['col1'], 'col1')
        df['col1'].tolist()      # [1.0, 2.0, 3.0]
        df['col1_na'].tolist()   # [False, True, False]
        na_dict['col1']          # 2.0

    :param df: frame that receives the filled column and the flag column.
    :param col: the column values to inspect (a Series, not a label).
    :param target_label: name under which the filled column is stored.
    :param na_dict: previously chosen fillers, keyed by column label.
    :return: ``na_dict`` (a new dict when None was passed) with the filler
        used for ``target_label`` added.
    """
    if na_dict is None:
        na_dict = {}
    if not is_numeric_dtype(col):
        return na_dict

    missing = pd.isnull(col)
    known_filler = target_label in na_dict
    if missing.sum() or known_filler:
        df[target_label + '_na'] = missing
        filler = na_dict[target_label] if known_filler else col.median()
        df[target_label] = col.fillna(filler)
        na_dict[target_label] = filler
    return na_dict
Ejemplo n.º 6
def calculate_offset(data: pd.Series, transform_value: str,
                     offset: Optional[float], gbd_round_id: int,
                     decomp_step: str) -> float:
    Calculates the offset based on the transform.
    Uses custom offset if specified.
    Default offset = median(data) * offset_factor (as per Dismod).

        data: the data whose median is used to calculate an offset
        transform_value: whether the model is running in log or logit space
        offset: user-specified custom offset

        Calculated offset, or custom offset if given
    if offset is not None:
        if not (helpers.use_old_methods(gbd_round_id, decomp_step)
                and gbd_round_id == 6):
                f'Found offset {offset} in config. Running with a custom offset '
                'is not recommended; if offset is left blank, ST-GPR will pick a '
                'decent offset for you')
        return offset

    # 0.01 is *way* too large for logit models, so use 0.001 instead.
    offset_factor = (0.01 if transform_value
                     == lookup_tables.TransformType.log.name else 0.001)
    return data.median() * offset_factor
Ejemplo n.º 7
def MAD_outliers(data: pd.Series, num_std: float = 3.0) -> pd.Series:
    """Detect outliers with a MAD-based method and return a boolean mask.

    The detection uses the robust method based on the median absolute
    deviation (page 19, Data Cleaning, 2019):

    >> ... the median and the median absolute deviation (MAD) that can
    >> replace mean and standard deviation, respectively.

    >> The median and MAD lead to a robust outlier detection technique known
    >> as Hampel X84 that is quite reliable in the face of outliers since it
    >> has a breakdown point of 50%. Hampel X84 marks outliers as those data
    >> points that are more than 1.4826θ MADs away from the median, where θ
    >> is the number of standard deviations away from the mean one would
    >> have used if there were no outliers in the dataset. The constant
    >> 1.4826 is derived under a normal distribution, where one standard
    >> deviation away from the mean is about 1.4826 MADs.

    :param data: values to inspect.
    :param num_std: the θ of the quote above; defaults to 3.
    :return: boolean series, True where the value lies outside
        ``median ± 1.4826 * num_std * MAD``. NaN entries are never flagged.
    """
    theta = 1.4826

    median = data.median()
    # Series.median skips NaN; np.median would return NaN as soon as one
    # value is missing, which silently disabled the whole detection.
    mad = (data - median).abs().median()

    half_width = mad * (theta * num_std)
    lower = median - half_width
    upper = median + half_width

    return (data < lower) | (data > upper)
Ejemplo n.º 8
def test_reductions_td64_with_nat():
    # GH#8617
    # With one valid timedelta and one NaT, median/min/max must all return
    # the valid timedelta.
    values = Series([0, pd.NaT], dtype="m8[ns]")
    expected = values[0]
    reductions = (values.median, values.min, values.max)
    for reduce in reductions:
        assert reduce() == expected
Ejemplo n.º 9
def _prepare_float(
    column_series: pd.Series,
    mean: bool = True,
    rescale: bool = True,
    standardize: bool = True,
) -> pd.Series:
    Fill missing values of a float column with its mean or median,
    and standardize it.

    :author: Victor Nepveu
    :param column_series: column to process.
    :param mean: whether to fill missing values with the mean or the median.
    :param rescale: whether to rescale the column (standardize or normalize).
    :param standardize: whether to apply standardization or normalization.
    :return: the processed column.
    # Fill missing values with the mean or the median of the column
    filling_value = column_series.mean() if mean else column_series.median()
    column_series.fillna(filling_value, inplace=True)

    if rescale:
        a = column_series.mean() if standardize else column_series.min()
        b = column_series.std() if standardize else column_series.max() - a
        column_series = column_series.apply(lambda x: (x - a) / b)

    return column_series
Ejemplo n.º 10
    def test_median(self):
        """Compare ``Series.median`` with ``np.median``."""
        named_series = tm.makeStringSeries()
        named_series = named_series.rename("series")
        self._check_stat_op("median", np.median, named_series)

        # Integer data: the NumPy function and the Series method must agree.
        ones = Series(np.ones(10, dtype=int), index=range(10))
        numpy_median = np.median(ones)
        tm.assert_almost_equal(numpy_median, ones.median())
Ejemplo n.º 11
    def test_median(self):
        """Compare ``Series.median`` with ``np.median``."""
        labelled = tm.makeStringSeries()
        labelled = labelled.rename('series')
        self._check_stat_op('median', np.median, labelled)

        # Integer data: the NumPy function and the Series method must agree.
        int_ones = Series(np.ones(10, dtype=int), index=lrange(10))
        via_numpy = np.median(int_ones)
        tm.assert_almost_equal(via_numpy, int_ones.median())
Ejemplo n.º 12
def test_nat_operations():
    # GH 8617
    # One valid timedelta plus NaT: every reduction must equal the valid one.
    data = Series([0, pd.NaT], dtype='m8[ns]')
    first = data[0]
    results = [data.median(), data.min(), data.max()]
    for value in results:
        assert value == first
Ejemplo n.º 13
def statistics_imputer(X: Series, method: str = 'mean', null_value: List = None) -> Series:
    """Fill missing values of ``X`` with one of its summary statistics.

    The input series is never modified; a new series is returned.

    :param X: series to impute.
    :param method: statistic used as filler; currently only 'mean',
        'median', 'mode', 'max' and 'min' are supported.
    :param null_value: list of values that are treated as missing in
        addition to NaN.
    :return: a series in which every missing entry holds the statistic.
    :raises ValueError: if ``method`` is not one of the supported names.
    """
    if null_value is not None:
        # ``mask`` returns a new, NaN-capable series, so integer input does
        # not need an in-place NaN assignment.
        X = X.mask(X.isin(null_value))

    if method == 'mean':
        fill_value = X.mean()
    elif method == 'median':
        fill_value = X.median()
    elif method == 'mode':
        fill_value = X.mode()[0]
    elif method == 'max':
        fill_value = X.max()
    elif method == 'min':
        fill_value = X.min()
    else:
        raise ValueError('未配置的填充方法')

    return X.fillna(fill_value)
Ejemplo n.º 14
def basic_stat_map(s: pd.Series) -> dict:
    return {
        "mean": s.mean(),
        "median": s.median(),
        "std": s.std(),
        "count": s.count(),
Ejemplo n.º 15
    def _box_stats(ds: pd.Series,
                   med: bool = True,
                   iqr: bool = True,
                   count: bool = True) -> str:
        Create the metric part with stats of the box (axis) caption

        ds: pd.Series
            data on which stats are found
        med: bool
        iqr: bool
        count: bool

        stats: str
            caption with summary stats
        # interquartile range
        iqr = ds.quantile(q=[0.75, 0.25]).diff()
        iqr = abs(float(iqr.loc[0.25]))

        met_str = []
        if med:
            met_str.append('Median: {:.3g}'.format(ds.median()))
        if iqr:
            met_str.append('IQR: {:.3g}'.format(iqr))
        if count:
            met_str.append('N: {:d}'.format(ds.count()))
        stats = '\n'.join(met_str)

        return stats
Ejemplo n.º 16
def replace_na_skewness(col_series: pd.Series,
                        skew_threshold: float = 0.2,
                        mode: str = 'auto') -> pd.Series:
    """Fill NaNs with the mean or the median, optionally chosen by skewness.

    :param col_series: column whose missing values are filled.
    :param skew_threshold: in 'auto' mode the mean is used when the skewness
        lies within ``[-skew_threshold, skew_threshold]``; otherwise (this
        includes an undefined, NaN skewness) the median is used.
    :param mode: 'auto', 'mean' or 'median'.
    :return: a new series with the missing values filled.
    :raises ValueError: if ``mode`` is not one of the accepted names.
    """
    if mode == 'auto':
        if -skew_threshold <= col_series.skew() <= skew_threshold:
            return col_series.fillna(col_series.mean())
        else:
            return col_series.fillna(col_series.median())
    elif mode == 'mean':
        return col_series.fillna(col_series.mean())
    elif mode == 'median':
        return col_series.fillna(col_series.median())
    else:
        raise ValueError("invalid mode: only accepts 'auto', 'mean', 'median'")
Ejemplo n.º 17
def mod_z(col: pd.Series, alpha: float = 0.6745) -> pd.Series:
    """Return the absolute modified Z-score of ``col``.

    Each value is centred on the median of ``col``, divided by the median
    absolute deviation from that median and multiplied by ``alpha``.

    :param col: values to score.
    :param alpha: scaling constant applied to the ratio.
    :return: the absolute scores, aligned with ``col``.
    """
    med_col = col.median()
    centred = col - med_col
    med_abs_dev = np.abs(centred).median()
    # Local name kept distinct from the function name.
    score = alpha * (centred / med_abs_dev)
    return np.abs(score)
Ejemplo n.º 18
 def create_interaction_description(interaction_count_series: Series) -> Dict:
     """Return min, max, mean and median of the series, stored under the
     MIN, MAX, MEAN and MEDIAN keys."""
     counts = interaction_count_series
     return {
         MIN: counts.min(),
         MAX: counts.max(),
         MEAN: counts.mean(),
         MEDIAN: counts.median(),
     }
Ejemplo n.º 19
    def _crunch_all(self, unit):
        """Call all statistic-calculating methods for each unit with data."""


        s = Series(unit.just_readings)

        unit.summary = s.describe()

        unit.median = s.median()
Ejemplo n.º 20
def test_empty_window_median_quantile():
    # GH 26005
    # A zero-length rolling window has to yield NaN everywhere for both
    # median and quantile.
    all_nan = Series([np.nan, np.nan, np.nan])
    empty_roll = Series(np.arange(3)).rolling(0)

    computations = (
        lambda: empty_roll.median(),
        lambda: empty_roll.quantile(0.1),
    )
    for compute in computations:
        tm.assert_series_equal(compute(), all_nan)
Ejemplo n.º 21
	def _crunch_all(self, unit):
		"""Call all statistic-calculating methods for each unit with data."""


		s = Series(unit.just_readings)

		unit.summary = s.describe()

		unit.median = s.median()
def MCZI(x: pd.Series):
    """ Pearson Type III distribution

    Computes, element-wise:

    * ``zsi = (x - median(x)) / std(x)``
    * ``cs  = zsi ** 3 / len(x)``
    * ``czi = 6 / cs * (cs / 2 * zsi + 1) ** (1 / 3) - 6 / cs + cs / 6``

    NOTE(review): ``cs`` is evaluated per element rather than summed into a
    single coefficient -- confirm that this is intended. Elements equal to
    the median give ``cs == 0`` and therefore a division by zero.

    :param x: input values.
    :return: the ``czi`` series, aligned with ``x``.
    """
    zsi = (x - x.median()) / x.std()
    cs = np.power(zsi, 3) / len(x)
    czi = (
        6.0 / cs * np.power(
            (cs / 2.0 * zsi + 1.0), 1.0 / 3.0
        ) - 6.0 / cs + cs / 6.0
    )
    return czi
Ejemplo n.º 23
def plot_hist_level(s: pd.Series, label: str) -> None:

    s: pd.Series

                label='now: {0:.2f}'.format(s.iloc[-1]))
    plt.axhline(s.median(), c='r', label='Median: {0:.2f}'.format(s.median()))
    plt.axhline(s.mean(), c='y', label='Average: {0:.2f}'.format(s.mean()))
    plt.legend(bbox_to_anchor=(1.05, 0.5), loc=2, borderaxespad=0.)
Ejemplo n.º 24
def NormalizeDatasetMethod2(ds: pd.Series):
    """Centre ``ds`` on its median and divide by the mean absolute deviation
    from that median; the result is returned as a plain list."""
    centre = ds.median()
    total_deviation = sum(abs(value - centre) for value in ds)
    mean_abs_dev = total_deviation / len(ds)
    return [(value - centre) / mean_abs_dev for value in ds]
Ejemplo n.º 25
def summarize_column(column: pd.Series):
    """Return mean, median, min, max and std of ``column`` as Python floats.

    ``std`` is None when the standard deviation is undefined (NaN).

    :param column: values to summarize.
    :return: dict with the keys 'mean', 'median', 'min', 'max' and 'std'.
    """
    # Computed once and reused for both the NaN check and the result.
    raw_std = column.std()
    if np.isnan(raw_std):
        std = None
    else:
        std = float(raw_std)
    return {
        'mean': float(column.mean()),
        'median': float(column.median()),
        'min': float(column.min()),
        'max': float(column.max()),
        'std': std
    }
Ejemplo n.º 26
def _prepare_int(column_series: pd.Series, mean: bool = True) -> pd.Series:
    """Fill missing values of an integer column with its mean or median.

    :author: Robin Courant
    :param column_series: column to process.
    :param mean: whether to fill missing values with the mean or the median.
    :return: the processed column.
    # Fill missing values with the mean or the median of the column
    filling_value = column_series.mean() if mean else column_series.median()
    column_series.fillna(int(filling_value), inplace=True)

    return column_series
Ejemplo n.º 27
def _get_ellipses(x: pd.Series, y: pd.Series, how, **kwargs):
    if not "alpha" in kwargs:
        kwargs["alpha"] = 0.1
    if how == "mean":
        center = (x.mean(), y.mean())
        s_x = x.std()
        s_y = y.std()
        widths = [2 * s_x, 4 * s_x, 6 * s_x]
        heights = [2 * s_y, 4 * s_y, 6 * s_y]
    elif how == "median":
        center = (x.median(), y.median())
        iqr_x = x.quantile(0.75) - x.quantile(0.25)
        iqr_y = y.quantile(0.75) - y.quantile(0.25)
        widths = [iqr_x, 1.5 * iqr_x, 3 * iqr_x]
        heights = [iqr_y, 1.5 * iqr_y, 3 * iqr_y]
        raise ValueError("`how` must be one of {mean,median}, got %s" % how)
    return [
        mpl.patches.Ellipse(center, widths[0], heights[0], **kwargs),
        mpl.patches.Ellipse(center, widths[1], heights[1], **kwargs),
        mpl.patches.Ellipse(center, widths[2], heights[2], **kwargs),
Ejemplo n.º 28
  def get_summary_indicators_from_hist(sf, hist, int_index=False):
    """Summarize a histogram given as a ``{label: frequency}`` mapping.

    The mean, median, standard deviation and maximum of the frequencies are
    always reported under the ``'freq'`` key. With ``int_index`` the labels
    are additionally converted to integers and summarized under ``'index'``,
    and ``index_total`` becomes the sum of ``frequency * label``.

    :param hist: histogram mapping. With ``int_index`` the frequency of the
        largest label is looked up as ``hist[str(label)]``, so string keys
        holding integers are assumed -- TODO confirm against callers.
    :param int_index: whether the labels are integers to be summarized too.
    :return: dict with the keys 'means', 'medians', 'stds', 'max' and
        'index_total' ('NA' when ``int_index`` is false).
    """
    seriesHist = Series(hist)
    maxs = {
      'freq': {
        'freq': seriesHist.max(),
        'index': seriesHist.idxmax(),
      }
    }
    means = {'freq': seriesHist.mean()}
    medians = {'freq': seriesHist.median()}
    stds = {'freq': seriesHist.std()}
    index_total = 'NA'

    if int_index:
      index_list = seriesHist.index.astype(int).tolist()
      # Pair values positionally: integer keys on a string-labelled Series
      # relied on the deprecated positional fallback of ``Series[int]``.
      index_total = sum(
        freq * label for freq, label in zip(seriesHist.tolist(), index_list))
      index_series = Series(index_list)

      means['index'] = index_series.mean()
      medians['index'] = index_series.median()
      stds['index'] = index_series.std()
      maxs['freq']['index'] = int(maxs['freq']['index'])

      largest_label = max(index_list)
      maxs['index'] = {
        'index': largest_label,
        'freq': hist[str(largest_label)],
      }

    return {
      'means': means,
      'medians': medians,
      'stds': stds,
      'max': maxs,
      'index_total': index_total
    }
Ejemplo n.º 29
    def _plot_price_histogram(price_data: pd.Series,
                              title: str,
                              x_tick_interval: int,
        """ Plot histogram of price

        price_data : pd.Series
            price data
        title : str
            plot title
        x_tick_interval : int
            interval for x axis
        price_data = price_data.dropna()
        n_obs = len(price_data)

        with plt.style.context('bmh'):
            n_bins = 20
            ax = price_data.hist(bins=n_bins, alpha=0.9, **kwargs)


            plt.ylabel("number of offers")

            # Format X axis

            # Add median price
            median_price = price_data.median()
            hist_y, hist_x = np.histogram(price_data, bins=n_bins)
            plt.axvline(median_price, color='midnightblue', linewidth=2)
            plt.text(median_price, np.quantile(hist_y, 0.25),
                     s=f"Median price={median_price:,.0f} z\u0142",

            # Add number of observations
            plt.text(hist_x.min(), hist_y.max() * 0.9, s=f"Total number of offers={n_obs}",
Ejemplo n.º 30
def get_stats(s: pd.Series):
    """ Calculate basic sample `s` statistics.

    The list holds, in order: Q1, median, Q3, 90th/95th/99th percentile,
    IQR, mean and std (both rounded to 2 decimals), min, max and length.
    """
    lower_quartile, upper_quartile = s.quantile(0.25), s.quantile(0.75)
    centre = [lower_quartile, s.median(), upper_quartile]
    upper_tail = [s.quantile(level) for level in (0.90, 0.95, 0.99)]
    spread = [
        upper_quartile - lower_quartile,
        round(s.mean(), 2),
        round(s.std(), 2),
    ]
    extent = [s.min(), s.max(), len(s)]
    return centre + upper_tail + spread + extent
Ejemplo n.º 31
def showNumericalInfo(data:pd.Series):
    """Print summary statistics of a numeric series.

    Printed, one per line: name and dtype, number of missing values, number
    of unique values, max, min, mean, median and mode.

    :param data: the Series to describe.
    """
    print(data.name, data.dtype)
    print("Miss:", data.isnull().sum())
    print("Unique:", data.nunique())
    print("Max:", data.max())
    print("Min:", data.min())
    print("Mean:", data.mean())
    print("Median:", data.median())
    # mode() can return several values; only the first one is shown.
    print("Mode:", data.mode()[0])
Ejemplo n.º 32
def robust_zscore(x: pd.Series, zscore=False):
    """Robust ZScore Normalization

    Use robust statistics for Z-Score normalization:
        mean(x) = median(x)
        std(x) = MAD(x) * 1.4826

    The normalized values are clipped to [-3, 3]. With ``zscore`` the
    clipped values are additionally standardized with their own mean and
    standard deviation.

    :param x: values to normalize; the caller's series is not modified.
    :param zscore: apply the extra mean/std standardization after clipping.
    :return: the normalized series.
    """
    x = x - x.median()
    mad = x.abs().median()
    x = np.clip(x / mad / 1.4826, -3, 3)
    if zscore:
        # ``x`` is a fresh object at this point, so the in-place operators
        # do not touch the caller's data.
        x -= x.mean()
        x /= x.std()
    return x
Ejemplo n.º 33
 def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
     """Build a smoothed bound for ``segment``.

     The segment is shifted so its minimum is zero and divided by its median
     to determine its top or bottom part: for ``Bound.UPPER`` the values
     above the median are kept, otherwise the values at or below it; all
     other values are replaced by the median. That part is smoothed with
     ``utils.exponential_smoothing`` and then offset by the largest absolute
     difference between the part and its smoothed version.

     NOTE(review): the offset is added for both bound kinds, and the shift
     by the segment minimum is not undone -- confirm that this is intended.

     :param segment: values of the segment.
     :param bound: which side of the median to follow.
     :return: the bound, indexed like ``segment``; segments with fewer than
         two points are returned unchanged.
     """
     if len(segment) < 2:
         return segment
     comparison_operator = operator.gt if bound == Bound.UPPER else operator.le
     segment = segment - segment.min()
     segment_median = segment.median()
     part = [val if comparison_operator(val, segment_median) else segment_median for val in segment.values]
     part = pd.Series(part, index = segment.index)
     smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
     max_diff = max(abs(x - y) for x, y in zip(part, smoothed_part))
     # Separate name for the result instead of rebinding the ``bound`` argument.
     bound_values = [val + max_diff for val in smoothed_part.values]
     return pd.Series(bound_values, index = segment.index)
Ejemplo n.º 34
def plot_monthly_returns(rets: pd.Series,
                         title: str = 'Retornos Mensais',
                         show_mean: bool = True,
                         show_median: bool = True,
                         size: tuple = (18, 6),
                         name: str = None,
                         is_return: bool = False) -> None:
    """Função desenvolvida especificamente para plotar os retornos
    mensais, em barplot. Imprime também a média e a mediana dos
    mesmos, se desejado. Salva o plot se 'name' for dado.

        rets (pd.Series): retornos (mensais).
        show_mean (bool, optional): se True, também exibe a média.
        show_median (bool, optional): se True, também exibe a mediana.
        size (tuple): tamanho do plot.
        name (str, optional): se != None, salva o plot em 'save_path'.

        NameError: se len(name) == 0.
    colors = map(lambda r: 'indianred' if r < 0 else 'blue', rets)

    fig, ax = plt.subplots(figsize=size)
    rets.plot.bar(ax=ax, color=list(colors))

    if show_mean:
        ax.axhline(y=rets.mean(), ls=':', color='green', label='Média')
    if show_median:
        ax.axhline(y=rets.median(), ls='-', color='goldenrod', label='Mediana')

    if show_mean or show_median:


    if name:
        if len(name) > 0:
            plt.savefig(save_path + str(name) + '.png', dpi=200)
            raise NameError('Nome da figura precisa ter ao menos um caracter.')

    return ax if is_return else plt.show()
Ejemplo n.º 35
def _esd(x, max_outlier, alpha, direction):
    The ESD test using median and MAD in the calculation of the test statistic.
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        median = x.median()
        mad = np.median([abs(value - median) for value in x]) * _MAD_CONSTANT
        if mad == 0:
        if direction == 'both':
            ares = x.map(lambda value: abs(value - median) / mad)
        elif direction == 'pos':
            ares = x.map(lambda value: (value - median) / mad)
        elif direction == 'neg':
            ares = x.map(lambda value: (median - value) / mad)
        r_idx = ares.idxmax()
        r = ares[r_idx]
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n-i-1)
        lam = (n-i)*crit / np.sqrt((n-i-1+crit**2) * (n-i+1))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s" %
                         (i, max_outlier, median, mad, r_idx, r, crit, lam))
        if r > lam:
            x = x.drop(r_idx)
            # The r keeps decreasing while lam keeps increasing. Therefore, when r is less than lam for the first time,
            # we can stop.
    return outlier_index
Ejemplo n.º 36
trigrams = {}
for line in lines:
    trigram = line.strip().lower()[0:3]
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1

trigram_series = Series(trigrams.values(), index=trigrams.keys())
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(trigram_series.quantile([.25, .50, .75, .99]).to_string())

print "median is: {0}".format(trigram_series.median())
unique_trigrams = []
for trigram, count in trigrams.iteritems():
    if count > trigram_series.quantile(.50):

print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))

trie = {}
for trigram in unique_trigrams:
    current_dict = trie
    for index, letter in enumerate(trigram):
# <codecell>

Series(d, index=['b', 'c', 'd', 'a'])

# <codecell>


# <codecell>


# <codecell>

s[s > s.median()]

# <codecell>


# <codecell>


# <codecell>

'bob' in s

# <codecell>
Ejemplo n.º 38
# A data series does not have to be of homogeneous type.
# However, many of the manipulation which you'll perform probably assume
# homogeneity.
s = Series( { "a": 42, "b": "foo", "c": 42J } )
pprint( s )

# <demo> --- stop ---

s = Series(
    randn( 5 )
pprint( s )

# Data series are not only Python sequences but also act like NumPy 'ndarray'
# objects.
print( s[s > s.median()] )
pprint( np.exp(s) )

# Data series act like Python 'dict' objects as well.
print( 1 in s )

# Labels with no corresponding values use NaN for their missing values.
# This is true both during series initialization and alignment.
pprint( s[ 1 : ] + s[ : -1 ] )

# <demo> --- stop ---

# 'DataFrame' objects are 2D.
# They can be created from a 'dict' of Series objects, a 2D array, etc...

df = DataFrame( { "col1": randn( 4 ), "col2": randn( 4 ) } )