Python Series.median Examples, pandas.Series.median Python Examples

Example #1

0

Show file

File: test_reductions.py Project: 701789262a/arbobotti

def test_validate_median_initial():
    """``Series.median`` must reject the ``overwrite_input`` keyword with a ValueError."""
    expected_message = (
        r"the 'overwrite_input' parameter is not "
        r"supported in the pandas "
        r"implementation of median\(\)"
    )
    series = Series([1, 2])
    # np.median does not seem to dispatch to the Series implementation, so
    # the method is called directly instead of going through numpy.
    with pytest.raises(ValueError, match=expected_message):
        series.median(overwrite_input=True)

Example #2

0

Show file

File: outliers.py Project: crivero7/RackioAI

    def impute(self,
               value: float,
               sample: pd.Series,
               conf: float = 0.95) -> float:
        """
        Decide how to replace a suspected outlier using an auto-regressive
        model with two lags.

        A copy of ``sample`` has the entry matching ``value`` blanked out and
        refilled with the median, an ``AutoReg(lags=2, trend='n')`` model is
        fitted to it, and a band of
        ``prediction +/- t_quantile * std(resid) * sqrt(1 + 1/n)`` is built
        around the model prediction at that position.

        **Parameters**

        * **:param value:** (float) observation under test. The ``[0][0]``
          lookup below fails with ``IndexError`` when nothing in ``sample``
          equals it.
        * **:param sample:** (pd.Series) data containing ``value``; only a copy
          is modified. NOTE(review): ``sample.iloc[loc, :]`` uses two indexers,
          which suggests a single-column DataFrame is passed in practice
          despite the annotation -- confirm with callers.
        * **:param conf:** (float) two-sided confidence level used for the
          Student-t quantile.

        **returns**

        * **value:** an empty ``np.ndarray`` when ``Utils.is_between`` accepts
          ``value`` inside the band; otherwise the model prediction at that
          position when ``Utils.is_between`` accepts it; otherwise the median
          of the filled copy. NOTE(review): the ``float`` return annotation
          does not cover the empty-array case.

        """
        # Upper-tail probability of a two-sided interval at level ``conf``.
        qq = 1 - (1 - conf) / 2
        # Work on a copy with a fresh 0..n-1 index; the caller's object is untouched.
        sample = sample.copy()
        sample.reset_index(drop=True, inplace=True)

        # First position that is not NaN in ``sample[sample == value]``.
        loc = np.where(np.asanyarray(~np.isnan(sample[sample == value])))[0][0]
        # Blank that entry and refill every NaN with the median.
        sample.iloc[loc, :] = np.nan
        sample.fillna(sample.median(), inplace=True)

        model = AutoReg(sample.values, lags=2, trend='n',
                        old_names=False).fit()
        # Spread of the fitted model's residuals.
        ss = np.std(model.resid)

        predictions = model.predict(start=0, end=len(sample) + 1)

        # Student-t quantile with n - 1 degrees of freedom.
        percent = stats.t.ppf(q=qq, df=len(sample) - 1)
        max_lim = predictions[loc] + percent * ss * np.sqrt(1 +
                                                            1 / len(sample))
        min_lim = predictions[loc] - percent * ss * np.sqrt(1 +
                                                            1 / len(sample))

        if Utils.is_between(min_lim, value, max_lim):

            # The observation lies inside the band: nothing to impute.
            return np.array([])

        elif Utils.is_between(min_lim, predictions[loc], max_lim):

            return predictions[loc]

        else:

            # Fall back to the median of the filled copy.
            return sample.median()

Example #3

0

Show file

def global_filter(X: pd.Series,
                  no_change_window: int = 3,
                  min_value: float = None,
                  max_value: float = None,
                  allow_zero: bool = False,
                  allow_negative: bool = False,
                  copy=True) -> pd.Series:
    """Replace implausible readings of a time-indexed series with NaN.

    The filters run in this order:

    1. "Stuck" values: from position ``no_change_window * steps_per_hour``
       onwards, a value is dropped when the absolute changes summed over the
       trailing ``no_change_window`` hours are below ``1e-3``.
    2. Values below ``min_value`` / above ``max_value`` (when given).
    3. Unless ``allow_zero``: values ``<= float32 eps``. Note that this
       comparison also removes negative values, whatever ``allow_negative`` is.
    4. Unless ``allow_negative``: values ``< 0``.
    5. Values whose magnitude exceeds ten times the median of what is left.

    Args:
        X: series whose index supports time differences (the rolling window
            is time based).
        no_change_window: length, in hours, of the "no change" window.
        min_value: optional lower bound.
        max_value: optional upper bound.
        allow_zero: keep values that are (numerically) zero.
        allow_negative: keep negative values.
        copy: filter a copy instead of modifying ``X`` in place.

    Returns:
        The filtered series (``X`` itself when ``copy`` is false).

    Raises:
        ValueError: if ``X`` is not a ``pd.Series``.
    """
    if not isinstance(X, pd.Series):
        raise ValueError('Input data is expected of pd.Series type')

    if copy:
        X = X.copy()

    # The smallest gap between consecutive index entries is the sampling step.
    time_step = X.index.to_series().diff().min()
    # Timedelta(hours=1) and the lowercase 'h' alias replace the deprecated
    # uppercase 'H' hour alias.
    steps_per_hour = math.ceil(pd.Timedelta(hours=1) / time_step)
    start = int(no_change_window * steps_per_hour)

    changes = X.diff().abs()
    stuck = changes.rolling(f'{no_change_window}h').sum() < 1e-3
    X[start:] = X[start:].mask(stuck, np.nan)

    if min_value is not None:
        X.loc[X < min_value] = np.nan
    if max_value is not None:
        X.loc[X > max_value] = np.nan
    if not allow_zero:
        X.loc[X <= np.finfo(np.float32).eps] = np.nan
    if not allow_negative:
        X.loc[X < 0] = np.nan

    # The median is taken after the filters above (NaN entries are skipped).
    median = X.median()
    X.loc[X.abs() > 10 * median] = np.nan
    return X

Example #4

0

Show file

def test_nat_operations():
    """median/min/max of a timedelta Series holding a NaT equal its only valid entry (GH 8617)."""
    td_series = Series([0, pd.NaT], dtype="m8[ns]")
    only_valid = td_series[0]
    # Same order as the reductions are listed: median, min, max.
    for reduce in (td_series.median, td_series.min, td_series.max):
        assert reduce() == only_valid

Example #5

0

Show file

File: utilities.py Project: p2327/mltools

def fix_missing(df: pd.DataFrame, col: pd.Series,
                target_label: str, na_dict: dict = None) -> dict:
    """
    Fill the missing values of a numeric column and record the filler used.

    Nothing happens for non-numeric columns. For a numeric column that has
    missing values, or whose label is already a key of ``na_dict``:

    * ``df[target_label + '_na']`` is set to the missing-value mask of ``col``;
    * ``df[target_label]`` is set to ``col`` with the gaps filled, using
      ``na_dict[target_label]`` when present and the column median otherwise;
    * the filler is stored under ``na_dict[target_label]``.

    ``df`` and a supplied ``na_dict`` are both modified in place; the
    dictionary (a new one when ``na_dict`` is None) is returned.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1': [1, np.nan, 3], 'col2': [5, 2, 2]})
    >>> na = fix_missing(df, df['col1'], 'col1')
    >>> float(na['col1'])
    2.0
    >>> df['col1'].tolist()
    [1.0, 2.0, 3.0]
    >>> df['col1_na'].tolist()
    [False, True, False]
    """
    fillers = {} if na_dict is None else na_dict

    # Only numeric columns are handled.
    if not is_numeric_dtype(col):
        return fillers

    missing_mask = pd.isnull(col)
    if not (missing_mask.sum() or (target_label in fillers)):
        return fillers

    df[target_label+'_na'] = missing_mask
    if target_label in fillers:
        filler = fillers[target_label]
    else:
        filler = col.median()
    df[target_label] = col.fillna(filler)
    fillers[target_label] = filler
    return fillers

Example #6

0

Show file

File: offset.py Project: cheth-rowe/ihmexp

def calculate_offset(data: pd.Series, transform_value: str,
                     offset: Optional[float], gbd_round_id: int,
                     decomp_step: str) -> float:
    """
    Return the offset to use for the given transform.

    A user-supplied offset always wins. Otherwise the default is
    median(data) * offset_factor (as per Dismod), where the factor is 0.01
    for the log transform and 0.001 for anything else.

    Args:
        data: the data whose median is used to calculate an offset
        transform_value: name of the transform the model runs in; compared
            against ``lookup_tables.TransformType.log.name``
        offset: user-specified custom offset, or None
        gbd_round_id: passed to ``helpers.use_old_methods``; the custom-offset
            warning is skipped only when old methods are in use and this is 6
        decomp_step: passed to ``helpers.use_old_methods``

    Returns:
        Calculated offset, or custom offset if given
    """
    if offset is None:
        # 0.01 is *way* too large for logit models, so use 0.001 instead.
        is_log = transform_value == lookup_tables.TransformType.log.name
        offset_factor = 0.01 if is_log else 0.001
        return data.median() * offset_factor

    old_methods_round_6 = (helpers.use_old_methods(gbd_round_id, decomp_step)
                           and gbd_round_id == 6)
    if not old_methods_round_6:
        warnings.warn(
            f'Found offset {offset} in config. Running with a custom offset '
            'is not recommended; if offset is left blank, ST-GPR will pick a '
            'decent offset for you')
    return offset

Example #7

0

Show file

def MAD_outliers(data: pd.Series) -> pd.Series:
    """Flag outliers with the median / MAD rule (Hampel X84) and return a boolean mask.

    Median and median absolute deviation (MAD) stand in for mean and standard
    deviation, as described in "Data Cleaning" (2019), page 19: Hampel X84
    marks as outliers the points lying more than ``1.4826 * theta`` MADs away
    from the median, ``theta`` being the number of standard deviations one
    would have used on outlier-free data. The constant 1.4826 is derived
    under a normal distribution.

    Here ``theta`` is fixed at 3. The returned series is True where ``data``
    falls strictly outside ``median +/- 3 * 1.4826 * MAD``.
    """
    n_sigmas = 3
    mad_to_sigma = 1.4826

    center = data.median()
    mad = np.median((data - center).abs())
    half_width = mad * (mad_to_sigma * n_sigmas)

    too_low = data < center - half_width
    too_high = data > center + half_width
    return too_low | too_high

Example #8

0

Show file

File: test_reductions.py Project: 701789262a/arbobotti

def test_reductions_td64_with_nat():
    """Reductions on a timedelta64 Series containing NaT return its only valid value (GH#8617)."""
    values = Series([0, pd.NaT], dtype="m8[ns]")
    expected = values[0]

    median_result = values.median()
    assert median_result == expected

    min_result = values.min()
    assert min_result == expected

    max_result = values.max()
    assert max_result == expected

Example #9

0

Show file

def _prepare_float(
    column_series: pd.Series,
    mean: bool = True,
    rescale: bool = True,
    standardize: bool = True,
) -> pd.Series:
    """
    Fill missing values of a float column with its mean or median,
    and standardize it.

    :author: Victor Nepveu
    :param column_series: column to process.
    :param mean: whether to fill missing values with the mean or the median.
    :param rescale: whether to rescale the column (standardize or normalize).
    :param standardize: whether to apply standardization or normalization.
    :return: the processed column.
    """
    # Fill missing values with the mean or the median of the column
    filling_value = column_series.mean() if mean else column_series.median()
    column_series.fillna(filling_value, inplace=True)

    if rescale:
        a = column_series.mean() if standardize else column_series.min()
        b = column_series.std() if standardize else column_series.max() - a
        column_series = column_series.apply(lambda x: (x - a) / b)

    return column_series

Example #10

0

Show file

    def test_median(self):
        """``Series.median`` matches ``np.median`` on the fixture series and on integer data."""
        named_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("median", np.median, named_series)

        # Integer input: both medians must agree as well.
        ones = Series(np.ones(10, dtype=int), index=range(10))
        numpy_median = np.median(ones)
        tm.assert_almost_equal(numpy_median, ones.median())

Example #11

0

Show file

File: test_stat_reductions.py Project: DusanMilunovic/pandas

    def test_median(self):
        """Compare ``Series.median`` with ``np.median`` for the fixture series and for integers."""
        fixture = tm.makeStringSeries().rename('series')
        self._check_stat_op('median', np.median, fixture)

        # Same comparison on an all-ones integer series.
        int_series = Series(np.ones(10, dtype=int), index=lrange(10))
        expected = np.median(int_series)
        tm.assert_almost_equal(expected, int_series.median())

Example #12

0

Show file

File: test_datetime.py Project: BobMcFry/pandas

def test_nat_operations():
    """With a NaT present, median, min and max all return the single valid timedelta (GH 8617)."""
    deltas = Series([0, pd.NaT], dtype='m8[ns]')
    valid = deltas[0]
    results = []
    for reduction_name in ('median', 'min', 'max'):
        results.append(getattr(deltas, reduction_name)())
        # Check each reduction as soon as it has been computed.
        assert results[-1] == valid

Example #13

0

Show file

File: Imputer.py Project: pengliang1226/model_procedure

def statistics_imputer(X: Series, method: str = 'mean', null_value: List = None) -> Series:
    """
    Fill missing values with a summary statistic of the series
    (mean, median, mode, ...).

    :param X: series to impute; a copy is filled, the input is not modified
    :param method: currently one of 'mean', 'median', 'mode', 'max', 'min'
    :param null_value: list of values to treat as missing (they are replaced
        by NaN before the statistic is computed)
    :return: the filled copy
    """
    filled = X.copy()
    if null_value is not None:
        filled[filled.isin(null_value)] = np.nan

    # (method name, statistic) pairs, checked in order.
    reducers = (
        ('mean', lambda s: s.mean()),
        ('median', lambda s: s.median()),
        ('mode', lambda s: s.mode()[0]),
        ('max', lambda s: s.max()),
        ('min', lambda s: s.min()),
    )
    for name, reducer in reducers:
        if method == name:
            fill_value = reducer(filled)
            break
    else:
        # No known method matched.
        raise Exception('未配置的填充方法')

    filled.fillna(fill_value, inplace=True)

    return filled

Example #14

0

Show file

def basic_stat_map(s: pd.Series) -> dict:
    """Return the mean, median, standard deviation and count of ``s`` keyed by name."""
    stat_names = ("mean", "median", "std", "count")
    # Each statistic is the Series method of the same name, called without arguments.
    return {stat: getattr(s, stat)() for stat in stat_names}

Example #15

0

Show file

    def _box_stats(ds: pd.Series,
                   med: bool = True,
                   iqr: bool = True,
                   count: bool = True) -> str:
        """
        Create the metric part with stats of the box (axis) caption

        Parameters
        ----------
        ds: pd.Series
            data on which stats are found
        med: bool
        iqr: bool
        count: bool
            statistics

        Returns
        -------
        stats: str
            caption with summary stats
        """
        # interquartile range
        iqr = ds.quantile(q=[0.75, 0.25]).diff()
        iqr = abs(float(iqr.loc[0.25]))

        met_str = []
        if med:
            met_str.append('Median: {:.3g}'.format(ds.median()))
        if iqr:
            met_str.append('IQR: {:.3g}'.format(iqr))
        if count:
            met_str.append('N: {:d}'.format(ds.count()))
        stats = '\n'.join(met_str)

        return stats

Example #16

0

Show file

def replace_na_skewness(col_series: pd.Series,
                        skew_threshold: float = 0.2,
                        mode: str = 'auto') -> pd.Series:
    """Return ``col_series`` with its missing values filled by the mean or the median.

    In ``'auto'`` mode the mean is used when the skewness of the column lies
    within ``[-skew_threshold, skew_threshold]`` and the median otherwise;
    ``'mean'`` and ``'median'`` force the respective statistic.

    Raises ValueError for any other ``mode``.
    """
    if mode not in ('auto', 'mean', 'median'):
        raise ValueError("invalid mode: only accepts 'auto', 'mean', 'median'")

    strategy = mode
    if mode == 'auto':
        # A nearly symmetric distribution gets the mean, a skewed one the median.
        nearly_symmetric = -skew_threshold <= col_series.skew() <= skew_threshold
        strategy = 'mean' if nearly_symmetric else 'median'

    if strategy == 'mean':
        fill_value = col_series.mean()
    else:
        fill_value = col_series.median()
    return col_series.fillna(fill_value)

Example #17

0

Show file

def mod_z(col: pd.Series, alpha: float = 0.6745) -> pd.Series:
    '''
    Return the absolute modified Z-score of the variable ``col``:
    ``|alpha * (col - median) / MAD|``, where MAD is the median of the
    absolute deviations from the median.
    '''
    centered = col - col.median()
    median_abs_dev = np.abs(centered).median()
    scores = alpha * (centered / median_abs_dev)
    return np.abs(scores)

Example #18

0

Show file

 def create_interaction_description(interaction_count_series: Series) -> Dict:
     """Summary statistics (min, max, mean, median) of the per-user interaction counts for one interaction type."""
     return {
         MIN: interaction_count_series.min(),
         MAX: interaction_count_series.max(),
         MEAN: interaction_count_series.mean(),
         MEDIAN: interaction_count_series.median(),
     }

Example #19

0

Show file

File: dexcom_stats.py Project: dmblue12/iPancreas-archive

    def _crunch_all(self, unit):
        """Call all statistic-calculating methods for each unit with data."""

        unit.calculate_GVI_and_PGS()

        s = Series(unit.just_readings)

        unit.summary = s.describe()

        unit.median = s.median()

Example #20

0

Show file

File: test_rolling.py Project: xinrong-databricks/pandas

def test_empty_window_median_quantile():
    """Rolling median and quantile over a window of size 0 are all NaN (GH 26005)."""
    all_nan = Series([np.nan, np.nan, np.nan])
    empty_window = Series(np.arange(3)).rolling(0)

    # Each aggregation is computed and checked in turn.
    aggregations = (
        lambda: empty_window.median(),
        lambda: empty_window.quantile(0.1),
    )
    for aggregate in aggregations:
        tm.assert_series_equal(aggregate(), all_nan)

Example #21

0

Show file

File: dexcom_stats.py Project: kentquirk/iPancreas

	def _crunch_all(self, unit):
		"""Call all statistic-calculating methods for each unit with data."""

		unit.calculate_GVI_and_PGS()

		s = Series(unit.just_readings)

		unit.summary = s.describe()

		unit.median = s.median()

Example #22

0

Show file

File: modified_china_z_index.py Project: tommylees112/esowc_notes

def MCZI(x: pd.Series):
    """Modified China-Z index style transform of ``x`` (Pearson Type III distribution).

    ``zsi`` is the deviation from the median in units of the standard
    deviation; ``cs`` is computed element-wise as ``zsi ** 3 / len(x)``.
    NOTE(review): ``cs`` is not summed over the sample here -- confirm that a
    per-element skewness term is intended.
    """
    spread = x.std()
    zsi = (x - x.median()) / spread
    cs = np.power(zsi, 3) / len(x)

    cube_root_term = np.power((cs / 2.0 * zsi + 1.0), 1.0 / 3.0)
    czi = 6.0 / cs * cube_root_term - 6.0 / cs + cs / 6.0
    return czi

Example #23

0

Show file

def plot_hist_level(s: pd.Series, label: str) -> None:
    """
    Plot a series together with its historical reference levels.

    Horizontal lines mark the latest value (dashed), the median (red) and
    the mean (yellow); the figure is then shown.

    Parameters
    ----------
    s: pd.Series
        series to plot
    label: str
        legend label of the series line
    Returns
    -------
    None
    """
    s.plot(label=label)

    latest = s.iloc[-1]
    plt.axhline(latest, linestyle='--', label='now: {0:.2f}'.format(latest))

    median_level = s.median()
    plt.axhline(median_level, c='r',
                label='Median: {0:.2f}'.format(median_level))

    mean_level = s.mean()
    plt.axhline(mean_level, c='y',
                label='Average: {0:.2f}'.format(mean_level))

    # Legend is placed outside the axes, to the right.
    plt.legend(bbox_to_anchor=(1.05, 0.5), loc=2, borderaxespad=0.)
    plt.show()

Example #24

0

Show file

def NormalizeDatasetMethod2(ds: pd.Series):
    """Center ``ds`` on its median and scale by the mean absolute deviation around it.

    Returns a plain list with one normalized value per element of ``ds``.
    """
    center = ds.median()
    # Mean absolute deviation from the median.
    mean_abs_dev = sum(abs(value - center) for value in ds) / len(ds)
    return [(value - center) / mean_abs_dev for value in ds]

Example #25

0

Show file

def summarize_column(column: pd.Series):
    """Return mean, median, min, max and std of ``column`` as plain floats.

    ``'std'`` is None when the standard deviation is NaN.
    """
    spread = column.std()
    summary = {
        stat: float(getattr(column, stat)())
        for stat in ('mean', 'median', 'min', 'max')
    }
    summary['std'] = None if np.isnan(spread) else float(spread)
    return summary

Example #26

0

Show file

def _prepare_int(column_series: pd.Series, mean: bool = True) -> pd.Series:
    """Fill missing values of an integer column with its mean or median.

    :author: Robin Courant
    :param column_series: column to process.
    :param mean: whether to fill missing values with the mean or the median.
    :return: the processed column.
    """
    # Fill missing values with the mean or the median of the column
    filling_value = column_series.mean() if mean else column_series.median()
    column_series.fillna(int(filling_value), inplace=True)

    return column_series

Example #27

0

Show file

def _get_ellipses(x: pd.Series, y: pd.Series, how, **kwargs):
    """Build three concentric ellipses describing the spread of ``x`` and ``y``.

    ``how="mean"`` centers on the means, with widths/heights of 2, 4 and 6
    standard deviations; ``how="median"`` centers on the medians, with
    1, 1.5 and 3 interquartile ranges. Remaining keyword arguments go to
    ``mpl.patches.Ellipse``; ``alpha`` defaults to 0.1.

    Raises ValueError for any other ``how``.
    """
    kwargs.setdefault("alpha", 0.1)

    if how == "mean":
        center = (x.mean(), y.mean())
        spread_x, spread_y = x.std(), y.std()
        multipliers = (2, 4, 6)
    elif how == "median":
        center = (x.median(), y.median())
        spread_x = x.quantile(0.75) - x.quantile(0.25)
        spread_y = y.quantile(0.75) - y.quantile(0.25)
        multipliers = (1, 1.5, 3)
    else:
        raise ValueError("`how` must be one of {mean,median}, got %s" % how)

    # One ellipse per multiplier, innermost first.
    return [
        mpl.patches.Ellipse(center, m * spread_x, m * spread_y, **kwargs)
        for m in multipliers
    ]

Example #28

0

Show file

File: processing.py Project: igncp/encina

  def get_summary_indicators_from_hist(sf, hist, int_index=False):
    """Summarize a histogram given as a mapping of label -> frequency.

    The result always holds the mean, median, standard deviation and maximum
    (value and label) of the frequencies. With ``int_index`` the labels are
    converted to integers and the result also holds the same statistics of
    the labels, the largest label with its frequency, and ``index_total``,
    the sum of frequency * label. Without it ``index_total`` is ``'NA'``.

    Args:
      hist: mapping of label to frequency; with ``int_index`` the labels must
        convert to int and ``hist`` is looked up with ``str(label)``.
      int_index: whether the labels are to be treated as integers.

    Returns:
      dict with the keys 'means', 'medians', 'stds', 'max' and 'index_total'.
    """
    seriesHist = Series(hist)
    maxs = {
      'freq': {
        'freq': seriesHist.max(),
        'index': seriesHist.idxmax(),
      }
    }

    means = {'freq': seriesHist.mean()}
    medians = {'freq': seriesHist.median()}
    stds = {'freq': seriesHist.std()}
    index_total = 'NA'

    if int_index:
      index_list = seriesHist.index.astype(int).tolist()
      # Frequencies and labels are paired by position. Looking a value up
      # with ``seriesHist[i]`` would treat the integer as a label on this
      # label-indexed Series.
      index_total = sum(
        freq * label for freq, label in zip(seriesHist.values, index_list))
      index_series = Series(index_list)

      means['index'] = index_series.mean()
      medians['index'] = index_series.median()
      stds['index'] = index_series.std()

      maxs['freq']['index'] = int(maxs['freq']['index'])

      largest_label = max(index_list)
      maxs['index'] = {
        'index': largest_label,
        'freq': hist[str(largest_label)],
      }

    return {
      'means': means,
      'medians': medians,
      'stds': stds,
      'max': maxs,
      'index_total': index_total
    }

Example #29

0

Show file

File: analyzer.py Project: tzaton/flat_scraper

    def _plot_price_histogram(price_data: pd.Series,
                              title: str,
                              x_tick_interval: int,
                              **kwargs):
        """ Plot histogram of price

        Draws a 20-bin histogram of the non-missing prices in the 'bmh'
        style, marks the median price with a vertical line and a rotated
        label, and writes the number of observations in the upper left part
        of the plot. Nothing is returned.

        Parameters
        ----------
        price_data : pd.Series
            price data (missing values are dropped before plotting)
        title : str
            plot title
        x_tick_interval : int
            interval for x axis (spacing of the major ticks)
        **kwargs
            forwarded to ``price_data.hist``
        """
        price_data = price_data.dropna()
        n_obs = len(price_data)

        with plt.style.context('bmh'):
            n_bins = 20
            ax = price_data.hist(bins=n_bins, alpha=0.9, **kwargs)

            # Light grid, no top/right frame lines.
            ax.grid(linewidth=0.5)
            ax.spines['right'].set_visible(False)
            ax.spines['top'].set_visible(False)

            plt.title(title)
            plt.xlabel("price")
            plt.ylabel("number of offers")

            # Format X axis
            plt.xticks(rotation=45)
            ax.xaxis.set_major_locator(
                ticker.MultipleLocator(base=x_tick_interval))
            ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

            # Add median price
            median_price = price_data.median()
            # Bin counts/edges are recomputed with the same number of bins to
            # position the annotations.
            hist_y, hist_x = np.histogram(price_data, bins=n_bins)
            plt.axvline(median_price, color='midnightblue', linewidth=2)
            # Label height: lower quartile of the bin counts.
            plt.text(median_price, np.quantile(hist_y, 0.25),
                     s=f"Median price={median_price:,.0f} z\u0142",
                     rotation=90,
                     horizontalalignment="right",
                     verticalalignment="bottom")

            # Add number of observations
            plt.text(hist_x.min(), hist_y.max() * 0.9, s=f"Total number of offers={n_obs}",
                     horizontalalignment="left")

Example #30

0

Show file

File: packet_iat.py Project: mbakholdina/srt-packet-iat

def get_stats(s: pd.Series):
    """Calculate basic statistics of the sample `s`.

    Returns the list
    ``[q1, median, q3, p90, p95, p99, iqr, mean, std, min, max, n]``,
    with mean and std rounded to two decimals.
    """
    lower_quartile = s.quantile(0.25)
    upper_quartile = s.quantile(0.75)
    upper_tail = [s.quantile(level) for level in (0.90, 0.95, 0.99)]
    return [
        lower_quartile,
        s.median(),
        upper_quartile,
        *upper_tail,
        upper_quartile - lower_quartile,
        round(s.mean(), 2),
        round(s.std(), 2),
        s.min(),
        s.max(),
        len(s),
    ]

Example #31

0

Show file

def showNumericalInfo(data:pd.Series):
    '''
    @Description
    Print numerical summary information: name and dtype, number of missing
    values, number of unique values, max, min, mean, median, mode, and the
    ten most frequent values.
    ------------
    @Params
    data, Series
    '''
    print(data.name, data.dtype)

    # (label, statistic) pairs; each statistic is evaluated right before it
    # is printed.
    labelled_stats = (
        ("Miss:", lambda: data.isnull().sum()),
        ("Unique:", lambda: data.nunique()),
        ("Max:", lambda: data.max()),
        ("Min:", lambda: data.min()),
        ("Mean:", lambda: data.mean()),
        ("Median:", lambda: data.median()),
        ("Mode:", lambda: data.mode()[0]),
    )
    for label, compute in labelled_stats:
        print(label, compute())

    print(data.value_counts().head(n=10))

Example #32

0

Show file

File: data.py Project: yutiansut/qlib

def robust_zscore(x: pd.Series, zscore=False):
    """Robust ZScore Normalization

    Robust statistics replace the usual ones in the Z-Score:
        mean(x) = median(x)
        std(x) = MAD(x) * 1.4826

    The normalized values are clipped to [-3, 3]. With ``zscore`` set, the
    clipped values are additionally centered on their mean and divided by
    their standard deviation.

    Reference:
        https://en.wikipedia.org/wiki/Median_absolute_deviation.
    """
    centered = x - x.median()
    mad = centered.abs().median()
    scaled = np.clip(centered / mad / 1.4826, -3, 3)
    if not zscore:
        return scaled

    # Ordinary z-score on top of the clipped robust scores.
    scaled -= scaled.mean()
    scaled /= scaled.std()
    return scaled

Example #33

0

Show file

File: anomaly_detector.py Project: tuapuikia/hastic-server

 def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
     '''
     Build a smoothed bound for ``segment``.

     The segment, shifted so that its minimum is zero, is split at its median:
     for the upper bound the values above the median are kept, otherwise the
     values at or below it; the other values are replaced by the median.
     That part is exponentially smoothed and then raised by the largest
     absolute difference between the part and its smoothed version.
     Segments with fewer than two points are returned unchanged.
     '''
     if len(segment) < 2:
         return segment

     if bound == Bound.UPPER:
         keeps_value = operator.gt
     else:
         keeps_value = operator.le

     shifted = segment - segment.min()
     median = shifted.median()
     part = pd.Series(
         [val if keeps_value(val, median) else median for val in shifted.values],
         index = shifted.index)

     smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
     # Largest gap between the part and its smoothed version.
     max_diff = max([abs(x - y) for x, y in zip(part, smoothed_part)])

     raised = [val + max_diff for val in smoothed_part.values]
     return pd.Series(raised, index = shifted.index)

Example #34

0

Show file

def plot_monthly_returns(rets: pd.Series,
                         title: str = 'Retornos Mensais',
                         show_mean: bool = True,
                         show_median: bool = True,
                         size: tuple = (18, 6),
                         name: str = None,
                         is_return: bool = False) -> None:
    """Plot monthly returns as a bar plot.

    Negative returns are drawn in 'indianred', the others in 'blue'. The
    mean and the median are drawn as horizontal lines when requested. The
    plot is saved when 'name' is given.

    Args:
        rets (pd.Series): (monthly) returns.
        title (str, optional): plot title.
        show_mean (bool, optional): if True, also draws the mean.
        show_median (bool, optional): if True, also draws the median.
        size (tuple): figure size.
        name (str, optional): if given, the plot is saved as
            ``save_path + name + '.png'``.
        is_return (bool, optional): if True, the axes are returned instead
            of showing the figure.

    Returns:
        The axes when ``is_return`` is True, otherwise the result of
        ``plt.show()``.

    Raises:
        NameError: if ``name`` is an empty string.
    """
    # Checked up front, before any figure is created. Under ``if name:`` an
    # empty name could never reach the check.
    if isinstance(name, str) and not name:
        raise NameError('Nome da figura precisa ter ao menos um caracter.')

    colors = ['indianred' if r < 0 else 'blue' for r in rets]

    fig, ax = plt.subplots(figsize=size)
    rets.plot.bar(ax=ax, color=colors)

    if show_mean:
        ax.axhline(y=rets.mean(), ls=':', color='green', label='Média')
    if show_median:
        ax.axhline(y=rets.median(), ls='-', color='goldenrod', label='Mediana')

    if show_mean or show_median:
        plt.legend()

    plt.title(title)
    plt.ylabel('Percentual')

    if name:
        plt.savefig(save_path + str(name) + '.png', dpi=200)

    return ax if is_return else plt.show()

Example #35

0

Show file

File: anoms.py Project: indeedeng/anomaly-detection

def _esd(x, max_outlier, alpha, direction):
    """
    The ESD test using median and MAD in the calculation of the test statistic.

    At most ``max_outlier`` rounds are run. In each round the point with the
    largest statistic is compared against the critical value ``lam``; when it
    exceeds it, its label is recorded and the point is dropped before the
    next round. The loop stops early when the MAD is zero or when the
    statistic no longer exceeds ``lam``.

    :param x: data, converted to a ``Series``.
    :param max_outlier: maximum number of outliers to look for.
    :param alpha: significance level used for the Student-t quantile.
    :param direction: 'both' (absolute deviation), 'pos' (above the median)
        or 'neg' (below the median). NOTE(review): any other value leaves
        ``ares`` unassigned -- confirm that callers validate it.
    :return: list of index labels of the detected outliers, in detection order.
    """
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        # Median and scaled MAD of the points still under consideration.
        median = x.median()
        mad = np.median([abs(value - median) for value in x]) * _MAD_CONSTANT
        if mad == 0:
            break
        if direction == 'both':
            ares = x.map(lambda value: abs(value - median) / mad)
        elif direction == 'pos':
            ares = x.map(lambda value: (value - median) / mad)
        elif direction == 'neg':
            ares = x.map(lambda value: (median - value) / mad)
        # Most extreme remaining point and its statistic.
        r_idx = ares.idxmax()
        r = ares[r_idx]
        # The two-sided test splits alpha over both tails.
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
        else:
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n-i-1)
        # Critical value for this round.
        lam = (n-i)*crit / np.sqrt((n-i-1+crit**2) * (n-i+1))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s" %
                         (i, max_outlier, median, mad, r_idx, r, crit, lam))
        if r > lam:
            outlier_index.append(r_idx)
            x = x.drop(r_idx)
        else:
            # The r keeps decreasing while lam keeps increasing. Therefore, when r is less than lam for the first time,
            # we can stop.
            break
    return outlier_index

Example #36

0

Show file

File: make_trie.py Project: Rudloff/interest-dashboard

# Count how often each three-letter prefix occurs among ``lines``
# (Python 2 script).
trigrams = {}
for line in lines:
    # First three characters of the stripped, lower-cased line.
    trigram = line.strip().lower()[0:3]
    # Skip prefixes shorter than three characters or matched by ``nonalphabet``.
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1

# Counts as a Series indexed by trigram, sorted in ascending order.
trigram_series = Series(trigrams.values(), index=trigrams.keys())
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(trigram_series.quantile([.25, .50, .75, .99]).to_string())

print "median is: {0}".format(trigram_series.median())
unique_trigrams = []
for trigram, count in trigrams.iteritems():
    if count > trigram_series.quantile(.50):
        unique_trigrams.append(trigram)
    # NOTE(review): this second append runs for every trigram, so trigrams
    # above the median end up in the list twice -- confirm whether the
    # median filter or the unconditional append is the intended behaviour.
    unique_trigrams.append(trigram)

# Persist the collected trigrams as JSON.
print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))

trie = {}
for trigram in unique_trigrams:
    current_dict = trie
    for index, letter in enumerate(trigram):

Example #37

0

Show file

File: ipython_notebook_examples.py Project: Najah-lshanableh/learning

# <codecell>

# Build a Series from ``d`` using the given labels, in that order, as index.
Series(d, index=['b', 'c', 'd', 'a'])

# <codecell>

# Select a single element of ``s`` with the key 1.
s[1]

# <codecell>

# Slice: the first three elements.
s[:3]   

# <codecell>

# Boolean mask: keep only the values above the median.
s[s > s.median()]

# <codecell>

# NumPy functions can be applied to a Series directly.
np.exp(s)

# <codecell>

# Assign a value under the label 'evan', then display the Series.
s['evan']=9
s

# <codecell>

# Membership test against ``s``.
'bob' in s

# <codecell>

Example #38

0

Show file

File: pandas-demo.py Project: spartanem/python-for-researchers

# A data series does not have to be of a homogeneous type.
# However, many of the manipulations you will perform probably assume
# homogeneity.
s = Series( { "a": 42, "b": "foo", "c": 42J } )
pprint( s )

# <demo> --- stop ---

# A series of five values drawn with randn.
s = Series(
    randn( 5 )
)
pprint( s )

# Data series are not only Python sequences but also act like NumPy 'ndarray'
# objects: here a boolean mask selects the values above the median, and a
# NumPy function is applied to the whole series.
print( s[s > s.median()] )
pprint( np.exp(s) )

# Data series act like Python 'dict' objects as well.
print( 1 in s )

# Labels with no corresponding values use NaN for their missing values.
# This is true both during series initialization and alignment.
pprint( s[ 1 : ] + s[ : -1 ] )

# <demo> --- stop ---

# 'DataFrame' objects are 2D.
# They can be created from a 'dict' of Series objects, a 2D array, etc...

df = DataFrame( { "col1": randn( 4 ), "col2": randn( 4 ) } )