Example #1
    def test_describe_objects(self):
        s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
        result = s.describe()
        expected = Series({
            'count': 7,
            'unique': 4,
            'top': 'a',
            'freq': 3
        },
                          index=result.index)
        assert_series_equal(result, expected)

        dt = list(self.ts.index)
        dt.append(dt[0])
        ser = Series(dt)
        rs = ser.describe()
        min_date = min(dt)
        max_date = max(dt)
        xp = Series(
            {
                'count': len(dt),
                'unique': len(self.ts.index),
                'first': min_date,
                'last': max_date,
                'freq': 2,
                'top': min_date
            },
            index=rs.index)
        assert_series_equal(rs, xp)
Example #2
    def __setstate__(self, state):
        _d = state
        udict.clear(self)
        for el in _d:
            k, (t, i, d, s, f) = el
            if t == 'S' or t == 'F':
                s = i
                l = 1
                LOGGER.debug('Read %s from pickle is %s %s', k, l, type(s))
            elif t == 'A':
                s = Series(d, index=i)
                l = len(s)
                LOGGER.debug('Read %s from pickle is %s %s %s', k, l, type(s), s.index)
                LOGGER.debug("%s", s.describe())
            elif t == 'T':
                pr = period_range(s, periods=len(d), freq=f)
                s = Series(d, index=pr)
                l = len(s)
                LOGGER.debug('Read %s from pickle is %s %s %s', k, l, type(s), s.index)
                LOGGER.debug("%s", s.describe())
            else:
                raise E4tSystemError("ED_001: Cannot set state for %s" % k)
            self[k] = s

        self._mk_environment()
Example #3
    def test_describe(self):
        s = Series([0, 1, 2, 3, 4], name="int_data")
        result = s.describe()
        expected = Series(
            [5, 2, s.std(), 0, 1, 2, 3, 4],
            name="int_data",
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_series_equal(result, expected)

        s = Series([True, True, False, False, False], name="bool_data")
        result = s.describe()
        expected = Series([5, 2, False, 3],
                          name="bool_data",
                          index=["count", "unique", "top", "freq"])
        tm.assert_series_equal(result, expected)

        s = Series(["a", "a", "b", "c", "d"], name="str_data")
        result = s.describe()
        expected = Series([5, 4, "a", 2],
                          name="str_data",
                          index=["count", "unique", "top", "freq"])
        tm.assert_series_equal(result, expected)

        s = Series(
            [
                Timedelta("1 days"),
                Timedelta("2 days"),
                Timedelta("3 days"),
                Timedelta("4 days"),
                Timedelta("5 days"),
            ],
            name="timedelta_data",
        )
        result = s.describe()
        expected = Series(
            [5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]],
            name="timedelta_data",
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_series_equal(result, expected)

        s = Series(
            [
                Period("2020-01", "M"),
                Period("2020-01", "M"),
                Period("2019-12", "M")
            ],
            name="period_data",
        )
        result = s.describe()
        expected = Series(
            [3, 2, s[0], 2],
            name="period_data",
            index=["count", "unique", "top", "freq"],
        )
        tm.assert_series_equal(result, expected)
Example #4
    def test_describe_categorical(self):
        df = DataFrame({"value": np.random.randint(0, 10000, 100)})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        df["value_group"] = pd.cut(df.value,
                                   range(0, 10500, 500),
                                   right=False,
                                   labels=cat_labels)
        cat = df

        # Categoricals should not show up together with numerical columns
        result = cat.describe()
        assert len(result.columns) == 1

        # In a frame, describe() for the cat should be the same as for string
        # arrays (count, unique, top, freq)

        cat = Categorical(["a", "b", "b", "b"],
                          categories=["a", "b", "c"],
                          ordered=True)
        s = Series(cat)
        result = s.describe()
        expected = Series([4, 2, "b", 3],
                          index=["count", "unique", "top", "freq"])
        tm.assert_series_equal(result, expected)

        cat = Series(Categorical(["a", "b", "c", "c"]))
        df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
        result = df3.describe()
        tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
Example #5
def single_mean_test(sample: pd.Series, mu_0: float,
                     alternative: str) -> Dict[str, float]:
    """Performs a single mean test

    Args:
        sample: Numeric variable with the values in a Pandas Series
        mu_0: Mean from the Null Hypothesis
        alternative: Defines the alternative hypothesis. Possible values: 'less', 'greater', or 'two-sided'.

    Returns:
        Dict with the calculated "t" parameter and the p-value
    """
    _statistics = sample.describe()
    _SE = _statistics['std'] / np.sqrt(_statistics['count'])
    t = (_statistics['mean'] - mu_0) / _SE
    df = _statistics['count'] - 1
    validate_conditions_for_theoretical_distns(inference_type='single-mean',
                                               n=_statistics['count'])
    return {
        't': t,
        'p-value': get_p_value(t,
                               distribution='t',
                               alternative=alternative,
                               df=df)
    }
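
As a sanity check, the t statistic above should agree with scipy.stats.ttest_1samp on the same data; a minimal sketch with hypothetical values (get_p_value and validate_conditions_for_theoretical_distns belong to the surrounding module and are not needed here):

import numpy as np
import pandas as pd
from scipy import stats as st

sample = pd.Series([9.9, 10.1, 10.0, 10.3, 9.8, 10.2])  # hypothetical data
t_scipy, p_two_sided = st.ttest_1samp(sample, popmean=10.0)
desc = sample.describe()
t_manual = (desc['mean'] - 10.0) / (desc['std'] / np.sqrt(desc['count']))
assert np.isclose(t_scipy, t_manual)  # same statistic either way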
Example #6
    def test_describe_bools(self):
        ser = Series([True, True, False, False, False], name="bool_data")
        result = ser.describe()
        expected = Series(
            [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"]
        )
        tm.assert_series_equal(result, expected)
Example #7
def generate_stats(x1: pd.Series, x2: pd.Series) -> pd.DataFrame:
    """Generate summary statistic to compare APC off versus APC on.

    Args:
        x1 (pd.Series): Series for APC off data.
        x2 (pd.Series): Series for APC on data.

    Returns:
        pd.DataFrame: Dataframe of summary statistics to compare the two series.
    """
    data_for_stats = {"APC OFF": x1.describe(), "APC ON": x2.describe()}
    data_stats = pd.DataFrame(data_for_stats)
    data_stats2 = data_stats.transpose()
    data_stats2.insert(loc=1,
                       column="% count",
                       value=data_stats.loc["count"] /
                       sum(data_stats.loc["count"]) * 100)

    data_stats2["low_fence"] = data_stats.loc["25%"] - 1.5 * (
        data_stats.loc["75%"] - data_stats.loc["25%"])
    data_stats2["high_fence"] = data_stats.loc["75%"] + 1.5 * (
        data_stats.loc["75%"] - data_stats.loc["25%"])
    data_stats2["data min"] = np.where(
        data_stats2["low_fence"] > data_stats2["min"],
        data_stats2["low_fence"], data_stats2["min"])
    data_stats2["data max"] = np.where(
        data_stats2["high_fence"] < data_stats2["max"],
        data_stats2["high_fence"], data_stats2["max"])

    data_stats2.insert(
        loc=3,
        column="mean \u0394",
        value=data_stats.loc["mean", ].diff(),
    )
    data_stats2.insert(
        loc=4,
        column="% mean \u0394",
        value=data_stats.loc["mean", ].pct_change() * 100,
    )
    data_stats2.insert(
        loc=6,
        column="% std \u0394",
        value=data_stats.loc["std", ].pct_change() * 100,
    )

    data_stats2.drop(["low_fence", "high_fence"], axis="columns", inplace=True)
    return data_stats2
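
A usage sketch with synthetic data (the distributions below are hypothetical stand-ins for real APC measurements):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
apc_off = pd.Series(rng.normal(10.0, 2.0, 500))  # hypothetical APC-off readings
apc_on = pd.Series(rng.normal(9.5, 1.8, 500))    # hypothetical APC-on readings
print(generate_stats(apc_off, apc_on))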
Example #8
def summary(
    many_values: List[List[float]],
    days_per_simulation: int,
) -> Series:
    rois = Series([roi(values) for values in many_values])
    rois_desc = rois.describe()
    rois_desc['sterling_ratio'] = sterling_ratio(many_values, days_per_simulation)
    return rois_desc
Example #9
    def test_describe_strs(self):

        ser = Series(["a", "a", "b", "c", "d"], name="str_data")
        result = ser.describe()
        expected = Series(
            [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"]
        )
        tm.assert_series_equal(result, expected)
Example #10
    def test_describe_ints(self):
        ser = Series([0, 1, 2, 3, 4], name="int_data")
        result = ser.describe()
        expected = Series(
            [5, 2, ser.std(), 0, 1, 2, 3, 4],
            name="int_data",
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_series_equal(result, expected)
Example #11
    def test_describe_objects(self):
        s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
        result = s.describe()
        expected = Series({'count': 7, 'unique': 4,
                           'top': 'a', 'freq': 3}, index=result.index)
        assert_series_equal(result, expected)

        dt = list(self.ts.index)
        dt.append(dt[0])
        ser = Series(dt)
        rs = ser.describe()
        min_date = min(dt)
        max_date = max(dt)
        xp = Series({'count': len(dt),
                     'unique': len(self.ts.index),
                     'first': min_date, 'last': max_date, 'freq': 2,
                     'top': min_date}, index=rs.index)
        assert_series_equal(rs, xp)
Example #12
    def _crunch_all(self, unit):
        """Call all statistic-calculating methods for each unit with data."""

        unit.calculate_GVI_and_PGS()

        s = Series(unit.just_readings)

        unit.summary = s.describe()

        unit.median = s.median()
Example #14
    def test_describe_empty(self):
        result = pd.Series().describe()

        self.assertEqual(result['count'], 0)
        self.assertTrue(result.drop('count').isnull().all())

        nanSeries = Series([np.nan])
        nanSeries.name = 'NaN'
        result = nanSeries.describe()
        self.assertEqual(result['count'], 0)
        self.assertTrue(result.drop('count').isnull().all())
Example #16
    def test_describe_period(self):
        ser = Series(
            [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")],
            name="period_data",
        )
        result = ser.describe()
        expected = Series(
            [3, 2, ser[0], 2],
            name="period_data",
            index=["count", "unique", "top", "freq"],
        )
        tm.assert_series_equal(result, expected)
Example #17
    def __init__(self, column: pd.Series):
        super().__init__(column)
        self.description = column.describe()
        self.mean = self.description.at["mean"]
        self.std_dev = self.description.at["std"]
        self.min = self.description.at["min"]
        self.max = self.description.at["max"]

        # refers to interquartile range (IQR)
        self.q1 = self.description.at["25%"]
        self.median = self.description.at["50%"]
        self.q3 = self.description.at["75%"]
        self.iqr = self.q3 - self.q1
Example #18
def condition_stat(start_date, end_date, index_code, condition_num):
    """
    Given a date range and a condition value for US-market moves, report how
    the domestic index specified by index_code behaved on those dates.
    """
    conn = connect_data_source()
    doom_data = find_condition_date_usa(start_date, end_date, condition_num)
    select_date_time_list = doom_data.index
    open_price_change_list = []
    day_price_change_list = []
    for selected_date in select_date_time_list:
        open_price_change, day_price_change = trading_day_state(
            index_code, selected_date, conn)
        open_price_change_list.append(open_price_change)
        day_price_change_list.append(day_price_change)
    open_price_change_series = Series(open_price_change_list)
    day_price_change_series = Series(day_price_change_list)
    open_price_change_series.hist()
    day_price_change_series.hist()
    print(open_price_change_series.describe())
    print(day_price_change_series.describe())
    print(sum(day_price_change_series > 0))
    return open_price_change_series, day_price_change_series
Example #19
    def test_describe(self):
        s = Series([0, 1, 2, 3, 4], name="int_data")
        result = s.describe()
        expected = Series(
            [5, 2, s.std(), 0, 1, 2, 3, 4],
            name="int_data",
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_series_equal(result, expected)

        s = Series([True, True, False, False, False], name="bool_data")
        result = s.describe()
        expected = Series([5, 2, False, 3],
                          name="bool_data",
                          index=["count", "unique", "top", "freq"])
        tm.assert_series_equal(result, expected)

        s = Series(["a", "a", "b", "c", "d"], name="str_data")
        result = s.describe()
        expected = Series([5, 4, "a", 2],
                          name="str_data",
                          index=["count", "unique", "top", "freq"])
        tm.assert_series_equal(result, expected)
Example #20
def main():
    url = "http://%s:7080%s" % (FLAGS.solr_host, SOLR_URL)
    #import pdb; pdb.set_trace()
    results = simplejson.loads(download(url))
    db = get_db_engine()
    counts = []
    for doc in results['response']['docs']:
        item_id = doc['item_id']
        count = db.execute("select count(id) from favourite where itemid=%s and acttime>'2012-12-01' and favstatus=1 and firstchoose=0;" % item_id)
        if count.rowcount:
            counts.append(list(count)[0][0])
        else:
            counts.append(0)
    cs = Series(counts)
    logger.info(cs.describe())
Example #21
    def test_datetime_is_numeric_includes_datetime(self):
        s = Series(date_range("2012", periods=3))
        result = s.describe(datetime_is_numeric=True)
        expected = Series(
            [
                3,
                Timestamp("2012-01-02"),
                Timestamp("2012-01-01"),
                Timestamp("2012-01-01T12:00:00"),
                Timestamp("2012-01-02"),
                Timestamp("2012-01-02T12:00:00"),
                Timestamp("2012-01-03"),
            ],
            index=["count", "mean", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_series_equal(result, expected)
Example #22
def print_not_rarefying(dat: str, tsv_sam_sum: pd.Series) -> None:
    """
    Parameters
    ----------
    dat: str
        Dataset name
    tsv_sam_sum : pd.Series
        Sum of reads per sample
    """
    print('[%s] Second quantile of the reads-per-sample '
          'distribution is <1000' % dat)
    print('- The sequencing might have failed! Analyze with caution')
    print('- reads-per-sample distribution described:')
    for x, y in tsv_sam_sum.describe().to_dict().items():
        print('\t%s: %s' % (x, round(y, 3)))
    print('!!! NOT RAREFYING %s !!!' % dat)
Example #23
    def test_describe_empty_object(self):
        # https://github.com/pandas-dev/pandas/issues/27183
        s = Series([None, None], dtype=object)
        result = s.describe()
        expected = Series(
            [0, 0, np.nan, np.nan],
            dtype=object,
            index=["count", "unique", "top", "freq"],
        )
        tm.assert_series_equal(result, expected)

        result = s[:0].describe()
        tm.assert_series_equal(result, expected)
        # ensure NaN, not None
        assert np.isnan(result.iloc[2])
        assert np.isnan(result.iloc[3])
Example #24
def custom_series_function(ser: pd.Series,
                           within: float) -> pd.Series:
    """A more challenging mask to apply.
    When passed a series of floats, return all values
        within the given rage of:
         - the minimum value
         - the 1st quartile value
         - the second quartile value
         - the mean
         - the third quartile value
         - the maximum value
    You may want to brush up on some simple statistics to help you here.
    Also, the series is passed to you sorted assending.
        Be sure that you don't return values out of sequence.

    So, for example if you mean is 5.0 and within is 0.1
        return all value between 4.9 and 5.1 inclusive

    :param ser: Series to perform operation on
    :param within: The value to calculate the range of number within
    """
    def value_filter(value, stats, within):
        # What I forgot to mention in the original solution is that
        #  in this context, value is the value of the row we are comparing

        # The statistics from describe are passed in as a dictionary
        for k, v in stats.items():
            # Skip the count and the standard deviation
            if k in ("count", "std"):
                continue

            # For clarity, create the min / max range
            range_min = v - within
            range_max = v + within

            # If the current value is in the range return true
            if range_min <= value <= range_max:
                return True

        # Return False if the value does not match any of the ranges
        return False

    # Create a dictionary of the series statistics
    measures = ser.describe().to_dict()

    return ser[ser.apply(value_filter, args=(measures, within))]
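
A usage sketch (hypothetical data; the series must already be sorted ascending, as the docstring notes):

import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
print(custom_series_function(ser, within=0.5))  # values within 0.5 of min, q1, median, mean, q3 or max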
Example #25
def gaussian_noise_fct(numSamples):
    #Seed for the random Number generator
    seed(1)
    #Gaussian White noise series
    gaussNoise = [gauss(0.0, 1.0) for i in range(numSamples)]
    gaussNoise = Series(gaussNoise)

    #Print gaussian noise information
    print("\n--- GAUSSIAN WHITE NOISE Information ---")
    print("Mean must be near 0.0 and Standard Deviation must be near 1.0")
    print(gaussNoise.describe())

    xMax = int(round(max(gaussNoise)))
    xMin = int(round(min(gaussNoise)))
    gaussMod = [((gaussNoise[i] - xMin) / (xMax - xMin)) * (1980 - 0) + 0
                for i in range(numSamples)]
    return gaussMod
Example #26
def get_datasets_raref_evals(sam_sum: pd.Series) -> set:
    """

    Parameters
    ----------
    sam_sum : pd.Series
        Sum of reads per sample

    Returns
    -------
    datasets_raref_evals : set
    """
    # positions 4:-1 keep only the percentile rows of describe()'s
    # output, skipping count/mean/std/min and the trailing max
    datasets_raref_evals = set([
        int(x) for x in sam_sum.describe(
            percentiles=[x / 100 for x in range(10, 101, 10)])[4:-1]
    ])
    return datasets_raref_evals
Example #27
def single_mean_interval(sample: pd.Series, ci: float) -> Tuple[float, float]:
    """

    Args:
        sample: Numeric variable with the values in a Pandas Series
        ci: Level of confidence for the interval as a real number between 0 and 1. i.e. 0.90 for a 90% interval

    Returns:
        Tuple with the start and end values of the interval.
    """
    _statistics = sample.describe()
    _SE = _statistics['std'] / np.sqrt(_statistics['count'])
    df = _statistics['count'] - 1
    t_star = st.t.ppf((1 + ci) / 2, df=df)  # upper-tail critical value, so the interval comes out ordered (low, high)
    _ME = t_star * _SE
    validate_conditions_for_theoretical_distns(inference_type='single-mean', n=_statistics['count'])
    return _statistics['mean'] - _ME, _statistics['mean'] + _ME
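
The same interval can be reproduced directly with scipy's t distribution; a minimal sketch with hypothetical data:

import numpy as np
import pandas as pd
from scipy import stats as st

sample = pd.Series([9.9, 10.1, 10.0, 10.3, 9.8, 10.2])  # hypothetical data
desc = sample.describe()
se = desc['std'] / np.sqrt(desc['count'])  # standard error of the mean
low, high = st.t.interval(0.95, df=desc['count'] - 1, loc=desc['mean'], scale=se)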
Example #28
    def test_describe_timedelta64(self):
        ser = Series(
            [
                Timedelta("1 days"),
                Timedelta("2 days"),
                Timedelta("3 days"),
                Timedelta("4 days"),
                Timedelta("5 days"),
            ],
            name="timedelta_data",
        )
        result = ser.describe()
        expected = Series(
            [5, ser[2], ser.std(), ser[0], ser[1], ser[2], ser[3], ser[4]],
            name="timedelta_data",
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_series_equal(result, expected)
Example #29
def boxplot_summary(column: pd.Series):
    """Get summary stats of box-plot
    
    Arguments:
    ----------
    column: pd.Series
        Input column for summary
        
    Returns:
    --------
    Summary of the box-plot
    
    Example:
    --------
    >>> boxplot_summary(df['column'])

    Reference:
    ----------
    Skewness: Skewness for a normal distribution is zero.
              Any symmetric data should have skewness near zero.
              Negative values indicate the data is skewed left;
              positive values indicate the data is skewed right.

    Kurtosis: Kurtosis for the standard normal distribution is
              0 if Fisher's definition is used,
              3 if Pearson's definition is used.
              Under Fisher's definition, positive kurtosis
              indicates heavy tails and negative kurtosis
              indicates light tails.
    """
    col_desc = column.describe()
    q1 = col_desc.loc['25%']
    q3 = col_desc.loc['75%']
    iqr = q3 - q1
    lower_extreme = q1 - 1.5*iqr
    upper_extreme = q3 + 1.5*iqr
    col_desc.at['lower-extreme'] = lower_extreme
    col_desc.at['upper-extreme'] = upper_extreme
    col_desc.at['median'] = column.median()
    col_desc.at['iqr'] = iqr
    col_desc.at['skewness'] = skew(column)
    col_desc.at['kurtosis'] = kurtosis(column)
    print(col_desc)
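
A usage sketch with hypothetical data (boxplot_summary assumes skew and kurtosis were imported from scipy.stats):

import pandas as pd
from scipy.stats import kurtosis, skew

boxplot_summary(pd.Series([1.0, 2.0, 2.0, 3.0, 4.0, 9.0]))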
Example #30
    def get_numerical_stats(cls,
                            column: pd.Series,
                            column_baseline: Dict = None):
        describe = column.describe().to_dict()

        quantiles = cls.get_quantiles(column=column)
        distribution = cls.get_numerical_distribution(
            column=column, column_baseline=column_baseline)

        stats = {
            "mean": describe["mean"],
            "sum": float(column.sum()),
            "std_dev": describe["std"],
            "min": describe["min"],
            "max": describe["max"],
            "quantiles": quantiles,
            "distribution": distribution,
        }

        return stats
Example #31
def summary_stats(series: pd.Series) -> pd.Series:
    '''Produce univariate summary statistics for a numerical series.

    Provides quartiles (q1, median and q3 respectively), mean, standard
    deviation (std), skewness (skew), kurtosis (kurt) and extremes (min, max).
    Note that for very short series, the higher moments (std, skew, kurt)
    might come out as NaN.

    :param series: A numerical series to compute summary statistics for.
    '''
    sumstat = series.describe().drop('count')
    # rename quartiles
    index = sumstat.index.tolist()
    index[index.index('25%'):index.index('75%')+1] = ['q1', 'median', 'q3']
    sumstat.index = index
    # add what pandas describe does not provide
    for key in SUMMARY_STATS:
        if key not in index:
            sumstat[key] = getattr(series, key)()
    return sumstat
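
A usage sketch; SUMMARY_STATS is defined elsewhere in the module, so the tuple below is an assumed stand-in naming Series methods:

import pandas as pd

SUMMARY_STATS = ('mean', 'std', 'min', 'max', 'skew', 'kurt')  # assumed definition
print(summary_stats(pd.Series([1.0, 2.0, 2.5, 3.0, 10.0])))  # q1/median/q3 replace the 25%/50%/75% labels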
Example #32
def plot_noise():
    # seed random number generator
    seed(30)
    # create white noise series
    series = [gauss(0.0, 1.0) for i in range(50)]
    series = Series(series)
    # summary stats
    print(series.describe())
    # prelims for subplots
    fig, ax = plt.subplots(nrows=2, ncols=2)
    # line plot
    series.plot(ax=ax[0, 0])
    ax[0, 0].set_title('White Noise')
    # histogram plot
    series.hist(ax=ax[0, 1])
    ax[0, 1].set_title('Noise Histogram')
    # autocorrelation
    from pandas.plotting import autocorrelation_plot
    autocorrelation_plot(series, ax=ax[1, 0])
    plt.tight_layout()
    plt.show()
Example #33
    def test_describe_with_tz(self, tz_naive_fixture):
        # GH 21332
        tz = tz_naive_fixture
        name = str(tz_naive_fixture)
        start = Timestamp(2018, 1, 1)
        end = Timestamp(2018, 1, 5)
        s = Series(date_range(start, end, tz=tz), name=name)
        result = s.describe()
        expected = Series(
            [
                5,
                5,
                s.value_counts().index[0],
                1,
                start.tz_localize(tz),
                end.tz_localize(tz),
            ],
            name=name,
            index=["count", "unique", "top", "freq", "first", "last"],
        )
        tm.assert_series_equal(result, expected)
Example #34
    def test_describe_tz_values2(self):
        tz = "CET"
        s1 = Series(range(5))
        start = Timestamp(2018, 1, 1)
        end = Timestamp(2018, 1, 5)
        s2 = Series(date_range(start, end, tz=tz))
        df = DataFrame({"s1": s1, "s2": s2})

        s1_ = s1.describe()
        s2_ = Series(
            [
                5,
                5,
                s2.value_counts().index[0],
                1,
                start.tz_localize(tz),
                end.tz_localize(tz),
            ],
            index=["count", "unique", "top", "freq", "first", "last"],
        )
        idx = [
            "count",
            "unique",
            "top",
            "freq",
            "first",
            "last",
            "mean",
            "std",
            "min",
            "25%",
            "50%",
            "75%",
            "max",
        ]
        expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]

        with tm.assert_produces_warning(FutureWarning):
            result = df.describe(include="all")
        tm.assert_frame_equal(result, expected)
Example #35
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print 'Sum'
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
              index = ['a', 'b', 'c', 'd'],
              columns = ['one', 'two'])
print df
print df.sum()  # sum by column
print df.sum(axis = 1)  # sum by row
print

print 'Mean'
print df.mean(axis = 1, skipna = False)
print df.mean(axis = 1)
print

print 'Other'
print df.idxmax()
print df.cumsum()
print df.describe()
obj = Series(['a', 'a', 'b', 'c'] * 4)
print obj.describe()
Example #36
    def test_describe_none(self):
        noneSeries = Series([None])
        noneSeries.name = 'None'
        assert_series_equal(noneSeries.describe(),
                            Series([0, 0], index=['count', 'unique']))
Example #37
if __name__ == "__main__":
    _files = glob.glob(r"{path}\*{file_ext}".format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split(".")[0] for _fp in _files}

    sb_all = DataFrame()

    for _file in _files:

        fp, trace = parse_file(_file)

        total_duration = trace.duration if INTERVAL is None else INTERVAL
        ss = Series(
            (event.interval.duration * 1000 for event in trace.android.input_latencies(TOUCH_IRQ, interval=INTERVAL))
        )
        summary = ss.describe()
        summary["90%"] = ss.quantile(0.9)
        summary["Janks Per Second"] = trace.android.jankrate(interval=INTERVAL)
        summary["Average FPS"] = trace.android.framerate(interval=INTERVAL)

        ss_first = Series(
            (
                event.interval.duration * 1000
                for event in trace.android.input_latencies(TOUCH_IRQ, interval=INTERVAL)
                if trace.cpu.frequency_intervals(cpu=0, interval=event.interval)
                and trace.cpu.frequency_intervals(cpu=0, interval=event.interval)[0] == 384000
            )
        )
        summary_first = ss_first.describe()
        summary_first["90%"] = ss_first.quantile(0.9)
        summary_first["Janks Per Second"] = summary["Janks Per Second"]
Example #38
print df.describe()  # compute summary statistics for each DataFrame column
'''
            one      two
count  3.000000  2.00000
mean   2.666667  2.50000
std    3.785939  2.12132
min    0.000000  1.00000
25%         NaN      NaN
50%         NaN      NaN
75%         NaN      NaN
max    7.000000  4.00000
'''
obj = Series([2, 4, 8, 4], index=['a', 'a', 'b', 'c'])
print obj.describe()  # compute summary statistics for a Series
'''
count    4.000000
mean     4.500000
std      2.516611
min      2.000000
25%      3.500000
50%      4.000000
75%      5.000000
max      8.000000
dtype: float64
'''

print 'Deduplicate'
obj = Series(['c', 'a', 'd', 'b', 'b', 'c'])
Example #39
class GradeBook(object):
    """A class encapsulating a pandas DataFrame and meant to store
    the grades for a whole class. It provides the method compute_total_grades
    that compute the totla grade for each student according to a weights provided
    by the caller.
    """

    def __init__(self, grade_arr, student_ids, item_list, max_scores):
        """
        Constructor of the class grade frame:
        It should set the following attributes:

        (1) self.raw_grades, which is a DataFrame with
        - row labels given by student_ids
        - column labels given by item_list
        - values given by grade_arr

        (2) self.total_grades, set to None

        (3) self.letter_grades, set to None

        (4) self.max_scores, set to max_scores

        Parameters
        ----------
        grade_arr : numpy array of grades as returned by simulate_grades

        student_ids: a list of student ids

        item_list: a list of grade items (e.g. ['HW', 'M', 'F'])

        max_scores: a list of the maximum possible score for each grade item

        Returns
        -------
        nothing

        Examples
        --------
        >>> a = GradeBook(array([[1,2],[3,4]]),['22','34'],['F','M'],[30, 50])
        >>> type(a.raw_grades) == DataFrame
        True
        >>> a.total_grades == None
        True
        >>> a.raw_grades.shape == (2,2)
        True
        >>> a.raw_grades.iloc[0, 0] == 1
        True
        >>> a.max_scores[0] == 30
        True
        """
        self.total_grades = None
        self.letter_grades = None
        self.max_scores = max_scores
        self.student_ids = student_ids
        self.item_list = item_list
        self.grade_arr = grade_arr
        self.raw_grades = DataFrame(data = grade_arr, index = student_ids, columns = item_list)
        


    def compute_total_grades(self, item_weights=None, max_score=100):
        """
        Compute student total class grades as a weighted average of the columns in self.raw_grades,
        according to the weights passed in item_weights for each of the columns.
        The student total class grades are then stored in the Series attribute self.total_grades
        The return value should be a Series containing a numerical summary
        (as returned by the Series method describe) of the total class grade distribution.

        Parameters
        ----------
        item_weights: list of floats summing up to one
            List of weights to be applied to each grade item (e.g. [0.3, 0.4, 0.3])

        max_score: float
            Maximal possible score for the total class grade

        Returns
        -------
        out : Series
            A Series containing a numerical summary of the total
            grade distribution previously stored by the function
            in the attribute self.total_grades; this Series is the
            output of the Series method describe.
        ----

        Examples
        --------
        >>> a = GradeBook(array([[5,5],[1,1]]),['22','34'],['F','M'],[10, 10])
        >>> b = a.compute_total_grades([0.5, 0.5], 100)
        >>> len(b) == 5
        False
        >>> a.total_grades['22'] == 50
        True
        >>> a.total_grades['34'] == 10
        True
        """
        grades = []
        raw_sum = []
        for x in range(len(self.grade_arr)):
            for y in range(len(self.grade_arr[x])):
                self.grade_arr[x][y] = self.max_scores[y] * self.grade_arr[x][y]
            grades.append(self.grade_arr[x])
        for x in range(len(grades)):
            for y in range(len(grades[x])):
                grades[x][y] = grades[x][y] * item_weights[y]
            raw_sum.append(sum(grades[x]))

        self.total_grades = Series(raw_sum, index=self.student_ids)
        return self.total_grades.describe()
Example #40
class HisRecord():
    """
    This class is a single record
    - hisId is the haystack Id of the trend
    - data is created as DataFrame to be used directly in Pandas
    """
    def __init__(self,session,hisId,dateTimeRange='today'):
        """
        GET data from server and fill this object with historical info
        """
        self.hisId = hisId
        self.name = self.getHisNameFromId(session,self.hisId)
        index = []
        values = []

        for eachRows in session.read('hisRead?id='+self.hisId+'&range='+dateTimeRange)['rows']:
            index.append(pd.Timestamp(pd.to_datetime(datetime.datetime(*map(int, re.split('[^\d]', eachRows['ts'].split(' ')[0])[:-2])))))
            #This will allow conversion of Enum value to float so Pandas will work            
            
            
            if (eachRows['val'] == 'F'):
                values.append(False)
            elif (eachRows['val'] == 'T'):
                values.append(True)
            # regex coding here to extract float value when units are part of value (ex. 21.8381°C)
            elif tools.isfloat(re.findall(r"[-+]?\d*\.*\d+", eachRows['val'])[0]):
                values.append(float(re.findall(r"[-+]?\d*\.*\d+", eachRows['val'])[0]))    
            else:
                values.append(eachRows['val'])
        
        try:
            #Declare Series and localize using Site Timezone
            self.data = Series(values,index=index).tz_localize(session.timezone)
            #Renaming index so the name will be part of the series
            self.data = self.data.reindex(self.data.index.rename([self.name]))
        except Exception:
            print('%s is an Unknown history type' % self.hisId)
    
    def getHisNameFromId(self,session,pointId):
        """
        Retrieve name from id of an history
        """
        for each in session.read("read?filter=his")['rows']:
            if each['id'].split(' ',1)[0] == pointId:
                return (each['id'].split(' ',1)[1])
        return 'Id Not found'    
        
    def plot(self):
        """
        Draw a graph of the DataFrame
        """
        self.data.plot()
        
    def breakdownPlot(self, startTime = '08:00', endTime = '17:00', bins=np.array([0,0.5,1,18.0,18.5,19.0,19.5,20.0,20.5,21.0,21.5,22.0,22.5,23.0, 23.5, 24.0, 24.5,25.0])):
        """
        By default, creates a breakdown plot of temperature distribution between 18 and 25
        bins (distribution) can be past as argument
        By default, takes values between 8:00 and 17:00
        startTime = string representation of time (ex. '08:00')
        endtime = string representation of time (ex. '17:00')
        bin = np.array representing distribution
        """
        x = self.data.between_time(startTime,endTime)
        barplot = pd.cut(x.dropna(),bins)
        x.groupby(barplot).size().plot(kind='bar')
        #self.data.groupby(barplot).size()
    
    def simpleStats(self):
        """
        Shortcut for describe() pandas version
        """
        return self.data.describe()
        
    def __str__(self):
        return 'History Record of %s' % self.name
Example #41
def get_mode(arr):
    mode = []
    arr_appear = dict((a, arr.count(a)) for a in arr)  # count how often each element appears
    if max(arr_appear.values()) == 1:  # if the highest count is 1
        return  # there is no mode
    else:
        for k, v in arr_appear.items():  # otherwise, the most frequent values are the modes
            if v == max(arr_appear.values()):
                mode.append(k)
    return mode

get_mode(a)

var(a)
std(a)

a=Series(a)
a.skew()
a.kurt()
a.describe()

df = DataFrame({'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df.cov()
df.corr()

### Hypothesis testing
from scipy import stats as ss
df=DataFrame({'data':[10.1,10,9.8,10.5,9.7,10.1,9.9,10.2,10.3,9.9]})
ss.ttest_1samp(a = df, popmean = 10)
Example #42
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumultation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
price
volume = DataFrame({tic: data['Volume'] 
for tic, data in all_data.iteritems()})
# percent changes of the prices:
returns = price.pct_change()
Example #43
# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd

s=Series([1,2,3],index=['a','b','c'])
d=DataFrame([[1,2,3],[4,5,6]],columns=['a','b','c'])

#head() method will return top 5 records
print (s.head())
print (s.describe())
print (d.head())
print (d.describe())

# read data from an Excel file
excel_data = pd.read_excel("./server.xlsx")
print (excel_data.head())
Example #44
#**********************************
# Set ABOVE
#**********************************


def parse_file(filepath):
    trace = Ftrace(filepath)
    return (filepath, trace)

if __name__ == '__main__':
    _files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT))
    F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files}
    
    sb_all = DataFrame(columns=F_DICT.values())
    
    for _file in _files:
        
        fp, trace = parse_file(_file)

        total_duration = trace.duration if INTERVAL is None else INTERVAL
        ss = Series((event.interval.duration for event in trace.android.render_frame_intervals(interval=INTERVAL)))
        ss = ss * 1000.  # scale durations to milliseconds
        summary = ss.describe()
        summary['90%'] = ss.quantile(.9)
        summary['Janks'] = trace.android.num_janks(interval=INTERVAL)
        summary['Janks Per Second'] = summary['Janks']/total_duration
        summary['Average FPS'] = trace.android.framerate(interval=INTERVAL)
        sb_all[F_DICT[fp]] = summary
    
    sb_all.to_csv(r'{path}\frame_stats.csv'.format(path=PATH))
        
Example #45
print(df.mean(axis=1,skipna=False))
print('\n')
print(df.idxmax())
print('\n')
print(df.cumsum())
print('\n')
print(df.cumsum(axis=1))
print('\n')
print(df.describe())
print('\n')

###############################################################

obj = Series(['a','a','b','c']*4)
print(obj)
print(obj.describe())
print('\n')

###############################################################

all_data = {}

for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOGL']:
    all_data[ticker] = web.get_data_yahoo(ticker, '10/1/2015', '10/11/2015')

price = DataFrame({tic:data['Adj Close']
                            for tic, data in all_data.items()})
volume = DataFrame({tic:data['Volume']
                            for tic, data in all_data.items()})

print(price)
Example #46
def pd_05():
    obj=Series([7,-5,7,4,2,0,4])
    print obj.rank()
    print obj.rank(method='first')
    print obj.rank(ascending=False,method='first')
    print obj.describe()
Example #47
    def test_describe_objects(self):
        s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
        result = s.describe()
        expected = Series({'count': 7, 'unique': 4,
                           'top': 'a', 'freq': 3}, index=result.index)
        assert_series_equal(result, expected)
Example #48
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not numeric
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and Covariance
    all_data = {}
    lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst: #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, frequencies, membership
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
Example #49
## isnull and notnull

##-- isnull() returns a Series with the same indices containing Boolean values 
##--  indicating True for null values which include NaN and None, among others. 
		
##-- notnull() returns the negation of isnull() 
##--  that is, True for non-null values, and False otherwise.
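
## A quick illustration of both predicates (a sketch; pandas/numpy
## names assumed imported as elsewhere in this snippet):
s2 = Series([1.0, nan, 3.0, None])
s2.isnull()   # True for the nan and None entries
s2.notnull()  # the element-wise negation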


## describe() returns a simple set of summary statistics about a Series.
## The value returned is itself a Series, indexed by the names of the statistics.

########################################################

s1 = Series(arange(10.0,20.0))
s1.describe()
summ = s1.describe()
summ["mean"]
########################################################

## unique and nunique
## unique() returns the unique elements of a Series
## nunique() returns the number of unique values in a Series.

## drop and dropna
## drop(labels) drops elements with the selected labels from a Series.
s1 = Series(arange(1.0,6),index=["a","a","b","c","d"])
s1
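
## Continuing with s1, a quick sketch of the methods described above:
s1.unique()    # array([1., 2., 3., 4., 5.])
s1.nunique()   # 5
s1.drop("a")   # drops both elements labeled "a"
s1.dropna()    # unchanged here; removes NaN entries when present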
Example #50
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]
#[Out]#       INCIDENT DATE   LATITUDE   LONGITUDE
#[Out]# 0  05/07/2010 17:26  18.233333  -72.533333
#[Out]# 1  28/06/2010 23:06  50.226029    5.729886
#[Out]# 2  24/06/2010 16:21  22.278381  114.174287
#[Out]# 3  20/06/2010 21:59  44.407062    8.933989
#[Out]# 4  18/05/2010 16:26  18.571084  -72.334671
#[Out]# 5  26/04/2010 13:14  18.593707  -72.310079
#[Out]# 6  26/04/2010 14:19  18.482800  -73.638800
#[Out]# 7  26/04/2010 14:27  18.415000  -73.195000
#[Out]# 8  15/03/2010 10:58  18.517443  -72.236841
#[Out]# 9  15/03/2010 11:00  18.547790  -72.410010
#[Out]# 
#[Out]# [10 rows x 3 columns]
# Wed, 09 Jul 2014 00:38:24
data.describe()
#[Out]#             Serial     LATITUDE    LONGITUDE
#[Out]# count  3593.000000  3593.000000  3593.000000
#[Out]# mean   2080.277484    18.611495   -72.322680
#[Out]# std    1171.100360     0.738572     3.650776
#[Out]# min       4.000000    18.041313   -74.452757
#[Out]# 25%    1074.000000    18.524070   -72.417500
#[Out]# 50%    2163.000000    18.539269   -72.335000
#[Out]# 75%    3088.000000    18.561820   -72.293570
#[Out]# max    4052.000000    50.226029   114.174287
#[Out]# 
#[Out]# [8 rows x 3 columns]
# Wed, 09 Jul 2014 00:38:53
data['CATEGORY'][:6]
#[Out]# 0          1. Urgences | Emergency, 3. Public Health, 
#[Out]# 1    1. Urgences | Emergency, 2. Urgences logistiqu...