def test_describe_objects(self):
    """Check describe() on an object Series and on a Series of timestamps.

    The object case contains NaN entries, which are not counted.  The
    timestamp case repeats the first element of ``self.ts.index`` so that it
    is the most frequent value.
    """
    # --- object values -------------------------------------------------
    letters = Series(
        ['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
    result = letters.describe()
    expected_values = {'count': 7, 'unique': 4, 'top': 'a', 'freq': 3}
    expected = Series(expected_values, index=result.index)
    assert_series_equal(result, expected)

    # --- timestamp values ----------------------------------------------
    stamps = list(self.ts.index)
    stamps.append(stamps[0])
    rs = Series(stamps).describe()
    earliest, latest = min(stamps), max(stamps)
    expected_values = {
        'count': len(stamps),
        'unique': len(self.ts.index),
        'first': earliest,
        'last': latest,
        'freq': 2,
        'top': earliest,
    }
    xp = Series(expected_values, index=rs.index)
    assert_series_equal(rs, xp)
def __setstate__(self, state):
    """Rebuild the container from its pickled ``state``.

    ``state`` is iterated as ``(key, (t, i, d, s, f))`` entries.  The type
    tag ``t`` selects how the stored value is restored:

    * ``'S'`` / ``'F'`` -- the value is ``i`` itself.
    * ``'A'`` -- ``Series(d, index=i)``.
    * ``'T'`` -- ``Series(d)`` indexed by
      ``period_range(s, periods=len(d), freq=f)``.

    Raises:
        E4tSystemError: if an entry carries any other type tag.
    """
    # NOTE(review): the original unpacking ``self._missing, _d = state`` is
    # commented out, which would leave ``_d`` unbound.  The state is therefore
    # treated as the entry list itself -- confirm against __getstate__.
    _d = state
    udict.clear(self)
    for k, (t, i, d, s, f) in _d:
        if t in ('S', 'F'):
            s = i
            l = 1
            LOGGER.debug('Read %s from pickle is %s %s', k, l, type(s))
        elif t == 'A':
            s = Series(d, index=i)
            l = len(s)
            LOGGER.debug('Read %s from pickle is %s %s %s', k, l, type(s), s.index)
            LOGGER.debug("%s", s.describe())
        elif t == 'T':
            # Here ``s`` is still the pickled start of the period range.
            pr = period_range(s, periods=len(d), freq=f)
            s = Series(d, index=pr)
            l = len(s)
            LOGGER.debug('Read %s from pickle is %s %s %s', k, l, type(s), s.index)
            LOGGER.debug("%s", s.describe())
        else:
            raise E4tSystemError("ED_001: Cannot set state for %s", k)
        self[k] = s
    self._mk_environment()
def test_describe(self): s = Series([0, 1, 2, 3, 4], name="int_data") result = s.describe() expected = Series( [5, 2, s.std(), 0, 1, 2, 3, 4], name="int_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) s = Series([True, True, False, False, False], name="bool_data") result = s.describe() expected = Series([5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) s = Series(["a", "a", "b", "c", "d"], name="str_data") result = s.describe() expected = Series([5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) s = Series( [ Timedelta("1 days"), Timedelta("2 days"), Timedelta("3 days"), Timedelta("4 days"), Timedelta("5 days"), ], name="timedelta_data", ) result = s.describe() expected = Series( [5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]], name="timedelta_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) s = Series( [ Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M") ], name="period_data", ) result = s.describe() expected = Series( [3, 2, s[0], 2], name="period_data", index=["count", "unique", "top", "freq"], ) tm.assert_series_equal(result, expected)
def test_describe_categorical(self): df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) df["value_group"] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=cat_labels) cat = df # Categoricals should not show up together with numerical columns result = cat.describe() assert len(result.columns) == 1 # In a frame, describe() for the cat should be the same as for string # arrays (count, unique, top, freq) cat = Categorical(["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True) s = Series(cat) result = s.describe() expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) cat = Series(Categorical(["a", "b", "c", "c"])) df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) result = df3.describe() tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
def single_mean_test(sample: pd.Series, mu_0: float, alternative: str) -> Dict[str, float]:
    """Run a single-mean t test on ``sample``.

    Args:
        sample: Numeric variable with the values in a Pandas Series
        mu_0: Mean under the null hypothesis
        alternative: Alternative hypothesis, forwarded to ``get_p_value``.
            Possible values: 'less', 'greater', or 'two-sided'.

    Returns:
        Dict with the calculated "t" statistic and the "p-value".
    """
    summary = sample.describe()
    n_obs = summary['count']

    # Standard error of the mean, then the t statistic against mu_0.
    std_error = summary['std'] / np.sqrt(n_obs)
    t = (summary['mean'] - mu_0) / std_error
    degrees_of_freedom = n_obs - 1

    validate_conditions_for_theoretical_distns(inference_type='single-mean', n=n_obs)

    p_value = get_p_value(
        t, distribution='t', alternative=alternative, df=degrees_of_freedom)
    return {'t': t, 'p-value': p_value}
def test_describe_bools(self): ser = Series([True, True, False, False, False], name="bool_data") result = ser.describe() expected = Series( [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] ) tm.assert_series_equal(result, expected)
def generate_stats(x1: pd.Series, x2: pd.Series) -> pd.DataFrame:
    """Build a summary table comparing APC off versus APC on.

    Args:
        x1 (pd.Series): Series for APC off data.
        x2 (pd.Series): Series for APC on data.

    Returns:
        pd.DataFrame: One row per series ("APC OFF", "APC ON") holding the
        ``describe()`` statistics plus the share of the total count, the
        change in mean/std between the two rows, and the min/max clipped to
        the 1.5 * IQR fences.
    """
    described = pd.DataFrame({"APC OFF": x1.describe(), "APC ON": x2.describe()})

    # Rows of the describe() table, each indexed by "APC OFF"/"APC ON".
    counts = described.loc["count"]
    means = described.loc["mean", ]
    stds = described.loc["std", ]
    q1 = described.loc["25%"]
    q3 = described.loc["75%"]

    table = described.transpose()
    table.insert(loc=1, column="% count", value=counts / sum(counts) * 100)

    # Fences are temporary helper columns; they are dropped before returning.
    table["low_fence"] = q1 - 1.5 * (q3 - q1)
    table["high_fence"] = q3 + 1.5 * (q3 - q1)
    below = table["low_fence"] > table["min"]
    table["data min"] = np.where(below, table["low_fence"], table["min"])
    above = table["high_fence"] < table["max"]
    table["data max"] = np.where(above, table["high_fence"], table["max"])

    # Row-to-row deltas (first row is NaN by construction of diff/pct_change).
    table.insert(loc=3, column="mean \u0394", value=means.diff())
    table.insert(loc=4, column="% mean \u0394", value=means.pct_change() * 100)
    table.insert(loc=6, column="% std \u0394", value=stds.pct_change() * 100)

    return table.drop(["low_fence", "high_fence"], axis="columns")
def summary(
    many_values: List[List[float]],
    days_per_simulation: int,
) -> Series:
    """Describe the ROI of every simulation and append the Sterling ratio.

    ``roi`` is applied to each entry of ``many_values``; the resulting values
    are summarised with ``Series.describe()`` and the result of
    ``sterling_ratio(many_values, days_per_simulation)`` is stored under the
    extra label ``'sterling_ratio'``.
    """
    per_simulation_roi = list(map(roi, many_values))
    rois_desc = Series(per_simulation_roi).describe()
    ratio = sterling_ratio(many_values, days_per_simulation)
    rois_desc['sterling_ratio'] = ratio
    return rois_desc
def test_describe_strs(self): ser = Series(["a", "a", "b", "c", "d"], name="str_data") result = ser.describe() expected = Series( [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] ) tm.assert_series_equal(result, expected)
def test_describe_ints(self): ser = Series([0, 1, 2, 3, 4], name="int_data") result = ser.describe() expected = Series( [5, 2, ser.std(), 0, 1, 2, 3, 4], name="int_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected)
def test_describe_objects(self):
    """describe() on object values (with NaN) and on repeated timestamps."""
    # Object values: the three NaN entries do not contribute to the count.
    values = ['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']
    described = Series(values).describe()
    wanted = Series(
        {'count': 7, 'unique': 4, 'top': 'a', 'freq': 3}, index=described.index)
    assert_series_equal(described, wanted)

    # Timestamps: duplicate the first one so it occurs twice.
    dates = list(self.ts.index)
    dates.append(dates[0])
    described = Series(dates).describe()
    lowest = min(dates)
    highest = max(dates)
    wanted = Series(
        {
            'count': len(dates),
            'unique': len(self.ts.index),
            'first': lowest,
            'last': highest,
            'freq': 2,
            'top': lowest,
        },
        index=described.index)
    assert_series_equal(described, wanted)
def _crunch_all(self, unit): """Call all statistic-calculating methods for each unit with data.""" unit.calculate_GVI_and_PGS() s = Series(unit.just_readings) unit.summary = s.describe() unit.median = s.median()
def test_describe_empty(self): result = pd.Series().describe() self.assertEqual(result['count'], 0) self.assertTrue(result.drop('count').isnull().all()) nanSeries = Series([np.nan]) nanSeries.name = 'NaN' result = nanSeries.describe() self.assertEqual(result['count'], 0) self.assertTrue(result.drop('count').isnull().all())
def test_describe_period(self): ser = Series( [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")], name="period_data", ) result = ser.describe() expected = Series( [3, 2, ser[0], 2], name="period_data", index=["count", "unique", "top", "freq"], ) tm.assert_series_equal(result, expected)
def __init__(self, column: pd.Series):
    """Cache the describe() statistics of ``column`` as attributes.

    After the parent initialiser has run, the full summary is kept in
    ``self.description`` and the individual statistics (mean, std_dev, min,
    max, q1, median, q3, iqr) are exposed as separate attributes.
    """
    super().__init__(column)
    stats = column.describe()
    self.description = stats

    self.mean = stats.at["mean"]
    self.std_dev = stats.at["std"]
    self.min = stats.at["min"]
    self.max = stats.at["max"]

    # Quartiles; their spread is the interquartile range (IQR).
    self.q1 = stats.at["25%"]
    self.median = stats.at["50%"]
    self.q3 = stats.at["75%"]
    self.iqr = self.q3 - self.q1
def condition_stat(start_date, end_date, index_code, condition_num):
    """Summarise how ``index_code`` moved on the dates selected by a US-market condition.

    The dates come from ``find_condition_date_usa(start_date, end_date,
    condition_num)``; ``index_code`` identifies the domestic index whose
    opening and daily changes are collected for each of those dates.
    Histograms are drawn and describe() summaries printed for both series,
    followed by the number of dates with a positive daily change.

    Returns:
        Tuple ``(open_price_change_series, day_price_change_series)``.
    """
    conn = connect_data_source()
    doom_data = find_condition_date_usa(start_date, end_date, condition_num)

    # One (open change, day change) pair per selected date.
    changes = [
        trading_day_state(index_code, selected_date, conn)
        for selected_date in doom_data.index
    ]
    open_price_change_series = Series([open_chg for open_chg, _ in changes])
    day_price_change_series = Series([day_chg for _, day_chg in changes])

    open_price_change_series.hist()
    day_price_change_series.hist()
    print(open_price_change_series.describe())
    print(day_price_change_series.describe())
    print(sum(day_price_change_series > 0))
    return open_price_change_series, day_price_change_series
def test_describe(self): s = Series([0, 1, 2, 3, 4], name="int_data") result = s.describe() expected = Series( [5, 2, s.std(), 0, 1, 2, 3, 4], name="int_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) s = Series([True, True, False, False, False], name="bool_data") result = s.describe() expected = Series([5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) s = Series(["a", "a", "b", "c", "d"], name="str_data") result = s.describe() expected = Series([5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected)
def main():
    """Log describe() statistics of the favourite counts of the Solr result items.

    Downloads the Solr response, counts the matching ``favourite`` rows for
    every returned ``item_id`` and logs the summary of those counts.
    """
    url = "http://%s:7080%s" % (FLAGS.solr_host, SOLR_URL)
    results = simplejson.loads(download(url))
    db = get_db_engine()

    # NOTE(review): the item id is interpolated straight into the SQL text;
    # consider a parameterised query if item ids can ever be untrusted.
    query = "select count(id) from favourite where itemid=%s and acttime>'2012-12-01' and favstatus=1 and firstchoose=0;"

    counts = []
    for doc in results['response']['docs']:
        count = db.execute(query % doc['item_id'])
        counts.append(list(count)[0][0] if count.rowcount else 0)

    logger.info(Series(counts).describe())
def test_datetime_is_numeric_includes_datetime(self):
    """With datetime_is_numeric=True, datetimes get mean/min/quantiles/max."""
    dates = Series(date_range("2012", periods=3))
    result = dates.describe(datetime_is_numeric=True)

    labels = ["count", "mean", "min", "25%", "50%", "75%", "max"]
    values = [
        3,
        Timestamp("2012-01-02"),
        Timestamp("2012-01-01"),
        Timestamp("2012-01-01T12:00:00"),
        Timestamp("2012-01-02"),
        Timestamp("2012-01-02T12:00:00"),
        Timestamp("2012-01-03"),
    ]
    expected = Series(values, index=labels)
    tm.assert_series_equal(result, expected)
def print_not_rarefying(dat: str, tsv_sam_sum: pd.Series) -> None:
    """Print a warning that dataset ``dat`` will not be rarefied.

    The message includes the describe() statistics of the reads-per-sample
    distribution, each rounded to three decimals.

    Parameters
    ----------
    dat: str
        Dataset name
    tsv_sam_sum : pd.Series
        Sum of reads per sample
    """
    header = ('[%s] Second quantile of the reads-per-sample '
              'distribution is <1000' % dat)
    print(header)
    print('- The sequencing might have failed! Analyze with caution')
    print('- reads-per-sample distribution described:')

    stats = tsv_sam_sum.describe().to_dict()
    for label in stats:
        print('\t%s: %s' % (label, round(stats[label], 3)))

    print('!!! NOT RAREFYING %s !!!' % dat)
def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 s = Series([None, None], dtype=object) result = s.describe() expected = Series( [0, 0, np.nan, np.nan], dtype=object, index=["count", "unique", "top", "freq"], ) tm.assert_series_equal(result, expected) result = s[:0].describe() tm.assert_series_equal(result, expected) # ensure NaN, not None assert np.isnan(result.iloc[2]) assert np.isnan(result.iloc[3])
def custom_series_function(ser: pd.Series, within: int) -> pd.core.series.Series:
    """Return the values of ``ser`` that lie close to one of its summary points.

    A value is kept when it is within ``within`` (inclusive) of any of:

    - the minimum value
    - the first quartile
    - the second quartile (median)
    - the mean
    - the third quartile
    - the maximum value

    For example, if the mean is 5.0 and ``within`` is 0.1, every value
    between 4.9 and 5.1 inclusive is returned.  Values come back in their
    original order with their original index labels.

    :param ser: Series to perform operation on
    :param within: Half-width of the range around each summary point
    :return: The matching subset of ``ser``
    """
    # describe() also reports "count" and "std".  They are not positions in
    # the data, so they must not act as range centres.  (The original loop
    # tried to skip them with a bare ``next``, which is a no-op.)
    anchors = ser.describe().drop(["count", "std"], errors="ignore").tolist()

    def value_filter(value):
        # True when the row value falls inside any [anchor - within, anchor + within].
        return any(
            anchor - within <= value <= anchor + within for anchor in anchors
        )

    # astype(bool) keeps the mask boolean even for an empty series.
    mask = ser.apply(value_filter).astype(bool)
    return ser[mask]
def gaussian_noise_fct(numSamples):
    """Generate ``numSamples`` Gaussian white-noise values rescaled towards 0..1980.

    The random generator is re-seeded with 1 on every call, so the output is
    reproducible.  The describe() summary of the raw noise is printed.  The
    rescaling uses the rounded minimum and maximum of the noise as bounds.

    Returns:
        list with one rescaled value per sample.
    """
    # Seed for the random number generator
    seed(1)

    # Gaussian white noise series
    noise = Series([gauss(0.0, 1.0) for _ in range(numSamples)])

    # Print gaussian noise information
    print("\n--- GAUSSIAN WHITE NOISE Information ---")
    print("Mean must be near 0.0 and Standard Deviation must be near 1.0")
    print(noise.describe())

    upper = int(round(max(noise)))
    lower = int(round(min(noise)))

    rescaled = []
    for position in range(numSamples):
        fraction = (noise[position] - lower) / (upper - lower)
        rescaled.append(fraction * (1980 - 0) + 0)
    return rescaled
def get_datasets_raref_evals(sam_sum: pd.Series) -> set:
    """Collect the deciles of the reads-per-sample distribution as integers.

    Parameters
    ----------
    sam_sum : pd.Series
        Sum of reads per sample

    Returns
    -------
    datasets_raref_evals : set
        The 10%, 20%, ..., 100% percentile values truncated to int.
    """
    deciles = [pct / 100 for pct in range(10, 101, 10)]
    described = sam_sum.describe(percentiles=deciles)
    # Skip the four leading entries (count, mean, std, min) and the final max.
    datasets_raref_evals = {int(value) for value in described.iloc[4:-1]}
    return datasets_raref_evals
def single_mean_interval(sample: pd.Series, ci: float) -> Tuple[float, float]:
    """Compute a t-based confidence interval for the mean of ``sample``.

    Args:
        sample: Numeric variable with the values in a Pandas Series
        ci: Level of confidence for the interval as a real number between 0 and 1.
            i.e. 0.90 for a 90% interval

    Returns:
        Tuple with the start and end values of the interval (start <= end).
    """
    _statistics = sample.describe()
    _SE = _statistics['std'] / np.sqrt(_statistics['count'])
    df = _statistics['count'] - 1

    # Use the upper-tail quantile so the critical value is positive.  The
    # lower-tail quantile ppf((1 - ci) / 2) is negative and made the returned
    # tuple come out as (upper, lower).
    t_star = st.t.ppf(1 - (1 - ci) / 2, df=df)
    _ME = t_star * _SE

    validate_conditions_for_theoretical_distns(inference_type='single-mean', n=_statistics['count'])
    return _statistics['mean'] - _ME, _statistics['mean'] + _ME
def test_describe_timedelta64(self): ser = Series( [ Timedelta("1 days"), Timedelta("2 days"), Timedelta("3 days"), Timedelta("4 days"), Timedelta("5 days"), ], name="timedelta_data", ) result = ser.describe() expected = Series( [5, ser[2], ser.std(), ser[0], ser[1], ser[2], ser[3], ser[4]], name="timedelta_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected)
def boxplot_summary(column: pd.Series):
    """Print and return the summary statistics behind a box-plot.

    Arguments:
    ----------
    column: pd.Series
        Input column for summary

    Returns:
    --------
    pd.Series
        The ``describe()`` statistics of ``column`` extended with
        'lower-extreme', 'upper-extreme' (the 1.5 * IQR fences), 'median',
        'iqr', 'skewness' and 'kurtosis'.  The same Series is printed.

    Example:
    --------
    >>> boxplot_summary(df['column'])

    Reference:
    ----------
    Skewness:
        Skewness for normal distribution is zero. Any symmetric data should
        have a skewness near to zero.
        Negative values indicate data is skewed left.
        Positive values indicate data is skewed right.
    Kurtosis:
        Kurtosis for standard normal distribution is
        0, if Fisher's definition is used;
        3, if Pearson's definition is used.
        With respect to Fisher's definition,
        positive kurtosis indicates heavy tails and
        negative kurtosis indicates light tails.
    """
    col_desc = column.describe()
    q1 = col_desc.loc['25%']
    q3 = col_desc.loc['75%']
    iqr = q3 - q1

    col_desc.at['lower-extreme'] = q1 - 1.5*iqr
    col_desc.at['upper-extreme'] = q3 + 1.5*iqr
    col_desc.at['median'] = column.median()
    col_desc.at['iqr'] = iqr
    col_desc.at['skewness'] = skew(column)
    col_desc.at['kurtosis'] = kurtosis(column)

    print(col_desc)
    # Returned as well as printed, as the docstring has always promised;
    # callers that ignored the previous ``None`` are unaffected.
    return col_desc
def get_numerical_stats(cls, column: pd.Series, column_baseline: Dict = None):
    """Assemble the numerical statistics of ``column`` into a dict.

    Mean, standard deviation, min and max are taken from ``describe()``; the
    sum is computed directly.  Quantiles and the distribution come from
    ``cls.get_quantiles`` and ``cls.get_numerical_distribution`` (the latter
    also receives ``column_baseline``).
    """
    summary = column.describe().to_dict()
    quantiles = cls.get_quantiles(column=column)
    distribution = cls.get_numerical_distribution(
        column=column, column_baseline=column_baseline)

    stats = dict(
        mean=summary["mean"],
        sum=float(column.sum()),
        std_dev=summary["std"],
        min=summary["min"],
        max=summary["max"],
        quantiles=quantiles,
        distribution=distribution,
    )
    return stats
def summary_stats(series: pd.Series) -> pd.Series:
    '''Produce univariate summary statistics for a numerical series.

    Starts from ``series.describe()`` without the count, relabels the
    quartiles as q1, median and q3, and then adds every statistic named in
    SUMMARY_STATS that is not present yet by calling the Series method of
    that name.

    Note that for very short series, the higher moments (std, skew, kurt)
    might come out as NaN.

    :param series: A numerical series to compute summary statistics for.
    '''
    sumstat = series.describe().drop('count')

    # Relabel the quartile entries ('25%' through '75%').
    labels = list(sumstat.index)
    first = labels.index('25%')
    last = labels.index('75%')
    labels[first:last + 1] = ['q1', 'median', 'q3']
    sumstat.index = labels

    # Add what pandas describe does not provide.
    for key in (name for name in SUMMARY_STATS if name not in labels):
        sumstat[key] = getattr(series, key)()
    return sumstat
def plot_noise():
    """Plot 50 Gaussian white-noise samples as line, histogram and autocorrelation.

    The generator is seeded with 30, the describe() summary is printed and
    the figure (a 2x2 grid, three panels used) is shown.
    """
    from pandas.plotting import autocorrelation_plot

    # Seed the generator and draw the white noise series.
    seed(30)
    noise = Series([gauss(0.0, 1.0) for _ in range(50)])

    # Summary stats.
    print(noise.describe())

    fig, ax = plt.subplots(nrows=2, ncols=2)
    line_axis, hist_axis, acf_axis = ax[0, 0], ax[0, 1], ax[1, 0]

    # Line plot.
    noise.plot(ax=line_axis)
    line_axis.set_title('White Noise')

    # Histogram plot.
    noise.hist(ax=hist_axis)
    hist_axis.set_title('Noise Histogram')

    # Autocorrelation.
    autocorrelation_plot(noise, ax=acf_axis)

    plt.tight_layout()
    plt.show()
def test_describe_with_tz(self, tz_naive_fixture):
    """describe() on tz-aware datetimes reports count/unique/top/freq/first/last."""
    # GH 21332
    tz = tz_naive_fixture
    name = str(tz_naive_fixture)
    start, end = Timestamp(2018, 1, 1), Timestamp(2018, 1, 5)

    dates = Series(date_range(start, end, tz=tz), name=name)
    result = dates.describe()

    labels = ["count", "unique", "top", "freq", "first", "last"]
    values = [
        5,
        5,
        dates.value_counts().index[0],
        1,
        start.tz_localize(tz),
        end.tz_localize(tz),
    ]
    expected = Series(values, name=name, index=labels)
    tm.assert_series_equal(result, expected)
def test_describe_tz_values2(self):
    """describe(include="all") on a frame mixing ints and tz-aware datetimes."""
    tz = "CET"
    start, end = Timestamp(2018, 1, 1), Timestamp(2018, 1, 5)

    s1 = Series(range(5))
    s2 = Series(date_range(start, end, tz=tz))
    df = DataFrame({"s1": s1, "s2": s2})

    # Expected per-column summaries.
    s1_ = s1.describe()
    s2_ = Series(
        [5, 5, s2.value_counts().index[0], 1, start.tz_localize(tz), end.tz_localize(tz)],
        index=["count", "unique", "top", "freq", "first", "last"],
    )

    # Row order of the combined result.
    object_rows = ["count", "unique", "top", "freq", "first", "last"]
    numeric_rows = ["mean", "std", "min", "25%", "50%", "75%", "max"]
    idx = object_rows + numeric_rows

    expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]

    with tm.assert_produces_warning(FutureWarning):
        result = df.describe(include="all")
    tm.assert_frame_equal(result, expected)
# -*- coding: utf-8 -*-
# Demonstrates sum / mean / describe on a small DataFrame and a Series.
# Every print uses the single-argument call form, which behaves the same on
# Python 2 and Python 3 (the bare print statement is Python-2 only).
import numpy as np
from pandas import Series, DataFrame

print('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index = ['a', 'b', 'c', 'd'],
               columns = ['one', 'two'])
print(df)
print(df.sum())  # sum down each column
print(df.sum(axis = 1))  # sum across each row
print('')

print('平均数')
print(df.mean(axis = 1, skipna = False))
print(df.mean(axis = 1))
print('')

print('其它')
print(df.idxmax())
print(df.cumsum())
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
def test_describe_none(self):
    """A Series holding only None describes as count 0 and unique 0."""
    none_series = Series([None])
    none_series.name = 'None'
    described = none_series.describe()
    expected = Series([0, 0], index=['count', 'unique'])
    assert_series_equal(described, expected)
if __name__ == "__main__":
    # Collect every trace file under PATH that carries the FILE_EXT suffix.
    _files = glob.glob(r"{path}\*{file_ext}".format(path=PATH, file_ext=FILE_EXT))
    # Map each file path to its base name without extension.
    F_DICT = {_fp: os.path.split(_fp)[1].split(".")[0] for _fp in _files}
    sb_all = DataFrame()
    for _file in _files:
        fp, trace = parse_file(_file)
        total_duration = trace.duration if INTERVAL is None else INTERVAL
        # Input latencies scaled by 1000 -- presumably seconds to milliseconds; TODO confirm.
        ss = Series(
            (event.interval.duration * 1000 for event in trace.android.input_latencies(TOUCH_IRQ, interval=INTERVAL))
        )
        # describe() statistics extended with the 90th percentile and frame metrics.
        summary = ss.describe()
        summary["90%"] = ss.quantile(0.9)
        summary["Janks Per Second"] = trace.android.jankrate(interval=INTERVAL)
        summary["Average FPS"] = trace.android.framerate(interval=INTERVAL)
        # Same latencies, restricted to events whose first cpu-0 frequency
        # interval equals 384000 (unit not visible here -- TODO confirm).
        ss_first = Series(
            (
                event.interval.duration * 1000
                for event in trace.android.input_latencies(TOUCH_IRQ, interval=INTERVAL)
                if trace.cpu.frequency_intervals(cpu=0, interval=event.interval)
                and trace.cpu.frequency_intervals(cpu=0, interval=event.interval)[0] == 384000
            )
        )
        summary_first = ss_first.describe()
        summary_first["90%"] = ss_first.quantile(0.9)
        # The jank rate is copied from the unfiltered summary.
        summary_first["Janks Per Second"] = summary["Janks Per Second"]
print df.describe() # summary statistics for every DataFrame column
''' one two count 3.000000 2.00000 mean 2.666667 2.50000 std 3.785939 2.12132 min 0.000000 1.00000 25% NaN NaN 50% NaN NaN 75% NaN NaN max 7.000000 4.00000 '''
obj = Series([2, 4, 8, 4], index=['a', 'a', 'b', 'c'])
print obj.describe() # summary statistics for a Series
''' count 4.000000 mean 4.500000 std 2.516611 min 2.000000 25% 3.500000 50% 4.000000 75% 5.000000 max 8.000000 dtype: float64 '''
# Next section of the script: de-duplication.
print '去重'
obj = Series(['c', 'a', 'd', 'b', 'b', 'c'])
class GradeBook(object):
    """A class encapsulating a pandas DataFrame and meant to store the grades
    for a whole class.

    It provides the method compute_total_grades, which computes the total
    grade for each student according to weights provided by the caller.
    """

    def __init__(self, grade_arr, student_ids, item_list, max_scores):
        """Store the raw grades and reset the derived attributes.

        Sets the following attributes:
        (1) self.raw_grades, which is a DataFrame with
            - row labels given by student_ids
            - column labels given by item_list
            - values given by grade_arr
        (2) self.total_grades, set to None
        (3) self.letter_grades, set to None
        (4) self.max_scores, set to max_scores

        Parameters
        ----------
        grade_arr : numpy array of grades as returned by simulate_grades
        student_ids: a list of student ids
        item_list: a list of grade items (e.g. ['HW', 'M', 'F'])
        max_scores: a list of the maximum possible score for each grade item

        Returns
        -------
        nothing

        Examples
        --------
        >>> a = GradeBook(array([[1,2],[3,4]]),['22','34'],['F','M'],[30, 50])
        >>> type(a.raw_grades) == DataFrame
        True
        >>> a.total_grades is None
        True
        >>> a.raw_grades.shape == (2,2)
        True
        >>> a.raw_grades.iloc[0,0] == 1
        True
        >>> a.max_scores[0] == 30
        True
        """
        self.total_grades = None
        self.letter_grades = None
        self.max_scores = max_scores
        self.student_ids = student_ids
        self.item_list = item_list
        self.grade_arr = grade_arr
        self.raw_grades = DataFrame(data=grade_arr,
                                    index=student_ids,
                                    columns=item_list)

    def compute_total_grades(self, item_weights=None, max_score=100):
        """Compute the total class grade of every student.

        Each item grade is divided by that item's maximum score, the
        resulting fractions are combined as a weighted sum using
        item_weights, and the sum is scaled to max_score.  The totals are
        stored in the Series attribute self.total_grades.  The raw grades
        (and the array passed to the constructor) are left untouched, so the
        method can be called repeatedly.

        Parameters
        ----------
        item_weights: list of floats summing up to one, or None
            List of weights to be applied to each grade item
            (e.g. [0.3, 0.4, 0.3]).  None weights all items equally.
        max_score: float
            Maximal possible score for the total class grade

        Returns
        -------
        out : Series
            A numerical summary of the total grade distribution stored in
            self.total_grades, as returned by the Series method describe.

        Examples
        --------
        >>> a = GradeBook(array([[5,5],[1,1]]),['22','34'],['F','M'],[10, 10])
        >>> b = a.compute_total_grades([0.5, 0.5], 100)
        >>> len(b) == 5
        False
        >>> a.total_grades['22'] == 50
        True
        >>> a.total_grades['34'] == 10
        True
        """
        n_items = len(self.item_list)
        if item_weights is None:
            item_weights = [1.0 / n_items] * n_items if n_items else []

        # Per-item factor that turns a raw grade into its contribution to the
        # total: weight * max_score / item maximum.
        factors = [
            weight * max_score / float(item_max)
            for weight, item_max in zip(item_weights, self.max_scores)
        ]

        # astype(float) works on a copy, so raw_grades and grade_arr are not
        # modified and integer grades are not truncated.
        contributions = self.raw_grades.astype(float).mul(factors, axis='columns')
        totals = contributions.sum(axis=1)

        self.total_grades = Series(totals.values, index=self.student_ids)
        return self.total_grades.describe()
class HisRecord():
    """
    This class is a single record
    - hisId is the haystack Id of the trend
    - data is created as a pandas Series to be used directly in Pandas
    """
    def __init__(self, session, hisId, dateTimeRange='today'):
        """
        GET data from server and fill this object with historical info

        session : object providing ``read(url)`` and ``timezone``
        hisId : haystack id of the history to read
        dateTimeRange : range string sent with the ``hisRead`` request
        """
        self.hisId = hisId
        self.name = self.getHisNameFromId(session, self.hisId)
        index = []
        values = []
        for eachRows in session.read('hisRead?id=' + self.hisId + '&range=' + dateTimeRange)['rows']:
            # The first space-separated token of 'ts' is split on non-digits;
            # the last two fields are dropped before building the datetime.
            index.append(pd.Timestamp(pd.to_datetime(datetime.datetime(*map(int, re.split(r'[^\d]', eachRows['ts'].split(' ')[0])[:-2])))))
            raw = eachRows['val']
            # This will allow conversion of Enum value to float so Pandas will work
            if raw == 'F':
                values.append(False)
            elif raw == 'T':
                values.append(True)
            else:
                # Extract the float value when units are part of the value
                # (ex. 21.8381°C).  A value without any number falls through
                # to the raw value instead of raising IndexError.
                numbers = re.findall(r"[-+]?\d*\.*\d+", raw)
                if numbers and tools.isfloat(numbers[0]):
                    values.append(float(numbers[0]))
                else:
                    values.append(raw)

        try:
            # Declare Series and localize using Site Timezone
            self.data = Series(values, index=index).tz_localize(session.timezone)
            # Renaming index so the name will be part of the serie
            self.data = self.data.reindex(self.data.index.rename([self.name]))
        except Exception:
            # Best effort: on failure self.data is left unset and only a
            # message is printed.
            print('%s is an Unknown history type' % self.hisId)

    def getHisNameFromId(self, session, pointId):
        """
        Retrieve name from id of an history

        Returns 'Id Not found' when no row of the ``read?filter=his``
        response matches pointId.
        """
        for each in session.read("read?filter=his")['rows']:
            if each['id'].split(' ', 1)[0] == pointId:
                return (each['id'].split(' ', 1)[1])
        return 'Id Not found'

    def plot(self):
        """
        Draw a graph of the recorded Series
        """
        self.data.plot()

    def breakdownPlot(self, startTime = '08:00', endTime = '17:00', bins=np.array([0,0.5,1,18.0,18.5,19.0,19.5,20.0,20.5,21.0,21.5,22.0,22.5,23.0, 23.5, 24.0, 24.5,25.0])):
        """
        By default, creates a breakdown plot of temperature distribution between 18 and 25
        bins (distribution) can be passed as argument
        By default, takes values between 8:00 and 17:00
        startTime = string representation of time (ex. '08:00')
        endTime = string representation of time (ex. '17:00')
        bins = np.array representing distribution
        """
        x = self.data.between_time(startTime, endTime)
        barplot = pd.cut(x.dropna(), bins)
        x.groupby(barplot).size().plot(kind='bar')

    def simpleStats(self):
        """
        Shortcut for describe() pandas version
        """
        return self.data.describe()

    def __str__(self):
        return 'History Record of %s' % self.name
def get_mode(arr):
    """Return the list of most frequent values in ``arr``.

    Returns None when ``arr`` is empty or when no value occurs more than
    once (there is no mode).  Modes are listed in order of first appearance.
    """
    from collections import Counter

    arr_appear = Counter(arr)  # occurrences of each element, in one pass
    if not arr_appear:
        return None
    highest = max(arr_appear.values())
    if highest == 1:  # every value is unique, so there is no mode
        return None
    # Otherwise the values with the highest count are the modes.
    return [k for k, v in arr_appear.items() if v == highest]

get_mode(a)
var(a)
std(a)
a=Series(a)
a.skew()
a.kurt()
a.describe()
df = DataFrame({'data1' : np.random.randn(5), 'data2' : np.random.randn(5)})
df.cov()
df.corr()
### Hypothesis testing
from scipy import stats as ss
df=DataFrame({'data':[10.1,10,9.8,10.5,9.7,10.1,9.9,10.2,10.3,9.9]})
ss.ttest_1samp(a = df, popmean = 10)
index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
# NOTE(review): the line above is the tail of a statement that starts outside this view.
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
# NOTE(review): the next statement has a stray comma before ``for`` and is a
# syntax error; the statement after it is the corrected form.
price = DataFrame({tic: data['Adj Close'], for tic, data in all_data.iteritems()})
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
price
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
# percent changes of the prices:
returns = price.pct_change()
# -*- coding: utf-8 -*-
from pandas import Series,DataFrame
import pandas as pd

s=Series([1,2,3],index=['a','b','c'])
d=DataFrame([[1,2,3],[4,5,6]],columns=['a','b','c'])

# head() returns the first five records by default
print (s.head())
print (s.describe())
print (d.head())
print (d.describe())

# read data from an Excel workbook
excel_data=pd.read_excel("./server.xlsx")
# head must be called; printing the attribute only shows the bound method
print (excel_data.head())
#********************************** # Set ABOVE #********************************** def parse_file(filepath): trace = Ftrace(filepath) return (filepath, trace) if __name__ == '__main__': _files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT)) F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files} sb_all = DataFrame(columns=F_DICT.values()) for _file in _files: fp, trace = parse_file(_file) total_duration = trace.duration if INTERVAL is None else INTERVAL ss = Series((event.interval.duration for event in trace.android.render_frame_intervals(interval=INTERVAL))) ss = ss * 1000. # summary = ss.describe() summary['90%'] = ss.quantile(.9) summary['Janks'] = trace.android.num_janks(interval=INTERVAL) summary['Janks Per Second'] = summary['Janks']/total_duration summary['Average FPS'] = trace.android.framerate(interval=INTERVAL) sb_all[F_DICT[fp]] = summary sb_all.to_csv(r'{path}\frame_stats.csv'.format(path=PATH))
# Row means without skipping NaN (``df`` is defined earlier, outside this view).
print(df.mean(axis=1,skipna=False))
print('\n')
print(df.idxmax())
print('\n')
print(df.cumsum())
print('\n')
print(df.cumsum(axis=1))
print('\n')
print(df.describe())
print('\n')
###############################################################
# describe() on non-numeric data
obj = Series(['a','a','b','c']*4)
print(obj)
print(obj.describe())
print('\n')
###############################################################
# Download price data per ticker (``web`` is imported outside this view).
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOGL']:
    all_data[ticker] = web.get_data_yahoo(ticker, '10/1/2015', '10/11/2015')
price = DataFrame({tic:data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic:data['Volume'] for tic, data in all_data.items()})
print(price)
def pd_05():
    """Print rankings and the describe() summary of a small integer Series.

    The single-argument ``print(...)`` form is used so the function runs
    unchanged on both Python 2 and Python 3.
    """
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print(obj.rank())  # ties share the average rank
    print(obj.rank(method='first'))  # ties ranked by order of appearance
    print(obj.rank(ascending=False, method='first'))
    print(obj.describe())
def test_describe_objects(self): s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']) result = s.describe() expected = Series({'count' : 7, 'unique' : 4, 'top' : 'a', 'freq' : 3}, index=result.index) assert_series_equal(result, expected)
def main(): """ Calculation and aggregation of summary statistics """ # Summary of statistics # return is not ndarray df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list('abcd'), columns=['one', 'two']) print df print df.sum() print df.sum(axis=1) print df.mean(axis=1) # exclude nan print df.mean(axis=1, skipna=False) print df.idxmin() print df.idxmax() print df.cumsum() print df.describe() # values are not number obj = Series(list('aabc') * 4) print obj.describe() methods = ['count', 'min', 'max', # 'argmin', 'argmax', 'quantile', 'median', 'mad', 'var', 'std', 'skew', 'kurt', 'cummin', 'cummax', 'cumprod', 'diff', 'pct_change'] for method in methods: print u'「{0}」'.format(method) print getattr(df, method)() print '' # Correspond and Covariance all_data = {} lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']: for ticket in lst: #, 'GOOG']: # IOError: after 3 tries, Yahoo! did not return a 200 # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv' all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010') price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()}) volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()}) if all_data: returns = price.pct_change() print returns.tail() print '' print returns.MSFT.corr(returns.IBM) print returns.MSFT.cov(returns.IBM) print '' print returns.corr() print returns.cov() print '' print returns.corrwith(returns.IBM) print returns.corrwith(volume) # unique, frequency, belong print '','' obj = Series(list('cadaabbcc')) uniques = obj.unique() print uniques print obj.value_counts() print pd.value_counts(obj.values, sort=False) mask = obj.isin(['b', 'c']) print mask print obj[mask] data = DataFrame({ 'Qu1' : [1,3,4,3,4], 'Qu2' : [2,3,1,2,3], 'Qu3' : [1,5,2,4,4], }) print data print data.apply(pd.value_counts).fillna(0)
## isnull notnull ##-- isnull() returns a Series with the same indices containing Boolean values ##-- indicating True for null values which include NaN and None, among others. ##-- notnull() returns the negation of isnull() ##-– that is, True for non-null values, and False otherwise. ## describe() returns a simple set of summary statistics about a Series. ## The values returned is a series where ######################################################## s1 = Series(arange(10.0,20.0)) s1.describe() summ = s1.describe() summ["mean"] ######################################################## unique and nunique ## unique() returns the unique elements of a series ## nunique() returns the number of unique values in a Series. drop and dropna drop(labels) drop elements with the selected labels from a Series. ## drop(labels) drop elements with the selected labels from a Series. s1 = Series(arange(1.0,6),index=["a","a","b","c","d"]) s1
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10] #[Out]# INCIDENT DATE LATITUDE LONGITUDE #[Out]# 0 05/07/2010 17:26 18.233333 -72.533333 #[Out]# 1 28/06/2010 23:06 50.226029 5.729886 #[Out]# 2 24/06/2010 16:21 22.278381 114.174287 #[Out]# 3 20/06/2010 21:59 44.407062 8.933989 #[Out]# 4 18/05/2010 16:26 18.571084 -72.334671 #[Out]# 5 26/04/2010 13:14 18.593707 -72.310079 #[Out]# 6 26/04/2010 14:19 18.482800 -73.638800 #[Out]# 7 26/04/2010 14:27 18.415000 -73.195000 #[Out]# 8 15/03/2010 10:58 18.517443 -72.236841 #[Out]# 9 15/03/2010 11:00 18.547790 -72.410010 #[Out]# #[Out]# [10 rows x 3 columns] # Wed, 09 Jul 2014 00:38:24 data.describe() #[Out]# Serial LATITUDE LONGITUDE #[Out]# count 3593.000000 3593.000000 3593.000000 #[Out]# mean 2080.277484 18.611495 -72.322680 #[Out]# std 1171.100360 0.738572 3.650776 #[Out]# min 4.000000 18.041313 -74.452757 #[Out]# 25% 1074.000000 18.524070 -72.417500 #[Out]# 50% 2163.000000 18.539269 -72.335000 #[Out]# 75% 3088.000000 18.561820 -72.293570 #[Out]# max 4052.000000 50.226029 114.174287 #[Out]# #[Out]# [8 rows x 3 columns] # Wed, 09 Jul 2014 00:38:53 data['CATEGORY'][:6] #[Out]# 0 1. Urgences | Emergency, 3. Public Health, #[Out]# 1 1. Urgences | Emergency, 2. Urgences logistiqu...