def summary_df(self, thresholds=None, lower_quantile=None, upper_quantile=None):
    """ Calculates the pair of metrics for each threshold for each result. """
    if thresholds is None:
        thresholds = self.thresholds
    if lower_quantile is None:
        lower_quantile = self.config['lower_quantile']
    if upper_quantile is None:
        upper_quantile = self.config['upper_quantile']
    if self.n_current_results > self.n_cached_curves:  # If there are new curves, recompute
        colnames = ['_'.join([metric, stat])
                    for metric in [self.metric1.name, self.metric2.name]
                    for stat in ['Mean', 'Median',
                                 '%d_Percentile' % (100 * lower_quantile),
                                 '%d_Percentile' % (upper_quantile * 100)]]
        self.ret = pd.DataFrame(columns=colnames, index=thresholds, dtype='float64')
        for threshold in thresholds:
            m1s = Series([self.metric1.score(result, threshold) for result in self.results])
            m2s = Series([self.metric2.score(result, threshold) for result in self.results])
            # use the configured quantiles rather than hardcoded .05/.95,
            # matching the percentile column names built above
            self.ret.loc[threshold] = (m1s.mean(), m1s.quantile(.5),
                                       m1s.quantile(lower_quantile), m1s.quantile(upper_quantile),
                                       m2s.mean(), m2s.quantile(.5),
                                       m2s.quantile(lower_quantile), m2s.quantile(upper_quantile))
    return self.ret
def test_nanmean_overflow(self):
    # GH 10155
    # In the previous implementation mean can overflow for int dtypes, it
    # is now consistent with numpy
    from pandas import Series

    # numpy < 1.9.0 is not computing this correctly
    from distutils.version import LooseVersion
    if LooseVersion(np.__version__) >= '1.9.0':
        for a in [2 ** 55, -2 ** 55, 20150515061816532]:
            s = Series(a, index=range(500), dtype=np.int64)
            result = s.mean()
            np_result = s.values.mean()
            self.assertEqual(result, a)
            self.assertEqual(result, np_result)
            self.assertTrue(result.dtype == np.float64)

    # check returned dtype
    for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
        s = Series(range(10), dtype=dtype)
        result = s.mean()
        if is_integer_dtype(dtype):
            self.assertTrue(result.dtype == np.float64)
        else:
            self.assertTrue(result.dtype == dtype)
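# Why the overflow test above matters (illustrative sketch): summing 500
# copies of 2**55 exceeds the int64 range (~9.2e18), so a mean computed with
# an integer-sum accumulator would wrap around; pandas now accumulates in
# float64, matching numpy.
import numpy as np
print(500 * 2 ** 55 > np.iinfo(np.int64).max)  # True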
def onDataArrived(data, trader):
    '''Called each time a tick of market data arrives.'''
    global count, bid0, bid1, ask0, ask1, diffs, lastDirection
    count += 1
    # print 'instrumentID =', data['instrumentID'], 'count=', count
    # if count == 10:
    #     print 'bid =', data['bid'], 'ask =', data['ask']
    #     result = trader.open(instrumentID=data['instrumentID'], directionCode='buy', volume=1)
    #     print result
    #     print result[2].openPrice
    # if count in (15, 25):
    #     print 'total volume=', trader.getTotalVolume()
    # if count == 20:
    #     trader.closeAll()
    if data['instrumentID'] == instrumentID0:
        bid0 = data['bid']
        ask0 = data['ask']
    if data['instrumentID'] == instrumentID1:
        bid1 = data['bid']
        ask1 = data['ask']
    if bid0 != 0 and bid1 != 0:
        diff = bid1 - bid0
        diffs.append(diff)
        if len(diffs) > 350:
            s = Series(diffs)
            pts = diff - s.mean()
            if abs(pts) > 3:
                if pts > 0 and lastDirection <= 0:
                    lastDirection = 1
                    print 'open condition triggered, position direction:', lastDirection
                    print 'bid0 =', bid0, 'bid1 =', bid1, 'ask0 =', ask0, 'ask1 =', ask1
                    print 'diff =', diff, 'deviation =', pts
                    trader.closeAll()
                    price0, price1 = openPair(trader, instrumentID0, instrumentID1, lastDirection)
                    print 'actual spread =', price1 - price0 - s.mean()
                    print 'buy-side slippage =', price0 - bid0
                    print 'sell-side slippage =', ask1 - price1
                if pts < 0 and lastDirection >= 0:
                    lastDirection = -1
                    print 'open condition triggered, position direction:', lastDirection
                    print 'bid0 =', bid0, 'bid1 =', bid1, 'ask0 =', ask0, 'ask1 =', ask1
                    print 'diff =', diff, 'deviation =', pts
                    trader.closeAll()
                    # assign the fill prices; the original discarded them but
                    # then used price0/price1 below
                    price0, price1 = openPair(trader, instrumentID0, instrumentID1, lastDirection)
                    print 'actual spread =', price0 - price1 - s.mean()
                    print 'buy-side slippage =', price1 - bid1
                    print 'sell-side slippage =', ask0 - price0
            else:
                if count % 30 == 0:
                    print 'average spread =', s.mean(), 'diff =', diff, 'deviation =', pts
def query_rent_avgprice(filename):
    """
    query mongo, calculate the avg price of each city
    return { city : price }
    """
    # { city : [price1, price2, ...] }
    rent_avgprice = {}
    res = RentHouse.objects.only('city', 'price')
    for item in res:
        city = item.city
        price = clean_price(item.price)
        if price == -1:
            continue
        if city in rent_avgprice:
            rent_avgprice[city].append(price)
        else:
            rent_avgprice[city] = [price]
    for key, value in rent_avgprice.items():
        s = Series(value)
        rent_avgprice[key] = s.mean()
    for key, value in rent_avgprice.items():
        rent_avgprice[key] = int(value)
    # write the result to cache file (and close the handle when done)
    with open('./cache/' + filename, "w", encoding="utf-8") as fd:
        fd.write(json.dumps(rent_avgprice))
    return rent_avgprice
def get_red_yellow_bins(series: pd.Series, method: str, red_bin=None, yellow_bin=None):
    if method == "Percentile":
        red = round(series.quantile(q=red_bin / 100), 6)
        yellow = round(series.quantile(q=(red_bin + yellow_bin) / 100), 6)
        return red, yellow
    elif method == "Count":
        red = round(series.sort_values().values[int(round(red_bin, 0)) - 1], 6)
        yellow = round(
            series.sort_values().values[int(round(red_bin + yellow_bin, 0)) - 1], 6)
        return red, yellow
    elif method == "NAVF":
        mean = series.mean()
        sd = series.std()
        red = round(mean - 3 * sd, 6)
        yellow = round(mean - 2 * sd, 6)
        return red, yellow
    elif method == "fuzzy AVF":
        y = fuzzy_smf(series)
        red, yellow = get_fuzzy_bins(series, y, red_bin, yellow_bin)
        return red, yellow
    else:
        red = None
        yellow = None
        return red, yellow
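# Illustrative call of get_red_yellow_bins above using the "NAVF" branch
# (thresholds at mean - 3*sd and mean - 2*sd); the "Percentile" and "Count"
# branches need red_bin/yellow_bin, and "fuzzy AVF" needs the module's
# fuzzy_smf/get_fuzzy_bins helpers, which are not shown here.
import pandas as pd

s = pd.Series([10.0, 12.0, 11.0, 13.0, 9.0, 50.0])
red, yellow = get_red_yellow_bins(s, method="NAVF")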
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # error: Argument 1 to "format_percentiles" has incompatible type "Sequence[float]";
    # expected "Union[ndarray, List[Union[int, float]], List[float], List[Union[str,
    # float]]]"
    formatted_percentiles = format_percentiles(percentiles)  # type: ignore[arg-type]

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = ([series.count(), series.mean(), series.std(), series.min()]
         + series.quantile(percentiles).tolist()
         + [series.max()])
    return Series(d, index=stat_index, name=series.name)
def z_score(
        s: pd.Series,
        moments_dict: dict = None,
        keys: Tuple[str] = ("mean", "std")) -> pd.Series:
    """
    Transforms the Series into z-scores

    :param s: Input Series
    :type s: pd.Series
    :param moments_dict: If not None, then the mean and standard deviation used
        to compute the z-score transformation is saved as entries in moments_dict
        with keys determined by the keys argument, defaults to None
    :type moments_dict: dict, optional
    :param keys: Determines the keys saved in moments_dict if moments are saved,
        defaults to ("mean", "std")
    :type keys: Tuple[str], optional
    :return: Transformed Series
    :rtype: pd.Series
    """
    mean = s.mean()
    std = s.std()
    if std == 0:
        return 0
    if moments_dict is not None:
        moments_dict[keys[0]] = mean
        moments_dict[keys[1]] = std
    return (s - mean) / std
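# Usage sketch for z_score above (illustrative; assumes the function is in
# scope). The moments_dict side channel captures the training moments so new
# data can be transformed with the same scaling.
import pandas as pd

train = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
moments = {}
z = z_score(train, moments_dict=moments)  # mean ~0, std ~1
new = (pd.Series([6.0, 7.0]) - moments["mean"]) / moments["std"]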
def column_info(labels: pd.Series) -> tuple:
    """Return (count, gini, prob, mean, std) for a label series."""
    count = len(labels)
    gini = gini_ratio(labels)
    prob = labels.astype(bool).sum() / len(labels)
    avg = labels.mean()
    std = labels.std()
    return count, gini, prob, avg, std
def _check_Xy(X: pd.DataFrame, y: pd.Series, *, norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)
    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)
    valid = ~X.isnull().any(1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]
    if is_object_dtype(y):
        y = pd.Categorical(y)
    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()
    return X, y
def infer_posterior(data: Series,
                    alpha: Optional[float] = None,
                    beta: Optional[float] = None) -> Gamma:
    """
    Return a new Gamma distribution of the posterior most likely to generate
    the given data.

    :param data: Series of float values representing duration of, or between
        each observation.
    :param alpha: Value for the α hyper-parameter of the prior Gamma
        distribution (number of observations). Defaults to Vague.
    :param beta: Value for the β hyper-parameter of the prior Gamma
        distribution (sum of observations). Defaults to Vague.
    """
    if alpha is None:
        alpha = VaguePrior.Gamma.alpha
    if beta is None:
        beta = VaguePrior.Gamma.beta
    data = data.dropna()
    n = len(data)
    x_mean = data.mean()
    return GammaExponentialConjugate(n=n, x_mean=x_mean,
                                     alpha=alpha, beta=beta).posterior()
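# Note on the conjugate update performed above (standard Gamma-Exponential
# result): for n exponential observations with sample mean x̄, the posterior
# hyper-parameters are alpha' = alpha + n and beta' = beta + n * x̄, i.e. beta
# accumulates the sum of the observations. Hypothetical numbers:
alpha_post = 1.0 + 5        # alpha + n
beta_post = 1.0 + 5 * 2.0   # beta + n * x_mean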
def get_moments_annotation(
    s: pd.Series,
    xref: str,
    yref: str,
    x: float,
    y: float,
    xanchor: str,
    title: str,
    labels: List,
) -> go.layout.Annotation:
    """Calculates summary statistics for a series and returns an Annotation object."""
    moments = list(stats.describe(s.to_numpy()))
    moments[3] = np.sqrt(moments[3])  # variance -> standard deviation
    sharpe = s.mean() / s.std()
    return go.layout.Annotation(
        text=(f"<b>sharpe: {sharpe:>8.4f}</b><br>"
              + ("<br>").join([f"{k[0]:<9}{k[1](moments[i])}"
                               for i, k in enumerate(labels)])),
        align="left",
        showarrow=False,
        xref=xref,
        yref=yref,
        x=x,
        y=y,
        bordercolor="black",
        borderwidth=0.5,
        borderpad=2,
        bgcolor="white",
        xanchor=xanchor,
        yanchor="top",
    )
def relative_absolute_error(y_true: pd.Series, y_pred: pd.Series):
    y_true_mean = y_true.mean()
    # Relative Absolute Error: total absolute error of the prediction,
    # normalised by the total absolute error of the mean predictor.
    # (A squared/root variant was considered here previously:
    # err = math.sqrt(sum(np.square(y_true - y_pred)) / math.sqrt(sum(np.square(y_true - y_true_mean)))))
    err = sum(abs(y_true - y_pred)) / sum(abs(y_true - y_true_mean))
    return err
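# Quick sanity check of relative_absolute_error above (illustrative):
# predicting the mean of y_true everywhere gives exactly 1.0, and a perfect
# prediction gives 0.0, so values below 1.0 beat the mean baseline.
import pandas as pd

y = pd.Series([1.0, 2.0, 3.0])
assert relative_absolute_error(y, pd.Series([2.0, 2.0, 2.0])) == 1.0
assert relative_absolute_error(y, y.copy()) == 0.0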
def statistics_imputer(X: Series, method: str = 'mean', null_value: List = None) -> Series:
    """
    Fill missing values with a summary statistic, e.g. mean, median, or mode.
    :param X:
    :param method: currently supports mean, median, mode, max, and min
    :param null_value: list of values to treat as missing
    :return:
    """
    X = X.copy()
    if null_value is not None:
        X[X.isin(null_value)] = np.nan
    if method == 'mean':
        fill_value = X.mean()
    elif method == 'median':
        fill_value = X.median()
    elif method == 'mode':
        fill_value = X.mode()[0]
    elif method == 'max':
        fill_value = X.max()
    elif method == 'min':
        fill_value = X.min()
    else:
        raise Exception('unsupported fill method')
    X.fillna(fill_value, inplace=True)
    return X
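# Usage sketch for statistics_imputer above (illustrative; -999 is a made-up
# missing-value marker). Both NaN and the listed null values get the fill.
import numpy as np
import pandas as pd

X = pd.Series([1.0, 2.0, np.nan, -999.0, 4.0])
filled = statistics_imputer(X, method='mean', null_value=[-999.0])
# NaN and -999 are both replaced by mean(1, 2, 4) ≈ 2.33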
def query_rent_avgzone(filename):
    """
    query mongo, calculate the avg price of each zone
    return { zone : avg_price }
    """
    # { zone : [price1, price2, ...] }
    rent_avgzone = {}
    res = RentHouse.objects.only('city', 'price', 'location')
    for item in res:
        tmp = item.location.split('-')
        zone = item.city + '-' + tmp[0]
        price = clean_price(item.price)
        if price == -1:
            continue
        if zone in rent_avgzone:
            rent_avgzone[zone].append(price)
        else:
            rent_avgzone[zone] = [price]
    for key, value in rent_avgzone.items():
        s = Series(value)
        rent_avgzone[key] = s.mean()
    # write the result to cache file (and close the handle when done)
    with open('./cache/' + filename, "w") as fd:
        fd.write(json.dumps(rent_avgzone))
    return rent_avgzone
def update_enhanced_meta(serie: pd.Series, catalog_id: str, distribution_id: str):
    """Create or update the enriched metadata of the given series. Its name
    MUST be the ID of the series in the database."""
    field = Field.objects.get(distribution__dataset__catalog__identifier=catalog_id,
                              distribution__identifier=distribution_id,
                              identifier=serie.name)

    periodicity = meta_keys.get(field.distribution, meta_keys.PERIODICITY)
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days
    last = serie[-1]
    second_to_last = serie[-2] if serie.index.size > 1 else None
    # guard against a single-element series (second_to_last is None)
    last_pct_change = last / second_to_last - 1 if second_to_last is not None else None

    # Calculations
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
    }
    for meta_key, value in meta.items():
        field.enhanced_meta.update_or_create(key=meta_key, defaults={'value': value})
def basic_stat_map(s: pd.Series) -> dict:
    return {
        "mean": s.mean(),
        "median": s.median(),
        "std": s.std(),
        "count": s.count(),
    }
def standardize(self, s: pd.Series):
    mean, std = s.mean(), s.std()
    self.logger.info("{name}'s mean: {m}, std: {s}".format(name=s.name, m=mean, s=std))
    stdized = s.apply(lambda x: (x - mean) / std).rename("stdized_price")
    return stdized, mean, std
def _sortino_ratio(self, returns: pd.Series) -> float:
    """
    Return the Sortino ratio for a given series of returns.

    The Sortino ratio follows the same idea as the Sharpe ratio but adjusts
    both numerator and denominator: the numerator becomes the excess return,
    and the denominator becomes the lower partial (downside) standard
    deviation. This addresses two problems with assuming normally distributed
    returns. First, the distribution is not actually symmetric: when returns
    are left-skewed (negative skew), a normal assumption understates risk, so
    a skewed model is more reasonable. Second, the natural floor for a
    portfolio is the risk-free instrument, so the full standard deviation used
    in the Sharpe ratio (deviation of the whole sample from the mean return)
    is inappropriate; the deviation of returns below the risk-free return
    should be used instead. Overall, the Sortino ratio focuses on expected
    losses in the (left) tail, while the Sharpe ratio analyses the whole
    sample; taking only the samples above the risk-free return instead would
    analyse (right-)tail excess returns. Since financial history cannot fully
    guide the future, especially for extreme risk, the Sortino ratio is the
    more prudent measure. Note that it still does not address leptokurtic
    (fat-tailed) distributions.

    https://www.zhihu.com/question/37128695/answer/230508370
    https://en.wikipedia.org/wiki/Sortino_ratio
    """
    # squared downside returns
    downside_returns = (returns[returns < self._target_returns]) ** 2
    # expected return of the asset
    expected_return = returns.mean()
    # downside deviation
    downside_std = np.sqrt(downside_returns.mean())
    return (expected_return - self._risk_free_rate) / (downside_std + 1E-9)
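# Standalone sketch of the Sortino computation above (illustrative; assumes a
# target return and risk-free rate of 0). Only returns below the target enter
# the denominator, unlike the Sharpe ratio's full standard deviation; note
# this averages the squared shortfalls over the downside samples only,
# mirroring the method above.
import numpy as np
import pandas as pd

r = pd.Series([0.02, -0.01, 0.03, -0.02, 0.01])
downside_std = np.sqrt((r[r < 0.0] ** 2).mean())
sortino = r.mean() / (downside_std + 1e-9)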
def _sharpe_ratio(self, returns: pd.Series) -> float:
    """
    Return the Sharpe ratio for a given series of returns.
    https://en.wikipedia.org/wiki/Sharpe_ratio
    """
    return (returns.mean() - self._risk_free_rate) / (returns.std() + 1E-9)
def dist_table(diff: pd.Series):
    diff = diff.map(lambda x: trueround_precision(x, 3))
    head = '<tr><td>Discrimination</td><td>Description</td><td>Item count</td></tr>'
    rtn = [
        head,
    ]
    groups = [(0, 0.199), (0.2, 0.299), (0.3, 0.399), (0.4, 1)]
    labels = [
        'needs revision',
        'better after revision',
        'acceptable',
        'good',
    ]
    i = 0
    for g in groups:
        label = labels[i]
        n = sum((diff >= g[0]) & (diff <= g[1]))
        i += 1
        row = f'<tr><td>{g[0]}~{g[1]}</td><td>{label}</td><td>{n}</td></tr>'
        rtn.append(row)
    describe = {}
    describe['max discrimination'] = diff.max()
    describe['min discrimination'] = diff.min()
    describe['mean discrimination'] = diff.mean()
    for k, v in describe.items():
        row = f'<tr><td>{k}</td><td>{v}</td></tr>'
        rtn.append(row)
    rows = '\n'.join(rtn)
    return f'<table class="table table-striped">{rows}</table>'
def calc_avg_time(timeseries, month=0, wake=True):
    seconds_list = []
    if wake is True:
        label = 'wake_time'
    else:
        label = 'bed_time'
    if month == 0:
        abrv_month = 'all time'
        timeseries = timeseries[label]
    else:
        given_month = '2017/' + str(month)
        abrv_month = month_dict[month]
        timeseries = timeseries.ix[given_month][label]
    for t_stamp in timeseries:
        if t_stamp.hour <= 12:
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1.
            seconds_list.append(ts_seconds)
        else:
            # times after noon are shifted back a day so late-night and
            # early-morning times average sensibly
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1. - 24 * 3600
            seconds_list.append(ts_seconds)
    timeseries_temp = Series(seconds_list, index=timeseries.index)
    ts_m = timeseries_temp.mean()
    if ts_m > 0:
        ts_m = int(ts_m)
    else:
        ts_m = int(ts_m) + 24 * 3600
    # use floor division so the hour/minute arguments stay integers
    result = time(ts_m // 3600, (ts_m % 3600) // 60, (ts_m % 3600) % 60)
    result = result.strftime('%H:%M:%S')
    return result
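# The wrap-around trick used above (illustrative): times after noon are
# shifted by -24h so that, e.g., bed times of 23:30 and 00:30 average to
# midnight rather than to 12:00.
late = 23 * 3600 + 30 * 60 - 24 * 3600   # 23:30 -> -1800 s
early = 0 * 3600 + 30 * 60               # 00:30 -> +1800 s
print((late + early) / 2)                # 0, i.e. midnight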
def calculate_enhanced_meta(serie: pd.Series, periodicity: str) -> dict:
    """Create or update the enriched metadata of the given series. Its name
    MUST be the ID of the series in the database."""
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days
    last = serie[-1]
    second_to_last = serie[-2] if serie.index.size > 1 else None
    # guard against a single-element series (second_to_last is None)
    last_pct_change = last / second_to_last - 1 if second_to_last is not None else None

    # Calculations
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
        meta_keys.SIGNIFICANT_FIGURES: significant_figures(serie.values)
    }
    return meta
def test_all_values_single_bin(self):
    # 2070
    index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
    s = Series(np.random.randn(len(index)), index=index)

    result = s.resample("A", how='mean')
    tm.assert_almost_equal(result[0], s.mean())
def plot_norm(data: pd.Series, bins=10, ax=None, is_show_plot=None):
    """
    Plot the normal-distribution curve fitted to the data.
    :param data:
    :param bins: number of bars
    :param ax: if None, create a new figure
    :param is_show_plot: whether to show the plot
    :return: n, bins_v, mean, std
    """
    if ax is None:
        fig, ax = plt.subplots()
        # default to showing the plot when we created the figure ourselves
        if is_show_plot is None:
            is_show_plot = True
    if is_show_plot is None:
        is_show_plot = False

    n, bins_v = np.histogram(data, bins=bins)

    mu = data.mean()  # mean of distribution
    sigma = data.std()  # standard deviation of distribution
    # def norm_func(x, mu, sigma):
    #     pdf = np.exp(-((x - mu)**2) / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))
    #     return pdf
    # y = norm_func(bins, mu, sigma)  # same as mlab.normpdf(bins, mu, sigma)
    # y = mlab.normpdf(bins, mu, sigma)
    y = stats.norm.pdf(bins_v, loc=mu, scale=sigma)
    ax.plot(bins_v, y, '--')
    plt.grid(True)

    if is_show_plot:
        plt.show()

    return n, bins_v, mu, sigma
def get_descriptive_stats(data: pd.Series) -> dict:
    """Calculate descriptive statistics for the supplied `data`.

    Args:
        data (pd.Series): An array of the values to summarise.

    Returns:
        dict: A dictionary of summary statistics.
    """
    stats = [
        "Mean",
        "Standard Deviation",
        "Minimum",
        "Maximum",
        "Median",
        "Mode",
    ]
    values = resolve_integer_or_float([
        data.mean(),
        data.std(),
        data.min(),
        data.max(),
        median(data),
        mode(data),
    ])
    return dict(zip(stats, values))
def test_all_values_single_bin(self):
    # 2070
    index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
    s = Series(np.random.randn(len(index)), index=index)

    result = s.resample("A").mean()
    tm.assert_almost_equal(result[0], s.mean())
def zscore(s: Series) -> Series:
    """
    Returns the z-score for every value in the series.

    Z = (x - mu) / sigma

    Example
    -------
    >>> x = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> x
    0    1
    1    2
    2    3
    3    4
    4    5
    5    6
    6    7
    7    8
    8    9
    dtype: int64
    >>> x.zscore()
    0   -1.460593
    1   -1.095445
    2   -0.730297
    3   -0.365148
    4    0.000000
    5    0.365148
    6    0.730297
    7    1.095445
    8    1.460593
    dtype: float64
    """
    return (s - s.mean()) / s.std()
def calc_avg_time(timeseries, month=0, wake=True):
    seconds_list = []
    if wake is True:
        label = 'wake_time'
    else:
        label = 'bed_time'
    if month == 0:
        abrv_month = 'all time'
        timeseries = timeseries[label]
    else:
        given_month = '2016/' + str(month)
        abrv_month = month_dict[month]
        timeseries = timeseries.ix[given_month][label]
    for t_stamp in timeseries:
        if t_stamp.hour <= 12:
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1.
            seconds_list.append(ts_seconds)
        else:
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1. - 24 * 3600
            seconds_list.append(ts_seconds)
    timeseries_temp = Series(seconds_list, index=timeseries.index)
    ts_m = timeseries_temp.mean()
    if ts_m > 0:
        ts_m = int(ts_m)
    else:
        ts_m = int(ts_m) + 24 * 3600
    # use floor division so the hour/minute arguments stay integers
    result = time(ts_m // 3600, (ts_m % 3600) // 60, (ts_m % 3600) % 60)
    result = result.strftime('%H:%M:%S')
    return result
def calc_long_short_prec(pred: pd.Series, label: pd.Series, date_col="datetime",
                         quantile: float = 0.2, dropna=False,
                         is_alpha=False) -> Tuple[pd.Series, pd.Series]:
    """
    calculate the precision for long and short operation

    :param pred/label: index is **pd.MultiIndex**, index name is
        **[datetime, instruments]**; columns names is **[score]**.

        .. code-block:: python

                                            score
            datetime            instrument
            2020-12-01 09:30:00 SH600068    0.553634
                                SH600195    0.550017
                                SH600276    0.540321
                                SH600584    0.517297
                                SH600715    0.544674

    label :
        label
    date_col :
        date_col

    Returns
    -------
    (pd.Series, pd.Series)
        long precision and short precision in time level
    """
    if is_alpha:
        label = label - label.mean(level=date_col)
    if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
        raise ValueError("Need more instruments to calculate precision")

    df = pd.DataFrame({"pred": pred, "label": label})
    if dropna:
        df.dropna(inplace=True)

    group = df.groupby(level=date_col)

    N = lambda x: int(len(x) * quantile)
    # find the top/low quantile of prediction and treat them as long and short target
    long = group.apply(
        lambda x: x.nlargest(N(x), columns="pred").label).reset_index(level=0, drop=True)
    short = group.apply(
        lambda x: x.nsmallest(N(x), columns="pred").label).reset_index(level=0, drop=True)

    groupll = long.groupby(date_col)
    l_dom = groupll.apply(lambda x: x > 0)
    l_c = groupll.count()

    groups = short.groupby(date_col)
    s_dom = groups.apply(lambda x: x < 0)
    s_c = groups.count()
    return (l_dom.groupby(date_col).sum() / l_c), (s_dom.groupby(date_col).sum() / s_c)
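# Minimal illustrative input for calc_long_short_prec above: a (datetime,
# instrument) MultiIndex as described in the docstring. Instrument codes and
# values are made up.
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.to_datetime(["2020-12-01", "2020-12-02"]),
     ["SH60000%d" % i for i in range(10)]],
    names=["datetime", "instrument"])
pred = pd.Series(range(20), index=idx, dtype=float)
label = pd.Series(range(20), index=idx, dtype=float) - 9.5
long_prec, short_prec = calc_long_short_prec(pred, label, quantile=0.2)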
def fill_outliers(col: pd.Series):
    """ Clip outliers of each col to the floored mean ± 3σ bounds """
    mean, std = col.mean(), col.std()
    upper, lower = mean + 3 * std, mean - 3 * std
    col[col > upper] = np.floor(upper)
    col[col < lower] = np.floor(lower)
    return col.values
def _mask_outliers(vec: pd.Series, stdv_times):
    vec_mean = vec.mean()
    vec_stdv = vec.std()
    upper = vec_mean + vec_stdv * stdv_times
    lower = vec_mean - vec_stdv * stdv_times
    vec[((lower > vec) | (vec > upper))] = np.nan
    return vec
def from_series(feature_name: str, series: Series):
    """Construct from a pandas.Series"""
    assert types.is_numeric_dtype(series), series.dtypes
    return NumericColumn(feature_name=feature_name,
                         min_value=series.min(),
                         max_value=series.max(),
                         mean_value=series.mean(),
                         std_value=series.std())
def sharpe_ratio(corrs: pd.Series) -> np.float32:
    """
    Calculate the Sharpe ratio for Numerai by using grouped per-era data

    :param corrs: A Pandas Series containing the Spearman correlations for each era
    :return: A float denoting the Sharpe ratio of your predictions.
    """
    return corrs.mean() / corrs.std()
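# Illustrative check of sharpe_ratio above: per-era correlations with the
# same mean but less spread score higher.
import pandas as pd

steady = pd.Series([0.03, 0.04, 0.03, 0.04])
choppy = pd.Series([0.10, -0.04, 0.09, -0.01])   # same mean, higher std
assert sharpe_ratio(steady) > sharpe_ratio(choppy)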
def before_after_3sigma(data: pd.Series) -> pd.Series:
    miu = data.mean()
    sigma = data.std()
    threshold_down = miu - 3 * sigma
    threshold_up = miu + 3 * sigma
    data[data.ge(threshold_up)] = threshold_up
    data[data.le(threshold_down)] = threshold_down
    return data
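# Side-by-side sketch (illustrative) of the two 3-sigma treatments above:
# before_after_3sigma clips offending values to the mean ± 3σ bounds, while
# _mask_outliers replaces them with NaN so they drop out of later aggregations.
import pandas as pd

x = pd.Series([0.0] * 20 + [100.0])        # one gross outlier
clipped = before_after_3sigma(x.copy())    # 100.0 clipped to ~70 (mean + 3σ)
masked = _mask_outliers(x.copy(), 3)       # 100.0 replaced with NaN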
def saveDictionaryToFile(my_dict, file_name):
    '''This function will write the values of a dictionary into a csv, BUT it
    will also append the mean value as the last row'''
    data = Series(my_dict, index=my_dict.keys())
    mean_value = data.mean()
    data['AVG'] = mean_value
    data.sort_index(axis=0, inplace=True)
    data.to_csv(file_name)
def pd_03():
    df = DataFrame(np.random.randn(6, 3))
    df.ix[2:, 1] = np.nan
    df.ix[4:, 2] = np.nan
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., None, 3.5, None, 7])
    print data.fillna(data.mean())
    print df.fillna(df.mean())
def print_avg_time(timeseries, month=0, wake=True):
    seconds_list = []
    if wake == True:
        if month == 0:
            abrv_month = 'all time'
            timeseries = timeseries['wake_time']
        else:
            given_month = '2016/' + str(month)
            abrv_month = month_dict[month]
            timeseries = timeseries.ix[given_month]['wake_time']
        for t_stamp in timeseries:
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1.
            seconds_list.append(ts_seconds)
        timeseries_temp = Series(seconds_list, index=timeseries.index)
        ts_m = timeseries_temp.mean()
        ts_m = int(ts_m)
        result = time(ts_m / 3600, (ts_m % 3600) / 60, (ts_m % 3600) % 60)
        print 'average wake time in ' + abrv_month + ': ' + result.strftime('%H:%M:%S')
    else:
        if month == 0:
            abrv_month = 'all time'
            timeseries = timeseries['bed_time']
        else:
            given_month = '2016/' + str(month)
            abrv_month = month_dict[month]
            timeseries = timeseries.ix[given_month]['bed_time']
        for t_stamp in timeseries:
            if t_stamp.hour <= 12:
                ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1.
                seconds_list.append(ts_seconds)
            else:
                ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1. - 24 * 3600
                seconds_list.append(ts_seconds)
        timeseries_temp = Series(seconds_list, index=timeseries.index)
        ts_m = timeseries_temp.mean()
        if ts_m > 0:
            ts_m = int(ts_m)
        else:
            ts_m = int(ts_m) + 24 * 3600
        result = time(ts_m / 3600, (ts_m % 3600) / 60, (ts_m % 3600) % 60)
        print 'average bed time in %s is %s' % (abrv_month, result.strftime('%H:%M:%S'))
    return result
def Calls(self):
    rows = []
    for name, callTimes in self.times['call'].iteritems():
        s = Series(callTimes)
        func, loc = formatName(name)
        callCount = s.count()
        meanTime = s.mean()
        totalTime = s.sum()
        rows.append((func, loc, callCount, meanTime, totalTime))
    columns = ('FUNCTION', 'SOURCE', 'COUNT', 'MEAN', 'TOTAL')
    return DataFrame.from_records(rows, columns=columns, index=('FUNCTION', 'SOURCE'))
def Phases(self):
    rows = []
    for prefix in ('parse', 'compile', 'run'):
        for name, callTimes in self.times[prefix].iteritems():
            s = Series(callTimes)
            callCount = s.count()
            meanTime = s.mean()
            totalTime = s.sum()
            rows.append(("%s:%s" % (prefix, name), callCount, meanTime, totalTime))
    columns = ('PHASE', 'COUNT', 'MEAN', 'TOTAL')
    return DataFrame.from_records(rows, columns=columns, index=('PHASE'))
def test_nanmean_overflow(self):
    # GH 10155
    # In the previous implementation mean can overflow for int dtypes, it
    # is now consistent with numpy
    for a in [2 ** 55, -2 ** 55, 20150515061816532]:
        s = Series(a, index=range(500), dtype=np.int64)
        result = s.mean()
        np_result = s.values.mean()
        assert result == a
        assert result == np_result
        assert result.dtype == np.float64
def FollowsInfoByCode(code):
    filename = 'follows_history/' + code + '.csv'
    if not os.path.exists(filename):
        # print filename, 'does not exist'
        return None
    reader = csv.reader(file(filename, 'rb'))
    follows_chg_list = []
    rt_line = []
    for row in reader:
        name, date, follows, follows_chg = row
        follows_chg = int(follows_chg)
        follows_chg_list.append((follows_chg))
        if date > '2014-10-08' and date < '2014-12-01':
            df = Series(follows_chg_list)
            FollowsMultiple = round((follows_chg) / df.mean(), 1)
            if FollowsMultiple > 4 and FollowsMultiple < 50:
                d = dateutil.parser.parse(date)
                if d.weekday() < 5:
                    print code, name, d, ',', follows, follows_chg, round(df.mean(), 1), str(FollowsMultiple) + 'x'
                    line = AnalyseHistoryPrice(code, name, FollowsMultiple, date)
                    if line != None:
                        rt_line.append(line)
    return rt_line
def test_nanmean_overflow(self):
    # GH 10155
    # In the previous implementation mean can overflow for int dtypes, it
    # is now consistent with numpy

    # numpy < 1.9.0 is not computing this correctly
    if not _np_version_under1p9:
        for a in [2 ** 55, -2 ** 55, 20150515061816532]:
            s = Series(a, index=range(500), dtype=np.int64)
            result = s.mean()
            np_result = s.values.mean()
            assert result == a
            assert result == np_result
            assert result.dtype == np.float64
def get_summary_indicators_from_hist(sf, hist, int_index=False):
    seriesHist = Series(hist)
    maxs = {'freq': dict()}
    means = {'freq': seriesHist.mean()}
    medians = {'freq': seriesHist.median()}
    stds = {'freq': seriesHist.std()}
    maxs['freq']['freq'] = seriesHist.max()
    maxs['freq']['index'] = seriesHist.idxmax()
    index_total = 'NA'
    if int_index:
        index = seriesHist.index
        index = index.astype(int)
        index_list = index.tolist()
        index_total = sum([seriesHist[i] * index_list[i] for i in range(len(index_list))])
        index_series = Series(index_list)
        means['index'] = index_series.mean()
        medians['index'] = index_series.median()
        stds['index'] = index_series.std()
        maxs['freq']['index'] = int(maxs['freq']['index'])
        maxs['index'] = dict()
        maxs['index']['index'] = max(index_list)
        maxs['index']['freq'] = hist[str(maxs['index']['index'])]
    return {'means': means, 'medians': medians, 'stds': stds,
            'max': maxs, 'index_total': index_total}
def summary_df(self):
    lower_quantile = self.config['lower_quantile']
    upper_quantile = self.config['upper_quantile']
    vals = Series(self.summary)
    lower_bound = vals.quantile(lower_quantile)
    upper_bound = vals.quantile(upper_quantile)
    median = vals.quantile(0.5)
    mean = vals.mean()
    column_names = ["Mean",
                    "Median",
                    "%d_Percentile" % (lower_quantile * 100),
                    "%d_Percentile" % (upper_quantile * 100)]
    df = pd.DataFrame(dict(zip(column_names, [mean, median, lower_bound, upper_bound])),
                      index=[0])
    return df
def calc_avg(data: pd.Series, prec=None) -> float:
    """Calculate average.

    Args:
        data: data to analyze (pd.Series)
        prec: precision if rounding (int)

    Returns:
        average (float)
    """
    average = data.mean(axis=0)
    if prec is not None:
        average = round(number=average, ndigits=prec)
    return average
def test_nanmean_overflow(self):
    # GH 10155
    # In the previous implementation mean can overflow for int dtypes, it
    # is now consistent with numpy

    # numpy < 1.9.0 is not computing this correctly
    from distutils.version import LooseVersion
    if LooseVersion(np.__version__) >= '1.9.0':
        for a in [2 ** 55, -2 ** 55, 20150515061816532]:
            s = Series(a, index=range(500), dtype=np.int64)
            result = s.mean()
            np_result = s.values.mean()
            self.assertEqual(result, a)
            self.assertEqual(result, np_result)
            self.assertTrue(result.dtype == np.float64)
def main():
    """ Handling of not-applicable (NA) values """
    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print string_data
    print string_data.isnull()
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan  # not the same thing

    # Exclude N/A
    print '', ''
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print data.dropna()
    print data[data.notnull()]

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna()  # keep only rows where no value is NA
    print data
    print cleaned
    print data.dropna(how='all')
    data[4] = None
    print data.dropna(axis=1, how='all')
    print data.dropna(thresh=2)  # keep rows with at least 2 non-NA values

    # Fill NA
    print '', ''
    print data.fillna(0)
    print data.fillna({1: 0.5, 2: -1})
    _ = data.fillna(0, inplace=True)
    print data
    print '', ''
    df = DataFrame(np.arange(18).reshape((6, 3)))
    df.ix[2:, 1] = NA
    df.ix[4:, 2] = NA
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., NA, 3.5, NA, 7])
    print data.fillna(data.mean())
def generate_value_length_distribution(self, variable_value):
    """
    Map each variable to the distribution (mean, variance) of its value
    lengths, stored in self.__value_length_distribution.
    :param variable_value: {path: {variable: [value]}}
    :return: {path: {variable: {'mean': m, 'variance': v}}}
    """
    for path, variable_dict in variable_value.items():
        self._m['value_length_distribution'][path] = {}
        for variable, value_list in variable_dict.items():
            length_list = [len(value) for value in value_list]
            length_series = Series(length_list)
            mean = length_series.mean()
            var = length_series.var()
            self._m['value_length_distribution'][path][variable] = {'mean': mean, 'variance': var}
def GetFollowsMeanByCode(dirfilelist, code='(SH:600036)'):
    follows_chg_list = []
    last_follows = 0
    for one in dirfilelist:
        # print one
        df_curr = pd.read_csv(one, names=['name', 'code', 'follows'], skiprows=[0])
        name, follows = GetFollowsByCode(df_curr, code)
        if follows > 0:
            if last_follows == 0:
                last_follows = follows
            else:
                diff = abs(follows - last_follows)
                last_follows = follows
                follows_chg_list.append(diff)
    # print follows_chg_list
    df = Series(follows_chg_list)
    # print df.mean()
    return (df.mean())
def __report_bots_metadata_results_excel(self, writer):
    """
    writer -> None
    writer: ExcelWriter | ExcelWriter object containing buffer for eventual
        output .xlsx file
    """
    npsim = self.get_npsim()

    ## Some bookkeeping
    # get percent unique bots
    percentUniqueBots = round(
        float(self._calc_num_unique_bots()) / float(npsim.get_n()), 4)
    percentUniqueBotsString = "{0:.0f}%".format(100 * percentUniqueBots)

    # get percent mulligans used
    numMulUsed = 0
    for bot in npsim.get_bots():
        if bot.has_used_mulligan():
            numMulUsed += 1
    percentMulUsed = round(float(numMulUsed) / float(npsim.get_n()), 4)
    percentMulUsedString = "{0:.0f}%".format(100 * percentMulUsed)

    # for constructing "one item columns"
    enoughEmptyRows = ["" for i in range(npsim.get_n() - 1)]

    ## Create series that correspond to columns in output excel file;
    ## takes advantage of the fact that self.get_bots() is in sorted order
    npsim.get_bots().sort(key=lambda bot: bot.get_max_streak_length(), reverse=True)
    botS = Series([bot.get_index() for bot in npsim.get_bots()], name='Bot')
    maxStreakS = Series([bot.get_max_streak_length() for bot in npsim.get_bots()],
                        name='maxStreak')
    aveStreakS = Series([maxStreakS.mean()] + enoughEmptyRows, name="aveMaxStreak")
    uniqueBotS = Series([percentUniqueBotsString] + enoughEmptyRows, name='Unique Bots(%)')
    percentMulUsedS = Series([percentMulUsedString] + enoughEmptyRows, name='Mul Used (%)')

    ## construct dataframe to write to excel file
    df = concat([botS, maxStreakS, aveStreakS, uniqueBotS, percentMulUsedS], axis=1)

    ## put df info on excel buffer
    df.to_excel(writer, index=False, sheet_name='Bots Meta')
# look up by index
s2[["a", "d"]]
# or by numerical index
s2[[0, 2]]
s2.index

# use series to introduce time series data
dates = pd.date_range("2014-08-01", "2014-08-06")
dates
# must be the same length
temps1 = Series([80, 82, 85, 90, 83, 87], index=dates)
temps1
temps1.mean()
temps2 = Series([70, 75, 69, 83, 79, 77], index=dates)
temp_diffs = temps1 - temps2
temp_diffs
# or by date
temp_diffs["2014-08-03"]
# or by integer position
temp_diffs[2]

#####
# DataFrame
# multiple columns of heterogeneous data, but each column of the same type
#####
# Check for missingness
totaldf.isnull().sum()
# Lots of missing data for Smoking, Physical Activity, Malaria and HIV, so won't use them.
# 14 missing values in Life Expectancy, so will delete those rows.
# Will delete all non-complete rows once I get rid of the above 4 columns.
totaldf = totaldf.drop(['Smoking', 'PhysicalActivity', 'Malaria', 'HIV'], axis=1)
totaldf = totaldf.dropna()
totaldf.isnull().sum()
totaldf.shape

# Explore the data a bit
totaldf.ix[totaldf['AlcConsumption'].idxmax()]
totaldf.ix[totaldf['ImprovedWater'].idxmin()]
Series.mean(totaldf['Suicide'])
print(totaldf.loc[totaldf['Country'].isin(['Panama', 'Guatemala', 'Australia'])])

# Work out whether linear regression is appropriate
corr_df = totaldf.corr(method='pearson')
print("--------------- CORRELATIONS ---------------")
print(corr_df.head(corr_df.shape[1]))
s = corr_df.unstack()
so = DataFrame(s.sort_values(kind="quicksort"))
so.loc[(so[0] >= .8) & (so[0] < 1)]

import matplotlib.pyplot as plt
plt.hist(totaldf['LifeExpectancy'])
plt.title("Life Expectancy Histogram")
frame['e'].map(format)

## Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)
frame.sort_index(axis=1, ascending=True)
frame
obj = Series([4, 7, -3, 2])
obj.order()
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.order()
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_index(by='b')
frame.sort_index(by=['a', 'b'])  # sort by multiple columns
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.mean()
obj.rank(method='first')
obj.rank(ascending=False, method='max')
frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame
frame.rank(axis=1)
get_ipython().system(u'ls')
df = DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA
df.ix[4:, 2] = NA
print(df)
print('\n')
print(df.fillna(method='ffill'))
print('\n')
print(df.fillna(method='ffill', limit=2))
print('\n')

###############################################################
data = Series([1., NA, 3.5, NA, 7])
print(data.fillna(data.mean()))
print('\n')

###############################################################
data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
print(data)
print('\n')
print(data.index)
print('\n')
print(data['b'])
print('\n')
def adfTest(spread):
    ADF_p_value = ts.adfuller(spread, 1)[1]
    return ADF_p_value

ADF_p_value = adfTest(spread)

if ADF_p_value <= 0.05:
    print "The spread is likely mean-reverting."
else:
    print "The spread is not mean-reverting."

spread = Series(spread)
signalMean = spread.mean()
signalDev = spread.std()

openMult = 1.0
closeMult = 0.5
stopLossMult = 4.0

openSignal = signalDev * openMult
closeSignal = signalDev * closeMult
stopLossSignal = signalDev * stopLossMult

residSpread = spread - signalMean
residSpread.plot()

openSignalUp = openSignal * (residSpread * 0 + 1)
openSignalDown = -openSignal * (residSpread * 0 + 1)
test_data_class = test_data['class']
test_data = test_data[test_columns]
correct_num = 0
all_num = len(test_data_index)
for i in test_data_index:
    # print i, test_data.ix[i]
    if predict(test_data.ix[i], train_data, k) == test_data_class[i]:
        correct_num += 1
# use float division so the accuracy is not truncated to 0 or 1 in Python 2
knn_accuracy = float(correct_num) / all_num
print 'accuracy:', knn_accuracy
knn_accuracy_list.append(knn_accuracy)

knn_accuracy_series = Series(knn_accuracy_list)
print 'k: ', k, 'knn average accuracy:', knn_accuracy_series.mean()
average_result.append(knn_accuracy_series.mean())

print 'all average accuracy result for each k: ', average_result

fig, axes = plt.subplots(nrows=2, ncols=1)
fig.suptitle('KNN Result')
axes[0].set_title('simple line plot result')
axes[0].plot(k_list, average_result)
axes[0].set_xlabel('k')
axes[0].set_ylabel('Average Accuracy')
axes[0].tick_params(axis='x', labelsize=11)
axes[0].tick_params(axis='y', labelsize=11)
axes[1].set_title('scatter plot result')
axes[1].scatter(k_list, average_result)
account.userID, account.password
)
traderChannel

#%% fixed-length queue
from collections import deque
q = deque(maxlen=10)
for i in range(10):
    q.append(i)

#%%
from pandas import Series
s = Series(range(10))
s.mean()
s.std()

#%% export trading results to Excel
import os
os.chdir('/home/duhan/github/CTPTrader')
from comhelper import setDjangoEnvironment
setDjangoEnvironment()
from database.models import *
from django_pandas.io import read_frame
from pandas.io.excel import ExcelWriter
df = read_frame(ModelPosition.objects.filter(state='close'))
writer = ExcelWriter('/tmp/output.xls')
df.to_excel(writer)
writer.save()
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# ### Fill missing values with group-specific values
s = Series(np.random.randn(6))
s[::2] = np.nan
s
s.fillna(s.mean())

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
data.groupby(group_key).mean()
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
format = lambda x: '%.2f' % x
data['popm'].map(format)

# sort on values
data2 = data.sort_index(by='popm')
data2.head()
data2.tail()

# summary statistics
# d1990.sum()
data.describe()
data.std()

# how much did total population change between 1990 and 2010?
# TODO: how many people did the average congressperson represent in 1990?

# we could also represent a single variable as a series with hierarchical indexing
p = Series(data['pop'].values, index=[data['st'], data['yr']])
p['North Carolina']
p.mean(level='st')
# TODO: calculate standard deviation by year
p.swaplevel('st', 'yr')

# correlation
data['pop'].corr(data['ev'])

# estimate a linear model
model = pd.ols(y=data['ev'], x=data['popm'])
print model
model.beta