def globaldepth(coverage_hist):
    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    # Weighted statistics: each depth value (DP - 1) is weighted by the number
    # of base pairs (BPs) observed at that depth. `signif` (rounding digits)
    # and `depth_fraction` are defined elsewhere in the module.
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)
    global_depth = {}
    global_depth['mean_DP'] = round(weighted_stats.mean, signif)
    global_depth['median_DP'] = weighted_stats.quantile(0.5).values[0]
    global_depth['std_DP'] = round(weighted_stats.std, signif)
    global_depth['q25_DP'] = weighted_stats.quantile(0.25).values[0]
    global_depth['q75_DP'] = weighted_stats.quantile(0.75).values[0]
    global_depth['q95_DP'] = weighted_stats.quantile(0.95).values[0]
    # fraction of bases covered at or above each depth threshold
    for thr in (1, 10, 20, 30, 50, 100):
        global_depth[f'dp>={thr}'] = round(depth_fraction(coverage_hist, thr=thr),
                                           signif)
    return global_depth
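# `globaldepth` above depends on a `depth_fraction` helper and a module-level
# `signif` constant that are defined elsewhere. A minimal sketch of what such a
# helper could look like, assuming the histogram carries `DP` (depth) and `BPs`
# (bases at that depth) columns -- hypothetical, for illustration only:
def depth_fraction_sketch(coverage_hist, thr=1):
    # fraction of all bases covered at depth >= thr
    total = coverage_hist.BPs.sum()
    covered = coverage_hist.loc[coverage_hist.DP >= thr, 'BPs'].sum()
    return covered / total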
def trades_to_bar(ticks: pd.DataFrame, bar_trigger: str = 'fixed') -> dict:
    if not isinstance(ticks, pd.DataFrame):
        ticks = pd.DataFrame(ticks)
    bar = {'bar_trigger': bar_trigger}
    # time
    bar['open_at'] = ticks['utc_dt'].iloc[0]
    bar['close_at'] = ticks['utc_dt'].iloc[-1]
    bar['duration_td'] = bar['close_at'] - bar['open_at']
    # volume
    bar['tick_count'] = ticks.shape[0]
    bar['volume'] = ticks.volume.sum()
    bar['dollars'] = (ticks.volume * ticks.price).sum()
    # price
    bar['price_open'] = ticks.price.values[0]
    bar['price_close'] = ticks.price.values[-1]
    bar['price_low'] = ticks.price.min()
    bar['price_high'] = ticks.price.max()
    bar['price_range'] = bar['price_high'] - bar['price_low']
    bar['price_return'] = bar['price_close'] - bar['price_open']  # was close - close, always 0
    # volume-weighted price
    dsw = DescrStatsW(data=ticks.price, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['price_wq10'] = qtiles[0]
    bar['price_wq50'] = qtiles[1]
    bar['price_wq90'] = qtiles[2]
    bar['price_wq_range'] = bar['price_wq90'] - bar['price_wq10']
    bar['price_wmean'] = dsw.mean
    bar['price_wstd'] = dsw.std
    # jma
    bar['jma_open'] = ticks.jma.values[0]
    bar['jma_close'] = ticks.jma.values[-1]
    bar['jma_low'] = ticks.jma.min()
    bar['jma_high'] = ticks.jma.max()
    bar['jma_range'] = bar['jma_high'] - bar['jma_low']
    bar['jma_return'] = bar['jma_close'] - bar['jma_open']
    # volume-weighted jma
    dsw = DescrStatsW(data=ticks.jma, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['jma_wq10'] = qtiles[0]
    bar['jma_wq50'] = qtiles[1]
    bar['jma_wq90'] = qtiles[2]
    bar['jma_wq_range'] = bar['jma_wq90'] - bar['jma_wq10']
    bar['jma_wmean'] = dsw.mean
    bar['jma_wstd'] = dsw.std
    # tick/volume/dollar imbalance
    bar['tick_imbalance'] = ticks.side.sum()
    bar['volume_imbalance'] = (ticks.volume * ticks.side).sum()
    bar['dollar_imbalance'] = (ticks.volume * ticks.price * ticks.side).sum()
    return bar
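# A minimal usage sketch for `trades_to_bar`; the column names (`utc_dt`,
# `price`, `volume`, `jma`, `side`) come from the function body, but the
# values below are made up for illustration:
import pandas as pd

ticks = pd.DataFrame({
    'utc_dt': pd.to_datetime(['2021-01-04 14:30:00', '2021-01-04 14:30:01',
                              '2021-01-04 14:30:02']),
    'price': [100.0, 100.5, 100.2],
    'volume': [10, 20, 15],
    'jma': [100.1, 100.4, 100.3],
    'side': [1, -1, 1],  # +1 buy-initiated, -1 sell-initiated
})
bar = trades_to_bar(ticks, bar_trigger='fixed')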
def title_len_stat(mongo_db):
    len_counter_db = collections.Counter()
    len_counter_cr = collections.Counter()
    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        col = mongo_db[col_name]
        query_w_doi = col.find({'doi': {'$exists': True}})
        for doc in query_w_doi:
            if ('metadata' in doc and 'title' in doc['metadata']
                    and isinstance(doc['metadata']['title'], str)):
                len_counter_db[len(doc['metadata']['title'])] += 1
            if ('crossref_raw_result' in doc
                    and 'title' in doc['crossref_raw_result']
                    and isinstance(doc['crossref_raw_result']['title'], list)
                    and len(doc['crossref_raw_result']['title']) == 1):
                len_counter_cr[len(doc['crossref_raw_result']['title'][0])] += 1

    # stats for db titles
    sorted_len = sorted(len_counter_db.keys())
    weights = [len_counter_db[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_db')
    pprint(len_counter_db)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    # stats for crossref titles
    sorted_len = sorted(len_counter_cr.keys())
    weights = [len_counter_cr[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    # sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_cr')
    pprint(len_counter_cr)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    return len_counter_db, len_counter_cr
def weighted_percentiles(data, weights, percentiles):
    """Return the weighted percentiles.

    Args:
      data (np.ndarray): Bin variable (e.g. temperature, salinity)
      weights (np.ndarray): Weights (e.g. cell volume, area)
      percentiles (np.ndarray): Array of requested percentiles (e.g. 0-1 by 0.01)
    """
    assert percentiles.max() <= 1.0
    assert percentiles.min() >= 0.0
    wq = DescrStatsW(data=data, weights=weights)
    bin_edges = wq.quantile(probs=percentiles, return_pandas=False)
    # A manual method does not give clean results...
    # ix = np.argsort(data)
    # data = data[ix]        # sort data
    # weights = weights[ix]  # sort weights
    # cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights)  # 'like' a CDF function
    # perc = np.arange(0, 1.01, 0.01)
    # test2 = np.interp(perc, cdf, data)
    return bin_edges
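# Usage sketch for `weighted_percentiles` with made-up data:
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=1000)        # e.g. temperature
weights = rng.uniform(size=1000)    # e.g. cell volume
percentiles = np.arange(0, 1.01, 0.01)
bin_edges = weighted_percentiles(data, weights, percentiles)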
def globaldepth(coverage_hist):
    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)
    # How does this differ from coverage_hist.DP.mean()?
    global_depth = {}
    b, bases_20x, depth_20X = depth_fraction(coverage_hist, thr=20)
    global_depth['bases_totales'] = int(b)
    global_depth['mean_DP'] = round(weighted_stats.mean, signif)
    global_depth['median_DP'] = weighted_stats.quantile(0.5).values[0]
    # global_depth['std_DP'] = round(weighted_stats.std, signif)
    # global_depth['q25_DP'] = weighted_stats.quantile(0.25).values[0]
    # global_depth['q75_DP'] = weighted_stats.quantile(0.75).values[0]
    # global_depth['q95_DP'] = weighted_stats.quantile(0.95).values[0]
    # global_depth['dp>=1'] = round(depth_fraction(coverage_hist, thr=1), signif)
    # global_depth['dp>=10'] = round(depth_fraction(coverage_hist, thr=10), signif)
    global_depth['bases_20X'] = int(bases_20x)
    # global_depth['bases_20X(%)'] = 100 * (bases_20x / b)
    global_depth['dp>=20'] = round(depth_20X, 3)
    # global_depth['dp>=30'] = round(depth_fraction(coverage_hist, thr=30), signif)
    # global_depth['dp>=50'] = round(depth_fraction(coverage_hist, thr=50), signif)
    # global_depth['dp>=100'] = round(depth_fraction(coverage_hist, thr=100), signif)
    return global_depth
def describe_cluster(cluster, columns):
    """
    Generate descriptive statistics for a cluster.

    Parameters:
      cluster (DataFrame): A dataframe containing density information for
        every bin in the cluster
      columns (list of string): The names of the columns for which to
        generate statistics

    Returns:
      Series: All statistics for the selected columns
    """
    values = cluster.values
    # weight by DENSITY (unweighted when the cluster has a single row)
    dstats = DescrStatsW(values,
                         cluster["DENSITY"].values if len(values) > 1 else None)
    mean = dstats.mean
    std = dstats.std
    quantiles = dstats.quantile(0.5, return_pandas=False)
    result_columns = [[
        mean[i],
        std[i],
        std[i] / abs(mean[i]) * 100,
        cluster[columns[i]].min(),
        quantiles[0][i],
        cluster[columns[i]].max(),
    ] for i in range(len(columns))]
    result = list(itertools.chain(*result_columns)) + [
        cluster["DENSITY"].count(),
        cluster["DENSITY"].sum() * 100,
    ]
    value_columns = [[
        (col, "mean"),
        (col, "std"),
        (col, "varC (%)"),
        (col, "min"),
        (col, "median"),
        (col, "max"),
    ] for col in columns]
    index = list(itertools.chain(*value_columns)) + [
        ("DENSITY", "count"),
        ("DENSITY", "total"),
    ]
    return pd.Series(result, index=pd.MultiIndex.from_tuples(index))
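# Usage sketch for `describe_cluster`; the frame layout (feature columns
# followed by a DENSITY weight column) is inferred from the function body,
# and the values are made up:
import pandas as pd

cluster = pd.DataFrame({
    'VOLTAGE': [1.0, 1.1, 0.9],
    'CURRENT': [2.0, 2.2, 1.8],
    'DENSITY': [0.3, 0.5, 0.2],
})
stats = describe_cluster(cluster, ['VOLTAGE', 'CURRENT'])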
def output_new_bar(state):
    new_bar = {}
    if state['tick_count'] == 0:
        return new_bar
    new_bar['bar_trigger'] = state['trigger_yet?!']
    # time
    new_bar['open_epoch'] = state['trades']['epoch'][0]
    new_bar['close_epoch'] = state['trades']['epoch'][-1]
    new_bar['open_at'] = pd.to_datetime(state['trades']['epoch'][0], unit='ns')
    new_bar['close_at'] = pd.to_datetime(state['trades']['epoch'][-1], unit='ns')
    new_bar['duration_dt'] = new_bar['close_at'] - new_bar['open_at']
    new_bar['duration_sec'] = state['duration_sec']
    new_bar['duration_min'] = new_bar['duration_sec'] / 60
    # price
    new_bar['price_open'] = state['trades']['price'][0]
    new_bar['price_close'] = state['trades']['price'][-1]
    new_bar['price_low'] = state['price_min']
    new_bar['price_high'] = state['price_max']
    new_bar['price_mean'] = np.array(state['trades']['price']).mean()
    new_bar['price_std'] = np.array(state['trades']['price']).std()
    new_bar['price_q10'] = np.quantile(state['trades']['price'], q=0.1)
    new_bar['price_q50'] = np.quantile(state['trades']['price'], q=0.5)
    new_bar['price_q90'] = np.quantile(state['trades']['price'], q=0.9)
    new_bar['price_range'] = state['price_range']
    new_bar['bar_return'] = state['bar_return']
    # volume-weighted price
    dsw = DescrStatsW(data=state['trades']['price'],
                      weights=state['trades']['volume'])
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    new_bar['price_wq10'] = qtiles[0]
    new_bar['price_wq50'] = qtiles[1]
    new_bar['price_wq90'] = qtiles[2]
    new_bar['price_wmean'] = dsw.mean
    new_bar['price_wstd'] = dsw.std
    # tick/volume/dollar imbalance
    new_bar['tick_count'] = state['tick_count']
    new_bar['volume_sum'] = state['volume_sum']
    new_bar['dollar_sum'] = state['dollar_sum']
    new_bar['tick_imbalance'] = state['tick_imbalance']
    new_bar['volume_imbalance'] = state['volume_imbalance']
    new_bar['dollar_imbalance'] = state['dollar_imbalance']
    new_bar['tick_imbalance_run'] = state['tick_run']
    new_bar['volume_imbalance_run'] = state['volume_run']
    new_bar['dollar_imbalance_run'] = state['dollar_run']
    return new_bar
def abs_len_stat(mongo_db):
    len_counter = collections.Counter()
    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        col = mongo_db[col_name]
        query_w_doi = col.find({'abstract': {'$exists': True}})
        for doc in query_w_doi:
            # get abstract
            abstract = None
            if 'abstract' in doc and len(doc['abstract']) > 0:
                abstract = ''
                for fragment in doc['abstract']:
                    if ('text' in fragment
                            and isinstance(fragment['text'], str)
                            and len(fragment['text']) > 0):
                        abstract += fragment['text'].strip() + ' '
                abstract = abstract.strip()
                if len(abstract) == 0:
                    abstract = None
            if abstract is not None:
                # print(abstract)
                len_counter[len(abstract)] += 1

    # stats for db abstracts
    sorted_len = sorted(len_counter.keys())
    weights = [len_counter[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter')
    pprint(len_counter)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)
    return len_counter
def update_thrs(self):
    thrs = {}
    # first compute and collect thresholds
    for vs_type in self.vs_types:
        if self.require_wp_vs_others:
            # apply the loosest WP from the previous iteration
            taus = self.apply_wp_vs_others(vs_type)
        else:
            taus = self._taus
        weighted_score = DescrStatsW(
            data=np.array(taus[f'score_vs_{vs_type}'], dtype=np.float32),
            weights=np.array(taus['weight'], dtype=np.float32))
        thrs[vs_type] = weighted_score.quantile(probs=1 - self.tpr,
                                                return_pandas=False)
    # then update them in the class
    for vs_type, WPs in self.wp_definitions.items():
        for wp_cfg in WPs.values():
            idx = (self.tpr >= wp_cfg["eff"]).argmax()
            wp_cfg['thrs'].append(thrs[vs_type][idx])
def state_to_bar(state: dict) -> dict:
    new_bar = {}
    if state['stat']['tick_count'] < 11:
        return new_bar
    new_bar['bar_trigger'] = state['bar_trigger']
    # time
    new_bar['open_at'] = state['trades']['utc_dt'][0]
    new_bar['close_at'] = state['trades']['utc_dt'][-1]
    new_bar['duration_td'] = new_bar['close_at'] - new_bar['open_at']
    # volume
    new_bar['tick_count'] = state['stat']['tick_count']
    new_bar['volume'] = state['stat']['volume']
    new_bar['dollars'] = state['stat']['dollars']
    # price
    new_bar['price_open'] = state['trades']['price'][0]
    new_bar['price_close'] = state['trades']['price'][-1]
    new_bar['price_low'] = state['stat']['price_min']
    new_bar['price_high'] = state['stat']['price_max']
    new_bar['price_range'] = state['stat']['price_range']
    new_bar['price_return'] = state['stat']['price_return']
    # volume-weighted price
    dsw = DescrStatsW(data=state['trades']['price'],
                      weights=state['trades']['volume'])
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    new_bar['price_wq10'] = qtiles[0]
    new_bar['price_wq50'] = qtiles[1]
    new_bar['price_wq90'] = qtiles[2]
    new_bar['price_wq_range'] = new_bar['price_wq90'] - new_bar['price_wq10']
    new_bar['price_wmean'] = dsw.mean
    new_bar['price_wstd'] = dsw.std
    # jma
    new_bar['jma_open'] = state['trades']['jma'][0]
    new_bar['jma_close'] = state['trades']['jma'][-1]
    new_bar['jma_low'] = state['stat']['jma_min']
    new_bar['jma_high'] = state['stat']['jma_max']
    new_bar['jma_range'] = state['stat']['jma_range']
    new_bar['jma_return'] = state['stat']['jma_return']
    # volume-weighted jma
    dsw = DescrStatsW(data=state['trades']['jma'],
                      weights=state['trades']['volume'])
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    new_bar['jma_wq10'] = qtiles[0]
    new_bar['jma_wq50'] = qtiles[1]
    new_bar['jma_wq90'] = qtiles[2]
    new_bar['jma_wq_range'] = new_bar['jma_wq90'] - new_bar['jma_wq10']
    new_bar['jma_wmean'] = dsw.mean
    new_bar['jma_wstd'] = dsw.std
    # tick/volume/dollar imbalance
    new_bar['tick_imbalance'] = state['stat']['tick_imbalance']
    new_bar['volume_imbalance'] = state['stat']['volume_imbalance']
    new_bar['dollar_imbalance'] = state['stat']['dollar_imbalance']
    if False:  # disabled cross-checks against the raw trade lists
        new_bar['n_tick_count'] = len(state['trades']['price'])
        new_bar['n_volume'] = sum(state['trades']['volume'])
        new_bar['n_dollars'] = new_bar['price_wq50'] * new_bar['volume']
        new_bar['n_tick_imbalance'] = sum(state['trades']['side'])
        new_bar['n_open_at'] = state['trades']['utc_dt'][0]
        new_bar['n_close_at'] = state['trades']['utc_dt'][-1]
        # new_bar['n_volume_imbalance'] =
        # new_bar['n_dollar_imbalance'] =
    return new_bar
def test_weightstats_len_1():
    x1 = [1]
    w1 = [1]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0.0, 0.5, 1.0]) == 1).all()
def weighted(x):
    stats = DescrStatsW(x["quantity"], x["sold"])
    return {"median": stats.quantile(0.5)[0.5], "std": stats.std}
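# `weighted` is shaped for a pandas group-by; a sketch with made-up data,
# weighting each quantity by the number sold:
import pandas as pd

df = pd.DataFrame({
    'store': ['a', 'a', 'b', 'b'],
    'quantity': [1.0, 2.0, 3.0, 4.0],
    'sold': [10, 5, 2, 8],
})
per_store = df.groupby('store').apply(weighted)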
def tabulate_march_inequality(year):
    """
    For years 1964-2009 (year is the March year, not the earnings year), tabulate:

    These inequality metrics:
    - 90/50, 50/10, 90/10, Vln
    - 60/50, 70/50, 80/50, 95/50, 97/50
    - 50/3, 50/5, 50/20, 50/30, 50/40

    For these samples:
    - Males
    - Females
    - Both

    For these wage measures:
    - All hourly

    For these conditioning variables:
    - raw wage inequality
    - residual wage inequality

    Also note:
    - Always dropping allocators where possible

    D. Autor, 2/24/2004
    D. Autor, 6/15/2004 - Updated for consistency of controls for quantile
      simulation methods
    M. Anderson, 12/13/2005 - Updated for new quantiles and years
    D. Autor, 9/5/2006 - Updated for 2005 March
    M. Wasserman, 10/14/2009 - Updated for 2007/8 March
    """
    df = tabulate_march_basic(year)
    df = df.eval("""
    lnwinc = log(winc_ws) + log(gdp)
    lnhinc = log(hinc_ws) + log(gdp)
    """)

    # Full-time and hourly samples
    df = df.eval("ftfy = fulltime*fullyear")
    df.ftfy.describe().to_frame().T
    df = df.eval("""
    ftsamp = (lnwinc == lnwinc) * ftfy * abs(bcwkwgkm-1)
    hrsamp = (lnhinc == lnhinc) * abs(bchrwgkm-1)
    """)
    # @ ftsamp: weekly real wage not NaN + ftfy + above the weekly real wage limit
    # @ hrsamp: hourly real wage not NaN + above the hourly real wage limit
    df.loc[df.ftsamp == 0, "lnwinc"] = np.nan
    df.loc[df.hrsamp == 0, "lnhinc"] = np.nan
    df.query("ftsamp == 1")["lnwinc"].describe().to_frame().T
    df.query("hrsamp == 1")["lnhinc"].describe().to_frame().T
    df = df.query("ftsamp == 1 | hrsamp == 1")

    # Generate experience categories
    df = df.assign(expcat=(df.exp/3).astype(int) + 1)
    df.loc[df.expcat == 17, "expcat"] = 16
    assert df.eval("1 <= expcat <= 16").all()
    df.groupby("expcat")["exp"].agg(["mean", "min", "max"])

    # interaction terms - 80 of these
    # @ moved to the residual wage part
    # Drop the reference group's interaction term: HSG with 0-2 years of experience
    # @ similarly skipped here
    df = df.filter(["year", "wgt", "wgt_hrs", "female", "lnwinc", "lnhinc",
                    "hrsamp", "ftsamp", "edcat", "expcat"])

    ######################################################################
    # Summarize raw inequality
    ######################################################################
    pctiles = pd.Series([3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 97])
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)
    tot_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_mf"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_m"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_f"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_mf"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_m"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_f"] = [wq.mean, wq.var]

    df_stat = pd.concat([tot_stat, tot_pct], axis=0, sort=False)
    ######################################################################
    # Summarize residual inequality - Weekly & Hourly
    ######################################################################
    res_pct = pd.DataFrame(index=pctiles)
    res_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    y, X = dmatrices('lnwinc ~ female + C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_mf"] = [wq.mean, wq.var]
    # @ the mean is not needed here, but kept for consistency
    res_pct["res_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_m"] = [wq.mean, wq.var]
    res_pct["res_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_f"] = [wq.mean, wq.var]
    res_pct["res_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1")
    y, X = dmatrices('lnhinc ~ female + C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_mf"] = [wq.mean, wq.var]
    res_pct["res_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==0")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_m"] = [wq.mean, wq.var]
    res_pct["res_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==1")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_f"] = [wq.mean, wq.var]
    res_pct["res_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    df_stat_ = pd.concat([res_stat, res_pct], axis=0)
    df_stat = pd.concat([df_stat, df_stat_], axis=1)
    # march-ineq-data-`1'
    df_stat = df_stat.T.rename_axis('sample').reset_index().assign(year=year)  # @ tidy data

    ######################################################################
    # Percentiles of weekly earnings
    ######################################################################
    # @ simply generate more percentiles under the full-time samples
    # @ note: `year` here is the March census year, so subtract one to get
    #   the earnings year
    pctiles = pd.Series(range(3, 98))
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    # march-pctile-`yr'
    tot_pct = tot_pct.T.rename_axis('sample').reset_index().assign(year=year-1)  # @ tidy data
    # @ the original code then combines the 1963-2008 generated files; we drop
    #   that here as it is not clearly necessary
    # @ this part could actually be merged with the "Summarize raw inequality"
    #   section above
    return df_stat, tot_pct
def getWeightedMeanQuantiles(feature, weights):
    weighted_stats = DescrStatsW(feature, weights=weights, ddof=0)
    return weighted_stats.mean, weighted_stats.quantile([0.25, 0.50, 0.75],
                                                        return_pandas=False)
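# Example call with made-up data; equal weights reduce to the unweighted
# mean and quartiles:
import numpy as np

mean, (q25, q50, q75) = getWeightedMeanQuantiles(np.arange(10.0), np.ones(10))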
# is selected or a blank value, then the algorithm will replace with an
# array of 1's with length equal to the endog.
# WARNING: Using weights is not verified yet for all possible options
# and results, see Notes.

# In[182]:

df1 = pd.DataFrame({'x': range(1, 101), 'wt': range(1, 101)})

from statsmodels.stats.weightstats import DescrStatsW

wdf = DescrStatsW(df1.x, weights=df1.wt, ddof=1)
print('without weight, the mean value is: ', np.mean(df1.x))
print('with weight, the mean value is: ', wdf.mean)
print(wdf.std)
print(wdf.quantile([0.25, 0.50, 0.75]))

# In[201]:

# 'COVIDFOL_W66. How closely have you been following news about the outbreak
# of the coronavirus'
import statsmodels.api as sm

X = sm.add_constant(df[['F_INCOME', 'F_AGECAT', 'edu', 'republic',
                        'COVIDCOVER1_W66', 'MH_TRACK_a_W66', 'MH_TRACK_b_W66',
                        'MH_TRACK_d_W66', 'MH_TRACK_e_W66']])
y = df['COVIDFOL_W66']
# sm.OLS takes no weights argument; WLS is the weighted least-squares model
reg = sm.WLS(y, X, weights=df['WEIGHT_W66'])
results = reg.fit()
reg1 = sm.GLM(y, X)
results1 = reg1.fit()
reg2 = sm.GLM(y, X, freq_weights=df['WEIGHT_W66'])
# ## Compute global statistics

bases_without_reads = depth_fraction(coverage, ZeroDepth=True)
bases_greater1 = depth_fraction(coverage, thr=1)
bases_greater10 = depth_fraction(coverage, thr=10)
bases_greater20 = depth_fraction(coverage, thr=20)
bases_greater30 = depth_fraction(coverage, thr=30)

# use weighted statistics, weighting by the length of each constant-depth interval
weighted_stats = DescrStatsW(coverage['count'],
                             weights=coverage.count_length,
                             ddof=0)

global_depth = {}
global_depth['mean_DP'] = round(weighted_stats.mean, signif)
global_depth['median_DP'] = weighted_stats.quantile(0.5).values[0]
global_depth['std_DP'] = round(weighted_stats.std, signif)
global_depth['q25_DP'] = weighted_stats.quantile(0.25).values[0]
global_depth['q75_DP'] = weighted_stats.quantile(0.75).values[0]
global_depth['q95_DP'] = weighted_stats.quantile(0.95).values[0]
global_depth['dp>=1'] = bases_greater1
global_depth['dp>=10'] = bases_greater10
global_depth['dp>=20'] = bases_greater20
global_depth['dp>=30'] = bases_greater30
# global_depth['DP=0'] = round(bases_without_reads, signif)

res = pd.Series(global_depth).to_frame()
res.columns = [sample]
global_statistics = res.loc[[u'dp>=1', u'dp>=10', u'dp>=20', u'dp>=30',
def test_weightstats_2d_w1():
    x1 = [[1], [2]]
    w1 = [[1], [2]]
    d1 = DescrStatsW(x1, w1)
    print(len(np.array(w1).shape))
    assert (d1.quantile([0.5, 1.0]) == 2).all().all()
def test_weightstats_2d_w2():
    x1 = [[1]]
    w1 = [[1]]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0, 0.5, 1.0]) == 1).all().all()
    weights.append(1)  # last is the last minutes before * 0.5

# start analysis
allPrint(latestFileHandler, "Analysis for " + item)
allPrint(latestFileHandler, "=============" + "=" * len(item))
allPrint(latestFileHandler, "Data count: " + str(len(times)))
stats = DescrStatsW(prices, weights)
allPrint(latestFileHandler,
         "Weighted Average Price: " + "{:,.2f}".format(stats.mean))
allPrint(latestFileHandler,
         "Weighted Stdev: " + "{:,.2f}".format(stats.std))
allPrint(latestFileHandler, )
allPrint(latestFileHandler, "Percentiles:")
allPrint(latestFileHandler,
         "5% : " + "{:,.2f}".format(stats.quantile(0.05, False)[0]))
allPrint(latestFileHandler,
         "15% : " + "{:,.2f}".format(stats.quantile(0.15, False)[0]))
allPrint(latestFileHandler,
         "50% : " + "{:,.2f}".format(stats.quantile(0.50, False)[0]))
allPrint(latestFileHandler,
         "85% : " + "{:,.2f}".format(stats.quantile(0.85, False)[0]))
allPrint(latestFileHandler,
         "95% : " + "{:,.2f}".format(stats.quantile(0.95, False)[0]))
allPrint(latestFileHandler, )
profits[item] = (round(stats.quantile(tresholdPercentile, False)[0], 2)
                 - round(prices[-1], 2))
profitsPercent[item] = (round(stats.quantile(tresholdPercentile, False)[0], 2)
                        - round(prices[-1], 2)) / prices[-1]
allPrint(latestFileHandler,
def describe_cluster(cluster_df, features, weight_column, oven_refills):
    """
    Create the statistics for a cluster. Data points that are part of a
    breakdown period are excluded.

    Parameters
    ----------
    cluster_df : DataFrame
        A dataframe that contains all points of the cluster you want to
        describe.
    features : list of source features
        All source features for which the statistics should be generated
    weight_column : string
        Name of the column to use for weighting data points, typically
        `datapoint_duration` (``ProcessingFeatures.DATAPOINT_DURATION``)
    oven_refills : list of timestamp
        End of the oven refill periods

    Returns
    -------
    Series
        A Series of the following statistics

        For each parameter in `features`:

        1. mean
        2. std
        3. std% (std in percent of mean)
        4. avg_dev (average deviation from the mean)
        5. min
        6. 25% (lower quartile)
        7. median
        8. 75% (upper quartile)
        9. max

        Once for the cluster:

        10. Density/count (number of data points in the cluster)
        11. Duration/in_hours (total duration of cluster)
        12. Duration/longest (duration of longest fragment)
        13. Duration/num_splits (number of fragments)
        14. Refill/index (index of the oven refill that came directly before
            the beginning of the longest fragment)
        15. Refill/delta_in_hours (delta from the end of the closest oven
            refill)
        16. num_breakdowns/per_hour (number of breakdowns per hour)
    """
    values = ["mean", "std", "std%", "avg_dev", "min", "25%", "median",
              "75%", "max"]
    index = pd.MultiIndex.from_tuples(
        [(p, v) for p in features for v in values]
        + [
            ("DENSITY", "count"),
            ("DURATION", "in_hours"),
            ("DURATION", "longest_in_hours"),
            ("DURATION", "num_splits"),
            ("REFILL", "index"),
            ("REFILL", "delta_in_hours"),
            ("num_breakdowns", "per_hour"),
        ]
    )
    data = cluster_df.loc[
        (cluster_df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0), features
    ].values  # TODO maybe only include non-breakdown here???
    weights = cluster_df.loc[
        (cluster_df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0), weight_column
    ].values
    if data.size == 0:
        return None

    stats = DescrStatsW(data, weights, ddof=1)
    mean = np.array(stats.mean)  # np.mean(data, axis=0)
    std = np.array(stats.std)  # np.std(data, axis=0)
    quantiles = stats.quantile([0, 0.25, 0.5, 0.75, 1], return_pandas=False)
    # np.quantile(data, [0, 0.25, 0.5, 0.75, 1], axis=0)
    avg_dev = np.dot(weights, np.absolute(data - mean)) / np.sum(weights)
    count = len(data)

    duration_in_seconds = cluster_df[ProcessingFeatures.DATAPOINT_DURATION].sum()
    duration_in_hours = duration_in_seconds / 3600
    (
        duration_longest_start,
        duration_longest,
        duration_num_splits,
    ) = get_cluster_duration(cluster_df, weight_column)
    duration_longest /= 3600

    # find the last oven refill that ended before the longest fragment began
    closest_refill = None
    for i, refill in reversed(list(enumerate(oven_refills))):
        if duration_longest_start > refill:
            closest_refill = i
            break
    refill_delta = -1
    if closest_refill is not None:
        refill_delta = (
            pd.Timestamp(duration_longest_start) - oven_refills[closest_refill]
        ).total_seconds() / 3600

    description = [
        [
            mean[i],
            std[i],
            np.abs(std[i] / mean[i]) * 100,
            avg_dev[i],
            quantiles[0][i],
            quantiles[1][i],
            quantiles[2][i],
            quantiles[3][i],
            quantiles[4][i],
        ]
        for i in range(len(features))
    ]
    description = [item for sublist in description for item in sublist]
    description.append(count)
    description.append(duration_in_hours)
    description.append(duration_longest)
    description.append(duration_num_splits)
    description.append(closest_refill)
    description.append(refill_delta)
    description.append(
        cluster_df.loc[
            cluster_df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0,
            ProcessingFeatures.HT_SPARKS_COUNTER,
        ].nunique()
        / duration_in_hours
    )
    return pd.Series(description, index=index)