def test_weightstats_ddof_tests(self):
    """Check that ttest_mean and tconfint_mean do not depend on the user ddof.

    One-sample case: three ``DescrStatsW`` instances are built from the same
    data and weights and differ only in ``ddof`` (0, 1, 2).  The ``ddof=0``
    instance is the reference the other two are compared against.
    """
    data, wts = self.x1_2d, self.w1
    reference, *others = [
        DescrStatsW(data, weights=wts, ddof=k) for k in (0, 1, 2)
    ]

    # ttest_mean with the default null value and with an explicit 0.5;
    # np.r_ concatenates each result tuple into one flat array
    for args in ((), (0.5,)):
        expected = np.r_[reference.ttest_mean(*args)]
        for other in others:
            assert_almost_equal(np.r_[other.ttest_mean(*args)], expected, 14)

    # confidence interval of the mean
    expected = reference.tconfint_mean()
    for other in others:
        assert_almost_equal(other.tconfint_mean(), expected, 14)
def weighted_stat(stock_trading_df):
    """Summarise the trades in ``stock_trading_df`` as a labelled Series.

    Parameters
    ----------
    stock_trading_df : pandas.DataFrame
        Must provide the columns ``price`` and ``turnover``; ``volume`` is
        additionally read when there is more than one row.

    Returns
    -------
    pandas.Series
        Entries ``price_var``, ``price_std``, ``price_mean`` (weighted by
        ``volume``), ``price_min``, ``price_max``, ``no_of_txn`` and
        ``turnover`` (sum of the column).  For a single-row frame the
        variance and standard deviation are reported as 0 and the count as 1.
    """
    labels = [
        'price_var', 'price_std', 'price_mean', 'price_min', 'price_max',
        'no_of_txn', 'turnover'
    ]
    price = stock_trading_df['price']
    turnover = stock_trading_df['turnover']

    if stock_trading_df.shape[0] == 1:
        # Positional access: the only row is not guaranteed to carry the
        # index label 0 (e.g. when the frame is a slice of a larger one).
        only_price = price.iloc[0]
        return pd.Series(
            [0, 0, only_price, only_price, only_price, 1, turnover.iloc[0]],
            index=labels)

    # Build the volume-weighted statistics once and reuse them.
    weighted = DescrStatsW(price, stock_trading_df['volume'])
    return pd.Series([
        weighted.var,
        weighted.std,
        weighted.mean,
        min(price),
        max(price),
        DescrStatsW(price).nobs,
        sum(turnover),
    ], index=labels)
def test_ztest_ztost():
    """Compare weightstats z-tests with the separately tested proportion
    ztest / ztost functions."""
    import statsmodels.stats.proportion as smprop

    values, counts = [0, 1], [5, 15]
    stats_raw = DescrStatsW(values, counts)

    # one-sample z-test with the raw counts as weights: loose tolerance only
    expected = smprop.proportions_ztest(15, 20., value=0.5)
    assert_allclose(stats_raw.ztest_mean(0.5), expected, rtol=0.03,
                    atol=0.003)

    # with the weights rescaled by 21/20 the results must agree to 12 decimals
    stats_scaled = DescrStatsW(values, np.array(counts) * 21. / 20)
    assert_almost_equal(stats_scaled.ztest_mean(0.5), expected, decimal=12)

    # equivalence test: only the first element of each result is compared
    tost = stats_scaled.ztost_mean(0.4, 0.6)
    tost_expected = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(tost[0], tost_expected[0], decimal=12)

    # two-sample comparison against the chi-square test of proportions
    stats_other = DescrStatsW([0, 1], [10, 10])
    two_sample = ztest(stats_raw.asrepeats(), stats_other.asrepeats())
    chisq = smprop.proportions_chisquare(np.asarray([15, 10]),
                                         np.asarray([20., 20]))
    # TODO: check whether this difference is expected, see test_proportion
    assert_allclose(two_sample[1], chisq[1], rtol=0.03)

    via_compare = CompareMeans(stats_raw, stats_other).ztest_ind()
    assert_allclose(via_compare[1], chisq[1], rtol=0.03)
    assert_almost_equal(via_compare, two_sample, decimal=12)
def setup_class(cls):
    """Store two unweighted samples, their DescrStatsW wrappers and the
    CompareMeans object built from them on the class."""
    sample1 = np.array(
        [7.8, 6.6, 6.5, 7.4, 7.3, 7., 6.4, 7.1, 6.7, 7.6, 6.8])
    sample2 = np.array([4.5, 5.4, 6.1, 6.1, 5.4, 5., 4.1, 5.5])
    stats1 = DescrStatsW(sample1)
    stats2 = DescrStatsW(sample2)

    cls.x1, cls.x2 = sample1, sample2
    cls.d1, cls.d2 = stats1, stats2
    cls.cm = CompareMeans(stats1, stats2)
def test_weightstats_2(self):
    """Compare weighted statistics with the same statistics computed on the
    expanded sample returned by ``asrepeats`` (1-d data)."""
    x1, x2 = self.x1, self.x2
    w1, w2 = self.w1, self.w2

    plain1 = DescrStatsW(x1)
    weighted1 = DescrStatsW(x1, weights=w1)
    weighted2 = DescrStatsW(x2, weights=w2)
    rep1 = weighted1.asrepeats()
    rep2 = weighted2.asrepeats()

    # two-sample t-test: weighted version vs scipy on the expanded samples
    assert_almost_equal(
        ttest_ind(x1, x2, weights=(w1, w2))[:2],
        stats.ttest_ind(rep1, rep2), 14)

    # moments; the np.cov comparison is for the 1-d case
    # TODO: corrcoef is not compared here, it raises in the scalar case
    moment_pairs = (
        (rep2.mean(0), weighted2.mean),
        (rep2.var(), weighted2.var),
        (rep2.std(), weighted2.std),
        (np.cov(rep2, bias=1), weighted2.cov),
    )
    for actual, expected in moment_pairs:
        assert_almost_equal(actual, expected, 14)

    # one-sample t-tests, unweighted and weighted
    assert_almost_equal(plain1.ttest_mean(3)[:2],
                        stats.ttest_1samp(x1, 3), 11)
    assert_almost_equal(weighted1.ttest_mean(3)[:2],
                        stats.ttest_1samp(rep1, 3), 11)
def test_weightstats_3(self):
    """Same comparison as the 1-d test, for 2-d data: weighted statistics
    must match those of the expanded samples from ``asrepeats``."""
    w1, w2 = self.w1, self.w2
    weighted1 = DescrStatsW(self.x1_2d, weights=w1)
    weighted2 = DescrStatsW(self.x2_2d, weights=w2)
    rep1 = weighted1.asrepeats()
    rep2 = weighted2.asrepeats()

    # column-wise moments, covariance and correlation
    moment_pairs = (
        (rep2.mean(0), weighted2.mean),
        (rep2.var(0), weighted2.var),
        (rep2.std(0), weighted2.std),
        (np.cov(rep2.T, bias=1), weighted2.cov),
        (np.corrcoef(rep2.T), weighted2.corrcoef),
    )
    for actual, expected in moment_pairs:
        assert_almost_equal(actual, expected, 14)

    # one-sample t-test; only statistic and p-value are compared with scipy
    tstat, pvalue, _ = weighted1.ttest_mean(3)
    assert_almost_equal([tstat, pvalue], stats.ttest_1samp(rep1, 3), 11)

    # two-sample t-test through CompareMeans
    from_sm = CompareMeans(weighted1, weighted2).ttest_ind()
    from_scipy = stats.ttest_ind(rep1, rep2)
    assert_almost_equal(from_sm[:2], from_scipy, 14)
def fit(self):
    """Compute the doubly robust estimates from the fitted nuisance models.

    Reads the exposure and outcome columns plus the ``_ps_``, ``_pY1_`` and
    ``_pY0_`` columns of ``self.df`` and stores the results on the instance.

    Returns
    -------
    None.  Without weights the instance gains either
    ``average_treatment_effect`` (with ``_se`` and ``_ci``) for a continuous
    outcome, or ``risk_difference`` (with ``_se`` and ``_ci``) and
    ``risk_ratio`` otherwise.  With weights only the point estimates are set.

    Raises
    ------
    ValueError
        If ``_fit_exposure_`` or ``_fit_outcome_`` is ``False``.
    """
    if self._fit_exposure_ is False or self._fit_outcome_ is False:
        raise ValueError(
            'The exposure and outcome models must be specified before the doubly robust estimate can '
            'be generated')

    frame = self.df
    treated = frame[self.exposure] == 1
    outcome = frame[self.outcome]
    propensity = frame['_ps_']
    pred_y1 = frame['_pY1_']
    pred_y0 = frame['_pY0_']

    # Doubly robust pseudo-outcome under "all treated"
    dr_a1 = np.where(
        treated,
        (outcome / propensity) - ((pred_y1 * (1 - propensity)) / propensity),
        pred_y1)
    # Doubly robust pseudo-outcome under "all untreated"
    dr_a0 = np.where(
        treated,
        pred_y0,
        (outcome / (1 - propensity)
         - ((pred_y0 * propensity) / (1 - propensity))))

    zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

    if self._weight_ is not None:
        # Weighted branch: point estimates only, no standard error or CI
        w = frame[self._weight_]
        mean_a1 = DescrStatsW(dr_a1, weights=w).mean
        mean_a0 = DescrStatsW(dr_a0, weights=w).mean
        if self._continuous_outcome:
            self.average_treatment_effect = mean_a1 - mean_a0
        else:
            self.risk_difference = mean_a1 - mean_a0
            self.risk_ratio = mean_a1 / mean_a0
        return

    mean_a1, mean_a0 = np.mean(dr_a1), np.mean(dr_a0)
    effect = mean_a1 - mean_a0
    # variance of the centred pseudo-outcome difference, divided by n
    var_ic = np.var((dr_a1 - dr_a0) - effect, ddof=1) / frame.shape[0]
    std_err = np.sqrt(var_ic)
    interval = [effect - zalpha * std_err, effect + zalpha * std_err]

    if self._continuous_outcome:
        self.average_treatment_effect = effect
        self.average_treatment_effect_se = std_err
        self.average_treatment_effect_ci = interval
    else:
        self.risk_difference = effect
        self.risk_ratio = mean_a1 / mean_a0
        self.risk_difference_se = std_err
        self.risk_difference_ci = interval
def get_angle(self, flip = True):
    """Combine the inclination measurements into angle estimates on ``self``.

    For each side (``'r'`` and ``'l'``) the columns ``inc_<side>`` and
    ``inc_flip_<side>`` returned by ``get_angles()`` are averaged (three
    equally weighted entries each) and combined into ``phi_<side>`` with an
    error.  From these the method sets ``phi_*_mu`` / ``phi_*_std``, the two
    ``dphi`` estimates (``_g`` from the angle difference, ``_a`` from
    ``self.a_dict``), their weighted combination ``dphi_mu`` / ``dphi_std``,
    and finally calls ``self.get_chi2`` on the two estimates.

    NOTE(review): the ``flip`` argument is never read by this method.
    """
    angles = get_angles()

    def mean_with_error(column):
        # weighted mean of one column and std / sqrt(3) as error of the mean
        err = 1
        described = DescrStatsW(
            angles[[column]], weights=1 / (np.array([err, err, err]))**2)
        return described.mean, described.std / np.sqrt(3)

    angle_dict = dict()
    for a in ['r', 'l']:
        phi_mean, phi_mean_err = mean_with_error(f'inc_{a}')
        flip_mean, flip_mean_err = mean_with_error(f'inc_flip_{a}')
        angle_dict[f'phi_{a}'] = [
            (phi_mean + flip_mean) / 2,
            np.sqrt(phi_mean_err**2 + flip_mean_err**2) / 2,
        ]

    self.phi_r_mu, self.phi_r_std = angle_dict['phi_r']
    self.phi_l_mu, self.phi_l_std = angle_dict['phi_l']

    # estimate "g": half the difference of the two side angles
    self.dphi_mu_g = (self.phi_r_mu - self.phi_l_mu) / 2
    self.dphi_std_g = np.sqrt(self.phi_r_std**2 + self.phi_l_std**2)
    self.dphi_mu_err_g = self.dphi_std_g / np.sqrt(2) + 0.05

    # estimate "a": from a_L, a_R and the angle of the configured orientation
    oriented_phi, oriented_phi_err = angle_dict[
        f'phi_{self.orientation.lower()}']
    x = np.radians(oriented_phi)
    L = self.a_dict['a_L'][0]
    R = self.a_dict['a_R'][0]
    sin_x, cos_x = np.sin(x), np.cos(x)
    self.dphi_mu_a = ((L - R) * sin_x) / ((L + R) * cos_x)

    # the three squared contributions that are summed into dphi_std_a
    first_term = sin_x / ((L + R) * cos_x)
    shared_term = (L - R) * sin_x / ((L + R)**2 * cos_x)
    err1 = (first_term - shared_term)**2 * self.phi_l_std**2
    err2 = (-first_term - shared_term)**2 * self.phi_r_std**2
    err3 = ((L - R) / (L + R)
            + (L - R) * sin_x**2 / ((L + R) * cos_x**2))**2 * oriented_phi_err**2
    self.dphi_std_a = np.sqrt(err1 + err2 + err3) + 0.05

    # inverse-variance weighted combination of the two estimates
    combined_dphi = DescrStatsW(
        [self.dphi_mu_g, self.dphi_mu_a],
        weights=1 / (np.array([self.dphi_mu_err_g, self.dphi_std_a]))**2)
    self.dphi_mu = combined_dphi.mean
    self.dphi_std = combined_dphi.std / np.sqrt(2)

    self.get_chi2(f'dphi_{self.orientation}',
                  np.array([self.dphi_mu_a, self.dphi_mu_g]),
                  np.array([self.dphi_std_a, self.dphi_mu_err_g]))
def _similarity_helper_limited(user1_id, user2_id, solr):
    """Print the unweighted and weighted correlation of two users' vectors.

    The vectors and the weight vector come from ``get_vector_limited``.
    Nothing is returned; both coefficients are only printed.

    NOTE(review): ``solr`` is accepted but not used by this function.
    """
    user1_vector, user2_vector, weight_vector = get_vector_limited(
        user1_id, user2_id)
    data = column_stack((user1_vector, user2_vector))

    # The correlation matrix is the ``corrcoef`` attribute; a DescrStatsW
    # instance itself cannot be subscripted.
    result = DescrStatsW(data).corrcoef[1][0]
    result2 = DescrStatsW(data, weights=weight_vector).corrcoef[1][0]

    print('Pearson similarity ' + str(user2_id) + ' ' + str(result))
    print('DescrStatsW ' + str(user2_id) + ' ' + str(result2))
def trades_to_bar(ticks: pd.DataFrame, bar_trigger: str='fixed') -> dict:
    """Aggregate a batch of ticks into a single bar dictionary.

    Parameters
    ----------
    ticks : pandas.DataFrame or anything ``pd.DataFrame`` accepts
        Needs the columns ``utc_dt``, ``volume``, ``price``, ``jma`` and
        ``side``.
    bar_trigger : str
        Stored unchanged under the ``bar_trigger`` key.

    Returns
    -------
    dict
        Time span, tick/volume/dollar totals, open/close/low/high/range/
        return and volume-weighted quantiles, mean and std for both ``price``
        and ``jma``, plus tick/volume/dollar imbalance.
    """
    if not isinstance(ticks, pd.DataFrame):
        ticks = pd.DataFrame(ticks)

    bar = {'bar_trigger': bar_trigger}

    # time
    bar['open_at'] = ticks['utc_dt'].iloc[0]
    bar['close_at'] = ticks['utc_dt'].iloc[-1]
    bar['duration_td'] = bar['close_at'] - bar['open_at']

    # volume
    bar['tick_count'] = ticks.shape[0]
    bar['volume'] = ticks.volume.sum()
    bar['dollars'] = (ticks.volume * ticks.price).sum()

    # price and jma share the same set of statistics
    for col in ('price', 'jma'):
        series = ticks[col]
        bar[f'{col}_open'] = series.values[0]
        bar[f'{col}_close'] = series.values[-1]
        bar[f'{col}_low'] = series.min()
        bar[f'{col}_high'] = series.max()
        bar[f'{col}_range'] = bar[f'{col}_high'] - bar[f'{col}_low']
        # return is close minus open (price_return used to subtract the
        # close from itself and was therefore always zero)
        bar[f'{col}_return'] = bar[f'{col}_close'] - bar[f'{col}_open']

        # volume-weighted distribution of the column
        dsw = DescrStatsW(data=series, weights=ticks.volume)
        qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
        bar[f'{col}_wq10'] = qtiles[0]
        bar[f'{col}_wq50'] = qtiles[1]
        bar[f'{col}_wq90'] = qtiles[2]
        bar[f'{col}_wq_range'] = bar[f'{col}_wq90'] - bar[f'{col}_wq10']
        bar[f'{col}_wmean'] = dsw.mean
        bar[f'{col}_wstd'] = dsw.std

    # tick/vol/dollar imbalance
    bar['tick_imbalance'] = ticks.side.sum()
    bar['volume_imbalance'] = (ticks.volume * ticks.side).sum()
    bar['dollar_imbalance'] = (ticks.volume * ticks.price * ticks.side).sum()
    return bar
def title_len_stat(mongo_db):
    """Count title lengths over the paper collections and print statistics.

    Only collections listed in ``PAPER_COLLECTIONS`` and only documents that
    have a ``doi`` field are visited.  Two counters are filled: one for
    ``metadata.title`` (when it is a string) and one for the single entry of
    ``crossref_raw_result.title`` (when it is a one-element list).  For each
    counter the weighted mean, std and a fixed set of quantiles are printed;
    a seaborn bar plot is drawn for the first counter only.

    Returns
    -------
    tuple of collections.Counter
        ``(len_counter_db, len_counter_cr)`` mapping title length to count.
    """
    len_counter_db = collections.Counter()
    len_counter_cr = collections.Counter()

    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        for doc in mongo_db[col_name].find({'doi': {'$exists': True}}):
            if ('metadata' in doc and 'title' in doc['metadata']
                    and isinstance(doc['metadata']['title'], str)):
                len_counter_db[len(doc['metadata']['title'])] += 1
            if ('crossref_raw_result' in doc
                    and 'title' in doc['crossref_raw_result']
                    and isinstance(doc['crossref_raw_result']['title'], list)
                    and len(doc['crossref_raw_result']['title']) == 1):
                cr_title = doc['crossref_raw_result']['title'][0]
                len_counter_cr[len(cr_title)] += 1

    def report(counter, label, draw):
        # lengths act as data, their counts as frequency weights
        lengths = sorted(counter.keys())
        counts = [counter[length] for length in lengths]
        weighted_stats = DescrStatsW(lengths, weights=counts)
        if draw:
            sns.barplot(lengths, counts)
        percentile = weighted_stats.quantile(probs=[
            0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95,
            0.97, 0.99
        ])
        print(label)
        pprint(counter)
        print('weighted_stats.mean', weighted_stats.mean)
        print('weighted_stats.std', weighted_stats.std)
        print('percentile')
        print(percentile)

    report(len_counter_db, 'len_counter_db', draw=True)
    report(len_counter_cr, 'len_counter_cr', draw=False)
    return len_counter_db, len_counter_cr
def angular_linear_correlation(self, angles, data, weights=None, double_peak=False):
    """
    Compute a (weighted) angular-linear correlation between angles and data.

    When the data are expected to have two symmetrical, opposite peaks (e.g.
    a non-direction selective effect such as horizontal vs. vertical instead
    of up vs. down), double_peak should be set to True.

    :param angles: input angles
    :type angles: 1-D array
    :param data: input data
    :type data: 1-D array, same shape as angles
    :param weights: weights to use for correlation; all ones when None
    :type weights: 1-D array, same shape as angles
    :param double_peak: when True, angles are first passed through
        ``self.collapse_angles_symmetrically``
    :type double_peak: bool
    :return corr: circular correlation
    :type corr: float
    """
    if weights is None:
        weights = np.ones_like(angles)
    if double_peak:
        # expected periodicity: rescale the angular data first
        angles = self.collapse_angles_symmetrically(angles)

    def weighted_r(first, second):
        # weighted Pearson correlation between two 1-D series
        paired = np.vstack([first, second]).T
        return DescrStatsW(data=paired, weights=weights).corrcoef[0, 1]

    sin_a, cos_a = np.sin(angles), np.cos(angles)
    # formula taken from the pycircstat package
    rxs = weighted_r(data, sin_a)
    rxc = weighted_r(data, cos_a)
    rcs = weighted_r(sin_a, cos_a)

    # angular-linear correlation (equ. 27.47)
    numerator = rxc**2 + rxs**2 - 2 * rxc * rxs * rcs
    return np.sqrt(numerator / (1 - rcs**2))
def setup_class(self):
    """Create seeded random 2-d samples with integer weights as fixtures.

    The first sample has 20 rows (``ddof=0``), the second 30 rows
    (``ddof=1``); both have 3 columns.  The expanded samples from
    ``asrepeats`` are stored as ``x1r`` and ``x2r``.
    """
    np.random.seed(9876789)
    n1, n2 = 20, 30
    m1, m2 = 1, 1.2

    # Draw order is kept fixed (data first, then weights) so the seeded
    # sequence yields the same arrays.
    self.x1 = m1 + np.random.randn(n1, 3)
    self.x2 = m2 + np.random.randn(n2, 3)
    self.w1 = np.random.randint(1, 4, n1)
    self.w2 = np.random.randint(1, 4, n2)

    self.d1w = DescrStatsW(self.x1, weights=self.w1, ddof=0)
    self.d2w = DescrStatsW(self.x2, weights=self.w2, ddof=1)
    self.x1r = self.d1w.asrepeats()
    self.x2r = self.d2w.asrepeats()
def _get_weighted_stats(self, X, y, weights):
    """Return the weighted mean and standard deviation of the stored X and y.

    NOTE(review): the ``X``, ``y`` and ``weights`` arguments are not read;
    all values come from ``self._X``, ``self._y`` and ``self._w``.

    Returns
    -------
    tuple
        ``(mean_Xw, mean_yw, std_Xw, std_yw)``.
    """
    def mean_and_std(values):
        described = DescrStatsW(values, weights=self._w, ddof=1)
        # abs() is applied before the square root; the original author's
        # note says this is for the case where the weights sum to < 1
        spread = np.sqrt(np.abs(described.var_ddof(1)))
        return described.mean, spread

    mean_Xw, std_Xw = mean_and_std(self._X)
    mean_yw, std_yw = mean_and_std(self._y)
    return mean_Xw, mean_yw, std_Xw, std_yw
def setup_class(cls):
    """Create seeded random 2-d samples with integer weights as fixtures.

    Both samples have 20 rows and 3 columns and use ``ddof=1``.  The expanded
    samples from ``asrepeats`` are stored as ``x1r`` and ``x2r``.
    """
    np.random.seed(9876789)
    n1, n2 = 20, 20
    m1, m2 = 1, 1.2

    # Draw order is kept fixed (data first, then weights) so the seeded
    # sequence yields the same arrays.
    cls.x1 = m1 + np.random.randn(n1, 3)
    cls.x2 = m2 + np.random.randn(n2, 3)
    cls.w1 = np.random.randint(1, 4, n1)
    cls.w2 = np.random.randint(1, 4, n2)

    cls.d1w = DescrStatsW(cls.x1, weights=cls.w1, ddof=1)
    cls.d2w = DescrStatsW(cls.x2, weights=cls.w2, ddof=1)
    cls.x1r = cls.d1w.asrepeats()
    cls.x2r = cls.d2w.asrepeats()
def globaldepth(coverage_hist):
    """Summarise a coverage histogram into a small dictionary.

    Adds a ``cumsum`` column (``1 - frequency.cumsum()``) to
    ``coverage_hist`` in place, then reports the total base count, the
    ``BPs``-weighted mean and median of ``DP - 1``, and the 20x figures
    returned by ``depth_fraction(coverage_hist, thr=20)``.

    Returns
    -------
    dict
        Keys ``bases_totales``, ``mean_DP``, ``median_DP``, ``bases_20X``
        and ``dp>=20``.
    """
    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    # NOTE(review): the original author asked how this weighted mean differs
    # from coverage_hist.DP.mean()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)
    total_bases, bases_20x, depth_20X = depth_fraction(coverage_hist, thr=20)

    return {
        'bases_totales': int(total_bases),
        'mean_DP': round(weighted_stats.mean, signif),
        'median_DP': weighted_stats.quantile(0.5).values[0],
        'bases_20X': int(bases_20x),
        'dp>=20': round(depth_20X, 3),
    }
def fit_single(self, pos_left, pos_right, weights, tol=1e-4, maxiter=4000, verbose=False):
    """Fit a single component with Nelder-Mead, starting from the weighted
    mean and covariance of the interval centers.

    Returns
    -------
    tuple
        ``(mus, covs, ws, fun)``: the decomposed optimum and the objective
        value reported by the optimizer.  When the determinant of the first
        covariance is exactly zero a warning is printed and the covariance is
        replaced by ``self._cov_process_`` of it.
    """
    left = np.asarray(pos_left)
    right = np.asarray(pos_right)
    if verbose:
        debugs = list()
    else:
        debugs = None

    # starting point: weighted moments of the midpoints, component weight 1
    midpoints = (left + right) / 2.0
    described = DescrStatsW(midpoints, weights=np.array(weights))
    init_paras = self._paras_compose_([described.mean], [described.cov],
                                      [1.0])

    method = 'Nelder-Mead'
    solver_options = {'maxiter': maxiter, 'disp': verbose}
    res = opt.minimize(self._single_optpara,
                       init_paras,
                       args=(left, right, weights, debugs),
                       method=method,
                       tol=tol,
                       options=solver_options)
    if verbose:
        print("Method:{}; Initial parameter: {};".format(
            method, init_paras))
        print("Converged Parameter: {}".format(res.x))

    mus, covs, ws = self._paras_decompose_(res.x, 1)
    if det(covs[0]) == 0.0:
        print("Warning: covariance processed:")
        print("\t pre-optimal mus: {}, cov: {}".format(mus[0], covs[0]))
        covs[0] = self._cov_process_(covs[0])
    return mus, covs, ws, res.fun
def Newey_West(ret, q=2, tao=252):
    '''
    Newey-West adjusted covariance estimate.

    When the series are autocorrelated over time, the exponentially weighted
    covariance is adjusted with weighted lagged cross-products up to lag q.

    ret: DataFrame, rows are time periods, columns are factor returns
    q: the factor returns are assumed to follow an MA(q) process
    tao: half-life of the exponential weights used for the covariance

    Returns a DataFrame (factors x factors) with the adjusted covariance.
    Raises ValueError when T <= q or T <= K, where T is the number of rows
    and K the number of columns.
    '''
    T, K = ret.shape  # length of the time series, number of factors
    if T <= q or T <= K:
        raise ValueError("T <= q or T <= K")

    names = ret.columns
    # exponentially decaying weights, most recent row heaviest, summing to 1
    weights = 0.5**(np.arange(T - 1, -1, -1) / tao)
    weights = weights / weights.sum()

    values = np.asarray(ret.values, dtype=float)
    demeaned = values - np.average(values, axis=0, weights=weights)

    # Gamma_0 = sum_t w[t] * r_t' r_t
    V = (demeaned * weights[:, None]).T @ demeaned
    for i in range(1, q + 1):
        # Gamma_i = sum_t w[t + i] * r_t' r_{t+i}
        gamma_i = demeaned[:-i].T @ (demeaned[i:] * weights[i:, None])
        V = V + (1 - i / (1 + q)) * (gamma_i + gamma_i.T)

    return pd.DataFrame(V, columns=names, index=names)
def compute_summary_statistics(dbm: database_manager.DatabaseManager, tbl_name: str) -> Optional[Dict[str, Tuple]]:
    """
    Computes summary statistics of the monthly returns of the given table.

    :param dbm: A DatabaseManager instance.
    :param tbl_name: name of the table to compute monthly return for.
    :return: dictionary with the table/contract descriptors, annualised
        return (mean * 12) and volatility (std * sqrt(12)), the t-statistic
        and p-value of a one-sided (``'larger'``) t-test of the mean, kurtosis
        and skew; ``None`` when no returns or no info are available.
    """
    df, info, start_date = finance_metrics.compute_monthly_returns(dbm, tbl_name)
    if df is None or info is None:
        return None

    monthly = df['Monthly_Return']
    # run the one-sided t-test once and pick statistic and p-value from it
    t_result = DescrStatsW(monthly.values).ttest_mean(alternative='larger')

    return {
        'table_name': tbl_name,
        'contract_name': info[1],
        'type': info[3],
        'subtype': info[4],
        'start-date': start_date,
        'ar': monthly.mean() * 12,
        'vol': monthly.std() * np.sqrt(12),
        't-stat': t_result[0],
        'p-value': t_result[1],
        'kurt': monthly.kurt(),
        'skew': monthly.skew(),
    }
def calc_statistics(_totalVals, _distances, name, mask):
    """Return the distance-weighted mean and SD of the masked values.

    Parameters
    ----------
    _totalVals : 2-D array
        Data values; rows are selected with ``mask``.
    _distances : 1-D array
        One weight per row of ``_totalVals``; selected with ``mask`` too.
    name : str
        Label returned as the first tuple element.
    mask : index array or boolean array
        Row selection applied to both inputs.

    Returns
    -------
    tuple
        ``(name, mean, sd)`` computed over the non-NaN entries with
        ``ddof=0``.  When there is no data a message is printed and a
        14-tuple ``(name, nan, ..., nan)`` is returned instead.
        NOTE(review): the two return shapes differ in length; kept as is
        because callers may rely on either.

    Raises
    ------
    ValueError
        If the masked distances and values differ in length.
    """
    if len(_totalVals) == 0 or len(mask) == 0:
        print(name, "- No data available!\n")
        return (name,) + (np.nan,) * 13

    totalVals = _totalVals[mask, :]
    distances = _distances[mask]

    if len(distances) != len(totalVals):
        raise ValueError("distances must be the same length as data values")

    # repeat each row's distance across all columns so weights match the data
    weights = np.transpose(np.array([distances] * totalVals.shape[1]))

    valid = ~np.isnan(totalVals)
    totalStats = DescrStatsW(totalVals[valid], weights=weights[valid], ddof=0)

    # mean and SD of the region, weighted by segment distance
    return (name, totalStats.mean, totalStats.std)
def weighted_percentiles(data, weights, percentiles):
    """Compute weighted quantiles of ``data``.

    Args:
        data (np.ndarray) : Bin variable (e.g. temperature, salinity)
        weights (np.ndarray): Weights (e.g. cell volume, area)
        percentiles (np.ndarray): Requested quantile levels, each within
            [0, 1] (e.g. 0-1 by 0.01); checked with assertions

    Returns:
        The result of ``DescrStatsW.quantile`` with ``return_pandas=False``.
    """
    assert percentiles.max() <= 1.0
    assert percentiles.min() >= 0.0

    described = DescrStatsW(data=data, weights=weights)
    return described.quantile(probs=percentiles, return_pandas=False)
def spatial_correlation(self, field_A, field_B, method=None, selection=None):
    """Area-weighted correlation coefficient of two 2D fields.

    Fields of different shape are first passed through
    ``self.regrid_to_lower_resolution``.  Grid points where either field is
    NaN or the mask is 0 are excluded.  ``selection`` may be an int (used as
    ``mask_nr``) or a dict (applied with ``.sel`` to mask, area and fields).

    NOTE(review): ``method`` is accepted but not used.
    """
    if np.shape(field_A) == np.shape(field_B):
        A, B = field_A, field_B
    else:
        # shapes differ: have to regrid
        A, B = self.regrid_to_lower_resolution(field_A, field_B)
    assert np.shape(A) == np.shape(B)

    domain = self.determine_domain(A)
    AREA = xr_AREA(domain)
    MASK = boolean_mask(domain=domain, mask_nr=0)
    if type(selection) is int:
        MASK = boolean_mask(domain=domain, mask_nr=selection)
    elif type(selection) is dict:
        MASK, AREA = MASK.sel(selection), AREA.sel(selection)
        A, B = A.sel(selection), B.sel(selection)

    # True wherever a point must be dropped
    invalid = np.any(np.array(
        [np.isnan(A).values,
         np.isnan(B).values, (MASK == 0).values]),
                     axis=0)

    def flatten_valid(field):
        # blank out invalid points, flatten the grid and drop the blanks
        blanked = xr.where(invalid, np.nan, field)
        return blanked.stack(z=('latitude', 'longitude')).dropna(dim='z')

    flat_A = flatten_valid(A)
    flat_B = flatten_valid(B)
    flat_area = flatten_valid(AREA)

    paired = np.array([flat_A.values, flat_B.values]).T
    return DescrStatsW(paired, weights=flat_area).corrcoef[0, 1]
def branch_scan_MinVar_general(modifier, ds_dists, us_dists, all_weights):
    """
    Objective to minimize when placing the root on a candidate branch.

    Only valid for weighting schemes where the weights do not change as the
    root is moved.

    Input/s:
    modifier - the parameter being optimized: a float giving how far the
                root is shifted along the branch
    ds_dists - array of downstream root-to-tip distances
    us_dists - array of upstream root-to-tip distances
    all_weights - array of downstream and upstream terminal weights

    Output/s:
    weighted variance of the shifted root-to-tip distances
    """
    # shifting the root lengthens downstream and shortens upstream distances
    shifted = np.concatenate((ds_dists + modifier, us_dists - modifier))
    return DescrStatsW(shifted, all_weights).var
def plot_fill_between(data, data_dir, label, n_dis, color='magenta'):
    """Plot the per-iteration mean with its t confidence band and save it.

    ``data`` maps an iteration key to a collection of values; keys are
    shifted by one for the x axis.  The figure is written to
    ``<data_dir>/<label>:<n_dis>_ARTM_smart.eps`` and then shown.

    Returns
    -------
    list
        The mean of each entry of ``data``, in iteration order.
    """
    plt.figure(figsize=(10, 8))

    iters = [key + 1 for key in data]
    intervals = [DescrStatsW(value).tconfint_mean() for value in data.values()]
    left_edge = [low for low, _ in intervals]
    right_edge = [high for _, high in intervals]
    mean_res = [np.mean(value) for value in data.values()]

    plt.fill_between(iters, left_edge, right_edge, color='violet')
    plt.plot(iters, mean_res, color=color, lw=5)

    plt.xlabel('iteration', fontsize=18)
    plt.ylabel(label, fontsize=18)
    plt.xlim([1, len(iters)])
    plt.ylim([min(mean_res) - 0.05, max(mean_res) + 0.05])
    # drop the first automatic tick and add a tick at 1
    plt.xticks(list(plt.xticks()[0][1:]) + [1])
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.legend(['TestCV'], fontsize=18, loc=2)

    plt.savefig('{}/{}:{}_ARTM_smart.eps'.format(data_dir, label, n_dis))
    plt.show()
    return mean_res
def get_diameter(self):
    """Compute the weighted diameter statistics and print the result.

    Calls ``self.get_chi2`` on the diameter values with an error of 0.05 on
    each of three entries, then stores ``diameter_mean``,
    ``diameter_mean_err`` (std divided by sqrt of the number of entries) and
    ``diameter_std``.  The printed values are multiplied by 1000.
    """
    measurement_err = 0.05
    self.get_chi2('Diameter', self.diameter.values,
                  np.full(3, measurement_err))

    # inverse-variance weights, one per entry
    inverse_var = 1 / np.full(3, measurement_err)**2
    all_diameter = DescrStatsW(self.diameter, weights=inverse_var)
    spread = all_diameter.std

    self.diameter_mean = all_diameter.mean
    self.diameter_mean_err = spread / np.sqrt(len(self.diameter))
    self.diameter_std = spread
    print(f' The diameter of the {self.ball} is {1000*self.diameter_mean:.2f} +- {1000*self.diameter_mean_err:.2f}')
def get_lower_upper_CI(scores):
    """Return the absolute distance from the row means to the lower t
    confidence bound of the mean.

    ``scores`` must be 2-D.  With a single column there is no interval and 0
    is returned.  Only the lower-bound distance is returned; the upper one
    is computed but not used.
    """
    if scores.shape[1] <= 1:
        return 0

    # both bounds relative to the per-row mean; first row is the lower bound
    offsets = DescrStatsW(scores.T).tconfint_mean() - scores.mean(axis=1)
    lower_bound, _ = offsets
    return abs(lower_bound)
def globaldepth(coverage_hist):
    """Summarise a coverage histogram into a dictionary of depth statistics.

    Adds a ``cumsum`` column (``1 - frequency.cumsum()``) to
    ``coverage_hist`` in place.  Mean, median, std and the 25/75/95 %
    quantiles are computed on ``DP - 1`` weighted by ``BPs``; the ``dp>=N``
    entries are ``depth_fraction(coverage_hist, thr=N)`` rounded to
    ``signif`` digits.
    """
    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    global_depth = {
        'mean_DP': round(weighted_stats.mean, signif),
        'median_DP': weighted_stats.quantile(0.5).values[0],
        'std_DP': round(weighted_stats.std, signif),
    }
    for key, prob in (('q25_DP', 0.25), ('q75_DP', 0.75), ('q95_DP', 0.95)):
        global_depth[key] = weighted_stats.quantile(prob).values[0]
    for thr in (1, 10, 20, 30, 50, 100):
        global_depth[f'dp>={thr}'] = round(
            depth_fraction(coverage_hist, thr=thr), signif)
    return global_depth
def test_comparemeans_convenient_interface(self):
    """Smoke-test ``CompareMeans.summary`` and check that ``from_data``
    builds an equivalent object."""
    from statsmodels.iolib.table import SimpleTable

    x1_2d, x2_2d = self.x1_2d, self.x2_2d
    cm1 = CompareMeans(DescrStatsW(x1_2d), DescrStatsW(x2_2d))

    # summary must return a SimpleTable for every option combination
    settings = [(use_t, usevar)
                for use_t in [True, False]
                for usevar in ['pooled', 'unequal']]
    for use_t, usevar in settings:
        smry = cm1.summary(use_t=use_t, usevar=usevar)
        assert_(isinstance(smry, SimpleTable))

    # from_data must give the same default summary
    cm2 = CompareMeans.from_data(x1_2d, x2_2d)
    assert_(str(cm1.summary()) == str(cm2.summary()))
def localdepth(coverage_hist):
    """Summarise a coverage histogram into a Series of depth statistics.

    Adds a ``cumsum`` column (``1 - frequency.cumsum()``) to
    ``coverage_hist`` in place.  Mean and std are computed on ``DP - 1``
    weighted by ``BPs``; each ``dp>=N`` entry is
    ``depth_fraction(coverage_hist, thr=N)`` rounded to ``signif`` digits and
    multiplied by 100.
    """
    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    local_depth = {
        'mean_DP': round(weighted_stats.mean, signif),
        'std_DP': round(weighted_stats.std, signif),
    }
    for thr in (1, 5, 10, 20, 30):
        fraction = round(depth_fraction(coverage_hist, thr=thr), signif)
        local_depth[f'dp>={thr}'] = fraction * 100
    return pd.Series(local_depth)
def _similarity_helper_2(user1_id, user2_id, user1_vector, solr):
    """Replace the stored similarity score document for a pair of users.

    Deletes existing ``score`` documents matching both users, computes the
    weighted correlation between ``user1_vector`` and the vector returned by
    ``get_vector_tf_idf(user2_id)``, then adds and commits a new document
    holding the similarity and the mutual friends.  The similarity is also
    printed.
    """
    solr.delete(
        q='doc_type:score AND users:({} AND {})'.format(user1_id, user2_id))

    user2_vector, depth_vector = get_vector_tf_idf(user2_id)
    paired = column_stack((user1_vector, user2_vector))
    similarity = DescrStatsW(paired, weights=depth_vector).corrcoef[1][0]

    mutual_friends = get_mutual_friends(user1_id, user2_id, solr)
    solr.add([{
        'doc_type': 'score',
        'users': [user1_id, user2_id],
        'similarity': similarity,
        'mutual_friends': mutual_friends,
        'friends_count': len(mutual_friends)
    }])
    solr.commit()

    print('DescrStatsW ' + str(user2_id) + ' ' + str(similarity))