def test_weightstats_3(self):
    x1_2d, x2_2d = self.x1_2d, self.x2_2d
    w1, w2 = self.w1, self.w2

    d1w_2d = DescrStatsW(x1_2d, weights=w1)
    d2w_2d = DescrStatsW(x2_2d, weights=w2)
    x1r_2d = d1w_2d.asrepeats()
    x2r_2d = d2w_2d.asrepeats()
    assert_almost_equal(x2r_2d.mean(0), d2w_2d.mean, 14)
    assert_almost_equal(x2r_2d.var(0), d2w_2d.var, 14)
    assert_almost_equal(x2r_2d.std(0), d2w_2d.std, 14)
    assert_almost_equal(np.cov(x2r_2d.T, bias=1), d2w_2d.cov, 14)
    assert_almost_equal(np.corrcoef(x2r_2d.T), d2w_2d.corrcoef, 14)

    # print(d1w_2d.ttest_mean(3))
    # scipy.stats.ttest is also vectorized
    # print(stats.ttest_1samp(x1r_2d, 3))
    t, p, d = d1w_2d.ttest_mean(3)
    assert_almost_equal([t, p], stats.ttest_1samp(x1r_2d, 3), 11)
    # print([stats.ttest_1samp(xi, 3) for xi in x1r_2d.T])

    cm = CompareMeans(d1w_2d, d2w_2d)
    ressm = cm.ttest_ind()
    resss = stats.ttest_ind(x1r_2d, x2r_2d)
    assert_almost_equal(ressm[:2], resss, 14)
def test_ztest_ztost():
    # compare weightstats with the separately tested proportion ztest/ztost
    import statsmodels.stats.proportion as smprop

    x1 = [0, 1]
    w1 = [5, 15]

    res2 = smprop.proportions_ztest(15, 20., value=0.5)
    d1 = DescrStatsW(x1, w1)
    res1 = d1.ztest_mean(0.5)
    assert_allclose(res1, res2, rtol=0.03, atol=0.003)

    d2 = DescrStatsW(x1, np.array(w1) * 21. / 20)
    res1 = d2.ztest_mean(0.5)
    assert_almost_equal(res1, res2, decimal=12)

    res1 = d2.ztost_mean(0.4, 0.6)
    res2 = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(res1[0], res2[0], decimal=12)

    x2 = [0, 1]
    w2 = [10, 10]
    # d2 = DescrStatsW(x1, np.array(w1) * 21. / 20)
    d2 = DescrStatsW(x2, w2)
    res1 = ztest(d1.asrepeats(), d2.asrepeats())
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check whether this difference is expected, see test_proportion
    assert_allclose(res1[1], res2[1], rtol=0.03)

    res1a = CompareMeans(d1, d2).ztest_ind()
    assert_allclose(res1a[1], res2[1], rtol=0.03)
    assert_almost_equal(res1a, res1, decimal=12)
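# A minimal standalone sketch of the equivalence exercised above (assuming only
# numpy and statsmodels): a one-sample z-test on a frequency-weighted 0/1 variable
# matches proportions_ztest once the weights are rescaled by 21/20, as in the test.
import numpy as np
import statsmodels.stats.proportion as smprop
from statsmodels.stats.weightstats import DescrStatsW

d = DescrStatsW([0, 1], weights=np.array([5, 15]) * 21. / 20)  # 15 successes in 20 trials
print(d.ztest_mean(0.5))                            # (zstat, pvalue)
print(smprop.proportions_ztest(15, 20., value=0.5))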
def setup_class(self):
    np.random.seed(9876789)
    n1, n2 = 20, 30
    m1, m2 = 1, 1.2
    x1 = m1 + np.random.randn(n1, 3)
    x2 = m2 + np.random.randn(n2, 3)
    w1 = np.random.randint(1, 4, n1)
    w2 = np.random.randint(1, 4, n2)

    self.x1, self.x2 = x1, x2
    self.w1, self.w2 = w1, w2
    self.d1w = DescrStatsW(x1, weights=w1, ddof=0)
    self.d2w = DescrStatsW(x2, weights=w2, ddof=1)
    self.x1r = self.d1w.asrepeats()
    self.x2r = self.d2w.asrepeats()
def test_weightstats_3(self):
    x1_2d, x2_2d = self.x1_2d, self.x2_2d
    w1, w2 = self.w1, self.w2

    d1w_2d = DescrStatsW(x1_2d, weights=w1)
    d2w_2d = DescrStatsW(x2_2d, weights=w2)
    x1r_2d = d1w_2d.asrepeats()
    x2r_2d = d2w_2d.asrepeats()
    # print(d1w_2d.ttest_mean(3))
    # scipy.stats.ttest is also vectorized
    # print(stats.ttest_1samp(x1r_2d, 3))
    t, p, d = d1w_2d.ttest_mean(3)
    assert_almost_equal([t, p], stats.ttest_1samp(x1r_2d, 3), 11)
    # print([stats.ttest_1samp(xi, 3) for xi in x1r_2d.T])

    ressm = CompareMeans(d1w_2d, d2w_2d).ttest_ind()
    resss = stats.ttest_ind(x1r_2d, x2r_2d)
    assert_almost_equal(ressm[:2], resss, 14)
def test_weightstats_2(self):
    x1, x2 = self.x1, self.x2
    w1, w2 = self.w1, self.w2

    d1 = DescrStatsW(x1)
    d1w = DescrStatsW(x1, weights=w1)
    d2w = DescrStatsW(x2, weights=w2)
    x1r = d1w.asrepeats()
    x2r = d2w.asrepeats()
    # print('random weights')
    # print(ttest_ind(x1, x2, weights=(w1, w2)))
    # print(stats.ttest_ind(x1r, x2r))
    assert_almost_equal(ttest_ind(x1, x2, weights=(w1, w2))[:2],
                        stats.ttest_ind(x1r, x2r), 14)

    # not the same as new version with random weights/replication
    # assert x1r.shape[0] == d1w.sum_weights
    # assert x2r.shape[0] == d2w.sum_weights
    assert_almost_equal(x2r.var(), d2w.var, 14)
    assert_almost_equal(x2r.std(), d2w.std, 14)

    # one-sample tests
    # print(d1.ttest_mean(3))
    # print(stats.ttest_1samp(x1, 3))
    # print(d1w.ttest_mean(3))
    # print(stats.ttest_1samp(x1r, 3))
    assert_almost_equal(d1.ttest_mean(3)[:2], stats.ttest_1samp(x1, 3), 11)
    assert_almost_equal(d1w.ttest_mean(3)[:2], stats.ttest_1samp(x1r, 3), 11)
def test_weightstats_2(self):
    x1, x2 = self.x1, self.x2
    w1, w2 = self.w1, self.w2

    d1 = DescrStatsW(x1)
    d1w = DescrStatsW(x1, weights=w1)
    d2w = DescrStatsW(x2, weights=w2)
    x1r = d1w.asrepeats()
    x2r = d2w.asrepeats()
    # print('random weights')
    # print(ttest_ind(x1, x2, weights=(w1, w2)))
    # print(stats.ttest_ind(x1r, x2r))
    assert_almost_equal(ttest_ind(x1, x2, weights=(w1, w2))[:2],
                        stats.ttest_ind(x1r, x2r), 14)

    # not the same as new version with random weights/replication
    # assert x1r.shape[0] == d1w.sum_weights
    # assert x2r.shape[0] == d2w.sum_weights
    assert_almost_equal(x2r.mean(0), d2w.mean, 14)
    assert_almost_equal(x2r.var(), d2w.var, 14)
    assert_almost_equal(x2r.std(), d2w.std, 14)
    # note: the following is for 1d
    assert_almost_equal(np.cov(x2r, bias=1), d2w.cov, 14)
    # assert_almost_equal(np.corrcoef(np.x2r), d2w.corrcoef, 19)
    # TODO: exception in corrcoef (scalar case)

    # one-sample tests
    # print(d1.ttest_mean(3))
    # print(stats.ttest_1samp(x1, 3))
    # print(d1w.ttest_mean(3))
    # print(stats.ttest_1samp(x1r, 3))
    assert_almost_equal(d1.ttest_mean(3)[:2], stats.ttest_1samp(x1, 3), 11)
    assert_almost_equal(d1w.ttest_mean(3)[:2], stats.ttest_1samp(x1r, 3), 11)
class TestWeightstats2d_ddof(CheckWeightstats2dMixin):

    @classmethod
    def setup_class(cls):
        np.random.seed(9876789)
        n1, n2 = 20, 20
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1, 4, n1)
        w2 = np.random.randint(1, 4, n2)

        cls.x1, cls.x2 = x1, x2
        cls.w1, cls.w2 = w1, w2
        cls.d1w = DescrStatsW(x1, weights=w1, ddof=1)
        cls.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        cls.x1r = cls.d1w.asrepeats()
        cls.x2r = cls.d2w.asrepeats()
def weighted_std_from_stats(matrix, axis=0, halflife=90):
    Tn = matrix.shape[axis]  # number of time periods
    w = create_weight_by_halflife(n=Tn, halflife=halflife)
    weighted_stats = DescrStatsW(matrix, weights=w, ddof=0)
    return weighted_stats.std
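# `create_weight_by_halflife` is defined elsewhere in this codebase; the sketch below
# is an assumption about its behaviour, not the original implementation: exponentially
# decaying weights in which an observation `halflife` periods older than the newest
# one receives half its weight.
import numpy as np

def create_weight_by_halflife(n, halflife=90):
    ages = np.arange(n - 1, -1, -1)   # age in periods; the newest observation has age 0
    return 0.5 ** (ages / halflife)   # weight halves every `halflife` periods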
def compute_scores(
    coder_df,
    coder1,
    coder2,
    outcome_column,
    document_column,
    coder_column,
    weight_column=None,
    pos_label=None,
):
    """
    Computes a variety of inter-rater reliability scores, including Cohen's kappa,
    Krippendorf's alpha, precision, and recall. The input data must consist of a
    :py:class:`pandas.DataFrame` with the following columns:

    - A column with values that indicate the coder (like a name)
    - A column with values that indicate the document (like an ID)
    - A column with values that indicate the code value
    - (Optional) A column with document weights

    This function will return a dictionary of agreement scores between the two
    specified coders.

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param coder1: The value in ``coder_column`` for rows corresponding to the first coder
    :type coder1: str or int
    :param coder2: The value in ``coder_column`` for rows corresponding to the second coder
    :type coder2: str or int
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :param weight_column: The column that contains sampling weights
    :type weight_column: str
    :param pos_label: The value indicating a positive label (optional)
    :type pos_label: str or int
    :return: A dictionary of scores
    :rtype: dict

    .. note:: If using a multi-class (non-binary) code, some scores may come back null or not compute as expected. \
        We recommend running the function separately for each specific code value as a binary flag by providing \
        each unique value to the ``pos_label`` argument. If ``pos_label`` is not provided for multi-class codes, \
        this function will attempt to compute scores based on support-weighted averages.
    Usage::

        from pewanalytics.stats.irr import compute_scores

        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder")
        {'coder1': 'coder1', 'coder2': 'coder2', 'n': 3, 'outcome_column': 'code', 'pos_label': None, 'coder1_mean_unweighted': 1.0, 'coder1_std_unweighted': 0.5773502691896257, 'coder2_mean_unweighted': 1.3333333333333333, 'coder2_std_unweighted': 0.6666666666666666, 'alpha_unweighted': 0.5454545454545454, 'accuracy': 0.6666666666666666, 'f1': 0.5555555555555555, 'precision': 0.5, 'recall': 0.6666666666666666, 'precision_recall_min': 0.5, 'matthews_corrcoef': 0.6123724356957946, 'roc_auc': None, 'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="0")
        {'coder1': 'coder1', 'coder2': 'coder2', 'n': 3, 'outcome_column': 'code', 'pos_label': '0', 'coder1_mean_unweighted': 0.3333333333333333, 'coder1_std_unweighted': 0.3333333333333333, 'coder2_mean_unweighted': 0.3333333333333333, 'coder2_std_unweighted': 0.3333333333333333, 'alpha_unweighted': 1.0, 'cohens_kappa': 1.0, 'accuracy': 1.0, 'f1': 1.0, 'precision': 1.0, 'recall': 1.0, 'precision_recall_min': 1.0, 'matthews_corrcoef': 1.0, 'roc_auc': 1.0, 'pct_agree_unweighted': 1.0}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="1")
        {'coder1': 'coder1', 'coder2': 'coder2', 'n': 3, 'outcome_column': 'code', 'pos_label': '1', 'coder1_mean_unweighted': 0.3333333333333333, 'coder1_std_unweighted': 0.3333333333333333, 'coder2_mean_unweighted': 0.0, 'coder2_std_unweighted': 0.0, 'alpha_unweighted': 0.0, 'cohens_kappa': 0.0, 'accuracy': 0.6666666666666666, 'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'precision_recall_min': 0.0, 'matthews_corrcoef': 1.0, 'roc_auc': None, 'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="2")
        {'coder1': 'coder1', 'coder2': 'coder2', 'n': 3, 'outcome_column': 'code', 'pos_label': '2', 'coder1_mean_unweighted': 0.3333333333333333, 'coder1_std_unweighted': 0.3333333333333333, 'coder2_mean_unweighted': 0.6666666666666666, 'coder2_std_unweighted': 0.3333333333333333, 'alpha_unweighted': 0.4444444444444444, 'cohens_kappa': 0.3999999999999999, 'accuracy': 0.6666666666666666, 'f1': 0.6666666666666666, 'precision': 0.5, 'recall': 1.0, 'precision_recall_min': 0.5, 'matthews_corrcoef': 0.5, 'roc_auc': 0.75, 'pct_agree_unweighted': 0.6666666666666666}

    """
    old_np_settings = np.seterr(all="raise")

    coder_df = copy.deepcopy(coder_df)
    if pos_label:
        coder_df[outcome_column] = (coder_df[outcome_column] == pos_label).astype(int)

    coder1_df = coder_df[coder_df[coder_column] == coder1]
    coder1_df.index = coder1_df[document_column]
    coder2_df = coder_df[coder_df[coder_column] == coder2]
    coder2_df.index = coder2_df[document_column]
    coder1_df = coder1_df[coder1_df.index.isin(coder2_df.index)]
    coder2_df = coder2_df[coder2_df.index.isin(coder1_df.index)].loc[coder1_df.index]

    row = {
        "coder1": coder1,
        "coder2": coder2,
        "n": len(coder1_df),
        "outcome_column": outcome_column,
        "pos_label": pos_label,
    }

    for labelsetname, labelset in [
        ("coder1", coder1_df[outcome_column]),
        ("coder2", coder2_df[outcome_column]),
    ]:
        if weight_column:
            try:
                weighted_stats = DescrStatsW(labelset, weights=coder1_df[weight_column])
                if weighted_stats:
                    row["{}_mean".format(labelsetname)] = weighted_stats.mean
                    row["{}_std".format(labelsetname)] = weighted_stats.std_mean
            except (TypeError, ValueError):
                try:
                    weighted_stats = DescrStatsW(
                        labelset.astype(int), weights=coder1_df[weight_column]
                    )
                    if weighted_stats:
                        row["{}_mean".format(labelsetname)] = weighted_stats.mean
                        row["{}_std".format(labelsetname)] = weighted_stats.std_mean
                except (TypeError, ValueError):
                    pass
        try:
            unweighted_stats = DescrStatsW(labelset, weights=[1.0 for x in labelset])
            if unweighted_stats:
                row["{}_mean_unweighted".format(labelsetname)] = unweighted_stats.mean
                row["{}_std_unweighted".format(labelsetname)] = unweighted_stats.std_mean
        except (TypeError, ValueError):
            try:
                unweighted_stats = DescrStatsW(
                    labelset.astype(int), weights=[1.0 for x in labelset]
                )
                if unweighted_stats:
                    row["{}_mean_unweighted".format(labelsetname)] = unweighted_stats.mean
                    row["{}_std_unweighted".format(labelsetname)] = unweighted_stats.std_mean
            except (TypeError, ValueError):
                pass

    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values
    )
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None
    row["alpha_unweighted"] = alpha

    labels = np.unique(coder_df[outcome_column])
    if len(labels) <= 2:
        try:
            row["cohens_kappa"] = cohen_kappa_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column] if weight_column else None,
                labels=labels,
            )
        except FloatingPointError:
            row["cohens_kappa"] = 1.0

    try:
        row["accuracy"] = accuracy_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["accuracy"] = None

    try:
        row["f1"] = f1_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["f1"] = None

    try:
        row["precision"] = precision_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["precision"] = None

    try:
        row["recall"] = recall_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["recall"] = None

    if is_not_null(row["precision"]) and is_not_null(row["recall"]):
        row["precision_recall_min"] = min([row["precision"], row["recall"]])
    else:
        row["precision_recall_min"] = None

    try:
        row["matthews_corrcoef"] = matthews_corrcoef(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["matthews_corrcoef"] = None
    except FloatingPointError:
        row["matthews_corrcoef"] = 1.0

    try:
        row["roc_auc"] = (
            roc_auc_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column] if weight_column else None,
                labels=labels,
                average="weighted" if not pos_label else None,
            )
            if len(np.unique(coder1_df[outcome_column])) > 1
            and len(np.unique(coder2_df[outcome_column])) > 1
            else None
        )
    except TypeError:
        try:
            row["roc_auc"] = (
                roc_auc_score(
                    coder1_df[outcome_column],
                    coder2_df[outcome_column],
                    sample_weight=coder1_df[weight_column] if weight_column else None,
                    average="weighted" if not pos_label else None,
                )
                if len(np.unique(coder1_df[outcome_column])) > 1
                and len(np.unique(coder2_df[outcome_column])) > 1
                else None
            )
        except (ValueError, TypeError):
            row["roc_auc"] = None
    except (ValueError, TypeError):
        row["roc_auc"] = None

    row["pct_agree_unweighted"] = np.average(
        [1 if c[0] == c[1] else 0
         for c in zip(coder1_df[outcome_column], coder2_df[outcome_column])]
    )

    for k, v in row.items():
        if type(v) == tuple:
            row[k] = v[0]  # For some weird reason, some of the sklearn scorers return 1-tuples sometimes

    np.seterr(**old_np_settings)
    return row
def varlist_initial(indata, target, weight, varlist):
    # 1. Getting started
    import statsmodels
    dir(statsmodels.base)
    print(statsmodels.base._model_params_doc)
    statsmodels.base._missing_param_doc

    """
    Problem 2: weighted logistic regression.
    1) Logit itself does not seem to support weights.
    2) The GLM call below works, but raises a warning.
    3) Other options? 1 - use scipy; 2 - see what the statsmodels authors recommend.
    """
    import statsmodels.api as sm
    import numpy as np
    import pandas as pd

    spector_data = sm.datasets.spector.load_pandas()
    spector_data_df = spector_data.data
    spector_data_df['wt'] = spector_data_df.apply(
        lambda x: np.random.randint(1, 10), axis=1)
    spector_data_df.to_csv("D:\\Analysis\\SEMMA_project\\spector_data.csv")
    spector_data_df = sm.add_constant(spector_data_df)
    spector_data.exog = sm.add_constant(spector_data.exog)
    trainingdata_x = spector_data.exog
    trainingdata_y = spector_data.endog

    # In the call below, the weights have no effect
    res = sm.Logit(spector_data_df['GRADE'],
                   spector_data_df[['const', 'PSI']],
                   freq_weights=spector_data_df['wt']).fit()
    print(res.summary())
    print(res.summary2())

    # This form matches the SAS results, but raises a warning:
    # __main__:3: DeprecationWarning: Calling Family(..) with a link class as
    # argument is deprecated. Use an instance of a link class instead.
    logmodel = sm.GLM(spector_data_df['GRADE'],
                      spector_data_df[['const', 'PSI']],
                      family=sm.families.Binomial(sm.families.links.logit),
                      freq_weights=spector_data_df['wt']).fit()
    print(logmodel.summary())
    print(logmodel.summary2())

    trainingdata_y = pd.DataFrame()
    trainingdata_y['Successes'] = spector_data.endog.apply(
        lambda x: x * np.random.randint(1, 10) if x == 1 else 0)
    trainingdata_y['Failures'] = spector_data.endog.apply(
        lambda x: np.random.randint(1, 10) if x == 0 else 0)

    df['true_cum'] = df['a'].map(lambda x: if_true(x)).cumsum()  # stray scratch line; df / if_true defined elsewhere

    logmodel = sm.GLM(trainingdata_y[['Successes', 'Failures']],
                      trainingdata_x,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()
    print(logmodel.summary())
    print(logmodel.summary2())

    trainingdata_x['wt'] = trainingdata_x.apply(
        lambda x: np.random.randint(1, 10), axis=1)
    logmodel = sm.GLM(trainingdata_y,
                      trainingdata_x[['const', 'GPA', 'TUCE', 'PSI']],
                      family=sm.families.Binomial(sm.families.links.logit),
                      freq_weights=trainingdata_x['wt']).fit()
    print(logmodel.summary())
    print(logmodel.summary2())
    trainingdata_x.wt.sum()

    """
    Problem 3: weighted correlation computation and variable screening:
    1) Compare against SAS to check for a sample-size correction (/(n-1)).
    2) How to inspect the output.
    """
    from statsmodels.stats.weightstats import DescrStatsW

    def corr_check(indata_x, corr_threshold=0.75, weights=None):
        mask = list(indata_x.columns)
        if 'const' in mask:
            mask.remove('const')
        if weights in mask:
            mask.remove(weights)
        d1_wt = DescrStatsW(indata_x[mask], weights=indata_x[weights])
        d1_wt_corr = d1_wt.corrcoef  # correlation coefficients
        corr_check = 'pass'
        for i in range(d1_wt_corr.shape[0]):
            for j in range(d1_wt_corr.shape[0] - i):
                if i != j and d1_wt_corr[i, j] > corr_threshold:
                    var_i = mask[i]
                    var_j = mask[j]
                    print('correlation of %s and %s is higher than %.2f !'
                          % (var_i, var_j, corr_threshold))
                    corr_check = 'fail'
        print(corr_check)

    # example 1
    np.random.seed(0)
    x1_2d = 1.0 + np.random.randn(20, 3)
    w1 = np.random.randint(1, 4, 20)
    d1 = DescrStatsW(x1_2d, weights=w1)
    d1.mean
    d1.var
    d1.std_mean

    # example 2
    mask = list(data1.columns)
    mask.remove('wt')
    d1_wt = DescrStatsW(data1[mask], weights=data1['wt'])
    d1_wt_corr = d1_wt.corrcoef  # correlation coefficients
    d1_wt_corr[d1_wt_corr > 0.2]

    corr_threshold = 0.2
    corr_threshold = 0.3
    corr_threshold = 0.5
    corr_check = 0
    for i in range(d1_wt_corr.shape[0]):
        for j in range(d1_wt_corr.shape[0] - i):
            if i != j and d1_wt_corr[i, j] > corr_threshold:
                var_i = mask[i]
                var_j = mask[j]
                print('correlation of %s and %s is higher than %.2f !'
                      % (var_i, var_j, corr_threshold))
                corr_check = 1
    print(corr_check)

    var1 = mask[0]
    var2 = mask[1]
    print('correlation of %s and %s is higher than %.2f !'
          % (var1, var2, corr_threshold))

    # %matplotlib inline
    import matplotlib.pyplot as plt
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    np.random.seed(9876789)
    nsample = 100
    x = np.linspace(0, 10, 100)
    X1 = np.column_stack((x)).T
    X2 = np.column_stack((x, x**2))
    beta = np.array([1, 0.1, 10])
    e = np.random.normal(size=nsample)
    X1 = sm.add_constant(X1)
    X2 = sm.add_constant(X2)
    y = np.dot(X2, beta) + e

    model1 = sm.OLS(y, X1)
    model2 = sm.OLS(y, X2)
    results1 = model1.fit()
    results2 = model2.fit()
    results1.compare_lm_test(results2)
    print(results2.summary())
    dir(results2)
    dir(model2)
    sm.regression.linear_model.RegressionResults.compare_lm_test(model2)
    sm.regression.linear_model.RegressionResults.compare_f_test(results2)
    results2.compare_lm_test(results2)

    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)

    print(sm.datasets.__doc__)
    dir(sm.datasets)
    data = sm.datasets.anes96.load_pandas()
    df = sm.datasets.anes96.load_pandas().data
    from patsy import dmatrices
    y, X = dmatrices('vote ~ logpopul + TVnews + selfLR + ClinLR + age + educ + income',
                     data=df, return_type='dataframe')
    mod = sm.Logit(y, X)
    res = mod.fit()
    print(res.summary())
    print(res.summary2())
    print(res.wald_test.__doc__)
    dir(res)
    dir(mod)
    print(mod.score(res.params))
    print(mod.score_obs(res.params))
    df.to_csv("D:\\Analysis\\SEMMA_project\\anes96.csv")

    y, X = dmatrices('vote ~ selfLR', data=df, return_type='dataframe')
    y, X = dmatrices('vote ~ selfLR + ClinLR', data=df, return_type='dataframe')
    mod = sm.Logit(y, X)
    res = mod.fit()
    print(res.summary())
    print(res.summary2())

    r = np.zeros_like(res.params)
    r[1:] = [1]
    A = np.identity(len(res.params))
    A = A[1:, :]
    print(res.wald_test(A))
    print(res.t_test(A))
    # this Wald test works
    print(res.wald_test_terms(skip_single=False))
    print(mod.score(res.params))
    dir(res.wald_test_terms(skip_single=False))
    res.wald_test_terms(skip_single=False).col_names
    res.wald_test_terms(skip_single=False).statistic
    res.wald_test_terms(skip_single=False).summary_frame
    res.wald_test_terms(skip_single=False).table.loc['selfLR', 'statistic']
    res.wald_test_terms(skip_single=False).dist_args
    res.wald_test_terms(skip_single=False).table.loc['ClinLR', 'statistic']

    # LM statistic:
    # 1) With regressors x1..xn, test q of them; H0: the q coefficients are all 0.
    # 2) Regress y on x1..xq to get the restricted equation and the residuals u.
    # 3) Regress u on x1..xn to get R^2, and record the sample size n.
    # 4) LM = n * R^2 ~ chi-square(q); compare against the critical value c of
    #    that distribution, and reject H0 if LM exceeds it.

    # information matrix = negative Hessian; Logit does not implement .information
    U = mod.score(res.params)
    info_matrix = -1 * mod.hessian(res.params)
    I = np.linalg.inv(info_matrix)
    Score1 = np.dot(U.T, I)
    Score = np.dot(Score1, U)
    U_VAR1 = mod.score(res.params)[2:]
    # stats.diagnostic.linear_lm(resid=..., exog=[selfLR + ClinLR])  # incomplete scratch call
    dir(mod.information.__doc__)
    print(res.llr_pvalue)
    print(res.llnull)
    print(res.llf)
    print(r)
    dir(mod.score(res.params))
    dir(res.wald_test(A))
    print(res.wald_test(A).conf_int)
    print(res.t_test(r))
    print(res.wald_test(r))
    print(res.llr)
    res.compare_lm_test(res)
    dir(sm.regression.linear_model.RegressionResults.compare_lm_test())
    dir(sm.test)
    dir(sm.stats)
    dir(res.wald_test)
    res.wald_test()
    sm.webdoc()
    sm.webdoc('glm')

    # Documentation-page residue, kept as a reference:
    # statsmodels.regression.linear_model.RegressionResults.compare_lm_test(restricted, demean=True, use_lr=False)

    import numpy
    import statsmodels.api as sm

    # Random data with two (identical) groups of ten members each,
    # and 1000 repetitions of this setup
    data = numpy.random.random((20, 1000))
    model = sm.add_constant(numpy.array([0] * 10 + [1] * 10))
    restricted_model = numpy.ones((20, 1))
    fit = sm.OLS(data, model).fit()
    print(fit.summary())
    restricted_fit = sm.OLS(data, restricted_model).fit()

    # The following raises a ValueError exception,
    # but should instead have the same results as the method shown below
    fs, ps, dfs = fit.compare_f_test(restricted_fit)

    # The current way you have to run this, one column at a time:
    fs, ps, dfs = numpy.empty(1000), numpy.empty(1000), numpy.empty(1000)
    for i in range(1000):
        fit = sm.OLS(data[:, i], model).fit()
        restricted_fit = sm.OLS(data[:, i], restricted_model).fit()
        fs[i], ps[i], dfs[i] = fit.compare_f_test(restricted_fit)

    # statsmodels.stats.diagnostic.linear_lm(resid, exog, func=None)
    exog = np.array(X.columns)
    exog = data.exog.T
    dir(res.resid_dev)
    print(res.resid_dev.__doc__)
    lm, lm_pval, ftest = sm.stats.diagnostic.linear_lm(res.resid_dev, exog, func=None)

    # Default transform inside linear_lm when func is None:
    # if func is None:
    #     def func(x):
    #         return np.power(x, 2)
def test_weightstats_2d_w2():
    x1 = [[1]]
    w1 = [[1]]
    d1 = DescrStatsW(x1, w1)

    assert (d1.quantile([0, 0.5, 1.0]) == 1).all().all()
def mean_var(throu, delay):
    thr_, delay_ = np.array(throu), np.array(delay)
    weighted_stats = DescrStatsW(thr_, weights=delay_)
    return weighted_stats.mean, weighted_stats.std
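# Illustrative call with made-up numbers (assumes the numpy/statsmodels imports used
# by the snippet above): throughput samples weighted by the delay over which each
# one was observed.
print(mean_var([10.0, 12.5, 9.8], delay=[1.0, 2.0, 0.5]))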
def _normalize(self, style_factor: np.ndarray):
    weighted_stats = DescrStatsW(style_factor, weights=self._mkt_cap.flatten())
    weighted_mu = weighted_stats.mean
    factor_std = np.std(style_factor, axis=0, ddof=1)
    return (style_factor - weighted_mu) / factor_std
def weighted(x):
    stats = DescrStatsW(x["quantity"], x["sold"])
    return {"median": stats.quantile(0.5)[0.5], "std": stats.std}
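# Typical use is per group via pandas (illustrative data; assumes the imports used
# by the snippets above): weighted median and std of `quantity` with `sold` as
# frequency weights.
import pandas as pd

df = pd.DataFrame({"item": ["a", "a", "b", "b"],
                   "quantity": [1.0, 3.0, 2.0, 4.0],
                   "sold": [10, 1, 5, 5]})
print(df.groupby("item").apply(weighted))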
def print_statistics(data, distances, name, mask, gasTransferParameterisation=False):
    if len(data.xcoords) == 0 or len(mask) == 0:
        print(name, "- No data available!\n")
        return (name, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
                np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)

    eks = data.ekmanVals[mask, :]
    geo = data.geostrophicVals[mask, :]
    stokes = data.stokesVals[mask, :]

    # distances = np.array(distances)
    if len(distances) != len(eks):
        raise ValueError("distances must be the same length as data values")
    weights = np.transpose(np.array([distances] * eks.shape[1]))

    if gasTransferParameterisation:
        ks = data.kVals[mask, :]
        notnan = np.where(~np.isnan(ks))
        kStats = DescrStatsW(ks[notnan], weights=weights[notnan], ddof=0)
        kMean = kStats.mean
        kSD = kStats.std
        print(name, "Gas transfer velocity stats")
        print("\tk: ", kMean, "+/-", kSD)

    totals = np.abs(eks) + np.abs(geo) + np.abs(stokes)
    eksProps = np.abs(eks) / totals
    geoProps = np.abs(geo) / totals
    stokesProps = np.abs(stokes) / totals

    notnan = np.where(~np.isnan(eksProps))
    eksStats = DescrStatsW(eksProps[notnan], weights=weights[notnan], ddof=0)
    eksMean = eksStats.mean
    eksSD = eksStats.std

    notnan = np.where(~np.isnan(geoProps))
    geoStats = DescrStatsW(geoProps[notnan], weights=weights[notnan], ddof=0)
    geoMean = geoStats.mean
    geoSD = geoStats.std

    notnan = np.where(~np.isnan(stokesProps))
    stokesStats = DescrStatsW(stokesProps[notnan], weights=weights[notnan], ddof=0)
    stokesMean = stokesStats.mean
    stokesSD = stokesStats.std

    print(name, "mean proportions")
    print("\tEkman: ", eksMean, "+/-", eksSD)
    print("\tGeostrophic: ", geoMean, "+/-", geoSD)
    print("\tStokes: ", stokesMean, "+/-", stokesSD)
    print("\t", "total:", eksMean + geoMean + stokesMean)

    totalOntoShelf = eks + geo + stokes
    notnan = np.where(~np.isnan(totalOntoShelf))
    totalOntoShelfStats = DescrStatsW(totalOntoShelf[notnan], weights=weights[notnan], ddof=0)
    totalMean = totalOntoShelfStats.mean
    totalSD = totalOntoShelfStats.std

    eksPercent = eks / totalOntoShelf * 100.0
    geoPercent = geo / totalOntoShelf * 100.0
    stokesPercent = stokes / totalOntoShelf * 100.0

    notnan = np.where(~np.isnan(eksPercent))
    eksPercentStats = DescrStatsW(eksPercent[notnan], weights=weights[notnan], ddof=0)
    eksPercentMean = eksPercentStats.mean
    eksPercentSD = eksPercentStats.std

    notnan = np.where(~np.isnan(geoPercent))
    geoPercentStats = DescrStatsW(geoPercent[notnan], weights=weights[notnan], ddof=0)
    geoPercentMean = geoPercentStats.mean
    geoPercentSD = geoPercentStats.std

    notnan = np.where(~np.isnan(stokesPercent))
    stokesPercentStats = DescrStatsW(stokesPercent[notnan], weights=weights[notnan], ddof=0)
    stokesPercentMean = stokesPercentStats.mean
    stokesPercentSD = stokesPercentStats.std

    print(name, "percentage total onto-shelf current")
    print("\tEkman: ", eksPercentMean, "+/-", eksPercentSD)
    print("\tGeostrophic: ", geoPercentMean, "+/-", geoPercentSD)
    print("\tStokes: ", stokesPercentMean, "+/-", stokesPercentSD)
    print("\t", "total onto-shelf current (m/s):", totalMean, "+/-", totalSD)
    print("")

    if gasTransferParameterisation:
        return (name, totalMean, totalSD, eksMean, eksSD, geoMean, geoSD,
                stokesMean, stokesSD, eksPercentMean, eksPercentSD,
                geoPercentMean, geoPercentSD, stokesPercentMean,
                stokesPercentSD, kMean, kSD)
    else:
        return (name, totalMean, totalSD, eksMean, eksSD, geoMean, geoSD,
                stokesMean, stokesSD, eksPercentMean, eksPercentSD,
                geoPercentMean, geoPercentSD, stokesPercentMean,
                stokesPercentSD, np.nan, np.nan)
def conf_int(a):
    print('Confidence Interval')
    print(DescrStatsW(a).tconfint_mean())
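# Example: two-sided 95% t-based confidence interval for the mean of a small
# sample (assumes the DescrStatsW import used by the snippet above).
conf_int([1.2, 0.9, 1.4, 1.1, 0.8])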
def derive(data, params):
    """
    Derives connectivity from the data. A lot of data is inherently built with
    edges (e.g. communication between two individuals). However, other networks
    are derived from the covariance of time series (e.g. brain networks between
    two regions).

    Covariance-based metrics deriving time-resolved networks can be computed in
    multiple ways. There are also other methods apart from covariance-based ones.

    Derive a weight vector for each time point, and then the correlation
    coefficient for each time point.

    Parameters
    ----------
    data : array
        Time series data to perform connectivity derivation on. (Default
        dimensions are: time as rows, nodes as columns.) Change
        params['dimord'] if you want it the other way (see below).
    params : dict
        Parameters for each method (see below).

    Necessary parameters
    ====================
    method : str
        method: "distance", "slidingwindow", "taperedslidingwindow",
        "jackknife", "multiplytemporalderivative". Alternatively, method can
        be a weight matrix of size time x time.

    **Different methods have method-specific parameters (see below)**

    Params for all methods (optional)
    =================================
    postpro : "no" (default).
        Other alternatives are: "fisher", "boxcox", "standardize" and any
        combination separated by a + (e.g. "fisher+boxcox"). See
        postpro_pipeline for more information.
    dimord : str
        Dimension order: 'node,time' (default) or 'time,node'. People like to
        represent their data differently and this is an easy way to be sure
        that you are inputting the data in the correct way.
    analysis_id : str or int
        Add to identify a specific analysis. The generated report will be
        placed in './report/' + analysis_id + '/derivation_report.html'.
    report : bool
        False by default. If True, a report is saved in
        ./report/[analysis_id]/derivation_report.html.
    report_path : str
        String where the report is saved. Default is
        ./report/[analysis_id]/derivation_report.html.

    Method-specific parameters
    ==========================

    method == "distance"
    ~~~~~~~~~~~~~~~~~~~~
    The distance metric calculates 1/(distance metric) weights, scaled between
    0 and 1. W[t,t] is excluded from the scaling and then set to 1.
    params['distance'] : str
        Distance metric (e.g. 'euclidean'). See
        teneto.utils.getDistanceFunction for more info.

    When method == "slidingwindow"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    params['windowsize'] : int
        Size of window.

    When method == "taperedslidingwindow"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    params['windowsize'] : int
        Size of window.
    params['distribution'] : str
        Scipy distribution (e.g. 'norm', 'expon'). Any distribution here:
        https://docs.scipy.org/doc/scipy/reference/stats.html
    params['distribution_params'] : list
        Each parameter, excluding the data "x" (in their scipy function
        order), used to generate the pdf.

        NOTE: the data x should be considered to be centered at 0 and have a
        length of window size (i.e. a window size of 5 entails x is
        [-2, -1, 0, 1, 2]; a window size of 6 entails
        [-2.5, -1.5, -0.5, 0.5, 1.5, 2.5]).

        Given x, params['distribution_params'] contains the remaining
        parameters. E.g. a normal distribution requires pdf(x, loc, scale)
        where loc=mean and scale=std. This means that the mean and std have
        to be provided in distribution_params.

        Say we have a gaussian distribution, a window size of 21 and
        params['distribution_params'] is [0, 5]. This will lead to a gaussian
        with its peak in the middle of each window and a standard deviation
        of 5.
        Instead, if we set params['distribution_params'] to [10, 5], this
        will lead to a half gaussian with its peak at the final time point
        and a standard deviation of 5.

    When method == "temporalderivative"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    params['windowsize'] : int
        Size of window.

    When method == "jackknife"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~
    No parameters are necessary.

    Optional parameters:
    params['weight-var'] : array, (optional)
        NxN array to weight the JC estimates (standardized-JC*W). If weightby
        is selected, do not standardize in postpro.
    params['weight-mean'] : array, (optional)
        NxN array to weight the JC estimates (standardized-JC+W). If weightby
        is selected, do not standardize in postpro.

    Returns
    -------
    G : array
        Connectivity estimates (nodes x nodes x time)

    READ MORE
    ---------
    About the general weighted pearson approach used for most methods, see:
    Thompson & Fransson (2019) A common framework for the problem of deriving
    estimates of dynamic functional brain connectivity. Neuroimage.
    (https://doi.org/10.1016/j.neuroimage.2017.12.057)

    SEE ALSO
    --------
    *postpro_pipeline*, *gen_report*
    """
    report = {}

    if 'dimord' not in params.keys():
        params['dimord'] = 'node,time'
    if 'report' not in params.keys():
        params['report'] = False
    if 'analysis_id' not in params.keys():
        params['analysis_id'] = ''
    if 'postpro' not in params.keys():
        params['postpro'] = 'no'
    if params['report'] == 'yes' or params['report'] == True:
        if 'analysis_id' not in params.keys():
            params['analysis_id'] = ''
        if 'report_path' not in params.keys():
            params['report_path'] = './report/' + params['analysis_id']
        if 'report_filename' not in params.keys():
            params['report_filename'] = 'derivation_report.html'
    if params['dimord'] == 'node,time':
        data = data.transpose()

    if isinstance(params['method'], str):
        if params['method'] == 'jackknife':
            weights, report = weightfun_jackknife(data.shape[0], report)
            relation = 'weight'
        elif params['method'] in ('sliding window', 'slidingwindow'):
            weights, report = weightfun_sliding_window(data.shape[0], params, report)
            relation = 'weight'
        elif params['method'] in ('tapered sliding window', 'taperedslidingwindow'):
            weights, report = weightfun_tapered_sliding_window(data.shape[0], params, report)
            relation = 'weight'
        elif params['method'] in ('distance', 'spatial distance', 'node distance',
                                  'nodedistance', 'spatialdistance'):
            weights, report = weightfun_spatial_distance(data, params, report)
            relation = 'weight'
        elif params['method'] in ('mtd', 'multiply temporal derivative',
                                  'multiplytemporalderivative',
                                  'temporal derivative', 'temporalderivative'):
            R, report = temporal_derivative(data, params, report)
            relation = 'coupling'
        else:
            raise ValueError(
                'Unrecognized method. See derive_with_weighted_pearson '
                'documentation for predefined methods, or enter your own '
                'weight matrix.')
    else:
        try:
            weights = np.array(params['method'])
            relation = 'weight'
        except:
            raise ValueError(
                'Unrecognized method. See documentation for predefined methods.')
        if weights.shape[0] != weights.shape[1]:
            raise ValueError("weight matrix should be square")
        if weights.shape[0] != data.shape[0]:
            raise ValueError("weight matrix must equal number of time points")

    if relation == 'weight':
        # Loop over each weight vector and calculate the Pearson correlation.
        # Note: should see if this can be made quicker in the future.
        R = np.array([
            DescrStatsW(data, weights[i, :]).corrcoef
            for i in range(0, weights.shape[0])
        ])
        # Make node,node,time
        R = R.transpose([1, 2, 0])

    # Correct jackknife direction
    if params['method'] == 'jackknife':
        # Correct inversion
        R = R * -1
        jc_z = 0
        if 'weight-var' in params.keys():
            R = np.transpose(R, [2, 0, 1])
            R = (R - R.mean(axis=0)) / R.std(axis=0)
            jc_z = 1
            R = R * params['weight-var']
            R = R.transpose([1, 2, 0])
        if 'weight-mean' in params.keys():
            R = np.transpose(R, [2, 0, 1])
            if jc_z == 0:
                R = (R - R.mean(axis=0)) / R.std(axis=0)
            R = R + params['weight-mean']
            R = np.transpose(R, [1, 2, 0])
        R = teneto.utils.set_diagonal(R, 1)

    if params['postpro'] != 'no':
        R, report = teneto.derive.postpro_pipeline(R, params['postpro'], report)
        R = teneto.utils.set_diagonal(R, 1)
    if params['report'] == 'yes' or params['report'] == True:
        teneto.derive.gen_report(report, params['report_path'], params['report_filename'])
    return R
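# The weighted-Pearson step at the heart of `derive`, in isolation (a minimal
# sketch with synthetic data, not part of teneto): one boxcar weight vector per
# time point, fed to DescrStatsW to get a node-by-node correlation matrix at each t.
import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

T, N, win = 50, 3, 7
ts = np.random.randn(T, N)                       # time x node
W = np.zeros((T, T))
for t in range(T):
    lo, hi = max(0, t - win // 2), min(T, t + win // 2 + 1)
    W[t, lo:hi] = 1.0                            # uniform window centred on t
R = np.array([DescrStatsW(ts, weights=W[t]).corrcoef for t in range(T)])
R = R.transpose([1, 2, 0])                       # node x node x time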
def main(job_no, suffix, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' + np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' + pd.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + statsmodels.__name__ + ', version = ' + statsmodels.__version__,
                       label="Imported module".ljust(30))

    result = pd.DataFrame(columns=['kmer', 'variance', 'Marginalise over central base?'])

    # Find variance due to CpG
    filename = dir + '/var_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as var_counts:
        var_counts = pickle.load(var_counts)
    filename = dir + '/context_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as context_counts:
        context_counts = pickle.load(context_counts)

    cpg_contexts = (context_counts.loc['CG', 'C'] + context_counts.loc['TG', 'C'] +
                    context_counts.loc['AG', 'C'] + context_counts.loc['GG', 'C'] +
                    context_counts.loc['CC', 'G'] + context_counts.loc['CT', 'G'] +
                    context_counts.loc['CA', 'G'] + context_counts.loc['CG', 'G'])
    CpG_ratio = cpg_contexts / context_counts.values.sum()
    non_cpg_contexts = context_counts.values.sum() - cpg_contexts
    print('Total CpG sites : ', cpg_contexts)
    print('Total intronic sites : ', context_counts.values.sum())
    print('Proportion CpG sites : ', CpG_ratio)

    var_counts['C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts['G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    CpG_vars = (var_counts.loc['CG', 'C'] + var_counts.loc['TG', 'C'] +
                var_counts.loc['AG', 'C'] + var_counts.loc['GG', 'C'] +
                var_counts.loc['CC', 'G'] + var_counts.loc['CT', 'G'] +
                var_counts.loc['CA', 'G'] + var_counts.loc['CG', 'G'])
    print('Total CpG variants : ', CpG_vars)
    non_CpG_vars = var_counts.values.sum() - CpG_vars
    m1 = CpG_vars / cpg_contexts
    m0 = non_CpG_vars / non_cpg_contexts
    m_ave = var_counts.values.sum() / context_counts.values.sum()
    print('SNV density at CpG sites : ', m1)
    print('SNV density at other sites: ', m0)
    print('Average SNV density : ', m_ave)
    t1 = CpG_ratio * (m1 - m_ave) ** 2
    t2 = (1 - CpG_ratio) * (m0 - m_ave) ** 2
    print('Variance due to CpG sites : ', t1 + t2)
    LOGGER.log_message("%.2e" % (t1 + t2), label="Variance due to CpG".ljust(50))

    # Deal with the 1-mer case.
    var_counts['C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts['T'] = var_counts['T->C'] + var_counts['T->A'] + var_counts['T->G']
    var_counts['A'] = var_counts['A->T'] + var_counts['A->C'] + var_counts['A->G']
    var_counts['G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    variant_counts = var_counts.sum(axis=0)
    variant_counts = variant_counts[variant_counts.index.isin(['C', 'T', 'A', 'G'])]
    con_counts = context_counts.sum(axis=0)
    mut_rates = variant_counts / con_counts
    w = DescrStatsW(mut_rates, weights=con_counts, ddof=0)
    row = np.array([1, w.var, 'no'])
    row = pd.Series(row, index=result.columns, name=0)
    result = result.append(row)
    row = np.array([1, 0.0, 'yes'])
    row = pd.Series(row, index=result.columns, name=1)
    result = result.append(row)

    i = 2
    for kmer_variable in [1, 2, 3]:
        filename = dir + '/var_counts_' + str(kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as var_counts:
            var_counts = pickle.load(var_counts)
        filename = dir + '/context_counts_' + str(kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as context_counts:
            context_counts = pickle.load(context_counts)

        # Reformat context counts by repeating columns to match the snv_densities dataframe.
        extended_context_counts, cols = probpoly_bayes.reformat_context_counts(
            context_counts, var_counts)
        extended_context_counts.set_index(context_counts.index, inplace=True)
        snv_densities = var_counts / extended_context_counts

        # Calculate variance across mutation types, marginalising over the central base.
        context_ratios = context_counts.div(context_counts.sum(axis=1), axis=0)
        extended_context_ratios, cols = probpoly_bayes.reformat_context_counts(
            context_ratios, snv_densities)
        extended_context_ratios.set_index(context_ratios.index, inplace=True)
        con_weighted = (snv_densities * extended_context_ratios).sum(axis=1)
        u = DescrStatsW(con_weighted, weights=context_counts.sum(axis=1), ddof=0)
        print('Marginalised variance due to ' + str(2 * kmer_variable + 1) + '-mers = ', u.var)
        row = np.array([2 * kmer_variable + 1, u.var, 'yes'])
        row = pd.Series(row, index=result.columns, name=i)
        result = result.append(row)
        i += 1

        # Calculate variance conditioned on kmer, not marginalising over the central base.
        # Firstly we reorganise the SNV densities table so that rows correspond to kmers
        # (including the central base) and columns correspond to the derived base.
        contexts_generator = product('ACGT', repeat=2 * kmer_variable + 1)
        contexts = tuple(''.join(context) for context in contexts_generator)
        kmer_densities = np.zeros((len(contexts), 4))
        kmer_densities = pd.DataFrame(kmer_densities, index=contexts, columns=['C', 'T', 'A', 'G'])
        for context in snv_densities.index:
            for mut in snv_densities.columns:
                ref = mut[0]
                derived = mut[3]
                kmer = context[0:kmer_variable] + ref + context[kmer_variable:2 * kmer_variable]
                kmer_densities.loc[kmer, derived] = snv_densities.loc[context, mut]

        # We also reorganise context counts into counts of kmers.
        kmer_counts = np.zeros(len(contexts))
        kmer_counts = pd.Series(kmer_counts, index=contexts)
        for kmer in kmer_counts.index:
            context = kmer[0:kmer_variable] + kmer[kmer_variable + 1:2 * kmer_variable + 1]
            ref = kmer[kmer_variable]
            kmer_counts[kmer] = context_counts.loc[context, ref]

        # Calculate the weighted variance over the full kmer.
        v = DescrStatsW(kmer_densities.sum(axis=1), weights=kmer_counts, ddof=0)
        print('Unmarginalised variance due to ' + str(2 * kmer_variable + 1) + '-mers = ', v.var)
        row = np.array([2 * kmer_variable + 1, v.var, 'no'])
        row = pd.Series(row, index=result.columns, name=i)
        result = result.append(row)
        i += 1

    print(result)
    filename = dir + "/aggregated_results" + job_no + ".csv"
    result.to_csv(filename, sep=',')
    outfile = open(filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="Run duration (minutes)".ljust(50))
def bootstrap_correlation_difference(self, x_data_1, x_data_2, y_data_1, y_data_2,
                                     weights=None, corr_type='pearson', test_value=0,
                                     ci_factor=1.96, two_tailed=True,
                                     detect_inliers=True, outlier_num_stds=3,
                                     generate_diagnostic=False):
    """
    This function performs either a pearson or a spearman correlation on two
    sets of data. Within each fold, it does so for two sets of data (same
    permutation, different variables), and subtracts the correlation
    coefficients. Finally, it tests whether the correlations differ from each
    other (whether the difference in correlations is different from 0).

    :param x_data_1: input x_data
    :type x_data_1: 1-D array
    :param y_data_1: input y_data
    :type y_data_1: 1-D array with same shape as x_data
    :param x_data_2: input x_data
    :type x_data_2: 1-D array
    :param y_data_2: input y_data
    :type y_data_2: 1-D array with same shape as x_data
    :param weights: possible input weights when calculating mean
    :type weights: 1-D array with same n_observations as data
    :param reps: amount of bootstrap repetitions
    :type reps: int
    :param test_value: value to test bootstrap distribution against
    :type test_value: float
    :param ci_factor: z-score to multiply std of bootstrap distr with
    :type ci_factor: float
    :param two_tailed: when True, returns two-sided p-val, else returns one-sided p-val
    :type two_tailed: bool
    :param detect_inliers: uses outlier detection when True
    :type detect_inliers: bool
    :param outlier_num_stds: number of stds from median in outlier detection
    :type outlier_num_stds: float
    :param generate_diagnostic: if True, make plot of the bootstrap distr
    :type generate_diagnostic: bool
    :return center: central estimate of bootstrap distribution
    :type center: float
    :return ci: ci of central estimate
    :type ci: float
    :return p-val: p-val for difference with test_value
    :type p-val: float
    """
    # set weights to one if none provided
    if weights is None:
        weights = np.ones(len(x_data_1))

    if detect_inliers:
        x_1_inliers = self.detect_inliers_mad(x_data_1, outlier_num_stds)
        y_1_inliers = self.detect_inliers_mad(y_data_1, outlier_num_stds)
        x_2_inliers = self.detect_inliers_mad(x_data_2, outlier_num_stds)
        y_2_inliers = self.detect_inliers_mad(y_data_2, outlier_num_stds)
        all_inliers = x_1_inliers * x_2_inliers * y_1_inliers * y_2_inliers
        x_data_1 = x_data_1[all_inliers]
        y_data_1 = y_data_1[all_inliers]
        x_data_2 = x_data_2[all_inliers]
        y_data_2 = y_data_2[all_inliers]
        weights = weights[all_inliers]

    N = len(x_data_1)

    # get random ints for random indices
    permute_indices = np.random.randint(0, len(x_data_1),
                                        size=(len(x_data_1), int(self.reps))).T

    # rank transform data if spearman is requested
    if corr_type == 'spearman':
        x_data_1 = stats.rankdata(x_data_1)
        y_data_1 = stats.rankdata(y_data_1)
        x_data_2 = stats.rankdata(x_data_2)
        y_data_2 = stats.rankdata(y_data_2)

    bootstrap_distr = []
    bootstrap_distr_z = []
    # loop over permutes
    for fold in permute_indices:
        r_1 = DescrStatsW(data=np.vstack([x_data_1[fold], y_data_1[fold]]).T,
                          weights=weights[fold]).corrcoef[0, 1]
        r_2 = DescrStatsW(data=np.vstack([x_data_2[fold], y_data_2[fold]]).T,
                          weights=weights[fold]).corrcoef[0, 1]
        bootstrap_distr.append(r_1 - r_2)
        # fisher transform correlations
        zr1 = np.arctanh(r_1)
        zr2 = np.arctanh(r_2)
        bootstrap_distr_z.append(zr1 - zr2)

    # calculate p-val
    p = self.p_val_from_bootstrap_dist(bootstrap_distr_z, test_value, two_tailed)

    # compute central corr on all data
    r1 = DescrStatsW(data=np.vstack([x_data_1, y_data_1]).T,
                     weights=weights).corrcoef[0, 1]
    r2 = DescrStatsW(data=np.vstack([x_data_2, y_data_2]).T,
                     weights=weights).corrcoef[0, 1]
    r_diff = r1 - r2

    # return standard deviation of bootstrap distro as CI
    r_diff_ci = np.std(bootstrap_distr) * ci_factor

    if generate_diagnostic:
        f = pl.figure(figsize=(5, 5))
        s = f.add_subplot(111)
        pl.title('input data')
        pl.hist(bootstrap_distr, 100)
        pl.axvline(r_diff, color='k', label='center', lw=5)
        pl.axvline(r_diff + r_diff_ci, color='r', label='ci', lw=5)
        pl.axvline(r_diff - r_diff_ci, color='r', label='ci', lw=5)
        pl.legend(loc='best')
        sn.despine(offset=2)
        pl.savefig('/home/vanes/temp/plots/corr_bootstrap_distr_outlier_detection_%s_%d.pdf'
                   % (detect_inliers, np.random.randint(1e8)))
        pl.close()

    return r_diff, r_diff_ci, p, N
def bootstrap_correlation(self, x_data, y_data, weights=None, corr_type='pearson',
                          test_value=0, ci_factor=1.96, two_tailed=True,
                          detect_inliers=True, outlier_num_stds=3,
                          generate_diagnostic=False):
    """
    Bootstraps the (weighted) pearson or spearman correlation between x_data
    and y_data. Returns the correlation on all data together with a
    bootstrapped CI and a p-value against test_value.

    :param x_data: input x_data
    :type x_data: 1-D array
    :param y_data: input y_data
    :type y_data: 1-D array with same shape as x_data
    :param weights: possible input weights when calculating mean
    :type weights: 1-D array with same n_observations as data
    :param reps: amount of bootstrap repetitions
    :type reps: int
    :param test_value: value to test bootstrap distribution against
    :type test_value: float
    :param ci_factor: z-score to multiply std of bootstrap distr with
    :type ci_factor: float
    :param two_tailed: when True, returns two-sided p-val, else returns one-sided p-val
    :type two_tailed: bool
    :param detect_inliers: uses outlier detection when True
    :type detect_inliers: bool
    :param outlier_num_stds: number of stds from median in outlier detection
    :type outlier_num_stds: float
    :param generate_diagnostic: if True, make plot of the bootstrap distr
    :type generate_diagnostic: bool
    :return center: central estimate of bootstrap distribution
    :type center: float
    :return ci: ci of central estimate
    :type ci: float
    :return p-val: p-val for difference with test_value
    :type p-val: float
    """
    # remove nans from signal if present
    # set weights to one if none provided
    if weights is None:
        weights = np.ones(len(x_data))

    if detect_inliers:
        x_inliers = self.detect_inliers_mad(x_data, outlier_num_stds)
        y_inliers = self.detect_inliers_mad(y_data, outlier_num_stds)
        x_data = x_data[x_inliers * y_inliers]
        y_data = y_data[x_inliers * y_inliers]
        weights = weights[x_inliers * y_inliers]

    N = len(x_data)

    # get random ints for random indices
    permute_indices = np.random.randint(0, len(x_data),
                                        size=(len(x_data), int(self.reps))).T

    # rank transform data if spearman is requested
    if corr_type == 'spearman':
        x_data = stats.rankdata(x_data)
        y_data = stats.rankdata(y_data)

    bootstrap_distr = []
    bootstrap_distr_z = []
    # loop over permutes
    for fold in permute_indices:
        r = DescrStatsW(data=np.vstack([x_data[fold], y_data[fold]]).T,
                        weights=weights[fold]).corrcoef[0, 1]
        z = np.arctanh(r)
        bootstrap_distr.append(r)
        bootstrap_distr_z.append(z)

    # calculate p-val
    p = self.p_val_from_bootstrap_dist(bootstrap_distr_z, test_value, two_tailed)

    # compute central corr on all data
    corr = DescrStatsW(data=np.vstack([x_data, y_data]).T,
                       weights=weights).corrcoef[0, 1]

    # return standard deviation of bootstrap distro as CI
    # corr_ci = np.std(bootstrap_distr) * ci_factor
    corr_ci = self.get_ci(bootstrap_distr, ci_factor)

    if generate_diagnostic:
        f = pl.figure(figsize=(5, 5))
        s = f.add_subplot(111)
        pl.title('input data')
        pl.hist(bootstrap_distr, 100)
        pl.axvline(corr, color='k', label='center', lw=5)
        pl.axvline(corr + corr_ci, color='r', label='ci', lw=5)
        pl.axvline(corr - corr_ci, color='r', label='ci', lw=5)
        pl.legend(loc='best')
        sn.despine(offset=2)
        pl.savefig('/home/vanes/temp/plots/corr_bootstrap_distr_outlier_detection_%s_%d.pdf'
                   % (detect_inliers, np.random.randint(1e8)))
        pl.close()

    return corr, corr_ci, p, N
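# The resampling core shared by the bootstrap methods above, stripped of the class
# scaffolding (illustrative sketch with synthetic data): draw row indices with
# replacement and recompute the weighted correlation on each draw.
import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = x + rng.normal(size=100)
w = rng.uniform(0.5, 1.5, size=100)
reps = [DescrStatsW(np.column_stack([x[idx], y[idx]]), weights=w[idx]).corrcoef[0, 1]
        for idx in rng.integers(0, 100, size=(1000, 100))]
print(np.mean(reps), np.std(reps))   # bootstrap centre and spread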
def bootstrap(self, data, center_estimate='mean', weights=None, test_value=0,
              ci_factor=1.96, two_tailed=True, detect_inliers=True,
              outlier_num_stds=3, generate_diagnostic=False, return_d=False):
    """
    This finds a distribution of the (weighted) average or regular median of
    the data, returning the median of this distribution along with the std
    from the median.

    :param data: input data
    :type data: array of shape: [n_variables, n_observations]
    :param center_estimate: central measure: mean or median
    :type center_estimate: string
    :param weights: possible input weights when calculating mean
    :type weights: 1-D array with same n_observations as data
    :param reps: amount of bootstrap repetitions
    :type reps: int
    :param test_value: value to test bootstrap distribution against
    :type test_value: float
    :param ci_factor: z-score to multiply std of bootstrap distr with
    :type ci_factor: float
    :param two_tailed: when True, returns two-sided p-val, else returns one-sided p-val
    :type two_tailed: bool
    :param detect_inliers: uses outlier detection when True
    :type detect_inliers: bool
    :param outlier_num_stds: number of stds from median in outlier detection
    :type outlier_num_stds: float
    :param generate_diagnostic: if True, make plot of the bootstrap distr
    :type generate_diagnostic: bool
    :return center: central estimate of bootstrap distribution
    :type center: float
    :return ci: ci of central estimate
    :type ci: float
    :return p-val: p-val for difference with test_value
    :type p-val: float
    """
    means = []
    ps = []
    Ns = []
    cis = []
    cohen_d = []

    # put single array in iterable container
    if np.ndim(data) == 1:
        data = [data]

    # loop over different variables
    for di in range(np.shape(data)[0]):
        # if there's no data in here, or when all values are nans, set results to nans:
        if len(data[di][np.invert(np.isnan(data[di]))]) == 0:
            means.append(np.nan)
            ps.append(np.nan)
            cohen_d.append(np.nan)
            Ns.append(np.nan)
        # else do bootstrap
        else:
            # set weights to one if none provided
            if weights is None:
                these_weights = np.ones(len(data[di]))
            else:
                these_weights = copy.copy(weights)

            # remove nan values from data and weights
            valid_values = np.invert(np.isnan(data[di]))
            these_data = data[di][valid_values]
            these_weights = these_weights[valid_values]

            # remove outliers from data and weights
            if detect_inliers:
                inliers = self.detect_inliers_mad(these_data, outlier_num_stds)
                # inliers = self.detect_inliers_std(these_data, outlier_num_stds, weights=weights)
                these_data = these_data[inliers]
                these_weights = these_weights[inliers]

            Ns.append(len(these_data))

            # get random ints for random indices
            permute_indices = np.random.randint(0, len(these_data),
                                                size=(len(these_data), int(self.reps)))

            # now average over all these random draws
            if center_estimate == 'mean':
                # in weighted fashion in case of average
                bootstrap_distr = np.average(these_data[permute_indices],
                                             weights=these_weights[permute_indices],
                                             axis=0)
            elif center_estimate == 'median':
                # or regular median
                bootstrap_distr = np.median(these_data[permute_indices], axis=0)
            elif center_estimate == 'std':
                bootstrap_distr = np.std(these_data[permute_indices], axis=0)

            # calculate p-val
            ps.append(self.p_val_from_bootstrap_dist(bootstrap_distr, test_value, two_tailed))
            # get ci
            cis.append(self.get_ci(bootstrap_distr, ci_factor))

            # return standard deviation of bootstrap distro for plotting
            if center_estimate == 'mean':
                means.append(np.average(these_data, weights=these_weights))
            elif center_estimate == 'median':
                means.append(np.median(these_data))
            elif center_estimate == 'std':
                means.append(np.std(these_data))
            # ses.append(np.std(bootstrap_distr) * ci_factor)

            if generate_diagnostic:
                f = pl.figure(figsize=(5, 5))
                s = f.add_subplot(111)
                pl.title('input data')
                pl.hist(bootstrap_distr, 100)
                pl.axvline(np.average(these_data, weights=these_weights),
                           color='k', label='center', lw=5)
                pl.axvline(np.average(these_data, weights=these_weights)
                           + np.std(bootstrap_distr) * ci_factor,
                           color='r', label='ci', lw=5)
                pl.axvline(np.average(these_data, weights=these_weights)
                           - np.std(bootstrap_distr) * ci_factor,
                           color='r', label='ci', lw=5)
                pl.legend(loc='best')
                sn.despine(offset=2)
                pl.savefig('/home/vanes/temp/plots/bootstrap_distr_outlier_detection_%s_%d.pdf'
                           % (detect_inliers, np.random.randint(1e8)))
                pl.close()

            # calculate cohen's d:
            cohen_d.append((np.average(these_data, weights=these_weights) - test_value)
                           / DescrStatsW(data=these_data, weights=these_weights).std)

    if return_d:
        return (np.squeeze(means), np.squeeze(cis), np.squeeze(ps),
                np.squeeze(Ns), np.squeeze(cohen_d))
    else:
        return np.squeeze(means), np.squeeze(cis), np.squeeze(ps), np.squeeze(Ns)
def tabulate_march_inequality(year):
    """
    For years 1964-2009 (year is March year, not earnings year), tabulate:

    These inequality metrics:
    - 90/50, 50/10, 90/10, Vln
    - 60/50, 70/50, 80/50, 95/50, 97/50
    - 50/3, 50/5, 50/20, 50/30, 50/40

    For these samples:
    - Males
    - Females
    - Both

    For these wage measures:
    - All hourly

    For these conditioning variables:
    - raw wage inequality
    - residual wage inequality

    Also note:
    - Always dropping allocators where possible

    D. Autor, 2/24/2004
    D. Autor, 6/15/2004 - Updated for consistency of controls for quantile simulation methods
    M. Anderson, 12/13/2005 - Updated for new quantiles and years
    D. Autor, 9/5/2006. Updated for 2005 March
    M. Wasserman, 10/14/2009 Updated for 2007/8 March
    """
    df = tabulate_march_basic(year)

    df = df.eval("""
        lnwinc = log(winc_ws) + log(gdp)
        lnhinc = log(hinc_ws) + log(gdp)
    """)

    # Full-time and hourly samples
    df = df.eval("ftfy = fulltime*fullyear")
    df.ftfy.describe().to_frame().T

    df = df.eval("""
        ftsamp = (lnwinc == lnwinc) * ftfy * abs(bcwkwgkm-1)
        hrsamp = (lnhinc == lnhinc) * abs(bchrwgkm-1)
    """)
    # @ ftsamp: weekly real wage not none + ftfy + above weekly real wage limit
    # @ hrsamp: hourly real wage not none + above hourly real wage limit
    df.loc[df.ftsamp == 0, "lnwinc"] = np.nan
    df.loc[df.hrsamp == 0, "lnhinc"] = np.nan
    df.query("ftsamp == 1")["lnwinc"].describe().to_frame().T
    df.query("hrsamp == 1")["lnhinc"].describe().to_frame().T

    df = df.query("ftsamp == 1 | hrsamp == 1")

    # Generate experience categories
    df = df.assign(expcat=(df.exp / 3).astype(int) + 1)
    df.loc[df.expcat == 17, "expcat"] = 16
    assert df.eval("1 <= expcat <= 16").all()
    df.groupby("expcat")["exp"].agg(["mean", "min", "max"])

    # interaction terms - 80 of these
    # @ move to residual wage part
    # Drop reference group's interaction term: HSG with 0-2 years of experience
    # @ similarly skip for now
    df = df.filter(["year", "wgt", "wgt_hrs", "female", "lnwinc", "lnhinc",
                    "hrsamp", "ftsamp", "edcat", "expcat"])

    ######################################################################
    # Summarize raw inequality
    ######################################################################
    pctiles = pd.Series([3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 97])
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)
    tot_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_mf"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_m"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_f"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_mf"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_m"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_f"] = [wq.mean, wq.var]

    df_stat = pd.concat([tot_stat, tot_pct], axis=0, sort=False)
    ######################################################################
    # Summarize residual inequality - Weekly & Hourly
    ######################################################################
    res_pct = pd.DataFrame(index=pctiles)
    res_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    y, X = dmatrices('lnwinc ~ female + C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_mf"] = [wq.mean, wq.var]
    # @ mean is not necessary, but kept for consistency
    res_pct["res_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_m"] = [wq.mean, wq.var]
    res_pct["res_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_f"] = [wq.mean, wq.var]
    res_pct["res_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1")
    y, X = dmatrices('lnhinc ~ female + C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_mf"] = [wq.mean, wq.var]
    res_pct["res_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==0")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_m"] = [wq.mean, wq.var]
    res_pct["res_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==1")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt,
                     return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_f"] = [wq.mean, wq.var]
    res_pct["res_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    df_stat_ = pd.concat([res_stat, res_pct], axis=0)
    df_stat = pd.concat([df_stat, df_stat_], axis=1)
    # march-ineq-data-`1'
    df_stat = df_stat.T.rename_axis('sample').reset_index().assign(year=year)  # @ tidy data

    ######################################################################
    # Percentiles of weekly earnings
    ######################################################################
    # @ simply generate more percentiles under the full-time samples
    # @ note: `year` is the March census year, so subtract one to get the
    # @ earnings year
    pctiles = pd.Series(range(3, 98))
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    # march-pctile-`yr'
    tot_pct = tot_pct.T.rename_axis('sample').reset_index().assign(year=year-1)  # @ tidy data
    # @ the original code then combines the 1963-2008 per-year files;
    # @ we drop that step as it may not be necessary
    # @ this part could also be merged with the "Summarize raw inequality" part
    return df_stat, tot_pct
EA_err_2 = np.array([0.8, 0.7])

x_03 = np.arange(5, 7, 1)
EA_3 = np.array([0.4716198, 0.4716227]) * 1e6
EA_err_3 = np.array([0.3, 1.4])

x_04 = np.arange(7, 10, 1)
EA_4 = np.array([0.4716217, 0.4716233, 0.4716227]) * 1e6
EA_err_4 = np.array([0.36, 0.56, 0.26])

x_05 = np.arange(10, 14, 1)
EA_5 = np.array([0.4716183, 0.4716189, 0.4716212, 0.4716210]) * 1e6
EA_err_5 = np.array([0.9, 0.3, 0.4, 0.8])

EA_T = EA_2.tolist() + EA_3.tolist() + EA_4.tolist() + EA_5.tolist()
EA_err_T = (EA_err_2.tolist() + EA_err_3.tolist() + EA_err_4.tolist()
            + EA_err_5.tolist())
np.average(EA_T, weights=EA_err_T)
w_stats = DescrStatsW(EA_T, weights=EA_err_T, ddof=0)

EA_6 = np.array([0.4716115, 0.471626]) * 1e6
EA_err_6 = np.array([1, 25])
x_06 = np.arange(14, 16, 1)

plt.figure(40)
# plt.scatter(x_01, EA_1, color='blue', label='Feb-11, Feb-12')
# plt.errorbar(x_01, EA_1, yerr=EA_err_1, fmt='.k', color='blue',
#              capthick=0.5, capsize=5, elinewidth=0.5)
plt.scatter(x_02, EA_2, color='black', label='Feb-16')
plt.errorbar(x_02, EA_2, yerr=EA_err_2, fmt='.k', color='black', capthick=0.5,
def test_weightstats_ddof_tests(self):
    # explicit test that ttest and confint are independent of ddof
    # one sample case
    x1_2d = self.x1_2d
    w1 = self.w1

    d1w_d0 = DescrStatsW(x1_2d, weights=w1, ddof=0)
    d1w_d1 = DescrStatsW(x1_2d, weights=w1, ddof=1)
    d1w_d2 = DescrStatsW(x1_2d, weights=w1, ddof=2)

    # check ttest independent of user ddof
    res0 = d1w_d0.ttest_mean()
    res1 = d1w_d1.ttest_mean()
    res2 = d1w_d2.ttest_mean()
    # concatenate into one array with np.r_
    assert_almost_equal(np.r_[res1], np.r_[res0], 14)
    assert_almost_equal(np.r_[res2], np.r_[res0], 14)

    res0 = d1w_d0.ttest_mean(0.5)
    res1 = d1w_d1.ttest_mean(0.5)
    res2 = d1w_d2.ttest_mean(0.5)
    assert_almost_equal(np.r_[res1], np.r_[res0], 14)
    assert_almost_equal(np.r_[res2], np.r_[res0], 14)

    # check confint independent of user ddof
    res0 = d1w_d0.tconfint_mean()
    res1 = d1w_d1.tconfint_mean()
    res2 = d1w_d2.tconfint_mean()
    assert_almost_equal(res1, res0, 14)
    assert_almost_equal(res2, res0, 14)
def _create_histogram_distribution(self, df, min_x=None, max_x=None,
                                   extend_x_proportion_percentage=20,
                                   postfix_label=None, obs_weights=None,
                                   denormalised=True):
    # get min/max values for our histogram
    min_hist_x = df.min()
    max_hist_x = df.max()

    extend_x_proportion_percentage = 1.0 + (
        float(extend_x_proportion_percentage) / 100.0)

    # extend axes for PDF, so just outside histogram
    if min_x is not None:
        min_x = min(min_x, min_hist_x) * extend_x_proportion_percentage
    else:
        min_x = min_hist_x

    if max_x is not None:
        max_x = max(max_x, max_hist_x) * extend_x_proportion_percentage
    else:
        max_x = max_hist_x

    # denormalised output keeps raw counts; otherwise normalize to a density
    # (the original only set `density` in the denormalised branch, which
    # raised a NameError for denormalised=False)
    density = not denormalised

    vals = df.T.values.astype(np.float64)

    # create a histogram with 10 buckets
    hist, bins = np.histogram(vals, bins=10,
                              range=[float(min_hist_x), float(max_hist_x)],
                              density=density, weights=obs_weights)
    bin_cent = (bins[1:] + bins[:-1]) * 0.5

    number_of_elements = len(df.values)
    dist_space = np.linspace(min_x, max_x, 100)

    if postfix_label is None:
        postfix_label = ''
    else:
        postfix_label = ": " + postfix_label

    if number_of_elements > 1:
        # create a best-fit PDF using a Gaussian KDE model
        # (forcibly cast to float64)
        if obs_weights is None:
            kde = gaussian_kde(vals)
        else:
            kde = gaussian_weighted_kde(
                vals, weights=obs_weights.values.astype(np.float64))

        # sometimes need to transpose so the dimensions are consistent
        try:
            pdf_fit = kde(dist_space)
        except Exception:
            pdf_fit = kde(dist_space.T)

        # calculate the normal PDF from the (weighted) mean and std
        if obs_weights is None:
            weighted_stats = DescrStatsW(df.values, ddof=0)
        else:
            weighted_stats = DescrStatsW(df.values,
                                         weights=obs_weights.T.values,
                                         ddof=0)

        mu = weighted_stats.mean
        std = weighted_stats.std
        normal_pdf_fit = norm.pdf(dist_space, mu, std)

        # scale KDE PDF (and normal PDF) by total/bin size
        if denormalised:
            bin_width = abs(bins[1] - bins[0])
            N = np.sum(hist)
            pdf_fit = pdf_fit * (bin_width * N)
            normal_pdf_fit = normal_pdf_fit * (bin_width * N)

        df_hist = pd.DataFrame(index=bin_cent, data=hist,
                               columns=['Histogram' + postfix_label])
        df_pdf = pd.DataFrame(index=dist_space, data=pdf_fit,
                              columns=['KDE-PDF' + postfix_label])
        df_pdf['Norm-PDF' + postfix_label] = normal_pdf_fit
    else:
        return pd.DataFrame(), pd.DataFrame()

    return df_hist, df_pdf
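# Aside (not from the original; assumes SciPy >= 1.2): scipy.stats.gaussian_kde
# accepts a `weights` argument directly, so a custom gaussian_weighted_kde
# helper may no longer be needed. A self-contained sketch:
import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(1)
vals_demo = rng.normal(size=500)
w_demo = rng.uniform(0.5, 2.0, size=500)
kde_demo = gaussian_kde(vals_demo, weights=w_demo)  # weights are normalized internally
pdf_demo = kde_demo(np.linspace(vals_demo.min(), vals_demo.max(), 100))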
def get_descriptives(cls, ddof=0):
    cls.descriptive = DescrStatsW(cls.data, cls.weights, ddof)
diagnostico_b = df.query("diagnosis == 'B'")

# Running the z-test for the mean (comparing the results)
ztest(diagnostico_m['mean_radius'], value=diagnostico_m['mean_radius'].mean())
ztest(diagnostico_m['mean_radius'], value=diagnostico_b['mean_radius'].mean())

# Generating the confidence interval
zconfint(diagnostico_m['mean_radius'])
zconfint(diagnostico_b['mean_radius'])

"""----------------------------------------------------------------------------
T Test
"""

diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Applying the test
resultados_m = DescrStatsW(diagnostico_m['mean_radius'])
resultados_b = DescrStatsW(diagnostico_b['mean_radius'])

# Generating the confidence interval
resultados_m.tconfint_mean()
resultados_b.tconfint_mean()
def test_weightstats_len_1():
    x1 = [1]
    w1 = [1]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0.0, 0.5, 1.0]) == 1).all()
def fit(self):
    """Calculate the augmented inverse probability weights and effect
    measures from the predicted exposure probabilities and predicted
    outcome values.

    Note
    ----
    Exposure and outcome models must be specified prior to `fit()`

    Returns
    -------
    For binary outcomes, gains `risk_difference`, `risk_difference_ci`,
    and `risk_ratio` attributes. For continuous outcomes, gains
    `average_treatment_effect` and `average_treatment_effect_ci` attributes
    """
    if (self._fit_exposure_ is False) or (self._fit_outcome_ is False):
        raise ValueError(
            'The exposure and outcome models must be specified before the '
            'doubly robust estimate can be generated')

    if self._miss_flag and not self._fit_missing_:
        warnings.warn(
            "All missing outcome data are assumed to be missing completely "
            "at random. To relax this assumption to missing at random, "
            "please use the `missing_model()` function", UserWarning)

    # Observed exposure/outcome and predicted outcomes under each arm
    a_obs = self.df[self.exposure]
    y_obs = self.df[self.outcome]
    py_a1 = self.df['_pY1_']
    py_a0 = self.df['_pY0_']

    if self._fit_missing_:
        ps_g1 = self.df['_g1_'] * self.df['_ipmw_a1_']
        ps_g0 = self.df['_g0_'] * self.df['_ipmw_a0_']
    else:
        ps_g1 = self.df['_g1_']
        ps_g0 = self.df['_g0_']

    # Doubly robust estimator under all treated
    dr_a1 = np.where(a_obs == 1,
                     (y_obs / ps_g1) - ((py_a1 * ps_g0) / ps_g1),
                     py_a1)

    # Doubly robust estimator under all untreated
    dr_a0 = np.where(a_obs == 1,
                     py_a0,
                     (y_obs / ps_g0 - ((py_a0 * ps_g1) / ps_g0)))

    # Generating estimates for the risk difference and risk ratio
    zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

    if self._weight_ is None:
        if self._continuous_outcome:
            self.average_treatment_effect = np.nanmean(dr_a1) - np.nanmean(dr_a0)
            var_ic = np.nanvar((dr_a1 - dr_a0) - self.average_treatment_effect,
                               ddof=1) / self.df.shape[0]
            self.average_treatment_effect_se = np.sqrt(var_ic)
            self.average_treatment_effect_ci = [
                self.average_treatment_effect - zalpha * np.sqrt(var_ic),
                self.average_treatment_effect + zalpha * np.sqrt(var_ic)]
        else:
            self.risk_difference = np.nanmean(dr_a1) - np.nanmean(dr_a0)
            self.risk_ratio = np.nanmean(dr_a1) / np.nanmean(dr_a0)
            var_ic = np.nanvar((dr_a1 - dr_a0) - self.risk_difference,
                               ddof=1) / self.df.shape[0]
            self.risk_difference_se = np.sqrt(var_ic)
            self.risk_difference_ci = [
                self.risk_difference - zalpha * np.sqrt(var_ic),
                self.risk_difference + zalpha * np.sqrt(var_ic)]
    else:
        dr_m1 = DescrStatsW(dr_a1, weights=self.df[self._weight_]).mean
        dr_m0 = DescrStatsW(dr_a0, weights=self.df[self._weight_]).mean
        if self._continuous_outcome:
            self.average_treatment_effect = dr_m1 - dr_m0
        else:
            self.risk_difference = dr_m1 - dr_m0
            self.risk_ratio = dr_m1 / dr_m0
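# Reference note (added; the standard AIPW identity, not text from the original):
# the treated-arm pseudo-outcome used in fit() is
#     psi_1 = A * Y / g1 - (A - g1) / g1 * E[Y | A=1, X]
# which simplifies to Y/g1 - pY1*g0/g1 when A == 1 (taking g0 = 1 - g1) and to
# pY1 when A == 0 -- exactly the two branches of the np.where(...) for dr_a1.
# The untreated arm dr_a0 is the symmetric expression with g0 and pY0.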
def test_weightstats_2d_w1():
    x1 = [[1], [2]]
    w1 = [[1], [2]]
    d1 = DescrStatsW(x1, w1)
    print(len(np.array(w1).shape))
    assert (d1.quantile([0.5, 1.0]) == 2).all().all()
def _standardized_difference_(df, treatment, var_type, weight, weighted=True):
    """Background function to calculate the standardized mean difference
    between the treated and untreated for a specified variable. Useful for
    checking whether a confounder was balanced between the two treatment
    groups by the specified IPTW model

    SMD based on: Austin PC 2011;
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3144483/
    """
    def _categorical_cov_(a, b):
        """Turns out, pandas and numpy don't have the correct covariance
        matrix I need for categorical variables. The covariance matrix is
        defined as

            S = [S_{kl}] where
            S_{kl} = (P_{1k}*(1-P_{1k}) + P_{2k}*(1-P_{2k})) / 2   if k == l
            S_{kl} = -(P_{1k}*P_{1l} + P_{2k}*P_{2l}) / 2          if k != l
        """
        cv2 = []
        for i, v in enumerate(a):
            cv1 = []
            if i == 0:
                pass
            else:
                for j, w in enumerate(b):
                    if j == 0:
                        pass
                    elif i == j:
                        cv1.append((v * (1 - v) + w * (1 - w)) / 2)
                    else:
                        cv1.append((a[i] * a[j] + b[i] * b[j]) / -2)
                cv2.append(cv1)
        return np.array(cv2)

    # Pulling out relevant data
    dft = df.loc[(df[treatment] == 1) & (df[weight].notnull())].copy()
    dfn = df.loc[(df[treatment] == 0) & (df[weight].notnull())].copy()
    vcols = list(df.columns)
    vcols.remove(treatment)
    vcols.remove(weight)

    if var_type == 'binary':
        if weighted:
            dwt = DescrStatsW(dft[vcols], weights=dft[weight])
            wt = dwt.mean
            dwn = DescrStatsW(dfn[vcols], weights=dfn[weight])
            wn = dwn.mean
        else:
            wt = np.mean(dft[vcols].dropna(), axis=0)
            wn = np.mean(dfn[vcols].dropna(), axis=0)
        return float((wt - wn) / np.sqrt((wt * (1 - wt) + wn * (1 - wn)) / 2))

    elif var_type == 'continuous':
        if weighted:
            dwt = DescrStatsW(dft[vcols], weights=dft[weight], ddof=1)
            wmt = dwt.mean
            wst = dwt.std
            dwn = DescrStatsW(dfn[vcols], weights=dfn[weight], ddof=1)
            wmn = dwn.mean
            wsn = dwn.std
        else:
            dwt = DescrStatsW(dft[vcols], ddof=1)
            wmt = dwt.mean
            wst = dwt.std
            dwn = DescrStatsW(dfn[vcols], ddof=1)
            wmn = dwn.mean
            wsn = dwn.std
        return float((wmt - wmn) / np.sqrt((wst ** 2 + wsn ** 2) / 2))

    elif var_type == 'categorical':
        if weighted:
            wt = np.average(dft[vcols], weights=dft[weight], axis=0)
            wn = np.average(dfn[vcols], weights=dfn[weight], axis=0)
        else:
            wt = np.mean(dft[vcols], axis=0)
            wn = np.mean(dfn[vcols], axis=0)

        t_c = wt - wn
        s_inv = np.linalg.inv(_categorical_cov_(a=wt, b=wn))
        return float(np.sqrt(np.dot(np.transpose(t_c[1:]),
                                    np.dot(s_inv, t_c[1:]))))

    else:
        raise ValueError('Not supported')
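# Usage sketch (hypothetical data, assuming _standardized_difference_ as
# defined above). An SMD near 0 suggests the weighted confounder distribution
# is balanced between the treated and untreated groups.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({'treat': rng.binomial(1, 0.5, 200),
                     'conf': rng.binomial(1, 0.4, 200),
                     'iptw_w': rng.uniform(0.5, 2.0, 200)})
smd = _standardized_difference_(demo, treatment='treat',
                                var_type='binary', weight='iptw_w')
print(smd)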
def getMean(array):
    # uniform weights, so this is equivalent to the unweighted mean
    weights = np.ones_like(array)
    stats = DescrStatsW(array, weights=weights, ddof=0)
    return stats.mean
def create_stats(self):
    """Compute statistical properties of column variable

    This function computes the statistical properties of values in the
    specified column. It is called by other functions that use the
    resulting figures to create a statistical overview.
    """
    # reset stats containers
    self.stat_vars = []
    self.stat_vals = {}
    self.print_lines = []
    self.latex_table = []

    # determine column properties
    col_props = self.get_col_props()

    # get value counts
    cnt, var_cnt, dist_cnt = (len(self.col), len(self.col_nn),
                              self.col.nunique())
    if self.weights_nn is not None:
        cnt, var_cnt = int(sum(self.weights)), int(sum(self.weights_nn))
    for stat_var, stat_val in zip(('count', 'filled', 'distinct'),
                                  (cnt, var_cnt, dist_cnt)):
        self.stat_vars.append(stat_var)
        self.stat_vals[stat_var] = (stat_val, '{:d}'.format(stat_val))

    n_nan = self.col.isnull().sum()
    if n_nan:
        self.stat_vars.append('nan')
        self.stat_vals['nan'] = (n_nan, '{:d}'.format(n_nan))

    # add value counts to print lines
    self.print_lines.append(
        '{}:'.format(self.label if self.label else self.name))
    ratio = (var_cnt / cnt) * 100 if cnt != 0 else 0
    self.print_lines.append('{0:d} entries ({1:.0f}%)'.format(var_cnt, ratio))
    self.print_lines.append('{0:d} unique entries'.format(dist_cnt))

    # convert time stamps to integers
    if col_props['is_ts']:
        col_num = self.col_nn.astype(int)
    else:
        col_num = self.col_nn

    # get additional statistics for numeric variables
    if col_props['is_num'] and len(col_num):
        stat_vars = ('mean', 'std', 'min', 'max', 'p01', 'p05', 'p16',
                     'p50', 'p84', 'p95', 'p99')
        quant_probs = (0, 1, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99)
        #stat_vals = (col_num.mean(), col_num.std(), col_num.min(), col_num.max())\
        #    + tuple(col_num.quantile((0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99)))
        # the two lines below also work if weights are None
        des = DescrStatsW(col_num, self.weights_nn)
        stat_vals = (des.mean, des.std) + tuple(
            weighted_quantile(col_num, self.weights_nn, quant_probs))
        self.stat_vars += stat_vars
        for stat_var, stat_val in zip(stat_vars, stat_vals):
            if not col_props['is_ts']:
                # value entry for floats and integers
                self.stat_vals[stat_var] = (stat_val,
                                            '{:+g}'.format(stat_val))
            else:
                if stat_var != 'std':
                    # display time stamps as date/time strings
                    self.stat_vals[stat_var] = (
                        pd.Timestamp(int(stat_val)),
                        str(pd.Timestamp(int(stat_val))))
                else:
                    # display time-stamp range as number of days
                    stat_val /= NUM_NS_DAY
                    self.stat_vals[stat_var] = (stat_val,
                                                '{:g}'.format(stat_val))

        # append statistics to print lines
        name_len = max(len(n) for n in stat_vars)
        for stat_var in stat_vars:
            self.print_lines.append(
                '{{0:{:d}s}} : {{1:s}}'.format(name_len).format(
                    stat_var, self.stat_vals[stat_var][1]))
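# Aside (not from the original): DescrStatsW also exposes weighted quantiles
# directly, which could stand in for the external weighted_quantile helper
# above -- though weighted-quantile definitions differ, so the values may not
# match exactly:
#
#     des = DescrStatsW(col_num, weights=self.weights_nn)
#     qs = des.quantile(list(quant_probs), return_pandas=False)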
porcentagem = (sum(intervalos) / total) * 100
porcentagemStr = '%.2f' % (porcentagem) + "%"

intervalos2 = DF.loc[DF['Ponto médio'] < 160, ['Nº de Mulheres']].values
qtdMulheres = sum(intervalos2)

intervalos3 = DF.loc[DF['Ponto médio'] >= 100, ['Nº de Mulheres']].values
qtdMulheres = sum(intervalos3)
porcentagem2 = (sum(intervalos3) / total) * 100
porcentagemStr2 = '%.2f' % (porcentagem2) + "%"

#DF['Nº de Mulheres'].idxmax()

pontoM = DF['Ponto médio'].values
numMulheres = DF['Nº de Mulheres'].values

# weighted standard deviation, mean, and coefficient of variation
calcP = DescrStatsW(pontoM, numMulheres)
desvioP = calcP.std
desvioStr = '%.3f' % (desvioP)
mediaPonderada = calcP.mean
coeficientV = (desvioP / mediaPonderada) * 100
coeficientStr = '%.3f' % (coeficientV) + '%'

# histogram for question 1
NumeroMulheres = DF['Nº de Mulheres']
pressaoS = DF['Ponto médio']
# NumeroMulheres = DF['Nº de Mulheres']
# pressaoS = DF['Ponto médio']
def _standardized_difference(self, variable, var_type, weighted=True):
    """Calculates the standardized mean difference between the
    treated/exposed and untreated/unexposed for a specified variable.
    Useful for checking whether a confounder was balanced between the two
    treatment groups by the specified IPTW model

    SMD based on: Austin PC 2011;
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3144483/

    For efficiency, it is recommended you use standardized_mean_differences().
    That function calculates the standardized mean differences for all
    variables included in the denominator

    Parameters
    ---------------
    variable : pd.DataFrame
        Dataframe containing the treatment column, the 'iptw' weight column,
        and the column(s) of the variable to calculate the standardized
        difference for. For categorical variables, this includes one column
        per category level
    var_type : str
        Variable type. Options are 'binary', 'continuous', or 'categorical'
    weighted : bool, optional
        Whether to return the weighted standardized mean difference or the
        unweighted. Default is to return the weighted.

    Returns
    --------------
    float
        The (weighted) standardized mean difference
    """
    # Pulling out relevant data
    dft = variable.loc[(variable[self.ex] == 1) &
                       (variable['iptw'].notnull())].copy()
    dfn = variable.loc[(variable[self.ex] == 0) &
                       (variable['iptw'].notnull())].copy()

    # removing self.ex and 'iptw' from vars to calculate for
    vcols = list(variable.columns)
    vcols.remove(self.ex)
    vcols.remove('iptw')

    if var_type == 'binary':
        if weighted:
            dwt = DescrStatsW(dft[vcols], weights=dft['iptw'])
            wt = dwt.mean
            dwn = DescrStatsW(dfn[vcols], weights=dfn['iptw'])
            wn = dwn.mean
        else:
            wt = np.mean(dft[vcols].dropna(), axis=0)
            wn = np.mean(dfn[vcols].dropna(), axis=0)
        return float((wt - wn) / np.sqrt((wt * (1 - wt) + wn * (1 - wn)) / 2))

    if var_type == 'continuous':
        if weighted:
            dwt = DescrStatsW(dft[vcols], weights=dft['iptw'], ddof=1)
            wmt = dwt.mean
            wst = dwt.std
            dwn = DescrStatsW(dfn[vcols], weights=dfn['iptw'], ddof=1)
            wmn = dwn.mean
            wsn = dwn.std
        else:
            dwt = DescrStatsW(dft[vcols], ddof=1)
            wmt = dwt.mean
            wst = dwt.std
            dwn = DescrStatsW(dfn[vcols], ddof=1)
            wmn = dwn.mean
            wsn = dwn.std
        return float((wmt - wmn) / np.sqrt((wst ** 2 + wsn ** 2) / 2))

    if var_type == 'categorical':
        if weighted:
            wt = np.average(dft[vcols], weights=dft['iptw'], axis=0)
            wn = np.average(dfn[vcols], weights=dfn['iptw'], axis=0)
        else:
            wt = np.average(dft[vcols], axis=0)
            wn = np.mean(dfn[vcols], axis=0)

        t_c = wt - wn
        s_inv = np.linalg.inv(
            self._categorical_cov(treated=wt, untreated=wn))
        return float(np.sqrt(np.dot(np.transpose(t_c[1:]),
                                    np.dot(s_inv, t_c[1:]))))
import numpy as np

np.random.seed(75243)
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)
medias = [temp[0:i].mean() for i in range(1, len(temp))]
plt.plot(medias)

from statsmodels.stats.weightstats import zconfint

zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

from statsmodels.stats.weightstats import DescrStatsW

descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()

"""# Let's look at movie 1..."""

filmes = pd.read_csv("movies.csv")
filmes.query("movieId==1")

notas1 = notas.query("movieId == 1")
notas1.head()

ax = sns.distplot(notas1.rating)
ax.set(xlabel="Rating", ylabel="Density")
ax.set_title("Distribution of ratings for Toy Story")

ax = sns.boxplot(notas1.rating)
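# Aside (not from the original): zconfint uses the normal approximation while
# tconfint_mean uses the t distribution, so for large samples the two
# intervals nearly coincide. A self-contained comparison on synthetic data:
import numpy as np
from statsmodels.stats.weightstats import DescrStatsW, zconfint

rng = np.random.default_rng(75243)
x_demo = rng.normal(3.4, 0.9, size=1000)
print(zconfint(x_demo))                       # normal-approximation interval
print(DescrStatsW(x_demo).tconfint_mean())    # t-based interval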