def seasonal_plot(grouped_x, xticklabels, ylabel=None, ax=None):
    """
    Draw each group side by side with a horizontal line at the group mean.

    Consider using one of month_plot or quarter_plot unless you need
    irregular plotting.

    Parameters
    ----------
    grouped_x : iterable of DataFrames
        Should be a GroupBy object (or similar pair of group_names and groups
        as DataFrames) with a DatetimeIndex or PeriodIndex
    xticklabels : list
        Labels handed to ``ax.set_xticklabels``; one tick is placed at the
        horizontal centre of each group.
    ylabel : str, optional
        Label handed to ``ax.set_ylabel``.
    ax : matplotlib axes, optional
        Forwarded to ``utils.create_mpl_ax``.

    Returns
    -------
    fig
        The figure returned by ``utils.create_mpl_ax``.
    """
    fig, ax = utils.create_mpl_ax(ax)

    offset = 0
    tick_positions = []
    for _, group in grouped_x:
        # work on a copy so the in-place sort never touches the caller's data
        # (sorting without the copy balks for series; may be a better way)
        group = group.copy()
        sort_values(group, inplace=True)

        count = len(group)
        positions = np.arange(offset, offset + count)
        tick_positions.append(positions.mean())

        # sorted observations of the group, then a line at their mean
        ax.plot(positions, group.values, 'k')
        ax.hlines(group.values.mean(), positions[0], positions[-1],
                  colors='k')

        offset += count

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(xticklabels)
    ax.set_ylabel(ylabel)
    ax.margins(.1, .05)
    return fig
def test_mosaic(close_figures):
    """Draw four mosaic plots of the ``fair`` dataset on a 2x2 grid.

    The test contains no assertions: it only checks that the plotting calls
    run without raising.
    """
    # make the same analysis on a known dataset
    # load the data and clean it a bit
    affairs = datasets.fair.load_pandas()
    datas = affairs.exog
    # any time greater than 0 is cheating
    datas['cheated'] = affairs.endog > 0
    # sort by the marriage quality and give meaningful name
    # [rate_marriage, age, yrs_married, children,
    # religious, educ, occupation, occupation_husb]
    datas = sort_values(datas, ['rate_marriage', 'religious'])

    # column name -> mapping from the raw code to a readable label
    recodings = (
        ('rate_marriage', {1: 'awful', 2: 'bad', 3: 'intermediate',
                           4: 'good', 5: 'wonderful'}),
        ('religious', {1: 'non religious', 2: 'poorly religious',
                       3: 'religious', 4: 'very religious'}),
        ('cheated', {False: 'faithful', True: 'cheated'}),
    )
    for column, code_to_label in recodings:
        datas[column] = datas[column].map(code_to_label)
    # finished cleaning

    _, ax = plt.subplots(2, 2)
    (top_left, top_right), (bottom_left, bottom_right) = ax

    mosaic(datas, ['rate_marriage', 'cheated'], ax=top_left,
           title='by marriage happiness')
    mosaic(datas, ['religious', 'cheated'], ax=top_right,
           title='by religiosity')
    mosaic(datas, ['rate_marriage', 'religious', 'cheated'], ax=bottom_left,
           title='by both', labelizer=lambda k: '')
    bottom_left.set_xlabel('marriage rating')
    bottom_left.set_ylabel('religion status')
    mosaic(datas, ['religious', 'rate_marriage'], ax=bottom_right,
           title='inter-dependence', axes_label=False)
    plt.suptitle("extramarital affairs (plot 3 of 4)")
def test_mosaic():
    """Draw four mosaic plots of the ``fair`` dataset on a 2x2 grid.

    The test contains no assertions: it only checks that the plotting calls
    run without raising. The figure is closed on exit so that repeated test
    runs do not accumulate open figures.
    """
    # make the same analysis on a known dataset
    # load the data and clean it a bit
    affairs = datasets.fair.load_pandas()
    datas = affairs.exog
    # any time greater than 0 is cheating
    datas['cheated'] = affairs.endog > 0
    # sort by the marriage quality and give meaningful name
    # [rate_marriage, age, yrs_married, children,
    # religious, educ, occupation, occupation_husb]
    datas = sort_values(datas, ['rate_marriage', 'religious'])
    num_to_desc = {1: 'awful', 2: 'bad', 3: 'intermediate',
                   4: 'good', 5: 'wonderful'}
    datas['rate_marriage'] = datas['rate_marriage'].map(num_to_desc)
    num_to_faith = {1: 'non religious', 2: 'poorly religious',
                    3: 'religious', 4: 'very religious'}
    datas['religious'] = datas['religious'].map(num_to_faith)
    num_to_cheat = {False: 'faithful', True: 'cheated'}
    datas['cheated'] = datas['cheated'].map(num_to_cheat)
    # finished cleaning
    fig, ax = pylab.subplots(2, 2)
    try:
        mosaic(datas, ['rate_marriage', 'cheated'], ax=ax[0, 0],
               title='by marriage happiness')
        mosaic(datas, ['religious', 'cheated'], ax=ax[0, 1],
               title='by religiosity')
        mosaic(datas, ['rate_marriage', 'religious', 'cheated'], ax=ax[1, 0],
               title='by both', labelizer=lambda k: '')
        ax[1, 0].set_xlabel('marriage rating')
        ax[1, 0].set_ylabel('religion status')
        mosaic(datas, ['religious', 'rate_marriage'], ax=ax[1, 1],
               title='inter-dependence', axes_label=False)
        pylab.suptitle("extramarital affairs (plot 3 of 4)")
    finally:
        # release the figure even when one of the plotting calls raises
        pylab.close(fig)
def _ros_sort(df, observations, censorship, warn=False):
    """
    This function prepares a dataframe for ROS.

    It sorts ascending with left-censored observations first. Censored
    observations larger than the maximum uncensored observations are
    removed from the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
    observations : str
        Name of the column in the dataframe that contains observed values.
        Censored values should be set to the detection (upper) limit.
    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)
    warn : bool, optional
        If True, emit a warning whenever censored observations are dropped.

    Returns
    -------
    sorted_df : pandas.DataFrame
        The sorted dataframe with all columns dropped except the
        observation and censorship columns.
    """
    # Function-scope import: ``DataFrame.append`` was removed in pandas 2.0,
    # so the two halves are joined with ``pd.concat`` instead.
    import pandas as pd

    # separate uncensored data from censored data
    censored = sort_values(df[df[censorship]], observations, axis=0)
    uncensored = sort_values(df[~df[censorship]], observations, axis=0)

    max_uncensored = uncensored[observations].max()
    if censored[observations].max() > max_uncensored:
        censored = censored[censored[observations] <= max_uncensored]

        if warn:
            msg = ("Dropping censored observations greater than "
                   "the max uncensored observation.")
            warnings.warn(msg)

    # censored rows first, then uncensored rows, each already sorted
    combined = pd.concat([censored, uncensored])
    return combined[[observations, censorship]].reset_index(drop=True)
def _ros_sort(df, observations, censorship, warn=False):
    """
    This function prepares a dataframe for ROS.

    It sorts ascending with left-censored observations first. Censored
    observations larger than the maximum uncensored observations are
    removed from the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
    observations : str
        Name of the column in the dataframe that contains observed values.
        Censored values should be set to the detection (upper) limit.
    censorship : str
        Name of the column in the dataframe that indicates that a
        observation is left-censored. (i.e., True -> censored,
        False -> uncensored)
    warn : bool, optional
        If True, emit a warning whenever censored observations are dropped.

    Returns
    -------
    sorted_df : pandas.DataFrame
        The sorted dataframe with all columns dropped except the
        observation and censorship columns.
    """
    # ``DataFrame.append`` no longer exists in pandas >= 2.0; ``pd.concat``
    # gives the same stacked frame. Imported locally so the block does not
    # depend on a module-level pandas alias.
    import pandas as pd

    # separate uncensored data from censored data
    censored = sort_values(df[df[censorship]], observations, axis=0)
    uncensored = sort_values(df[~df[censorship]], observations, axis=0)

    uncensored_max = uncensored[observations].max()
    if censored[observations].max() > uncensored_max:
        keep = censored[observations] <= uncensored_max
        censored = censored[keep]

        if warn:
            msg = ("Dropping censored observations greater than "
                   "the max uncensored observation.")
            warnings.warn(msg)

    # stack censored rows ahead of uncensored rows and renumber the index
    stacked = pd.concat([censored, uncensored])
    return stacked[[observations, censorship]].reset_index(drop=True)
# Report the OLS fit and its influence diagnostics.
print(ols_model.summary())

infl = ols_model.get_influence()
student = infl.summary_frame()['student_resid']
print(student)

# The ``.ix`` indexer was removed from pandas; ``.loc`` accepts the same
# boolean mask (rows whose absolute studentized residual exceeds 2) ...
print(student.loc[np.abs(student) > 2])
# ... and the same row label.
print(infl.summary_frame().loc['minister'])

# Outlier tests, each sorted in place by the unadjusted p-value.
sidak = ols_model.outlier_test('sidak')
sort_values(sidak, 'unadj_p', inplace=True)
print(sidak)

fdr = ols_model.outlier_test('fdr_bh')
sort_values(fdr, 'unadj_p', inplace=True)
print(fdr)

# Fit the same formula with rlm and show the weights it assigned.
rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())
print(rlm_model.weights)
# Second panel: prestige against education.
ax2 = fig.add_subplot(212, xlabel='Education', ylabel='Prestige')
ax2.scatter(prestige.education, prestige.prestige)

# Fit the OLS model and report its influence diagnostics.
ols_model = ols('prestige ~ income + education', prestige).fit()
print(ols_model.summary())

infl = ols_model.get_influence()
student = infl.summary_frame()['student_resid']
print(student)

# The ``.ix`` indexer was removed from pandas; ``.loc`` accepts the same
# boolean mask (rows whose absolute studentized residual exceeds 2) ...
print(student.loc[np.abs(student) > 2])
# ... and the same row label.
print(infl.summary_frame().loc['minister'])

# Outlier tests, each sorted in place by the unadjusted p-value.
sidak = ols_model.outlier_test('sidak')
sort_values(sidak, 'unadj_p', inplace=True)
print(sidak)

fdr = ols_model.outlier_test('fdr_bh')
sort_values(fdr, 'unadj_p', inplace=True)
print(fdr)

# Fit the same formula with rlm and show the weights it assigned.
rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())
print(rlm_model.weights)


#### Hertzsprung Russell data for Star Cluster CYG 0B1 - Leverage Points

# * Data is on the luminosity and temperature of 47 stars in the direction of Cygnus.
def multiOLS(model, dataframe, column_list=None, method='fdr_bh',
             alpha=0.05, subset=None, model_type=OLS, **kwargs):
    """apply a linear model to several endogenous variables on a dataframe

    Take a linear model definition via formula and a dataframe that will be
    the environment of the model, and apply the linear model to a subset
    (or all) of the columns of the dataframe. It will return a dataframe
    with part of the information from the linear model summary.

    Parameters
    ----------
    model : string
        formula description of the model
    dataframe : pandas.dataframe
        dataframe where the model will be evaluated
    column_list : list of strings, optional
        Names of the columns to analyze with the model.
        If None (Default) it will perform the function on all the
        eligible columns (numerical type and not in the model definition)
    model_type : model class, optional
        The type of model to be used. The default is the linear model.
        Can be any linear model (OLS, WLS, GLS, etc..)
    method: string, optional
        the method used to perform the pvalue correction for multiple testing.
        default is the Benjamini/Hochberg, other available methods are:

            `bonferroni` : one-step correction
            `sidak` : on-step correction
            `holm-sidak` :
            `holm` :
            `simes-hochberg` :
            `hommel` :
            `fdr_bh` : Benjamini/Hochberg
            `fdr_by` : Benjamini/Yekutieli

    alpha: float, optional
        the significance level used for the pvalue correction (default 0.05)
    subset: boolean array
        the selected rows to be used in the regression

    all the other parameters will be directed to the model creation.

    Returns
    -------
    summary : pandas.DataFrame
        a dataframe containing an extract from the summary of the model
        obtained for each columns. It will give the model complexive f test
        result and p-value, and the regression value and standard deviarion
        for each of the regressors. The Dataframe has a hierachical column
        structure, divided as:

            - params: contains the parameters resulting from the models. Has
              an additional column named _f_test containing the result of
              the F test.
            - pvals: the pvalue results of the models. Has the _f_test column
              for the significativity of the whole test.
            - adj_pvals: the corrected pvalues via the multitest function.
            - std: uncertainties of the model parameters
            - statistics: contains the r squared statistics and the adjusted
              r squared.

    Notes
    -----
    The main application of this function is on system biology to perform
    a linear model testing of a lot of different parameters, like the
    different genetic expression of several genes.

    See Also
    --------
    statsmodels.stats.multitest
        contains several functions to perform the multiple p-value correction

    Examples
    --------
    Using the longley data as dataframe example

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load_pandas()
    >>> df = data.exog
    >>> df['TOTEMP'] = data.endog

    This will perform the specified linear model on all the
    other columns of the dataframe

    >>> multiOLS('GNP + 1', df)

    This select only a certain subset of the columns

    >>> multiOLS('GNP + 0', df, ['GNPDEFL', 'TOTEMP', 'POP'])

    It is possible to specify a trasformation also on the target column,
    conforming to the patsy formula specification

    >>> multiOLS('GNP + 0', df, ['I(GNPDEFL**2)', 'center(TOTEMP)'])

    It is possible to specify the subset of the dataframe
    on which perform the analysis

    >>> multiOLS('GNP + 1', df, subset=df.GNPDEFL > 90)

    Even a single column name can be given without enclosing it in a list

    >>> multiOLS('GNP + 0', df, 'GNPDEFL')
    """
    # data normalization
    # if None take all the numerical columns that aren't present in the model
    # it's not waterproof but is a good enough criterion for everyday use
    if column_list is None:
        column_list = [
            name for name in dataframe.columns
            if dataframe[name].dtype != object and name not in model
        ]
    # if it's a single string transform it in a single element list
    if isinstance(column_list, string_types):
        column_list = [column_list]
    if subset is not None:
        # ``.ix`` was removed from pandas; ``.loc`` performs the same
        # boolean-mask row selection
        dataframe = dataframe.loc[subset]
    # perform each model and retrieve the statistics
    col_results = {}
    # as the model will use always the same exogenous variables
    # we can create them once and reuse
    model_exog = dmatrix(model, data=dataframe, return_type="dataframe")
    for col_name in column_list:
        # it will try to interpret the column name as a valid dataframe
        # index as it can be several times faster. If it fails it
        # interpret it as a patsy formula (for example for centering)
        try:
            model_endog = dataframe[col_name]
        except KeyError:
            model_endog = dmatrix(col_name + ' + 0', data=dataframe)
        # retrieve the result and store them
        res = _model2dataframe(model_endog, model_exog, model_type, **kwargs)
        col_results[col_name] = res
    # mangle them togheter and sort by complexive p-value
    summary = pd.DataFrame(col_results)
    # order by the p-value: the most useful model first!
    summary = sort_values(summary.T, [('pvals', '_f_test')])
    summary.index.name = 'endogenous vars'
    # implementing the pvalue correction method
    smt = stats.multipletests
    for (key1, key2) in summary:
        if key1 != 'pvals':
            continue
        p_values = summary[key1, key2]
        corrected = smt(p_values, method=method, alpha=alpha)[1]
        # extend the dataframe of results with the column
        # of the corrected p_values
        summary['adj_' + key1, key2] = corrected
    return summary
def multigroup(pvals, groups, exact=True, keep_all=True, alpha=0.05):
    """Test if the given groups are different from the total partition.

    Given a boolean array test if each group has a proportion of positives
    different than the complexive proportion.
    The test can be done as an exact Fisher test or approximated as a
    Chi squared test for more speed.

    Parameters
    ----------
    pvals: pandas series of boolean
        the significativity of the variables under analysis
    groups: dict of list
        the name of each category of variables under exam.
        each one is a list of the variables included
    exact: boolean, optional
        If True (default) use the fisher exact test, otherwise
        use the chi squared test for contingencies tables.
        For high number of elements in the array the fisher test can
        be significantly slower than the chi squared.
    keep_all: boolean, optional
        if False it will drop those groups where the fraction
        of positive is below the expected result. If True (default)
        it will keep all the significant results.
    alpha: float, optional
        the significativity level for the pvalue correction
        on the whole set of groups (not inside the groups themselves).

    Returns
    -------
    result_df: pandas dataframe
        for each group returns:

            pvals - the fisher p value of the test
            adj_pvals - the adjusted pvals
            increase - the log of the odd ratio between the
                internal significant ratio versus the external one
            _in_sign - significative elements inside the group
            _in_non - non significative elements inside the group
            _out_sign - significative elements outside the group
            _out_non - non significative elements outside the group

    Raises
    ------
    ValueError
        If ``pvals`` holds values other than True/False, or if its index
        contains duplicates.

    Notes
    -----
    This test allow to see if a category of variables is generally better
    suited to be described for the model. For example to see if a predictor
    gives more information on demographic or economical parameters,
    by creating two groups containing the endogenous variables of each
    category.

    This function is conceived for medical dataset with a lot of variables
    that can be easily grouped into functional groups. This is because
    The significativity of a group require a rather large number of
    composing elements.

    Examples
    --------
    A toy example on a real dataset, the Guerry dataset from R

    >>> url = "http://vincentarelbundock.github.com/"
    >>> url = url + "Rdatasets/csv/HistData/Guerry.csv"
    >>> df = pd.read_csv(url, index_col='dept')

    evaluate the relationship between the variuos paramenters whith the Wealth

    >>> pvals = multiOLS('Wealth', df)['adj_pvals', '_f_test']

    define the groups

    >>> groups = {}
    >>> groups['crime'] = ['Crime_prop', 'Infanticide',
    ...     'Crime_parents', 'Desertion', 'Crime_pers']
    >>> groups['religion'] = ['Donation_clergy', 'Clergy', 'Donations']
    >>> groups['wealth'] = ['Commerce', 'Lottery', 'Instruction', 'Literacy']

    do the analysis of the significativity

    >>> multigroup(pvals < 0.05, groups)
    """
    pvals = pd.Series(pvals)
    if not (set(pvals.unique()) <= set([False, True])):
        raise ValueError("the series should be binary")
    if hasattr(pvals.index, 'is_unique') and not pvals.index.is_unique:
        raise ValueError("series with duplicated index is not accepted")
    results = {
        'pvals': {},
        'increase': {},
        '_in_sign': {},
        '_in_non': {},
        '_out_sign': {},
        '_out_non': {},
    }
    for group_name, group_list in iteritems(groups):
        res = _test_group(pvals, group_name, group_list, exact)
        results['pvals'][group_name] = res[0]
        results['increase'][group_name] = res[1]
        results['_in_sign'][group_name] = res[2][0]
        results['_in_non'][group_name] = res[2][1]
        results['_out_sign'][group_name] = res[2][2]
        results['_out_non'][group_name] = res[2][3]
    result_df = sort_values(pd.DataFrame(results), 'pvals')
    if not keep_all:
        # ``increase`` is documented as a log ratio, so using the column
        # itself as an indexer is not a valid row mask. A positive value
        # means the group has more positives than expected; keep those rows.
        # ``.copy()`` lets the new column below be added without touching a
        # view of the unfiltered frame.
        result_df = result_df[result_df['increase'] > 0].copy()
    smt = stats.multipletests
    corrected = smt(result_df['pvals'], method='fdr_bh', alpha=alpha)[1]
    result_df['adj_pvals'] = corrected
    return result_df
def multiOLS(model, dataframe, column_list=None, method='fdr_bh',
             alpha=0.05, subset=None, model_type=OLS, **kwargs):
    """Fit the same linear model against many endogenous columns.

    The formula in `model` describes the regressors; it is evaluated once on
    `dataframe` and then each requested column (or patsy expression) is used
    in turn as the endogenous variable. An extract of every fit is collected
    into a single dataframe.

    Parameters
    ----------
    model : string
        formula description of the model
    dataframe : pandas.dataframe
        dataframe where the model will be evaluated
    column_list : list of strings, optional
        Names of the columns to analyze with the model.
        If None (Default) it will perform the function on all the
        eligible columns (numerical type and not in the model definition)
    model_type : model class, optional
        The type of model to be used. The default is the linear model.
        Can be any linear model (OLS, WLS, GLS, etc..)
    method: string, optional
        the method used to perform the pvalue correction for multiple testing.
        default is the Benjamini/Hochberg, other available methods are:

            `bonferroni` : one-step correction
            `sidak` : on-step correction
            `holm-sidak` :
            `holm` :
            `simes-hochberg` :
            `hommel` :
            `fdr_bh` : Benjamini/Hochberg
            `fdr_by` : Benjamini/Yekutieli

    alpha: float, optional
        the significance level used for the pvalue correction (default 0.05)
    subset: boolean array
        the selected rows to be used in the regression

    all the other parameters will be directed to the model creation.

    Returns
    -------
    summary : pandas.DataFrame
        a dataframe containing an extract from the summary of the model
        obtained for each columns. It will give the model complexive f test
        result and p-value, and the regression value and standard deviarion
        for each of the regressors. The Dataframe has a hierachical column
        structure, divided as:

            - params: contains the parameters resulting from the models. Has
              an additional column named _f_test containing the result of
              the F test.
            - pvals: the pvalue results of the models. Has the _f_test column
              for the significativity of the whole test.
            - adj_pvals: the corrected pvalues via the multitest function.
            - std: uncertainties of the model parameters
            - statistics: contains the r squared statistics and the adjusted
              r squared.

    Notes
    -----
    The main application of this function is on system biology to perform
    a linear model testing of a lot of different parameters, like the
    different genetic expression of several genes.

    See Also
    --------
    statsmodels.stats.multitest
        contains several functions to perform the multiple p-value correction

    Examples
    --------
    Using the longley data as dataframe example

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load_pandas()
    >>> df = data.exog
    >>> df['TOTEMP'] = data.endog

    This will perform the specified linear model on all the
    other columns of the dataframe

    >>> multiOLS('GNP + 1', df)

    This select only a certain subset of the columns

    >>> multiOLS('GNP + 0', df, ['GNPDEFL', 'TOTEMP', 'POP'])

    It is possible to specify a trasformation also on the target column,
    conforming to the patsy formula specification

    >>> multiOLS('GNP + 0', df, ['I(GNPDEFL**2)', 'center(TOTEMP)'])

    It is possible to specify the subset of the dataframe
    on which perform the analysis

    >>> multiOLS('GNP + 1', df, subset=df.GNPDEFL > 90)

    Even a single column name can be given without enclosing it in a list

    >>> multiOLS('GNP + 0', df, 'GNPDEFL')
    """
    # ---- normalise the requested columns
    if column_list is None:
        # default: every non-object column whose name does not occur in the
        # formula text (a heuristic, good enough for everyday use)
        column_list = [
            name for name in dataframe.columns
            if not (dataframe[name].dtype == object or name in model)
        ]
    elif isinstance(column_list, string_types):
        # a bare string stands for a one-element list
        column_list = [column_list]

    if subset is not None:
        dataframe = dataframe.loc[subset]

    # the regressors are the same for every fit, so build them only once
    exog_design = dmatrix(model, data=dataframe, return_type="dataframe")

    def _endog_for(column):
        # plain column lookup first (much faster); anything that is not a
        # column name is handed to patsy as a formula, e.g. for centering
        try:
            return dataframe[column]
        except KeyError:
            return dmatrix(column + ' + 0', data=dataframe)

    # ---- run one model per column and keep its extract
    per_column = {}
    for column in column_list:
        per_column[column] = _model2dataframe(
            _endog_for(column), exog_design, model_type, **kwargs)

    # one row per endogenous variable, best overall F-test p-value first
    summary = sort_values(pd.DataFrame(per_column).T, [('pvals', '_f_test')])
    summary.index.name = 'endogenous vars'

    # ---- multiple-testing correction, one pass per p-value column
    pval_columns = [(top, sub) for (top, sub) in summary if top == 'pvals']
    for top, sub in pval_columns:
        adjusted = stats.multipletests(
            summary[top, sub], method=method, alpha=alpha)[1]
        # store the corrected values next to the raw ones
        summary['adj_' + top, sub] = adjusted
    return summary
def multigroup(pvals, groups, exact=True, keep_all=True, alpha=0.05):
    """Test if the given groups are different from the total partition.

    Given a boolean array test if each group has a proportion of positives
    different than the complexive proportion.
    The test can be done as an exact Fisher test or approximated as a
    Chi squared test for more speed.

    Parameters
    ----------
    pvals: pandas series of boolean
        the significativity of the variables under analysis
    groups: dict of list
        the name of each category of variables under exam.
        each one is a list of the variables included
    exact: boolean, optional
        If True (default) use the fisher exact test, otherwise
        use the chi squared test for contingencies tables.
        For high number of elements in the array the fisher test can
        be significantly slower than the chi squared.
    keep_all: boolean, optional
        if False it will drop those groups where the fraction
        of positive is below the expected result. If True (default)
        it will keep all the significant results.
    alpha: float, optional
        the significativity level for the pvalue correction
        on the whole set of groups (not inside the groups themselves).

    Returns
    -------
    result_df: pandas dataframe
        for each group returns:

            pvals - the fisher p value of the test
            adj_pvals - the adjusted pvals
            increase - the log of the odd ratio between the
                internal significant ratio versus the external one
            _in_sign - significative elements inside the group
            _in_non - non significative elements inside the group
            _out_sign - significative elements outside the group
            _out_non - non significative elements outside the group

    Raises
    ------
    ValueError
        If ``pvals`` holds values other than True/False, or if its index
        contains duplicates.

    Notes
    -----
    This test allow to see if a category of variables is generally better
    suited to be described for the model. For example to see if a predictor
    gives more information on demographic or economical parameters,
    by creating two groups containing the endogenous variables of each
    category.

    This function is conceived for medical dataset with a lot of variables
    that can be easily grouped into functional groups. This is because
    The significativity of a group require a rather large number of
    composing elements.

    Examples
    --------
    A toy example on a real dataset, the Guerry dataset from R

    >>> url = "https://raw.githubusercontent.com/vincentarelbundock/"
    >>> url = url + "Rdatasets/csv/HistData/Guerry.csv"
    >>> df = pd.read_csv(url, index_col='dept')

    evaluate the relationship between the variuos paramenters whith the Wealth

    >>> pvals = multiOLS('Wealth', df)['adj_pvals', '_f_test']

    define the groups

    >>> groups = {}
    >>> groups['crime'] = ['Crime_prop', 'Infanticide',
    ...     'Crime_parents', 'Desertion', 'Crime_pers']
    >>> groups['religion'] = ['Donation_clergy', 'Clergy', 'Donations']
    >>> groups['wealth'] = ['Commerce', 'Lottery', 'Instruction', 'Literacy']

    do the analysis of the significativity

    >>> multigroup(pvals < 0.05, groups)
    """
    pvals = pd.Series(pvals)
    if not (set(pvals.unique()) <= set([False, True])):
        raise ValueError("the series should be binary")
    if hasattr(pvals.index, 'is_unique') and not pvals.index.is_unique:
        raise ValueError("series with duplicated index is not accepted")
    results = {'pvals': {},
               'increase': {},
               '_in_sign': {},
               '_in_non': {},
               '_out_sign': {},
               '_out_non': {}}
    for group_name, group_list in iteritems(groups):
        res = _test_group(pvals, group_name, group_list, exact)
        results['pvals'][group_name] = res[0]
        results['increase'][group_name] = res[1]
        results['_in_sign'][group_name] = res[2][0]
        results['_in_non'][group_name] = res[2][1]
        results['_out_sign'][group_name] = res[2][2]
        results['_out_non'][group_name] = res[2][3]
    result_df = sort_values(pd.DataFrame(results), 'pvals')
    if not keep_all:
        # The ``increase`` column is documented as a log ratio, not a boolean
        # mask, so it cannot select rows directly. Rows with a positive value
        # are the groups holding more positives than expected. The copy
        # keeps the column assignment below off a view of the full frame.
        result_df = result_df[result_df['increase'] > 0].copy()
    smt = stats.multipletests
    corrected = smt(result_df['pvals'], method='fdr_bh', alpha=alpha)[1]
    result_df['adj_pvals'] = corrected
    return result_df