def test_get_boxplot_data():
    """Check ``BoxPlots._get_boxplot_data`` in 'msi' mode.

    The label list stored at index 1 and the first two numeric values
    stored at index 2 of the returned data are compared with reference
    values.
    """
    anova = ANOVA(ic50_test)
    drug_feature = anova._get_one_drug_one_feature_data('Drug_1047_IC50',
                                                        'TP53_mut')
    plotter = BoxPlots(drug_feature)
    boxplot_data = plotter._get_boxplot_data(mode='msi')

    expected_labels = [
        '***MSI-stable neg',
        '***MSI-stable pos',
        '**MSI-unstable neg',
        '**MSI-unstable pos',
    ]
    assert boxplot_data[1] == expected_labels

    expected = [2.0108071495663922e-47, 0.0012564798887037905]
    observed = [boxplot_data[2][0], boxplot_data[2][1]]
    assert_list_almost_equal(observed, expected)
def test_get_boxplot_data():
    """Check ``BoxPlots._get_boxplot_data`` in 'msi' mode (integer drug id).

    The label list stored at index 1 and the first two numeric values
    stored at index 2 of the returned data are compared with reference
    values.
    """
    anova = ANOVA(ic50_test)
    drug_feature = anova._get_one_drug_one_feature_data(1047, 'TP53_mut')
    plotter = BoxPlots(drug_feature)
    boxplot_data = plotter._get_boxplot_data(mode='msi')

    expected_labels = [
        '***MSI-stable neg',
        '***MSI-stable pos',
        '**MSI-unstable neg',
        '**MSI-unstable pos',
    ]
    assert boxplot_data[1] == expected_labels

    expected = [2.0108071495663922e-47, 0.0012564798887037905]
    observed = [boxplot_data[2][0], boxplot_data[2][1]]
    assert_list_almost_equal(observed, expected)
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.', fontsize=18):
    """Compute ANOVA for one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show boxplots with the different factor used
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :param str directory: where to save the figure.
    :param fontsize: font size forwarded to :class:`BoxPlots` when
        *show* is True
    :return: a dictionary of results if *production* is True, otherwise
        a one-row dataframe (index 1) built from that dictionary, with
        None values replaced by NaN
    :raises ValueError: if *drug_id* is not in :attr:`drugIds` or
        *feature_name* is not in :attr:`feature_names`

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))
    if feature_name not in self.feature_names:
        # the first three known feature names are shown as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = odof.drug_id
        results['DRUG_NAME'] = odof.drug_name
        results['DRUG_TARGET'] = odof.drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            return results
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for name, value in results.items():
            if value is None:
                results[name] = np.nan
        return pd.DataFrame(results, index=[1])

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical components.

    # If a formula is provided, use statsmodels. Since it is slowish,
    # we implemented several cases as described in the doc for the 4
    # following cases:
    #  - TISSUE + MSI + MEDIA + FEATURE
    #  - TISSUE + MSI + FEATURE
    #  - MSI + FEATURE
    #  - FEATURE
    if self.settings.regression_formula not in ["auto", None, ""]:
        # This populates the anova_pvalues attribute itself
        self.anova_one_drug_one_feature_custom(
            drug_id, feature_name,
            formula=self.settings.regression_formula, odof=odof)
    else:
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            # drop the first tissue column for the regression
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns
                          if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if medias:
                    df.drop(medias[0], axis=1, inplace=True)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features
        elif self.settings.include_MSI_factor is True:
            # rows: intercept, msi, feature; transposed to columns
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
        else:
            # rows: intercept, feature; transposed to columns
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()
        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)

    results = self._set_odof_results(self.anova_pvalues, odof)

    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        # NOTE(review): when a custom regression formula is used, no
        # design matrix ``df`` is built above, so ``df.values`` below
        # would raise NameError -- confirm sampling is never combined
        # with a custom formula.
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        # progress bar is created but its animation is disabled
        pb = Progress(N, 20)
        for _ in range(N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict',
                                                    odof=odof)
            # msi/tissue may be absent depending on the factors used;
            # only a missing key is tolerated
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory, fontsize=fontsize)
        boxplot.boxplot_association(fignum=1)
        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        # NOTE(review): the 'msi' and 'media' plots both use fignum=3 --
        # confirm the media plot is meant to reuse that figure number.
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')
        if self.settings.include_media_factor:
            boxplot.boxplot_pancan(fignum=3, mode='media')

    # about 30% of the time spent in creating the DataFrame...
    if production is True:
        return results
    # with newer version of pandas (v0.19), None are not accepted
    # anymore
    for name, value in results.items():
        if value is None:
            results[name] = np.nan
    return pd.DataFrame(results, index=[1])
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory="."):
    """Compute ANOVA for one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show boxplots with the different factor used
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :return: a dictionary of results if *production* is True, otherwise
        a one-row dataframe (index 1) built from that dictionary, with
        None values replaced by NaN
    :raises ValueError: if *drug_id* is not in :attr:`drugIds` or
        *feature_name* is not in :attr:`feature_names`

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError("Unknown drug name %s. Use e.g., %s"
                         % (drug_id, self.drugIds[0]))
    if feature_name not in self.feature_names:
        # the first three known feature names are shown as examples
        raise ValueError("Unknown feature name %s. Use e.g. one of %s"
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results["FEATURE"] = feature_name
        results["DRUG_ID"] = odof.drug_id
        results["DRUG_NAME"] = odof.drug_name
        results["DRUG_TARGET"] = odof.drug_target
        results["N_FEATURE_pos"] = odof.Npos
        results["N_FEATURE_neg"] = odof.Nneg
        if production is True:
            return results
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for name, value in results.items():
            if value is None:
                results[name] = np.nan
        return pd.DataFrame(results, index=[1])

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical components.

    # If a formula is provided, use statsmodels. Since it is slowish,
    # we implemented several cases as described in the doc for the 4
    # following cases:
    #  - TISSUE + MSI + MEDIA + FEATURE
    #  - TISSUE + MSI + FEATURE
    #  - MSI + FEATURE
    #  - FEATURE
    if self.settings.regression_formula not in ["auto", None, ""]:
        # This populates the anova_pvalues attribute itself
        self.anova_one_drug_one_feature_custom(
            drug_id, feature_name,
            formula=self.settings.regression_formula, odof=odof)
    else:
        if self.settings.analysis_type == "PANCAN":
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            # (.loc replaces the .ix indexer removed from pandas)
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            # drop the first tissue column for the regression
            tissues = [x for x in df.columns if x.startswith("C(tissue")]
            df.drop(tissues[0], axis=1, inplace=True)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns
                          if x.startswith("C(media)")]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith("C(media")]
                if medias:
                    df.drop(medias[0], axis=1, inplace=True)

            df["C(msi)[T.1]"] = odof.masked_msi.values
            df["feature"] = odof.masked_features
        elif self.settings.include_MSI_factor is True:
            # rows: intercept, msi, feature; transposed to columns
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
        else:
            # rows: intercept, feature; transposed to columns
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()
        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)

    results = self._set_odof_results(self.anova_pvalues, odof)

    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        # NOTE(review): when a custom regression formula is used, no
        # design matrix ``df`` is built above, so ``df.values`` below
        # would raise NameError -- confirm sampling is never combined
        # with a custom formula.
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        # progress bar is created but its animation is disabled
        pb = Progress(N, 20)
        for _ in range(N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output="dict",
                                                    odof=odof)
            # msi/tissue may be absent depending on the factors used;
            # only a missing key is tolerated
            try:
                self.samples1.append(anova_pvalues["msi"])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues["feature"])
            try:
                self.samples3.append(anova_pvalues["tissue"])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            "error": ff.df_errors.loc[dist].values[0],
            "params": ff.fitted_param[dist],
            "feature": feature_name,
            "N": len(Y),
        }

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)
        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == "PANCAN":
            boxplot.boxplot_pancan(fignum=2, mode="tissue")
        # NOTE(review): the 'msi' and 'media' plots both use fignum=3 --
        # confirm the media plot is meant to reuse that figure number.
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode="msi")
        if self.settings.include_media_factor:
            boxplot.boxplot_pancan(fignum=3, mode="media")

    # about 30% of the time spent in creating the DataFrame...
    if production is True:
        return results
    # with newer version of pandas (v0.19), None are not accepted
    # anymore
    for name, value in results.items():
        if value is None:
            results[name] = np.nan
    return pd.DataFrame(results, index=[1])
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.'):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show some plots
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :return: a dictionary of results if *production* is True, otherwise
        a one-row dataframe (index 1) built from that dictionary, with
        None values replaced by NaN
    :raises ValueError: if *drug_id* is not in :attr:`drugIds`, if
        *feature_name* is not in :attr:`feature_names`, or if
        ``settings.regression_method`` is not one of ElasticNet, OLS,
        Ridge, Lasso

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))
    if feature_name not in self.feature_names:
        # the first three known feature names are shown as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
    drug_name = self.drug_decode.get_name(drug_id)
    drug_target = self.drug_decode.get_target(drug_id)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = drug_id
        results['DRUG_NAME'] = drug_name
        results['DRUG_TARGET'] = drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            return results
        # or a dataframe; note that index is not relevant here but
        # required. None is replaced by NaN before building the frame.
        for name, value in results.items():
            if value is None:
                results[name] = np.nan
        return pd.DataFrame(results, index=[1])

    # with the data extract, we can now compute the regression.
    # In R or statsmodels, the regression code is simple since
    # it is based on the formula notation (Y~C(msi)+feature)
    # This is also possible in statsmodels library, however,
    # this relies on patsy, which is very slow as compared to the
    # statsmodels without formula.

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical
    # components.

    # Instead of using ols function, we use the OLS one so we cannot
    # use formula. Instead, we need to create manually the input
    # data. In the case of categorical data (tissue), we need to
    # create the dummy variable, which is done in the constructor
    # once for all (slow otherwise).
    if self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.loc replaces the .ix indexer removed from pandas)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)

        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        self.Y = odof.Y
        self.EV = df.values
    elif self.settings.include_MSI_factor is True:
        df = pd.DataFrame()
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
    else:
        df = pd.DataFrame()
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

    # The regression itself; Ridge and Lasso are the regularized fit
    # with L1_wt forced to 0 and 1 respectively.
    method = self.settings.regression_method
    if method == 'ElasticNet':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=self.settings.regression_L1_wt)
    elif method == 'OLS':
        self.data_lm = OLS(odof.Y, df.values).fit()
    elif method == 'Ridge':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=0)
    elif method == 'Lasso':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=1)
    else:
        # without this, data_lm would be left unset or stale from a
        # previous call
        raise ValueError('Unknown regression method %s. Use one of '
                         'ElasticNet, OLS, Ridge, Lasso' % method)

    # str() so that integer drug identifiers can be used as well
    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        # progress bar is created but its animation is disabled
        pb = Progress(N, 20)
        for _ in range(N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict')
            # msi/tissue may be absent depending on the factors used;
            # only a missing key is tolerated
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }
        print(self.pvalues_features[key])

    self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                 output='dict')

    # Store the pvalues. Some factors may be missing from the summary,
    # in which case None is stored.
    pvalues = {}
    for factor in ('tissue', 'msi', 'feature', 'media'):
        try:
            pvalues[factor] = self.anova_pvalues[factor]
        except KeyError:
            pvalues[factor] = None

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)
        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')

    results = {
        'FEATURE': feature_name,
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_name,
        'DRUG_TARGET': drug_target,
        'N_FEATURE_pos': odof.Npos,
        'N_FEATURE_neg': odof.Nneg,
        'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
        'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
        'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
        'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
        'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
        'FEATURE_IC50_effect_size': odof.effectsize_ic50,
        'FEATURE_pos_Glass_delta': odof.pos_glass,
        'FEATURE_neg_Glass_delta': odof.neg_glass,
        'ANOVA_FEATURE_pval': pvalues['feature'],
        'ANOVA_TISSUE_pval': pvalues['tissue'],
        'ANOVA_MSI_pval': pvalues['msi'],
        'ANOVA_MEDIA_pval': pvalues['media'],
        'FEATURE_IC50_T_pval': odof.ttest
    }

    # 12% of the time here
    if production is True:
        return results
    for name, value in results.items():
        if value is None:
            results[name] = np.nan
    return pd.DataFrame(results, index=[1])
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.'):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show some plots
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :return: a dictionary of results if *production* is True, otherwise
        a one-row dataframe (index 1) built from that dictionary, with
        None values replaced by NaN
    :raises ValueError: if *drug_id* is not in :attr:`drugIds`, if
        *feature_name* is not in :attr:`feature_names`, or if
        ``settings.regression_method`` is not one of ElasticNet, OLS,
        Ridge, Lasso

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))
    if feature_name not in self.feature_names:
        # the first three known feature names are shown as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
    drug_name = self.drug_decode.get_name(drug_id)
    drug_target = self.drug_decode.get_target(drug_id)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = drug_id
        results['DRUG_NAME'] = drug_name
        results['DRUG_TARGET'] = drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            return results
        # or a dataframe; note that index is not relevant here but
        # required. None is replaced by NaN before building the frame.
        for name, value in results.items():
            if value is None:
                results[name] = np.nan
        return pd.DataFrame(results, index=[1])

    # with the data extract, we can now compute the regression.
    # In R or statsmodels, the regression code is simple since
    # it is based on the formula notation (Y~C(msi)+feature)
    # This is also possible in statsmodels library, however,
    # this relies on patsy, which is very slow as compared to the
    # statsmodels without formula.

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical
    # components.

    # Instead of using ols function, we use the OLS one so we cannot
    # use formula. Instead, we need to create manually the input
    # data. In the case of categorical data (tissue), we need to
    # create the dummy variable, which is done in the constructor
    # once for all (slow otherwise).
    if self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.loc replaces the .ix indexer removed from pandas)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)

        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        self.Y = odof.Y
        self.EV = df.values
    elif self.settings.include_MSI_factor is True:
        df = pd.DataFrame()
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
    else:
        df = pd.DataFrame()
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

    # The regression itself; Ridge and Lasso are the regularized fit
    # with L1_wt forced to 0 and 1 respectively.
    method = self.settings.regression_method
    if method == 'ElasticNet':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=self.settings.regression_L1_wt)
    elif method == 'OLS':
        self.data_lm = OLS(odof.Y, df.values).fit()
    elif method == 'Ridge':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=0)
    elif method == 'Lasso':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=1)
    else:
        # without this, data_lm would be left unset or stale from a
        # previous call
        raise ValueError('Unknown regression method %s. Use one of '
                         'ElasticNet, OLS, Ridge, Lasso' % method)

    # str() so that integer drug identifiers can be used as well
    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        # progress bar is created but its animation is disabled
        pb = Progress(N, 20)
        for _ in range(N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict')
            # msi/tissue may be absent depending on the factors used;
            # only a missing key is tolerated
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }
        print(self.pvalues_features[key])

    self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                 output='dict')

    # Store the pvalues. Some factors may be missing from the summary,
    # in which case None is stored.
    pvalues = {}
    for factor in ('tissue', 'msi', 'feature', 'media'):
        try:
            pvalues[factor] = self.anova_pvalues[factor]
        except KeyError:
            pvalues[factor] = None

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)
        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')

    results = {
        'FEATURE': feature_name,
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_name,
        'DRUG_TARGET': drug_target,
        'N_FEATURE_pos': odof.Npos,
        'N_FEATURE_neg': odof.Nneg,
        'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
        'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
        'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
        'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
        'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
        'FEATURE_IC50_effect_size': odof.effectsize_ic50,
        'FEATURE_pos_Glass_delta': odof.pos_glass,
        'FEATURE_neg_Glass_delta': odof.neg_glass,
        'ANOVA_FEATURE_pval': pvalues['feature'],
        'ANOVA_TISSUE_pval': pvalues['tissue'],
        'ANOVA_MSI_pval': pvalues['msi'],
        'ANOVA_MEDIA_pval': pvalues['media'],
        'FEATURE_IC50_T_pval': odof.ttest
    }

    # 12% of the time here
    if production is True:
        return results
    for name, value in results.items():
        if value is None:
            results[name] = np.nan
    return pd.DataFrame(results, index=[1])