def anova_all(self, animate=True, drugs=None):
    """Run all ANOVA tests for all drugs and all features.

    :param animate: shows the progress bar
    :param drugs: you may select a subset of drugs. When given, it
        replaces the default selection (drugs having at least
        ``settings.minimum_nonna_ic50`` non-null IC50 values).
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute called **df**

    Loops over all drugs calling :meth:`anova_one_drug` for each drug and
    concatenating all results together. Per-drug results are stored in
    ``self.individual_anova``; a drug already present in that buffer is
    not recomputed, and the concatenation covers everything found in the
    buffer.

    Once all data are gathered, they are sorted by ``ANOVA_FEATURE_pval``
    and, if ``settings.pvalue_correction_level`` is ``'global'``,
    :meth:`add_pvalues_correction` is called to add the FDR corrections.
    An extra column named "ASSOC_ID" is then inserted as first column,
    numbering the rows of the resulting dataframe from 1. That dataframe
    is also stored in ``self.df`` before being restricted to
    ``self.column_names`` for the returned object.

    .. note:: A thorough comparison with version v17 gives the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.

    .. note:: Two early exits exist: an empty
        :class:`~gdsctools.anova_results.ANOVAResults` is returned when
        the buffer is empty, whereas the bare (empty) dataframe is
        returned when the concatenated results have no rows.
    """
    # Number of non-null IC50 per drug. axis=0 is the default but we
    # emphasize that the sum is over each column (i.e. each drug).
    # FIXME: should be in one_drug_one_feature ??
    n_valid = self.ic50.df.notnull().sum(axis=0)
    drug_names = n_valid.index[n_valid >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # TODO: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)
    # NOTE(review): the shuffle only randomizes the processing order (the
    # results are sorted by p-value further down); kept as in the original.
    pylab.shuffle(drug_names)
    if animate is True:
        pb.animate(0)

    for i, drug_name in enumerate(drug_names):
        # reuse buffered results, compute only what is missing
        if drug_name not in self.individual_anova:
            self.individual_anova[drug_name] = self.anova_one_drug(
                drug_name, animate=False, output='dataframe')
        if animate is True:
            pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        # NOTE(review): returns a DataFrame, not an ANOVAResults; kept for
        # backward compatibility with existing callers.
        return df

    # sort all data by ANOVA p-values
    try:
        df.sort_values('ANOVA_FEATURE_pval', inplace=True)
    except AttributeError:
        # Old pandas releases have no sort_values. Only that case falls
        # back to the legacy API; any other error (e.g. a missing column)
        # must propagate instead of being masked.
        df.sort('ANOVA_FEATURE_pval', inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == 'global':
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

    self.df = df
    # order the column names as defined in the __init__ method
    df = df[self.column_names]
    df.reset_index(inplace=True, drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def anova_all(self, animate=True, drugs=None, multicore=None):
    """Run all ANOVA tests for all drugs and all features.

    :param animate: shows the progress bar
    :param drugs: you may select a subset of drugs. When given, it
        replaces the default selection (drugs having at least
        ``settings.minimum_nonna_ic50`` non-null IC50 values).
    :param multicore: if set (truthy), the drugs are handed over to
        ``multicore_analysis(self, drug_names, multicore)`` instead of
        being processed by the sequential loop; the buffer is not used
        in that mode. Presumably the number of processes to use -- to be
        confirmed against ``multicore_analysis``.
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute called **df**

    Calls :meth:`anova_one_drug` for each drug and concatenates all
    results together. In the sequential mode, per-drug results are stored
    in ``self.individual_anova``; a drug already present in that buffer
    is not recomputed, and the concatenation covers everything found in
    the buffer.

    Once all data are gathered, they are sorted by ``ANOVA_FEATURE_pval``
    and, if ``settings.pvalue_correction_level`` is ``"global"``,
    :meth:`add_pvalues_correction` is called to fill a new column with
    FDR corrections. An extra column named "ASSOC_ID" is then inserted as
    first column, numbering the rows of the resulting dataframe from 1.
    That dataframe is also stored in ``self.df`` before being restricted
    to ``self.column_names`` for the returned object.

    .. note:: A thorough comparison with version v17 gives the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.

    .. note:: Two early exits exist: an empty
        :class:`~gdsctools.anova_results.ANOVAResults` is returned when
        the buffer is empty, whereas the bare (empty) dataframe is
        returned when the concatenated results have no rows.
    """
    if self.verbose and len(self.individual_anova):
        print("Reusing some results from the buffer. "
              "To reset the buffer, call reset_buffer() method")

    # Number of non-null IC50 per drug. axis=0 is the default but we
    # emphasize that the sum is over each column (i.e. each drug).
    # FIXME: should be in one_drug_one_feature ??
    n_valid = self.ic50.df.notnull().sum(axis=0)
    drug_names = n_valid.index[n_valid >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # TODO: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)

    if animate is True:
        pb.animate(0)

    if multicore:
        # Note that here, we do not use the buffer
        multicore_analysis(self, drug_names, multicore)
    else:
        for i, drug_name in enumerate(drug_names):
            # reuse buffered results, compute only what is missing
            if drug_name not in self.individual_anova:
                self.individual_anova[drug_name] = self.anova_one_drug(
                    drug_name, animate=False, output="dataframe")
            if animate is True:
                pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        # NOTE(review): returns a DataFrame, not an ANOVAResults; kept for
        # backward compatibility with existing callers.
        return df

    # sort all data by ANOVA p-values
    try:
        df.sort_values("ANOVA_FEATURE_pval", inplace=True)
    except AttributeError:
        # Old pandas releases have no sort_values. Only that case falls
        # back to the legacy API; any other error (e.g. a missing column)
        # must propagate instead of being masked.
        df.sort("ANOVA_FEATURE_pval", inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == "global":
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, "ASSOC_ID", range(1, len(df) + 1))

    self.df = df
    # order the column names as defined in the __init__ method
    df = df[self.column_names]
    df.reset_index(inplace=True, drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def anova_one_drug(self, drug_id, animate=True, output='object'):
    """Computes ANOVA for a given drug across all features

    :param str drug_id: a valid drug identifier, i.e. a column of the
        IC50 dataframe.
    :param animate: shows the progress bar
    :param output: with ``'object'`` (default) the results are wrapped
        in an :class:`~gdsctools.anova_results.ANOVAResults` instance;
        with any other value the dataframe itself is returned.
    :return: an :class:`~gdsctools.anova_results.ANOVAResults` instance
        or a dataframe, depending on **output**. If no association could
        be computed, the empty dataframe is returned whatever **output**
        is.

    Calls :meth:`anova_one_drug_one_feature` for each feature whose
    column sum is at least 3, keeps the results that have a non-None
    ``ANOVA_FEATURE_pval``, appends the drug annotations and calls
    :meth:`add_pvalues_correction` on the resulting dataframe.
    """
    # The first `shift` columns are not features (the FACTOR columns) and
    # must be skipped. No copy of the whole feature matrix is required:
    # nothing is modified here and only the column labels are used below.
    # MSI could be used but is not like in original R code.
    all_features = self.features.df
    shift = self.features.shift
    candidates = all_features.columns[shift:]

    # Keep only features whose column sum is at least 3.
    # FIXME what about features with less than 3 zeros ?
    # TODO: MSI, tissues, name must always be kept
    mask = all_features[candidates].sum(axis=0) >= 3
    selected = candidates[mask]

    # scan all features for a given drug
    assert drug_id in self.ic50.df.columns
    pb = Progress(len(selected), 10)
    records = {}
    for i, feature in enumerate(selected):
        # production True, means we do not want to create a DataFrame
        # for each call to the anova_one_drug_one_feature function
        # Instead, we require dictionaries
        this = self.anova_one_drug_one_feature(drug_id, feature,
                                               production=True)
        if this['ANOVA_FEATURE_pval'] is not None:
            records[feature] = this
        if animate is True:
            pb.animate(i + 1)

    # one row per feature once transposed
    df = pd.DataFrame.from_records(records)
    df = df.T

    df = ANOVAResults().astype(df)
    if len(df) == 0:
        # NOTE(review): a dataframe is returned here even when
        # output == 'object'; kept for backward compatibility.
        return df

    # append DRUG_NAME/DRUG_TARGET columns
    df = self.drug_decode.drug_annotations(df)

    # TODO: drop rows where ANOVA_FEATURE_PVAL is None

    # the correction is applied whatever the requested output is
    df = self.add_pvalues_correction(df)
    if output != 'object':
        return df

    results = ANOVAResults(df, self.settings)
    results.settings = ANOVASettings(**self.settings)
    return results
def anova_one_drug(self, drug_id, animate=True, output="object"):
    """Computes ANOVA for a given drug across all features

    :param str drug_id: a valid drug identifier, i.e. a column of the
        IC50 dataframe.
    :param animate: shows the progress bar
    :param output: with ``"object"`` (default) the results are wrapped
        in an :class:`~gdsctools.anova_results.ANOVAResults` instance;
        with any other value the dataframe itself is returned.
    :return: an :class:`~gdsctools.anova_results.ANOVAResults` instance
        or a dataframe, depending on **output**. If no association could
        be computed, the empty dataframe is returned whatever **output**
        is.

    Calls :meth:`anova_one_drug_one_feature` for each feature whose
    column sum is at least 3, keeps the results that have a non-None
    ``ANOVA_FEATURE_pval``, appends the drug annotations and calls
    :meth:`add_pvalues_correction` on the resulting dataframe.
    """
    # The first `shift` columns are not features (the FACTOR columns) and
    # must be skipped. No copy of the whole feature matrix is required:
    # nothing is modified here and only the column labels are used below.
    # MSI could be used but is not like in original R code.
    all_features = self.features.df
    shift = self.features.shift
    candidates = all_features.columns[shift:]

    # Keep only features whose column sum is at least 3.
    # FIXME what about features with less than 3 zeros ?
    # TODO: MSI, tissues, name must always be kept
    mask = all_features[candidates].sum(axis=0) >= 3
    selected = candidates[mask]

    # scan all features for a given drug
    assert drug_id in self.ic50.df.columns
    pb = Progress(len(selected), 10)
    records = {}
    for i, feature in enumerate(selected):
        # production True, means we do not want to create a DataFrame
        # for each call to the anova_one_drug_one_feature function
        # Instead, we require dictionaries
        this = self.anova_one_drug_one_feature(drug_id, feature,
                                               production=True)
        if this["ANOVA_FEATURE_pval"] is not None:
            records[feature] = this
        if animate is True:
            pb.animate(i + 1)

    # one row per feature once transposed
    df = pd.DataFrame.from_records(records)
    df = df.T

    df = ANOVAResults().astype(df)
    if len(df) == 0:
        # NOTE(review): a dataframe is returned here even when
        # output == "object"; kept for backward compatibility.
        return df

    # append DRUG_NAME/DRUG_TARGET columns
    df = self.drug_decode.drug_annotations(df)

    # TODO: drop rows where ANOVA_FEATURE_PVAL is None

    # the correction is applied whatever the requested output is
    df = self.add_pvalues_correction(df)
    if output != "object":
        return df

    results = ANOVAResults(df)
    results.settings = ANOVASettings(**self.settings)
    return results