Python BoxPlots Examples

Programming Language: Python

Namespace/Package Name: gdsctools.boxplots

Class/Type: BoxPlots

Examples at hotexamples.com: 6

Python BoxPlots - 6 examples found. These are the top rated real world Python examples of gdsctools.boxplots.BoxPlots extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

BoxPlots(3)

boxplot_association(2)

boxplot_pancan(2)

_get_boxplot_data(1)

Example #1

Show file

File: test_boxplots.py Project: howard-lightfoot/gdsctools

def test_get_boxplot_data():
    an = ANOVA(ic50_test)
    odof = an._get_one_drug_one_feature_data('Drug_1047_IC50','TP53_mut')

    bb = BoxPlots(odof)

    data = bb._get_boxplot_data(mode='msi')
    assert data[1] == ['***MSI-stable neg', '***MSI-stable pos',
                  '**MSI-unstable neg',  '**MSI-unstable pos']
    expected = [2.0108071495663922e-47, 0.0012564798887037905]
    assert_list_almost_equal([data[2][0], data[2][1]], expected)

Example #2

Show file

File: test_boxplots.py Project: shukwong/gdsctools

def test_get_boxplot_data():
    an = ANOVA(ic50_test)
    odof = an._get_one_drug_one_feature_data(1047, 'TP53_mut')

    bb = BoxPlots(odof)

    data = bb._get_boxplot_data(mode='msi')
    assert data[1] == [
        '***MSI-stable neg', '***MSI-stable pos', '**MSI-unstable neg',
        '**MSI-unstable pos'
    ]
    expected = [2.0108071495663922e-47, 0.0012564798887037905]
    assert_list_almost_equal([data[2][0], data[2][1]], expected)

Example #3

Show file

File: anova.py Project: Donnyvdm/gdsctools

    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.',
                                   fontsize=18):
        """Compute ABOVA one drug and one feature level

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show boxplots with the different factor used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = odof.drug_id
            results['DRUG_NAME'] = odof.drug_name
            results['DRUG_TARGET'] = odof.drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # with newer version of pandas (v0.19), None are not accepted
                # anymore
                for k in results.keys():
                    if results[k] is None:
                        results[k] = np.nan
                df = pd.DataFrame(results, index=[1])
                return df

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented several cases as described in the doc for the 4
        # following cases:
        # - TISSUE + MSI +MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id,
                feature_name,
                formula=self.settings.regression_formula,
                odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if len(medias):
                    df.drop(medias[0], axis=1, inplace=True)
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be check zith the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):
                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                                                        output='dict',
                                                        odof=odof)
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory,
                               fontsize=fontsize)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode='media')

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df

Example #4

Show file

File: anova.py Project: CancerRxGene/gdsctools

    def anova_one_drug_one_feature(self, drug_id, feature_name, show=False, production=False, directory="."):
        """

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show boxplots with the different factor used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        data = data.replace(np.inf, 0)
        """
        if drug_id not in self.drugIds:
            raise ValueError("Unknown drug name %s. Use e.g., %s" % (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError("Unknown feature name %s. Use e.g. one of %s" % (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results["FEATURE"] = feature_name
            results["DRUG_ID"] = odof.drug_id
            results["DRUG_NAME"] = odof.drug_name
            results["DRUG_TARGET"] = odof.drug_target
            results["N_FEATURE_pos"] = odof.Npos
            results["N_FEATURE_neg"] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # with newer version of pandas (v0.19), None are not accepted
                # anymore
                for k in results.keys():
                    if results[k] is None:
                        results[k] = np.nan
                df = pd.DataFrame(results, index=[1])
                return df

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented several cases as described in the doc for the 4
        # following cases:
        # - TISSUE + MSI +MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id, feature_name, formula=self.settings.regression_formula, odof=odof
            )
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == "PANCAN":
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.ix[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            tissues = [x for x in df.columns if x.startswith("C(tissue")]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith("C(media)")]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith("C(media")]
                if len(medias):
                    df.drop(medias[0], axis=1, inplace=True)
            df["C(msi)[T.1]"] = odof.masked_msi.values
            df["feature"] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be check zith the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):
                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                # data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output="dict", odof=odof)
                try:
                    self.samples1.append(anova_pvalues["msi"])
                except:
                    pass
                self.samples2.append(anova_pvalues["feature"])
                try:
                    self.samples3.append(anova_pvalues["tissue"])
                except:
                    pass
                # pb.animate(i+1)
            import fitter

            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                "error": ff.df_errors.ix[dist].values[0],
                "params": ff.fitted_param[dist],
                "feature": feature_name,
                "N": len(Y),
            }

        if show is True:
            boxplot = BoxPlots(odof, savefig=self.settings.savefig, directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == "PANCAN":
                boxplot.boxplot_pancan(fignum=2, mode="tissue")
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode="msi")
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode="media")

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df

Example #5

Show file

    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.'):
        """Compute ANOVA and various tests on one drug and one feature

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show some plots
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of tha analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # or a dataframe; note that index is not relevant here but
                # required.
                df = pd.DataFrame(results, index=[1])
                return df

        # with the data extract, we can now compute the regression.

        # In R or statsmodels, the regression code is simple since
        # it is based on the formula notation (Y~C(msi)+feature)
        # This is also possible in statsmodels library,  however,
        # this relies on patsy, which is very slow as compared to the
        # statsmodels without formula.
        #### self._mydata = pd.DataFrame({'Y':self.Y,
        ####    'tissue':self.masked_tissue,
        ####       'msi': self.masked_msi, 'feature':self.masked_features})
        #### self.data_lm = ols('Y ~ C(tissue) + C(msi) + feature',
        ####  data=self._mydata, missing='none').fit() #Specify C is category

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.

        # Instead of using ols function, we use the OLS one so we cannot
        # use formula. Instead, we need to create manually the input
        # data. In the case of categorical data (tissue), we need to
        # create the dummy variable, which is done in the constructor
        # once for all (slow otherwise).
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            #self._mydata = pd.DataFrame({'Y': odof.Y.copy(),
            #    'tissue':odof.masked_tissue,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm2 = ols('Y ~ C(tissue) + C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical

            # from statsmodels.stats.anova import anova_lm
            # import statsmodels.formula.api as smf
            # df  = pd.DataFrame({'Y': odof.Y.copy(),
            #   'tissue':odof.masked_tissue,'media'
            #    odof.masked_media, 'msi':  odof.masked_msi,
            #   'feature': odof.masked_features})
            # lm = smf.ols('Y~C(tissue)+C(media)+C(msi)+feature',
            #    data=df).fit()
            #  anova_lm(lm)
            # The code above gives same answer as the code in gdsctools
            # but is slower

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.ix[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            self.Y = odof.Y
            self.EV = df.values
            # The regression and anova summary are done here
            #
            """if self.settings.regression_method == 'ElasticNet':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=self.settings.regression_L1_wt)
            elif self.settings.regression_method == 'OLS':
                self.data_lm = OLS(odof.Y, df.values).fit()
            elif self.settings.regression_method == 'Ridge':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=0)
            elif self.settings.regression_method == 'Lasso':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=1)
            """
            # example of computing null model ?
            # Example of computing pvalues ourself
            # with 100 000 samples, we can get a smooth distribution
            # that we can then fit with fitter. good distribution
            # for the raw data is uniform one but if we take the log10,
            # we have lots of possible distrob such as beta, exponweib, gamma,
            #....
        elif self.settings.include_MSI_factor is True:
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
        else:
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ feature',
            #    data=self._mydata).fit() #Specify C for Categorical

        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha,
                L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=1)

        key = drug_id + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be check zith the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):

                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output='dict')
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.ix[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }
            print(self.pvalues_features[key])

        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     output='dict')

        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:
            tissue_PVAL = self.anova_pvalues['tissue']
        except:
            tissue_PVAL = None

        try:
            MSI_PVAL = self.anova_pvalues['msi']
        except:
            MSI_PVAL = None

        try:
            FEATURE_PVAL = self.anova_pvalues['feature']
        except:
            FEATURE_PVAL = None

        try:
            MEDIA_PVAL = self.anova_pvalues['media']
        except:
            MEDIA_PVAL = None

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {
            'FEATURE': feature_name,
            'DRUG_ID': drug_id,
            'DRUG_NAME': drug_name,
            'DRUG_TARGET': drug_target,
            'N_FEATURE_pos': odof.Npos,
            'N_FEATURE_neg': odof.Nneg,
            'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
            'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
            'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
            'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
            'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
            'FEATURE_IC50_effect_size': odof.effectsize_ic50,
            'FEATURE_pos_Glass_delta': odof.pos_glass,
            'FEATURE_neg_Glass_delta': odof.neg_glass,
            'ANOVA_FEATURE_pval': FEATURE_PVAL,
            'ANOVA_TISSUE_pval': tissue_PVAL,
            'ANOVA_MSI_pval': MSI_PVAL,
            'ANOVA_MEDIA_pval': MEDIA_PVAL,
            'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
        }

        # 12% of the time here
        if production is True:
            return results
        else:
            df = pd.DataFrame(results, index=[1])
            return df

Example #6

Show file

File: anova.py Project: saezrodriguez/gdsctools

    def anova_one_drug_one_feature(self, drug_id,
            feature_name, show=False,
            production=False, directory='.'):
        """Compute ANOVA and various tests on one drug and one feature

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show some plots
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of tha analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s'
                    % (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                    % (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # or a dataframe; note that index is not relevant here but
                # required.
                df = pd.DataFrame(results, index=[1])
                return df

        # with the data extract, we can now compute the regression.

        # In R or statsmodels, the regression code is simple since
        # it is based on the formula notation (Y~C(msi)+feature)
        # This is also possible in statsmodels library,  however,
        # this relies on patsy, which is very slow as compared to the
        # statsmodels without formula.
        #### self._mydata = pd.DataFrame({'Y':self.Y,
        ####    'tissue':self.masked_tissue,
        ####       'msi': self.masked_msi, 'feature':self.masked_features})
        #### self.data_lm = ols('Y ~ C(tissue) + C(msi) + feature',
        ####  data=self._mydata, missing='none').fit() #Specify C is category

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.

        # Instead of using ols function, we use the OLS one so we cannot
        # use formula. Instead, we need to create manually the input
        # data. In the case of categorical data (tissue), we need to
        # create the dummy variable, which is done in the constructor
        # once for all (slow otherwise).
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            #self._mydata = pd.DataFrame({'Y': odof.Y.copy(),
            #    'tissue':odof.masked_tissue,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm2 = ols('Y ~ C(tissue) + C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical

            # from statsmodels.stats.anova import anova_lm
            # import statsmodels.formula.api as smf
            # df  = pd.DataFrame({'Y': odof.Y.copy(),
            #   'tissue':odof.masked_tissue,'media'
            #    odof.masked_media, 'msi':  odof.masked_msi,
            #   'feature': odof.masked_features})
            # lm = smf.ols('Y~C(tissue)+C(media)+C(msi)+feature',
            #    data=df).fit()
            #  anova_lm(lm)
            # The code above gives same answer as the code in gdsctools
            # but is slower

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.ix[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0: # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if
                        x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            self.Y = odof.Y
            self.EV = df.values
            # The regression and anova summary are done here
            #
            """if self.settings.regression_method == 'ElasticNet':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=self.settings.regression_L1_wt)
            elif self.settings.regression_method == 'OLS':
                self.data_lm = OLS(odof.Y, df.values).fit()
            elif self.settings.regression_method == 'Ridge':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=0)
            elif self.settings.regression_method == 'Lasso':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=1)
            """
            # example of computing null model ?
            # Example of computing pvalues ourself
            # with 100 000 samples, we can get a smooth distribution
            # that we can then fit with fitter. good distribution 
            # for the raw data is uniform one but if we take the log10, 
            # we have lots of possible distrob such as beta, exponweib, gamma,
            #....
        elif self.settings.include_MSI_factor is True:
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
        else:
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ feature',
            #    data=self._mydata).fit() #Specify C for Categorical

        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                    alpha=self.settings.regression_alpha,
                    L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                    alpha=self.settings.regression_alpha,
                    L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                    alpha=self.settings.regression_alpha,
                    L1_wt=1)


        key = drug_id + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be check zith the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):

                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y+noise*pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                    output='dict')
                try:self.samples1.append(anova_pvalues['msi'])
                except:pass
                self.samples2.append(anova_pvalues['feature'])
                try:self.samples3.append(anova_pvalues['tissue'])
                except:pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.ix[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N':len(Y)
            }
            print(self.pvalues_features[key])


        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                 output='dict')

        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:
            tissue_PVAL = self.anova_pvalues['tissue']
        except:
            tissue_PVAL = None

        try:
            MSI_PVAL = self.anova_pvalues['msi']
        except:
            MSI_PVAL = None

        try:
            FEATURE_PVAL = self.anova_pvalues['feature']
        except:
            FEATURE_PVAL = None

        try:
            MEDIA_PVAL = self.anova_pvalues['media']
        except:
            MEDIA_PVAL = None

        if show is True:
            boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                    directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {'FEATURE': feature_name,
                'DRUG_ID': drug_id,
                'DRUG_NAME': drug_name,
                'DRUG_TARGET': drug_target,
                'N_FEATURE_pos': odof.Npos,
                'N_FEATURE_neg': odof.Nneg,
                'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
                'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
                'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
                'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
                'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
                'FEATURE_IC50_effect_size': odof.effectsize_ic50,
                'FEATURE_pos_Glass_delta': odof.pos_glass,
                'FEATURE_neg_Glass_delta': odof.neg_glass,
                'ANOVA_FEATURE_pval': FEATURE_PVAL,
                'ANOVA_TISSUE_pval': tissue_PVAL,
                'ANOVA_MSI_pval': MSI_PVAL,
                'ANOVA_MEDIA_pval': MEDIA_PVAL,
                'FEATURE_IC50_T_pval': odof.ttest # pvalues is in index 1
                }

        # 12% of the time here
        if production is True:
            return results
        else:
            df = pd.DataFrame(results, index=[1])
            return df