def test_get_boxplot_data():
    """Check what ``BoxPlots._get_boxplot_data`` returns in ``'msi'`` mode.

    The one-drug/one-feature data set is built for drug ``Drug_1047_IC50``
    and feature ``TP53_mut``. The test then verifies the second item of the
    returned data (the category labels) and the first two values of the
    third item against reference numbers.
    """
    anova = ANOVA(ic50_test)
    one_drug_one_feature = anova._get_one_drug_one_feature_data(
        'Drug_1047_IC50', 'TP53_mut')
    plotter = BoxPlots(one_drug_one_feature)

    boxplot_data = plotter._get_boxplot_data(mode='msi')

    # Labels of the four MSI x feature categories, in the expected order.
    expected_labels = [
        '***MSI-stable neg',
        '***MSI-stable pos',
        '**MSI-unstable neg',
        '**MSI-unstable pos',
    ]
    assert boxplot_data[1] == expected_labels

    # Reference numbers for the first two entries of the third item
    # (presumably p-values -- confirm against _get_boxplot_data).
    reference = [2.0108071495663922e-47, 0.0012564798887037905]
    observed = [boxplot_data[2][0], boxplot_data[2][1]]
    assert_list_almost_equal(observed, reference)
Esempio n. 2
0
def test_get_boxplot_data():
    """Verify the MSI-mode box plot data for drug 1047 and ``TP53_mut``.

    Builds the one-drug/one-feature data through ``ANOVA``, wraps it in
    ``BoxPlots`` and compares the labels (item 1) and the first two values
    of item 2 of ``_get_boxplot_data(mode='msi')`` with reference values.
    """
    analysis = ANOVA(ic50_test)
    drug_feature_data = analysis._get_one_drug_one_feature_data(
        1047, 'TP53_mut')
    plots = BoxPlots(drug_feature_data)
    result = plots._get_boxplot_data(mode='msi')

    # Expected category labels, in order.
    labels = ['***MSI-stable neg', '***MSI-stable pos']
    labels += ['**MSI-unstable neg', '**MSI-unstable pos']
    assert result[1] == labels

    # Only the first two numeric entries are checked.
    first, second = result[2][0], result[2][1]
    assert_list_almost_equal(
        [first, second],
        [2.0108071495663922e-47, 0.0012564798887037905])
Esempio n. 3
0
    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.',
                                   fontsize=18):
        """Compute the ANOVA for one drug and one feature.

        :param drug_id: a valid drug identifier (must be in ``self.drugIds``)
        :param feature_name: a valid feature name (must be in
            ``self.feature_names``)
        :param bool show: show boxplots with the different factor used
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.
        :param str directory: where to save the figure.
        :param int fontsize: font size forwarded to :class:`BoxPlots` when
            *show* is True.
        :return: a dictionary of results if *production* is True, otherwise
            a one-row :class:`pandas.DataFrame` (index ``[1]``) in which
            ``None`` values have been replaced by ``NaN``.
        :raises ValueError: if *drug_id* or *feature_name* is unknown.

        As a side effect, the attributes ``data_lm`` and ``anova_pvalues``
        are set by the regression branches and, when ``self.sampling`` is
        set, ``samples1``, ``samples2``, ``samples3`` and an entry of
        ``pvalues_features`` are (re)computed.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # the first three feature names are shown as examples
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = odof.drug_id
            results['DRUG_NAME'] = odof.drug_name
            results['DRUG_TARGET'] = odof.drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k, value in results.items():
                if value is None:
                    results[k] = np.nan
            return pd.DataFrame(results, index=[1])

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented several cases as described in the doc for the 4
        # following cases:
        # - TISSUE + MSI +MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id,
                feature_name,
                formula=self.settings.regression_formula,
                odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            # drop the first tissue column for the regression
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            # NOTE: equality (not identity) is kept on purpose so that any
            # value comparing equal to False behaves as before.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if medias:
                    df.drop(medias[0], axis=1, inplace=True)
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            # design matrix with rows: constant, msi, feature (then
            # transposed so that they become columns)
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            # design matrix with rows: constant, feature (then transposed)
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features:
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not
            # be great. This can be checked with the errors.
            # NOTE(review): ``df`` is not assigned by the custom-formula
            # branch above, so sampling combined with a custom regression
            # formula would fail below -- confirm the intended behaviour.
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            for _ in range(N):
                pylab.shuffle(Y)
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                                                        output='dict',
                                                        odof=odof)
                # msi and tissue are optional factors: only a missing key
                # is tolerated, any other error must surface.
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except KeyError:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except KeyError:
                    pass
                #pb.animate(i+1)
            import fitter
            # fit the distribution of -log10(feature p-values)
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory,
                               fontsize=fontsize)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode='media')

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for k, value in results.items():
            if value is None:
                results[k] = np.nan
        return pd.DataFrame(results, index=[1])
Esempio n. 4
0
    def anova_one_drug_one_feature(self, drug_id, feature_name, show=False, production=False, directory="."):
        """Compute the ANOVA for one drug and one feature.

        :param drug_id: a valid drug identifier (must be in ``self.drugIds``)
        :param feature_name: a valid feature name (must be in
            ``self.feature_names``)
        :param bool show: show boxplots with the different factor used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.
        :return: a dictionary of results if *production* is True, otherwise
            a one-row :class:`pandas.DataFrame` (index ``[1]``) in which
            ``None`` values have been replaced by ``NaN``.
        :raises ValueError: if *drug_id* or *feature_name* is unknown.

        As a side effect, the attributes ``data_lm`` and ``anova_pvalues``
        are set by the regression branches and, when ``self.sampling`` is
        set, ``samples1``, ``samples2``, ``samples3`` and an entry of
        ``pvalues_features`` are (re)computed.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError("Unknown drug name %s. Use e.g., %s" % (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # the first three feature names are shown as examples
            raise ValueError("Unknown feature name %s. Use e.g. one of %s" % (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results["FEATURE"] = feature_name
            results["DRUG_ID"] = odof.drug_id
            results["DRUG_NAME"] = odof.drug_name
            results["DRUG_TARGET"] = odof.drug_target
            results["N_FEATURE_pos"] = odof.Npos
            results["N_FEATURE_neg"] = odof.Nneg
            if production is True:
                # return a dict
                return results
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k, value in results.items():
                if value is None:
                    results[k] = np.nan
            return pd.DataFrame(results, index=[1])

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented several cases as described in the doc for the 4
        # following cases:
        # - TISSUE + MSI +MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id, feature_name, formula=self.settings.regression_formula, odof=odof
            )
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == "PANCAN":
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            # ``.loc`` replaces the ``.ix`` indexer, which was removed from
            # pandas.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            # drop the first tissue column for the regression
            tissues = [x for x in df.columns if x.startswith("C(tissue")]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            # NOTE: equality (not identity) is kept on purpose so that any
            # value comparing equal to False behaves as before.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith("C(media)")]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith("C(media")]
                if medias:
                    df.drop(medias[0], axis=1, inplace=True)
            df["C(msi)[T.1]"] = odof.masked_msi.values
            df["feature"] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            # design matrix with rows: constant, msi, feature (then
            # transposed so that they become columns)
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            # design matrix with rows: constant, feature (then transposed)
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features:
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not
            # be great. This can be checked with the errors.
            # NOTE(review): ``df`` is not assigned by the custom-formula
            # branch above, so sampling combined with a custom regression
            # formula would fail below -- confirm the intended behaviour.
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            for _ in range(N):
                pylab.shuffle(Y)
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output="dict", odof=odof)
                # msi and tissue are optional factors: only a missing key
                # is tolerated, any other error must surface.
                try:
                    self.samples1.append(anova_pvalues["msi"])
                except KeyError:
                    pass
                self.samples2.append(anova_pvalues["feature"])
                try:
                    self.samples3.append(anova_pvalues["tissue"])
                except KeyError:
                    pass
                # pb.animate(i+1)
            import fitter

            # fit the distribution of -log10(feature p-values)
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                "error": ff.df_errors.loc[dist].values[0],
                "params": ff.fitted_param[dist],
                "feature": feature_name,
                "N": len(Y),
            }

        if show is True:
            boxplot = BoxPlots(odof, savefig=self.settings.savefig, directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == "PANCAN":
                boxplot.boxplot_pancan(fignum=2, mode="tissue")
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode="msi")
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode="media")

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for k, value in results.items():
            if value is None:
                results[k] = np.nan
        return pd.DataFrame(results, index=[1])
Esempio n. 5
0
    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.'):
        """Compute ANOVA and various tests on one drug and one feature

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show some plots
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of tha analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # or a dataframe; note that index is not relevant here but
                # required.
                df = pd.DataFrame(results, index=[1])
                return df

        # with the data extract, we can now compute the regression.

        # In R or statsmodels, the regression code is simple since
        # it is based on the formula notation (Y~C(msi)+feature)
        # This is also possible in statsmodels library,  however,
        # this relies on patsy, which is very slow as compared to the
        # statsmodels without formula.
        #### self._mydata = pd.DataFrame({'Y':self.Y,
        ####    'tissue':self.masked_tissue,
        ####       'msi': self.masked_msi, 'feature':self.masked_features})
        #### self.data_lm = ols('Y ~ C(tissue) + C(msi) + feature',
        ####  data=self._mydata, missing='none').fit() #Specify C is category

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.

        # Instead of using ols function, we use the OLS one so we cannot
        # use formula. Instead, we need to create manually the input
        # data. In the case of categorical data (tissue), we need to
        # create the dummy variable, which is done in the constructor
        # once for all (slow otherwise).
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            #self._mydata = pd.DataFrame({'Y': odof.Y.copy(),
            #    'tissue':odof.masked_tissue,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm2 = ols('Y ~ C(tissue) + C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical

            # from statsmodels.stats.anova import anova_lm
            # import statsmodels.formula.api as smf
            # df  = pd.DataFrame({'Y': odof.Y.copy(),
            #   'tissue':odof.masked_tissue,'media'
            #    odof.masked_media, 'msi':  odof.masked_msi,
            #   'feature': odof.masked_features})
            # lm = smf.ols('Y~C(tissue)+C(media)+C(msi)+feature',
            #    data=df).fit()
            #  anova_lm(lm)
            # The code above gives same answer as the code in gdsctools
            # but is slower

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.ix[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            self.Y = odof.Y
            self.EV = df.values
            # The regression and anova summary are done here
            #
            """if self.settings.regression_method == 'ElasticNet':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=self.settings.regression_L1_wt)
            elif self.settings.regression_method == 'OLS':
                self.data_lm = OLS(odof.Y, df.values).fit()
            elif self.settings.regression_method == 'Ridge':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=0)
            elif self.settings.regression_method == 'Lasso':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=1)
            """
            # example of computing null model ?
            # Example of computing pvalues ourself
            # with 100 000 samples, we can get a smooth distribution
            # that we can then fit with fitter. good distribution
            # for the raw data is uniform one but if we take the log10,
            # we have lots of possible distrob such as beta, exponweib, gamma,
            #....
        elif self.settings.include_MSI_factor is True:
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
        else:
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ feature',
            #    data=self._mydata).fit() #Specify C for Categorical

        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha,
                L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=1)

        key = drug_id + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be check zith the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):

                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output='dict')
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.ix[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }
            print(self.pvalues_features[key])

        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     output='dict')

        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:
            tissue_PVAL = self.anova_pvalues['tissue']
        except:
            tissue_PVAL = None

        try:
            MSI_PVAL = self.anova_pvalues['msi']
        except:
            MSI_PVAL = None

        try:
            FEATURE_PVAL = self.anova_pvalues['feature']
        except:
            FEATURE_PVAL = None

        try:
            MEDIA_PVAL = self.anova_pvalues['media']
        except:
            MEDIA_PVAL = None

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {
            'FEATURE': feature_name,
            'DRUG_ID': drug_id,
            'DRUG_NAME': drug_name,
            'DRUG_TARGET': drug_target,
            'N_FEATURE_pos': odof.Npos,
            'N_FEATURE_neg': odof.Nneg,
            'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
            'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
            'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
            'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
            'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
            'FEATURE_IC50_effect_size': odof.effectsize_ic50,
            'FEATURE_pos_Glass_delta': odof.pos_glass,
            'FEATURE_neg_Glass_delta': odof.neg_glass,
            'ANOVA_FEATURE_pval': FEATURE_PVAL,
            'ANOVA_TISSUE_pval': tissue_PVAL,
            'ANOVA_MSI_pval': MSI_PVAL,
            'ANOVA_MEDIA_pval': MEDIA_PVAL,
            'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
        }

        # 12% of the time here
        if production is True:
            return results
        else:
            df = pd.DataFrame(results, index=[1])
            return df
Esempio n. 6
0
    def anova_one_drug_one_feature(self, drug_id,
            feature_name, show=False,
            production=False, directory='.'):
        """Compute ANOVA and summary statistics for one drug and one feature

        :param drug_id: a valid drug identifier (must be in ``self.drugIds``).
            It may be a string or an integer.
        :param feature_name: a valid feature name (must be in
            ``self.feature_names``)
        :param bool show: if True, draw the association boxplot, plus the
            tissue boxplot when the analysis type is PANCAN and the MSI
            boxplot when the MSI factor is included.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.
        :param str directory: where to save the figure (passed to
            :class:`BoxPlots`).
        :return: a dictionary (``production=True``) or a one-row dataframe
            with index ``[1]`` holding the drug/feature description, the
            summary statistics taken from the one-drug-one-feature data
            and the ANOVA p-values (``None`` for factors not in the model).
        :raises ValueError: if the drug identifier or the feature name is
            unknown.

        Side effects: sets :attr:`data_lm` and :attr:`anova_pvalues`; in
        PANCAN mode also sets :attr:`Y` and :attr:`EV`; when
        :attr:`sampling` is set, fills :attr:`samples1`, :attr:`samples2`,
        :attr:`samples3` and adds an entry to :attr:`pvalues_features`.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s'
                    % (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # the first three feature names are shown as examples
            raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                    % (feature_name, self.feature_names[0:3]))

        # This extracts the relevant data and some simple metrics
        # (reported by the original author as 45 seconds for 265 drugs
        # and 988 features)
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                return results
            # the index is not relevant here but required by pandas
            # to build a dataframe from scalar values
            return pd.DataFrame(results, index=[1])

        # With the data extracted, we can now compute the regression.
        #
        # In R or statsmodels the regression could be written with the
        # formula notation, e.g. 'Y ~ C(tissue) + C(msi) + feature', but
        # this relies on patsy, which is very slow as compared to
        # statsmodels without formula. Instead we use OLS directly and
        # build the matrix of explanatory variables manually.
        #
        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but it is pretty slow; instead
            # the full dummy matrix is created once for all and we select
            # the relevant rows by label (.loc replaces the .ix indexer,
            # which was removed from pandas).
            # One issue is that some columns end up with sum == 0
            # and need to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0: # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if
                        x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            # keep the response and explanatory variables for inspection
            self.Y = odof.Y
            self.EV = df.values
        elif self.settings.include_MSI_factor is True:
            # equivalent to the formula 'Y ~ C(msi) + feature'
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
        else:
            # equivalent to the formula 'Y ~ feature'
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

        # The regression is done here.
        # NOTE(review): there is no fallback branch; with any other
        # regression_method, self.data_lm is left untouched (possibly
        # stale from a previous call) -- confirm settings are validated
        # elsewhere.
        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                    alpha=self.settings.regression_alpha,
                    L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                    alpha=self.settings.regression_alpha,
                    L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                    alpha=self.settings.regression_alpha,
                    L1_wt=1)

        # String formatting (rather than +) so that integer drug
        # identifiers do not raise a TypeError
        key = "%s__%s" % (drug_id, feature_name)
        if self.sampling and key not in self.pvalues_features:
            # Empirical null distribution of the p-values obtained by
            # shuffling Y, later fitted with the fitter package.
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may
            # not be great. This can be checked with the errors.
            # Original author's note: with 100 000 samples, we can get a
            # smooth distribution; a good distribution for the raw data is
            # the uniform one but if we take the log10, there are lots of
            # possible distributions such as beta, exponweib, gamma, ...
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            # progress bar is created but currently not animated
            pb = Progress(N, 20)

            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            for _ in range(N):
                pylab.shuffle(Y)
                data_lm = OLS(Y+noise*pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                    output='dict')
                # msi and tissue factors are optional; feature is not
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except KeyError:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except KeyError:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N':len(Y)
            }
            print(self.pvalues_features[key])

        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                 output='dict')

        # Store the pvalues. Note that some may be missing (factor not
        # included in the model), in which case None is stored.
        tissue_PVAL = self.anova_pvalues.get('tissue')
        MSI_PVAL = self.anova_pvalues.get('msi')
        FEATURE_PVAL = self.anova_pvalues.get('feature')
        MEDIA_PVAL = self.anova_pvalues.get('media')

        if show is True:
            boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                    directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analysis_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {'FEATURE': feature_name,
                'DRUG_ID': drug_id,
                'DRUG_NAME': drug_name,
                'DRUG_TARGET': drug_target,
                'N_FEATURE_pos': odof.Npos,
                'N_FEATURE_neg': odof.Nneg,
                'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
                'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
                'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
                'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
                'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
                'FEATURE_IC50_effect_size': odof.effectsize_ic50,
                'FEATURE_pos_Glass_delta': odof.pos_glass,
                'FEATURE_neg_Glass_delta': odof.neg_glass,
                'ANOVA_FEATURE_pval': FEATURE_PVAL,
                'ANOVA_TISSUE_pval': tissue_PVAL,
                'ANOVA_MSI_pval': MSI_PVAL,
                'ANOVA_MEDIA_pval': MEDIA_PVAL,
                'FEATURE_IC50_T_pval': odof.ttest # pvalues is in index 1
                }

        # 12% of the time here
        if production is True:
            return results
        return pd.DataFrame(results, index=[1])