Example #1
def fit_quality(df):
    regr_df = df.reset_index()
    # convert the dates to days elapsed since the first observation
    day_nanos = 24 * 60 * 60 * 1E9
    nanos = regr_df['date'] - regr_df['date'].min()
    df2 = pandas.DataFrame(
        data=[nanos.astype('int64') / day_nanos, regr_df['equity']]).transpose()
    # regress equity on elapsed days (no intercept term)
    ols2 = OLS(df2['equity'], df2['date'])
    result = ols2.fit()
    return {
        'p-value F-test': result.f_pvalue,
        'r-squared': result.rsquared,
        'p-value x': result.pvalues[0]
    }
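
A minimal usage sketch for fit_quality (not part of the original snippet): the expected input layout, a DataFrame with a 'date' index and an 'equity' column, is inferred from how the function reads its argument, and the synthetic series here is purely illustrative.

import numpy as np
import pandas
from statsmodels.api import OLS

rng = np.random.default_rng(0)
dates = pandas.date_range('2020-01-01', periods=100, freq='D')
equity = pandas.Series(100 + 0.5 * np.arange(100) + rng.normal(size=100),
                       index=dates, name='equity')
df = equity.to_frame()
df.index.name = 'date'   # fit_quality() expects a 'date' column after reset_index()

print(fit_quality(df))   # {'p-value F-test': ..., 'r-squared': ..., 'p-value x': ...}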
Example #2
    def compute_regression(self):
        if len(self._y_values) < len(self.securities) - 1:
            # not enough values for regression: bail out early
            _LOGGER.error('not enough values for regression')
            return

        dependent = pandas.DataFrame({self.securities[0]: self._y_values})
        independent = pandas.DataFrame(
            {key: self._x_values[count]
             for count, key in enumerate(self.securities[1:])})
        if self._with_constant_term:
            ols = OLS(dependent, add_constant(independent))

        else:
            ols = OLS(dependent, independent)

        self.result = ols.fit()
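
For context, a small standalone sketch of the behaviour that the _with_constant_term flag above toggles: statsmodels' OLS fits through the origin unless a constant column is added explicitly with add_constant. The names x and y are illustrative.

import numpy as np
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 2))
y = 3.0 + x @ np.array([1.5, -2.0]) + rng.normal(scale=0.1, size=100)

no_const = OLS(y, x).fit()                  # regression through the origin
with_const = OLS(y, add_constant(x)).fit()  # intercept column prepended
print(no_const.params)                      # two slopes only
print(with_const.params)                    # intercept plus two slopes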
Example #3
def MultiReg(pdf, DV, IV):
    from statsmodels.api import OLS
    from statsmodels.formula.api import ols
    df = pd.read_excel(pdf)
    y, x1, x2 = df[DV[0]], df[IV[0]], df[IV[1]]
    print(len(x1))
    print(len(x2))
    x = np.column_stack((x1,x2))
    REG = OLS(y,x).fit()
    reg = ols(formula='y~x1+x2',data=df).fit()
    print(REG.summary())
    print('_'*50)
    print(reg.summary())
    eq1, eq2 = list(REG.params), list(reg.params)
    # fitted values: REG has no intercept (two slopes), reg has Intercept + two slopes
    df['REG'] = eq1[0]*x1 + eq1[1]*x2
    df['reg'] = eq2[0] + eq2[1]*x1 + eq2[2]*x2
    plt.plot(df['REG'],'b-')
    plt.plot(y,'rs')
    plt.show()
    plt.clf()
    plt.plot(df['reg'],'b-')
    plt.plot(y,'rs')
    plt.show()
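
A short sketch (synthetic data, illustrative names) of why the two summaries printed by MultiReg differ: the array-style OLS(y, x) call has no intercept, while the formula ols('y ~ x1 + x2') adds one automatically; prepending a constant column makes the two fits agree.

import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant
from statsmodels.formula.api import ols

rng = np.random.default_rng(1)
df = pd.DataFrame({'x1': rng.normal(size=200), 'x2': rng.normal(size=200)})
df['y'] = 2.0 + 0.7 * df['x1'] - 1.3 * df['x2'] + rng.normal(scale=0.1, size=200)

no_const = OLS(df['y'], df[['x1', 'x2']]).fit()                 # no intercept
with_const = OLS(df['y'], add_constant(df[['x1', 'x2']])).fit()
formula_fit = ols('y ~ x1 + x2', data=df).fit()                 # intercept implied

print(no_const.params)      # two coefficients only
print(with_const.params)    # const, x1, x2 -- matches the formula fit
print(formula_fit.params)   # Intercept, x1, x2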
Example #4
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=53)

from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()
linear_regressor.fit(x_train, y_train)

y_predictions = linear_regressor.predict(x_test)

from statsmodels.api import OLS
# prepend a column of ones as the intercept term (dataset assumed to have 50 rows)
x_new = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)

x_opt = x_new[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 1, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 3, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 3]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()
print(regressor_OLS.summary())
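
The block above performs backward elimination by hand. As a sketch of the same idea (assuming x_new and y as defined in the snippet; the helper name is made up), the loop can be automated by repeatedly dropping the regressor with the highest p-value:

import numpy as np
from statsmodels.api import OLS

def backward_elimination(X, y, significance=0.05):
    """Drop the regressor with the largest p-value until all remaining ones pass."""
    cols = list(range(X.shape[1]))
    while len(cols) > 1:
        model = OLS(endog=y, exog=X[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= significance:
            break                 # every remaining regressor is significant
        del cols[worst]           # remove the least significant column and refit
    return cols, OLS(endog=y, exog=X[:, cols]).fit()

kept_columns, final_model = backward_elimination(x_new, y)
print(kept_columns)
print(final_model.summary())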
Example #5
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1 / 5)

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_predict = regressor.predict(X_test)

X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

from statsmodels.api import OLS

ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_opt = X[:, [0, 1, 3, 4, 5]]
ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_opt = X[:, [0, 3, 4, 5]]
ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_opt = X[:, [0, 3, 5]]
ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_train, X_test, Y_train, Y_test = train_test_split(X_opt, Y, test_size=1 / 5)
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# ***** Predicting the Test set results & showing R-Squared *****
Y_pred = regressor.predict(X_test)
r_squared = regressor.score(X_test, Y_test)

# <<< Building an optimal model using Backward Elimination by statsmodels >>>
from statsmodels.api import OLS
from statsmodels.tools.tools import add_constant

# Adding a column of Ones to X : add_constant method
X = add_constant(X)

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary() # Adj. R-squared = 0.945

# Let's compare P to 0.05
# Remove x2 (index 2) [0.99 >> 0.05]
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary() # Adj. R-squared = 0.946

# Remove x1 (index 1) [0.94 >> 0.05]
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary() # Adj. R-squared = 0.948

# Remove x2 (index 2) [0.602 >> 0.05]
X_opt = X[:, [0, 3, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
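
Both variants above prepend an intercept column, one manually with np.ones and one with add_constant; a tiny sketch (arbitrary demo array) showing the two are interchangeable:

import numpy as np
from statsmodels.tools.tools import add_constant

X_demo = np.arange(12, dtype=float).reshape(6, 2)
manual = np.append(arr=np.ones((6, 1)), values=X_demo, axis=1)
helper = add_constant(X_demo)        # prepends the constant column by default
print(np.allclose(manual, helper))   # True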
Example #7
    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.',
                                   fontsize=18):
        """Compute ABOVA one drug and one feature level

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show boxplots for the different factors used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extracts the relevant data and some simple metrics.
        # It is now pretty fast, taking about 45 seconds
        # for 265 drugs and 988 features.
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = odof.drug_id
            results['DRUG_NAME'] = odof.drug_name
            results['DRUG_TARGET'] = odof.drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # with newer version of pandas (v0.19), None are not accepted
                # anymore
                for k in results.keys():
                    if results[k] is None:
                        results[k] = np.nan
                df = pd.DataFrame(results, index=[1])
                return df

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented the 4 following cases explicitly, as described
        # in the doc:
        # - TISSUE + MSI + MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id,
                feature_name,
                formula=self.settings.regression_formula,
                odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # model the order of the factors matters and the first
            # factor is used as a reference, we decided to use the same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and need to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if len(medias):
                    df.drop(medias[0], axis=1, inplace=True)
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed once per drug;
            # no need to redo it for each feature.
            # If the length of Y is too small (e.g., < 20) the results may
            # not be great. This can be checked with the errors.
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):
                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                                                        output='dict',
                                                        odof=odof)
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory,
                               fontsize=fontsize)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode='media')

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df
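
For reference, a compact standalone sketch (made-up column names and synthetic data) of the design matrix that the PANCAN branch above assembles by hand: tissue dummies with the first level dropped as reference, an MSI indicator and the feature column, followed by an array-style OLS fit.

import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(2)
n = 60
tissue = pd.Series(rng.choice(['breast', 'lung', 'skin'], size=n))
msi = rng.integers(0, 2, size=n)
feature = rng.integers(0, 2, size=n)
Y = rng.normal(size=n)

# Dummy-encode the tissue factor and drop the first level as the reference,
# mirroring what the precomputed _tissue_dummies matrix provides.
design = pd.get_dummies(tissue, prefix='C(tissue)', drop_first=True).astype(float)
design['C(msi)[T.1]'] = msi
design['feature'] = feature
design = add_constant(design)

data_lm = OLS(Y, design.values).fit()
print(data_lm.params)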
Example #8
    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.'):
        """Compute ANOVA and various tests on one drug and one feature

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show some plots
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extracts the relevant data and some simple metrics.
        # It is now pretty fast, taking about 45 seconds
        # for 265 drugs and 988 features.
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # or a dataframe; note that index is not relevant here but
                # required.
                df = pd.DataFrame(results, index=[1])
                return df

        # With the data extracted, we can now compute the regression.

        # In R, the regression code is simple since it is based
        # on the formula notation (Y ~ C(msi) + feature).
        # The same is possible with the statsmodels formula interface;
        # however, that relies on patsy, which is very slow compared to
        # statsmodels without formulas.
        #### self._mydata = pd.DataFrame({'Y':self.Y,
        ####    'tissue':self.masked_tissue,
        ####       'msi': self.masked_msi, 'feature':self.masked_features})
        #### self.data_lm = ols('Y ~ C(tissue) + C(msi) + feature',
        ####  data=self._mydata, missing='none').fit() #Specify C is category

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.

        # Instead of the ols function, we use the OLS class, so we cannot
        # use formulas. Instead, we need to build the input data manually.
        # For categorical data (tissue), we need to create the dummy
        # variables, which is done in the constructor once and for all
        # (slow otherwise).
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # model the order of the factors matters and the first
            # factor is used as a reference, we decided to use the same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            #self._mydata = pd.DataFrame({'Y': odof.Y.copy(),
            #    'tissue':odof.masked_tissue,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm2 = ols('Y ~ C(tissue) + C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical

            # from statsmodels.stats.anova import anova_lm
            # import statsmodels.formula.api as smf
            # df  = pd.DataFrame({'Y': odof.Y.copy(),
            #   'tissue':odof.masked_tissue,'media'
            #    odof.masked_media, 'msi':  odof.masked_msi,
            #   'feature': odof.masked_features})
            # lm = smf.ols('Y~C(tissue)+C(media)+C(msi)+feature',
            #    data=df).fit()
            #  anova_lm(lm)
            # The code above gives same answer as the code in gdsctools
            # but is slower

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and need to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            self.Y = odof.Y
            self.EV = df.values
            # The regression and anova summary are done here
            #
            """if self.settings.regression_method == 'ElasticNet':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=self.settings.regression_L1_wt)
            elif self.settings.regression_method == 'OLS':
                self.data_lm = OLS(odof.Y, df.values).fit()
            elif self.settings.regression_method == 'Ridge':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=0)
            elif self.settings.regression_method == 'Lasso':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=1)
            """
            # Example of computing the null model?
            # Example of computing pvalues ourselves:
            # with 100 000 samples, we can get a smooth distribution
            # that we can then fit with fitter. A good distribution
            # for the raw data is the uniform one, but if we take the log10,
            # we have lots of possible distributions such as beta, exponweib,
            # gamma, ...
        elif self.settings.include_MSI_factor is True:
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
        else:
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ feature',
            #    data=self._mydata).fit() #Specify C for Categorical

        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha,
                L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=1)

        key = drug_id + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed once per drug;
            # no need to redo it for each feature.
            # If the length of Y is too small (e.g., < 20) the results may
            # not be great. This can be checked with the errors.
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):

                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output='dict')
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }
            print(self.pvalues_features[key])

        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     output='dict')

        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:
            tissue_PVAL = self.anova_pvalues['tissue']
        except:
            tissue_PVAL = None

        try:
            MSI_PVAL = self.anova_pvalues['msi']
        except:
            MSI_PVAL = None

        try:
            FEATURE_PVAL = self.anova_pvalues['feature']
        except:
            FEATURE_PVAL = None

        try:
            MEDIA_PVAL = self.anova_pvalues['media']
        except:
            MEDIA_PVAL = None

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {
            'FEATURE': feature_name,
            'DRUG_ID': drug_id,
            'DRUG_NAME': drug_name,
            'DRUG_TARGET': drug_target,
            'N_FEATURE_pos': odof.Npos,
            'N_FEATURE_neg': odof.Nneg,
            'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
            'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
            'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
            'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
            'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
            'FEATURE_IC50_effect_size': odof.effectsize_ic50,
            'FEATURE_pos_Glass_delta': odof.pos_glass,
            'FEATURE_neg_Glass_delta': odof.neg_glass,
            'ANOVA_FEATURE_pval': FEATURE_PVAL,
            'ANOVA_TISSUE_pval': tissue_PVAL,
            'ANOVA_MSI_pval': MSI_PVAL,
            'ANOVA_MEDIA_pval': MEDIA_PVAL,
            'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
        }

        # 12% of the time here
        if production is True:
            return results
        else:
            df = pd.DataFrame(results, index=[1])
            return df
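
The commented-out blocks above describe the equivalent but slower formula route; for completeness, a self-contained version of it with synthetic data and illustrative column names:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

rng = np.random.default_rng(3)
n = 80
df = pd.DataFrame({
    'Y': rng.normal(size=n),
    'tissue': rng.choice(['breast', 'lung', 'skin'], size=n),
    'msi': rng.integers(0, 2, size=n),
    'feature': rng.integers(0, 2, size=n),
})

# Formula interface: patsy builds the dummies; C() marks categorical factors.
lm = smf.ols('Y ~ C(tissue) + C(msi) + feature', data=df).fit()
print(anova_lm(lm))   # per-factor F statistics and p-values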
Example #9
def main():
    pyplot.style.use('ggplot')

    secs = ['EWA', 'EWC']
    # get adjusted close prices from Yahoo
    prices_path = os.sep.join(['..', '..', 'data', 'ewa-ewc.pkl'])
    if os.path.exists(prices_path):
        print('loading from cache')
        prices = pandas.read_pickle(prices_path)

    else:
        print('loading from web')
        prices = data.DataReader(secs, 'yahoo', '2011-12-28', '2016-12-28')['Adj Close']
        prices.to_pickle(prices_path)  # cache for subsequent runs

    prices.reset_index(inplace=True)
    in_sample_prices = prices[prices['Date'] < date(2016, 1, 1)]
    out_sample_prices = prices[prices['Date'] >= date(2016, 1, 1)]
    prices = prices.set_index('Date')
    in_sample_prices = in_sample_prices.set_index('Date')
    out_sample_prices = out_sample_prices.set_index('Date')

    Y = in_sample_prices['EWC']
    X = add_constant(in_sample_prices['EWA'])

    regress = OLS(Y, X).fit()
    print(regress.params)

    # visualize the correlation between asset prices over time
    cm = pyplot.cm.get_cmap('jet')
    count = prices['EWA'].count()
    colors = numpy.linspace(0.1, 1, count)
    sc = pyplot.scatter(prices[prices.columns[0]], prices[prices.columns[1]], s=30, c=colors, cmap=cm, edgecolor='k', alpha=0.7)
    cb = pyplot.colorbar(sc)
    cb.ax.set_yticklabels([p.date() for p in prices[::count//9].index])
    pyplot.xlabel(prices.columns[0])
    pyplot.ylabel(prices.columns[1])

    delta = 1e-4
    process_noise = delta / (1 - delta) * numpy.eye(2)
    measurement_noise = 1e-5
    obs_mat = numpy.vstack([prices['EWA'], numpy.ones(prices['EWA'].shape)]).T[:, numpy.newaxis]
    initial_state_estimate = numpy.zeros(2)
    initial_error_covariance = numpy.ones((2, 2))
    kf = KalmanFilter(n_dim_obs=1, n_dim_state=2,
                      initial_state_mean=initial_state_estimate,
                      initial_state_covariance=initial_error_covariance,
                      transition_matrices=numpy.eye(2),
                      observation_matrices=obs_mat,
                      observation_covariance=measurement_noise,
                      transition_covariance=process_noise)

    state_means, state_covs = kf.filter(prices['EWC'].values)
    results = {'slope': state_means[:, 0], 'intercept': state_means[:, 1]}
    output_df = pandas.DataFrame(results, index=prices.index)
    output_df.plot(subplots=True)
    pyplot.show()

    # visualize the correlation between asset prices over time
    cm = pyplot.cm.get_cmap('jet')
    colors = numpy.linspace(0.1, 1, count)
    sc = pyplot.scatter(prices[prices.columns[0]], prices[prices.columns[1]], s=50, c=colors, cmap=cm, edgecolor='k', alpha=0.7)
    cb = pyplot.colorbar(sc)
    cb.ax.set_yticklabels([p.date() for p in prices[::count//9].index])
    pyplot.xlabel(prices.columns[0])
    pyplot.ylabel(prices.columns[1])

    # add regression lines
    step = 100
    xi = numpy.linspace(prices[prices.columns[0]].min(), prices[prices.columns[0]].max(), 2)
    count_states = state_means[::step].size
    colors_l = numpy.linspace(0.1, 1, count_states)
    i = 0
    for beta in state_means[::step]:
        pyplot.plot(xi, beta[0] * xi + beta[1], alpha=.2, lw=1, c=cm(colors_l[i]))
        i += 1

    pyplot.show()

    # slope estimates for the out-of-sample dates (the tail of the filtered series)
    slopes = state_means[-out_sample_prices.index.size:, 0]

    portfolio = out_sample_prices['EWC'] - out_sample_prices['EWA'] * slopes
    portfolio.plot()
    pyplot.show()
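
As a small follow-up sketch, if appended at the end of main() above (it reuses regress and in_sample_prices defined there), the static OLS hedge ratio can be used the same way as the Kalman slopes to form a spread:

    # Static in-sample hedge ratio from the OLS fit above:
    # EWC ~ const + beta * EWA, so the spread is EWC - beta * EWA.
    beta = regress.params['EWA']
    static_spread = in_sample_prices['EWC'] - beta * in_sample_prices['EWA']
    static_spread.plot(title='in-sample spread, static OLS hedge ratio')
    pyplot.show()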
Example #10
 def redraw(self):
     variables = []
     if self.includeallcheckBox.isChecked():
         for i in range(self.interactionlistWidget.count()):
             variables.append(self.interactionlistWidget.item(i).text())
     else:
         for i in range(self.selectedlistWidget.count()):
             variables.append(self.selectedlistWidget.item(i).text())
     nX = len(variables)
     if nX < 1:
         QtWidgets.QMessageBox.critical(self,'Error',"Too few variables selected!",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     Yname = self.YcomboBox.currentText()
     Lc = DS.Lc[DS.Ic]
     Gc = DS.Gc[DS.Ic]
     Lcy = Lc[Gc]
     Lcx = Lc[~Gc]
     data = DS.Raw.loc[DS.Ir, DS.Ic]
     Y = data[Lcy]
     X = data[Lcx]
     if nX > X.shape[0]:
         QtWidgets.QMessageBox.critical(self,'Error',"Factors > Observation! \n Reduce factors.",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     ny = self.YcomboBox.currentIndex()
     Y = Y.values.astype('float')
     X = X.values.astype('float')
     Y = Y[:, ny]
     nr = len(Y)
     basey = [Term([LookupFactor(Yname)])]
     basex = []
     # pull out the intercept term first (avoid mutating the list while iterating)
     if 'Intercept' in variables:
         basex = [INTERCEPT]
         variables.remove('Intercept')
     for term in variables:
         vterm = term.split(':')
         term_lookup = [LookupFactor(x) for x in vterm]
         if len(term_lookup) > 1:
             if vterm[0] == vterm[1]:
                 term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
         basex.append(Term(term_lookup))
     desc = ModelDesc(basey, basex)
     data = np.column_stack((X, Y))
     columns = Lcx.tolist()
     columns.append(Yname)
     data = pd.DataFrame(data, columns=columns)
     y, mx = dmatrices(desc, data, return_type='dataframe')
     dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
     mod = OLS(y, mx)
     DOE.res = mod.fit()
     # calculation of cross-validation
     ypcv = list()
     rcv = list()
     bres = list()
     loo = LeaveOneOut()
     loo.get_n_splits(mx)
     for train_index, test_index in loo.split(mx):
          # .iloc: LeaveOneOut yields positional indices
          mx_train = mx.iloc[train_index, :]
          mx_test = mx.iloc[test_index, :]
          y_train = y.iloc[train_index, :]
          y_test = y.iloc[test_index, :]
         modcv = OLS(y_train, mx_train)
         rescv = modcv.fit()
         ypcv.append(rescv.predict(mx_test).values[0])
         rcv.append(rescv.predict(mx_test).values[0] - y_test.values[0])
         bres.append((rescv.params - DOE.res.params).values**2)
     bres = pd.DataFrame(bres)
     bres = bres.sum() * nr / (nr - 1)
     bres = np.sqrt(bres.values)
     tres = np.abs(DOE.res.params.values / bres)
     pt = 2 * t.pdf(tres, nr)
     fig = Figure()
     ax = fig.add_subplot(111)
     if self.coefradioButton.isChecked():
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (DOE.res.pvalues[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
              ax.set_title(
                  'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.format(
                      DOE.res.conf_int().iloc[0, 0], DOE.res.params[0],
                      DOE.res.conf_int().iloc[0, 1]))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
              cmin = DOE.res.params[1:] - DOE.res.conf_int().iloc[1:, 0]
              cmax = DOE.res.conf_int().iloc[1:, 1] - DOE.res.params[1:]
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
              cmin = DOE.res.conf_int().iloc[0:, 0] - DOE.res.params[0:]
              cmax = DOE.res.conf_int().iloc[0:, 1] - DOE.res.params[0:]
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.coefpredradioButton.isChecked():
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (pt[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
             ax.set_title(
                 'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.
                 format(DOE.res.params[0] - tres[0] * bres[0] / np.sqrt(nr),
                        DOE.res.params[0], DOE.res.params[0] +
                        tres[0] * bres[0] / np.sqrt(nr)))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=tres[1:] * bres[1:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=tres[0:] * bres[0:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.fitradioButton.isChecked():
         yf = DOE.res.fittedvalues.tolist()
         resid = DOE.res.resid.tolist()
         ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('Fitted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         df = DOE.res.df_resid
         vares = np.sum(DOE.res.resid**2) / df
         rmsef = np.sqrt(vares)
         vary = np.var(y.values)
         evar = (1 - vares / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSEF {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsef, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         ax.set_xlabel('Measured Values')
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                  ax.annotate(str(txt), (y.iloc[i, 0], yf[i]))
     elif self.predradioButton.isChecked():
         ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('CV Predicted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('CV Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         ax.set_xlabel('Measured Values')
         df = DS.Raw.shape[0]
         varcv = np.sum(np.array(rcv)**2) / df
         rmsecv = np.sqrt(varcv)
         vary = np.var(y.values)
         evar = (1 - varcv / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSECV {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsecv, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                  ax.annotate(str(txt), (y.iloc[i, 0], ypcv[i]))
     elif self.levradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 20
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = np.diag(np.dot(np.dot(mx, dism), mx.T))
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.surradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 100
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = DOE.res.predict(mx)
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.dismradioButton.isChecked():
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(dism)
         fig.colorbar(cax)
         ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
     elif self.inflradioButton.isChecked():
         mxc = preprocessing.scale(mx.values,
                                   with_mean=True,
                                   with_std=False)
         mxc2 = mxc**2
         infl = np.sum(mxc2, axis=0) * np.diag(dism)
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
         fig.colorbar(cax)
         ax.yaxis.grid(False)
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
         ax.set_xlabel('Inflation Factor')
     if self.XcheckBox.isChecked():
         if self.XlineEdit.text():
             ax.set_xlabel(self.XlineEdit.text())
     else:
         ax.set_xlabel('')
     if self.YcheckBox.isChecked():
         if self.YlineEdit.text():
             ax.set_ylabel(self.YlineEdit.text())
     else:
         ax.set_ylabel('')
     if self.XGcheckBox.isChecked():
         ax.xaxis.grid(True)
     else:
         ax.xaxis.grid(False)
     if self.YGcheckBox.isChecked():
         ax.yaxis.grid(True)
     else:
         ax.yaxis.grid(False)
     if not self.XMcheckBox.isChecked():
         ax.tick_params(axis='x',
                        which='both',
                        bottom='off',
                        top='off',
                        labelbottom='off')
     if not self.YMcheckBox.isChecked():
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
     self.rmmpl()
     self.addmpl(fig)
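
Stripped of the GUI plumbing, the cross-validation part of redraw() boils down to a leave-one-out loop around OLS; a compact standalone sketch with synthetic data and illustrative names:

import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(4)
X = pd.DataFrame(rng.normal(size=(30, 2)), columns=['x1', 'x2'])
y = 1.0 + 2.0 * X['x1'] - 0.5 * X['x2'] + rng.normal(scale=0.2, size=30)
mx = add_constant(X)

predictions, residuals = [], []
for train_index, test_index in LeaveOneOut().split(mx):
    res = OLS(y.iloc[train_index], mx.iloc[train_index]).fit()
    pred = res.predict(mx.iloc[test_index]).values[0]
    predictions.append(pred)
    residuals.append(pred - y.iloc[test_index].values[0])

rmsecv = np.sqrt(np.mean(np.square(residuals)))   # cross-validated RMSE
print(rmsecv)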