Example #1
def fit_quality(df):
    regr_df = df.reset_index()
    # convert the dates to days elapsed since the first observation
    day_nanos = 24 * 60 * 60 * 1E9
    nanos = regr_df['date'] - regr_df['date'].min()
    df2 = pandas.DataFrame(
        data=[nanos.astype('int64') / day_nanos, regr_df['equity']]).transpose()
    # regress equity on elapsed days (no intercept term)
    ols2 = OLS(df2['equity'], df2['date'])
    result = ols2.fit()
    return {
        'p-value F-test': result.f_pvalue,
        'r-squared': result.rsquared,
        'p-value x': result.pvalues[0]
    }
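
A minimal usage sketch for fit_quality (not part of the original snippet): the expected input layout, a DataFrame with a 'date' index and an 'equity' column, is inferred from how the function reads its argument, and the synthetic series here is purely illustrative.

import numpy as np
import pandas
from statsmodels.api import OLS

rng = np.random.default_rng(0)
dates = pandas.date_range('2020-01-01', periods=100, freq='D')
equity = pandas.Series(100 + 0.5 * np.arange(100) + rng.normal(size=100),
                       index=dates, name='equity')
df = equity.to_frame()
df.index.name = 'date'   # fit_quality() expects a 'date' column after reset_index()

print(fit_quality(df))   # {'p-value F-test': ..., 'r-squared': ..., 'p-value x': ...}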
Example #2
    def compute_regression(self):
        if len(self._y_values) < len(self.securities) - 1:
            # not enough values for regression: bail out early
            _LOGGER.error('not enough values for regression')
            return

        dependent = pandas.DataFrame({self.securities[0]: self._y_values})
        independent = pandas.DataFrame(
            {key: self._x_values[count]
             for count, key in enumerate(self.securities[1:])})
        if self._with_constant_term:
            ols = OLS(dependent, add_constant(independent))

        else:
            ols = OLS(dependent, independent)

        self.result = ols.fit()
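
For context, a small standalone sketch of the behaviour that the _with_constant_term flag above toggles: statsmodels' OLS fits through the origin unless a constant column is added explicitly with add_constant. The names x and y are illustrative.

import numpy as np
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 2))
y = 3.0 + x @ np.array([1.5, -2.0]) + rng.normal(scale=0.1, size=100)

no_const = OLS(y, x).fit()                  # regression through the origin
with_const = OLS(y, add_constant(x)).fit()  # intercept column prepended
print(no_const.params)                      # two slopes only
print(with_const.params)                    # intercept plus two slopes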
Example #3
def MultiReg(pdf, DV, IV):
    from statsmodels.api import OLS
    from statsmodels.formula.api import ols
    df = pd.read_excel(pdf)
    y, x1, x2 = df[DV[0]], df[IV[0]], df[IV[1]]
    print(len(x1))
    print(len(x2))
    x = np.column_stack((x1,x2))
    REG = OLS(y,x).fit()
    reg = ols(formula='y~x1+x2',data=df).fit()
    print(REG.summary())
    print('_'*50)
    print(reg.summary())
    eq1, eq2 = list(REG.params), list(reg.params)
    # fitted values: REG has no intercept (two slopes), reg has Intercept + two slopes
    df['REG'] = eq1[0]*x1 + eq1[1]*x2
    df['reg'] = eq2[0] + eq2[1]*x1 + eq2[2]*x2
    plt.plot(df['REG'],'b-')
    plt.plot(y,'rs')
    plt.show()
    plt.clf()
    plt.plot(df['reg'],'b-')
    plt.plot(y,'rs')
    plt.show()
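
A short sketch (synthetic data, illustrative names) of why the two summaries printed by MultiReg differ: the array-style OLS(y, x) call has no intercept, while the formula ols('y ~ x1 + x2') adds one automatically; prepending a constant column makes the two fits agree.

import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant
from statsmodels.formula.api import ols

rng = np.random.default_rng(1)
df = pd.DataFrame({'x1': rng.normal(size=200), 'x2': rng.normal(size=200)})
df['y'] = 2.0 + 0.7 * df['x1'] - 1.3 * df['x2'] + rng.normal(scale=0.1, size=200)

no_const = OLS(df['y'], df[['x1', 'x2']]).fit()                 # no intercept
with_const = OLS(df['y'], add_constant(df[['x1', 'x2']])).fit()
formula_fit = ols('y ~ x1 + x2', data=df).fit()                 # intercept implied

print(no_const.params)      # two coefficients only
print(with_const.params)    # const, x1, x2 -- matches the formula fit
print(formula_fit.params)   # Intercept, x1, x2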
Example #4
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=53)

from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()
linear_regressor.fit(x_train, y_train)

y_predictions = linear_regressor.predict(x_test)

from statsmodels.api import OLS
# prepend a column of ones as the intercept term (dataset assumed to have 50 rows)
x_new = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)

x_opt = x_new[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 1, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 3, 5]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()

x_opt = x_new[:, [0, 3]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()
print(regressor_OLS.summary())
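
The block above performs backward elimination by hand. As a sketch of the same idea (assuming x_new and y as defined in the snippet; the helper name is made up), the loop can be automated by repeatedly dropping the regressor with the highest p-value:

import numpy as np
from statsmodels.api import OLS

def backward_elimination(X, y, significance=0.05):
    """Drop the regressor with the largest p-value until all remaining ones pass."""
    cols = list(range(X.shape[1]))
    while len(cols) > 1:
        model = OLS(endog=y, exog=X[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= significance:
            break                 # every remaining regressor is significant
        del cols[worst]           # remove the least significant column and refit
    return cols, OLS(endog=y, exog=X[:, cols]).fit()

kept_columns, final_model = backward_elimination(x_new, y)
print(kept_columns)
print(final_model.summary())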
Example #5
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1 / 5)

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_predict = regressor.predict(X_test)

X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

from statsmodels.api import OLS

ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_opt = X[:, [0, 1, 3, 4, 5]]
ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_opt = X[:, [0, 3, 4, 5]]
ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_opt = X[:, [0, 3, 5]]
ols_regressor = OLS(endog=Y, exog=X_opt).fit()
ols_regressor.summary()

X_train, X_test, Y_train, Y_test = train_test_split(X_opt, Y, test_size=1 / 5)
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# ***** Predicting the Test set results & showing R-Squared *****
Y_pred = regressor.predict(X_test)
r_squared = regressor.score(X_test, Y_test)

# <<< Building an optimal model using Backward Elimination by statsmodels >>>
from statsmodels.api import OLS
from statsmodels.tools.tools import add_constant

# Adding a column of Ones to X : add_constant method
X = add_constant(X)

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary() # Adj. R-squared = 0.945

# Let's compare P to 0.05
# Remove x2 (index 2) [0.99 >> 0.05]
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary() # Adj. R-squared = 0.946

# Remove x1 (index 1) [0.94 >> 0.05]
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary() # Adj. R-squared = 0.948

# Remove x2 (index 2) [0.602 >> 0.05]
X_opt = X[:, [0, 3, 5]]
regressor_OLS = OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
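
Both variants above prepend an intercept column, one manually with np.ones and one with add_constant; a tiny sketch (arbitrary demo array) showing the two are interchangeable:

import numpy as np
from statsmodels.tools.tools import add_constant

X_demo = np.arange(12, dtype=float).reshape(6, 2)
manual = np.append(arr=np.ones((6, 1)), values=X_demo, axis=1)
helper = add_constant(X_demo)        # prepends the constant column by default
print(np.allclose(manual, helper))   # True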
Example #7
    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.',
                                   fontsize=18):
        """Compute ABOVA one drug and one feature level

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show boxplots for the different factors used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extracts the relevant data and some simple metrics.
        # It is now pretty fast, taking about 45 seconds
        # for 265 drugs and 988 features.
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = odof.drug_id
            results['DRUG_NAME'] = odof.drug_name
            results['DRUG_TARGET'] = odof.drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # with newer version of pandas (v0.19), None are not accepted
                # anymore
                for k in results.keys():
                    if results[k] is None:
                        results[k] = np.nan
                df = pd.DataFrame(results, index=[1])
                return df

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented the 4 following cases explicitly, as described
        # in the doc:
        # - TISSUE + MSI + MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id,
                feature_name,
                formula=self.settings.regression_formula,
                odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # model the order of the factors matters and the first
            # factor is used as a reference, we decided to use the same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and need to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if len(medias):
                    df.drop(medias[0], axis=1, inplace=True)
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed once per drug;
            # no need to redo it for each feature.
            # If the length of Y is too small (e.g., < 20) the results may
            # not be great. This can be checked with the errors.
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):
                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                                                        output='dict',
                                                        odof=odof)
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory,
                               fontsize=fontsize)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode='media')

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df
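
For reference, a compact standalone sketch (made-up column names and synthetic data) of the design matrix that the PANCAN branch above assembles by hand: tissue dummies with the first level dropped as reference, an MSI indicator and the feature column, followed by an array-style OLS fit.

import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(2)
n = 60
tissue = pd.Series(rng.choice(['breast', 'lung', 'skin'], size=n))
msi = rng.integers(0, 2, size=n)
feature = rng.integers(0, 2, size=n)
Y = rng.normal(size=n)

# Dummy-encode the tissue factor and drop the first level as the reference,
# mirroring what the precomputed _tissue_dummies matrix provides.
design = pd.get_dummies(tissue, prefix='C(tissue)', drop_first=True).astype(float)
design['C(msi)[T.1]'] = msi
design['feature'] = feature
design = add_constant(design)

data_lm = OLS(Y, design.values).fit()
print(data_lm.params)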
Example #8
    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.'):
        """Compute ANOVA and various tests on one drug and one feature

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show some plots
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extracts the relevant data and some simple metrics.
        # It is now pretty fast, taking about 45 seconds
        # for 265 drugs and 988 features.
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # or a dataframe; note that index is not relevant here but
                # required.
                df = pd.DataFrame(results, index=[1])
                return df

        # With the data extracted, we can now compute the regression.

        # In R, the regression code is simple since it is based
        # on the formula notation (Y ~ C(msi) + feature).
        # The same is possible with the statsmodels formula interface;
        # however, that relies on patsy, which is very slow compared to
        # statsmodels without formulas.
        #### self._mydata = pd.DataFrame({'Y':self.Y,
        ####    'tissue':self.masked_tissue,
        ####       'msi': self.masked_msi, 'feature':self.masked_features})
        #### self.data_lm = ols('Y ~ C(tissue) + C(msi) + feature',
        ####  data=self._mydata, missing='none').fit() #Specify C is category

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.

        # Instead of the ols function, we use the OLS class, so we cannot
        # use formulas. Instead, we need to build the input data manually.
        # For categorical data (tissue), we need to create the dummy
        # variables, which is done in the constructor once and for all
        # (slow otherwise).
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # model the order of the factors matters and the first
            # factor is used as a reference, we decided to use the same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            #self._mydata = pd.DataFrame({'Y': odof.Y.copy(),
            #    'tissue':odof.masked_tissue,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm2 = ols('Y ~ C(tissue) + C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical

            # from statsmodels.stats.anova import anova_lm
            # import statsmodels.formula.api as smf
            # df  = pd.DataFrame({'Y': odof.Y.copy(),
            #   'tissue':odof.masked_tissue,'media'
            #    odof.masked_media, 'msi':  odof.masked_msi,
            #   'feature': odof.masked_features})
            # lm = smf.ols('Y~C(tissue)+C(media)+C(msi)+feature',
            #    data=df).fit()
            #  anova_lm(lm)
            # The code above gives same answer as the code in gdsctools
            # but is slower

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and need to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            self.Y = odof.Y
            self.EV = df.values
            # The regression and anova summary are done here
            #
            """if self.settings.regression_method == 'ElasticNet':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=self.settings.regression_L1_wt)
            elif self.settings.regression_method == 'OLS':
                self.data_lm = OLS(odof.Y, df.values).fit()
            elif self.settings.regression_method == 'Ridge':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=0)
            elif self.settings.regression_method == 'Lasso':
                self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                        alpha=self.settings.regression_alpha,
                        L1_wt=1)
            """
            # Example of computing the null model?
            # Example of computing pvalues ourselves:
            # with 100 000 samples, we can get a smooth distribution
            # that we can then fit with fitter. A good distribution
            # for the raw data is the uniform one, but if we take the log10,
            # we have lots of possible distributions such as beta, exponweib,
            # gamma, ...
        elif self.settings.include_MSI_factor is True:
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'msi':  odof.masked_msi, 'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ C(msi) + feature',
            #    data=self._mydata).fit() #Specify C for Categorical
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
        else:
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
            #self.data_lm = OLS(odof.Y, df.values).fit()
            #self._mydata = pd.DataFrame({'Y': odof.Y,
            #    'feature': odof.masked_features})
            #self.data_lm = ols('Y ~ feature',
            #    data=self._mydata).fit() #Specify C for Categorical

        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha,
                L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=1)

        key = drug_id + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed once per drug;
            # no need to redo it for each feature.
            # If the length of Y is too small (e.g., < 20) the results may
            # not be great. This can be checked with the errors.
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):

                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output='dict')
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }
            print(self.pvalues_features[key])

        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     output='dict')

        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:
            tissue_PVAL = self.anova_pvalues['tissue']
        except:
            tissue_PVAL = None

        try:
            MSI_PVAL = self.anova_pvalues['msi']
        except:
            MSI_PVAL = None

        try:
            FEATURE_PVAL = self.anova_pvalues['feature']
        except:
            FEATURE_PVAL = None

        try:
            MEDIA_PVAL = self.anova_pvalues['media']
        except:
            MEDIA_PVAL = None

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {
            'FEATURE': feature_name,
            'DRUG_ID': drug_id,
            'DRUG_NAME': drug_name,
            'DRUG_TARGET': drug_target,
            'N_FEATURE_pos': odof.Npos,
            'N_FEATURE_neg': odof.Nneg,
            'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
            'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
            'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
            'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
            'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
            'FEATURE_IC50_effect_size': odof.effectsize_ic50,
            'FEATURE_pos_Glass_delta': odof.pos_glass,
            'FEATURE_neg_Glass_delta': odof.neg_glass,
            'ANOVA_FEATURE_pval': FEATURE_PVAL,
            'ANOVA_TISSUE_pval': tissue_PVAL,
            'ANOVA_MSI_pval': MSI_PVAL,
            'ANOVA_MEDIA_pval': MEDIA_PVAL,
            'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
        }

        # 12% of the time here
        if production is True:
            return results
        else:
            df = pd.DataFrame(results, index=[1])
            return df
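
The commented-out blocks above describe the equivalent but slower formula route; for completeness, a self-contained version of it with synthetic data and illustrative column names:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

rng = np.random.default_rng(3)
n = 80
df = pd.DataFrame({
    'Y': rng.normal(size=n),
    'tissue': rng.choice(['breast', 'lung', 'skin'], size=n),
    'msi': rng.integers(0, 2, size=n),
    'feature': rng.integers(0, 2, size=n),
})

# Formula interface: patsy builds the dummies; C() marks categorical factors.
lm = smf.ols('Y ~ C(tissue) + C(msi) + feature', data=df).fit()
print(anova_lm(lm))   # per-factor F statistics and p-values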
Example #9
def main():
    pyplot.style.use('ggplot')

    secs = ['EWA', 'EWC']
    # get adjusted close prices from Yahoo
    prices_path = os.sep.join(['..', '..', 'data', 'ewa-ewc.pkl'])
    if os.path.exists(prices_path):
        print('loading from cache')
        prices = pandas.read_pickle(prices_path)

    else:
        print('loading from web')
        prices = data.DataReader(secs, 'yahoo', '2011-12-28', '2016-12-28')['Adj Close']
        prices.to_pickle(prices_path)  # cache for subsequent runs

    prices.reset_index(inplace=True)
    in_sample_prices = prices[prices['Date'] < date(2016, 1, 1)]
    out_sample_prices = prices[prices['Date'] >= date(2016, 1, 1)]
    prices = prices.set_index('Date')
    in_sample_prices = in_sample_prices.set_index('Date')
    out_sample_prices = out_sample_prices.set_index('Date')

    Y = in_sample_prices['EWC']
    X = add_constant(in_sample_prices['EWA'])

    regress = OLS(Y, X).fit()
    print(regress.params)

    # visualize the correlation between asset prices over time
    cm = pyplot.cm.get_cmap('jet')
    count = prices['EWA'].count()
    colors = numpy.linspace(0.1, 1, count)
    sc = pyplot.scatter(prices[prices.columns[0]], prices[prices.columns[1]], s=30, c=colors, cmap=cm, edgecolor='k', alpha=0.7)
    cb = pyplot.colorbar(sc)
    cb.ax.set_yticklabels([p.date() for p in prices[::count//9].index])
    pyplot.xlabel(prices.columns[0])
    pyplot.ylabel(prices.columns[1])

    delta = 1e-4
    process_noise = delta / (1 - delta) * numpy.eye(2)
    measurement_noise = 1e-5
    obs_mat = numpy.vstack([prices['EWA'], numpy.ones(prices['EWA'].shape)]).T[:, numpy.newaxis]
    initial_state_estimate = numpy.zeros(2)
    initial_error_covariance = numpy.ones((2, 2))
    kf = KalmanFilter(n_dim_obs=1, n_dim_state=2,
                      initial_state_mean=initial_state_estimate,
                      initial_state_covariance=initial_error_covariance,
                      transition_matrices=numpy.eye(2),
                      observation_matrices=obs_mat,
                      observation_covariance=measurement_noise,
                      transition_covariance=process_noise)

    state_means, state_covs = kf.filter(prices['EWC'].values)
    results = {'slope': state_means[:, 0], 'intercept': state_means[:, 1]}
    output_df = pandas.DataFrame(results, index=prices.index)
    output_df.plot(subplots=True)
    pyplot.show()

    # visualize the correlation between asset prices over time
    cm = pyplot.cm.get_cmap('jet')
    colors = numpy.linspace(0.1, 1, count)
    sc = pyplot.scatter(prices[prices.columns[0]], prices[prices.columns[1]], s=50, c=colors, cmap=cm, edgecolor='k', alpha=0.7)
    cb = pyplot.colorbar(sc)
    cb.ax.set_yticklabels([p.date() for p in prices[::count//9].index])
    pyplot.xlabel(prices.columns[0])
    pyplot.ylabel(prices.columns[1])

    # add regression lines
    step = 100
    xi = numpy.linspace(prices[prices.columns[0]].min(), prices[prices.columns[0]].max(), 2)
    count_states = state_means[::step].size
    colors_l = numpy.linspace(0.1, 1, count_states)
    i = 0
    for beta in state_means[::step]:
        pyplot.plot(xi, beta[0] * xi + beta[1], alpha=.2, lw=1, c=cm(colors_l[i]))
        i += 1

    pyplot.show()

    # slope estimates for the out-of-sample dates (the tail of the filtered series)
    slopes = state_means[-out_sample_prices.index.size:, 0]

    portfolio = out_sample_prices['EWC'] - out_sample_prices['EWA'] * slopes
    portfolio.plot()
    pyplot.show()
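
As a small follow-up sketch, if appended at the end of main() above (it reuses regress and in_sample_prices defined there), the static OLS hedge ratio can be used the same way as the Kalman slopes to form a spread:

    # Static in-sample hedge ratio from the OLS fit above:
    # EWC ~ const + beta * EWA, so the spread is EWC - beta * EWA.
    beta = regress.params['EWA']
    static_spread = in_sample_prices['EWC'] - beta * in_sample_prices['EWA']
    static_spread.plot(title='in-sample spread, static OLS hedge ratio')
    pyplot.show()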
Example #10
 def redraw(self):
     variables = []
     if self.includeallcheckBox.isChecked():
         for i in range(self.interactionlistWidget.count()):
             variables.append(self.interactionlistWidget.item(i).text())
     else:
         for i in range(self.selectedlistWidget.count()):
             variables.append(self.selectedlistWidget.item(i).text())
     nX = len(variables)
     if nX < 1:
         QtWidgets.QMessageBox.critical(self,'Error',"Too few variables selected!",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     Yname = self.YcomboBox.currentText()
     Lc = DS.Lc[DS.Ic]
     Gc = DS.Gc[DS.Ic]
     Lcy = Lc[Gc]
     Lcx = Lc[~Gc]
     data = DS.Raw.loc[DS.Ir, DS.Ic]
     Y = data[Lcy]
     X = data[Lcx]
     if nX > X.shape[0]:
         QtWidgets.QMessageBox.critical(self,'Error',"Factors > Observation! \n Reduce factors.",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     ny = self.YcomboBox.currentIndex()
     Y = Y.values.astype('float')
     X = X.values.astype('float')
     Y = Y[:, ny]
     nr = len(Y)
     basey = [Term([LookupFactor(Yname)])]
     basex = []
     # pull out the intercept term first (avoid mutating the list while iterating)
     if 'Intercept' in variables:
         basex = [INTERCEPT]
         variables.remove('Intercept')
     for term in variables:
         vterm = term.split(':')
         term_lookup = [LookupFactor(x) for x in vterm]
         if len(term_lookup) > 1:
             if vterm[0] == vterm[1]:
                 term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
         basex.append(Term(term_lookup))
     desc = ModelDesc(basey, basex)
     data = np.column_stack((X, Y))
     columns = Lcx.tolist()
     columns.append(Yname)
     data = pd.DataFrame(data, columns=columns)
     y, mx = dmatrices(desc, data, return_type='dataframe')
     dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
     mod = OLS(y, mx)
     DOE.res = mod.fit()
     # calculation of cross-validation
     ypcv = list()
     rcv = list()
     bres = list()
     loo = LeaveOneOut()
     loo.get_n_splits(mx)
     for train_index, test_index in loo.split(mx):
          # .iloc: LeaveOneOut yields positional indices
          mx_train = mx.iloc[train_index, :]
          mx_test = mx.iloc[test_index, :]
          y_train = y.iloc[train_index, :]
          y_test = y.iloc[test_index, :]
         modcv = OLS(y_train, mx_train)
         rescv = modcv.fit()
         ypcv.append(rescv.predict(mx_test).values[0])
         rcv.append(rescv.predict(mx_test).values[0] - y_test.values[0])
         bres.append((rescv.params - DOE.res.params).values**2)
     bres = pd.DataFrame(bres)
     bres = bres.sum() * nr / (nr - 1)
     bres = np.sqrt(bres.values)
     tres = np.abs(DOE.res.params.values / bres)
     pt = 2 * t.pdf(tres, nr)
     fig = Figure()
     ax = fig.add_subplot(111)
     if self.coefradioButton.isChecked():
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (DOE.res.pvalues[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
              ax.set_title(
                  'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.format(
                      DOE.res.conf_int().iloc[0, 0], DOE.res.params[0],
                      DOE.res.conf_int().iloc[0, 1]))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
              cmin = DOE.res.params[1:] - DOE.res.conf_int().iloc[1:, 0]
              cmax = DOE.res.conf_int().iloc[1:, 1] - DOE.res.params[1:]
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
              cmin = DOE.res.conf_int().iloc[0:, 0] - DOE.res.params[0:]
              cmax = DOE.res.conf_int().iloc[0:, 1] - DOE.res.params[0:]
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.coefpredradioButton.isChecked():
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (pt[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
             ax.set_title(
                 'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.
                 format(DOE.res.params[0] - tres[0] * bres[0] / np.sqrt(nr),
                        DOE.res.params[0], DOE.res.params[0] +
                        tres[0] * bres[0] / np.sqrt(nr)))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=tres[1:] * bres[1:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=tres[0:] * bres[0:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.fitradioButton.isChecked():
         yf = DOE.res.fittedvalues.tolist()
         resid = DOE.res.resid.tolist()
         ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('Fitted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         df = DOE.res.df_resid
         vares = np.sum(DOE.res.resid**2) / df
         rmsef = np.sqrt(vares)
         vary = np.var(y.values)
         evar = (1 - vares / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSEF {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsef, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         ax.set_xlabel('Measured Values')
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                  ax.annotate(str(txt), (y.iloc[i, 0], yf[i]))
     elif self.predradioButton.isChecked():
         ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('CV Predicted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('CV Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         ax.set_xlabel('Measured Values')
         df = DS.Raw.shape[0]
         varcv = np.sum(np.array(rcv)**2) / df
         rmsecv = np.sqrt(varcv)
         vary = np.var(y.values)
         evar = (1 - varcv / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSECV {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsecv, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                  ax.annotate(str(txt), (y.iloc[i, 0], ypcv[i]))
     elif self.levradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 20
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = np.diag(np.dot(np.dot(mx, dism), mx.T))
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.surradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 100
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = DOE.res.predict(mx)
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.dismradioButton.isChecked():
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(dism)
         fig.colorbar(cax)
         ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
     elif self.inflradioButton.isChecked():
         mxc = preprocessing.scale(mx.values,
                                   with_mean=True,
                                   with_std=False)
         mxc2 = mxc**2
         infl = np.sum(mxc2, axis=0) * np.diag(dism)
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
         fig.colorbar(cax)
         ax.yaxis.grid(False)
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
         ax.set_xlabel('Inflation Factor')
     if self.XcheckBox.isChecked():
         if self.XlineEdit.text():
             ax.set_xlabel(self.XlineEdit.text())
     else:
         ax.set_xlabel('')
     if self.YcheckBox.isChecked():
         if self.YlineEdit.text():
             ax.set_ylabel(self.YlineEdit.text())
     else:
         ax.set_ylabel('')
     if self.XGcheckBox.isChecked():
         ax.xaxis.grid(True)
     else:
         ax.xaxis.grid(False)
     if self.YGcheckBox.isChecked():
         ax.yaxis.grid(True)
     else:
         ax.yaxis.grid(False)
     if not self.XMcheckBox.isChecked():
         ax.tick_params(axis='x',
                        which='both',
                        bottom='off',
                        top='off',
                        labelbottom='off')
     if not self.YMcheckBox.isChecked():
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
     self.rmmpl()
     self.addmpl(fig)
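
Stripped of the GUI plumbing, the cross-validation part of redraw() boils down to a leave-one-out loop around OLS; a compact standalone sketch with synthetic data and illustrative names:

import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(4)
X = pd.DataFrame(rng.normal(size=(30, 2)), columns=['x1', 'x2'])
y = 1.0 + 2.0 * X['x1'] - 0.5 * X['x2'] + rng.normal(scale=0.2, size=30)
mx = add_constant(X)

predictions, residuals = [], []
for train_index, test_index in LeaveOneOut().split(mx):
    res = OLS(y.iloc[train_index], mx.iloc[train_index]).fit()
    pred = res.predict(mx.iloc[test_index]).values[0]
    predictions.append(pred)
    residuals.append(pred - y.iloc[test_index].values[0])

rmsecv = np.sqrt(np.mean(np.square(residuals)))   # cross-validated RMSE
print(rmsecv)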