def fit_quality(df):
    """Regress equity on elapsed days and report fit statistics.

    The frame is flattened with ``reset_index()`` and must then expose a
    ``date`` column and an ``equity`` column.  Elapsed time is measured in
    days since the earliest date.  The regression has no constant term:
    ``OLS`` is called with the day column as the only regressor.

    :param df: price/equity frame with ``date`` (index or column) and
        ``equity``.
    :return: dict with the F-test p-value, the R-squared and the p-value
        of the single slope coefficient.
    """
    regr_df = df.reset_index()
    day_nanos = 24 * 60 * 60 * 1E9
    elapsed = regr_df['date'] - regr_df['date'].min()
    # 'int64' (not the platform-dependent ``int``) so the nanosecond count
    # never overflows on platforms where ``int`` maps to 32 bits.
    elapsed_days = elapsed.astype('int64') / day_nanos
    df2 = pandas.DataFrame(
        data=[elapsed_days, regr_df['equity']]).transpose()
    result = OLS(df2['equity'], df2['date']).fit()
    return {
        'p-value F-test': result.f_pvalue,
        'r-squared': result.rsquared,
        # pvalues is a Series labelled by regressor name; access by
        # position explicitly instead of relying on the deprecated
        # integer fallback of ``Series.__getitem__``.
        'p-value x': result.pvalues.iloc[0]
    }
def compute_regression(self):
    """Fit an OLS regression of the first security on the remaining ones.

    The dependent variable is ``self._y_values`` (labelled with
    ``self.securities[0]``); each remaining security ``i`` contributes
    ``self._x_values[i]`` as a regressor.  A constant column is added when
    ``self._with_constant_term`` is truthy.  The fitted result is stored in
    ``self.result``.

    When there are fewer observations than regressors an error is logged
    and the method returns without touching ``self.result``.
    """
    if len(self._y_values) < len(self.securities) - 1:
        # not enough values for regression: previously this only logged
        # and then went on to fit anyway.
        _LOGGER.error('not enough values for regression')
        return

    dependent = pandas.DataFrame({self.securities[0]: self._y_values})
    independent = pandas.DataFrame({
        key: self._x_values[count]
        for count, key in enumerate(self.securities[1:])
    })
    if self._with_constant_term:
        exog = add_constant(independent)
    else:
        exog = independent
    self.result = OLS(dependent, exog).fit()
def MultiReg(pdf, DV, IV):
    """Fit and plot a two-regressor linear model with and without intercept.

    Two models are fitted on the same data: ``REG`` through the array
    interface (no constant term) and ``reg`` through the formula interface
    (``y~x1+x2``, which adds an intercept).  Both summaries are printed and
    each model's fitted values are plotted against the observations.

    :param pdf: path (or buffer) of the Excel file read with
        ``pd.read_excel``.
    :param DV: sequence whose first item is the dependent-variable column.
    :param IV: sequence whose first two items are the regressor columns.
    """
    # OLS is no longer exported by statsmodels.formula.api; import it from
    # its defining module, which exists in every statsmodels release.
    from statsmodels.formula.api import ols
    from statsmodels.regression.linear_model import OLS

    df = pd.read_excel(pdf)
    y, x1, x2 = df[DV[0]], df[IV[0]], df[IV[1]]
    print(len(x1))
    print(len(x2))

    x = np.column_stack((x1, x2))
    REG = OLS(y, x).fit()
    # The formula names y/x1/x2 are resolved from the local variables above.
    reg = ols(formula='y~x1+x2', data=df).fit()
    print(REG.summary())
    print('_' * 50)
    print(reg.summary())

    eq1, eq2 = list(REG.params), list(reg.params)
    # Each fitted series uses only its own model's coefficients, and the
    # second slope multiplies x2 (the original mixed eq1/eq2 and used x1
    # twice).
    df['REG'] = eq1[0] * x1 + eq1[1] * x2
    df['reg'] = eq2[0] + eq2[1] * x1 + eq2[2] * x2

    plt.plot(df['REG'], 'b-')
    plt.plot(y, 'rs')
    plt.show()
    plt.clf()
    plt.plot(df['reg'], 'b-')
    plt.plot(y, 'rs')
    plt.show()
# Fit a plain linear regression on a train/test split, then run a manual
# backward elimination with statsmodels.  ``x`` and ``y`` are expected to be
# defined earlier in the script.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=53)

from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()
linear_regressor.fit(x_train, y_train)
y_predictions = linear_regressor.predict(x_test)

# OLS is no longer exported by statsmodels.formula.api.
from statsmodels.regression.linear_model import OLS

# Prepend the intercept column; its length follows the data instead of
# being hard-coded to 50 rows.
x_new = np.append(arr=np.ones((len(x), 1)).astype(int), values=x, axis=1)

# Successive column subsets of the elimination; each step drops one
# regressor from the previous model.  Only the last model is reported.
for kept_columns in ([0, 1, 2, 3, 4, 5],
                     [0, 1, 3, 4, 5],
                     [0, 3, 4, 5],
                     [0, 3, 5],
                     [0, 3]):
    x_opt = x_new[:, kept_columns]
    regressor_OLS = OLS(endog=y, exog=x_opt).fit()
print(regressor_OLS.summary())
# Linear regression on a train/test split followed by a manual backward
# elimination; ``X`` and ``Y`` are expected to be defined earlier.
# sklearn.cross_validation was removed from scikit-learn; the function
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# 0.2 rather than ``1 / 5``, which is 0 under Python 2 integer division.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_predict = regressor.predict(X_test)

# Prepend the intercept column, sized from the data rather than fixed at 50.
X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)

# OLS is no longer exported by statsmodels.formula.api.
from statsmodels.regression.linear_model import OLS

# Each step drops one regressor from the previous model.  The summary is
# built at every step; its value is only displayed when the statements are
# run in an interactive session.
for kept_columns in ([0, 1, 2, 3, 4, 5],
                     [0, 1, 3, 4, 5],
                     [0, 3, 4, 5],
                     [0, 3, 5]):
    X_opt = X[:, kept_columns]
    ols_regressor = OLS(endog=Y, exog=X_opt).fit()
    ols_regressor.summary()

# Re-split using only the retained columns.
X_train, X_test, Y_train, Y_test = train_test_split(X_opt, Y, test_size=0.2)
# Fit the linear model; LinearRegression, X_train/X_test, y_train/y_test,
# X and y are expected to be defined earlier in the script.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# ***** Predicting the Test set results & showing R-Squared *****
y_pred = regressor.predict(X_test)
r_squared = regressor.score(X_test, y_test)

# <<< Building an optimal model using Backward Elimination by statsmodels >>>
# OLS is no longer exported by statsmodels.formula.api; import it from the
# module that defines it.
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

# Adding a column of Ones to X : add_constant method
X = add_constant(X)

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
# Adj. R-squared = 0.945
# Let's compare P to 0.05

# Remove x2 (index 2) [0.99 >> 0.05]
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
# Adj. R-squared = 0.946

# Remove x1 (index 1) [0.94 >> 0.05]
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
# Adj. R-squared = 0.948

# Remove x2 of the current model, i.e. column index 4 of X [0.602 >> 0.05]
X_opt = X[:, [0, 3, 5]]
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.',
                               fontsize=18):
    """Compute ANOVA at the one drug / one feature level

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show boxplots with the different factor used
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :param str directory: where to save the figure.
    :param int fontsize: font size forwarded to :class:`BoxPlots`.
    :return: a dictionary (production mode) or a one-row dataframe.
    :raises ValueError: if the drug or the feature is unknown.

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    def _finalise(res):
        # Production mode hands back the dictionary as is; otherwise it
        # is wrapped in a one-row dataframe (index is required but not
        # meaningful).
        if production is True:
            return res
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for k in res.keys():
            if res[k] is None:
                res[k] = np.nan
        return pd.DataFrame(res, index=[1])

    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # we start index at 3 to skip tissue/name/msi
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = odof.drug_id
        results['DRUG_NAME'] = odof.drug_name
        results['DRUG_TARGET'] = odof.drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        return _finalise(results)

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical components.

    # If a formula is provided, use statsmodels. Since it is slowish,
    # we implemented several cases as described in the doc for the 4
    # following cases:
    #  - TISSUE + MSI + MEDIA + FEATURE
    #  - TISSUE + MSI + FEATURE
    #  - MSI + FEATURE
    #  - FEATURE
    if self.settings.regression_formula not in ["auto", None, ""]:
        # This populates the anova_pvalues attribute itself
        _ = self.anova_one_drug_one_feature_custom(
            drug_id, feature_name,
            formula=self.settings.regression_formula, odof=odof)
        results = self._set_odof_results(self.anova_pvalues, odof)
    else:
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # The first tissue column is dropped.  A new frame is
            # built rather than dropping in place on the .loc slice.
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df = df.drop(tissues[0], axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns
                          if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns
                          if x.startswith('C(media')]
                if len(medias):
                    df = df.drop(medias[0], axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features
        elif self.settings.include_MSI_factor is True:
            # intercept + msi + feature
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
        else:
            # intercept + feature
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()
        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)
        results = self._set_odof_results(self.anova_pvalues, odof)

    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # NOTE(review): ``df`` is only defined when no custom regression
        # formula is used; sampling together with a custom formula fails
        # below on the unbound name -- confirm how that should behave.

        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        pb = Progress(N, 20)
        for i in range(0, N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict',
                                                    odof=odof)
            # msi and tissue are only present for some models
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass
            #pb.animate(i+1)

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory, fontsize=fontsize)
        boxplot.boxplot_association(fignum=1)

        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')
        if self.settings.include_media_factor:
            boxplot.boxplot_pancan(fignum=3, mode='media')

    # about 30% of the time spent in creating the DataFrame...
    return _finalise(results)
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.'):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show some plots
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :param str directory: where to save the figure.
    :return: a dictionary (production mode) or a one-row dataframe.
    :raises ValueError: if the drug, the feature or the configured
        regression method is unknown.

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # we start index at 3 to skip tissue/name/msi
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    drug_name = self.drug_decode.get_name(drug_id)
    drug_target = self.drug_decode.get_target(drug_id)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = drug_id
        results['DRUG_NAME'] = drug_name
        results['DRUG_TARGET'] = drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            # return a dict
            return results
        # or a dataframe; note that index is not relevant here but
        # required.
        return pd.DataFrame(results, index=[1])

    # With the data extracted, we can now compute the regression.
    # In R or statsmodels, the regression code is simple since
    # it is based on the formula notation (Y~C(msi)+feature).
    # This relies on patsy, which is very slow as compared to
    # statsmodels without formula, so the design matrix is built
    # manually and passed to OLS. For categorical data (tissue), the
    # dummy variables are created in the constructor once for all
    # (slow otherwise).

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical
    # components.
    if self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.ix was removed from pandas; the selection is label based.)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)

        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values

        self.Y = odof.Y
        self.EV = df.values
    elif self.settings.include_MSI_factor is True:
        # equivalent of the formula 'Y ~ C(msi) + feature'
        df = pd.DataFrame()
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
    else:
        # equivalent of the formula 'Y ~ feature'
        df = pd.DataFrame()
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

    # The regression itself, plain or regularised.
    method = self.settings.regression_method
    if method == 'ElasticNet':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=self.settings.regression_L1_wt)
    elif method == 'OLS':
        self.data_lm = OLS(odof.Y, df.values).fit()
    elif method == 'Ridge':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=0)
    elif method == 'Lasso':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=1)
    else:
        # Without this, a stale self.data_lm from a previous call would
        # silently be used for the ANOVA below.
        raise ValueError('Unknown regression method %s' % method)

    # str() so that integer drug identifiers are accepted as well.
    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        pb = Progress(N, 20)
        for i in range(0, N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict')
            # msi and tissue are only present for some models
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass
            #pb.animate(i+1)

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }
        print(self.pvalues_features[key])

    self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                 output='dict')

    # Store the pvalues. Some factors may be missing from the model,
    # in which case None is reported.
    pvals = {}
    for factor in ('tissue', 'msi', 'feature', 'media'):
        try:
            pvals[factor] = self.anova_pvalues[factor]
        except KeyError:
            pvals[factor] = None

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)

        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')

    results = {
        'FEATURE': feature_name,
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_name,
        'DRUG_TARGET': drug_target,
        'N_FEATURE_pos': odof.Npos,
        'N_FEATURE_neg': odof.Nneg,
        'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
        'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
        'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
        'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
        'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
        'FEATURE_IC50_effect_size': odof.effectsize_ic50,
        'FEATURE_pos_Glass_delta': odof.pos_glass,
        'FEATURE_neg_Glass_delta': odof.neg_glass,
        'ANOVA_FEATURE_pval': pvals['feature'],
        'ANOVA_TISSUE_pval': pvals['tissue'],
        'ANOVA_MSI_pval': pvals['msi'],
        'ANOVA_MEDIA_pval': pvals['media'],
        'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
    }

    # 12% of the time here
    if production is True:
        return results
    return pd.DataFrame(results, index=[1])
def _scatter_prices_over_time(prices, cmap, count, marker_size):
    """Scatter the first price column against the second, coloured by time."""
    colors = numpy.linspace(0.1, 1, count)
    sc = pyplot.scatter(prices[prices.columns[0]],
                        prices[prices.columns[1]],
                        s=marker_size, c=colors, cmap=cmap,
                        edgecolor='k', alpha=0.7)
    cb = pyplot.colorbar(sc)
    cb.ax.set_yticklabels([p.date() for p in prices[::count // 9].index])
    pyplot.xlabel(prices.columns[0])
    pyplot.ylabel(prices.columns[1])


def main():
    """Estimate the EWA/EWC hedge ratio with OLS and with a Kalman filter.

    Adjusted close prices are loaded from a local pickle cache when it
    exists, otherwise downloaded and cached.  A static OLS regression of
    EWC on EWA is fitted on the data before 2016; a Kalman filter then
    tracks a time-varying slope and intercept over the whole history.
    Several figures are shown along the way, ending with the spread
    ``EWC - slope * EWA`` on the data from 2016 onwards.
    """
    pyplot.style.use('ggplot')
    secs = ['EWA', 'EWC']

    # get adjusted close prices from Yahoo
    # The cache path previously carried a stray 'w' component and the
    # download was pickled to a different file, so the cache never hit.
    prices_path = os.path.join('..', '..', 'data', 'ewa-ewc.pkl')
    if os.path.exists(prices_path):
        print('loading from cache')
        prices = pandas.read_pickle(prices_path)
    else:
        print('loading from web')
        prices = data.DataReader(secs, 'yahoo',
                                 '2011-12-28', '2016-12-28')['Adj Close']
        cache_dir = os.path.dirname(prices_path)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        prices.to_pickle(prices_path)

    prices.reset_index(inplace=True)
    # Timestamp rather than datetime.date: comparing a datetime64 column
    # with date objects is not supported by recent pandas.
    split_date = pandas.Timestamp(2016, 1, 1)
    in_sample_prices = prices[prices['Date'] < split_date]
    out_sample_prices = prices[prices['Date'] >= split_date]
    prices = prices.set_index('Date')
    in_sample_prices = in_sample_prices.set_index('Date')
    out_sample_prices = out_sample_prices.set_index('Date')

    # static hedge ratio on the in-sample period
    Y = in_sample_prices['EWC']
    X = add_constant(in_sample_prices['EWA'])
    regress = OLS(Y, X).fit()
    print(regress.params)

    # visualize the correlation between asset prices over time
    cm = pyplot.get_cmap('jet')
    count = prices['EWA'].count()
    _scatter_prices_over_time(prices, cm, count, 30)

    # Kalman filter whose state is (slope, intercept) and whose
    # observation matrix at each step is [EWA price, 1].
    delta = 1e-4
    process_noise = delta / (1 - delta) * numpy.eye(2)
    measurement_noise = 1e-5
    obs_mat = numpy.vstack(
        [prices['EWA'],
         numpy.ones(prices['EWA'].shape)]).T[:, numpy.newaxis]
    initial_state_estimate = numpy.zeros(2)
    initial_error_covariance = numpy.ones((2, 2))
    kf = KalmanFilter(n_dim_obs=1, n_dim_state=2,
                      initial_state_mean=initial_state_estimate,
                      initial_state_covariance=initial_error_covariance,
                      transition_matrices=numpy.eye(2),
                      observation_matrices=obs_mat,
                      observation_covariance=measurement_noise,
                      transition_covariance=process_noise)
    state_means, state_covs = kf.filter(prices['EWC'].values)

    results = {'slope': state_means[:, 0], 'intercept': state_means[:, 1]}
    output_df = pandas.DataFrame(results, index=prices.index)
    output_df.plot(subplots=True)
    pyplot.show()

    # visualize the correlation between asset prices over time
    _scatter_prices_over_time(prices, cm, count, 50)

    # add regression lines, one every `step` filter states
    step = 100
    xi = numpy.linspace(prices[prices.columns[0]].min(),
                        prices[prices.columns[0]].max(), 2)
    sampled_states = state_means[::step]
    # One colour per sampled state.  ``.size`` counted both state
    # components, so only half of the colour range was ever used.
    colors_l = numpy.linspace(0.1, 1, len(sampled_states))
    for color, beta in zip(colors_l, sampled_states):
        pyplot.plot(xi, beta[0] * xi + beta[1],
                    alpha=.2, lw=1, c=cm(color))
    pyplot.show()

    # Spread on the out-of-sample period using the slopes estimated for
    # those same dates (previously the first N slopes of the history
    # were used, whatever their dates).
    slopes = output_df.loc[out_sample_prices.index, 'slope']
    portfolio = out_sample_prices['EWC'] - out_sample_prices['EWA'] * slopes
    portfolio.plot()
    pyplot.show()
def redraw(self):
    """Fit the selected DOE model and redraw the plot chosen in the dialog.

    The model terms come from the interaction list (when "include all" is
    checked) or from the selected list.  The response is the column chosen
    in ``YcomboBox``.  The model is fitted by OLS and stored in
    ``DOE.res``; a leave-one-out cross-validation provides predictions,
    residuals and jackknife coefficient errors.  Depending on the checked
    radio button, one of the following is drawn: coefficients,
    coefficients with jackknife errors, fitted vs measured, CV-predicted
    vs measured, a contour map of the ``x' (X'X)^-1 x`` term, a response
    surface, the dispersion matrix or the inflation factors.  The figure
    is finally handed to ``self.addmpl``.

    Returns ``()`` early when the selection is invalid or a dialog is
    cancelled, ``None`` otherwise.
    """
    if self.includeallcheckBox.isChecked():
        source_list = self.interactionlistWidget
    else:
        source_list = self.selectedlistWidget
    variables = [source_list.item(i).text()
                 for i in range(source_list.count())]
    nX = len(variables)
    if nX < 1:
        QtWidgets.QMessageBox.critical(self, 'Error',
                                       "Too few variables selected!",
                                       QtWidgets.QMessageBox.Ok)
        return ()

    Yname = self.YcomboBox.currentText()
    Lc = DS.Lc[DS.Ic]
    Gc = DS.Gc[DS.Ic]
    Lcy = Lc[Gc]
    # ``~`` is the logical negation of a boolean mask; unary minus on
    # booleans is rejected by numpy.
    Lcx = Lc[~Gc]
    data = DS.Raw.loc[DS.Ir, DS.Ic]
    Y = data[Lcy]
    X = data[Lcx]
    if nX > X.shape[0]:
        QtWidgets.QMessageBox.critical(
            self, 'Error', "Factors > Observation! \n Reduce factors.",
            QtWidgets.QMessageBox.Ok)
        return ()
    ny = self.YcomboBox.currentIndex()
    Y = Y.values.astype('float')
    X = X.values.astype('float')
    Y = Y[:, ny]
    nr = len(Y)

    # Build the patsy model description.
    basey = [Term([LookupFactor(Yname)])]
    basex = []
    if 'Intercept' in variables:
        basex = [INTERCEPT]
        # Filter into a new list instead of removing while iterating.
        variables = [v for v in variables if v != 'Intercept']
    for term in variables:
        vterm = term.split(':')
        term_lookup = [LookupFactor(x) for x in vterm]
        if len(term_lookup) > 1 and vterm[0] == vterm[1]:
            # "a:a" denotes the quadratic term of a
            term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
        basex.append(Term(term_lookup))
    desc = ModelDesc(basey, basex)

    data = np.column_stack((X, Y))
    columns = Lcx.tolist()
    columns.append(Yname)
    data = pd.DataFrame(data, columns=columns)
    y, mx = dmatrices(desc, data, return_type='dataframe')
    # dispersion matrix (X'X)^-1
    dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
    mod = OLS(y, mx)
    DOE.res = mod.fit()

    # calculation of cross-validation
    ypcv = list()
    rcv = list()
    bres = list()
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(mx):
        # split indices are positional: .iloc (``.ix`` no longer exists)
        mx_train = mx.iloc[train_index, :]
        mx_test = mx.iloc[test_index, :]
        y_train = y.iloc[train_index, :]
        y_test = y.iloc[test_index, :]
        rescv = OLS(y_train, mx_train).fit()
        predicted = rescv.predict(mx_test).values[0]
        ypcv.append(predicted)
        rcv.append(predicted - y_test.values[0])
        bres.append((rescv.params - DOE.res.params).values**2)
    bres = pd.DataFrame(bres)
    bres = bres.sum() * nr / (nr - 1)
    bres = np.sqrt(bres.values)
    tres = np.abs(DOE.res.params.values / bres)
    # Two-sided p-value from the survival function; the density
    # (``t.pdf``) is not a probability of exceedance.
    pt = 2 * t.sf(tres, nr)

    def _contour_figure(npts, evaluate):
        # Ask the user for the two plotted factors and the constants,
        # evaluate ``evaluate(design_matrix)`` on an npts x npts grid and
        # draw it as a filled contour map.  Returns (figure, axes), or
        # None when the dialog is cancelled or the choice is invalid.
        Ftable = surtabDlg.launch(None)
        if len(np.shape(Ftable)) == 0:
            return None
        if np.argmax(Ftable['X axis'].values) == np.argmax(
                Ftable['Y axis'].values):
            QtWidgets.QMessageBox.critical(
                self, 'Error', "Two variables on the same axis",
                QtWidgets.QMessageBox.Ok)
            return None
        cfig = plt.figure()
        cax = cfig.add_subplot(111)
        xname = Ftable[(Ftable['X axis'] == True).values].index[0]
        yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
        is_constant = (Ftable['Constant'] == True).values
        cname = Ftable[is_constant].index.tolist()
        cvalue = Ftable.loc[is_constant, 'value'].values.tolist()
        gx = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
        gy = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
        # x varies slowest, y fastest
        mesh_x, mesh_y = np.meshgrid(gx, gy, indexing='ij')
        px = mesh_x.ravel()
        py = mesh_y.ravel()
        # The response column is only a placeholder required by dmatrices.
        grid = pd.DataFrame({xname: px, yname: py, Yname: px})
        xtitle = ''
        for name, value in zip(cname, cvalue):
            xtitle = xtitle + name + ' = ' + str(value)
            grid[name] = np.ones(npts**2) * float(value)
        _, gmx = dmatrices(desc, grid, return_type='dataframe')
        pz = np.asarray(evaluate(gmx))
        # The points already lie on the regular grid, so the surface is a
        # plain reshape (rows = y, columns = x); ``mlab.griddata`` is no
        # longer part of matplotlib.
        z = pz.reshape(npts, npts).T
        plt.contour(gx, gy, z, 15, linewidths=0.5, colors='k')
        plt.contourf(gx, gy, z, 15, cmap=plt.cm.rainbow)
        plt.colorbar()
        cax.set_xlabel(xname)
        cax.set_ylabel(yname)
        cax.set_title(xtitle)
        cax.set_xlim([px.min(), px.max()])
        cax.set_ylim([py.min(), py.max()])
        return cfig, cax

    fig = Figure()
    ax = fig.add_subplot(111)
    params = DOE.res.params
    has_intercept = params.index[0] == 'Intercept'

    if self.coefradioButton.isChecked():
        ci = DOE.res.conf_int()
        if has_intercept:
            ind = np.arange(1, len(params))
            vcol = ['red' if DOE.res.pvalues.iloc[i] < 0.05 else 'blue'
                    for i in ind]
            ax.bar(ind, params[1:], align='center', color=vcol)
            ax.set_title(
                'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.
                format(ci.iloc[0, 0], params.iloc[0], ci.iloc[0, 1]))
            ax.set_xticks(ind)
            ax.set_xticklabels(params.index[1:], rotation='vertical')
            cmin = params[1:] - ci.iloc[1:, 0]
            cmax = ci.iloc[1:, 1] - params[1:]
            ax.errorbar(ind, params[1:],
                        yerr=[cmin.values, cmax.values],
                        fmt='o', ecolor='green')
        else:
            ind = np.arange(1, len(params) + 1)
            ax.bar(ind, params, align='center')
            ax.set_title('Coefficient Value : None Intercept')
            ax.set_xticks(ind)
            ax.set_xticklabels(params.index[0:], rotation='vertical')
            # lower error = estimate - lower bound (must be positive)
            cmin = params[0:] - ci.iloc[0:, 0]
            cmax = ci.iloc[0:, 1] - params[0:]
            ax.errorbar(ind, params[0:],
                        yerr=[cmin.values, cmax.values],
                        fmt='o', ecolor='green')
        ax.set_xlabel('Coefficient Number (except Intercept)')
        ax.annotate('red bar: significance 5%', xy=(0.75, 0.95),
                    xycoords='figure fraction', fontsize=8)
    elif self.coefpredradioButton.isChecked():
        if has_intercept:
            ind = np.arange(1, len(params))
            vcol = ['red' if pt[i] < 0.05 else 'blue' for i in ind]
            ax.bar(ind, params[1:], align='center', color=vcol)
            half_width = tres[0] * bres[0] / np.sqrt(nr)
            ax.set_title(
                'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.
                format(params.iloc[0] - half_width, params.iloc[0],
                       params.iloc[0] + half_width))
            ax.set_xticks(ind)
            ax.set_xticklabels(params.index[1:], rotation='vertical')
            ax.errorbar(ind, params[1:],
                        yerr=tres[1:] * bres[1:] / np.sqrt(nr),
                        fmt='o', ecolor='green')
        else:
            ind = np.arange(1, len(params) + 1)
            ax.bar(ind, params, align='center')
            ax.set_title('Coefficient Value : None Intercept')
            ax.set_xticks(ind)
            ax.set_xticklabels(params.index[0:], rotation='vertical')
            ax.errorbar(ind, params[0:],
                        yerr=tres[0:] * bres[0:] / np.sqrt(nr),
                        fmt='o', ecolor='green')
        ax.set_xlabel('Coefficient Number (except Intercept)')
        ax.annotate('red bar: significance 5%', xy=(0.75, 0.95),
                    xycoords='figure fraction', fontsize=8)
    elif self.fitradioButton.isChecked():
        yf = DOE.res.fittedvalues.tolist()
        resid = DOE.res.resid.tolist()
        ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
        ax.set_ylabel('Fitted Values', color='red')
        ax.tick_params('y', colors='red')
        ax1 = ax.twinx()
        ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
        ax1.set_ylabel('Residuals', color='blue')
        ax1.tick_params('y', colors='blue')
        xmin, xmax = ax.get_xlim()
        ax.set_ylim([xmin, xmax])
        df = DOE.res.df_resid
        vares = np.sum(DOE.res.resid**2) / df
        rmsef = np.sqrt(vares)
        vary = np.var(y.values)
        evar = (1 - vares / vary) * 100
        ax.set_title(
            'df {:3.0f}; RMSEF {:6.2f}; Exp.Var.{:5.1f}%'.format(
                df, rmsef, evar))
        ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
        ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
        ax.set_xlabel('Measured Values')
        if self.VcheckBox.isChecked():
            Lr = DOE.res.model.data.row_labels
            for i, txt in enumerate(Lr):
                ax.annotate(str(txt), (y.iloc[i, 0], yf[i]))
    elif self.predradioButton.isChecked():
        ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
        ax.set_ylabel('CV Predicted Values', color='red')
        ax.tick_params('y', colors='red')
        ax1 = ax.twinx()
        ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
        ax1.set_ylabel('CV Residuals', color='blue')
        ax1.tick_params('y', colors='blue')
        xmin, xmax = ax.get_xlim()
        ax.set_ylim([xmin, xmax])
        ax.set_xlabel('Measured Values')
        df = DS.Raw.shape[0]
        varcv = np.sum(np.array(rcv)**2) / df
        rmsecv = np.sqrt(varcv)
        vary = np.var(y.values)
        evar = (1 - varcv / vary) * 100
        ax.set_title(
            'df {:3.0f}; RMSECV {:6.2f}; Exp.Var.{:5.1f}%'.format(
                df, rmsecv, evar))
        ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
        ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
        if self.VcheckBox.isChecked():
            Lr = DOE.res.model.data.row_labels
            for i, txt in enumerate(Lr):
                ax.annotate(str(txt), (y.iloc[i, 0], ypcv[i]))
    elif self.levradioButton.isChecked():
        # x' (X'X)^-1 x for every grid point: the diagonal of
        # M dism M', computed row by row.
        drawn = _contour_figure(
            20,
            lambda m: np.sum(np.dot(m.values, dism) * m.values, axis=1))
        if drawn is None:
            return ()
        fig, ax = drawn
    elif self.surradioButton.isChecked():
        # response surface predicted by the fitted model
        drawn = _contour_figure(100, DOE.res.predict)
        if drawn is None:
            return ()
        fig, ax = drawn
    elif self.dismradioButton.isChecked():
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(dism)
        fig.colorbar(cax)
        ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
    elif self.inflradioButton.isChecked():
        mxc = preprocessing.scale(mx.values, with_mean=True,
                                  with_std=False)
        mxc2 = mxc**2
        infl = np.sum(mxc2, axis=0) * np.diag(dism)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
        fig.colorbar(cax)
        ax.yaxis.grid(False)
        # Booleans, not 'off': non-empty strings are truthy and would
        # leave the ticks visible.
        ax.tick_params(axis='y', which='both', left=False, right=False,
                       labelleft=False)
        ax.set_xlabel('Inflation Factor')

    # User overrides for labels, grids and tick marks.
    if self.XcheckBox.isChecked():
        if self.XlineEdit.text():
            ax.set_xlabel(self.XlineEdit.text())
    else:
        ax.set_xlabel('')
    if self.YcheckBox.isChecked():
        if self.YlineEdit.text():
            ax.set_ylabel(self.YlineEdit.text())
    else:
        ax.set_ylabel('')
    ax.xaxis.grid(bool(self.XGcheckBox.isChecked()))
    ax.yaxis.grid(bool(self.YGcheckBox.isChecked()))
    if not self.XMcheckBox.isChecked():
        ax.tick_params(axis='x', which='both', bottom=False, top=False,
                       labelbottom=False)
    if not self.YMcheckBox.isChecked():
        ax.tick_params(axis='y', which='both', left=False, right=False,
                       labelleft=False)
    self.rmmpl()
    self.addmpl(fig)