def fit_distributions(distributions, data):
    """Fit candidate distributions to ``data`` and return the fitted object.

    :param distributions: sequence of distribution names. When its first
        element is the string ``"ALL"``, ``distributions=None`` is handed to
        ``fitter.Fitter`` instead of the sequence itself.
    :param data: sample passed as the first argument of ``fitter.Fitter``.
    :return: the ``fitter.Fitter`` instance, after ``fit()`` has been called.
    """
    candidates = None if distributions[0] == "ALL" else distributions
    fitted = fitter.Fitter(data, distributions=candidates, verbose=True)
    fitted.fit()
    return fitted
def distribution_helper(data_list, distribution_list):
    """Fit the given distributions and return their sum-of-squares errors.

    :param data_list: sample passed as the first argument of
        ``fitter.Fitter``.
    :param distribution_list: distribution names passed as ``distributions``.
    :return: the ``'sumsquare_error'`` entry of ``df_errors.to_dict()``.
    """
    fit = fitter.Fitter(
        data_list,
        distributions=distribution_list,
        timeout=600,
        verbose=False,
    )
    fit.fit()
    errors_by_metric = fit.df_errors.to_dict()
    return errors_by_metric['sumsquare_error']
def filterSummary():
    """Fit a best distribution per group of the selected filter column.

    Reads the module-level ``filterSelect``, ``targetVar`` and ``file1``.
    For every unique value of the ``filterSelect.value`` column, the
    ``targetVar.value`` column of the matching rows is fitted against a fixed
    set of candidate distributions and the best fit is passed to
    ``generador_estadistica_2``.

    :return: ``0`` when the filter is ``"No Filter"``; otherwise a tuple
        ``(bestdist_basin, data_gen)`` of two dataframes, one row per group.
        ``bestdist_basin`` has the group, best distribution, mean and std
        columns; ``data_gen`` has the group and ``"generador"`` columns.
    """
    if filterSelect.value == "No Filter":
        return 0

    group_column = filterSelect.value
    # "lognorm" used to be listed twice, which only repeated the same fit.
    candidates = [
        "gamma", "uniform", "lognorm", "norm", "expon", "exponnorm",
        "logistic", "triang"
    ]
    missing = float("nan")

    groups = []
    best_fits = []
    means = []
    stds = []
    generators = []
    for group in file1[group_column].unique():
        subset = file1[file1[group_column] == group]
        try:
            fit = fitter.Fitter(subset[targetVar.value],
                                distributions=candidates)
            fit.fit()
            best = fit.get_best()
            print("A1")
            stats = generador_estadistica_2(best)
            print(stats)
            # Build the whole row before appending anything, so a failure
            # half-way cannot leave the column lists with different lengths.
            row = (best, stats['mean'], stats['std'], stats["generador"])
        except Exception:
            # Previously only the "Best Distribution" list received a
            # placeholder, so the DataFrame construction below failed with
            # mismatched column lengths whenever a single group failed.
            row = ("Action can not be performed", missing, missing, None)

        groups.append(group)
        best_fits.append(row[0])
        means.append(row[1])
        stds.append(row[2])
        generators.append(row[3])

    bestdist_basin = pd.DataFrame({
        group_column: groups,
        "Best Distribution": best_fits,
        "Mean": means,
        "std": stds
    })
    data_gen = pd.DataFrame({
        group_column: groups,
        "generador": generators
    })
    return bestdist_basin, data_gen
def _leaderboard_compute_overall_score(self, N=100):
    """Compute p-values of model1 scores from fitted NULL distributions.

    Not finalised.

    After calling ``_compute_pvalues_pred1`` and ``_compute_pvalues_param1``
    with ``N``, a beta distribution is fitted to ``self.rdistance_param1`` and
    to ``self.rdistance_pred1``. The beta CDF with the fitted parameters is
    then evaluated at the ``param1`` / ``pred1`` scores; the results are
    stored in ``self.pvalues_param1`` / ``self.pvalues_pred1`` and under the
    ``'pvalues'`` key of the corresponding ``self.scores`` entries.

    :param N: forwarded as ``N`` to the two ``_compute_pvalues_*`` methods.
    """
    self._compute_pvalues_pred1(N=N)
    self._compute_pvalues_param1(N=N)

    import fitter

    def _fit_beta(sample):
        # Restrict the fitter to the beta distribution before fitting.
        beta_fit = fitter.Fitter(sample)
        beta_fit.distributions = ['beta']
        beta_fit.fit()
        return beta_fit

    param1_fit = _fit_beta(self.rdistance_param1)
    pred1_fit = _fit_beta(self.rdistance_pred1)

    import scipy.stats
    beta_cdf = scipy.stats.beta.cdf

    self.pvalues_param1 = beta_cdf(self.scores['param1'].scores,
                                   *param1_fit.fitted_param['beta'])
    self.pvalues_pred1 = beta_cdf(self.scores['pred1'].scores,
                                  *pred1_fit.fitted_param['beta'])

    self.scores['pred1']['pvalues'] = self.pvalues_pred1
    self.scores['param1']['pvalues'] = self.pvalues_param1
def analysis(variable):
    """Fit a fixed set of distributions to ``variable`` and summarise them.

    :param variable: sample passed as the first argument of
        ``fitter.Fitter``.
    :return: ``(sumario, distfit_var)`` where ``sumario`` is the result of
        ``summary(plot=False, Nbest=8)`` and ``distfit_var`` is the fitted
        ``fitter.Fitter``; ``None`` when fitting or summarising fails, in
        which case a message is printed.
    """
    # "lognorm" used to be listed twice, which only repeated the same fit.
    candidates = [
        "gamma", "uniform", "lognorm", "norm", "expon", "exponnorm",
        "logistic", "triang"
    ]
    try:
        distfit_var = fitter.Fitter(variable, distributions=candidates)
        distfit_var.fit()
        sumario = distfit_var.summary(plot=False, Nbest=8)
    except Exception:
        # A bare ``except:`` also swallowed KeyboardInterrupt/SystemExit;
        # only ordinary errors are treated as "not numeric" now.
        print("This is not a numeric variable")
        return None
    return sumario, distfit_var
def __init__(self, pfad, **kwargs):
    """Set up the attribute store and register the image path.

    Args:
        pfad (str): File path of the image.

    Kwargs:
        none (yet)

    Creates:
        self.att (dict): dictionary holding image attributes; receives the
            ``'bid'`` entry when the path exists.
        self.rnio, self.fitter: helper objects.
        self.pfad (str): only set when ``pfad`` exists on disk.

    When ``pfad`` does not exist, a warning is logged and nothing else is
    initialised.
    """
    self.att = {}  # attribute directory
    self.rnio = rnio.RnIo()
    self.fitter = fitter.Fitter()

    # Bail out early when there is no file under the given path.
    if not os.path.exists(pfad):
        logging.warning('No file found under pfad %s. No image was opened', pfad)
        return

    self.pfad = str(pfad)
    logging.info('Image pfad was set: %s', pfad)
    self.calc_name()
    # The id counter is shared on the class, hence the class-level lock.
    with Bild.lock:
        self.att['bid'] = Bild.bid_count
        Bild.bid_count += 1
# Build the environments and seed each one with its own index
envs = [gym.make("Pong-v0") for _ in range(n_envs)]
for i, env in enumerate(envs):
    env.seed(i)
# Used to track observations between environments
obs_bookmarks = [env.reset() for env in envs]
prev_bookmarks = [0] * n_envs

# Build the model, its optimizer and the fitter object
action_dim = 2  # Pong specific number of possible actions
# preprocess returns a vector representation of the observation
prepped_state = preprocess(obs_bookmarks[0])
input_dim = prepped_state.shape[0]
net = model.Model(input_dim, action_dim)
optimizer = optim.Adam(net.parameters(), lr=lr)
fit_obj = fitter.Fitter(input_dim, action_dim, n_olddatas=n_olddatas)

# Optionally restore a previous run from its save files
if resume:
    net.load_state_dict(torch.load(net_save_file))
    optimizer.load_state_dict(torch.load(optim_save_file))
optimizer.zero_grad()

# Various functions that will be useful later
logsoftmax = nn.LogSoftmax()
softmax = nn.Softmax()
mseloss = nn.MSELoss()

# Storage for actions, observations, values, ... (seven independent lists)
actions, observs, rewards, old_pis, old_vals, advantages, mask = (
    [] for _ in range(7)
)
episode_reward = 0
def _write_channel_stats(out, label, values, pixel_count, sum_suffix=''):
    """Write the descriptive statistics of one colour channel to ``out``.

    :param out: writable text file object.
    :param label: channel name used in the messages (e.g. ``'Red'``).
    :param values: list of the channel's pixel values.
    :param pixel_count: number of pixels (width * height) used for the
        "Average value" line.
    :param sum_suffix: text inserted before the newline of the "Sum" line.
    """
    out.write('Sum of {} channel is : {}{}\n'.format(
        label, sum(values), sum_suffix))
    out.write('Median of {} channel is : {}\n'.format(
        label, stt.median(values)))
    out.write('Lower and Upper quantile of {} channel are : {} {}\n'.format(
        label, np.quantile(values, 0.25), np.quantile(values, 0.75)))
    out.write('Mean value is : {}\n'.format(stt.mean(values)))
    out.write('Skewness and Kurtosis are : {} {}\n'.format(
        skew(np.array(values)), kurtosis(values)))
    out.write('Average value of {} channel is : {}\n'.format(
        label, sum(values) / pixel_count))
    out.write('The Variance of {} channel is : {}\n\n'.format(
        label, stt.variance(values)))


def Solve(image_glob='D:/10semester/Progonov/Лабораторные работы/mirflickr/*.jpg',
          max_images=1):
    """Write per-channel statistics and fitted histograms for images.

    For each processed image, the RGB channel statistics are written to
    ``OutPut.txt``. For every ``(name, index)`` entry of the module-level
    ``COLOR`` mapping, a set of distributions is fitted to that channel; the
    fit errors are written to the text file and the summary/histogram figure
    is appended to ``AllHistogram.pdf``.

    Side effects: numpy divide/invalid errors are set to ``'ignore'`` and,
    unless warning options were given on the command line, all warnings are
    silenced.

    :param image_glob: glob pattern of the images to process; defaults to the
        previously hard-coded location.
    :param max_images: non-negative number of images to process, or ``None``
        for all matches. The default of 1 reproduces the previous behaviour
        of stopping after the first image.
    """
    np.seterr(divide='ignore', invalid='ignore')
    if not sys.warnoptions:
        import warnings
        warnings.simplefilter("ignore")  # ignore some warnings from system

    filenames = glob.glob(image_glob)
    if max_images is not None:
        filenames = filenames[:max_images]

    # Context managers guarantee both outputs are closed even when an image
    # fails to load or a fit raises.
    with open('OutPut.txt', "w") as out, PdfPages("AllHistogram.pdf") as pp:
        for number, filename in enumerate(filenames, start=1):
            # Load once and close the file handle right away. The array is
            # reused for the statistics and for the fits below; previously
            # the file was re-opened (unconverted and never closed) for
            # every channel.
            with Image.open(filename) as raw:
                pixels = np.array(raw.convert('RGB'))
            height, width = pixels.shape[:2]
            pixel_count = width * height

            # Row-major flattening yields the same order as the former
            # per-pixel getpixel() loops; tolist() yields plain Python ints.
            channels = [
                (label, pixels[:, :, index].ravel().tolist())
                for index, label in enumerate(('Red', 'Green', 'Blue'))
            ]

            out.write('Output for Image number {}\n\n'.format(number))
            for label, values in channels:
                out.write(
                    'Max and min values of {} channel of image {} are: {}, {}\n'.
                    format(label, number, max(values), min(values)))
            out.write('\n')

            for label, values in channels:
                # The Green "Sum" line has always carried a trailing space;
                # kept so the report stays byte-identical.
                suffix = ' ' if label == 'Green' else ''
                _write_channel_stats(out, label, values, pixel_count, suffix)

            for name, num in COLOR.items():
                plt.figure()
                a = pixels[:, :, num].ravel()
                f = fitter.Fitter(
                    a,
                    distributions=['beta', 'gamma', 'uniform', 'norm', 'laplace'],
                    bins=256,
                    verbose=False)
                f.fit()
                out.write("Fitted errors for " + name + " channel:\n\n")
                for k, v in f._fitted_errors.items():
                    out.write(str(k) + ' >>> ' + str(v) + '\n')
                out.write("\n")
                f.summary()
                f.hist()
                plt.title(str(name + " channel of Image number " + str(number)))
                pp.savefig()
                plt.close('all')

            out.write('=================================================\n\n')
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.', fontsize=18):
    """Compute ANOVA for one drug and one feature

    :param drug_id: a valid drug identifier (must be in ``self.drugIds``)
    :param feature_name: a valid feature name (must be in
        ``self.feature_names``)
    :param bool show: show boxplots with the different factors used
    :param bool production: if True, returns a dictionary, otherwise a
        one-row dataframe. The dictionary form is there to speed up the
        analysis when scanning the drug across all features.
    :param str directory: where to save the figure.
    :param int fontsize: font size passed to :class:`BoxPlots` when
        ``show`` is True.
    :return: the results as a dictionary or a dataframe (see ``production``)
    :raises ValueError: if the drug or the feature is unknown.

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s' %
                         (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # show the first three feature names as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                         (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = odof.drug_id
        results['DRUG_NAME'] = odof.drug_name
        results['DRUG_TARGET'] = odof.drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            # return a dict
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical components.

    # If a formula is provided, use statsmodels. Since it is slowish,
    # we implemented several cases as described in the doc for the 4
    # following cases:
    # - TISSUE + MSI +MEDIA + FEATURE
    # - TISSUE + MSI + FEATURE
    # - MSI + FEATURE
    # - FEATURE
    if self.settings.regression_formula not in ["auto", None, ""]:
        # This populates the anova_pvalues attribute itself
        _ = self.anova_one_drug_one_feature_custom(
            drug_id, feature_name,
            formula=self.settings.regression_formula, odof=odof)
        results = self._set_odof_results(self.anova_pvalues, odof)
    elif self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        df = self._tissue_dummies.loc[odof.masked_tissue.index]

        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # drop the first tissue column; it acts as the reference level
        tissues = [x for x in df.columns if x.startswith('C(tissue')]
        df.drop(tissues[0], axis=1, inplace=True)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            # make sure the media factor is not included
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)
        else:
            # drop the first one for the regression
            medias = [x for x in df.columns if x.startswith('C(media')]
            if len(medias):
                df.drop(medias[0], axis=1, inplace=True)

        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()

        # The ANOVA
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)
        results = self._set_odof_results(self.anova_pvalues, odof)
    elif self.settings.include_MSI_factor is True:
        # design matrix rows: intercept, msi, feature (transposed below)
        df = DummyDF()
        df.values = np.ones((3, odof.Npos + odof.Nneg))
        df.values[1] = odof.masked_msi.values
        df.values[2] = odof.masked_features
        df.values = df.values.T

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()

        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)
        results = self._set_odof_results(self.anova_pvalues, odof)
    else:
        # design matrix rows: intercept, feature (transposed below)
        df = DummyDF()
        df.values = np.ones((2, odof.Npos + odof.Nneg))
        df.values[1] = odof.masked_features
        df.values = df.values.T

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()

        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)
        results = self._set_odof_results(self.anova_pvalues, odof)

    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features.keys():
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may not be
        # great. This can be checked with the errors
        # NOTE(review): ``df`` is not assigned in the custom
        # regression_formula branch above, so sampling combined with a
        # custom formula would fail on ``df.values`` -- confirm whether
        # that combination is supported.
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []

        Y = odof.Y.copy()
        N = self.sampling
        pb = Progress(N, 20)
        for i in range(0, N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict',
                                                    odof=odof)
            # msi and tissue are not part of every model: skip them when
            # absent (only a missing key is tolerated, not any error).
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory, fontsize=fontsize)
        boxplot.boxplot_association(fignum=1)
        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')
        if self.settings.include_media_factor:
            boxplot.boxplot_pancan(fignum=3, mode='media')

    # about 30% of the time spent in creating the DataFrame...
    if production is True:
        return results
    else:
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for k in results.keys():
            if results[k] is None:
                results[k] = np.nan
        df = pd.DataFrame(results, index=[1])
        return df
entries = pd.read_csv("/home/cokelaer/entries.txt", header=None)
# Series.as_matrix() was removed in pandas 1.0; .values returns the same
# array on both old and new pandas versions.
entries = list(entries[0].values)

# This command takes a while: about 20 minutes with a good connection.
# This will download lots of fields from uniprot for each entry.
# Later on we will play with the sequence length, which could
# have been extracted from the downloaded file but this example
# is for illustration.

# obtain a dataframe filled with all data from all entries
df = u.get_df()

# let us build a vector made of the length of the sequence.
# we restrict ourselves to lengths below 3000
data = df[df.Length < 3000].Length

# now, we may want to figure out what kind of distribution this sample is
# coming from. We will use the package called fitter, available on pypi,
# which is a layer built on top of scipy (distribution and fit)
import fitter
f = fitter.Fitter(data, bins=150)
f.distributions = ['lognorm', 'chi2', 'rayleigh', 'cauchy', 'invweibull']
f.fit()
f.summary()
f.summary(lw=3)

xlabel("Sequence length", fontsize=20)
ylabel("PDF", fontsize=20)

# save the figure in three formats
savefig("sequence_length_fitting.png", dpi=200)
savefig("sequence_length_fitting.eps", dpi=200)
savefig("sequence_length_fitting.svg", dpi=200)
import dca
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import fitter

# productividad() is called before ``gasto`` is imported from dca.
dca.productividad()
from dca import gasto

# Keep only the rows whose mes_max exceeds 50.
data = gasto[gasto.mes_max > 50]

# For each column: fit with a 120 s timeout, take the best distribution
# and keep the first value of the get_best() mapping as its parameters.

# Qi_hist
f_Qi = fitter.Fitter(data.Qi_hist, timeout=120)
f_Qi.fit()
best_Qi = f_Qi.get_best()
Qi_params = next(iter(best_Qi.values()))

# di_hyp
f_di = fitter.Fitter(data.di_hyp, timeout=120)
f_di.fit()
best_di = f_di.get_best()
di_params = next(iter(best_di.values()))

# b
f_b = fitter.Fitter(data.b, timeout=120)
f_b.fit()
best_b = f_b.get_best()
b_params = next(iter(best_b.values()))
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.'):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier (must be in ``self.drugIds``)
    :param feature_name: a valid feature name (must be in
        ``self.feature_names``)
    :param bool show: show some plots
    :param str directory: where to save the figure.
    :param bool production: if True, returns a dictionary, otherwise a
        one-row dataframe. The dictionary form is there to speed up the
        analysis when scanning the drug across all features.
    :return: the results as a dictionary or a dataframe (see ``production``)
    :raises ValueError: if the drug or the feature is unknown.

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s' %
                         (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # show the first three feature names as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                         (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    drug_name = self.drug_decode.get_name(drug_id)
    drug_target = self.drug_decode.get_target(drug_id)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = drug_id
        results['DRUG_NAME'] = drug_name
        results['DRUG_TARGET'] = drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            # return a dict
            return results
        else:
            # or a dataframe; note that index is not relevant here but
            # required.
            df = pd.DataFrame(results, index=[1])
            return df

    # with the data extract, we can now compute the regression.
    # In R or statsmodels, the regression code is simple since
    # it is based on the formula notation (Y~C(msi)+feature)
    # This is also possible in statsmodels library, however,
    # this relies on patsy, which is very slow as compared to the
    # statsmodels without formula.

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical
    # components.

    # Instead of using ols function, we use the OLS one so we cannot
    # use formula. Instead, we need to create manually the input
    # data. In the case of categorical data (tissue), we need to
    # create the dummy variable, which is done in the constructor
    # once for all (slow otherwise).
    if self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # The formula equivalent of this branch is
        # 'Y~C(tissue)+C(media)+C(msi)+feature'; it gives the same
        # answer but is slower.

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.ix was removed in pandas 1.0; .loc is the label-based
        # replacement.)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]

        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values

        self.Y = odof.Y
        self.EV = df.values
        # The regression and anova summary are done after the branches.

        # example of computing null model ?
        # Example of computing pvalues ourself
        # with 100 000 samples, we can get a smooth distribution
        # that we can then fit with fitter. good distribution
        # for the raw data is uniform one but if we take the log10,
        # we have lots of possible distrob such as beta, exponweib, gamma,
        # ....
    elif self.settings.include_MSI_factor is True:
        # formula equivalent: 'Y ~ C(msi) + feature'
        df = pd.DataFrame()
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
    else:
        # formula equivalent: 'Y ~ feature'
        df = pd.DataFrame()
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

    # NOTE(review): when regression_method is none of the four values
    # below, self.data_lm is not (re)assigned here -- confirm that the
    # settings validate this value elsewhere.
    if self.settings.regression_method == 'ElasticNet':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=self.settings.regression_L1_wt)
    elif self.settings.regression_method == 'OLS':
        self.data_lm = OLS(odof.Y, df.values).fit()
    elif self.settings.regression_method == 'Ridge':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=0)
    elif self.settings.regression_method == 'Lasso':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=1)

    # str() so that non-string drug identifiers do not break the
    # concatenation.
    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features.keys():
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may not be
        # great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []

        Y = odof.Y.copy()
        N = self.sampling
        pb = Progress(N, 20)
        for i in range(0, N):
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict')
            # msi and tissue are not part of every model: skip them when
            # absent (only a missing key is tolerated, not any error).
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }
        print(self.pvalues_features[key])

    self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                 output='dict')

    # Store the pvalues. Note that some may be missing so we use try
    # except, which is faster than if/else
    try:
        tissue_PVAL = self.anova_pvalues['tissue']
    except KeyError:
        tissue_PVAL = None

    try:
        MSI_PVAL = self.anova_pvalues['msi']
    except KeyError:
        MSI_PVAL = None

    try:
        FEATURE_PVAL = self.anova_pvalues['feature']
    except KeyError:
        FEATURE_PVAL = None

    try:
        MEDIA_PVAL = self.anova_pvalues['media']
    except KeyError:
        MEDIA_PVAL = None

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)
        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')

    results = {
        'FEATURE': feature_name,
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_name,
        'DRUG_TARGET': drug_target,
        'N_FEATURE_pos': odof.Npos,
        'N_FEATURE_neg': odof.Nneg,
        'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
        'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
        'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
        'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
        'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
        'FEATURE_IC50_effect_size': odof.effectsize_ic50,
        'FEATURE_pos_Glass_delta': odof.pos_glass,
        'FEATURE_neg_Glass_delta': odof.neg_glass,
        'ANOVA_FEATURE_pval': FEATURE_PVAL,
        'ANOVA_TISSUE_pval': tissue_PVAL,
        'ANOVA_MSI_pval': MSI_PVAL,
        'ANOVA_MEDIA_pval': MEDIA_PVAL,
        'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
    }

    # 12% of the time here
    if production is True:
        return results
    else:
        df = pd.DataFrame(results, index=[1])
        return df