class TPolynomialLassoModel(TModelClass): """ Child class representing the polynomial regression model, using Lasso Regularisation """ def __init__(self, name, Target, Feature: pd.DataFrame, Target_test, Feature_test: pd.DataFrame, Pipeline: Pipeline, EnsemblePipeline: Pipeline, Degree: int = 2, Interaction: bool = False, Bias: bool = True, CrossVal: int = 5): """ Constructor to set up a polynomial model with Lasso Regularisation. The Alpha values are obtained via cross-validation. 100 alpha values are checked along the path. parameters: - name : the name of the object instance - Feature : The features to use & transform - Target : the training target data - Target_test: the test target data - Feature_test: the untransformed features for testing. - Pipeline : a pipeline generated by the PipelineFactory - EnsemblePipeline : the pipeline generated by the PipelineFactory for the entire ensemble dataset - Degree : The polynomial degree (integer), DEFAULT=2 - Interaction : Boolean variable indicating if ONLY interaction terms are included in the polynomial features (True) or that all terms are included (False), DEFAULT=False - Bias : Boolean indicating if a bias column is added (i.e. where all powers are zero, acts as intercept in linear model). [DEFAULT = True] - CrossVal: How-many-fold crossvalidation is required? [DEFAULT = 5] It sets the following properties - pipeline : a pipeline object containing the preprocessing transformations (excluding the fitter function) - CVmodel : the fitter function to be used (should be an sklearn function with "fit" method) - model : The model is only set once the CV_model has run a fit-operation - feature_tf: the transformed features as obtained by the pipeline - parJobs : number of cores to use. Go full parallel on serial case, and single core on parallel code parameters sets for accounting/tracking purposes - crossVal : the Crossvalidation model/size (=CrossVal parameter) - bestAlpha: best alpha-hyperparameter as obtained by ElasticNetCV - bestL1_ratio: best l1_ratio-hyperparameter as obtained by ElasticNetCV - cv_iter : number of iterations needed by ElasticNetCV - cv_warns: %-number of convergence-warnings thrown during ElasticNetCV = #warnings/(#alphas * #l1_ratio) """ #from sklearn.preprocessing import StandardScaler #from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LassoCV from multiprocessing import current_process super().__init__(name, Target, Feature, Target_test, Feature_test) if current_process().name == 'MainProcess': print('Hello from the main process') self.parJobs = -1 else: print('Hello from child process') self.parJobs = 1 self.nameModel = 'Polynomial Model with Lasso Regularisation' self.name = name print("Initialising the child class:", self.nameModel) #create a pipeline (can be extended to contain more functions, p67) self.pipeline = Pipeline self.EnsPipe = EnsemblePipeline self.feature_tf = self.pipeline.fit_transform( Feature) #this is a numpy array... #track the r1 and alpha lists for the CV model self.n_alphas = 13 # 100 is the sklearn default self.alphas = [ 1.0e-6, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100.0, 1.0e3, 1.0e4, 1.0e5, 1.0e6 ] self.CVmodel = LassoCV( #eps=1e-8, #length of the path: Alpha_min/Alpha_max = 1.0E-3 #n_alphas=self.n_alphas, #test 100(default) alphas alphas=self.alphas, #set the alphas automatically fit_intercept=True, #the data is already centered by standard scaler normalize= False, #standardization happens elsewhere, note that this is a tricky thing for small data precompute= 'auto', #let sklearn decide if the Gram matrix needs precomputing max_iter=1000, #maximum number of iterations tol=1.0E-3, cv= CrossVal, #X-fold crossvalidation--> for our small data: leave-one-out? n_jobs=self. parJobs #if you use -1 (= go parallel over all cores), you can not suppress the annoying convergence warning ) #to keep track of some options: self.crossVal = CrossVal self.bestAlpha = 0 self.cv_iter = 0 self.cv_warns = 0 # %-number of warnings thrown during ElasticNetCV = #warnings/(#alphas * #l1_ratio) self.model = None self.coefUsage = None # contains the % of non-zero values of each coefficient over the model instances of the ensemble, is set upon creation of the average self.sklCVfail = False # are there reasons to believe the sk-learn CV failed miserably? def __PrintAlphMSESeries(self, avg, vmin, vmax, nrow): """ Printing function to be used to trace LASSO alphas """ f = open("ConvLASSO.dat", "a+") for nr in range(nrow): line = " %0.8f %0.8f %0.8f %0.8f \n" % ( self.CVmodel.alphas_[nr], avg[nr], vmax[nr] - avg[nr], avg[nr] - vmin[nr]) f.write(line) line = " \n" f.write(line) f.close() def __GenerateLassoPath(self, gridType: str = None): """ Some extra internal heuristics catching failures on small data. It seems like we are in trouble with the error getting worse for smaller regularisation. sk-learn CV optimisation fails. Try having a look at the lasso_path, where do we have non-zero coefs? Parameters: - gridType : "piecelin", "lin" [default piecelin] returns: - set the CVmodel.alpha_ to the "beter" choice alpha value - set the alphas grid to a new grid """ from sklearn.linear_model import lasso_path alphas, coef_path, _ = lasso_path( X=self.feature_tf, #training data y=self.target, #targets alphas=self.alphas, #set the alphas automatically precompute= 'auto' #let sklearn decide if the Gram matrix needs precomputing ) cntNoZero = np.count_nonzero(coef_path, axis=0) self.sklCVfail = True #so standard CV failed...we need to take over update = False #f=open("PathLASSO.dat","a+") for nr in range(len(alphas)): #alphas go from big to small if ((cntNoZero[nr] > 0) and (not update)): update = True ca = alphas[nr] #as the sk-learn at this point will have failed miserably, #replace the sklearn "best" alpha with a less epic fail self.CVmodel.alpha_ = ca #other attributes of the CV model are not used (phew) if (gridType == "lin"): #linear self.alphas = list(i * 0.1 * ca for i in range(1, 21)) self.n_alphas = 20 else: #default: peisewise linear self.alphas = list(i * 0.1 * ca for i in range(1, 10)) self.alphas.extend(list(i * ca for i in range(1, 10))) self.n_alphas = 18 #break#get out of this loop and do not add more alphas #line=str(alphas[nr])+" "+str(coef_path[:,nr]).strip('[]')+"==> "+str(cntNoZero[nr])+" \n" #f.write(line) #line=" \n" #f.write(line) #f.close() def __GenerateLinAlphaSeries(self, AtBest: bool = False, Print: bool = False) -> bool: """ Small private function to create a piecewise linear grid at the step of the average MSE plot !! WARNING: We assume sk-learn has sorted the alphas largest to smallest (this is the case for version 0.21.2...but this is python) Parameters: - Print : print the curve, Boolean, Default=False - AtBest : Generate a linear grid around the "Best" Alpha (True) or around the Log-dropoff (False, Default) Returns: Boolean indicating if the grid was updated, and shoul dbe run again """ #find the drop of point update = False if (AtBest): # close top the best value: from 0.1 to 2x best ca = self.CVmodel.alpha_ self.alphas = list(i * 0.1 * ca for i in range(1, 21)) self.n_alphas = 20 update = True else: avg = np.average( self.CVmodel.mse_path_, axis=1 ) #go over all columns(axis=1), to get the minimum of a row, axis=0 nrow = len(avg) if (Print): vmin = np.amin(self.CVmodel.mse_path_, axis=1) vmax = np.amax(self.CVmodel.mse_path_, axis=1) self.__PrintAlphMSESeries(avg, vmin, vmax, nrow) for nr in range(nrow - 1): #the alphas_ are sorted from large to small diff = avg[nr + 1] - avg[nr] if ((diff < 0.0) and (abs(diff) > (0.05 * avg[nr]))): #drop needs to be larger than 5% #if ((abs(diff)>(0.05*avg[nr]))):#drop needs to be larger than 5%--> sometimes there is an increase...illogical as bias(avg error) should increase...but also case in Menon paper ca = self.CVmodel.alphas_[nr + 1] self.alphas = list(i * 0.1 * ca for i in range(1, 10)) self.alphas.extend(list(i * ca for i in range(1, 10))) self.n_alphas = 18 update = True break #get out of this loop and do not add more alphas if (not update ): #it seems like we are in trouble with the error getting #worse for smaller regularisation #try having a look at the lasso_path, where do we have non-zero coefs? self.__GenerateLassoPath(gridType="piecelin") update = True return update #@ignore_warnings(category=ConvergenceWarning) def fit(self): """ Class-method wrapping the fit-method of the sklearn model - Target : a pandas dataframe with the Target data belonging to the Features provided upon initialisation. """ import warnings import numpy as np from sklearn.linear_model import Lasso PrintMSEcurve = False with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.CVmodel.fit(self.feature_tf, self.target) warnCount = len(caught_warnings) update = self.__GenerateLinAlphaSeries(AtBest=False, Print=False) if update: params = dict() params['alphas'] = self.alphas self.CVmodel.set_params( **params ) #clean way to update the model as we only wish to change the alphas #run the piece-wise linear grid with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.CVmodel.fit(self.feature_tf, self.target) warnCount = len(caught_warnings) if ( np.count_nonzero(self.CVmodel.coef_) == 0 ): #no remaining coefficients--> check the path for better option self.__GenerateLassoPath(gridType="lin") if (PrintMSEcurve): avg = np.average( self.CVmodel.mse_path_, axis=1 ) #go over all columns(axis=1), to get the minimum of a row, axis=0 vmin = np.amin(self.CVmodel.mse_path_, axis=1) vmax = np.amax(self.CVmodel.mse_path_, axis=1) nrow = len(avg) self.__PrintAlphMSESeries(avg, vmin, vmax, nrow) totcnt = self.CVmodel.alphas_.size self.cv_warns = (warnCount / totcnt) * 100.0 #keep track of the best alpha and mixing, and use these to set up the actual model if (not self.sklCVfail): self.bestAlpha = self.CVmodel.alpha_ else: self.bestAlpha = self.CVmodel.alpha_ * 0.50 #reduce regularization somewhat to avoid purely intercept models self.cv_iter = self.CVmodel.n_iter_ self.model = Lasso( alpha=self.bestAlpha, fit_intercept= True, #the data is already centered by standard scaler...but that seems not to be relevant/ or what is meant by this normalize=False, #standardization is performed elsewhere precompute=True #precompute the Gram matrix (default) ) print("=== MODEL WAS OPT HERE : best alpha=", self.bestAlpha, " in [", self.CVmodel.alphas_[-1], ", ", self.CVmodel.alphas_[0], "] ", " n-alpha=", self.n_alphas, " #WARN-->", warnCount, '/', totcnt, " : ", self.cv_warns, " % ================") #print(na,": #WARN-->",warnCount,'/',totcnt," : ",self.cv_warns," % | a=",self.bestAlpha," r1=",self.bestL1_ratio) #print(" %5i : #WARN--> %5i / %5i : %9.4f %% | a= %9.4f r1= %9.4f "%(na,warnCount,totcnt,self.cv_warns,self.bestAlpha,self.bestL1_ratio) ) with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.model.fit(self.feature_tf, self.target) self.setCoefficients() NZC = np.count_nonzero(self.model.coef_) print("did some fitting, Parent-style:", type(self.model).__name__, " --> #non-zero coef=", f'{NZC: 5d}') # f=open("ZeroCoef.dat","a+") # line=" --> #non-zero coef="+f'{NZC: 5d}'+" \n" # f.write(line) # f.close() #@ignore_warnings(category=ConvergenceWarning) def fitSanityCheck(self) -> int: """ Class method which should cover/deal with failures of sklearn. For some reason, sklearn LinearRegression randomly fails on small datasets. This failure gives rise to huge coefficents. Hoever, just shuffling the data seems to resolve the issue. This function returns the number of shuffles needed to regain sanity. """ #import sys import warnings import numpy as np from sklearn.linear_model import Lasso #If we had too many warnings then something probably went wrong #There are two ways to fix the warnings: # 1) an alpha range closer to the best one # 2) more iterations # So let us try both 2x #start while loop untill it works...or n_alphas becomes too big? cnt = 0 insane = False PrintMSEcurve = False if (not self.sklCVfail): # normal sanitycheck thress = 1.0 #larger than 1% warnings & not on the edge insane = (self.cv_warns > thress) or ( self.CVmodel.alpha_ == self.CVmodel.alphas_[-1]) or (self.CVmodel.alpha_ == self.CVmodel.alphas_[0]) warnCount = 0 totcnt = self.CVmodel.alphas_.size while ( insane and (cnt < 5)): #more than 5% warnings, and less than 10K alphas cnt += 1 if (cnt % 2 == 0): #even: increase n_iter to 10K params = dict() params['max_iter'] = 10000 #10x default self.CVmodel.set_params( **params ) #clean way to update the model as we only wish to change the alphas #run the piece-wise linear grid with warnings.catch_warnings( record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.CVmodel.fit(self.feature_tf, self.target) warnCount = len(caught_warnings) else: #odd = new alpha range update = self.__GenerateLinAlphaSeries(AtBest=True, Print=False) if update: params = dict() params['alphas'] = self.alphas params['max_iter'] = 1000 #back to our default self.CVmodel.set_params( **params ) #clean way to update the model as we only wish to change the alphas #run the piece-wise linear grid with warnings.catch_warnings( record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.CVmodel.fit(self.feature_tf, self.target) warnCount = len(caught_warnings) if (PrintMSEcurve): avg = np.average( self.CVmodel.mse_path_, axis=1 ) #go over all columns(axis=1), to get the minimum of a row, axis=0 vmin = np.amin(self.CVmodel.mse_path_, axis=1) vmax = np.amax(self.CVmodel.mse_path_, axis=1) nrow = len(avg) self.__PrintAlphMSESeries(avg, vmin, vmax, nrow) totcnt = self.CVmodel.alphas_.size self.cv_warns = (warnCount / totcnt) * 100.0 insane = (self.cv_warns > thress) or ( self.CVmodel.alpha_ == self.CVmodel.alphas_[-1]) or ( self.CVmodel.alpha_ == self.CVmodel.alphas_[0]) #insane=(self.cv_warns>thress) #larger than 1% warnings #keep track of the best alpha and mixing, and use these to set up the actual model self.bestAlpha = self.CVmodel.alpha_ self.cv_iter = self.CVmodel.n_iter_ print("=*= MODEL WAS OPT HERE : best alpha=", self.bestAlpha, " in [", self.CVmodel.alphas_[-1], ", ", self.CVmodel.alphas_[0], "] ", " n-alpha=", self.n_alphas, " #WARN-->", warnCount, '/', totcnt, " : ", self.cv_warns, " % =====SANITY======") if (cnt > 0): # only generate new non-CV model once #after sanity, setup our "Lasso" model self.model = Lasso( alpha=self.bestAlpha, fit_intercept= True, #the data is already centered by standard scaler...but that seems not to be relevant/ or what is meant by this normalize=False, #standardization is performed elsewhere precompute=True #precompute the Gram matrix (default) ) with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.model.fit(self.feature_tf, self.target) self.setCoefficients() else: # the sanitycheck if the average error of the CV becomes larger due to terms # NZC=np.count_nonzero(self.model.coef_) insane = (np.count_nonzero(self.model.coef_) == 0) while ( insane and (cnt < 5)): #more than 5% warnings, and less than 10K alphas cnt += 1 self.bestAlpha = self.bestAlpha * 0.5 #halve the alpha (up to 4x --> = x0.0625) self.model = Lasso( alpha=self.bestAlpha, fit_intercept= True, #the data is already centered by standard scaler...but that seems not to be relevant/ or what is meant by this normalize=False, #standardization is performed elsewhere precompute=True #precompute the Gram matrix (default) ) with warnings.catch_warnings(record=True) as caught_warnings: warnings.simplefilter("always", category=ConvergenceWarning) self.model.fit(self.feature_tf, self.target) insane = (np.count_nonzero(self.model.coef_) == 0) # if (NZC==0): # f=open("ZeroCoef.dat","a+") # line=" ++> #non-zero coef="+f'{NZC: 5d}'+" Fixed: "+str(not insane)+" \n" # f.write(line) # f.close() self.setCoefficients( ) #only update the coeficients for the last case if insane: print( "EPIC FAIL? The sanity check did not appear to fix our problem ", self.name, ". NOT Terminating this sick job! ==> sk-learn CV-fail= ", self.sklCVfail) #sys.exit() return cnt ##serial version # def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool): # """ # Use the ensemble data to create an "average" model, and set the "coefficients" # in the current model. This should be performed in each model separately # # --> needs to include hyper paramaters...how do we deal with multi-preference? # # """ # from sklearn.linear_model import Lasso # #import time # # # 1. Calculate the average coefficients # # 1.1. transform them to arrays # #start = time.perf_counter_ns() # #print("3.1) Average Coefficients : AVG") # intercept=np.zeros(EnsembleData.NData) # coef=np.zeros((EnsembleData.NData,EnsembleData.modelCoef[0]['coef_'][1].shape[1])) # for i in range(EnsembleData.NData): # mcf=EnsembleData.modelCoef[i] # intercept[i]=np.asarray(mcf['intercept_'][1]).ravel() # coef[i,:]=np.asarray(mcf['coef_'][1]).ravel() # print(i,")",coef[i,:]) # # mean_intercept=np.mean(intercept,axis=0)#axis is the varying direction, so 0 means we calculate the average of a column by varying the row # mean_coef=np.mean(coef,axis=0) # print("MEANS=> Int=",mean_intercept," COEF=",mean_coef) # # 2. Set the model coefficients to these averaged values # # ENR and sklearn black-boxing complicate things a bit here: # self.model=Lasso(alpha=self.bestAlpha, # temp values -> will be mean? # fit_intercept=True, #the data is already centered by standard scaler...but that seems not to be relevant/ or what is meant by this # normalize=False, #standardization is performed elsewhere # precompute=True #precompute the Gram matrix (default) # ) # #self.model.fit(self.feature_tf,self.target) #just to make sure the intercept and coef attributes are defined--> f**k python # # self.model.intercept_=mean_intercept # self.model.coef_=mean_coef # self.isAverage = True # self.hasCI=False # if setCI: # #end = time.perf_counter_ns() # #print("3.2.a) Average Coefficients : CI Intercept ",(end-start)/10E9) # # 3. Calculate Confidence Interval using Bootstrapper tech? # # & 4. Store the CI data # ## For the intercept # boot=TBootstrap(data=intercept,Func=np.mean) # #end = time.perf_counter_ns() # #print("3.2.b) NPboot",(end-start)/1E9) # boot.NPbootstrap(n_iter=2000, Jackknife=True) # #end = time.perf_counter_ns() # #print("3.2.c) Con Int",(end-start)/1E9) # avgm, avgp = boot.ConfidenceInterval(CItype="BCa",alpha=0.05,n_samples=2000)#95%confidence interval # self.CI["intercept_lo"]=avgm # self.CI["intercept_hi"]=avgp # ## For the coefficients # avgml=list() # avgpl=list() # for col in range(EnsembleData.modelCoef[0]['coef_'][1].shape[1]): # #end = time.perf_counter_ns() # #print("3.2) Average Coefficients : CI Coef ",col," ",(end-start)/1E9) # boot=TBootstrap(data=coef[:,col],Func=np.mean) # boot.NPbootstrap(n_iter=2000, Jackknife=True) # avgm, avgp = boot.ConfidenceInterval(CItype="BCa",alpha=0.05)#95%confidence interval # avgml.append(avgm) # avgpl.append(avgp) # # self.CI["coef_lo"]=avgml # self.CI["coef_hi"]=avgpl # self.hasCI = True # # #store the resulting coefficients in our wrapper tracker...and we are done # self.setCoefficients() # self.Quality=TModelQualityData(EData=EnsembleData) #paralel def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool): """ Use the ensemble data to create an "average" model, and set the "coefficients" in the current model. This should be performed in each model separately --> needs to include hyper paramaters...how do we deal with multi-preference? """ from sklearn.linear_model import Lasso import multiprocessing as mp from HPCTools import get_num_procs #import time # 1. Calculate the average coefficients # 1.1. transform them to arrays #start = time.perf_counter_ns() #print("3.1) Average Coefficients : AVG") self.coefUsage = np.zeros( EnsembleData.modelCoef[0]['coef_'][1].shape[1]) intercept = np.zeros(EnsembleData.NData) coef = np.zeros((EnsembleData.NData, EnsembleData.modelCoef[0]['coef_'][1].shape[1])) alphas = np.zeros(EnsembleData.NData) for i in range(EnsembleData.NData): mcf = EnsembleData.modelCoef[i] intercept[i] = np.asarray(mcf['intercept_'][1]).ravel() coef[i, :] = np.asarray(mcf['coef_'][1]).ravel() stra = mcf['alpha'][1] alphas[i] = stra.split()[3] for j in range(EnsembleData.modelCoef[0]['coef_'][1].shape[1]): self.coefUsage[j] = 100.0 * ( np.count_nonzero(coef[:, j]) / EnsembleData.NData ) # The %-fraction of non-zero versions of each coefficient self.bestAlpha = np.mean(alphas, axis=0) mean_intercept = np.mean( intercept, axis=0 ) #axis is the varying direction, so 0 means we calculate the average of a column by varying the row mean_coef = np.mean(coef, axis=0) print("MEANS=> Int=", mean_intercept, " COEF=", mean_coef) # 2. Set the model coefficients to these averaged values # ENR and sklearn black-boxing complicate things a bit here: self.model = Lasso( alpha=self.bestAlpha, # temp values -> will be mean? fit_intercept= True, #the data is already centered by standard scaler...but that seems not to be relevant/ or what is meant by this normalize=False, #standardization is performed elsewhere precompute=True #precompute the Gram matrix (default) ) #The black box nature of sklearn/python rears its ugly head here. #apparently we are not allowed to create an estimator ourselves by setting #the coefficients...ok maybe in this case we should replace the lasso with #a normal poly since we make no use of the power-features of lasso #however, this is rather annoyng. # So just to kill the possibility of sklearn trowing an exception when # predicting, just run it once and overwritte the intercept and coef attributes # --> f**k you python #print("FEATURE_TF=",self.feature_tf) #print("TARGET_TF=",self.target) # print("DICT=",self.model) # print("fit_intercept=", self.model.fit_intercept) # #print("__doc__=", self.model.__doc__) # print("_estimator_type=",self.model._estimator_type) # print("_preprocess_data=",self.model._preprocess_data) # print("selection=",self.model.selection) # print("__getattribute__=",self.model.__getattribute__) # #print("bool=",self.model.__bool__) # print("get_state pre fit =",self.model.__getstate__()) #print("=",self.model.__setstate__()) ftt2 = [0, 1, 2, 3, 4, 5, 6] ftt = [[1, 2, 3], [0, 0.1, 0.0], [0.1, 0.1, 0.1], [3, 2, 2], [5.2, 3, 4], [0.0, 0.2, 0.3], [-0.1, -0.2, -0.3]] self.model.fit(ftt, ftt2) # print("get_state post fit=",self.model.__getstate__()) #self.model.fit(self.feature_tf,self.target) #just to make sure the intercept and coef attributes are defined--> f**k python self.model.intercept_ = mean_intercept self.model.coef_ = mean_coef self.isAverage = True self.hasCI = False #print("get_state post setting=",self.model.__getstate__()) if setCI: #end = time.perf_counter_ns() #print("3.2.a) Average Coefficients : CI Intercept ",(end-start)/10E9) # 3. Calculate Confidence Interval using Bootstrapper tech? # & 4. Store the CI data ## For the intercept boot = TBootstrap(data=intercept, Func=np.mean) #end = time.perf_counter_ns() #print("3.2.b) NPboot",(end-start)/1E9) boot.NPbootstrap(n_iter=2000, Jackknife=True) #end = time.perf_counter_ns() #print("3.2.c) Con Int",(end-start)/1E9) avgm, avgp = boot.ConfidenceInterval( CItype="BCa", alpha=0.05, n_samples=2000) #95%confidence interval self.CI["intercept_lo"] = avgm self.CI["intercept_hi"] = avgp print("===BOOT INTERCEPT:", avgm, avgp) ## For the coefficients # Parallelisation for sections performing bootstraps. # Parallelization at the highest level of a column, # ??Is the overhead sufficiently low to have benefits? # 1. create our process pool with as many processes as physical cores pool = mp.Pool(processes=get_num_procs(-1)) # 2. set drones to work alpha = 0.05 #95%confidence interval print("col-range=", EnsembleData.modelCoef[0]['coef_'][1].shape[1]) for col in range(EnsembleData.modelCoef[0]['coef_'][1].shape[1]): print("col", col, " coef[:,col]=", coef[:, col], " type=", type(coef[:, col])) drones = [ pool.apply_async(Bootstrap_1Col, args=(col, coef[:, col], alpha)) for col in range(EnsembleData.modelCoef[0]['coef_'][1].shape[1]) ] # 3. as we can not assume the cols to be produced in the correct order # --> make it a dict ciDict = dict() for drone in drones: col, avgm, avgp = drone.get() ciDict[col] = list([avgm, avgp]) # 4. wait untill all processes are finished pool.close() pool.join() # 5. and put then in the corrcet order in the list avgml = list() avgpl = list() for col in range(EnsembleData.modelCoef[0]['coef_'][1].shape[1]): avgml.append(ciDict[col][0]) avgpl.append(ciDict[col][1]) self.CI["coef_lo"] = avgml self.CI["coef_hi"] = avgpl self.hasCI = True #store the resulting coefficients in our wrapper tracker...and we are done #print("Store resulting coefficients.") self.setCoefficients() self.Quality = TModelQualityData(EData=EnsembleData) def printAverageCoefficients(self, File: str = None): """ Print a block of information to a file, containing the averaged coefficients for the polynomial model. --> needs to include hyper paramaters... parameters: - self: - File: string containing a filename, if None standard output is used. Default=None """ coefstr = list() maxl = 0 pipeline = None for name, step in self.EnsPipe.steps: if (name == 'poly_features'): pipeline = self.EnsPipe if pipeline == None: for name, step in self.pipeline.steps: if (name == 'poly_features'): pipeline = self.pipeline for i in range(pipeline['poly_features'].powers_.shape[0]): line = "(" for j in range(pipeline['poly_features'].powers_.shape[1]): pw = pipeline['poly_features'].powers_[i, j] if (pw != 0): line = line + "x_" + str(j) + "^" + str(pw) line = line + ")" if len(line) == 2: #so this term is missing line = "(1)" curl = len( line ) #keep track of the length of the longest coefficient string if (curl > maxl): maxl = curl coefstr.append(line) if File is None: print("======= THE AVERAGED MODEL ==============") print(" Model : ", self.name) print(self.Quality.QualitiesText()) if self.hasCI: print("Intercept : ", f'{self.model.intercept_: 12.7f}', " and CI=[", f'{self.CI["intercept_lo"]: 12.7f}', " ; ", f'{self.CI["intercept_hi"]: 12.7f}', "]") for col in range(len(self.model.coef_)): #print("coef ",coefstr[col]," : ",self.model.coef_[col]," and CI=[",self.CI["coef_lo"][col]," ; ",self.CI["coef_hi"][col],"]") print("coef ", f'{coefstr[col]: <{maxl}}', " : ", f'{self.model.coef_[col]: 12.7f}', " and CI=[", f'{self.CI["coef_lo"][col]: 12.7f}', " ; ", f'{self.CI["coef_hi"][col]: 12.7f}', "] usage= ", f'{self.coefUsage[col]: 8.3f}', " %") else: print("Intercept : ", f'{self.model.intercept_: 12.7f}') for col in range(len(self.model.coef_)): print("coef ", f'{coefstr[col]: <{maxl}}', " : ", f'{self.model.coef_[col]: 12.7f}', " usage= ", f'{self.coefUsage[col]: 8.3f}', " %") print("====================================\n\n") else: foo = open( File, "a+", ) foo.write("======= THE AVERAGED MODEL ==============\n") line = " Model : " + self.name + "\n" foo.write(line) foo.write(self.Quality.QualitiesText()) if self.hasCI: line = "Intercept : " + f'{self.model.intercept_: 12.7f}' + " and CI=[" + f'{self.CI["intercept_lo"]: 12.7f}' + " ; " + f'{self.CI["intercept_hi"]: 12.7f}' + "] \n" foo.write(line) for col in range(len(self.model.coef_)): line = "coef " + f'{coefstr[col]: <{maxl}}' + " : " + f'{self.model.coef_[col]: 12.7f}' + " and CI=[" + f'{self.CI["coef_lo"][col]: 12.7f}' + " ; " + f'{self.CI["coef_hi"][col]: 12.7f}' + "] usage= " + f'{self.coefUsage[col]: 8.3f}' + " % \n" foo.write(line) else: line = "Intercept : " + f'{self.model.intercept_: 12.7f}' + "\n" foo.write(line) for col in range(len(self.model.coef_)): line = "coef " + f'{coefstr[col]: <{maxl}}' + " : " + f'{self.model.coef_[col]: 12.7f}' + " usage= " + f'{self.coefUsage[col]: 8.3f}' + " % \n" foo.write(line) foo.write("====================================\n\n") foo.close() def setCoefficients(self): """ Class-method printing the fitting coefficients for a polynomial regression with elastic net regularization """ import numpy as np super().setCoefficients() #--------- hyper parameters ------------------- self.modelcoef['header_hyperparameter'] = [ self.coefindex, "The selected best alpha and mixing hyper-parameters for Elastic Net Regularization are:" ] line = " - alpha = %0.3f " % (self.bestAlpha) self.modelcoef['alpha'] = [-(self.coefindex + 1), line] line = " - n_iter_CV = %i " % (self.cv_iter) self.modelcoef['n_iter'] = [-(self.coefindex + 2), line] line = " - CV-warnings = %0.2f %%" % (self.cv_warns) self.modelcoef['cv_warns'] = [-(self.coefindex + 3), line] #------------usual coefficients----------------- self.modelcoef['header_coef'] = [ self.coefindex + 4, "The coefficients for each target (one per row) are given by:" ] self.modelcoef['coef_'] = [ self.coefindex + 5, np.array([self.model.coef_]) ] self.modelcoef['header_intercept'] = [ self.coefindex + 6, "The intercepts for each target (one per row) are given by:" ] self.modelcoef['intercept_'] = [ self.coefindex + 7, np.array([self.model.intercept_]) ] self.coefindex += 8
class GeoMagLinearARX(GeoMagARXRegressor): def __init__(self, auto_order=10, exog_order=10, pred_step=1, transformer_X=None, transformer_y=None, include_interactions=False, interactions_degree=2, lasso=False, lars=False, **kwargs): if lasso and lars: base_estimator = LassoLars() elif lasso and not lars: base_estimator = Lasso() else: base_estimator = LinearRegression() super().__init__(base_estimator=base_estimator, auto_order=auto_order, exog_order=exog_order, pred_step=pred_step, transformer_X=transformer_X, transformer_y=transformer_y, include_interactions=include_interactions, interactions_degree=interactions_degree, **kwargs) self.lasso = lasso self.lars = lars self.cv_is_fitted_ = False @property def coef_(self): return self.base_estimator_fitted_.coef_ @cached_property @requires_processor_fitted def fitted_values_(self): fitted_vals = np.matmul(self.train_features_, self.coef_) return fitted_vals @cached_property @requires_processor_fitted def sigma_sq_(self): n, p = self.train_shape_ diff = self.train_target_ - self.fitted_values_ sigma_sq = diff.dot(diff) / (n - p) return sigma_sq @cached_property @requires_processor_fitted def inv_XTX_(self): inv_XTX = np.linalg.inv( np.matmul(self.train_features_.transpose(), self.train_features_)) return inv_XTX @cached_property @not_implemented_for_lasso def standard_error_(self, squared=False): mse = np.diag(self.sigma_sq_ * self.inv_XTX_) if squared: return mse else: return np.sqrt(mse) @cached_property @not_implemented_for_lasso @requires_processor_fitted def pvals_(self): from scipy.stats import t n, p = self.train_features_.shape df = n - p test_stat = np.abs(self.coef_) / self.standard_error_ pval = np.around(2 * t.sf(test_stat, df), decimals=3) return pval @cached_property @not_implemented_for_lasso def pval_df_(self): check_is_fitted(self) pval_df = self._format_df(self.pvals_) return pval_df @cached_property def coef_df_(self): check_is_fitted(self) coef_df = self._format_df(self.coef_) return coef_df @cached_property def train_errors(self): yhat = self.predict(self.train_features_) return (self.train_target_ - yhat) @not_implemented_for_lasso def compute_prediction_se(self, X, y=None, squared=False): test_features, y = self.process_data(X, y, fit=False, remove_NA=False) self.prediction_se_mask_ = _get_NA_mask(test_features) test_features = test_features[self.prediction_se_mask_] covar = self.sigma_sq_ * ( test_features.dot(self.inv_XTX_.dot(test_features.transpose())) + np.eye(test_features.shape[0])) if squared: return np.diag(covar) else: return np.sqrt(np.diag(covar)) @not_implemented_for_lasso def compute_prediction_interval(self, X, y=None, level=.95): from scipy.stats import t ypred = self.predict(X, y) self.prediction_se_ = self.compute_prediction_se(X, y) self.prediction_se_ = self.process_predictions( self.prediction_se_, Vx=X[self.vx_colname][self.prediction_se_mask_], inverse_transform_y=False) n, p = self.train_features_.shape lower_z, upper_z = t.interval(level, n - p) pred_interval = { 'ypred': ypred, 'lower': ypred + (lower_z * self.prediction_se_), 'upper': ypred + (upper_z * self.prediction_se_) } return pred_interval def _format_df(self, vals, decimals=3): ar_names = np.array( ["ar" + str(i) for i in range(self.auto_order_steps_)]) exog_names = np.concatenate( [[x + str(i) for i in range(self.exog_order_steps_)] for x in self.train_features_cols_]).T names = np.concatenate([ar_names, exog_names]) vals_no_interactions = pd.Series(vals[:len(names)], index=names) df = pd.DataFrame({ col: vals_no_interactions[vals_no_interactions.index.str.contains( '^' + col + '[0-9]+$')].reset_index(drop=True) for col in self.train_features_cols_.insert(0, 'ar') }) if vals.shape[0] > len(names) and self.include_interactions: powers = self.interactions_processor_.powers_ n_features = self.train_features_cols_.shape[0] colnames = self.train_features_cols_.insert(0, 'ar') interaction_masks = powers[n_features + 1:].astype(bool) interaction_colnames = [ '_'.join(colnames[mask].tolist()) for mask in interaction_masks ] interaction_names = np.concatenate( [[x + str(i) for i in range(self.exog_order_steps_)] for x in interaction_colnames]) interactions = pd.Series(vals[len(names):len(names) + len(interaction_names)], index=interaction_names) interactions_df = pd.DataFrame({ col: interactions[interactions.index.str.contains( '^' + col + '[0-9]+$')].reset_index(drop=True) for col in interaction_colnames }) df = pd.concat([df, interactions_df], axis=1) if self.seasonality: seasonality_names = ('sin_yr', 'cos_yr', 'sin_day', 'cos_day') seasonality_df = pd.DataFrame({ seasonality_names[i]: [vals[-i]] for i in range(len(seasonality_names)) }) df = pd.concat([df, seasonality_df], axis=1) # Set index to minutes lag df.set_index(np.arange(0, self.exog_order, step=self.time_res_minutes_).astype(int), inplace=True) df.index.set_names('lag', inplace=True) if decimals is not None: df = df.round(decimals) return df @lasso_method def fit_cv(self, X, y, storm_level=0, time_level=1, vx_colname='vx_gse', n_splits=5, **cv_params): self._prefit(storm_level=storm_level, time_level=time_level, vx_colname=vx_colname) features, target = self.process_data(X, y, fit=True) if self.lars: self.base_estimator_fitted_ = LassoLarsCV() else: self.base_estimator_fitted_ = LassoCV() cv = GroupKFold(n_splits=n_splits) self.cv_split_ = list( cv.split(features, target, groups=self.train_storms_)) self.base_estimator_fitted_.set_params(cv=self.cv_split_, **cv_params) self.base_estimator_fitted_.fit(features, target) self.cv_is_fitted_ = True return self