def crossgrid(self, griddic, cv=None):
    """
    perform a grid search over the pipeline parameters in griddic and fit
    it on self.X / self.Y; the fitted search is stored in self._gridsearch

    inputs
        griddic:    dictionary of the form
                    dict(ESTIMATORNAME1__PARAMETER1 = PARAMLST1,
                         ESTIMATORNAME2__PARAMETER2 = PARAMLST2,
                         ...
                         )
        cv:         crossvalidation array of the form [IDn1, IDn2,...IDnN];
                    when given it overwrites the procedure stored in self.cv,
                    when omitted and self.cv is empty a leave-20-out
                    procedure built from self.Y is stored instead
    """
    # initialize cv procedure
    # identity checks: ``!= None`` on an array-like cv is an element-wise test
    if cv is not None:
        # if cv not empty overwrite existing cv procedure
        self.cv = cv
    elif self.cv is None:
        # if no cv procedure has been specified set the classical l20o
        print('ATTENTION:\tNo CV procedure specified, proceeding with reduced l20o.')
        # the ``cv`` parameter shadows the module-level ``cv`` helper (and is
        # None in this branch), so the helper is looked up through globals();
        # the result is kept on self.cv instead of a throw-away local
        self.cv = globals()['cv'].leave_x_out(self.Y, 20)

    # initialize the _gridsearch attribute
    # need to include how to create the dictionary from the input
    # NOTE(review): self.cv is not handed to GridSearchCV, so the search runs
    # with the library's default splitting -- confirm the format of self.cv
    # before forwarding it through the ``cv=`` argument.
    self._gridsearch = sk.grid_search.GridSearchCV(self._pipe, griddic, n_jobs=-1)  # @UndefinedVariable

    # fit the CV grid
    self._gridsearch.fit(self.X, self.Y)
# build a pipeline that consists of the gradient boosting step only
pipe.setpipe(['GB'])

# cvcounter test
print('Pipe.cvcounter =\t'+str(pipe.cvcounter))
print("X size: " + str(np.shape(X)))
print("Y size: " + str(np.shape(Y)))

# test initialization of grid parameters
# FFS + RF dic
# griddic = dict(FFS__k=[50,100],RF__n_estimators=[100,200])
# GB dic
# "auto" must be lower case: the estimator compares the string literally
# NOTE(review): 100. is a float, which the estimator reads as a fraction of
# the features -- presumably the integer 100 was meant; confirm.
griddic = dict(GB__n_estimators=[100,200,300],GB__learning_rate=[0.1,0.3,0.5],GB__max_features=["auto",100.],GB__max_depth=[3,5,10])
#griddic = dict();
# crossgrid takes the CV procedure through its ``cv`` keyword
pipe.crossgrid(griddic,cv=cv.leave_x_out(pipe.Y, 50, nsamples=100))
#pipe.crossgrid(griddic,cv=cv.leave_x_out(pipe.Y, 20, nsamples=300))

# report the outcome of the grid search
print(pipe.return_score())
print(pipe._gridsearch.grid_scores_)
print(pipe._pipe.named_steps.keys())
print(pipe._pipe)
pipe.return_rank()
pipe.return_ranks(.5,printtofile=True)

# prediction
df2 = pd.read_csv('data/test.csv')
X2, IDS, names2 = filtertrain(df2,'test')
Y2 = pipe._gridsearch.predict_proba(X2)
wids = ['week4','week5','week6','week10'] elif organ == 'liver' and isTargeted == True: wids = ['4w','5w','6w','10w'] cvweeks = wids[0:2] pns,cns,Xdata,Ydata = jmport.importdata(organ,isTargeted=isTargeted,LogConc=LogConc) _, X, Y, _ = jmport.filterd(pns,Xdata,Ydata,wids) # run automated tests #run an initialization test for a pipeline with pca and fda pipe = Pipe(X,Y,cns,wids,organ=organ,isTargeted=isTargeted,LogConc=LogConc) pipe.setpipe(['FFS','GB']) # cvcounter test print('Pipe.cvcounter =\t'+str(pipe.cvcounter)) print(np.shape(np.array(X))) # test initialization of grid parameters #griddic = dict(FFS__k=[10,20,40,50,100,130,200,750,800],RF__n_estimators=[10,100,200,300],RF__criterion=["gini","entropy"],RF__max_features=["sqrt","log2"]) griddic = dict(FFS__k=[10,20,40,50,100,130,200,750,800],GB__n_estimators=[100,200,300,600],GB__learning_rate=[0.1,0.3,0.5],GB__max_features=["auto"],GB__max_depth=[3,5,10]) pipe.crossgrid(griddic,cv=cv.leave_x_out(pipe.Y,20,nsamples=200,testlst=[i for i,n in enumerate(pns) if any(j in n for j in cvweeks)])) print(pipe.return_score()) print(pipe._gridsearch.grid_scores_) print(pipe._pipe.named_steps.keys()) pipe.return_rank() pipe.return_ranks(.9,printtofile=True)
# cvcounter test
print('Pipe.cvcounter =\t' + str(pipe.cvcounter))
print("X size: " + str(np.shape(X)))
print("Y size: " + str(np.shape(Y)))

# test initialization of grid parameters
# FFS + RF dic
# griddic = dict(FFS__k=[50,100],RF__n_estimators=[100,200])
# GB dic
# "auto" must be lower case: the estimator compares the string literally
# NOTE(review): 100. is a float, which the estimator reads as a fraction of
# the features -- presumably the integer 100 was meant; confirm.
griddic = dict(GB__n_estimators=[100, 200, 300],
               GB__learning_rate=[0.1, 0.3, 0.5],
               GB__max_features=["auto", 100.],
               GB__max_depth=[3, 5, 10])
#griddic = dict();
# crossgrid takes the CV procedure through its ``cv`` keyword
pipe.crossgrid(griddic, cv=cv.leave_x_out(pipe.Y, 50, nsamples=100))
#pipe.crossgrid(griddic,cv=cv.leave_x_out(pipe.Y, 20, nsamples=300))

# report the outcome of the grid search
print(pipe.return_score())
print(pipe._gridsearch.grid_scores_)
print(pipe._pipe.named_steps.keys())
print(pipe._pipe)
pipe.return_rank()
pipe.return_ranks(.5, printtofile=True)

# prediction
df2 = pd.read_csv('data/test.csv')
X2, IDS, names2 = filtertrain(df2, 'test')
Y2 = pipe._gridsearch.predict_proba(X2)