def run(self,X_exp,y_exp):
    '''
    Serialize the inputs of an experiment, then evaluate it.

    X_exp: input data
    y_exp: labels in {-1,0,1}, where 0 marks an unlabeled sample

    Creates self.exp_dir if it does not exist yet, dumps X_exp, y_exp and
    self.exp_name to the input state file (exp_name+'_input.pkl' inside
    exp_dir), and passes that file together with the output state file
    (exp_name+'_output.pkl' inside exp_dir) to _collect_state.

    Returns the (input_state, output_state) pair given by _collect_state.
    '''
    exp_dir = self.exp_dir
    exp_name = self.exp_name

    # the experiment directory has to exist before anything is dumped
    if not pathexists(exp_dir):
        os.makedirs(exp_dir)

    in_path = pathjoin(exp_dir, exp_name+'_input.pkl')
    out_path = pathjoin(exp_dir, exp_name+'_output.pkl')

    payload = {'X_exp':X_exp,'y_exp':y_exp,'exp_name':exp_name}
    jldump(payload, in_path, compress=jlcompress, cache_size=jlcache)

    in_state, out_state = self._collect_state(in_path, out_path)
    return in_state, out_state
def _collect_state(self,input_statefile,output_statefile,
                   output_exists='overwrite'):
    '''
    Run _run_cv for an experiment (input_statefile) and serialize the
    output (output_statefile).

    Features are scaled using self.scaling_method ('Normalize', 'MinMax',
    'Standard' or None). Scaling is applied to all samples, and unlabeled
    (y=0) samples are excluded afterwards.

    input_statefile:  dump holding at least the 'X_exp' and 'y_exp' entries
    output_statefile: destination of the output state dump
    output_exists:
      'overwrite': ignore any existing output dump and overwrite it
      'noupdate':  if an output dump exists, read it and return without
                   updating; otherwise run as usual
      any other value: if an output dump exists, load it and pass it to
                   _run_cv as the starting output state

    Returns (input_state, output_state). On a configuration error
    (model incompatible with multi-output labels, unknown scaling method,
    unknown prediction mode) a message is printed and the states gathered
    so far are returned without running _run_cv or writing the dump.
    '''
    input_state = jlload(input_statefile)
    output_state = {}
    if output_exists != 'overwrite' and pathexists(output_statefile):
        print('Loading existing state from', output_statefile)
        output_state = jlload(output_statefile)
        if output_exists == 'noupdate':
            return input_state, output_state

    # work on copies so the loaded input state is returned untouched
    X = input_state['X_exp'].copy()
    y = input_state['y_exp'].copy()

    # labels are multi-output unless y is effectively one-dimensional
    multi_output = y.size != max(y.shape)
    if multi_output:
        # error out if incompatible models in model_eval
        for model in self.model_eval:
            if model in model_nomulti:
                print('Error: model %s not compatible with multi-output labels'%model)
                return input_state,output_state

    if self.scaling_method=='Normalize':
        scale_fn = Normalizer(norm='l2').fit_transform
    elif self.scaling_method=='MinMax':
        scale_fn = MinMaxScaler().fit_transform
    elif self.scaling_method=='Standard':
        scale_fn = StandardScaler().fit_transform
    elif self.scaling_method is None:
        scale_fn = lambda X: X
    else:
        print('Error: unknown scaling method "%s"'%self.scaling_method)
        return input_state,output_state

    print('Scaling features using method "%s"'%self.scaling_method)
    X = scale_fn(X)

    # remove unlabeled samples
    if multi_output:
        # keep a sample if at least one of its outputs is labeled
        labmask = (y!=0).any(axis=1)
        y = y[labmask,:]
    else:
        labmask = y!=0
        y = y[labmask].ravel()
    X = X[labmask,:]

    # make sure X,y are valid after scaling/masking operations
    check_X_y(X,y,multi_output=multi_output)

    # get number of *labeled* samples
    N = len(y)
    if self.cv_id == 'loo':
        cv = LeaveOneOut(N)
    elif self.pred_mode == 'clf':
        # NOTE(review): random_state is passed without shuffle here, unlike
        # the KFold branch below -- confirm whether shuffling was intended
        cv = StratifiedKFold(y,n_folds=self.train_folds,
                             random_state=train_state)
    elif self.pred_mode == 'reg':
        cv = KFold(n=N,n_folds=self.train_folds,
                   random_state=train_state,shuffle=True)
    else:
        # without this branch cv would be unbound and the _run_cv call
        # below would fail with an UnboundLocalError
        print('Error: unknown prediction mode "%s"'%self.pred_mode)
        return input_state,output_state

    output_state = self._run_cv(X,y,cv,output_state)

    # record the settings used for this run alongside the cv results
    # NOTE(review): model_classes is hard-coded to 2 -- presumably binary
    # {-1,1} labels; confirm this also holds for multi-output labels
    output_state.update({'cv':cv,'cv_id':self.cv_id,'labmask':labmask,
                         'scaling_method':self.scaling_method,
                         'model_eval':self.model_eval,'model_classes':2,
                         'model_features':X.shape[1]})

    jldump(output_state,output_statefile,compress=jlcompress,
           cache_size=jlcache)

    return input_state, output_state