Exemple #1
0
    def run(self,X_exp,y_exp):
        '''
        Serialize the experiment inputs, then run the experiment.

        Creates self.exp_dir if it does not exist, dumps X_exp, y_exp and
        self.exp_name to "<exp_name>_input.pkl" inside self.exp_dir, and
        passes the input/output state file paths to _collect_state.

        Arguments:
        - X_exp: input data
        - y_exp: labels in {-1,0,1} (0=unlabeled)

        Returns:
        - (input_state, output_state) as returned by _collect_state
        '''
        if not pathexists(self.exp_dir):
            os.makedirs(self.exp_dir)

        # both state files live side by side in exp_dir, keyed by exp_name
        input_statefile  = pathjoin(self.exp_dir,self.exp_name+'_input.pkl')
        output_statefile = pathjoin(self.exp_dir,self.exp_name+'_output.pkl')

        # the input dump is always rewritten, even if one already exists
        jldump({'X_exp':X_exp,'y_exp':y_exp,'exp_name':self.exp_name},
               input_statefile,compress=jlcompress,cache_size=jlcache)

        # _collect_state is called with its default output_exists, so any
        # existing output dump is overwritten
        input_state, output_state = self._collect_state(input_statefile,
                                                        output_statefile)
        return input_state, output_state
Exemple #2
0
    def _collect_state(self,input_statefile,output_statefile,
                       output_exists='overwrite'):
        '''
        Run _run_cv for an experiment (input_statefile) and serialize the
        output (output_statefile). Features are scaled using
        self.scaling_method and unlabeled (y=0) samples are excluded.

        Arguments:
        - input_statefile: path of the input dump; it is indexed with the
          keys 'X_exp' and 'y_exp'
        - output_statefile: path the output state is dumped to

        Keyword Arguments:
        - output_exists: how to treat an existing output dump
            'overwrite' (default): ignore it and overwrite it
            'noupdate': load it and return without updating; if no dump
                        exists the experiment is run as usual
            any other value: load it (if present) and pass it to _run_cv
                        as the initial output state

        Returns:
        - (input_state, output_state)

        On a configuration error (a model in self.model_eval that is listed
        in model_nomulti while the labels are multi-output, an unknown
        scaling method, or an unknown prediction mode) an error message is
        printed and the states collected so far are returned without running
        cross validation or writing output_statefile.
        '''

        input_state  = jlload(input_statefile)
        output_state = {}
        if output_exists != 'overwrite' and pathexists(output_statefile):
            print('Loading existing state from', output_statefile)
            output_state = jlload(output_statefile)
            if output_exists == 'noupdate':
                return input_state, output_state

        # work on copies so the loaded input state is returned unmodified
        X = input_state['X_exp'].copy()
        y = input_state['y_exp'].copy()

        # y is single-output when all of its elements lie along one axis
        # (e.g. shape (N,), (N,1) or (1,N)); anything else is multi-output
        multi_output = y.size != max(y.shape)

        if multi_output:
            # error out if incompatible models in model_eval
            for model in self.model_eval:
                if model in model_nomulti:
                    print('Error: model %s not compatible with multi-output labels'%model)
                    return input_state,output_state

        if self.scaling_method=='Normalize':
            scale_fn = Normalizer(norm='l2').fit_transform
        elif self.scaling_method=='MinMax':
            scale_fn = MinMaxScaler().fit_transform
        elif self.scaling_method=='Standard':
            scale_fn = StandardScaler().fit_transform
        elif self.scaling_method is None:
            scale_fn = lambda X: X
        else:
            print('Error: unknown scaling method "%s"'%self.scaling_method)
            return input_state,output_state

        # NOTE: scaling runs before the unlabeled samples are removed, so
        # the scaler is fit on labeled and unlabeled samples alike
        print('Scaling features using method "%s"'%self.scaling_method)
        X = scale_fn(X)

        # remove unlabeled samples
        if multi_output:
            # a sample is labeled if any of its outputs is nonzero
            labmask = (y!=0).any(axis=1)
            y = y[labmask,:]
        else:
            # flatten first so that a column/row vector y yields a 1-d mask;
            # a 2-d mask cannot be used to select the rows of X below
            y = y.ravel()
            labmask = y!=0
            y = y[labmask]
        X = X[labmask,:]

        # make sure X,y are valid after scaling/masking operations
        check_X_y(X,y,multi_output=multi_output)

        # get number of *labeled* samples
        N = len(y)

        # cv_id=='loo' takes precedence over pred_mode
        if self.cv_id == 'loo':
            cv = LeaveOneOut(N)
        elif self.pred_mode == 'clf':
            cv = StratifiedKFold(y,n_folds=self.train_folds,random_state=train_state)
        elif self.pred_mode == 'reg':
            cv = KFold(n=N,n_folds=self.train_folds,random_state=train_state,shuffle=True)
        else:
            # without this branch cv would be unbound in the _run_cv call
            print('Error: unknown prediction mode "%s"'%self.pred_mode)
            return input_state,output_state

        output_state = self._run_cv(X,y,cv,output_state)
        output_state.update({'cv':cv,'cv_id':self.cv_id,'labmask':labmask,
                             'scaling_method':self.scaling_method,
                             'model_eval':self.model_eval,'model_classes':2,
                             'model_features':X.shape[1]})

        jldump(output_state,output_statefile,compress=jlcompress,
               cache_size=jlcache)

        return input_state, output_state