def __LaunchInferTask(self, strategy, InputDir, OutputDir):
    """Infer with the best parameters of every variant of ``strategy``.

    The tuned parameters are read as JSON from ``<InputDir>/params.txt`` and
    the holdout / submit test sets from ``<InputDir>/holdout`` and
    ``<InputDir>/submit``. For each variant stored under ``strategy`` a model
    is built from ``self._d_strategy[strategy]`` with that variant's best
    parameters and its ``infer`` method is called. Each variant writes into
    its own directory ``<OutputDir>/strategy#<strategy>:<key#value:...>``,
    which is created when missing.

    The last argument passed to ``infer`` is True only when
    ``self._level > 1``.

    Returns None.
    """
    fmt = self._data_format

    ## load best parameters
    with open('%s/params.txt' % InputDir, 'r') as param_file:
        tuned = json.load(param_file)

    ## load holdout test data
    HoldoutData = DataUtil.load('%s/holdout/test.%s' % (InputDir, fmt),
                                format=fmt)
    ## load submit test data
    SubmitData = DataUtil.load('%s/submit/test.%s' % (InputDir, fmt),
                               format=fmt)
    print('\n---- Load holdout/submit data done. ----\n')

    for entry in tuned[strategy]:
        fields = entry['variant']
        print(
            '\n==== Model inferring for strategy %s, variant %s begins. ====\n'
            % (strategy, fields))

        ## "key#value" pairs of the variant, joined into one tag
        tag = ':'.join('%s#%s' % (key, value)
                       for key, value in fields.items())
        winner = entry['best']
        cv_score = winner['mean']
        chosen = winner['params']
        print('\n---- Best parameter for variant %s ---- \n' % entry)
        print(cv_score, chosen)
        print('\n-------- \n')

        ## every variant gets its own output directory
        head = 'strategy#%s:%s' % (strategy, tag)
        target_dir = '%s/%s' % (OutputDir, head)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        model = self._d_strategy[strategy](chosen, self._kfold, InputDir,
                                           target_dir, fmt)
        model.infer(head, HoldoutData, SubmitData, bool(self._level > 1))

        print(' \n Strategy %s, variant %s, cv score %s \n' %
              (strategy, entry, winner['mean']))
        print(
            '\n==== Model inferring for strategy %s, variant %s ends. ====\n'
            % (strategy, fields))

        ## drop the model before the next variant is built
        del model
        gc.collect()

    return
def train(self, importance=False):
    """Fit and evaluate the model on every cross-validation fold.

    For each fold the train / test files under ``<InputDir>/kfold/<fold>``
    are loaded into ``self.TrainData`` / ``self.TestData``, then
    ``self.__fit()`` and ``self.__predict()`` are called, and finally both
    frames are written under ``<OutputDir>/kfold/<fold>`` (created when
    missing).

    ``importance`` is accepted but not used by this method.

    Returns a dict mapping each fold number to the value returned by
    ``self.__predict()`` for that fold.
    """
    print('\n parameters %s \n' % self.parameters)

    fold_scores = {}
    for fold in range(self.kfold):
        print('\n---- fold %s begins.\n' % fold)

        ## load data
        train_path = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                               self.data_format)
        test_path = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                             self.data_format)
        self.TrainData = DataUtil.load(train_path, format=self.data_format)
        self.TestData = DataUtil.load(test_path, format=self.data_format)

        ## train and predict on valid
        self.__fit()
        fold_scores[fold] = self.__predict()

        ## save
        fold_dir = '%s/kfold/%s' % (self.OutputDir, fold)
        if not os.path.exists(fold_dir):
            os.makedirs(fold_dir)
        DataUtil.save(self.TrainData,
                      '%s/train.%s' % (fold_dir, self.data_format),
                      format=self.data_format)
        DataUtil.save(self.TestData,
                      '%s/test.%s' % (fold_dir, self.data_format),
                      format=self.data_format)

        print('\n---- Fold %d done. ----\n' % fold)

    return fold_scores
def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
    """Refit on every fold and write fold, holdout and submit predictions.

    For each fold the model is refitted through ``self.__fit()`` and
    ``self._model.predict`` is applied to the fold's test data, to
    ``HoldoutData`` and to ``SubmitData`` (always on the columns listed in
    ``self._l_train_columns``).

    Written outputs, all under ``self.OutputDir``:

    * ``kfold/<fold>/train|test`` -- the stacked per-fold predictions, split
      so that ``test`` holds the rows predicted in that fold and ``train``
      holds the rows of all other folds; the prediction column is ``head``.
    * ``holdout/test`` -- one ``fold<k>`` column per fold plus ``head``, the
      row-wise mean of those columns.
    * ``submit/test`` -- same layout, without the ``index`` and
      ``Item_Outlet_Sales`` columns.

    When ``metric_pk`` is true, the RMSE of every input column against
    ``Item_Outlet_Sales`` on the holdout set is printed next to the RMSE of
    the averaged prediction.

    Returns None.
    """
    target = 'Item_Outlet_Sales'

    def _rmse(diff):
        ## root of the mean squared difference
        return np.sqrt(np.sum(diff * diff) / len(diff))

    ## containers for the holdout / submit predictions of every fold
    holdout_pred = pd.DataFrame(index=HoldoutData.index)
    holdout_pred['index'] = HoldoutData['index']
    holdout_pred[target] = HoldoutData[target]
    submit_pred = pd.DataFrame(index=SubmitData.index)

    fold_frames = []
    for fold in range(self.kfold):
        ## load
        train_path = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                               self.data_format)
        test_path = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                             self.data_format)
        self.TrainData = DataUtil.load(train_path, format=self.data_format)
        self.TestData = DataUtil.load(test_path, format=self.data_format)

        ## skeleton of this fold's prediction table
        frame = pd.DataFrame(index=self.TestData.index)
        frame['index'] = self.TestData['index']
        frame[target] = self.TestData[target]
        frame['fold'] = fold

        ## fit
        self.__fit()

        ## inferring
        frame[head] = self._model.predict(
            self.TestData[self._l_train_columns])
        holdout_pred['fold%s' % (fold)] = self._model.predict(
            HoldoutData[self._l_train_columns])
        submit_pred['fold%s' % fold] = self._model.predict(
            SubmitData[self._l_train_columns])

        fold_frames.append(frame)

    ## aggregate folds data
    stacked = pd.concat(fold_frames, axis=0, ignore_index=True)

    ## save for folds data
    for fold in range(self.kfold):
        fold_dir = '%s/kfold/%s' % (self.OutputDir, fold)
        if not os.path.exists(fold_dir):
            os.makedirs(fold_dir)
        train_path = '%s/train.%s' % (fold_dir, self.data_format)
        test_path = '%s/test.%s' % (fold_dir, self.data_format)
        other_folds = stacked[stacked['fold'] != fold]
        this_fold = stacked[stacked['fold'] == fold]
        DataUtil.save(other_folds, train_path, format=self.data_format)
        DataUtil.save(this_fold, test_path, format=self.data_format)

    ## per-fold prediction columns, collected before `head` is added
    fold_cols = [
        name for name in holdout_pred.columns if name.startswith('fold')
    ]

    ## save for holdout data
    holdout_pred[head] = holdout_pred[fold_cols].mean(axis=1)
    holdout_dir = '%s/holdout' % self.OutputDir
    if not os.path.exists(holdout_dir):
        os.makedirs(holdout_dir)
    DataUtil.save(holdout_pred,
                  '%s/test.%s' % (holdout_dir, self.data_format),
                  format=self.data_format)

    ## save for submit data
    submit_pred[head] = submit_pred[fold_cols].mean(axis=1)
    submit_dir = '%s/submit' % self.OutputDir
    if not os.path.exists(submit_dir):
        os.makedirs(submit_dir)
    DataUtil.save(submit_pred,
                  '%s/test.%s' % (submit_dir, self.data_format),
                  format=self.data_format)

    ## metric PK
    if metric_pk:
        d_metric = {
            col: _rmse(HoldoutData[col] - HoldoutData[target])
            for col in self._l_train_columns
        }
        ensemble_metric = _rmse(holdout_pred[head] - holdout_pred[target])
        print('\n===== metric pk result ====\n')
        print('single model: %s, ensemble model %s: %s' %
              (d_metric, head, ensemble_metric))
        print('\n===== metric pk result ====\n')

    return
def run(self, tasks, encode_type='simple'):
    """Apply the engineering ``tasks`` to the submit, k-fold and holdout splits.

    Four lookup tables are first unpickled from ``<InputDir>/holdout``
    (``category.pkl``, ``featmap.pkl``, ``idmean.pkl``, ``idmedian.pkl``)
    into ``self._d_values``, ``self._d_flat_values``,
    ``self._d_id_mean_values`` and ``self._d_id_median_values``.

    Then, for every split, ``train.pkl`` / ``test.pkl`` are loaded into
    ``self.TrainData`` / ``self.TestData``, each task is passed to
    ``self.__LaunchTask`` and the two frames are written as ``train.csv`` /
    ``test.csv``:

    * ``<InputDir>/kfold``         -> ``<OutputDir>/submit``
    * ``<InputDir>/kfold/<fold>``  -> ``<OutputDir>/kfold/<fold>``
    * ``<InputDir>/holdout``       -> ``<OutputDir>/holdout``

    Args:
        tasks: iterable of tasks; each element is forwarded as-is to
            ``self.__LaunchTask``. It is replayed once per split.
        encode_type: forwarded unchanged to ``self.__LaunchTask``.

    Returns None. After the call ``self.TrainData`` / ``self.TestData``
    hold the engineered holdout split, which is processed last.
    """
    ## The tasks are replayed for every split; materialise them so that a
    ## one-shot iterator is not silently exhausted after the first split.
    tasks = list(tasks)

    def _engineer(input_dir, output_dir):
        ## Load one split, run every task on it and save the result as CSV.
        ## NOTE(security): pickle.load can execute arbitrary code; these
        ## files are assumed to be trusted outputs of an earlier stage.
        with open('%s/train.pkl' % input_dir, 'rb') as tr_file, \
                open('%s/test.pkl' % input_dir, 'rb') as te_file:
            self.TrainData = pickle.load(tr_file)
            self.TestData = pickle.load(te_file)

        for task in tasks:
            self.__LaunchTask(task, encode_type=encode_type)

        ## exist_ok avoids the check-then-create race
        os.makedirs(output_dir, exist_ok=True)
        DataUtil.save(self.TrainData, '%s/train.csv' % output_dir,
                      format='csv')
        DataUtil.save(self.TestData, '%s/test.csv' % output_dir,
                      format='csv')

    print('\n==== Engineering for kfold begins ====')

    ## load category values
    ## NOTE(security): same trust assumption as above for these pickles.
    with open('%s/holdout/category.pkl' % self._InputDir, 'rb') as ca_file, \
            open('%s/holdout/featmap.pkl' % self._InputDir, 'rb') as fe_file, \
            open('%s/holdout/idmean.pkl' % self._InputDir, 'rb') as im_file, \
            open('%s/holdout/idmedian.pkl' % self._InputDir, 'rb') as im2_file:
        self._d_values = pickle.load(ca_file)
        self._d_flat_values = pickle.load(fe_file)
        self._d_id_mean_values = pickle.load(im_file)
        self._d_id_median_values = pickle.load(im2_file)

    KFoldInputDir = '%s/kfold' % self._InputDir
    KFoldOutputDir = '%s/kfold' % self._OutputDir

    #### for submit, public test
    _engineer(KFoldInputDir, '%s/submit' % self._OutputDir)

    #### for kfold, local CV
    for fold in range(self._kfold):
        print('\n==== fold %s begins ...' % fold)
        _engineer('%s/%s' % (KFoldInputDir, fold),
                  '%s/%s' % (KFoldOutputDir, fold))
        print('\n==== fold %s done.' % fold)
    print('\n==== Engineering for kfold done ====')

    #### for holdout, local test
    print('\n==== Engineering for holdout begins ====')
    _engineer('%s/holdout' % self._InputDir, '%s/holdout' % self._OutputDir)
    print('\n==== Engineering for holdout done ====')

    return
def __LaunchTrainTask(self, strategy, d_params, InputDir, OutputDir):
    """Tune ``strategy`` over every variant and report the best parameters.

    The holdout and submit test files are first copied from ``InputDir`` to
    ``OutputDir`` through a ``DataUtil`` load / save round trip.

    ``d_params['variants']`` is a list of dicts and ``d_params['params']`` a
    list of parameter dicts. A parameter dict belongs to a variant when it
    contains every key of the variant with an equal value. Each matching
    parameter dict is used to build ``self._d_strategy[strategy]`` and
    ``train()`` is called on it; the mean and standard deviation of the
    values of the returned dict are computed, and the parameter dict with
    the lowest mean is kept as best.

    Returns a list with one dict per variant:
    ``{'variant': ..., 'result': [{'params': ..., 'eval': ...}, ...],
    'best': {'mean': ..., 'std': ..., 'params': ...}}``. When no parameter
    dict has a mean below 65535 (including when none matches), ``best`` keeps
    its initial values: mean 65535, std -1 and empty params.
    """
    fmt = self._data_format

    ## copy holdout data file
    holdout_src = '%s/holdout/test.%s' % (InputDir, fmt)
    holdout_dir = '%s/holdout' % OutputDir
    if not os.path.exists(holdout_dir):
        os.makedirs(holdout_dir)
    DataUtil.save(DataUtil.load(holdout_src, fmt),
                  '%s/test.%s' % (holdout_dir, fmt),
                  format=fmt)

    ## copy submit data file
    submit_src = '%s/submit/test.%s' % (InputDir, fmt)
    submit_dir = '%s/submit' % OutputDir
    if not os.path.exists(submit_dir):
        os.makedirs(submit_dir)
    DataUtil.save(DataUtil.load(submit_src, fmt),
                  '%s/test.%s' % (submit_dir, fmt),
                  format=fmt)
    print('\n ---- Copying holdout/submit data done.\n')

    ## run each group of parameters of each algorithm
    results = []
    for variant in d_params['variants']:
        ## parameter sets that agree with the variant on all of its keys
        matching = [
            candidate for candidate in d_params['params']
            if all((key in candidate) and (variant[key] == candidate[key])
                   for key in variant)
        ]

        print(
            '\n==== Model tuning for strategy %s, variant %s begins... ====\n'
            % (strategy, variant))

        evaluations = []
        best_rmse, best_std, best_param = 65535, -1, {}
        for candidate in matching:
            model = self._d_strategy[strategy](candidate, self._kfold,
                                               InputDir, OutputDir, fmt)
            fold_rmse = model.train()
            evaluations.append({'params': candidate, 'eval': fold_rmse})
            del model
            gc.collect()

            ## summarise the per-fold scores of this parameter set
            scores = list(fold_rmse.values())
            mean = np.mean(scores)
            std = np.std(scores)
            if mean < best_rmse:
                best_rmse, best_param, best_std = mean, candidate, std

        print(
            '\n==== Model tuning for strategy %s, variant %s ends. ====\n'
            % (strategy, variant))

        results.append({
            'variant': variant,
            'result': evaluations,
            'best': {
                'mean': best_rmse,
                'std': best_std,
                'params': best_param
            }
        })

    return results
def __LaunchAggregateTask(self, l_variant_model, InputDir, OutputDir):
    """Merge the prediction files of several models into one table per split.

    Each entry ``mf`` of ``l_variant_model`` is used both as the name of a
    sub-directory of ``InputDir`` and as the name of the prediction column
    read from that directory's files. For every fold's train / test file,
    for the holdout file and for the submit file, the ``mf`` columns of all
    models are placed side by side in a new frame indexed like the first
    model's frame. Fold and holdout outputs also carry the ``index`` and
    ``Item_Outlet_Sales`` columns of the first model; the submit output does
    not.

    Outputs go to ``<OutputDir>/kfold/<fold>``, ``<OutputDir>/holdout`` and
    ``<OutputDir>/submit``. They are saved with ``self._data_format`` so the
    file content matches the extension in the file name; previously the
    content was always CSV whatever the extension said.

    Args:
        l_variant_model: sequence of model names (see above).
        InputDir: directory holding one sub-directory per model.
        OutputDir: directory receiving the merged files.

    Raises:
        IndexError: if ``l_variant_model`` is empty.

    Returns None.
    """
    fmt = self._data_format

    def _combine(frames, with_target):
        ## Put every model's prediction column side by side, aligned on the
        ## index of the first frame.
        first = frames[0]
        merged = pd.DataFrame(index=first.index)
        if with_target:
            merged['index'] = first['index']
            merged['Item_Outlet_Sales'] = first['Item_Outlet_Sales']
        for mf, frame in zip(l_variant_model, frames):
            merged[mf] = frame[mf]
        return merged

    #### for folds data
    for fold in range(self._kfold):
        print('\n Aggregate for fold %s begins. \n' % fold)
        l_train_fold = []
        l_test_fold = []
        ## load
        for mf in l_variant_model:
            TrainFile = '%s/%s/kfold/%s/train.%s' % (InputDir, mf, fold, fmt)
            TestFile = '%s/%s/kfold/%s/test.%s' % (InputDir, mf, fold, fmt)
            l_train_fold.append(DataUtil.load(TrainFile, fmt))
            l_test_fold.append(DataUtil.load(TestFile, fmt))
        print('\n Load data for fold %s done. \n' % fold)

        ## aggregate for train / test
        TrainFoldData = _combine(l_train_fold, with_target=True)
        TestFoldData = _combine(l_test_fold, with_target=True)

        ## save
        FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
        os.makedirs(FoldOutputDir, exist_ok=True)
        DataUtil.save(TrainFoldData,
                      '%s/train.%s' % (FoldOutputDir, fmt),
                      format=fmt)
        DataUtil.save(TestFoldData,
                      '%s/test.%s' % (FoldOutputDir, fmt),
                      format=fmt)
        print('\n Aggregate or fold %s done. \n' % fold)
    print('\n Aggregate kfold data fone.\n')

    ##### aggregate for holdout
    l_holdout = [
        DataUtil.load('%s/%s/holdout/test.%s' % (InputDir, mf, fmt), fmt)
        for mf in l_variant_model
    ]
    HoldoutData = _combine(l_holdout, with_target=True)
    ## save
    HoldoutOutputDir = '%s/holdout' % OutputDir
    os.makedirs(HoldoutOutputDir, exist_ok=True)
    DataUtil.save(HoldoutData,
                  '%s/test.%s' % (HoldoutOutputDir, fmt),
                  format=fmt)
    print('\n Aggregate for holdout data done.\n')

    #### aggregate for submit data
    l_submit = [
        DataUtil.load('%s/%s/submit/test.%s' % (InputDir, mf, fmt), fmt)
        for mf in l_variant_model
    ]
    ## submit data has no target, only the prediction columns are merged
    SubmitData = _combine(l_submit, with_target=False)
    ## save
    SubmitOutputDir = '%s/submit' % OutputDir
    os.makedirs(SubmitOutputDir, exist_ok=True)
    DataUtil.save(SubmitData,
                  '%s/test.%s' % (SubmitOutputDir, fmt),
                  format=fmt)
    print('\n Aggregate for submit data done.\n')

    return