Ejemplo n.º 1
0
    def __LaunchInferTask(self, strategy, InputDir, OutputDir):
        """Run inference for every tuned variant of ``strategy``.

        Reads the JSON file ``<InputDir>/params.txt``, loads the holdout and
        submit test sets once, and then, for each entry of
        ``params[strategy]``, builds the model registered under
        ``self._d_strategy[strategy]`` with that entry's best parameters and
        calls its ``infer`` method.  Every variant writes into its own
        sub-directory of ``OutputDir`` named
        ``strategy#<strategy>:<key>#<value>:...``.

        Args:
            strategy: Key used both in the params file and in
                ``self._d_strategy``.
            InputDir: Directory containing ``params.txt``,
                ``holdout/test.<fmt>`` and ``submit/test.<fmt>``.
            OutputDir: Root directory under which per-variant output
                directories are created.

        Returns:
            None.
        """
        # Load the best parameters found during tuning.  The ``with`` block
        # closes the file, so no explicit close() is needed afterwards.
        with open('%s/params.txt' % InputDir, 'r') as i_file:
            params = json.load(i_file)

        # Load the holdout and submit test data once; they are shared by all
        # variants.
        HoldoutData = DataUtil.load('%s/holdout/test.%s' %
                                    (InputDir, self._data_format),
                                    format=self._data_format)
        SubmitData = DataUtil.load('%s/submit/test.%s' %
                                   (InputDir, self._data_format),
                                   format=self._data_format)
        print('\n---- Load holdout/submit data done. ----\n')

        for variant in params[strategy]:
            VarFields = variant['variant']
            print(
                '\n==== Model inferring for strategy %s, variant %s begins. ====\n'
                % (strategy, VarFields))
            varstr = ':'.join(
                '%s#%s' % (VarKey, VarValue)
                for VarKey, VarValue in VarFields.items())
            best = variant['best']
            score = best['mean']
            BestParams = best['params']
            print('\n---- Best parameter for variant %s ---- \n' % variant)
            print(score, BestParams)
            print('\n-------- \n')

            head = 'strategy#%s:%s' % (strategy, varstr)
            VarOutputDir = '%s/%s' % (OutputDir, head)
            # exist_ok avoids the check-then-create race of
            # "if not exists: makedirs".
            os.makedirs(VarOutputDir, exist_ok=True)
            model = self._d_strategy[strategy](BestParams, self._kfold,
                                               InputDir, VarOutputDir,
                                               self._data_format)
            # The last argument of infer() is switched on only above level 1.
            model.infer(head, HoldoutData, SubmitData,
                        bool(self._level > 1))
            print(' \n Strategy %s, variant %s, cv score %s \n' %
                  (strategy, variant, score))
            print(
                '\n==== Model inferring for strategy %s, variant %s ends. ====\n'
                % (strategy, VarFields))
            # Drop the model before building the next one to keep peak memory
            # down.
            del model
            gc.collect()

        return
Ejemplo n.º 2
0
    def train(self, importance=False):
        """Fit and evaluate the model on every fold.

        For each fold in ``range(self.kfold)`` the fold's train/test files
        are loaded from ``self.InputDir`` into ``self.TrainData`` and
        ``self.TestData``, ``self.__fit()`` and ``self.__predict()`` are run,
        and both data sets are then written to the matching fold directory
        under ``self.OutputDir``.

        Args:
            importance: Accepted but not used by this method.

        Returns:
            dict mapping each fold number to the value returned by
            ``self.__predict()`` for that fold.
        """
        print('\n parameters %s \n' % self.parameters)
        fold_scores = {}
        for fold in range(self.kfold):
            print('\n---- fold %s begins.\n' % fold)

            # Load this fold's train/test split.
            train_path = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                   self.data_format)
            test_path = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                 self.data_format)
            self.TrainData = DataUtil.load(train_path,
                                           format=self.data_format)
            self.TestData = DataUtil.load(test_path, format=self.data_format)

            # Train, then score on the fold's validation split.
            self.__fit()
            fold_scores[fold] = self.__predict()

            # Persist both splits for this fold.
            fold_out = '%s/kfold/%s' % (self.OutputDir, fold)
            if not os.path.exists(fold_out):
                os.makedirs(fold_out)
            train_out = '%s/train.%s' % (fold_out, self.data_format)
            DataUtil.save(self.TrainData, train_out, format=self.data_format)
            test_out = '%s/test.%s' % (fold_out, self.data_format)
            DataUtil.save(self.TestData, test_out, format=self.data_format)

            print('\n---- Fold %d done. ----\n' % fold)

        return fold_scores
Ejemplo n.º 3
0
    def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
        """Refit per fold and write fold, holdout and submit predictions.

        For each fold the fold's train/test files are loaded from
        ``self.InputDir``, ``self.__fit()`` is called, and ``self._model``
        predicts on the fold's test split, on ``HoldoutData`` and on
        ``SubmitData`` using the columns in ``self._l_train_columns``.
        Fold-test predictions go into a column named ``head``; holdout and
        submit predictions are stored per fold as ``fold<k>`` and their
        row-wise mean is stored under ``head``.  All results are written
        below ``self.OutputDir`` (``kfold/<k>``, ``holdout``, ``submit``).

        Args:
            head: Name of the prediction column to produce.
            HoldoutData: Frame with ``index``, ``Item_Outlet_Sales`` and the
                training columns.
            SubmitData: Frame with the training columns.
            metric_pk: When true, print the RMSE of every training column
                against ``Item_Outlet_Sales`` on the holdout data next to
                the RMSE of the averaged prediction.

        Returns:
            None.
        """
        target = 'Item_Outlet_Sales'

        def ensure_dir(path):
            # Create ``path`` unless it is already present.
            if not os.path.exists(path):
                os.makedirs(path)

        def rmse(pred, truth):
            # Root mean squared error between two aligned series.
            delta = pred - truth
            return np.sqrt(np.sum(delta * delta) / len(delta))

        holdout_pred = pd.DataFrame(index=HoldoutData.index)
        holdout_pred['index'] = HoldoutData['index']
        holdout_pred[target] = HoldoutData[target]
        submit_pred = pd.DataFrame(index=SubmitData.index)

        fold_frames = []
        for fold in range(self.kfold):
            # Load this fold's split.
            train_path = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                   self.data_format)
            test_path = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                 self.data_format)
            self.TrainData = DataUtil.load(train_path,
                                           format=self.data_format)
            self.TestData = DataUtil.load(test_path, format=self.data_format)

            # Prepare the fold's result frame, then fit.
            fold_pred = pd.DataFrame(index=self.TestData.index)
            fold_pred['index'] = self.TestData['index']
            fold_pred[target] = self.TestData[target]
            fold_pred['fold'] = fold
            self.__fit()

            # Predict on the fold's test split, the holdout and the submit
            # data.
            features = self._l_train_columns
            fold_pred[head] = self._model.predict(self.TestData[features])
            holdout_pred['fold%s' % (fold)] = self._model.predict(
                HoldoutData[features])
            submit_pred['fold%s' % fold] = self._model.predict(
                SubmitData[features])
            fold_frames.append(fold_pred)

        # Stack the per-fold predictions, then re-split them so that each
        # fold directory gets "its" rows as test and all other rows as train.
        stacked = pd.concat(fold_frames, axis=0, ignore_index=True)
        for fold in range(self.kfold):
            fold_dir = '%s/kfold/%s' % (self.OutputDir, fold)
            ensure_dir(fold_dir)
            other_rows = stacked[stacked['fold'] != fold]
            fold_rows = stacked[stacked['fold'] == fold]
            DataUtil.save(other_rows,
                          '%s/train.%s' % (fold_dir, self.data_format),
                          format=self.data_format)
            DataUtil.save(fold_rows,
                          '%s/test.%s' % (fold_dir, self.data_format),
                          format=self.data_format)

        # Per-fold prediction columns; collected before ``head`` is added.
        fold_cols = [
            name for name in holdout_pred.columns if name.startswith('fold')
        ]

        # Holdout: average the fold predictions and save.
        holdout_pred[head] = holdout_pred[fold_cols].mean(axis=1)
        holdout_dir = '%s/holdout' % self.OutputDir
        ensure_dir(holdout_dir)
        DataUtil.save(holdout_pred,
                      '%s/test.%s' % (holdout_dir, self.data_format),
                      format=self.data_format)

        # Submit: average the fold predictions and save.
        submit_pred[head] = submit_pred[fold_cols].mean(axis=1)
        submit_dir = '%s/submit' % self.OutputDir
        ensure_dir(submit_dir)
        DataUtil.save(submit_pred,
                      '%s/test.%s' % (submit_dir, self.data_format),
                      format=self.data_format)

        # Optional comparison of each input column against the ensemble.
        if metric_pk:
            truth = HoldoutData[target]
            d_metric = {
                col: rmse(HoldoutData[col], truth)
                for col in self._l_train_columns
            }
            ensemble_metric = rmse(holdout_pred[head], holdout_pred[target])
            print('\n===== metric pk result ====\n')
            print('single model: %s, ensemble model %s: %s' %
                  (d_metric, head, ensemble_metric))
            print('\n===== metric pk result ====\n')
Ejemplo n.º 4
0
    def run(self, tasks, encode_type='simple'):
        """Apply the engineering tasks to the submit, k-fold and holdout data.

        The lookup tables are first loaded from ``<InputDir>/holdout``.
        Then, for each data split, ``train.pkl``/``test.pkl`` are loaded
        into ``self.TrainData``/``self.TestData``, every task is run through
        ``self.__LaunchTask`` and the results are saved as
        ``train.csv``/``test.csv``:

        * ``<InputDir>/kfold``         -> ``<OutputDir>/submit``
        * ``<InputDir>/kfold/<fold>``  -> ``<OutputDir>/kfold/<fold>``
        * ``<InputDir>/holdout``       -> ``<OutputDir>/holdout``

        Args:
            tasks: Iterable of tasks forwarded one by one to
                ``self.__LaunchTask``.
            encode_type: Passed through to ``self.__LaunchTask``.

        Returns:
            None.
        """
        # ``tasks`` is replayed once per split; materialise it so that a
        # one-shot iterable is not exhausted after the first split.
        tasks = list(tasks)

        def load_split(input_dir):
            # Load train.pkl / test.pkl of one split into the instance.
            # NOTE(review): pickle.load can execute arbitrary code; these
            # files must come from a trusted source.
            with open('%s/train.pkl' % input_dir, 'rb') as tr_file, \
                    open('%s/test.pkl' % input_dir, 'rb') as te_file:
                self.TrainData = pickle.load(tr_file)
                self.TestData = pickle.load(te_file)

        def engineer_and_save(output_dir):
            # Run every task on the loaded split and write it out as CSV.
            for task in tasks:
                self.__LaunchTask(task, encode_type=encode_type)
            # exist_ok avoids the check-then-create race.
            os.makedirs(output_dir, exist_ok=True)
            DataUtil.save(self.TrainData,
                          '%s/train.csv' % output_dir,
                          format='csv')
            DataUtil.save(self.TestData,
                          '%s/test.csv' % output_dir,
                          format='csv')

        print('\n==== Engineering for kfold begins ====')
        # Load the lookup tables.  The ``with`` block closes all four files,
        # so no explicit close() calls are needed.
        # NOTE(review): pickle.load can execute arbitrary code; these files
        # must come from a trusted source.
        with open('%s/holdout/category.pkl' % self._InputDir, 'rb') as ca_file,\
            open('%s/holdout/featmap.pkl' % self._InputDir, 'rb') as fe_file,\
            open('%s/holdout/idmean.pkl' % self._InputDir, 'rb') as im_file,\
            open('%s/holdout/idmedian.pkl' % self._InputDir, 'rb') as im2_file:
            self._d_values = pickle.load(ca_file)
            self._d_flat_values = pickle.load(fe_file)
            self._d_id_mean_values = pickle.load(im_file)
            self._d_id_median_values = pickle.load(im2_file)

        KFoldInputDir = '%s/kfold' % self._InputDir
        KFoldOutputDir = '%s/kfold' % self._OutputDir

        # Submit / public test: the un-split data in the kfold root.
        load_split(KFoldInputDir)
        engineer_and_save('%s/submit' % self._OutputDir)

        # K-fold / local CV.
        for fold in range(self._kfold):
            print('\n==== fold %s begins ...' % fold)
            load_split('%s/%s' % (KFoldInputDir, fold))
            engineer_and_save('%s/%s' % (KFoldOutputDir, fold))
            print('\n==== fold %s done.' % fold)

        print('\n==== Engineering for kfold done ====')

        # Holdout / local test.
        print('\n==== Engineering for holdout begins ====')
        load_split('%s/holdout' % self._InputDir)
        engineer_and_save('%s/holdout' % self._OutputDir)
        print('\n==== Engineering for holdout done ====')
        return
Ejemplo n.º 5
0
    def __LaunchTrainTask(self, strategy, d_params, InputDir, OutputDir):
        """Tune ``strategy`` over every variant and report the best setting.

        The holdout and submit test files are first copied from ``InputDir``
        to ``OutputDir`` (load followed by save through ``DataUtil``).  Then,
        for each entry of ``d_params['variants']``, every parameter set in
        ``d_params['params']`` that agrees with the variant on all of the
        variant's keys is trained via ``self._d_strategy[strategy]`` and the
        one with the lowest mean of the per-fold values returned by
        ``train()`` is recorded.

        Args:
            strategy: Key into ``self._d_strategy``.
            d_params: Dict with a ``'variants'`` list (dicts of fixed
                key/value pairs) and a ``'params'`` list (candidate
                parameter dicts).
            InputDir: Directory holding ``holdout/`` and ``submit/`` inputs;
                also passed to the model constructor.
            OutputDir: Directory receiving the copied files; also passed to
                the model constructor.

        Returns:
            A list with one dict per variant::

                {'variant': <variant>,
                 'result': [{'params': ..., 'eval': ...}, ...],
                 'best': {'mean': ..., 'std': ..., 'params': ...}}

            When no candidate yields a comparable mean (no matching
            parameter set, or only NaN means) ``'best'`` holds the
            placeholders ``65535``, ``-1`` and ``{}``.
        """
        # Placeholder reported when no candidate could be selected; kept so
        # that existing consumers of the output see the same values.
        NO_RESULT_RMSE = 65535

        # Copy the holdout and submit test files into the output tree.
        for subset in ('holdout', 'submit'):
            SourceFile = '%s/%s/test.%s' % (InputDir, subset,
                                            self._data_format)
            SubsetOutputDir = '%s/%s' % (OutputDir, subset)
            # exist_ok avoids the check-then-create race.
            os.makedirs(SubsetOutputDir, exist_ok=True)
            DataUtil.save(DataUtil.load(SourceFile, self._data_format),
                          '%s/test.%s' % (SubsetOutputDir, self._data_format),
                          format=self._data_format)
        print('\n ---- Copying holdout/submit data done.\n')

        # Evaluate every parameter group of every variant.
        OutputParams = []
        for variant in d_params['variants']:
            l_var_evals = []
            # Start from +inf so that any finite mean is accepted.  A fixed
            # sentinel such as 65535 would reject every candidate whose mean
            # happens to be at least that large.
            BestRMSE = float('inf')
            BestSTD = -1
            BestParam = {}
            found = False

            # Candidates are the parameter sets matching the variant on
            # every one of the variant's keys.
            VariantParams = [
                param for param in d_params['params']
                if all((VarKey in param) and (variant[VarKey] == param[VarKey])
                       for VarKey in variant)
            ]
            print(
                '\n==== Model tuning for strategy %s, variant %s begins... ====\n'
                % (strategy, variant))
            for param in VariantParams:
                model = self._d_strategy[strategy](param, self._kfold,
                                                   InputDir, OutputDir,
                                                   self._data_format)
                rmse = model.train()
                l_var_evals.append({'params': param, 'eval': rmse})
                # Free the model before the next candidate is built.
                del model
                gc.collect()
                fold_values = list(rmse.values())
                mean = np.mean(fold_values)
                std = np.std(fold_values)
                if mean < BestRMSE:
                    BestRMSE = mean
                    BestParam = param
                    BestSTD = std
                    found = True
            print(
                '\n==== Model tuning for strategy %s, variant %s ends. ====\n'
                % (strategy, variant))
            if not found:
                BestRMSE = NO_RESULT_RMSE
            OutputParams.append({
                'variant': variant,
                'result': l_var_evals,
                'best': {
                    'mean': BestRMSE,
                    'std': BestSTD,
                    'params': BestParam
                }
            })

        return OutputParams
Ejemplo n.º 6
0
    def __LaunchAggregateTask(self, l_variant_model, InputDir, OutputDir):
        """Merge the prediction columns of several models into shared files.

        For every fold, and for the holdout and submit sets, the file
        ``<InputDir>/<model>/.../{train,test}.<fmt>`` of each model in
        ``l_variant_model`` is loaded and the column named after the model is
        copied into one combined frame.  Fold and holdout frames also carry
        the ``index`` and ``Item_Outlet_Sales`` columns of the first model's
        file; the submit frame carries only the model columns.  The combined
        frames are written below ``OutputDir`` using ``self._data_format``
        for both the file extension and the serialisation format.

        Args:
            l_variant_model: Sequence of model names; each is both a
                sub-directory of ``InputDir`` and a column name.
            InputDir: Root directory of the per-model outputs.
            OutputDir: Root directory for the aggregated files.

        Returns:
            None.

        Raises:
            IndexError: If ``l_variant_model`` is empty (there is no first
                frame to take the row index from).
        """
        fmt = self._data_format

        def combine(frames, with_keys=True):
            # Build one frame holding each model's own prediction column,
            # using the first frame's row index (and key columns if asked).
            merged = pd.DataFrame(index=frames[0].index)
            if with_keys:
                merged['index'] = frames[0]['index']
                merged['Item_Outlet_Sales'] = frames[0]['Item_Outlet_Sales']
            for model_name, frame in zip(l_variant_model, frames):
                merged[model_name] = frame[model_name]
            return merged

        def dump(frame, out_dir, stem):
            # Write ``frame`` to <out_dir>/<stem>.<fmt>.  The format passed
            # to DataUtil.save must match the extension: these files are
            # read back with ``self._data_format``, so a hard-coded 'csv'
            # would yield unreadable files for any other format.
            os.makedirs(out_dir, exist_ok=True)
            DataUtil.save(frame, '%s/%s.%s' % (out_dir, stem, fmt),
                          format=fmt)

        # Folds.
        for fold in range(self._kfold):
            print('\n Aggregate for fold %s begins. \n' % fold)
            l_train_fold = []
            l_test_fold = []
            for mf in l_variant_model:
                TrainFile = '%s/%s/kfold/%s/train.%s' % (InputDir, mf, fold,
                                                         fmt)
                TestFile = '%s/%s/kfold/%s/test.%s' % (InputDir, mf, fold,
                                                       fmt)
                l_train_fold.append(DataUtil.load(TrainFile, fmt))
                l_test_fold.append(DataUtil.load(TestFile, fmt))
            print('\n Load data for fold %s done. \n' % fold)

            FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
            dump(combine(l_train_fold), FoldOutputDir, 'train')
            dump(combine(l_test_fold), FoldOutputDir, 'test')
            print('\n Aggregate or fold %s done. \n' % fold)
        print('\n Aggregate kfold data fone.\n')

        # Holdout.
        l_holdout = [
            DataUtil.load('%s/%s/holdout/test.%s' % (InputDir, mf, fmt), fmt)
            for mf in l_variant_model
        ]
        dump(combine(l_holdout), '%s/holdout' % OutputDir, 'test')
        print('\n Aggregate for holdout data done.\n')

        # Submit: only the model columns, no key/target columns.
        l_submit = [
            DataUtil.load('%s/%s/submit/test.%s' % (InputDir, mf, fmt), fmt)
            for mf in l_variant_model
        ]
        dump(combine(l_submit, with_keys=False), '%s/submit' % OutputDir,
             'test')
        print('\n Aggregate for submit data done.\n')

        return