class TabularDataset(BaseDataset):
    """CSV-backed tabular dataset for classification or regression.

    Training data is read through a ``DataManager`` and passed through an
    ``FEPipeline`` created with ``fe_enabled=False``. The same manager and
    fitted pipeline are then reused to load and transform the test data, so
    ``load_data()`` must run before ``load_test_data()``.
    """

    def __init__(self, data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        """Store the loading configuration; no file is read here.

        Args:
            data_path: Location of the training CSV file.
            is_regression: Selects the REGRESSION task type when true,
                CLASSIFICATION otherwise.
            label_column: Forwarded as ``label_col`` to the training loader.
            header: Forwarded as ``header`` to both CSV loaders.
            sep: Forwarded as ``sep`` to both CSV loaders.
            nan_values: Strings forwarded (as a list) as ``na_values`` to the
                training loader.
            train_val_split: Stored on the instance; not consumed by the
                methods of this class (presumably read by the base class or
                by callers -- TODO confirm).
            val_split_size: Stored on the instance; same remark as
                ``train_val_split``.
        """
        super().__init__()
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size

        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values

        # Both are created by load_tabular_data() and reused by
        # load_test_data(); None means "training data not loaded yet".
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        """Load a training CSV and return the pipeline-transformed data.

        Each call replaces ``self.data_manager`` and
        ``self._process_pipeline`` with fresh objects fitted on ``data_path``.

        Args:
            data_path: Location of the CSV file to load as training data.

        Returns:
            Whatever ``FEPipeline.fit_transform`` returns for the loaded
            training data node.
        """
        self.data_manager = DataManager()
        train_data_node = self.data_manager.load_train_csv(
            data_path,
            label_col=self.label_column,
            header=self.header,
            sep=self.sep,
            na_values=list(self.nan_values))

        task_type = REGRESSION if self.is_regression else CLASSIFICATION
        # NOTE(review): metric stays 'acc' even for regression, as in the
        # original code -- confirm it is irrelevant when fe_enabled=False.
        self._process_pipeline = FEPipeline(fe_enabled=False,
                                            metric='acc',
                                            task_type=task_type)
        return self._process_pipeline.fit_transform(train_data_node)

    def load_data(self):
        """Load the training file at ``self.data_path`` into ``self.train_dataset``."""
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        """Load the test CSV and transform it with the fitted pipeline.

        Reads ``self.test_data_path``, which this class never assigns
        (presumably provided by the base class or set by the caller -- TODO
        confirm), and stores the result in ``self.test_dataset``.

        Raises:
            RuntimeError: If the training data has not been loaded yet, so
                there is no data manager or fitted pipeline to reuse.
        """
        # Fail with an actionable message instead of an AttributeError on
        # None when the methods are called in the wrong order.
        if self.data_manager is None or self._process_pipeline is None:
            raise RuntimeError(
                "load_data() must be called before load_test_data(): the "
                "data manager and preprocessing pipeline are not initialized.")

        # NOTE(review): unlike the training load, self.nan_values is not
        # passed here; only keep_default_na=True is given -- confirm intended.
        test_data_node = self.data_manager.load_test_csv(
            self.test_data_path,
            has_label=False,
            keep_default_na=True,
            header=self.header,
            sep=self.sep)
        self.test_dataset = self._process_pipeline.transform(test_data_node)
parser.add_argument('--n_jobs', type=int, default=1) args = parser.parse_args() time_limit = args.time_limit eval_type = args.eval_type n_jobs = args.n_jobs ensemble_method = args.ens_method if ensemble_method == 'none': ensemble_method = None print('==> Start to evaluate with Budget %d' % time_limit) dm = DataManager() train_node = dm.load_train_csv("train_dataset.csv", label_col=-1, header='infer', na_values=['nan', '?']) test_node = dm.load_test_csv("test_dataset.csv", header='infer', has_label=True) from mindware.components.utils.constants import REGRESSION pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION) train_data = pipeline.fit_transform(train_node) test_data = pipeline.transform(test_node) save_dir = './data/eval_exps/soln-ml' if not os.path.exists(save_dir): os.makedirs(save_dir) rgs = Regressor(metric='mse', ensemble_method=ensemble_method, evaluation=eval_type, time_limit=time_limit, output_dir=save_dir,