Ejemplo n.º 1
0
class TabularDataset(BaseDataset):
    """Dataset read from a delimited text (CSV-style) file.

    Construction only records the configuration; no file is read until
    ``load_data`` or ``load_test_data`` is called.
    """

    def __init__(self,
                 data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        """Store the loading options on the instance.

        Args:
            data_path: Location of the training file.
            is_regression: Selects the REGRESSION task type when true,
                CLASSIFICATION otherwise.
            label_column: Forwarded as ``label_col`` to the training loader.
            header: Forwarded as ``header`` to both loaders.
            sep: Forwarded as ``sep`` to both loaders.
            nan_values: Strings forwarded (as a list) as ``na_values`` to
                the training loader.
            train_val_split: Stored as-is; not consumed inside this class.
            val_split_size: Stored as-is; not consumed inside this class.
        """
        super().__init__()
        # Where the data lives and how the file is laid out.
        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values
        # Task and split configuration.
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size
        # Both are created by load_tabular_data().
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        """Load the training file at ``data_path`` and preprocess it.

        A new DataManager and a new FEPipeline are created on every call
        and kept on the instance so that ``load_test_data`` can reuse them.

        Returns:
            Whatever the pipeline's ``fit_transform`` returns for the
            loaded training node.
        """
        manager = DataManager()
        self.data_manager = manager
        csv_options = {
            'label_col': self.label_column,
            'header': self.header,
            'sep': self.sep,
            'na_values': list(self.nan_values),
        }
        raw_train_node = manager.load_train_csv(data_path, **csv_options)

        if self.is_regression:
            task_type = REGRESSION
        else:
            task_type = CLASSIFICATION

        # Feature engineering is switched off (fe_enabled=False).
        pipeline = FEPipeline(fe_enabled=False,
                              metric='acc',
                              task_type=task_type)
        self._process_pipeline = pipeline
        return pipeline.fit_transform(raw_train_node)

    def load_data(self):
        """Load the file at ``self.data_path`` into ``self.train_dataset``."""
        train_dataset = self.load_tabular_data(self.data_path)
        self.train_dataset = train_dataset

    def load_test_data(self):
        """Load the unlabeled test file into ``self.test_dataset``.

        Uses the DataManager and pipeline created by ``load_tabular_data``,
        so ``load_data`` has to run first; both attributes are None until
        then.

        NOTE(review): ``self.test_data_path`` is not assigned in this class;
        presumably BaseDataset or the caller provides it -- confirm.
        """
        reader_options = {
            'has_label': False,
            'keep_default_na': True,
            'header': self.header,
            'sep': self.sep,
        }
        raw_test_node = self.data_manager.load_test_csv(self.test_data_path,
                                                        **reader_options)
        # transform(), not fit_transform(): reuse the pipeline fitted on
        # the training data.
        self.test_dataset = self._process_pipeline.transform(raw_test_node)
Ejemplo n.º 2
0
parser.add_argument('--n_jobs', type=int, default=1)

# Pull the parsed command-line options into module-level names.
args = parser.parse_args()
time_limit, eval_type, n_jobs = args.time_limit, args.eval_type, args.n_jobs
# The literal string 'none' on the command line stands for "no ensemble".
ensemble_method = None if args.ens_method == 'none' else args.ens_method

print('==> Start to evaluate with Budget %d' % time_limit)

# Read the training CSV first, then the test CSV, through one DataManager.
dm = DataManager()
train_node = dm.load_train_csv(
    "train_dataset.csv",
    label_col=-1,
    header='infer',
    na_values=['nan', '?'],
)
test_node = dm.load_test_csv(
    "test_dataset.csv",
    header='infer',
    has_label=True,
)
from mindware.components.utils.constants import REGRESSION

# Fit the preprocessing pipeline (feature engineering disabled) on the
# training node, then apply the fitted pipeline to the test node.
pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

# Directory passed to the regressor below as its output_dir.
save_dir = './data/eval_exps/soln-ml'
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, avoiding the check-then-create race of exists() + makedirs().
os.makedirs(save_dir, exist_ok=True)

rgs = Regressor(metric='mse',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,