def evaluate():
    """Run an end-to-end fit/refit/predict/score cycle on the iris dataset.

    The iris data is split 67/33 with a fixed seed, wrapped into data nodes
    and fed to a ``Classifier`` with a 150 second budget. ``save_dir``,
    ``ensemble_method`` and ``eval_type`` are not defined here and must be
    provided by the enclosing scope.

    Returns:
        bool: ``True`` when every step ran, ``False`` when any step after the
        train/test split raised an exception.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1)
    try:
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)
        test_data = dm.get_data_node(X_test, y_test)

        clf = Classifier(dataset_name='iris',
                         time_limit=150,
                         output_dir=save_dir,
                         ensemble_method=ensemble_method,
                         evaluation=eval_type,
                         metric='acc')
        clf.fit(train_data)
        clf.refit()
        # predict() is exercised for its own sake; the predictions are unused.
        clf.predict(test_data)
        print('final score', clf.score(test_data))
    except Exception as e:
        # Still a best-effort check (failure is signalled via the return
        # value), but report the cause instead of hiding it.
        print('evaluation failed: %r' % (e,))
        return False
    return True
Beispiel #2
0
def main():
    """Fit a Classifier restricted to a user-defined decision tree on iris.

    Registers ``UserDefinedDecisionTree`` through ``add_classifier``, fits a
    ``Classifier`` that may only use that algorithm (no ensemble, 60 second
    budget), prints the summary and the hold-out accuracy, and finally
    removes the output directory.
    """
    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)

    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1,
                                                        stratify=y)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    try:
        add_classifier(UserDefinedDecisionTree)
        clf = Classifier(time_limit=time_limit,
                         output_dir=save_dir,
                         enable_meta_algorithm_selection=False,
                         include_algorithms=['UserDefinedDecisionTree'],
                         ensemble_method=None,
                         metric='acc')

        clf.fit(train_data)
        print(clf.summary())

        pred = clf.predict(test_data)
        print(accuracy_score(test_data.data[1], pred))
    finally:
        # Remove the scratch directory even when fitting or scoring fails.
        shutil.rmtree(save_dir)
    def fit(self, X, y):
        """Train one ``soln_Regressor`` per entry of ``self.impute_operator``.

        ``X`` and ``y`` are converted to numpy arrays and stored as
        ``origin_X`` / ``origin_y``. When ``X`` is a DataFrame its column
        labels are kept in ``self.headers``. If ``X`` contains no NaN the
        imputation settings are reset to the plain ``'mean'`` method before
        ``self.preprocess`` runs.

        Returns:
            int: always ``0``.
        """
        if isinstance(X, pd.DataFrame):
            # Capture the labels before X is turned into a plain array.
            self.headers = X.columns
        features = np.array(X)
        targets = np.array(y)
        self.origin_X = features
        self.origin_y = targets

        missing_count = np.sum(np.isnan(features))
        if missing_count == 0:
            self.impute = False
            self.impute_method = ['mean']

        self.preprocess(features, targets)
        for method in self.impute_operator:
            imputed = self.train_data[method]
            self.dm = DataManager(imputed, targets)
            node = self.dm.get_data_node(imputed, targets)
            # The total budget is shared evenly between the impute methods.
            budget = self.time_limit / len(self.impute_method)
            model = soln_Regressor(time_limit=budget,
                                   output_dir=self.output_dir,
                                   ensemble_method=self.ensemble_method,
                                   evaluation=self.evaluation,
                                   metric=self.metric,
                                   n_jobs=self.n_jobs)
            self.mdl[method] = model
            model.fit(node)
        return 0
Beispiel #4
0
def test_rgs():
    """Fit a bagging Regressor on Boston housing and print the hold-out MSE.

    Uses a 60 second budget with hold-out evaluation and meta algorithm
    selection disabled. The output directory is removed afterwards.
    """
    time_limit = 60
    print('==> Start to evaluate with Budget %d' % time_limit)
    ensemble_method = 'bagging'
    eval_type = 'holdout'

    boston = load_boston()
    X, y = boston.data, boston.target
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1)
    dm = DataManager(X_train, y_train)
    train_data = dm.get_data_node(X_train, y_train)
    test_data = dm.get_data_node(X_test, y_test)

    save_dir = './data/eval_exps/soln-ml'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    try:
        rgs = Regressor(metric='mse',
                        ensemble_method=ensemble_method,
                        enable_meta_algorithm_selection=False,
                        evaluation=eval_type,
                        time_limit=time_limit,
                        output_dir=save_dir)

        rgs.fit(train_data)
        print(rgs.summary())

        pred = rgs.predict(test_data)
        print(mean_squared_error(test_data.data[1], pred))
    finally:
        # Remove the scratch directory even when fitting or scoring fails.
        shutil.rmtree(save_dir)
def main():
    """Fit a Regressor (no ensemble) on Boston housing and print the MSE.

    Uses a 120 second budget with hold-out evaluation; the model output is
    written below ``./data/eval_exps/soln-ml``, which is created if missing.
    """
    eval_type = 'holdout'
    ensemble_method = None
    time_limit = 120
    print('==> Start to evaluate with Budget %d' % time_limit)

    dataset = load_boston()
    features = dataset.data
    targets = dataset.target
    split = train_test_split(features,
                             targets,
                             test_size=0.33,
                             random_state=1)
    train_X, test_X, train_y, test_y = split

    manager = DataManager(train_X, train_y)
    train_node = manager.get_data_node(train_X, train_y)
    test_node = manager.get_data_node(test_X, test_y)

    save_dir = './data/eval_exps/soln-ml'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    regressor = Regressor(metric='mse',
                          ensemble_method=ensemble_method,
                          evaluation=eval_type,
                          time_limit=time_limit,
                          output_dir=save_dir)
    regressor.fit(train_node)

    predictions = regressor.predict(test_node)
    # data[1] of the node holds the labels passed to get_data_node.
    print(mean_squared_error(test_node.data[1], predictions))
Beispiel #6
0
def load_data(dataset,
              data_dir='./',
              datanode_returned=False,
              preprocess=True,
              task_type=None):
    """Load a benchmark CSV and optionally run it through an FEPipeline.

    Args:
        dataset: Dataset name; the file ``<dataset>.csv`` is read.
        data_dir: Root directory containing the ``data`` folder. A trailing
            path separator is optional.
        datanode_returned: When true, return the data node itself instead of
            ``(X, y, feature_types)``.
        preprocess: When true, pass the loaded node through
            ``FEPipeline(fe_enabled=False, ...).fit_transform``.
        task_type: ``None`` selects ``data/datasets``; a member of
            ``CLS_TASKS`` selects ``data/cls_datasets``; a member of
            ``REG_TASKS`` selects ``data/rgs_datasets``.

    Returns:
        The data node when ``datanode_returned`` is true, otherwise the tuple
        ``(X, y, feature_types)`` taken from that node.

    Raises:
        ValueError: if ``task_type`` is neither ``None`` nor a known task.
    """
    # Local import keeps this block self-contained; the file-level import
    # list is not visible from here.
    import os

    dm = DataManager()
    if task_type is None:
        subdir = 'datasets'
    elif task_type in CLS_TASKS:
        subdir = 'cls_datasets'
    elif task_type in REG_TASKS:
        subdir = 'rgs_datasets'
    else:
        raise ValueError("Unknown task type %s" % str(task_type))
    # os.path.join inserts the separator when data_dir lacks a trailing one;
    # plain concatenation produced paths like '/rootdata/...'.
    data_path = os.path.join(data_dir, 'data', subdir, '%s.csv' % dataset)

    # Position of the label column differs between datasets.
    if dataset in ('higgs', 'amazon_employee', 'spectf', 'usps',
                   'vehicle_sensIT', 'codrna'):
        label_column = 0
    elif dataset in ('rmftsa_sleepdata(1)',):
        label_column = 1
    else:
        label_column = -1

    # These files ship without a header row.
    header = None if dataset in ('spambase', 'messidor_features') else 'infer'

    # The wine-quality files are semicolon separated.
    sep = ';' if dataset in ('winequality_white', 'winequality_red') else ','

    train_data_node = dm.load_train_csv(
        data_path,
        label_col=label_column,
        header=header,
        sep=sep,
        na_values=["n/a", "na", "--", "-", "?"])

    if preprocess:
        pipeline = FEPipeline(fe_enabled=False,
                              metric='acc',
                              task_type=task_type)
        train_data = pipeline.fit_transform(train_data_node)
    else:
        train_data = train_data_node

    if datanode_returned:
        return train_data

    X, y = train_data.data
    feature_types = train_data.feature_types
    return X, y, feature_types
Beispiel #7
0
def test_cls():
    """Fit a bagging Classifier on iris and print the hold-out accuracy.

    Uses a 60 second budget, hold-out evaluation, an ensemble of size 10 and
    meta algorithm selection disabled. The output directory is removed
    afterwards.
    """
    save_dir = './data/eval_exps/soln-ml'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    try:
        time_limit = 60
        print('==> Start to evaluate with Budget %d' % time_limit)
        ensemble_method = 'bagging'
        eval_type = 'holdout'

        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=1,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)
        test_data = dm.get_data_node(X_test, y_test)

        clf = Classifier(time_limit=time_limit,
                         output_dir=save_dir,
                         enable_meta_algorithm_selection=False,
                         ensemble_method=ensemble_method,
                         ensemble_size=10,
                         evaluation=eval_type,
                         metric='acc')
        clf.fit(train_data)
        print(clf.summary())

        pred = clf.predict(test_data)
        print(accuracy_score(test_data.data[1], pred))
    finally:
        # Remove the scratch directory even when fitting or scoring fails.
        shutil.rmtree(save_dir)
Beispiel #8
0
 def operate(self, data):
     """Apply every non-None operator in ``self.list`` to ``data`` in order.

     ``data`` is wrapped into a data node (with an empty label list) first;
     the first element of the final node's ``data`` is returned.
     """
     manager = DataManager(data)
     node = manager.get_data_node(data, [])
     for step in self.list:
         if step is None:
             # Empty slots are skipped.
             continue
         node = step.operate(node)
     return node.data[0]
def evaluate_fe_pipeline():
    """Fit an FEPipeline on the adult CSV and print the transformed result.

    The ``DataManager`` instance itself (after ``load_train_csv``) is what is
    handed to ``fit`` and ``transform``.
    """
    from solnml.utils.data_manager import DataManager

    manager = DataManager()
    csv_path = 'data/a9a/dataset_183_adult.csv'
    manager.load_train_csv(csv_path)

    fitted = FEPipeline(fe_enabled=True).fit(manager)
    transformed = fitted.transform(manager)
    for shown in (transformed, transformed.data):
        print(shown)
Beispiel #10
0
def model_fit(_id, obj, paramsj, X_trainj, y_trainj):
    """Train a model from JSON-encoded inputs and write a report file.

    The report is written to ``./models_information/<_id>_information``. On
    success it contains a JSON object describing the fitted model; on failure
    it contains the text ``Model training failed!``.

    Args:
        _id: Model identifier, used in the report file name.
        obj: ``'clf'`` for a ``Classifier`` or ``'reg'`` for a ``Regressor``.
        paramsj: JSON object with the keys ``time_limit``,
            ``ensemble_method``, ``evaluation`` and ``metric``.
        X_trainj: JSON document convertible to a DataFrame of features.
        y_trainj: JSON document convertible to a DataFrame; its first column
            is used as the label vector.

    Returns:
        int: ``0`` on success, ``-1`` when loading or training failed.
    """
    info_path = './models_information/' + _id + '_information'
    # The context manager closes the report on every exit path, including an
    # exception raised while the result dictionary is built.
    with open(info_path, 'w') as info_file:
        print('Model training begins!')
        try:
            # read data
            X_train = np.array(pd.DataFrame(json.loads(X_trainj)))
            y_train = np.array(pd.DataFrame(json.loads(y_trainj)))[:, 0]
            params = json.loads(paramsj)

            dm = DataManager(X_train, y_train)
            train_data = dm.get_data_node(X_train, y_train)
            save_dir = '../data/eval_exps/soln-ml'
            os.makedirs(save_dir, exist_ok=True)
            # train mode
            if obj == 'clf':
                mdl = Classifier(time_limit=params['time_limit'],
                                 output_dir=save_dir,
                                 ensemble_method=params['ensemble_method'],
                                 evaluation=params['evaluation'],
                                 metric=params['metric'],
                                 n_jobs=4)
            elif obj == 'reg':
                # NOTE(review): n_jobs is not defined in this function; this
                # branch only works if a module-level n_jobs exists. Confirm.
                mdl = Regressor(metric=params['metric'],
                                ensemble_method=params['ensemble_method'],
                                evaluation=params['evaluation'],
                                time_limit=params['time_limit'],
                                output_dir=save_dir,
                                random_state=1,
                                n_jobs=n_jobs)
            else:
                # Previously this surfaced as an unbound-variable error.
                raise ValueError('Unknown objective %r' % (obj,))

            mdl.fit(train_data)
        except Exception as e:
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate; the cause is printed so failures can be diagnosed.
            print('Model training failed!')
            print(repr(e))
            info_file.write('Model training failed!')
            return -1

        result = {
            'best_algo_id': str(mdl.best_algo_id),
            'best_hpo_config': str(mdl.best_hpo_config),
            'nbest_algo_id': str(mdl.nbest_algo_id),
            'best_perf': str(mdl.best_perf),
            'best_fe_config': str(mdl.best_fe_config),
            # get_ens_model_info is not realized in this version yet
            'get_ens_model_info': str(mdl.get_ens_model_info),
        }
        info_file.write(json.dumps(result))
    print('Model training finished!')
    return 0
def fetch_data(task_id):
    """Load the train CSV of ``task_id`` and the shared test CSV.

    Both paths are built from the outer-scope ``data_dir``. If either file is
    missing, ``create_csv(task_id)`` is called before loading.

    Returns:
        tuple: ``(train_data_node, test_data_node)``.
    """
    manager = DataManager()
    train_csv = data_dir + 'data/p%d.csv' % task_id
    test_csv = data_dir + 'data/test_data.csv'

    both_present = os.path.exists(train_csv) and os.path.exists(test_csv)
    if not both_present:
        create_csv(task_id)

    train_node = manager.load_train_csv(train_csv,
                                        label_col=-1,
                                        header='infer',
                                        sep=',')
    print('loading train data finished.')

    test_node = manager.load_test_csv(test_csv,
                                      has_label=False,
                                      header='infer',
                                      sep=',')
    print('loading test data finished.')
    return train_node, test_node
Beispiel #12
0
    def load_tabular_data(self, data_path):
        """Read ``data_path`` as a training CSV and return the processed node.

        A new ``DataManager`` is stored on ``self.data_manager`` and a new
        ``FEPipeline`` (feature engineering disabled) on
        ``self._process_pipeline``; the pipeline's ``fit_transform`` result is
        returned.
        """
        manager = DataManager()
        self.data_manager = manager
        raw_node = manager.load_train_csv(data_path,
                                          label_col=self.label_column,
                                          header=self.header,
                                          sep=self.sep,
                                          na_values=list(self.nan_values))

        if self.is_regression:
            task_type = REGRESSION
        else:
            task_type = CLASSIFICATION
        pipeline = FEPipeline(fe_enabled=False,
                              metric='acc',
                              task_type=task_type)
        self._process_pipeline = pipeline
        return pipeline.fit_transform(raw_node)
Beispiel #13
0
class TabularDataset(BaseDataset):
    """CSV-backed dataset loaded via ``DataManager`` and an ``FEPipeline``."""

    def __init__(self,
                 data_path: str,
                 is_regression=False,
                 label_column=-1,
                 header='infer',
                 sep=',',
                 nan_values=("n/a", "na", "--", "-", "?"),
                 train_val_split: bool = False,
                 val_split_size: float = 0.2):
        """Store the CSV parsing options and split settings.

        Nothing is read from disk here; ``data_manager`` and
        ``_process_pipeline`` stay ``None`` until ``load_tabular_data`` runs.
        """
        super().__init__()
        # Where the data lives and how the CSV is parsed.
        self.data_path = data_path
        self.label_column = label_column
        self.header = header
        self.sep = sep
        self.nan_values = nan_values
        # Task and validation-split settings.
        self.is_regression = is_regression
        self.train_val_split = train_val_split
        self.val_split_size = val_split_size
        # Populated by load_tabular_data().
        self.data_manager = None
        self._process_pipeline = None

    def load_tabular_data(self, data_path):
        """Read ``data_path`` as a training CSV and return the processed node.

        Replaces ``self.data_manager`` and ``self._process_pipeline`` with
        fresh objects and returns the pipeline's ``fit_transform`` result.
        """
        manager = DataManager()
        self.data_manager = manager
        raw_node = manager.load_train_csv(data_path,
                                          label_col=self.label_column,
                                          header=self.header,
                                          sep=self.sep,
                                          na_values=list(self.nan_values))

        if self.is_regression:
            task_type = REGRESSION
        else:
            task_type = CLASSIFICATION
        pipeline = FEPipeline(fe_enabled=False,
                              metric='acc',
                              task_type=task_type)
        self._process_pipeline = pipeline
        return pipeline.fit_transform(raw_node)

    def load_data(self):
        """Load ``self.data_path`` into ``self.train_dataset``."""
        self.train_dataset = self.load_tabular_data(self.data_path)

    def load_test_data(self):
        """Load ``self.test_data_path`` into ``self.test_dataset``.

        Uses the data manager and pipeline created by ``load_tabular_data``.
        ``test_data_path`` is not assigned anywhere in this class.
        """
        raw_test = self.data_manager.load_test_csv(self.test_data_path,
                                                   has_label=False,
                                                   keep_default_na=True,
                                                   header=self.header,
                                                   sep=self.sep)
        self.test_dataset = self._process_pipeline.transform(raw_test)
    def post(self):
        """Train a model on uploaded CSV files and register it.

        Reads ``model_name`` and ``objective`` from the request form and
        ``data_file_X`` / ``data_file_y`` from the uploaded files. The first
        column of the y file is used as the label vector. An empty objective
        defaults to ``'clf'``.

        Returns:
            str: JSON object with ``trainTime`` (seconds) and ``trainShape``.

        Raises:
            ValueError: if the objective is neither ``'clf'`` nor ``'reg'``.
        """
        start_time = time.time()
        # Called as in the original; the parsed values are not used below.
        self.parser.parse_args()
        _id = request.form['model_name']
        X_file = request.files['data_file_X']
        y_file = request.files['data_file_y']
        obj = request.form['objective'] or 'clf'
        # read data
        X_train = np.array(pd.read_csv(X_file))
        y_train = np.array(pd.read_csv(y_file))[:, 0]
        print(y_train)
        dm = DataManager(X_train, y_train)
        train_data = dm.get_data_node(X_train, y_train)

        save_dir = './data/eval_exps/soln-ml'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_dir, exist_ok=True)

        # train mode
        if obj == 'clf':
            mdl = Classifier(time_limit=100,
                             output_dir=save_dir,
                             ensemble_method='bagging',
                             evaluation='holdout',
                             metric='acc',
                             n_jobs=4)
        elif obj == 'reg':
            # NOTE(review): ensemble_method, eval_type, time_limit and n_jobs
            # are not defined in this method; this branch only works if they
            # exist at module level. Confirm.
            mdl = Regressor(metric='mse',
                            ensemble_method=ensemble_method,
                            evaluation=eval_type,
                            time_limit=time_limit,
                            output_dir=save_dir,
                            random_state=1,
                            n_jobs=n_jobs)
        else:
            # Previously an unknown objective left mdl unbound and failed
            # later with an unrelated error.
            raise ValueError('Unknown objective %r' % (obj,))
        # NOTE(review): called with three arguments; verify this matches the
        # signature of the model_fit in scope here.
        model_fit(_id, mdl, train_data)
        self.model_factory.add_pipeline(mdl, train_data, _id)
        print(self.model_factory)
        result = {
            'trainTime': time.time() - start_time,
            'trainShape': X_train.shape
        }
        # NOTE(review): params is not defined in this method; confirm whether
        # a module-level params exists or the pipeline id should be _id.
        self.model_factory[params['pipeline_id']]['stats'] = result
        return json.dumps(result)
 def post(self):
     """Predict with a registered model on an uploaded CSV file.

     Reads ``data_file_X`` from the uploaded files and ``model_name`` /
     ``need_proba`` from the request form. When probabilities are requested
     but ``predict_proba`` fails, plain ``predict`` is used instead.

     Returns:
         str: the predictions as a DataFrame serialised to JSON.
     """
     # Called as in the original; the parsed values are not used below.
     self.parser.parse_args()
     X_file = request.files['data_file_X']
     X_test = np.array(pd.read_csv(X_file))
     # Placeholder labels: the data node is built from X and y together.
     y_test = np.zeros(X_test.shape[0])
     dm = DataManager(X_test, y_test)
     test_data = dm.get_data_node(X_test, y_test)
     _id = request.form['model_name']
     # Form values are strings, so 'false' or '0' would be truthy if tested
     # directly; parse the flag explicitly.
     flag = str(request.form['need_proba']).strip().lower()
     need_proba = flag not in ('', '0', 'false', 'no', 'none')
     mdl = self.model_factory.pipelines[_id]['model']
     if need_proba:
         try:
             y_pred = mdl.predict_proba(test_data)
         except Exception:
             # Fall back to plain predictions when predict_proba raises.
             y_pred = mdl.predict(test_data)
     else:
         y_pred = mdl.predict(test_data)
     print(y_pred)
     return pd.DataFrame(y_pred).to_json()
Beispiel #16
0
def load_data(dataset, data_dir='./'):
    """Load an openml100 CSV and return ``(X, y, feature_types)``.

    The file ``data/openml100/<dataset>.csv`` below ``data_dir`` is read with
    a header row and comma separator, then passed through
    ``DataManager.preprocess_fit``.
    """
    manager = DataManager(na_values=[])
    csv_path = data_dir + 'data/openml100/%s.csv' % dataset

    # Datasets whose label sits in the first column; all others use the last.
    label_first = [
        'higgs', 'cjs', 'Australian', 'monks-problems-1', 'monks-problems-2',
        'monks-problems-3', 'profb', 'JapaneseVowels'
    ]
    label_column = 0 if dataset in label_first else -1

    node = manager.load_train_csv(csv_path,
                                  label_col=label_column,
                                  header='infer',
                                  sep=',',
                                  na_values=["n/a", "na", "--", "?"],
                                  keep_default_na=False)
    node = manager.preprocess_fit(node)
    X, y = node.data
    return X, y, node.feature_types
def evaluate_data_manager():
    """Build a DataManager from a tiny mixed-value array and print its flags.

    Prints ``feature_types`` and ``missing_flags`` of the manager.
    """
    from solnml.utils.data_manager import DataManager
    import numpy as np

    rows = [[1, 2, 3, 4], [1, 'asfd', 2, 1.4]]
    features = np.array(rows)
    labels = [1, 2]
    manager = DataManager(features, labels)
    for attribute in (manager.feature_types, manager.missing_flags):
        print(attribute)
class Classifier():
    """Classifier that adds imputation and feature selection to soln_Classifier.

    One ``soln_Classifier`` is trained per imputation method. Predicted
    probabilities are the mean of the per-method probabilities.
    """

    def __init__(self,
                 dataset_name='default_dataset_name',
                 time_limit=10800,
                 amount_of_resource=None,
                 metric='acc',
                 include_algorithms=None,
                 enable_meta_algorithm_selection=True,
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=150,
                 random_state=1,
                 n_jobs=1,
                 evaluation='holdout',
                 impute=False,
                 impute_method=('MatrixFactorization', 'KNN', 'IterativeSVD'),
                 pre_fs=True,
                 fb_k=300,
                 fb_r0=50,
                 fb_max_iter=50,
                 fb_population_size=10,
                 fb_n0=500,
                 fb_metric='accuracy',
                 output_dir="/tmp/"):
        """Store the configuration and create ``output_dir`` if needed.

        Args:
            impute: When false, ``impute_method`` is ignored and the single
                method ``'mean'`` is used.
            impute_method: Names of the imputation methods to run; supported
                names are ``'mean'``, ``'KNN'``, ``'IterativeSVD'`` and
                ``'MatrixFactorization'``.
            pre_fs: Enables feature selection when the data has more than
                ``fb_k`` columns.
            fb_k, fb_r0, fb_max_iter, fb_population_size, fb_n0, fb_metric:
                Passed to ``FeatureBand`` / its ``fit`` call.
            output_dir: Directory handed to each ``soln_Classifier``.

        The remaining arguments are stored unchanged on the instance.
        """
        self.dataset_name = dataset_name
        self.metric = metric
        self.task_type = None
        self.time_limit = time_limit
        self.amount_of_resource = amount_of_resource
        self.include_algorithms = include_algorithms
        self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.per_run_time_limit = per_run_time_limit
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.evaluation = evaluation
        self.output_dir = output_dir
        self.dm = None
        # imputation arguments
        self.impute = impute
        if self.impute:
            # Copy so that instances never share (or mutate) the default.
            self.impute_method = list(impute_method)
        else:
            self.impute_method = ['mean']  # simply impute anyway
        self.impute_operator = {}
        # fb arguments
        self.pre_fs = pre_fs
        self.fb_k = fb_k
        self.fb_r0 = fb_r0
        self.fb_max_iter = fb_max_iter
        self.fb_population_size = fb_population_size
        self.fb_n0 = fb_n0
        self.fb_metric = fb_metric  # ['accuracy','f1_score']
        self.origin_X = None
        self.origin_y = None
        self.fb_operator = None
        self.headers = None
        self.selected_headers = None
        self.train_data = {}
        self.test_data = {}
        self.mdl = {}
        self.y_pred = {}
        # Create output directory (no check-then-create race).
        os.makedirs(output_dir, exist_ok=True)

    def _needs_feature_selection(self, n_features):
        """Return True when feature selection applies to ``n_features`` columns."""
        return bool(self.pre_fs) and n_features > self.fb_k

    def run_impute(self, X, state='train'):
        """Impute ``X`` with every configured method (training state only).

        Each result is stored in ``self.train_data[method]`` and the fitted
        operator in ``self.impute_operator[method]``. The element-wise mean
        of all results is stored under ``'ave'``.

        Raises:
            ValueError: for an unsupported method name.

        Returns:
            int: always ``0``.
        """
        if state == 'train':
            self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
            for imp_method in self.impute_method:
                print('Impute ' + imp_method + ' starts!')
                if imp_method == 'mean':
                    imp_ope = SimpleFill()
                elif imp_method == 'KNN':
                    imp_ope = KNN()
                elif imp_method == 'IterativeSVD':
                    imp_ope = IterativeSVD()
                elif imp_method == 'MatrixFactorization':
                    imp_ope = MatrixFactorization()
                else:
                    # Previously an unknown name silently reused the previous
                    # operator or failed on an unbound variable.
                    raise ValueError('Unknown impute method %s' % imp_method)
                X_filled = imp_ope.fit_transform(X)
                self.train_data[imp_method] = X_filled
                self.impute_operator[imp_method] = imp_ope
                self.train_data['ave'] += X_filled
                print('Impute ' + imp_method + ' ends!')
            self.train_data['ave'] /= len(self.impute_method)
        return 0

    def feature_selection(self, X, y, state='train'):
        """Fit a ``FeatureBand`` on ``X, y`` and reduce all training copies.

        Every entry of ``self.train_data`` is replaced by its transformed
        version. The fitted selector is kept in ``self.fb_operator``, and
        ``self.selected_headers`` is set when column headers are known.

        Returns:
            int: always ``0``.
        """
        if state == 'train':
            print('Feature_selection starts!')
            fb = FeatureBand(r0=self.fb_r0,
                             n0=self.fb_n0,
                             clf=load_clf('logistic'),
                             max_iter=self.fb_max_iter,
                             k=self.fb_k,
                             population_size=self.fb_population_size,
                             local_search=True)
            # The values returned by fit() are not used.
            fb.fit(X, y, metrics=self.fb_metric)
            for key in self.train_data:
                self.train_data[key] = fb.transform(self.train_data[key])
            self.fb_operator = fb
            if self.headers is not None:
                self.selected_headers = self.headers[fb.featrue_selected]
        return 0

    def preprocess(self, X, y):
        """Impute ``X`` and, when applicable, select features on the mean copy.

        Returns:
            int: always ``0``.
        """
        self.run_impute(X)
        if self._needs_feature_selection(X.shape[1]):
            self.feature_selection(self.train_data['ave'], y)
        return 0

    def fit(self, X, y):
        """Preprocess the data and train one model per imputation method.

        ``X`` and ``y`` are converted to numpy arrays and stored as
        ``origin_X`` / ``origin_y``. If ``X`` has no NaN, imputation is reset
        to the single ``'mean'`` method. The time limit is divided evenly
        between the imputation methods.

        Returns:
            int: always ``0``.
        """
        if isinstance(X, pd.DataFrame):
            self.headers = X.columns
        X = np.array(X)
        y = np.array(y)
        self.origin_X = X
        self.origin_y = y

        if not np.isnan(X).any():
            self.impute = False
            self.impute_method = ['mean']

        self.preprocess(X, y)
        for key in self.impute_operator:
            X_train = self.train_data[key]
            self.dm = DataManager(X_train, y)
            train_data = self.dm.get_data_node(X_train, y)
            self.mdl[key] = soln_Classifier(
                time_limit=self.time_limit / len(self.impute_method),
                output_dir=self.output_dir,
                ensemble_method=self.ensemble_method,
                evaluation=self.evaluation,
                metric=self.metric,
                n_jobs=self.n_jobs)
            self.mdl[key].fit(train_data)
        return 0

    def predict_proba(self, X_test):
        """Return class probabilities averaged over all per-method models.

        The per-method probabilities are kept unmodified in ``self.y_pred``.

        Raises:
            RuntimeError: if no model has been fitted yet.
        """
        if not self.impute_operator:
            raise RuntimeError('predict_proba() called before fit()')
        X_test = np.array(X_test)
        # Both checks depend only on X_test, so evaluate them once.
        has_missing = np.isnan(X_test).any()
        select = self._needs_feature_selection(X_test.shape[1])

        y_pred = None
        for key in self.impute_operator:
            if has_missing:
                X_test_filled = self.impute_operator[key].fit_transform(X_test)
            else:
                X_test_filled = X_test
            if select:
                X_test_filled = self.fb_operator.transform(X_test_filled)
            test_data = self.dm.get_data_node(X_test_filled, [])
            self.y_pred[key] = self.mdl[key].predict_proba(test_data)
            if y_pred is None:
                # Copy: accumulating in place on the stored array would
                # corrupt self.y_pred for the first method.
                y_pred = np.array(self.y_pred[key])
            else:
                y_pred += self.y_pred[key]
        y_pred /= len(self.impute_operator)
        return y_pred

    def predict(self, X_test):
        """Return the index of the most probable class for each row."""
        return np.argmax(self.predict_proba(X_test), axis=1)

    @property
    def get_feature_selected(self):
        """Selected feature indices, or ``None`` without feature selection."""
        if self.fb_operator is not None:
            return self.fb_operator.featrue_selected
        return None

    def feature_analysis(self, topk=30):
        """Relate the most important transformed features to the inputs.

        For each imputation method a ``LGBMClassifier`` is fitted on the
        model's transformed training data. The ``topk / n_methods`` most
        important features are kept, their importances scaled by the
        maximum, and ``mdl.feature_corelation`` is queried for them.

        Returns:
            pandas.DataFrame: a ``feature_importance`` column followed by the
            stacked correlation tables of all methods.
        """
        from lightgbm import LGBMClassifier
        frames = []
        importance_array = []
        for key in self.impute_operator:
            mdl = self.mdl[key]
            data = self.dm.get_data_node(self.train_data[key], self.origin_y)
            data_tf = mdl.data_transform(data).data[0]
            sub_topk = min(int(topk / len(self.impute_operator)),
                           data_tf.shape[1])
            lgb = LGBMClassifier()
            lgb.fit(data_tf, self.origin_y)
            _importance = lgb.feature_importances_
            index_needed = np.argsort(-_importance)[:sub_topk]

            temp_array = _importance[index_needed]
            temp_array = temp_array / np.max(temp_array)
            importance_array.append(temp_array)

            relation = mdl.feature_corelation(data, index_needed)
            # Prefix the row labels with the method name to keep them apart.
            relation.index = key + relation.index
            if self.selected_headers is not None:
                relation.columns = list(self.selected_headers)
            elif self.fb_operator is not None:
                relation.columns = [
                    'origin_fearure' + str(it)
                    for it in self.fb_operator.featrue_selected
                ]
            frames.append(relation)

        # pd.concat replaces DataFrame.append, which pandas 2.x removed.
        result = pd.concat(frames)
        importance_frame = pd.DataFrame(np.hstack(importance_array))
        importance_frame.columns = ['feature_importance']
        importance_frame.index = result.index
        return pd.concat([importance_frame, result], axis=1)

    def save(self, save_dir):
        """Persist this classifier to ``save_dir`` via ``saveloadmodel``."""
        saveloadmodel.save(self, save_dir, task_type='CLF')
Beispiel #19
0
# Script fragment: build a Regressor for the Boston housing data from
# command-line options. `parser` is defined above this fragment.
args = parser.parse_args()

time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
# The command line spells "no ensemble" as the string 'none'.
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

boston = load_boston()
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# Wrap both splits into data nodes using one manager built on the train split.
dm = DataManager(X_train, y_train)
train_data = dm.get_data_node(X_train, y_train)
test_data = dm.get_data_node(X_test, y_test)

# Directory passed to the regressor as output_dir; created if missing.
save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

rgs = Regressor(metric='mse',
                dataset_name='boston',
                ensemble_method=ensemble_method,
                evaluation=eval_type,
                time_limit=time_limit,
                output_dir=save_dir,
                random_state=1,
                n_jobs=n_jobs)
    default='ensemble_selection',
    choices=['none', 'bagging', 'blending', 'stacking', 'ensemble_selection'])
# Script fragment: parse the command-line options, load train/test CSV files
# and run them through an FEPipeline configured for regression.
parser.add_argument('--n_jobs', type=int, default=1)

args = parser.parse_args()

time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
# The command line spells "no ensemble" as the string 'none'.
if ensemble_method == 'none':
    ensemble_method = None

print('==> Start to evaluate with Budget %d' % time_limit)

dm = DataManager()
# The label is the last column of the training file.
train_node = dm.load_train_csv("train_dataset.csv",
                               label_col=-1,
                               header='infer',
                               na_values=['nan', '?'])
test_node = dm.load_test_csv("test_dataset.csv",
                             header='infer',
                             has_label=True)
from solnml.components.utils.constants import REGRESSION

# Fit the pipeline on the training node, then apply it to the test node.
pipeline = FEPipeline(fe_enabled=False, task_type=REGRESSION)
train_data = pipeline.fit_transform(train_node)
test_data = pipeline.transform(test_node)

save_dir = './data/eval_exps/soln-ml'
if not os.path.exists(save_dir):