def fit(self, data, **kwargs):
    """Fit the classifier to the given training data.

    :param data: instance of DataManager, or a pandas DataFrame that is
        first run through the data-preprocessing pipeline.
    :param kwargs: optional; may contain 'metric' (defaults to
        ``accuracy_score``), forwarded to the parent ``fit`` together
        with the detected ``task_type``.
    :return: self
    :raises ValueError: if the target is not a binary/multiclass task.
    """
    metric = kwargs.get('metric', accuracy_score)

    # TODO: Automated feature engineering.
    if isinstance(data, pd.DataFrame):
        self.pre_pipeline = DP_Pipeline(None)
        data = self.pre_pipeline.execute(data, phase='train')

    # Check the task type: only {binary, multiclass} targets are supported.
    task_type = type_of_target(data.train_y)
    if task_type in ['multiclass-multioutput', 'continuous',
                     'continuous-multioutput', 'unknown']:
        raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
    self.task_type = task_type
    kwargs['task_type'] = task_type

    # Resolve the metric name/callable into the project's metric object.
    kwargs['metric'] = get_metric(metric)
    super().fit(data, **kwargs)
    return self
def fit(self, data, **kwargs):
    """Fit the regressor to given training data.

    :param data: instance of DataManager
    :return: self
    """
    if 'metric' in kwargs:
        metric = kwargs['metric']
    else:
        metric = mean_squared_error

    # TODO:Automated feature engineering
    if isinstance(data, pd.DataFrame):
        self.pre_pipeline = DP_Pipeline(None)
        data = self.pre_pipeline.execute(data, phase='train',
                                         stratify=False)

    # Regression supports only continuous targets.
    task_type = type_of_target(data.train_y)
    if task_type != 'continuous':
        raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
    self.task_type = task_type
    kwargs['task_type'] = task_type

    kwargs['metric'] = get_metric(metric)
    super().fit(data, **kwargs)
    return self
def fit(self, data, **kwargs):
    """Fit the classifier to given training data.

    Parameters
    ----------
    data : instance of DataManager or DataFrame

    metric : callable, optional (default='autosklearn.metrics.accuracy_score').

    feat_type : list, optional (default=None)
        List of str of `len(X.shape[1])` describing the attribute type.
        Possible types are `Categorical` and `Numerical`. `Categorical`
        attributes will be automatically One-Hot encoded. The values used
        for a categorical attribute must be integers, obtained for example
        by `sklearn.preprocessing.LabelEncoder
        <http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.

    dataset_name : str, optional (default=None)
        Create nicer output. If None, a string will be determined by the
        md5 hash of the dataset.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the target is not a binary/multiclass task.
    """
    metric = kwargs.get('metric', accuracy_score)

    # TODO: Automated feature engineering.
    if isinstance(data, pd.DataFrame):
        self.pre_pipeline = DP_Pipeline(None)
        data = self.pre_pipeline.execute(data, phase='train')

    # Check the task type: only {binary, multiclass} targets are supported.
    task_type = type_of_target(data.train_y)
    if task_type in ['multiclass-multioutput', 'continuous',
                     'continuous-multioutput', 'unknown']:
        raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
    self.task_type = task_type
    kwargs['task_type'] = task_type

    # Resolve the metric name/callable into the project's metric object.
    kwargs['metric'] = get_metric(metric)
    super().fit(data, **kwargs)
    return self
def _get_performance(self, train_x, train_y, valid_x, valid_y, model):
    """Train *model* on the training split and score it on validation.

    :param train_x: training features
    :param train_y: training targets
    :param valid_x: validation features
    :param valid_y: validation targets
    :param model: estimator with fit/predict (and predict_proba for AUC)
    :return: validation score under ``self.metricstr``
    """
    scorer = get_metric(self.metricstr)
    model.fit(train_x, train_y)
    # AUC needs probability scores for the positive class; other metrics
    # are computed on hard label predictions.
    pred = (model.predict_proba(valid_x)[:, 1]
            if self.metricstr == 'auc'
            else model.predict(valid_x))
    # NOTE(review): arguments are passed as (pred, y_true); sklearn-style
    # metrics expect (y_true, y_pred) — confirm get_metric's convention.
    return scorer(pred, valid_y)
def __init__(self, max_iter, metrics, stratify, model=None):
    """Initialize the feature-engineering component.

    :param max_iter: maximum number of iterations
    :param metrics: metric name, resolved via ``get_metric``
    :param stratify: whether to stratify the train/validation split
    :param model: evaluation estimator; when None, a fresh
        ``LogisticRegression(multi_class='auto', solver='liblinear')``
        is created per instance. (Previously the default was a single
        shared LogisticRegression instance — a mutable default argument —
        so fitting one object's model silently mutated every other
        object's default model.)
    """
    self.max_iter = max_iter
    # Build a fresh default estimator per instance instead of sharing one
    # mutable default object across all instances.
    if model is None:
        model = LogisticRegression(multi_class='auto', solver='liblinear')
    self.model = model
    self.metricstr = metrics
    self.stratify = stratify
    self.metric = get_metric(metrics)
    # Lazily-populated state, filled in during fitting.
    self.feature_sets = None
    self.feature_cols = dict()
    self.train_data = None
    self.valid_data = None
    self.numerical_features = None
    self.init_length = None
def fit(self, data, **kwargs):
    """Fit the regressor to given training data.

    Parameters
    ----------
    data : instance of DataManager.

    metric : callable, optional (default='autosklearn.metrics.mean_squared_error').

    dataset_name : str, optional (default=None)
        Create nicer output. If None, a string will be determined by the
        md5 hash of the dataset.

    Returns
    -------
    self
    """
    # TODO:Automated feature engineering
    if isinstance(data, pd.DataFrame):
        self.pre_pipeline = DP_Pipeline(None)
        data = self.pre_pipeline.execute(data, phase='train',
                                         stratify=False)

    # Regression supports only continuous targets.
    task_type = type_of_target(data.train_y)
    if task_type != 'continuous':
        raise ValueError("UNSUPPORTED TASK TYPE: %s!" % task_type)
    self.task_type = task_type
    kwargs['task_type'] = task_type

    if 'metric' in kwargs:
        metric = kwargs['metric']
    else:
        metric = mean_squared_error
    kwargs['metric'] = get_metric(metric)
    super().fit(data, **kwargs)
    return self
def _selection_process(self, dm):
    """Combine solver-generated features with a selected subset of the
    original features, writing the result back into *dm*.

    :param dm: data manager holding train_X/train_y, val_X/val_y and an
        optional test_X, all updated in place.

    BUGFIX: the transformed features returned by ``self.solver.transform()``
    were immediately overwritten by a hard-coded
    ``np.load("features_27.npz")`` debug leftover, which also forced
    ``generated_test_data`` to None and would crash the test-set
    ``np.hstack`` below whenever ``dm.test_X`` is not None.
    """
    generated_train_data, generated_valid_data, generated_test_data = \
        self.solver.transform()

    feature_num = dm.train_X.shape[1]
    if feature_num < 20:
        # Too few original features to be worth selecting from:
        # replace them outright with the generated ones.
        dm.train_X = generated_train_data
        dm.val_X = generated_valid_data
        dm.test_X = generated_test_data
        return

    print("start selection process...............")
    selector = RandomForestSelector()
    selector.fit(dm.train_X, dm.train_y)
    lr = LogisticRegression()

    # Baseline: performance with generated features only.
    best_perf = get_metric(self.metrics, generated_train_data, dm.train_y,
                           generated_valid_data, dm.val_y, lr)
    best_k = 0
    # Greedily grow the kept fraction of original features; stop as soon
    # as adding more no longer improves validation performance.
    for percentile in range(1, 10):
        k = int((percentile / 10.0) * feature_num)
        selected_train_data = selector.transform(dm.train_X, k)
        selected_valid_data = selector.transform(dm.val_X, k)
        perf = get_metric(
            metrics=self.metrics,
            x_train=np.hstack((generated_train_data, selected_train_data)),
            y_train=dm.train_y,
            x_valid=np.hstack((generated_valid_data, selected_valid_data)),
            y_valid=dm.val_y,
            model=lr)
        if perf <= best_perf:
            break
        print("selected percentile:", percentile, "perf:", best_perf,
              flush=True)
        best_perf = perf
        best_k = k

    if best_k != 0:
        dm.train_X = np.hstack((generated_train_data,
                                selector.transform(dm.train_X, best_k)))
        dm.val_X = np.hstack((generated_valid_data,
                              selector.transform(dm.val_X, best_k)))
        if dm.test_X is not None:
            dm.test_X = np.hstack((generated_test_data,
                                   selector.transform(dm.test_X, best_k)))
    else:
        # No original-feature subset helped: keep only generated features.
        dm.train_X = generated_train_data
        dm.val_X = generated_valid_data
        dm.test_X = generated_test_data