def fit(self, X, y, sample_weight=None, **fit_params):
    log.info("hyperparameter optimizing: " + str(self.model))
    bo = bayes_opt.BayesianOptimization(
        functools.partial(
            cv_weighted_instantiated_model,
            self.model, X, y, sample_weight, self.kf, [self.metric], self.parallel),
        self.hyperparameter_bounds)
    # go greedy (low xi) b/c this takes a long time
    bo.maximize(init_points=5, n_iter=30, acq="ei", xi=1e-4)
    # cast each optimum back to the type of its bound (e.g. float -> int for tree depths)
    optimal_hyperparameters = {
        hyperparameter: bo.res["max"]["max_params"][hyperparameter].astype(
            type(self.hyperparameter_bounds[hyperparameter][0]))
        for hyperparameter in self.hyperparameter_bounds}
    log.info("optimal: " + str(optimal_hyperparameters))
    self.model.set_params(**optimal_hyperparameters)
    self.model.fit(X, y, sample_weight=sample_weight)
    return self.model
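# `cv_weighted_instantiated_model` is called above but not defined in this section.
# A minimal sketch of what it might look like, assuming it returns the mean
# cross-validated score (higher is better) for one candidate hyperparameter setting.
# This is an assumption about the original implementation, not a reconstruction of it;
# the real version presumably parallelizes folds when `parallel` is set, while this
# sketch runs them serially.
import numpy as np
import sklearn.base

def cv_weighted_instantiated_model(model, X, y, sample_weight, kf, metrics,
                                   parallel, **hyperparameters):
    # note: BayesianOptimization passes all hyperparameters as floats; the caller
    # above casts the optimum back to the bound types after the search
    scores = []
    for train_index, test_index in kf.split(X, y):
        estimator = sklearn.base.clone(model).set_params(**hyperparameters)
        sw_train = None if sample_weight is None else sample_weight[train_index]
        sw_test = None if sample_weight is None else sample_weight[test_index]
        estimator.fit(X[train_index], y[train_index], sample_weight=sw_train)
        predictions = estimator.predict(X[test_index])
        # average the (possibly several) weighted metrics on the held-out fold
        scores.append(np.mean([metric(y[test_index], predictions, sample_weight=sw_test)
                               for metric in metrics]))
    return np.mean(scores)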
def fit_model_(model, X, y, **fit_params):
    ''' Fits a pickled model. Handy for multiprocessing and such '''
    import ml_battery.log as log
    name, estimator = model
    log.info("training " + name)
    start = time.time()
    estimator = pickle.loads(estimator)
    # in case of hyperparameter optimized thingies, this returns the underlying estimator
    estimator = estimator.fit(X, y, **fit_params)
    estimator = pickle.dumps(estimator)
    end = time.time()
    log.info("finished training " + name)
    return ((name, estimator), (name, end - start))
def fit(self, X, y, **fit_params):
    if self.parallel:
        fitted_models_and_times = joblib.Parallel(n_jobs=32)(
            joblib.delayed(fit_model_)(model, X, y, **fit_params)
            for model in pickle_estimators(self.estimators))
        fitted_models, times = zip(*fitted_models_and_times)
        self.estimators, self.fit_times = unpickle_estimators(fitted_models), dict(times)
        return self
    else:
        self.fit_times = {}
        for i, (name, estimator) in enumerate(self.estimators):
            log.info("training: " + name)
            log.info("opened sessions: " + n_opened_sessions())
            start = time.time()
            # in case of hyperparameter optimized thingies, this returns the underlying estimator
            self.estimators[i] = (name, estimator.fit(X, y, **fit_params))
            end = time.time()
            self.fit_times[name] = end - start
        return self
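# `pickle_estimators` and `unpickle_estimators` are used above but not shown here.
# A plausible minimal sketch, assuming the (name, estimator) pair structure used
# throughout this section; serializing estimators to bytes lets joblib ship them
# to worker processes without depending on their picklability quirks at call time.
import pickle

def pickle_estimators(estimators):
    return [(name, pickle.dumps(estimator)) for name, estimator in estimators]

def unpickle_estimators(estimators):
    return [(name, pickle.loads(estimator)) for name, estimator in estimators]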
def fit(self, X, y=None, **fit_params):
    # y is ignored
    log.info("fitting codebook")
    self.label_encoders = {}
    self.onehot_encoders = {}
    self.numeric_columns = []
    encoder_X = self.X_possible_values if self.X_possible_values is not None else X
    for col in X.columns:
        if (col in self.codebook.index) and (self.codebook.loc[col]["Type"] == "C"):
            # LabelEncoder expects a 1-D array, so fit on the Series, not a one-column frame
            label_encoder = sklearn.preprocessing.LabelEncoder().fit(encoder_X[col].astype(str))
            onehot_encoder = sklearn.preprocessing.OneHotEncoder().fit(
                np.arange(len(label_encoder.classes_)).reshape((-1, 1)))
            self.label_encoders[col] = label_encoder
            self.onehot_encoders[col] = onehot_encoder
        else:
            self.numeric_columns.append(col)
    if self.scale and self.numeric_columns:
        self.scaler = sklearn.preprocessing.MinMaxScaler()
        self.scaler.fit(X[self.numeric_columns])
    return self
def transform(self, X):
    onehotted_dataframes = []
    for col in X.columns:
        log.info("transforming column: " + str(col))
        if (col in self.codebook.index) and (self.codebook.loc[col]["Type"] == "C"):
            onehot_encoded = self.onehot_encoders[col].transform(
                self.label_encoders[col].transform(
                    X[col].astype(str)
                ).reshape((-1, 1))).toarray()
            onehotted_dataframes.append(pd.DataFrame(
                onehot_encoded,
                columns=[col + self.sep + cls for cls in self.label_encoders[col].classes_],
                index=X.index))
    # drop the original categorical columns; their one-hot versions get joined back below
    X = X.drop(list(self.onehot_encoders), axis=1)
    if self.scale and self.numeric_columns:
        X = pd.DataFrame(self.scaler.transform(X[self.numeric_columns]),
                         columns=self.numeric_columns, index=X.index)
    return X.join(onehotted_dataframes)
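# A hypothetical usage sketch for the codebook encoder above, assuming the codebook
# is a DataFrame indexed by column name with a "Type" column in which "C" marks
# categoricals. The class name below is illustrative, not from the original.
import pandas as pd

codebook = pd.DataFrame({"Type": ["C", "N"]}, index=["color", "age"])
data = pd.DataFrame({"color": ["red", "blue", "red"], "age": [23, 35, 41]})
# encoder = CodebookOneHotEncoder(codebook=codebook, scale=True)  # hypothetical name
# encoded = encoder.fit(data).transform(data)
# -> one-hot columns like "color" + encoder.sep + "blue", plus min-max scaled "age"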
def fit_(self, X, y, sample_weight=None, feed_dict_extras=None):
    ''' Trains for a number of epochs.
        Model input must be in self.model.x, output in self.model.y,
        loss in self.model.loss, and training using self.model.train_step '''
    if feed_dict_extras is None:
        feed_dict_extras = {}
    if not hasattr(self, "batch_size"):
        self.batch_size = None
    if sample_weight is None:
        # the feed_dict below indexes sample_weight, so default to uniform weights
        sample_weight = np.ones(X.shape[0])
    for epoch in range(self.n_epochs):
        batcher = np_batcher(X.shape[0], self.batch_size)
        for batch in batcher:
            feed_dict = {
                self.model.x: X[batch],
                self.model.y: y[batch],
                self.model.sample_weight: sample_weight[batch]}
            feed_dict.update(feed_dict_extras)
            loss, _ = self.sess.run(
                (self.model.loss, self.model.train_step), feed_dict)
        log.info("epoch: " + str(epoch) + " :::: loss: " + str(loss.sum()))
    return self
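# `np_batcher` isn't defined in this section. A minimal sketch, assuming it yields
# index slices over n samples in chunks of batch_size, with a batch_size of None
# meaning a single full-batch pass; this is an assumption about the original helper.
def np_batcher(n, batch_size=None):
    if batch_size is None:
        yield slice(0, n)
        return
    for start in range(0, n, batch_size):
        # slices index numpy arrays directly, so X[batch] above works without copies
        yield slice(start, min(start + batch_size, n))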
def fit(self, X, y, sample_weight=None):
    start = time.time()
    self.set_output_shape_(y)
    # first, fit the multiestimator, to do the whole hyperparameter thing
    log.info("training multiestimator")
    self.multiestimator.fit(X, y, sample_weight=sample_weight)
    # validate X, y, sample_weight... the built-in cv splitting doesn't handle pandas objects
    X, y = sklearn.utils.check_X_y(X, y)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    self.kfold.get_n_splits(X)
    # for each cv split, train the models on the train splits, predict on the test split,
    # and keep those predictions as new features for the meta estimator
    new_features = np.zeros((X.shape[0], len(self.multiestimator.estimators) * self.output_shape_))
    for train_index, test_index in self.kfold.split(X, y):
        X_meta_train, X_meta_test = X[train_index], X[test_index]
        y_meta_train, y_meta_test = y[train_index], y[test_index]
        if sample_weight is not None:
            sample_weight_meta_train, sample_weight_meta_test = sample_weight[train_index], sample_weight[test_index]
        else:
            sample_weight_meta_train, sample_weight_meta_test = None, None
        cloned_multi = sklearn.base.clone(self.multiestimator)
        log.info("cv training")
        cloned_multi.fit(X_meta_train, y_meta_train, sample_weight=sample_weight_meta_train)
        new_features[test_index] = self.get_stacked_features_(cloned_multi, X_meta_test)
        cloned_multi.cleanup()  # clean up temporary estimators
    new_X = np.hstack((X, new_features))
    log.info("training metaestimator")
    self.metaestimator.fit(new_X, y, sample_weight)
    log.info("finished training metaestimator")
    end = time.time()
    self.fit_time = end - start
    return self
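# `get_stacked_features_` is called above but not shown. A plausible sketch, assuming
# each sub-estimator contributes `self.output_shape_` columns (class probabilities for
# classifiers, a single prediction column for regressors); this is an assumption about
# the original implementation, not the author's method.
def get_stacked_features_(self, multi, X):
    feature_blocks = []
    for name, estimator in multi.estimators:
        if hasattr(estimator, "predict_proba"):
            feature_blocks.append(estimator.predict_proba(X))
        else:
            feature_blocks.append(estimator.predict(X).reshape((-1, 1)))
    # column layout matches the new_features array allocated in fit above
    return np.hstack(feature_blocks)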
def transform(self, X, *args, **kwargs):
    log.info("performing feature selection")
    new_X = super().transform(X, *args, **kwargs)
    # preserve the pandas column names (and index) of the surviving features
    return pd.DataFrame(new_X, columns=X.columns[self.get_support()], index=X.index)
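# A hypothetical usage sketch showing how a DataFrame-preserving transform like the
# one above can wrap a standard sklearn selector. `DataFrameSelectKBest` is an
# illustrative name, not part of the original code.
import pandas as pd
import sklearn.feature_selection

class DataFrameSelectKBest(sklearn.feature_selection.SelectKBest):
    def transform(self, X):
        new_X = super().transform(X)
        return pd.DataFrame(new_X, columns=X.columns[self.get_support()], index=X.index)

# usage: X is a DataFrame, y the target
# X_selected = DataFrameSelectKBest(k=10).fit(X, y).transform(X)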
def __setstate__(self, d):
    log.info("unpickling tf model")
    if "model" in d:
        model = d["model"]
        del d["model"]
        self.__dict__.update(d)
        name = PickleableTFModel.TEMP_MODEL_FILE_ + str(random.random())
        path = os.path.join(os.getcwd(), name)
        files = []  # captured inside try so the finally block can't hit a NameError
        try:
            log.info("writing model bytes")
            with open(path, "wb") as f:
                f.write(model)
            log.info("extracting zip file")
            with zipfile.ZipFile(path, "r") as zf:
                zf.extractall()
                files = zf.namelist()
            # checkpoint prefix is the archived filename minus its extension
            ckpt = ".".join(files[0].split(".")[:-1])
            log.info("building model")
            self.build_model()
            with self.model.graph_.as_default():
                saver = tf.train.Saver(self.__getvariables__())
                log.info("creating session")
                self.sess = tf.Session(config=TF_CONFIG_, graph=self.model.graph_)
                log.info("restoring checkpoint: " + ckpt)
                saver.restore(self.sess, os.path.join(".", ckpt))
            log.info("tf model restored!")
        except Exception as e:
            log.info(e)
        finally:
            log.info("destroying the evidence")
            if os.path.exists(path):
                os.remove(path)
            for f in files:
                os.remove(f)
    else:
        self.__dict__.update(d)
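# The __getstate__ counterpart isn't shown in this section. A plausible sketch,
# assuming the pickled "model" entry holds the bytes of a zip archive of a TF1-style
# checkpoint saved from self.sess (which is what the restore path above expects).
# Everything here is an assumption about the original implementation.
import glob

def __getstate__(self):
    d = dict(self.__dict__)
    d.pop("model", None)  # the live graph wrapper can't be pickled
    if "sess" in d:
        del d["sess"]     # neither can an open session
        name = PickleableTFModel.TEMP_MODEL_FILE_ + str(random.random())
        try:
            with self.model.graph_.as_default():
                saver = tf.train.Saver(self.__getvariables__())
                ckpt = saver.save(self.sess, os.path.join(".", name))
            zip_path = name + ".zip"
            with zipfile.ZipFile(zip_path, "w") as zf:
                for f in glob.glob(ckpt + "*"):
                    zf.write(f)
            with open(zip_path, "rb") as f:
                d["model"] = f.read()
        finally:
            # remove the checkpoint files and the zip (both share the name prefix)
            for f in glob.glob(name + "*"):
                os.remove(f)
    return d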