def maybe_cache_model(self, model, train_data, test_data):
    """Persist a trained model to the on-disk cache.

    No-op unless the model declares itself cacheable and caching is
    enabled on this experiment. The model is pickled to a file named by
    a random UUID hash, and the (id-columns -> hash) mapping is written
    to the model's cache csv so it can be looked up later.

    Args:
        model: trained model object; must expose ``cacheable`` and
            ``get_model_name()``.
        train_data, test_data: datasets used to build the identifying
            row (they parameterize what configuration was trained).
    """
    if not (model.cacheable and self.use_cache):
        return
    key_cols = list(self.get_id_cols())
    cache_df = self.get_cache_df(model)
    id_row = self.construct_id_row(model, train_data, test_data)
    key = tuple(id_row.iloc[0])
    # Random UUID names the pickle file; the cache csv maps key -> hash.
    h = str(uuid.uuid4())
    model_file = models_experiment_config.get_hash_pkl_file(
        model.get_model_name(), h)
    h_fname = global_config.get_save_path(model_file)
    with open(h_fname, 'wb') as f:
        pickle.dump(model, f)
    id_row['hash'] = h
    id_row = id_row.set_index(key_cols)
    if key in cache_df.index:
        # Same configuration cached before: overwrite its entry in place.
        cache_df.loc[key] = id_row.iloc[0]
    else:
        # DataFrame.append was deprecated in pandas 1.4 and removed in
        # 2.0; use pd.concat (consistent with perform() in this file).
        cache_df = pd.concat([cache_df, id_row])
    cache_csv_path = global_config.get_save_path(
        self.get_model_csv_cache(model.get_model_name()))
    cache_df.to_csv(cache_csv_path)
def perform(self, grade=main_config.single_grade, train_years=main_config.train_years, test_years=main_config.test_years, *args, **kwargs):
    """Train each supported model and report its feature importances.

    For every SklearnModel / TFKerasModel in ``self.models``: trains on
    the train cohort, computes permutation-style importances, saves a
    boxplot of the top features to disk, and accumulates per-feature
    mean importance scores.

    Args:
        grade: cohort grade level (defaults from main_config).
        train_years / test_years: year ranges defining the two cohorts.

    Returns:
        DataFrame with columns ``feature_name``, ``importance_score``,
        and ``model``, one row per feature per model, sorted from most
        to least important within each model.
    """
    train_cohort = Cohort(grade, train_years)
    test_cohort = Cohort(grade, test_years)
    df = pd.DataFrame()
    for model in self.models:
        # Only these two model families support the importance API used below.
        if not (isinstance(model, SklearnModel) or isinstance(model, TFKerasModel)):
            continue
        # NOTE(review): the method object itself is passed (no call parens);
        # presumably get_train_test_data invokes it — confirm this is intended.
        feature_proc = model.get_feature_processor
        train_data, test_data = \
            self.get_train_test_data(train_cohort, feature_proc, test_cohort)
        model.train(train_data, test_data)
        model_name = model.get_model_name()
        file_name = model_name + '_importances.png'
        save_path = global_config.get_save_path(file_name)
        # sorted_idxs_full: feature indices in ascending importance order.
        feature_names, result, sorted_idxs_full = self.get_feature_importances(
            model, train_data)
        # Keep only the top-N most important features for the plot.
        sorted_idxs = sorted_idxs_full[-config.top_n_features:]
        fig, ax1 = plt.subplots()
        ax1.boxplot(result.importances[sorted_idxs].T,
                    vert=False, labels=feature_names[sorted_idxs])
        ax1.set_title('Top Features for {}'.format(model_name))
        ax1.set_xlabel('Importance')
        fig.tight_layout()
        fig.savefig(save_path, facecolor='w')
        # Reverse so the returned frame lists features most-important first.
        cur_df = pd.DataFrame({
            'feature_name': feature_names[sorted_idxs_full[::-1]],
            'importance_score': result.importances_mean[sorted_idxs_full[::-1]],
        })
        cur_df['model'] = model.get_model_name()
        df = pd.concat([df, cur_df], ignore_index=True)
    return df
def __init__(self, name='ignore', model_types=main_config.model_types, get_algorithm=main_config.get_sherpa_algorithm, criteria='test precision using top 5.0%', features_list=main_config.features, labels=main_config.labels, metrics=main_config.metrics, use_multi_dataset=True, lower_is_better=False, use_cache=main_config.use_cache):
    """Set up a hyper-parameter tuning experiment.

    Args:
        name: experiment name passed through to the base experiment.
        model_types: model classes to tune.
        get_algorithm: factory returning the sherpa search algorithm.
        criteria: objective metric string used to rank trials.
        features_list / labels / metrics: experiment configuration.
        use_multi_dataset: whether tuning spans multiple datasets.
        lower_is_better: direction of the tuning objective.
        use_cache: whether trained models may be cached/reused.
    """
    super(HPTuningExperiment, self).__init__(name, features_list, labels)
    # Search configuration.
    self.model_types = model_types
    self.get_algorithm = get_algorithm
    self.criteria = criteria
    self.lower_is_better = lower_is_better
    # Evaluation / bookkeeping.
    self.metrics = metrics
    self.metrics_df = pd.DataFrame()
    self.use_multi_dataset = use_multi_dataset
    self.use_cache = use_cache
    # Output locations are timestamped per user/run.
    self.out_csv = global_config.get_save_path(config.out_csv, use_user_time=True)
    self.out_img = global_config.get_save_path(config.out_img, use_user_time=True)
def maybe_get_cached_model(self, model, train_data, test_data):
    """Return a previously cached model for this configuration, or None.

    Caching is bypassed entirely when the global overwrite flag is set
    or when this experiment has caching disabled.
    """
    if main_config.overwrite_cache or not self.use_cache:
        return None
    id_row = self.construct_id_row(model, train_data, test_data)
    cache_key = tuple(id_row.iloc[0])
    cache_df = self.get_cache_df(model)
    if cache_key not in cache_df.index:
        return None
    # The cache csv maps this key to the UUID hash naming the pickle file.
    cached_hash = cache_df.loc[cache_key].hash
    pkl_file = models_experiment_config.get_hash_pkl_file(
        model.get_model_name(), cached_hash)
    pkl_path = global_config.get_save_path(pkl_file)
    # NOTE: pickle is only safe because these files are produced locally
    # by maybe_cache_model; never load pickles from untrusted sources.
    with open(pkl_path, 'rb') as fh:
        return pickle.load(fh)
def explore(self):
    """Build bivariate EDA columns and save a pairplot of them.

    Queries student data for grades 9-12, derives a handful of numeric
    columns on ``self.bivariate_df``, drops incomplete rows, and writes
    a seaborn pairplot to the configured save path.

    Returns:
        The seaborn PairGrid figure object.
    """
    students_info = self.query(common_queries.get_student_data([9, 12]))
    # Use bracket indexing, not attribute assignment: pandas cannot
    # create a NEW column via `df.col = ...` — it silently sets an
    # instance attribute instead, losing the data for the plot.
    # self.bivariate_df['discipline_incidents_rate'] = students_info.discipline_incidents_rate.fillna(0)
    self.bivariate_df['absenteeism_rate'] = students_info.absenteeism_rate
    # Final GPA = last entry of the per-student gpa list (None-safe).
    self.bivariate_df['final_gpa'] = students_info.gpas.apply(
        lambda x: x if x is None else x[-1]).astype(float)
    # Count of academic-intervention group memberships per student.
    self.bivariate_df['academic_invs'] = students_info.inv_groups.apply(
        lambda x: 0 if x is None else x.count('academic_inv'))
    # self.bivariate_df['extracurr_invs'] = students_info.inv_groups.apply(
    #     lambda x: 0 if x is None else x.count('atheletics') + x.count('extracurr_program')
    # )
    self.bivariate_df['label'] = students_info.label
    self.bivariate_df.dropna(inplace=True)
    plt.rcParams['axes.labelsize'] = 13
    fig = sns.pairplot(self.bivariate_df, kind='hist', height=3)
    path = global_config.get_save_path(config.pairplot_save_file)
    fig.savefig(path, bbox_inches='tight')
    return fig
def train(self, train_dataset, val_dataset):
    """Fit the underlying Keras model, early-stopping on the validation set.

    Compiles the network lazily on first use, then trains with
    EarlyStopping (restoring the best weights) and TensorBoard logging.
    """
    assert val_dataset is not None, 'val_dataset is needed to perform early stopping'
    train_x, train_y = self.get_xy(train_dataset)
    val_x, val_y = self.get_xy(val_dataset)
    # Build/compile the network the first time train() is called; the
    # training features determine the input shape.
    if not self.compiled:
        self.compile_model(train_x)
    log_dir = global_config.get_save_path(
        tfkeras_model_config.tensorboard_log_dir, use_user_time=True)
    early_stop = tf.keras.callbacks.EarlyStopping(
        patience=tfkeras_model_config.patience, restore_best_weights=True)
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    self.core_model.fit(
        x=train_x,
        y=train_y,
        epochs=self.hps.epochs,
        batch_size=self.hps.batch_size,
        validation_data=(val_x, val_y),
        callbacks=[early_stop, tensorboard],
    )
def __init__(self, file_name=config.save_file):
    """Resolve and store the output path for the PR-k curve figure.

    Args:
        file_name: file name to resolve under the global save directory.
    """
    super().__init__()
    self.save_path = global_config.get_save_path(file_name)
def get_cache_df(self, model):
    """Load this model's cache index csv, keyed by the experiment id columns.

    Returns:
        DataFrame indexed by the id columns, or an empty DataFrame when
        no cache file exists yet for this model.
    """
    index_cols = list(self.get_id_cols())
    csv_path = global_config.get_save_path(
        self.get_model_csv_cache(model.get_model_name()))
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    return pd.read_csv(csv_path).set_index(index_cols)